diff --git "a/checkpoint-30000/trainer_state.json" "b/checkpoint-30000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-30000/trainer_state.json" @@ -0,0 +1,105033 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.6465422612513723, + "eval_steps": 1000, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0001097694840834248, + "grad_norm": 2.557079792022705, + "learning_rate": 4.9999999451688654e-05, + "loss": 0.3518, + "step": 2 + }, + { + "epoch": 0.0002195389681668496, + "grad_norm": 2.850750684738159, + "learning_rate": 4.9999997806754614e-05, + "loss": 0.3907, + "step": 4 + }, + { + "epoch": 0.00032930845225027445, + "grad_norm": 3.050617218017578, + "learning_rate": 4.9999995065197964e-05, + "loss": 0.3802, + "step": 6 + }, + { + "epoch": 0.0004390779363336992, + "grad_norm": 2.0603723526000977, + "learning_rate": 4.999999122701883e-05, + "loss": 0.3794, + "step": 8 + }, + { + "epoch": 0.0005488474204171241, + "grad_norm": 1.9942203760147095, + "learning_rate": 4.9999986292217365e-05, + "loss": 0.4352, + "step": 10 + }, + { + "epoch": 0.0006586169045005489, + "grad_norm": 2.1194818019866943, + "learning_rate": 4.99999802607938e-05, + "loss": 0.396, + "step": 12 + }, + { + "epoch": 0.0007683863885839736, + "grad_norm": 1.5206925868988037, + "learning_rate": 4.999997313274839e-05, + "loss": 0.3935, + "step": 14 + }, + { + "epoch": 0.0008781558726673985, + "grad_norm": 2.800589084625244, + "learning_rate": 4.9999964908081455e-05, + "loss": 0.4614, + "step": 16 + }, + { + "epoch": 0.0009879253567508233, + "grad_norm": 1.7923312187194824, + "learning_rate": 4.999995558679334e-05, + "loss": 0.3399, + "step": 18 + }, + { + "epoch": 0.0010976948408342481, + "grad_norm": 2.009481906890869, + "learning_rate": 4.999994516888449e-05, + "loss": 0.5326, + "step": 20 + }, + { + "epoch": 0.001207464324917673, + "grad_norm": 3.2257771492004395, + "learning_rate": 4.999993365435532e-05, + "loss": 0.4816, + "step": 22 + }, + { + "epoch": 0.0013172338090010978, + "grad_norm": 2.10583233833313, + "learning_rate": 4.999992104320636e-05, + "loss": 0.5, + "step": 24 + }, + { + "epoch": 0.0014270032930845224, + "grad_norm": 2.787137031555176, + "learning_rate": 4.999990733543815e-05, + "loss": 0.5253, + "step": 26 + }, + { + "epoch": 0.0015367727771679472, + "grad_norm": 2.3569934368133545, + "learning_rate": 4.99998925310513e-05, + "loss": 0.4531, + "step": 28 + }, + { + "epoch": 0.001646542261251372, + "grad_norm": 1.9834591150283813, + "learning_rate": 4.999987663004646e-05, + "loss": 0.4146, + "step": 30 + }, + { + "epoch": 0.001756311745334797, + "grad_norm": 2.1990232467651367, + "learning_rate": 4.999985963242432e-05, + "loss": 0.378, + "step": 32 + }, + { + "epoch": 0.0018660812294182217, + "grad_norm": 3.174633026123047, + "learning_rate": 4.999984153818563e-05, + "loss": 0.4677, + "step": 34 + }, + { + "epoch": 0.0019758507135016466, + "grad_norm": 2.6213908195495605, + "learning_rate": 4.999982234733118e-05, + "loss": 0.4694, + "step": 36 + }, + { + "epoch": 0.0020856201975850714, + "grad_norm": 2.84000301361084, + "learning_rate": 4.9999802059861825e-05, + "loss": 0.5833, + "step": 38 + }, + { + "epoch": 0.0021953896816684962, + "grad_norm": 1.7546024322509766, + "learning_rate": 4.999978067577844e-05, + "loss": 0.3876, + "step": 40 + }, + { + "epoch": 0.002305159165751921, + "grad_norm": 5.890688419342041, + "learning_rate": 4.9999758195081974e-05, + "loss": 0.5361, + "step": 42 + }, + { + "epoch": 0.002414928649835346, + "grad_norm": 2.2697315216064453, + "learning_rate": 4.9999734617773405e-05, + "loss": 0.4894, + "step": 44 + }, + { + "epoch": 0.0025246981339187707, + "grad_norm": 2.9422004222869873, + "learning_rate": 4.9999709943853766e-05, + "loss": 0.4012, + "step": 46 + }, + { + "epoch": 0.0026344676180021956, + "grad_norm": 1.6737319231033325, + "learning_rate": 4.999968417332415e-05, + "loss": 0.3837, + "step": 48 + }, + { + "epoch": 0.0027442371020856204, + "grad_norm": 4.7673139572143555, + "learning_rate": 4.999965730618567e-05, + "loss": 0.5721, + "step": 50 + }, + { + "epoch": 0.002854006586169045, + "grad_norm": 2.0007219314575195, + "learning_rate": 4.9999629342439524e-05, + "loss": 0.3446, + "step": 52 + }, + { + "epoch": 0.0029637760702524696, + "grad_norm": 1.9145058393478394, + "learning_rate": 4.999960028208692e-05, + "loss": 0.2976, + "step": 54 + }, + { + "epoch": 0.0030735455543358945, + "grad_norm": 2.3646247386932373, + "learning_rate": 4.999957012512916e-05, + "loss": 0.3558, + "step": 56 + }, + { + "epoch": 0.0031833150384193193, + "grad_norm": 2.5521936416625977, + "learning_rate": 4.999953887156753e-05, + "loss": 0.4267, + "step": 58 + }, + { + "epoch": 0.003293084522502744, + "grad_norm": 1.896323800086975, + "learning_rate": 4.999950652140343e-05, + "loss": 0.4997, + "step": 60 + }, + { + "epoch": 0.003402854006586169, + "grad_norm": 4.5184807777404785, + "learning_rate": 4.999947307463827e-05, + "loss": 0.4033, + "step": 62 + }, + { + "epoch": 0.003512623490669594, + "grad_norm": 1.7074967622756958, + "learning_rate": 4.999943853127351e-05, + "loss": 0.4078, + "step": 64 + }, + { + "epoch": 0.0036223929747530186, + "grad_norm": 1.8653061389923096, + "learning_rate": 4.999940289131067e-05, + "loss": 0.3757, + "step": 66 + }, + { + "epoch": 0.0037321624588364435, + "grad_norm": 5.915266513824463, + "learning_rate": 4.999936615475133e-05, + "loss": 0.5523, + "step": 68 + }, + { + "epoch": 0.0038419319429198683, + "grad_norm": 2.2616639137268066, + "learning_rate": 4.999932832159707e-05, + "loss": 0.4799, + "step": 70 + }, + { + "epoch": 0.003951701427003293, + "grad_norm": 8.216501235961914, + "learning_rate": 4.999928939184958e-05, + "loss": 0.513, + "step": 72 + }, + { + "epoch": 0.0040614709110867175, + "grad_norm": 2.222851276397705, + "learning_rate": 4.999924936551054e-05, + "loss": 0.4124, + "step": 74 + }, + { + "epoch": 0.004171240395170143, + "grad_norm": 1.774841547012329, + "learning_rate": 4.999920824258173e-05, + "loss": 0.4218, + "step": 76 + }, + { + "epoch": 0.004281009879253567, + "grad_norm": 2.2803995609283447, + "learning_rate": 4.999916602306494e-05, + "loss": 0.3897, + "step": 78 + }, + { + "epoch": 0.0043907793633369925, + "grad_norm": 3.8175692558288574, + "learning_rate": 4.999912270696202e-05, + "loss": 0.3947, + "step": 80 + }, + { + "epoch": 0.004500548847420417, + "grad_norm": 2.3315610885620117, + "learning_rate": 4.999907829427488e-05, + "loss": 0.4537, + "step": 82 + }, + { + "epoch": 0.004610318331503842, + "grad_norm": 2.33901309967041, + "learning_rate": 4.9999032785005464e-05, + "loss": 0.4582, + "step": 84 + }, + { + "epoch": 0.0047200878155872665, + "grad_norm": 3.2068264484405518, + "learning_rate": 4.999898617915576e-05, + "loss": 0.4916, + "step": 86 + }, + { + "epoch": 0.004829857299670692, + "grad_norm": 1.928565502166748, + "learning_rate": 4.9998938476727826e-05, + "loss": 0.4681, + "step": 88 + }, + { + "epoch": 0.004939626783754116, + "grad_norm": 2.491704225540161, + "learning_rate": 4.999888967772375e-05, + "loss": 0.5815, + "step": 90 + }, + { + "epoch": 0.0050493962678375415, + "grad_norm": 1.9054763317108154, + "learning_rate": 4.999883978214567e-05, + "loss": 0.6628, + "step": 92 + }, + { + "epoch": 0.005159165751920966, + "grad_norm": 1.9857672452926636, + "learning_rate": 4.999878878999577e-05, + "loss": 0.3585, + "step": 94 + }, + { + "epoch": 0.005268935236004391, + "grad_norm": 1.6148121356964111, + "learning_rate": 4.9998736701276295e-05, + "loss": 0.4727, + "step": 96 + }, + { + "epoch": 0.0053787047200878155, + "grad_norm": 3.6325631141662598, + "learning_rate": 4.9998683515989534e-05, + "loss": 0.3691, + "step": 98 + }, + { + "epoch": 0.005488474204171241, + "grad_norm": 2.1458756923675537, + "learning_rate": 4.999862923413781e-05, + "loss": 0.5753, + "step": 100 + }, + { + "epoch": 0.005598243688254665, + "grad_norm": 1.6250834465026855, + "learning_rate": 4.9998573855723504e-05, + "loss": 0.3745, + "step": 102 + }, + { + "epoch": 0.00570801317233809, + "grad_norm": 1.5428743362426758, + "learning_rate": 4.999851738074904e-05, + "loss": 0.4161, + "step": 104 + }, + { + "epoch": 0.005817782656421515, + "grad_norm": 1.7840986251831055, + "learning_rate": 4.9998459809216924e-05, + "loss": 0.4606, + "step": 106 + }, + { + "epoch": 0.005927552140504939, + "grad_norm": 2.1396255493164062, + "learning_rate": 4.999840114112965e-05, + "loss": 0.4151, + "step": 108 + }, + { + "epoch": 0.0060373216245883645, + "grad_norm": 2.5107650756835938, + "learning_rate": 4.99983413764898e-05, + "loss": 0.4793, + "step": 110 + }, + { + "epoch": 0.006147091108671789, + "grad_norm": 1.97507905960083, + "learning_rate": 4.99982805153e-05, + "loss": 0.3939, + "step": 112 + }, + { + "epoch": 0.006256860592755214, + "grad_norm": 1.0682778358459473, + "learning_rate": 4.999821855756293e-05, + "loss": 0.4538, + "step": 114 + }, + { + "epoch": 0.006366630076838639, + "grad_norm": 1.4954508543014526, + "learning_rate": 4.999815550328128e-05, + "loss": 0.4137, + "step": 116 + }, + { + "epoch": 0.006476399560922064, + "grad_norm": 2.2385611534118652, + "learning_rate": 4.999809135245783e-05, + "loss": 0.6805, + "step": 118 + }, + { + "epoch": 0.006586169045005488, + "grad_norm": 2.5691096782684326, + "learning_rate": 4.9998026105095405e-05, + "loss": 0.4807, + "step": 120 + }, + { + "epoch": 0.0066959385290889135, + "grad_norm": 1.7923052310943604, + "learning_rate": 4.999795976119686e-05, + "loss": 0.4317, + "step": 122 + }, + { + "epoch": 0.006805708013172338, + "grad_norm": 2.357499837875366, + "learning_rate": 4.999789232076509e-05, + "loss": 0.4647, + "step": 124 + }, + { + "epoch": 0.006915477497255763, + "grad_norm": 2.4390525817871094, + "learning_rate": 4.999782378380308e-05, + "loss": 0.3755, + "step": 126 + }, + { + "epoch": 0.007025246981339188, + "grad_norm": 1.678443431854248, + "learning_rate": 4.9997754150313815e-05, + "loss": 0.5009, + "step": 128 + }, + { + "epoch": 0.007135016465422613, + "grad_norm": 2.314382791519165, + "learning_rate": 4.999768342030035e-05, + "loss": 0.3623, + "step": 130 + }, + { + "epoch": 0.007244785949506037, + "grad_norm": 3.029951333999634, + "learning_rate": 4.99976115937658e-05, + "loss": 0.405, + "step": 132 + }, + { + "epoch": 0.0073545554335894625, + "grad_norm": 2.0923995971679688, + "learning_rate": 4.9997538670713316e-05, + "loss": 0.465, + "step": 134 + }, + { + "epoch": 0.007464324917672887, + "grad_norm": 2.0314981937408447, + "learning_rate": 4.999746465114609e-05, + "loss": 0.5008, + "step": 136 + }, + { + "epoch": 0.007574094401756311, + "grad_norm": 2.8472678661346436, + "learning_rate": 4.9997389535067365e-05, + "loss": 0.5633, + "step": 138 + }, + { + "epoch": 0.007683863885839737, + "grad_norm": 2.208470344543457, + "learning_rate": 4.999731332248044e-05, + "loss": 0.4828, + "step": 140 + }, + { + "epoch": 0.007793633369923161, + "grad_norm": 2.5636181831359863, + "learning_rate": 4.999723601338866e-05, + "loss": 0.3904, + "step": 142 + }, + { + "epoch": 0.007903402854006586, + "grad_norm": 2.4006967544555664, + "learning_rate": 4.999715760779541e-05, + "loss": 0.5266, + "step": 144 + }, + { + "epoch": 0.008013172338090012, + "grad_norm": 2.2951340675354004, + "learning_rate": 4.9997078105704144e-05, + "loss": 0.4501, + "step": 146 + }, + { + "epoch": 0.008122941822173435, + "grad_norm": 1.9132189750671387, + "learning_rate": 4.999699750711833e-05, + "loss": 0.6174, + "step": 148 + }, + { + "epoch": 0.00823271130625686, + "grad_norm": 1.8691729307174683, + "learning_rate": 4.999691581204152e-05, + "loss": 0.4311, + "step": 150 + }, + { + "epoch": 0.008342480790340286, + "grad_norm": 3.5761911869049072, + "learning_rate": 4.9996833020477285e-05, + "loss": 0.4258, + "step": 152 + }, + { + "epoch": 0.008452250274423711, + "grad_norm": 2.718625068664551, + "learning_rate": 4.999674913242927e-05, + "loss": 0.5168, + "step": 154 + }, + { + "epoch": 0.008562019758507134, + "grad_norm": 3.083103895187378, + "learning_rate": 4.999666414790113e-05, + "loss": 0.374, + "step": 156 + }, + { + "epoch": 0.00867178924259056, + "grad_norm": 3.961289167404175, + "learning_rate": 4.9996578066896624e-05, + "loss": 0.5195, + "step": 158 + }, + { + "epoch": 0.008781558726673985, + "grad_norm": 1.5836013555526733, + "learning_rate": 4.9996490889419514e-05, + "loss": 0.418, + "step": 160 + }, + { + "epoch": 0.00889132821075741, + "grad_norm": 2.400146245956421, + "learning_rate": 4.999640261547362e-05, + "loss": 0.4792, + "step": 162 + }, + { + "epoch": 0.009001097694840834, + "grad_norm": 1.7619861364364624, + "learning_rate": 4.9996313245062823e-05, + "loss": 0.3351, + "step": 164 + }, + { + "epoch": 0.009110867178924259, + "grad_norm": 4.94334077835083, + "learning_rate": 4.999622277819102e-05, + "loss": 0.6349, + "step": 166 + }, + { + "epoch": 0.009220636663007684, + "grad_norm": 1.6957963705062866, + "learning_rate": 4.999613121486222e-05, + "loss": 0.3091, + "step": 168 + }, + { + "epoch": 0.009330406147091108, + "grad_norm": 3.4800655841827393, + "learning_rate": 4.999603855508041e-05, + "loss": 0.3734, + "step": 170 + }, + { + "epoch": 0.009440175631174533, + "grad_norm": 8.3698091506958, + "learning_rate": 4.999594479884965e-05, + "loss": 0.6043, + "step": 172 + }, + { + "epoch": 0.009549945115257958, + "grad_norm": 2.5790274143218994, + "learning_rate": 4.9995849946174075e-05, + "loss": 0.5206, + "step": 174 + }, + { + "epoch": 0.009659714599341384, + "grad_norm": 2.969326972961426, + "learning_rate": 4.999575399705783e-05, + "loss": 0.6153, + "step": 176 + }, + { + "epoch": 0.009769484083424807, + "grad_norm": 3.885664224624634, + "learning_rate": 4.999565695150513e-05, + "loss": 0.4733, + "step": 178 + }, + { + "epoch": 0.009879253567508232, + "grad_norm": 3.3647103309631348, + "learning_rate": 4.999555880952023e-05, + "loss": 0.4964, + "step": 180 + }, + { + "epoch": 0.009989023051591658, + "grad_norm": 2.9069266319274902, + "learning_rate": 4.999545957110743e-05, + "loss": 0.5088, + "step": 182 + }, + { + "epoch": 0.010098792535675083, + "grad_norm": 6.2329816818237305, + "learning_rate": 4.999535923627109e-05, + "loss": 0.4816, + "step": 184 + }, + { + "epoch": 0.010208562019758506, + "grad_norm": 2.556114435195923, + "learning_rate": 4.9995257805015615e-05, + "loss": 0.5517, + "step": 186 + }, + { + "epoch": 0.010318331503841932, + "grad_norm": 1.8373204469680786, + "learning_rate": 4.999515527734545e-05, + "loss": 0.4065, + "step": 188 + }, + { + "epoch": 0.010428100987925357, + "grad_norm": 1.971653938293457, + "learning_rate": 4.999505165326509e-05, + "loss": 0.5708, + "step": 190 + }, + { + "epoch": 0.010537870472008782, + "grad_norm": 4.439863681793213, + "learning_rate": 4.999494693277907e-05, + "loss": 0.5668, + "step": 192 + }, + { + "epoch": 0.010647639956092206, + "grad_norm": 2.5268876552581787, + "learning_rate": 4.9994841115892013e-05, + "loss": 0.46, + "step": 194 + }, + { + "epoch": 0.010757409440175631, + "grad_norm": 2.570751190185547, + "learning_rate": 4.999473420260853e-05, + "loss": 0.5012, + "step": 196 + }, + { + "epoch": 0.010867178924259056, + "grad_norm": 2.0545859336853027, + "learning_rate": 4.9994626192933324e-05, + "loss": 0.4301, + "step": 198 + }, + { + "epoch": 0.010976948408342482, + "grad_norm": 2.0516347885131836, + "learning_rate": 4.999451708687114e-05, + "loss": 0.5197, + "step": 200 + }, + { + "epoch": 0.011086717892425905, + "grad_norm": 2.426163673400879, + "learning_rate": 4.999440688442675e-05, + "loss": 0.4939, + "step": 202 + }, + { + "epoch": 0.01119648737650933, + "grad_norm": 1.9083834886550903, + "learning_rate": 4.9994295585605e-05, + "loss": 0.4282, + "step": 204 + }, + { + "epoch": 0.011306256860592756, + "grad_norm": 1.9721384048461914, + "learning_rate": 4.999418319041076e-05, + "loss": 0.4643, + "step": 206 + }, + { + "epoch": 0.01141602634467618, + "grad_norm": 2.1012933254241943, + "learning_rate": 4.999406969884897e-05, + "loss": 0.3743, + "step": 208 + }, + { + "epoch": 0.011525795828759604, + "grad_norm": 2.4777934551239014, + "learning_rate": 4.999395511092461e-05, + "loss": 0.4665, + "step": 210 + }, + { + "epoch": 0.01163556531284303, + "grad_norm": 2.7673542499542236, + "learning_rate": 4.9993839426642685e-05, + "loss": 0.5402, + "step": 212 + }, + { + "epoch": 0.011745334796926455, + "grad_norm": 2.625732183456421, + "learning_rate": 4.99937226460083e-05, + "loss": 0.4198, + "step": 214 + }, + { + "epoch": 0.011855104281009879, + "grad_norm": 1.670381784439087, + "learning_rate": 4.999360476902656e-05, + "loss": 0.433, + "step": 216 + }, + { + "epoch": 0.011964873765093304, + "grad_norm": 1.9097658395767212, + "learning_rate": 4.9993485795702636e-05, + "loss": 0.3762, + "step": 218 + }, + { + "epoch": 0.012074643249176729, + "grad_norm": 2.7011396884918213, + "learning_rate": 4.999336572604175e-05, + "loss": 0.4823, + "step": 220 + }, + { + "epoch": 0.012184412733260154, + "grad_norm": 1.9383732080459595, + "learning_rate": 4.9993244560049174e-05, + "loss": 0.4486, + "step": 222 + }, + { + "epoch": 0.012294182217343578, + "grad_norm": 4.055792331695557, + "learning_rate": 4.999312229773022e-05, + "loss": 0.4777, + "step": 224 + }, + { + "epoch": 0.012403951701427003, + "grad_norm": 2.4447357654571533, + "learning_rate": 4.999299893909024e-05, + "loss": 0.3436, + "step": 226 + }, + { + "epoch": 0.012513721185510428, + "grad_norm": 2.659576654434204, + "learning_rate": 4.9992874484134653e-05, + "loss": 0.5232, + "step": 228 + }, + { + "epoch": 0.012623490669593854, + "grad_norm": 1.3218145370483398, + "learning_rate": 4.9992748932868926e-05, + "loss": 0.3032, + "step": 230 + }, + { + "epoch": 0.012733260153677277, + "grad_norm": 2.559384346008301, + "learning_rate": 4.999262228529855e-05, + "loss": 0.5144, + "step": 232 + }, + { + "epoch": 0.012843029637760702, + "grad_norm": 3.2997920513153076, + "learning_rate": 4.9992494541429094e-05, + "loss": 0.4599, + "step": 234 + }, + { + "epoch": 0.012952799121844128, + "grad_norm": 2.290081024169922, + "learning_rate": 4.999236570126616e-05, + "loss": 0.4421, + "step": 236 + }, + { + "epoch": 0.013062568605927551, + "grad_norm": 3.9387454986572266, + "learning_rate": 4.99922357648154e-05, + "loss": 0.495, + "step": 238 + }, + { + "epoch": 0.013172338090010977, + "grad_norm": 3.952310800552368, + "learning_rate": 4.99921047320825e-05, + "loss": 0.5761, + "step": 240 + }, + { + "epoch": 0.013282107574094402, + "grad_norm": 2.323775053024292, + "learning_rate": 4.999197260307322e-05, + "loss": 0.4212, + "step": 242 + }, + { + "epoch": 0.013391877058177827, + "grad_norm": 3.432781457901001, + "learning_rate": 4.999183937779336e-05, + "loss": 0.4155, + "step": 244 + }, + { + "epoch": 0.01350164654226125, + "grad_norm": 2.945915937423706, + "learning_rate": 4.999170505624875e-05, + "loss": 0.4117, + "step": 246 + }, + { + "epoch": 0.013611416026344676, + "grad_norm": 2.3180205821990967, + "learning_rate": 4.99915696384453e-05, + "loss": 0.6149, + "step": 248 + }, + { + "epoch": 0.013721185510428101, + "grad_norm": 1.5271575450897217, + "learning_rate": 4.999143312438893e-05, + "loss": 0.3581, + "step": 250 + }, + { + "epoch": 0.013830954994511526, + "grad_norm": 3.0977282524108887, + "learning_rate": 4.9991295514085644e-05, + "loss": 0.3943, + "step": 252 + }, + { + "epoch": 0.01394072447859495, + "grad_norm": 2.6382391452789307, + "learning_rate": 4.9991156807541476e-05, + "loss": 0.547, + "step": 254 + }, + { + "epoch": 0.014050493962678375, + "grad_norm": 1.3192814588546753, + "learning_rate": 4.9991017004762496e-05, + "loss": 0.4141, + "step": 256 + }, + { + "epoch": 0.0141602634467618, + "grad_norm": 8.30486011505127, + "learning_rate": 4.999087610575485e-05, + "loss": 0.4859, + "step": 258 + }, + { + "epoch": 0.014270032930845226, + "grad_norm": 2.055199384689331, + "learning_rate": 4.999073411052472e-05, + "loss": 0.4138, + "step": 260 + }, + { + "epoch": 0.01437980241492865, + "grad_norm": 2.0361897945404053, + "learning_rate": 4.9990591019078324e-05, + "loss": 0.5227, + "step": 262 + }, + { + "epoch": 0.014489571899012075, + "grad_norm": 12.27379322052002, + "learning_rate": 4.9990446831421955e-05, + "loss": 0.4654, + "step": 264 + }, + { + "epoch": 0.0145993413830955, + "grad_norm": 2.4411630630493164, + "learning_rate": 4.999030154756192e-05, + "loss": 0.3839, + "step": 266 + }, + { + "epoch": 0.014709110867178925, + "grad_norm": 4.12604284286499, + "learning_rate": 4.99901551675046e-05, + "loss": 0.6066, + "step": 268 + }, + { + "epoch": 0.014818880351262349, + "grad_norm": 2.7374014854431152, + "learning_rate": 4.999000769125641e-05, + "loss": 0.5165, + "step": 270 + }, + { + "epoch": 0.014928649835345774, + "grad_norm": 5.116325855255127, + "learning_rate": 4.998985911882384e-05, + "loss": 0.5098, + "step": 272 + }, + { + "epoch": 0.0150384193194292, + "grad_norm": 3.904822826385498, + "learning_rate": 4.998970945021338e-05, + "loss": 0.4464, + "step": 274 + }, + { + "epoch": 0.015148188803512623, + "grad_norm": 2.4889214038848877, + "learning_rate": 4.998955868543161e-05, + "loss": 0.5001, + "step": 276 + }, + { + "epoch": 0.015257958287596048, + "grad_norm": 3.7128818035125732, + "learning_rate": 4.9989406824485136e-05, + "loss": 0.5631, + "step": 278 + }, + { + "epoch": 0.015367727771679473, + "grad_norm": 1.989249348640442, + "learning_rate": 4.998925386738063e-05, + "loss": 0.5654, + "step": 280 + }, + { + "epoch": 0.015477497255762898, + "grad_norm": 1.648297667503357, + "learning_rate": 4.9989099814124785e-05, + "loss": 0.3781, + "step": 282 + }, + { + "epoch": 0.015587266739846322, + "grad_norm": 3.6875505447387695, + "learning_rate": 4.998894466472438e-05, + "loss": 0.4945, + "step": 284 + }, + { + "epoch": 0.015697036223929747, + "grad_norm": 2.025315999984741, + "learning_rate": 4.99887884191862e-05, + "loss": 0.3932, + "step": 286 + }, + { + "epoch": 0.015806805708013173, + "grad_norm": 1.986636996269226, + "learning_rate": 4.998863107751711e-05, + "loss": 0.3524, + "step": 288 + }, + { + "epoch": 0.015916575192096598, + "grad_norm": 1.7608904838562012, + "learning_rate": 4.998847263972401e-05, + "loss": 0.3883, + "step": 290 + }, + { + "epoch": 0.016026344676180023, + "grad_norm": 1.9678391218185425, + "learning_rate": 4.9988313105813856e-05, + "loss": 0.3489, + "step": 292 + }, + { + "epoch": 0.01613611416026345, + "grad_norm": 2.2896852493286133, + "learning_rate": 4.998815247579364e-05, + "loss": 0.8149, + "step": 294 + }, + { + "epoch": 0.01624588364434687, + "grad_norm": 2.032093048095703, + "learning_rate": 4.9987990749670395e-05, + "loss": 0.3988, + "step": 296 + }, + { + "epoch": 0.016355653128430295, + "grad_norm": 1.7942979335784912, + "learning_rate": 4.9987827927451234e-05, + "loss": 0.3098, + "step": 298 + }, + { + "epoch": 0.01646542261251372, + "grad_norm": 1.751418113708496, + "learning_rate": 4.998766400914329e-05, + "loss": 0.529, + "step": 300 + }, + { + "epoch": 0.016575192096597146, + "grad_norm": 2.692558765411377, + "learning_rate": 4.9987498994753755e-05, + "loss": 0.5702, + "step": 302 + }, + { + "epoch": 0.01668496158068057, + "grad_norm": 3.0610227584838867, + "learning_rate": 4.998733288428987e-05, + "loss": 0.5446, + "step": 304 + }, + { + "epoch": 0.016794731064763996, + "grad_norm": 3.1607353687286377, + "learning_rate": 4.998716567775893e-05, + "loss": 0.4383, + "step": 306 + }, + { + "epoch": 0.016904500548847422, + "grad_norm": 2.056636095046997, + "learning_rate": 4.9986997375168246e-05, + "loss": 0.4416, + "step": 308 + }, + { + "epoch": 0.017014270032930844, + "grad_norm": 2.724821090698242, + "learning_rate": 4.998682797652522e-05, + "loss": 0.3488, + "step": 310 + }, + { + "epoch": 0.01712403951701427, + "grad_norm": 2.768576145172119, + "learning_rate": 4.9986657481837277e-05, + "loss": 0.5313, + "step": 312 + }, + { + "epoch": 0.017233809001097694, + "grad_norm": 1.3880505561828613, + "learning_rate": 4.998648589111189e-05, + "loss": 0.4797, + "step": 314 + }, + { + "epoch": 0.01734357848518112, + "grad_norm": 2.7955899238586426, + "learning_rate": 4.9986313204356594e-05, + "loss": 0.4718, + "step": 316 + }, + { + "epoch": 0.017453347969264545, + "grad_norm": 5.333556175231934, + "learning_rate": 4.9986139421578956e-05, + "loss": 0.4795, + "step": 318 + }, + { + "epoch": 0.01756311745334797, + "grad_norm": 2.9731924533843994, + "learning_rate": 4.9985964542786614e-05, + "loss": 0.4276, + "step": 320 + }, + { + "epoch": 0.017672886937431395, + "grad_norm": 2.1031415462493896, + "learning_rate": 4.998578856798722e-05, + "loss": 0.4733, + "step": 322 + }, + { + "epoch": 0.01778265642151482, + "grad_norm": 3.1342737674713135, + "learning_rate": 4.99856114971885e-05, + "loss": 0.4531, + "step": 324 + }, + { + "epoch": 0.017892425905598242, + "grad_norm": 1.6222102642059326, + "learning_rate": 4.9985433330398226e-05, + "loss": 0.5015, + "step": 326 + }, + { + "epoch": 0.018002195389681667, + "grad_norm": 3.8368356227874756, + "learning_rate": 4.9985254067624215e-05, + "loss": 0.4797, + "step": 328 + }, + { + "epoch": 0.018111964873765093, + "grad_norm": 1.6327639818191528, + "learning_rate": 4.998507370887433e-05, + "loss": 0.5055, + "step": 330 + }, + { + "epoch": 0.018221734357848518, + "grad_norm": 4.729297161102295, + "learning_rate": 4.9984892254156465e-05, + "loss": 0.3617, + "step": 332 + }, + { + "epoch": 0.018331503841931943, + "grad_norm": 3.088893175125122, + "learning_rate": 4.99847097034786e-05, + "loss": 0.58, + "step": 334 + }, + { + "epoch": 0.01844127332601537, + "grad_norm": 3.057692766189575, + "learning_rate": 4.998452605684874e-05, + "loss": 0.5477, + "step": 336 + }, + { + "epoch": 0.018551042810098794, + "grad_norm": 5.3366804122924805, + "learning_rate": 4.9984341314274926e-05, + "loss": 0.337, + "step": 338 + }, + { + "epoch": 0.018660812294182216, + "grad_norm": 3.2676730155944824, + "learning_rate": 4.998415547576527e-05, + "loss": 0.3934, + "step": 340 + }, + { + "epoch": 0.01877058177826564, + "grad_norm": 7.169600486755371, + "learning_rate": 4.998396854132793e-05, + "loss": 0.5814, + "step": 342 + }, + { + "epoch": 0.018880351262349066, + "grad_norm": 4.167252540588379, + "learning_rate": 4.998378051097111e-05, + "loss": 0.628, + "step": 344 + }, + { + "epoch": 0.01899012074643249, + "grad_norm": 4.291174411773682, + "learning_rate": 4.998359138470304e-05, + "loss": 0.7401, + "step": 346 + }, + { + "epoch": 0.019099890230515917, + "grad_norm": 2.706035852432251, + "learning_rate": 4.9983401162532025e-05, + "loss": 0.4571, + "step": 348 + }, + { + "epoch": 0.019209659714599342, + "grad_norm": 1.7345889806747437, + "learning_rate": 4.998320984446641e-05, + "loss": 0.6087, + "step": 350 + }, + { + "epoch": 0.019319429198682767, + "grad_norm": 2.5041158199310303, + "learning_rate": 4.998301743051459e-05, + "loss": 0.5458, + "step": 352 + }, + { + "epoch": 0.019429198682766192, + "grad_norm": 2.0580880641937256, + "learning_rate": 4.9982823920685e-05, + "loss": 0.4367, + "step": 354 + }, + { + "epoch": 0.019538968166849614, + "grad_norm": 1.7691562175750732, + "learning_rate": 4.9982629314986126e-05, + "loss": 0.4188, + "step": 356 + }, + { + "epoch": 0.01964873765093304, + "grad_norm": 2.0171966552734375, + "learning_rate": 4.9982433613426507e-05, + "loss": 0.3985, + "step": 358 + }, + { + "epoch": 0.019758507135016465, + "grad_norm": 5.527328968048096, + "learning_rate": 4.998223681601473e-05, + "loss": 0.4037, + "step": 360 + }, + { + "epoch": 0.01986827661909989, + "grad_norm": 1.6386452913284302, + "learning_rate": 4.9982038922759435e-05, + "loss": 0.421, + "step": 362 + }, + { + "epoch": 0.019978046103183315, + "grad_norm": 2.638803482055664, + "learning_rate": 4.998183993366928e-05, + "loss": 0.4673, + "step": 364 + }, + { + "epoch": 0.02008781558726674, + "grad_norm": 5.055490970611572, + "learning_rate": 4.998163984875302e-05, + "loss": 0.5864, + "step": 366 + }, + { + "epoch": 0.020197585071350166, + "grad_norm": 3.25020432472229, + "learning_rate": 4.998143866801942e-05, + "loss": 0.4569, + "step": 368 + }, + { + "epoch": 0.02030735455543359, + "grad_norm": 2.476228713989258, + "learning_rate": 4.998123639147729e-05, + "loss": 0.37, + "step": 370 + }, + { + "epoch": 0.020417124039517013, + "grad_norm": 2.1863350868225098, + "learning_rate": 4.998103301913552e-05, + "loss": 0.5872, + "step": 372 + }, + { + "epoch": 0.020526893523600438, + "grad_norm": 2.483417272567749, + "learning_rate": 4.998082855100304e-05, + "loss": 0.3723, + "step": 374 + }, + { + "epoch": 0.020636663007683863, + "grad_norm": 2.830279588699341, + "learning_rate": 4.9980622987088795e-05, + "loss": 0.3715, + "step": 376 + }, + { + "epoch": 0.02074643249176729, + "grad_norm": 2.4156241416931152, + "learning_rate": 4.9980416327401826e-05, + "loss": 0.532, + "step": 378 + }, + { + "epoch": 0.020856201975850714, + "grad_norm": 2.06491756439209, + "learning_rate": 4.998020857195117e-05, + "loss": 0.3271, + "step": 380 + }, + { + "epoch": 0.02096597145993414, + "grad_norm": 2.198441505432129, + "learning_rate": 4.997999972074596e-05, + "loss": 0.5696, + "step": 382 + }, + { + "epoch": 0.021075740944017565, + "grad_norm": 2.558924436569214, + "learning_rate": 4.997978977379536e-05, + "loss": 0.3956, + "step": 384 + }, + { + "epoch": 0.021185510428100986, + "grad_norm": 1.7543673515319824, + "learning_rate": 4.997957873110857e-05, + "loss": 0.4315, + "step": 386 + }, + { + "epoch": 0.02129527991218441, + "grad_norm": 3.0632100105285645, + "learning_rate": 4.997936659269486e-05, + "loss": 0.4939, + "step": 388 + }, + { + "epoch": 0.021405049396267837, + "grad_norm": 3.542673349380493, + "learning_rate": 4.99791533585635e-05, + "loss": 0.4117, + "step": 390 + }, + { + "epoch": 0.021514818880351262, + "grad_norm": 2.055260181427002, + "learning_rate": 4.9978939028723894e-05, + "loss": 0.4832, + "step": 392 + }, + { + "epoch": 0.021624588364434687, + "grad_norm": 1.485547661781311, + "learning_rate": 4.99787236031854e-05, + "loss": 0.4811, + "step": 394 + }, + { + "epoch": 0.021734357848518113, + "grad_norm": 1.7831851243972778, + "learning_rate": 4.9978507081957494e-05, + "loss": 0.5676, + "step": 396 + }, + { + "epoch": 0.021844127332601538, + "grad_norm": 1.8413515090942383, + "learning_rate": 4.997828946504967e-05, + "loss": 0.3872, + "step": 398 + }, + { + "epoch": 0.021953896816684963, + "grad_norm": 1.7489122152328491, + "learning_rate": 4.997807075247146e-05, + "loss": 0.4085, + "step": 400 + }, + { + "epoch": 0.022063666300768385, + "grad_norm": 2.207770347595215, + "learning_rate": 4.9977850944232476e-05, + "loss": 0.5129, + "step": 402 + }, + { + "epoch": 0.02217343578485181, + "grad_norm": 2.5982301235198975, + "learning_rate": 4.9977630040342346e-05, + "loss": 0.2848, + "step": 404 + }, + { + "epoch": 0.022283205268935236, + "grad_norm": 2.6613497734069824, + "learning_rate": 4.997740804081076e-05, + "loss": 0.5466, + "step": 406 + }, + { + "epoch": 0.02239297475301866, + "grad_norm": 2.7582859992980957, + "learning_rate": 4.9977184945647473e-05, + "loss": 0.5216, + "step": 408 + }, + { + "epoch": 0.022502744237102086, + "grad_norm": 2.7774007320404053, + "learning_rate": 4.9976960754862254e-05, + "loss": 0.5945, + "step": 410 + }, + { + "epoch": 0.02261251372118551, + "grad_norm": 3.178353786468506, + "learning_rate": 4.9976735468464935e-05, + "loss": 0.4447, + "step": 412 + }, + { + "epoch": 0.022722283205268937, + "grad_norm": 2.7877423763275146, + "learning_rate": 4.997650908646542e-05, + "loss": 0.4724, + "step": 414 + }, + { + "epoch": 0.02283205268935236, + "grad_norm": 1.4482954740524292, + "learning_rate": 4.997628160887361e-05, + "loss": 0.4613, + "step": 416 + }, + { + "epoch": 0.022941822173435784, + "grad_norm": 1.9307631254196167, + "learning_rate": 4.99760530356995e-05, + "loss": 0.4083, + "step": 418 + }, + { + "epoch": 0.02305159165751921, + "grad_norm": 1.9201465845108032, + "learning_rate": 4.9975823366953124e-05, + "loss": 0.5191, + "step": 420 + }, + { + "epoch": 0.023161361141602634, + "grad_norm": 2.7016708850860596, + "learning_rate": 4.9975592602644536e-05, + "loss": 0.4773, + "step": 422 + }, + { + "epoch": 0.02327113062568606, + "grad_norm": 2.160069704055786, + "learning_rate": 4.997536074278387e-05, + "loss": 0.5804, + "step": 424 + }, + { + "epoch": 0.023380900109769485, + "grad_norm": 2.851585865020752, + "learning_rate": 4.9975127787381305e-05, + "loss": 0.3691, + "step": 426 + }, + { + "epoch": 0.02349066959385291, + "grad_norm": 4.179362773895264, + "learning_rate": 4.9974893736447045e-05, + "loss": 0.4591, + "step": 428 + }, + { + "epoch": 0.023600439077936335, + "grad_norm": 2.302598476409912, + "learning_rate": 4.997465858999136e-05, + "loss": 0.4642, + "step": 430 + }, + { + "epoch": 0.023710208562019757, + "grad_norm": 1.7333037853240967, + "learning_rate": 4.997442234802456e-05, + "loss": 0.4799, + "step": 432 + }, + { + "epoch": 0.023819978046103182, + "grad_norm": 2.000971555709839, + "learning_rate": 4.997418501055703e-05, + "loss": 0.348, + "step": 434 + }, + { + "epoch": 0.023929747530186608, + "grad_norm": 1.381555438041687, + "learning_rate": 4.997394657759915e-05, + "loss": 0.33, + "step": 436 + }, + { + "epoch": 0.024039517014270033, + "grad_norm": 1.5750107765197754, + "learning_rate": 4.99737070491614e-05, + "loss": 0.3761, + "step": 438 + }, + { + "epoch": 0.024149286498353458, + "grad_norm": 2.0134642124176025, + "learning_rate": 4.9973466425254286e-05, + "loss": 0.3954, + "step": 440 + }, + { + "epoch": 0.024259055982436883, + "grad_norm": 2.217087745666504, + "learning_rate": 4.997322470588835e-05, + "loss": 0.5099, + "step": 442 + }, + { + "epoch": 0.02436882546652031, + "grad_norm": 2.516678810119629, + "learning_rate": 4.997298189107421e-05, + "loss": 0.3277, + "step": 444 + }, + { + "epoch": 0.02447859495060373, + "grad_norm": 1.7652076482772827, + "learning_rate": 4.99727379808225e-05, + "loss": 0.3434, + "step": 446 + }, + { + "epoch": 0.024588364434687156, + "grad_norm": 1.549936056137085, + "learning_rate": 4.997249297514394e-05, + "loss": 0.4121, + "step": 448 + }, + { + "epoch": 0.02469813391877058, + "grad_norm": 2.695810317993164, + "learning_rate": 4.9972246874049254e-05, + "loss": 0.6277, + "step": 450 + }, + { + "epoch": 0.024807903402854006, + "grad_norm": 3.5317413806915283, + "learning_rate": 4.997199967754925e-05, + "loss": 0.5658, + "step": 452 + }, + { + "epoch": 0.02491767288693743, + "grad_norm": 2.299586057662964, + "learning_rate": 4.997175138565477e-05, + "loss": 0.4707, + "step": 454 + }, + { + "epoch": 0.025027442371020857, + "grad_norm": 1.8567231893539429, + "learning_rate": 4.997150199837671e-05, + "loss": 0.4417, + "step": 456 + }, + { + "epoch": 0.025137211855104282, + "grad_norm": 3.4330191612243652, + "learning_rate": 4.997125151572601e-05, + "loss": 0.5033, + "step": 458 + }, + { + "epoch": 0.025246981339187707, + "grad_norm": 3.696591854095459, + "learning_rate": 4.997099993771365e-05, + "loss": 0.5768, + "step": 460 + }, + { + "epoch": 0.02535675082327113, + "grad_norm": 6.798633098602295, + "learning_rate": 4.997074726435066e-05, + "loss": 0.4855, + "step": 462 + }, + { + "epoch": 0.025466520307354554, + "grad_norm": 1.8703426122665405, + "learning_rate": 4.997049349564814e-05, + "loss": 0.5201, + "step": 464 + }, + { + "epoch": 0.02557628979143798, + "grad_norm": 2.0299525260925293, + "learning_rate": 4.997023863161721e-05, + "loss": 0.4744, + "step": 466 + }, + { + "epoch": 0.025686059275521405, + "grad_norm": 1.8392144441604614, + "learning_rate": 4.996998267226905e-05, + "loss": 0.4928, + "step": 468 + }, + { + "epoch": 0.02579582875960483, + "grad_norm": 2.501338243484497, + "learning_rate": 4.996972561761489e-05, + "loss": 0.3243, + "step": 470 + }, + { + "epoch": 0.025905598243688256, + "grad_norm": 1.4138623476028442, + "learning_rate": 4.996946746766601e-05, + "loss": 0.316, + "step": 472 + }, + { + "epoch": 0.02601536772777168, + "grad_norm": 2.7725794315338135, + "learning_rate": 4.9969208222433736e-05, + "loss": 0.556, + "step": 474 + }, + { + "epoch": 0.026125137211855103, + "grad_norm": 2.3026537895202637, + "learning_rate": 4.9968947881929414e-05, + "loss": 0.3826, + "step": 476 + }, + { + "epoch": 0.026234906695938528, + "grad_norm": 3.3363640308380127, + "learning_rate": 4.9968686446164505e-05, + "loss": 0.4556, + "step": 478 + }, + { + "epoch": 0.026344676180021953, + "grad_norm": 2.0402796268463135, + "learning_rate": 4.996842391515044e-05, + "loss": 0.4595, + "step": 480 + }, + { + "epoch": 0.02645444566410538, + "grad_norm": 2.6300950050354004, + "learning_rate": 4.996816028889876e-05, + "loss": 0.4526, + "step": 482 + }, + { + "epoch": 0.026564215148188804, + "grad_norm": 1.8785053491592407, + "learning_rate": 4.996789556742101e-05, + "loss": 0.3751, + "step": 484 + }, + { + "epoch": 0.02667398463227223, + "grad_norm": 1.853649377822876, + "learning_rate": 4.99676297507288e-05, + "loss": 0.4058, + "step": 486 + }, + { + "epoch": 0.026783754116355654, + "grad_norm": 1.9056929349899292, + "learning_rate": 4.996736283883382e-05, + "loss": 0.3119, + "step": 488 + }, + { + "epoch": 0.02689352360043908, + "grad_norm": 1.4847012758255005, + "learning_rate": 4.996709483174776e-05, + "loss": 0.409, + "step": 490 + }, + { + "epoch": 0.0270032930845225, + "grad_norm": 2.361370325088501, + "learning_rate": 4.9966825729482364e-05, + "loss": 0.5896, + "step": 492 + }, + { + "epoch": 0.027113062568605927, + "grad_norm": 1.9981611967086792, + "learning_rate": 4.9966555532049455e-05, + "loss": 0.4634, + "step": 494 + }, + { + "epoch": 0.027222832052689352, + "grad_norm": 1.7409052848815918, + "learning_rate": 4.996628423946087e-05, + "loss": 0.4276, + "step": 496 + }, + { + "epoch": 0.027332601536772777, + "grad_norm": 2.136425733566284, + "learning_rate": 4.9966011851728524e-05, + "loss": 0.5592, + "step": 498 + }, + { + "epoch": 0.027442371020856202, + "grad_norm": 1.5588865280151367, + "learning_rate": 4.996573836886435e-05, + "loss": 0.5937, + "step": 500 + }, + { + "epoch": 0.027552140504939628, + "grad_norm": 2.6874852180480957, + "learning_rate": 4.996546379088035e-05, + "loss": 0.3501, + "step": 502 + }, + { + "epoch": 0.027661909989023053, + "grad_norm": 2.0354061126708984, + "learning_rate": 4.996518811778858e-05, + "loss": 0.6379, + "step": 504 + }, + { + "epoch": 0.027771679473106478, + "grad_norm": 1.7898533344268799, + "learning_rate": 4.9964911349601116e-05, + "loss": 0.4637, + "step": 506 + }, + { + "epoch": 0.0278814489571899, + "grad_norm": 2.2580642700195312, + "learning_rate": 4.9964633486330116e-05, + "loss": 0.4912, + "step": 508 + }, + { + "epoch": 0.027991218441273325, + "grad_norm": 1.6026685237884521, + "learning_rate": 4.996435452798774e-05, + "loss": 0.4033, + "step": 510 + }, + { + "epoch": 0.02810098792535675, + "grad_norm": 2.684690475463867, + "learning_rate": 4.996407447458626e-05, + "loss": 0.4821, + "step": 512 + }, + { + "epoch": 0.028210757409440176, + "grad_norm": 2.0446603298187256, + "learning_rate": 4.996379332613793e-05, + "loss": 0.587, + "step": 514 + }, + { + "epoch": 0.0283205268935236, + "grad_norm": 3.204998016357422, + "learning_rate": 4.99635110826551e-05, + "loss": 0.632, + "step": 516 + }, + { + "epoch": 0.028430296377607026, + "grad_norm": 1.525963306427002, + "learning_rate": 4.996322774415015e-05, + "loss": 0.4202, + "step": 518 + }, + { + "epoch": 0.02854006586169045, + "grad_norm": 1.9659796953201294, + "learning_rate": 4.99629433106355e-05, + "loss": 0.4438, + "step": 520 + }, + { + "epoch": 0.028649835345773873, + "grad_norm": 1.4567357301712036, + "learning_rate": 4.996265778212363e-05, + "loss": 0.3998, + "step": 522 + }, + { + "epoch": 0.0287596048298573, + "grad_norm": 1.604117512702942, + "learning_rate": 4.996237115862706e-05, + "loss": 0.5394, + "step": 524 + }, + { + "epoch": 0.028869374313940724, + "grad_norm": 4.599597930908203, + "learning_rate": 4.996208344015838e-05, + "loss": 0.4853, + "step": 526 + }, + { + "epoch": 0.02897914379802415, + "grad_norm": 2.3027455806732178, + "learning_rate": 4.99617946267302e-05, + "loss": 0.5789, + "step": 528 + }, + { + "epoch": 0.029088913282107574, + "grad_norm": 1.937966227531433, + "learning_rate": 4.996150471835518e-05, + "loss": 0.4028, + "step": 530 + }, + { + "epoch": 0.029198682766191, + "grad_norm": 2.205972194671631, + "learning_rate": 4.9961213715046045e-05, + "loss": 0.44, + "step": 532 + }, + { + "epoch": 0.029308452250274425, + "grad_norm": 1.4079662561416626, + "learning_rate": 4.996092161681556e-05, + "loss": 0.3206, + "step": 534 + }, + { + "epoch": 0.02941822173435785, + "grad_norm": 2.3553192615509033, + "learning_rate": 4.996062842367654e-05, + "loss": 0.5114, + "step": 536 + }, + { + "epoch": 0.029527991218441272, + "grad_norm": 1.3909388780593872, + "learning_rate": 4.996033413564184e-05, + "loss": 0.4173, + "step": 538 + }, + { + "epoch": 0.029637760702524697, + "grad_norm": 1.3438464403152466, + "learning_rate": 4.996003875272438e-05, + "loss": 0.3706, + "step": 540 + }, + { + "epoch": 0.029747530186608123, + "grad_norm": 4.967839241027832, + "learning_rate": 4.995974227493711e-05, + "loss": 0.5098, + "step": 542 + }, + { + "epoch": 0.029857299670691548, + "grad_norm": 2.069465160369873, + "learning_rate": 4.995944470229302e-05, + "loss": 0.3661, + "step": 544 + }, + { + "epoch": 0.029967069154774973, + "grad_norm": 1.7696164846420288, + "learning_rate": 4.995914603480519e-05, + "loss": 0.5946, + "step": 546 + }, + { + "epoch": 0.0300768386388584, + "grad_norm": 4.360721588134766, + "learning_rate": 4.9958846272486704e-05, + "loss": 0.513, + "step": 548 + }, + { + "epoch": 0.030186608122941824, + "grad_norm": 2.4083313941955566, + "learning_rate": 4.995854541535071e-05, + "loss": 0.6865, + "step": 550 + }, + { + "epoch": 0.030296377607025245, + "grad_norm": 1.3227423429489136, + "learning_rate": 4.9958243463410414e-05, + "loss": 0.3637, + "step": 552 + }, + { + "epoch": 0.03040614709110867, + "grad_norm": 2.1085596084594727, + "learning_rate": 4.995794041667906e-05, + "loss": 0.421, + "step": 554 + }, + { + "epoch": 0.030515916575192096, + "grad_norm": 1.5773733854293823, + "learning_rate": 4.995763627516994e-05, + "loss": 0.284, + "step": 556 + }, + { + "epoch": 0.03062568605927552, + "grad_norm": 3.0532870292663574, + "learning_rate": 4.995733103889639e-05, + "loss": 0.3563, + "step": 558 + }, + { + "epoch": 0.030735455543358946, + "grad_norm": 1.8613656759262085, + "learning_rate": 4.9957024707871806e-05, + "loss": 0.4456, + "step": 560 + }, + { + "epoch": 0.03084522502744237, + "grad_norm": 2.5658886432647705, + "learning_rate": 4.995671728210962e-05, + "loss": 0.4749, + "step": 562 + }, + { + "epoch": 0.030954994511525797, + "grad_norm": 1.5188041925430298, + "learning_rate": 4.995640876162332e-05, + "loss": 0.439, + "step": 564 + }, + { + "epoch": 0.031064763995609222, + "grad_norm": 3.9638278484344482, + "learning_rate": 4.9956099146426435e-05, + "loss": 0.2938, + "step": 566 + }, + { + "epoch": 0.031174533479692644, + "grad_norm": 1.8980274200439453, + "learning_rate": 4.9955788436532545e-05, + "loss": 0.5946, + "step": 568 + }, + { + "epoch": 0.03128430296377607, + "grad_norm": 1.4961771965026855, + "learning_rate": 4.99554766319553e-05, + "loss": 0.4834, + "step": 570 + }, + { + "epoch": 0.031394072447859495, + "grad_norm": 2.0114455223083496, + "learning_rate": 4.9955163732708346e-05, + "loss": 0.5125, + "step": 572 + }, + { + "epoch": 0.03150384193194292, + "grad_norm": 2.7518510818481445, + "learning_rate": 4.995484973880543e-05, + "loss": 0.4456, + "step": 574 + }, + { + "epoch": 0.031613611416026345, + "grad_norm": 1.4463770389556885, + "learning_rate": 4.995453465026032e-05, + "loss": 0.4281, + "step": 576 + }, + { + "epoch": 0.03172338090010977, + "grad_norm": 1.3946725130081177, + "learning_rate": 4.995421846708683e-05, + "loss": 0.3615, + "step": 578 + }, + { + "epoch": 0.031833150384193196, + "grad_norm": 2.254066228866577, + "learning_rate": 4.9953901189298845e-05, + "loss": 0.5113, + "step": 580 + }, + { + "epoch": 0.03194291986827662, + "grad_norm": 2.8974239826202393, + "learning_rate": 4.995358281691027e-05, + "loss": 0.414, + "step": 582 + }, + { + "epoch": 0.032052689352360046, + "grad_norm": 1.7802329063415527, + "learning_rate": 4.9953263349935074e-05, + "loss": 0.4282, + "step": 584 + }, + { + "epoch": 0.03216245883644347, + "grad_norm": 1.7174638509750366, + "learning_rate": 4.995294278838727e-05, + "loss": 0.4616, + "step": 586 + }, + { + "epoch": 0.0322722283205269, + "grad_norm": 3.0465006828308105, + "learning_rate": 4.995262113228091e-05, + "loss": 0.5006, + "step": 588 + }, + { + "epoch": 0.03238199780461032, + "grad_norm": 1.8720701932907104, + "learning_rate": 4.995229838163012e-05, + "loss": 0.5133, + "step": 590 + }, + { + "epoch": 0.03249176728869374, + "grad_norm": 1.7356452941894531, + "learning_rate": 4.995197453644905e-05, + "loss": 0.4952, + "step": 592 + }, + { + "epoch": 0.03260153677277717, + "grad_norm": 2.3734219074249268, + "learning_rate": 4.99516495967519e-05, + "loss": 0.4456, + "step": 594 + }, + { + "epoch": 0.03271130625686059, + "grad_norm": 1.67129647731781, + "learning_rate": 4.9951323562552934e-05, + "loss": 0.3177, + "step": 596 + }, + { + "epoch": 0.03282107574094402, + "grad_norm": 2.0500364303588867, + "learning_rate": 4.995099643386645e-05, + "loss": 0.4085, + "step": 598 + }, + { + "epoch": 0.03293084522502744, + "grad_norm": 2.3916726112365723, + "learning_rate": 4.995066821070679e-05, + "loss": 0.6165, + "step": 600 + }, + { + "epoch": 0.03304061470911087, + "grad_norm": 2.2469286918640137, + "learning_rate": 4.9950338893088364e-05, + "loss": 0.6196, + "step": 602 + }, + { + "epoch": 0.03315038419319429, + "grad_norm": 2.6706979274749756, + "learning_rate": 4.99500084810256e-05, + "loss": 0.6205, + "step": 604 + }, + { + "epoch": 0.033260153677277714, + "grad_norm": 2.2114977836608887, + "learning_rate": 4.994967697453301e-05, + "loss": 0.602, + "step": 606 + }, + { + "epoch": 0.03336992316136114, + "grad_norm": 1.5265508890151978, + "learning_rate": 4.994934437362513e-05, + "loss": 0.4118, + "step": 608 + }, + { + "epoch": 0.033479692645444564, + "grad_norm": 2.043682336807251, + "learning_rate": 4.994901067831654e-05, + "loss": 0.4266, + "step": 610 + }, + { + "epoch": 0.03358946212952799, + "grad_norm": 3.4007959365844727, + "learning_rate": 4.994867588862189e-05, + "loss": 0.4927, + "step": 612 + }, + { + "epoch": 0.033699231613611415, + "grad_norm": 1.8730354309082031, + "learning_rate": 4.994834000455586e-05, + "loss": 0.3594, + "step": 614 + }, + { + "epoch": 0.033809001097694844, + "grad_norm": 2.651376962661743, + "learning_rate": 4.994800302613318e-05, + "loss": 0.3677, + "step": 616 + }, + { + "epoch": 0.033918770581778265, + "grad_norm": 2.1092488765716553, + "learning_rate": 4.994766495336864e-05, + "loss": 0.5131, + "step": 618 + }, + { + "epoch": 0.03402854006586169, + "grad_norm": 1.6894471645355225, + "learning_rate": 4.994732578627706e-05, + "loss": 0.4009, + "step": 620 + }, + { + "epoch": 0.034138309549945116, + "grad_norm": 2.367831230163574, + "learning_rate": 4.9946985524873324e-05, + "loss": 0.4312, + "step": 622 + }, + { + "epoch": 0.03424807903402854, + "grad_norm": 1.886276364326477, + "learning_rate": 4.9946644169172355e-05, + "loss": 0.5514, + "step": 624 + }, + { + "epoch": 0.034357848518111966, + "grad_norm": 1.662509799003601, + "learning_rate": 4.994630171918913e-05, + "loss": 0.3994, + "step": 626 + }, + { + "epoch": 0.03446761800219539, + "grad_norm": 1.8703560829162598, + "learning_rate": 4.994595817493867e-05, + "loss": 0.4581, + "step": 628 + }, + { + "epoch": 0.03457738748627882, + "grad_norm": 2.2708146572113037, + "learning_rate": 4.994561353643604e-05, + "loss": 0.463, + "step": 630 + }, + { + "epoch": 0.03468715697036224, + "grad_norm": 1.5967164039611816, + "learning_rate": 4.9945267803696364e-05, + "loss": 0.3801, + "step": 632 + }, + { + "epoch": 0.03479692645444567, + "grad_norm": 1.8556278944015503, + "learning_rate": 4.994492097673479e-05, + "loss": 0.5233, + "step": 634 + }, + { + "epoch": 0.03490669593852909, + "grad_norm": 4.490135192871094, + "learning_rate": 4.9944573055566556e-05, + "loss": 0.5758, + "step": 636 + }, + { + "epoch": 0.03501646542261251, + "grad_norm": 1.9489893913269043, + "learning_rate": 4.9944224040206913e-05, + "loss": 0.4253, + "step": 638 + }, + { + "epoch": 0.03512623490669594, + "grad_norm": 2.204827070236206, + "learning_rate": 4.994387393067117e-05, + "loss": 0.3885, + "step": 640 + }, + { + "epoch": 0.03523600439077936, + "grad_norm": 2.6080117225646973, + "learning_rate": 4.994352272697468e-05, + "loss": 0.4318, + "step": 642 + }, + { + "epoch": 0.03534577387486279, + "grad_norm": 1.8217462301254272, + "learning_rate": 4.9943170429132855e-05, + "loss": 0.4366, + "step": 644 + }, + { + "epoch": 0.03545554335894621, + "grad_norm": 4.1973137855529785, + "learning_rate": 4.994281703716115e-05, + "loss": 0.4985, + "step": 646 + }, + { + "epoch": 0.03556531284302964, + "grad_norm": 1.6765116453170776, + "learning_rate": 4.9942462551075056e-05, + "loss": 0.3254, + "step": 648 + }, + { + "epoch": 0.03567508232711306, + "grad_norm": 1.7548681497573853, + "learning_rate": 4.994210697089014e-05, + "loss": 0.3859, + "step": 650 + }, + { + "epoch": 0.035784851811196484, + "grad_norm": 1.191675066947937, + "learning_rate": 4.994175029662198e-05, + "loss": 0.4175, + "step": 652 + }, + { + "epoch": 0.03589462129527991, + "grad_norm": 1.9427241086959839, + "learning_rate": 4.994139252828624e-05, + "loss": 0.4441, + "step": 654 + }, + { + "epoch": 0.036004390779363335, + "grad_norm": 1.559734582901001, + "learning_rate": 4.994103366589859e-05, + "loss": 0.4722, + "step": 656 + }, + { + "epoch": 0.036114160263446764, + "grad_norm": 1.470824956893921, + "learning_rate": 4.99406737094748e-05, + "loss": 0.3258, + "step": 658 + }, + { + "epoch": 0.036223929747530186, + "grad_norm": 3.07967209815979, + "learning_rate": 4.994031265903063e-05, + "loss": 0.2955, + "step": 660 + }, + { + "epoch": 0.036333699231613614, + "grad_norm": 4.356321334838867, + "learning_rate": 4.9939950514581944e-05, + "loss": 0.4686, + "step": 662 + }, + { + "epoch": 0.036443468715697036, + "grad_norm": 3.3561367988586426, + "learning_rate": 4.9939587276144616e-05, + "loss": 0.3869, + "step": 664 + }, + { + "epoch": 0.03655323819978046, + "grad_norm": 9.011635780334473, + "learning_rate": 4.993922294373458e-05, + "loss": 0.5763, + "step": 666 + }, + { + "epoch": 0.03666300768386389, + "grad_norm": 1.900537371635437, + "learning_rate": 4.993885751736781e-05, + "loss": 0.4157, + "step": 668 + }, + { + "epoch": 0.03677277716794731, + "grad_norm": 3.920666456222534, + "learning_rate": 4.993849099706035e-05, + "loss": 0.4799, + "step": 670 + }, + { + "epoch": 0.03688254665203074, + "grad_norm": 2.0436856746673584, + "learning_rate": 4.993812338282826e-05, + "loss": 0.4993, + "step": 672 + }, + { + "epoch": 0.03699231613611416, + "grad_norm": 2.713021993637085, + "learning_rate": 4.993775467468768e-05, + "loss": 0.508, + "step": 674 + }, + { + "epoch": 0.03710208562019759, + "grad_norm": 2.404468297958374, + "learning_rate": 4.993738487265478e-05, + "loss": 0.3722, + "step": 676 + }, + { + "epoch": 0.03721185510428101, + "grad_norm": 4.5672383308410645, + "learning_rate": 4.993701397674577e-05, + "loss": 0.429, + "step": 678 + }, + { + "epoch": 0.03732162458836443, + "grad_norm": 3.230536460876465, + "learning_rate": 4.993664198697694e-05, + "loss": 0.529, + "step": 680 + }, + { + "epoch": 0.03743139407244786, + "grad_norm": 2.0221283435821533, + "learning_rate": 4.993626890336459e-05, + "loss": 0.3837, + "step": 682 + }, + { + "epoch": 0.03754116355653128, + "grad_norm": 2.6156678199768066, + "learning_rate": 4.99358947259251e-05, + "loss": 0.6178, + "step": 684 + }, + { + "epoch": 0.03765093304061471, + "grad_norm": 3.183774948120117, + "learning_rate": 4.993551945467487e-05, + "loss": 0.3418, + "step": 686 + }, + { + "epoch": 0.03776070252469813, + "grad_norm": 2.737471103668213, + "learning_rate": 4.993514308963036e-05, + "loss": 0.4282, + "step": 688 + }, + { + "epoch": 0.03787047200878156, + "grad_norm": 1.8533951044082642, + "learning_rate": 4.9934765630808095e-05, + "loss": 0.3556, + "step": 690 + }, + { + "epoch": 0.03798024149286498, + "grad_norm": 1.453902244567871, + "learning_rate": 4.993438707822462e-05, + "loss": 0.411, + "step": 692 + }, + { + "epoch": 0.03809001097694841, + "grad_norm": 1.8138813972473145, + "learning_rate": 4.9934007431896536e-05, + "loss": 0.4075, + "step": 694 + }, + { + "epoch": 0.03819978046103183, + "grad_norm": 2.6364591121673584, + "learning_rate": 4.993362669184051e-05, + "loss": 0.4795, + "step": 696 + }, + { + "epoch": 0.038309549945115255, + "grad_norm": 1.6225872039794922, + "learning_rate": 4.993324485807323e-05, + "loss": 0.5974, + "step": 698 + }, + { + "epoch": 0.038419319429198684, + "grad_norm": 2.6435599327087402, + "learning_rate": 4.9932861930611454e-05, + "loss": 0.3858, + "step": 700 + }, + { + "epoch": 0.038529088913282106, + "grad_norm": 1.961139440536499, + "learning_rate": 4.9932477909471975e-05, + "loss": 0.3304, + "step": 702 + }, + { + "epoch": 0.038638858397365534, + "grad_norm": 2.6405115127563477, + "learning_rate": 4.993209279467164e-05, + "loss": 0.4557, + "step": 704 + }, + { + "epoch": 0.038748627881448956, + "grad_norm": 4.010139465332031, + "learning_rate": 4.993170658622735e-05, + "loss": 0.5814, + "step": 706 + }, + { + "epoch": 0.038858397365532385, + "grad_norm": 3.29146409034729, + "learning_rate": 4.993131928415602e-05, + "loss": 0.4539, + "step": 708 + }, + { + "epoch": 0.03896816684961581, + "grad_norm": 1.7264498472213745, + "learning_rate": 4.9930930888474666e-05, + "loss": 0.4759, + "step": 710 + }, + { + "epoch": 0.03907793633369923, + "grad_norm": 1.7772456407546997, + "learning_rate": 4.993054139920032e-05, + "loss": 0.5008, + "step": 712 + }, + { + "epoch": 0.03918770581778266, + "grad_norm": 2.497615337371826, + "learning_rate": 4.9930150816350055e-05, + "loss": 0.4226, + "step": 714 + }, + { + "epoch": 0.03929747530186608, + "grad_norm": 2.2512576580047607, + "learning_rate": 4.9929759139941e-05, + "loss": 0.3614, + "step": 716 + }, + { + "epoch": 0.03940724478594951, + "grad_norm": 2.8669204711914062, + "learning_rate": 4.992936636999036e-05, + "loss": 0.505, + "step": 718 + }, + { + "epoch": 0.03951701427003293, + "grad_norm": 1.817746639251709, + "learning_rate": 4.992897250651535e-05, + "loss": 0.3477, + "step": 720 + }, + { + "epoch": 0.03962678375411636, + "grad_norm": 2.3656811714172363, + "learning_rate": 4.9928577549533243e-05, + "loss": 0.4363, + "step": 722 + }, + { + "epoch": 0.03973655323819978, + "grad_norm": 2.08229398727417, + "learning_rate": 4.992818149906138e-05, + "loss": 0.5534, + "step": 724 + }, + { + "epoch": 0.0398463227222832, + "grad_norm": 2.610219717025757, + "learning_rate": 4.99277843551171e-05, + "loss": 0.4138, + "step": 726 + }, + { + "epoch": 0.03995609220636663, + "grad_norm": 3.264014959335327, + "learning_rate": 4.992738611771787e-05, + "loss": 0.7574, + "step": 728 + }, + { + "epoch": 0.04006586169045005, + "grad_norm": 2.9344170093536377, + "learning_rate": 4.992698678688111e-05, + "loss": 0.4999, + "step": 730 + }, + { + "epoch": 0.04017563117453348, + "grad_norm": 2.3180177211761475, + "learning_rate": 4.992658636262438e-05, + "loss": 0.4726, + "step": 732 + }, + { + "epoch": 0.0402854006586169, + "grad_norm": 2.5026323795318604, + "learning_rate": 4.992618484496522e-05, + "loss": 0.4293, + "step": 734 + }, + { + "epoch": 0.04039517014270033, + "grad_norm": 1.7041943073272705, + "learning_rate": 4.992578223392124e-05, + "loss": 0.4286, + "step": 736 + }, + { + "epoch": 0.040504939626783754, + "grad_norm": 2.6693599224090576, + "learning_rate": 4.992537852951011e-05, + "loss": 0.4725, + "step": 738 + }, + { + "epoch": 0.04061470911086718, + "grad_norm": 2.1256370544433594, + "learning_rate": 4.992497373174955e-05, + "loss": 0.548, + "step": 740 + }, + { + "epoch": 0.040724478594950604, + "grad_norm": 1.8762153387069702, + "learning_rate": 4.9924567840657286e-05, + "loss": 0.4913, + "step": 742 + }, + { + "epoch": 0.040834248079034026, + "grad_norm": 3.146735668182373, + "learning_rate": 4.992416085625115e-05, + "loss": 0.6082, + "step": 744 + }, + { + "epoch": 0.040944017563117455, + "grad_norm": 2.348720073699951, + "learning_rate": 4.9923752778548985e-05, + "loss": 0.3805, + "step": 746 + }, + { + "epoch": 0.041053787047200876, + "grad_norm": 2.484318494796753, + "learning_rate": 4.9923343607568684e-05, + "loss": 0.4768, + "step": 748 + }, + { + "epoch": 0.041163556531284305, + "grad_norm": 2.672071933746338, + "learning_rate": 4.99229333433282e-05, + "loss": 0.5392, + "step": 750 + }, + { + "epoch": 0.04127332601536773, + "grad_norm": 4.7053542137146, + "learning_rate": 4.992252198584554e-05, + "loss": 0.4865, + "step": 752 + }, + { + "epoch": 0.041383095499451156, + "grad_norm": 1.7434351444244385, + "learning_rate": 4.9922109535138725e-05, + "loss": 0.4132, + "step": 754 + }, + { + "epoch": 0.04149286498353458, + "grad_norm": 1.81113600730896, + "learning_rate": 4.992169599122587e-05, + "loss": 0.485, + "step": 756 + }, + { + "epoch": 0.041602634467618, + "grad_norm": 2.2844316959381104, + "learning_rate": 4.99212813541251e-05, + "loss": 0.5746, + "step": 758 + }, + { + "epoch": 0.04171240395170143, + "grad_norm": 2.35425066947937, + "learning_rate": 4.9920865623854615e-05, + "loss": 0.4168, + "step": 760 + }, + { + "epoch": 0.04182217343578485, + "grad_norm": 1.9008493423461914, + "learning_rate": 4.992044880043265e-05, + "loss": 0.4478, + "step": 762 + }, + { + "epoch": 0.04193194291986828, + "grad_norm": 1.6378251314163208, + "learning_rate": 4.9920030883877476e-05, + "loss": 0.5619, + "step": 764 + }, + { + "epoch": 0.0420417124039517, + "grad_norm": 1.630005121231079, + "learning_rate": 4.991961187420744e-05, + "loss": 0.3913, + "step": 766 + }, + { + "epoch": 0.04215148188803513, + "grad_norm": 2.319826126098633, + "learning_rate": 4.9919191771440905e-05, + "loss": 0.4773, + "step": 768 + }, + { + "epoch": 0.04226125137211855, + "grad_norm": 3.127916097640991, + "learning_rate": 4.9918770575596305e-05, + "loss": 0.4916, + "step": 770 + }, + { + "epoch": 0.04237102085620197, + "grad_norm": 1.7175869941711426, + "learning_rate": 4.991834828669213e-05, + "loss": 0.5445, + "step": 772 + }, + { + "epoch": 0.0424807903402854, + "grad_norm": 2.172255039215088, + "learning_rate": 4.99179249047469e-05, + "loss": 0.4451, + "step": 774 + }, + { + "epoch": 0.04259055982436882, + "grad_norm": 2.0628418922424316, + "learning_rate": 4.991750042977916e-05, + "loss": 0.3739, + "step": 776 + }, + { + "epoch": 0.04270032930845225, + "grad_norm": 2.0632472038269043, + "learning_rate": 4.991707486180756e-05, + "loss": 0.4271, + "step": 778 + }, + { + "epoch": 0.042810098792535674, + "grad_norm": 2.8328897953033447, + "learning_rate": 4.991664820085074e-05, + "loss": 0.5256, + "step": 780 + }, + { + "epoch": 0.0429198682766191, + "grad_norm": 2.2355775833129883, + "learning_rate": 4.991622044692745e-05, + "loss": 0.4282, + "step": 782 + }, + { + "epoch": 0.043029637760702524, + "grad_norm": 1.7225903272628784, + "learning_rate": 4.991579160005644e-05, + "loss": 0.5223, + "step": 784 + }, + { + "epoch": 0.043139407244785946, + "grad_norm": 2.7113354206085205, + "learning_rate": 4.99153616602565e-05, + "loss": 0.3846, + "step": 786 + }, + { + "epoch": 0.043249176728869375, + "grad_norm": 2.3711600303649902, + "learning_rate": 4.991493062754651e-05, + "loss": 0.5733, + "step": 788 + }, + { + "epoch": 0.0433589462129528, + "grad_norm": 2.3337483406066895, + "learning_rate": 4.991449850194538e-05, + "loss": 0.4451, + "step": 790 + }, + { + "epoch": 0.043468715697036225, + "grad_norm": 1.8799583911895752, + "learning_rate": 4.991406528347206e-05, + "loss": 0.4173, + "step": 792 + }, + { + "epoch": 0.04357848518111965, + "grad_norm": 1.93858003616333, + "learning_rate": 4.991363097214554e-05, + "loss": 0.4059, + "step": 794 + }, + { + "epoch": 0.043688254665203076, + "grad_norm": 1.3629542589187622, + "learning_rate": 4.991319556798488e-05, + "loss": 0.3713, + "step": 796 + }, + { + "epoch": 0.0437980241492865, + "grad_norm": 2.1564605236053467, + "learning_rate": 4.991275907100919e-05, + "loss": 0.4101, + "step": 798 + }, + { + "epoch": 0.043907793633369926, + "grad_norm": 2.658707618713379, + "learning_rate": 4.991232148123761e-05, + "loss": 0.4214, + "step": 800 + }, + { + "epoch": 0.04401756311745335, + "grad_norm": 2.147395372390747, + "learning_rate": 4.991188279868933e-05, + "loss": 0.456, + "step": 802 + }, + { + "epoch": 0.04412733260153677, + "grad_norm": 1.165541410446167, + "learning_rate": 4.99114430233836e-05, + "loss": 0.4301, + "step": 804 + }, + { + "epoch": 0.0442371020856202, + "grad_norm": 2.2732136249542236, + "learning_rate": 4.99110021553397e-05, + "loss": 0.4775, + "step": 806 + }, + { + "epoch": 0.04434687156970362, + "grad_norm": 4.72205114364624, + "learning_rate": 4.991056019457697e-05, + "loss": 0.4271, + "step": 808 + }, + { + "epoch": 0.04445664105378705, + "grad_norm": 2.44466495513916, + "learning_rate": 4.991011714111481e-05, + "loss": 0.5621, + "step": 810 + }, + { + "epoch": 0.04456641053787047, + "grad_norm": 1.6993792057037354, + "learning_rate": 4.990967299497264e-05, + "loss": 0.4461, + "step": 812 + }, + { + "epoch": 0.0446761800219539, + "grad_norm": 2.158451795578003, + "learning_rate": 4.990922775616996e-05, + "loss": 0.4867, + "step": 814 + }, + { + "epoch": 0.04478594950603732, + "grad_norm": 2.4753594398498535, + "learning_rate": 4.990878142472628e-05, + "loss": 0.5004, + "step": 816 + }, + { + "epoch": 0.04489571899012074, + "grad_norm": 1.3509736061096191, + "learning_rate": 4.990833400066119e-05, + "loss": 0.5487, + "step": 818 + }, + { + "epoch": 0.04500548847420417, + "grad_norm": 3.497817277908325, + "learning_rate": 4.990788548399432e-05, + "loss": 0.5924, + "step": 820 + }, + { + "epoch": 0.045115257958287594, + "grad_norm": 1.653350830078125, + "learning_rate": 4.990743587474532e-05, + "loss": 0.374, + "step": 822 + }, + { + "epoch": 0.04522502744237102, + "grad_norm": 1.6767979860305786, + "learning_rate": 4.990698517293395e-05, + "loss": 0.5593, + "step": 824 + }, + { + "epoch": 0.045334796926454445, + "grad_norm": 3.660229206085205, + "learning_rate": 4.9906533378579944e-05, + "loss": 0.3824, + "step": 826 + }, + { + "epoch": 0.04544456641053787, + "grad_norm": 2.361825466156006, + "learning_rate": 4.9906080491703146e-05, + "loss": 0.3811, + "step": 828 + }, + { + "epoch": 0.045554335894621295, + "grad_norm": 2.4623172283172607, + "learning_rate": 4.99056265123234e-05, + "loss": 0.5121, + "step": 830 + }, + { + "epoch": 0.04566410537870472, + "grad_norm": 1.2564808130264282, + "learning_rate": 4.990517144046064e-05, + "loss": 0.3279, + "step": 832 + }, + { + "epoch": 0.045773874862788146, + "grad_norm": 1.7974836826324463, + "learning_rate": 4.990471527613482e-05, + "loss": 0.4375, + "step": 834 + }, + { + "epoch": 0.04588364434687157, + "grad_norm": 1.8186591863632202, + "learning_rate": 4.990425801936594e-05, + "loss": 0.5492, + "step": 836 + }, + { + "epoch": 0.045993413830954996, + "grad_norm": 3.0327155590057373, + "learning_rate": 4.990379967017407e-05, + "loss": 0.5277, + "step": 838 + }, + { + "epoch": 0.04610318331503842, + "grad_norm": 5.590970516204834, + "learning_rate": 4.990334022857932e-05, + "loss": 0.6514, + "step": 840 + }, + { + "epoch": 0.04621295279912185, + "grad_norm": 2.33721661567688, + "learning_rate": 4.990287969460182e-05, + "loss": 0.5004, + "step": 842 + }, + { + "epoch": 0.04632272228320527, + "grad_norm": 2.565613031387329, + "learning_rate": 4.990241806826179e-05, + "loss": 0.6361, + "step": 844 + }, + { + "epoch": 0.0464324917672887, + "grad_norm": 1.5572712421417236, + "learning_rate": 4.9901955349579484e-05, + "loss": 0.4207, + "step": 846 + }, + { + "epoch": 0.04654226125137212, + "grad_norm": 2.485509157180786, + "learning_rate": 4.9901491538575185e-05, + "loss": 0.5363, + "step": 848 + }, + { + "epoch": 0.04665203073545554, + "grad_norm": 2.9114747047424316, + "learning_rate": 4.990102663526924e-05, + "loss": 0.3729, + "step": 850 + }, + { + "epoch": 0.04676180021953897, + "grad_norm": 2.6872103214263916, + "learning_rate": 4.9900560639682045e-05, + "loss": 0.4984, + "step": 852 + }, + { + "epoch": 0.04687156970362239, + "grad_norm": 4.465526580810547, + "learning_rate": 4.990009355183405e-05, + "loss": 0.4478, + "step": 854 + }, + { + "epoch": 0.04698133918770582, + "grad_norm": 2.2470245361328125, + "learning_rate": 4.9899625371745726e-05, + "loss": 0.5279, + "step": 856 + }, + { + "epoch": 0.04709110867178924, + "grad_norm": 3.414216995239258, + "learning_rate": 4.989915609943763e-05, + "loss": 0.4273, + "step": 858 + }, + { + "epoch": 0.04720087815587267, + "grad_norm": 1.7805373668670654, + "learning_rate": 4.989868573493032e-05, + "loss": 0.4547, + "step": 860 + }, + { + "epoch": 0.04731064763995609, + "grad_norm": 3.6561760902404785, + "learning_rate": 4.989821427824446e-05, + "loss": 0.5813, + "step": 862 + }, + { + "epoch": 0.047420417124039514, + "grad_norm": 1.321749210357666, + "learning_rate": 4.9897741729400705e-05, + "loss": 0.41, + "step": 864 + }, + { + "epoch": 0.04753018660812294, + "grad_norm": 2.347018003463745, + "learning_rate": 4.989726808841979e-05, + "loss": 0.4855, + "step": 866 + }, + { + "epoch": 0.047639956092206365, + "grad_norm": 3.531938314437866, + "learning_rate": 4.9896793355322495e-05, + "loss": 0.5547, + "step": 868 + }, + { + "epoch": 0.047749725576289793, + "grad_norm": 2.6691057682037354, + "learning_rate": 4.989631753012964e-05, + "loss": 0.4891, + "step": 870 + }, + { + "epoch": 0.047859495060373215, + "grad_norm": 1.7175266742706299, + "learning_rate": 4.989584061286211e-05, + "loss": 0.4725, + "step": 872 + }, + { + "epoch": 0.047969264544456644, + "grad_norm": 3.1910715103149414, + "learning_rate": 4.989536260354081e-05, + "loss": 0.5044, + "step": 874 + }, + { + "epoch": 0.048079034028540066, + "grad_norm": 1.5439561605453491, + "learning_rate": 4.989488350218671e-05, + "loss": 0.3565, + "step": 876 + }, + { + "epoch": 0.04818880351262349, + "grad_norm": 1.9704481363296509, + "learning_rate": 4.989440330882082e-05, + "loss": 0.4881, + "step": 878 + }, + { + "epoch": 0.048298572996706916, + "grad_norm": 1.3991576433181763, + "learning_rate": 4.9893922023464236e-05, + "loss": 0.3261, + "step": 880 + }, + { + "epoch": 0.04840834248079034, + "grad_norm": 2.343942880630493, + "learning_rate": 4.989343964613803e-05, + "loss": 0.4821, + "step": 882 + }, + { + "epoch": 0.04851811196487377, + "grad_norm": 2.795376777648926, + "learning_rate": 4.989295617686337e-05, + "loss": 0.3996, + "step": 884 + }, + { + "epoch": 0.04862788144895719, + "grad_norm": 2.294735908508301, + "learning_rate": 4.9892471615661484e-05, + "loss": 0.3819, + "step": 886 + }, + { + "epoch": 0.04873765093304062, + "grad_norm": 1.4445233345031738, + "learning_rate": 4.9891985962553606e-05, + "loss": 0.4069, + "step": 888 + }, + { + "epoch": 0.04884742041712404, + "grad_norm": 3.642369270324707, + "learning_rate": 4.989149921756105e-05, + "loss": 0.4873, + "step": 890 + }, + { + "epoch": 0.04895718990120746, + "grad_norm": 4.252140045166016, + "learning_rate": 4.989101138070516e-05, + "loss": 0.4717, + "step": 892 + }, + { + "epoch": 0.04906695938529089, + "grad_norm": 1.7390509843826294, + "learning_rate": 4.9890522452007334e-05, + "loss": 0.3344, + "step": 894 + }, + { + "epoch": 0.04917672886937431, + "grad_norm": 2.086243152618408, + "learning_rate": 4.989003243148904e-05, + "loss": 0.4864, + "step": 896 + }, + { + "epoch": 0.04928649835345774, + "grad_norm": 1.699199914932251, + "learning_rate": 4.988954131917174e-05, + "loss": 0.3843, + "step": 898 + }, + { + "epoch": 0.04939626783754116, + "grad_norm": 1.3045551776885986, + "learning_rate": 4.9889049115077005e-05, + "loss": 0.3732, + "step": 900 + }, + { + "epoch": 0.04950603732162459, + "grad_norm": 2.3170082569122314, + "learning_rate": 4.98885558192264e-05, + "loss": 0.5105, + "step": 902 + }, + { + "epoch": 0.04961580680570801, + "grad_norm": 2.9296257495880127, + "learning_rate": 4.988806143164159e-05, + "loss": 0.6904, + "step": 904 + }, + { + "epoch": 0.04972557628979144, + "grad_norm": 2.1555259227752686, + "learning_rate": 4.988756595234424e-05, + "loss": 0.5866, + "step": 906 + }, + { + "epoch": 0.04983534577387486, + "grad_norm": 2.4860827922821045, + "learning_rate": 4.9887069381356094e-05, + "loss": 0.3392, + "step": 908 + }, + { + "epoch": 0.049945115257958285, + "grad_norm": 2.0037200450897217, + "learning_rate": 4.988657171869893e-05, + "loss": 0.5268, + "step": 910 + }, + { + "epoch": 0.050054884742041714, + "grad_norm": 2.386451005935669, + "learning_rate": 4.988607296439458e-05, + "loss": 0.5155, + "step": 912 + }, + { + "epoch": 0.050164654226125135, + "grad_norm": 2.444303274154663, + "learning_rate": 4.9885573118464925e-05, + "loss": 0.4092, + "step": 914 + }, + { + "epoch": 0.050274423710208564, + "grad_norm": 1.2874751091003418, + "learning_rate": 4.988507218093189e-05, + "loss": 0.3398, + "step": 916 + }, + { + "epoch": 0.050384193194291986, + "grad_norm": 2.03812313079834, + "learning_rate": 4.988457015181743e-05, + "loss": 0.4442, + "step": 918 + }, + { + "epoch": 0.050493962678375415, + "grad_norm": 1.1444135904312134, + "learning_rate": 4.98840670311436e-05, + "loss": 0.4241, + "step": 920 + }, + { + "epoch": 0.05060373216245884, + "grad_norm": 2.143876791000366, + "learning_rate": 4.988356281893245e-05, + "loss": 0.4555, + "step": 922 + }, + { + "epoch": 0.05071350164654226, + "grad_norm": 1.726468563079834, + "learning_rate": 4.988305751520609e-05, + "loss": 0.4278, + "step": 924 + }, + { + "epoch": 0.05082327113062569, + "grad_norm": 2.237064838409424, + "learning_rate": 4.98825511199867e-05, + "loss": 0.5967, + "step": 926 + }, + { + "epoch": 0.05093304061470911, + "grad_norm": 2.720860004425049, + "learning_rate": 4.988204363329648e-05, + "loss": 0.4939, + "step": 928 + }, + { + "epoch": 0.05104281009879254, + "grad_norm": 2.4488840103149414, + "learning_rate": 4.988153505515771e-05, + "loss": 0.471, + "step": 930 + }, + { + "epoch": 0.05115257958287596, + "grad_norm": 1.7968485355377197, + "learning_rate": 4.988102538559268e-05, + "loss": 0.4472, + "step": 932 + }, + { + "epoch": 0.05126234906695939, + "grad_norm": 3.51719331741333, + "learning_rate": 4.9880514624623755e-05, + "loss": 0.4978, + "step": 934 + }, + { + "epoch": 0.05137211855104281, + "grad_norm": 1.7505348920822144, + "learning_rate": 4.988000277227334e-05, + "loss": 0.3655, + "step": 936 + }, + { + "epoch": 0.05148188803512623, + "grad_norm": 2.069267988204956, + "learning_rate": 4.987948982856388e-05, + "loss": 0.3918, + "step": 938 + }, + { + "epoch": 0.05159165751920966, + "grad_norm": 4.40700626373291, + "learning_rate": 4.987897579351788e-05, + "loss": 0.4127, + "step": 940 + }, + { + "epoch": 0.05170142700329308, + "grad_norm": 2.553508758544922, + "learning_rate": 4.9878460667157895e-05, + "loss": 0.472, + "step": 942 + }, + { + "epoch": 0.05181119648737651, + "grad_norm": 2.667774200439453, + "learning_rate": 4.987794444950651e-05, + "loss": 0.4457, + "step": 944 + }, + { + "epoch": 0.05192096597145993, + "grad_norm": 2.9341893196105957, + "learning_rate": 4.987742714058637e-05, + "loss": 0.4374, + "step": 946 + }, + { + "epoch": 0.05203073545554336, + "grad_norm": 1.716961145401001, + "learning_rate": 4.9876908740420175e-05, + "loss": 0.4229, + "step": 948 + }, + { + "epoch": 0.05214050493962678, + "grad_norm": 2.3017358779907227, + "learning_rate": 4.987638924903067e-05, + "loss": 0.4281, + "step": 950 + }, + { + "epoch": 0.052250274423710205, + "grad_norm": 2.418440818786621, + "learning_rate": 4.9875868666440604e-05, + "loss": 0.4169, + "step": 952 + }, + { + "epoch": 0.052360043907793634, + "grad_norm": 1.8574697971343994, + "learning_rate": 4.987534699267287e-05, + "loss": 0.4225, + "step": 954 + }, + { + "epoch": 0.052469813391877056, + "grad_norm": 2.2646639347076416, + "learning_rate": 4.9874824227750305e-05, + "loss": 0.3566, + "step": 956 + }, + { + "epoch": 0.052579582875960484, + "grad_norm": 1.9320656061172485, + "learning_rate": 4.9874300371695856e-05, + "loss": 0.4618, + "step": 958 + }, + { + "epoch": 0.052689352360043906, + "grad_norm": 3.4200541973114014, + "learning_rate": 4.987377542453251e-05, + "loss": 0.386, + "step": 960 + }, + { + "epoch": 0.052799121844127335, + "grad_norm": 2.0273003578186035, + "learning_rate": 4.987324938628328e-05, + "loss": 0.4622, + "step": 962 + }, + { + "epoch": 0.05290889132821076, + "grad_norm": 2.4417943954467773, + "learning_rate": 4.987272225697125e-05, + "loss": 0.5204, + "step": 964 + }, + { + "epoch": 0.053018660812294185, + "grad_norm": 3.8699522018432617, + "learning_rate": 4.9872194036619535e-05, + "loss": 0.5036, + "step": 966 + }, + { + "epoch": 0.05312843029637761, + "grad_norm": 3.033482551574707, + "learning_rate": 4.9871664725251314e-05, + "loss": 0.4003, + "step": 968 + }, + { + "epoch": 0.05323819978046103, + "grad_norm": 3.0448460578918457, + "learning_rate": 4.9871134322889804e-05, + "loss": 0.5203, + "step": 970 + }, + { + "epoch": 0.05334796926454446, + "grad_norm": 3.0527617931365967, + "learning_rate": 4.987060282955826e-05, + "loss": 0.3981, + "step": 972 + }, + { + "epoch": 0.05345773874862788, + "grad_norm": 3.8588778972625732, + "learning_rate": 4.9870070245280006e-05, + "loss": 0.5631, + "step": 974 + }, + { + "epoch": 0.05356750823271131, + "grad_norm": 3.273505687713623, + "learning_rate": 4.986953657007841e-05, + "loss": 0.6616, + "step": 976 + }, + { + "epoch": 0.05367727771679473, + "grad_norm": 2.5195374488830566, + "learning_rate": 4.986900180397686e-05, + "loss": 0.4722, + "step": 978 + }, + { + "epoch": 0.05378704720087816, + "grad_norm": 4.51970911026001, + "learning_rate": 4.986846594699883e-05, + "loss": 0.5271, + "step": 980 + }, + { + "epoch": 0.05389681668496158, + "grad_norm": 3.307293653488159, + "learning_rate": 4.986792899916784e-05, + "loss": 0.4359, + "step": 982 + }, + { + "epoch": 0.054006586169045, + "grad_norm": 3.32153058052063, + "learning_rate": 4.98673909605074e-05, + "loss": 0.4563, + "step": 984 + }, + { + "epoch": 0.05411635565312843, + "grad_norm": 2.266650676727295, + "learning_rate": 4.986685183104115e-05, + "loss": 0.5171, + "step": 986 + }, + { + "epoch": 0.05422612513721185, + "grad_norm": 3.3576607704162598, + "learning_rate": 4.986631161079272e-05, + "loss": 0.5171, + "step": 988 + }, + { + "epoch": 0.05433589462129528, + "grad_norm": 3.8039710521698, + "learning_rate": 4.986577029978581e-05, + "loss": 0.4811, + "step": 990 + }, + { + "epoch": 0.054445664105378704, + "grad_norm": 3.7508862018585205, + "learning_rate": 4.986522789804417e-05, + "loss": 0.3627, + "step": 992 + }, + { + "epoch": 0.05455543358946213, + "grad_norm": 1.870471715927124, + "learning_rate": 4.986468440559159e-05, + "loss": 0.4512, + "step": 994 + }, + { + "epoch": 0.054665203073545554, + "grad_norm": 1.8001744747161865, + "learning_rate": 4.9864139822451905e-05, + "loss": 0.3729, + "step": 996 + }, + { + "epoch": 0.054774972557628976, + "grad_norm": 1.435326099395752, + "learning_rate": 4.986359414864901e-05, + "loss": 0.5323, + "step": 998 + }, + { + "epoch": 0.054884742041712405, + "grad_norm": 2.5657951831817627, + "learning_rate": 4.9863047384206835e-05, + "loss": 0.321, + "step": 1000 + }, + { + "epoch": 0.054994511525795826, + "grad_norm": 2.272712469100952, + "learning_rate": 4.986249952914937e-05, + "loss": 0.5539, + "step": 1002 + }, + { + "epoch": 0.055104281009879255, + "grad_norm": 2.7366015911102295, + "learning_rate": 4.9861950583500636e-05, + "loss": 0.4787, + "step": 1004 + }, + { + "epoch": 0.05521405049396268, + "grad_norm": 1.9208272695541382, + "learning_rate": 4.986140054728473e-05, + "loss": 0.5802, + "step": 1006 + }, + { + "epoch": 0.055323819978046106, + "grad_norm": 1.8958157300949097, + "learning_rate": 4.9860849420525766e-05, + "loss": 0.4817, + "step": 1008 + }, + { + "epoch": 0.05543358946212953, + "grad_norm": 2.572070360183716, + "learning_rate": 4.986029720324792e-05, + "loss": 0.4915, + "step": 1010 + }, + { + "epoch": 0.055543358946212956, + "grad_norm": 2.1066503524780273, + "learning_rate": 4.9859743895475416e-05, + "loss": 0.417, + "step": 1012 + }, + { + "epoch": 0.05565312843029638, + "grad_norm": 1.6408382654190063, + "learning_rate": 4.9859189497232526e-05, + "loss": 0.4392, + "step": 1014 + }, + { + "epoch": 0.0557628979143798, + "grad_norm": 1.2843135595321655, + "learning_rate": 4.985863400854358e-05, + "loss": 0.3992, + "step": 1016 + }, + { + "epoch": 0.05587266739846323, + "grad_norm": 1.6135112047195435, + "learning_rate": 4.985807742943292e-05, + "loss": 0.4733, + "step": 1018 + }, + { + "epoch": 0.05598243688254665, + "grad_norm": 3.258601665496826, + "learning_rate": 4.9857519759924974e-05, + "loss": 0.4992, + "step": 1020 + }, + { + "epoch": 0.05609220636663008, + "grad_norm": 2.6568429470062256, + "learning_rate": 4.985696100004421e-05, + "loss": 0.4453, + "step": 1022 + }, + { + "epoch": 0.0562019758507135, + "grad_norm": 2.810068130493164, + "learning_rate": 4.9856401149815126e-05, + "loss": 0.4662, + "step": 1024 + }, + { + "epoch": 0.05631174533479693, + "grad_norm": 2.299020767211914, + "learning_rate": 4.985584020926228e-05, + "loss": 0.6255, + "step": 1026 + }, + { + "epoch": 0.05642151481888035, + "grad_norm": 2.3658361434936523, + "learning_rate": 4.985527817841029e-05, + "loss": 0.4536, + "step": 1028 + }, + { + "epoch": 0.05653128430296377, + "grad_norm": 1.7822438478469849, + "learning_rate": 4.9854715057283805e-05, + "loss": 0.4564, + "step": 1030 + }, + { + "epoch": 0.0566410537870472, + "grad_norm": 2.9134278297424316, + "learning_rate": 4.985415084590752e-05, + "loss": 0.4654, + "step": 1032 + }, + { + "epoch": 0.056750823271130624, + "grad_norm": 2.480680227279663, + "learning_rate": 4.985358554430619e-05, + "loss": 0.4526, + "step": 1034 + }, + { + "epoch": 0.05686059275521405, + "grad_norm": 2.8261044025421143, + "learning_rate": 4.9853019152504607e-05, + "loss": 0.6284, + "step": 1036 + }, + { + "epoch": 0.056970362239297474, + "grad_norm": 1.2583746910095215, + "learning_rate": 4.985245167052762e-05, + "loss": 0.3414, + "step": 1038 + }, + { + "epoch": 0.0570801317233809, + "grad_norm": 1.9130700826644897, + "learning_rate": 4.985188309840012e-05, + "loss": 0.3713, + "step": 1040 + }, + { + "epoch": 0.057189901207464325, + "grad_norm": 2.6297409534454346, + "learning_rate": 4.985131343614704e-05, + "loss": 0.4876, + "step": 1042 + }, + { + "epoch": 0.05729967069154775, + "grad_norm": 3.5088694095611572, + "learning_rate": 4.985074268379338e-05, + "loss": 0.498, + "step": 1044 + }, + { + "epoch": 0.057409440175631175, + "grad_norm": 1.611140251159668, + "learning_rate": 4.985017084136417e-05, + "loss": 0.3649, + "step": 1046 + }, + { + "epoch": 0.0575192096597146, + "grad_norm": 2.760319232940674, + "learning_rate": 4.98495979088845e-05, + "loss": 0.5427, + "step": 1048 + }, + { + "epoch": 0.057628979143798026, + "grad_norm": 1.729722023010254, + "learning_rate": 4.98490238863795e-05, + "loss": 0.3923, + "step": 1050 + }, + { + "epoch": 0.05773874862788145, + "grad_norm": 3.4703452587127686, + "learning_rate": 4.984844877387433e-05, + "loss": 0.5224, + "step": 1052 + }, + { + "epoch": 0.057848518111964876, + "grad_norm": 2.051114559173584, + "learning_rate": 4.984787257139425e-05, + "loss": 0.5327, + "step": 1054 + }, + { + "epoch": 0.0579582875960483, + "grad_norm": 2.578420400619507, + "learning_rate": 4.9847295278964514e-05, + "loss": 0.5271, + "step": 1056 + }, + { + "epoch": 0.05806805708013172, + "grad_norm": 4.609646320343018, + "learning_rate": 4.9846716896610445e-05, + "loss": 0.638, + "step": 1058 + }, + { + "epoch": 0.05817782656421515, + "grad_norm": 4.527503490447998, + "learning_rate": 4.984613742435742e-05, + "loss": 0.4914, + "step": 1060 + }, + { + "epoch": 0.05828759604829857, + "grad_norm": 3.7712674140930176, + "learning_rate": 4.9845556862230855e-05, + "loss": 0.3824, + "step": 1062 + }, + { + "epoch": 0.058397365532382, + "grad_norm": 1.6237224340438843, + "learning_rate": 4.9844975210256217e-05, + "loss": 0.3141, + "step": 1064 + }, + { + "epoch": 0.05850713501646542, + "grad_norm": 7.220519542694092, + "learning_rate": 4.984439246845902e-05, + "loss": 0.3952, + "step": 1066 + }, + { + "epoch": 0.05861690450054885, + "grad_norm": 1.7830177545547485, + "learning_rate": 4.984380863686482e-05, + "loss": 0.5049, + "step": 1068 + }, + { + "epoch": 0.05872667398463227, + "grad_norm": 2.961592435836792, + "learning_rate": 4.984322371549924e-05, + "loss": 0.5032, + "step": 1070 + }, + { + "epoch": 0.0588364434687157, + "grad_norm": 1.9592653512954712, + "learning_rate": 4.984263770438793e-05, + "loss": 0.4292, + "step": 1072 + }, + { + "epoch": 0.05894621295279912, + "grad_norm": 3.26139497756958, + "learning_rate": 4.98420506035566e-05, + "loss": 0.5145, + "step": 1074 + }, + { + "epoch": 0.059055982436882544, + "grad_norm": 2.5563547611236572, + "learning_rate": 4.9841462413030995e-05, + "loss": 0.6342, + "step": 1076 + }, + { + "epoch": 0.05916575192096597, + "grad_norm": 1.606816053390503, + "learning_rate": 4.984087313283691e-05, + "loss": 0.2775, + "step": 1078 + }, + { + "epoch": 0.059275521405049394, + "grad_norm": 2.758261203765869, + "learning_rate": 4.984028276300021e-05, + "loss": 0.5364, + "step": 1080 + }, + { + "epoch": 0.05938529088913282, + "grad_norm": 2.3764030933380127, + "learning_rate": 4.9839691303546785e-05, + "loss": 0.4672, + "step": 1082 + }, + { + "epoch": 0.059495060373216245, + "grad_norm": 1.5163699388504028, + "learning_rate": 4.983909875450258e-05, + "loss": 0.3764, + "step": 1084 + }, + { + "epoch": 0.059604829857299674, + "grad_norm": 1.1406607627868652, + "learning_rate": 4.983850511589358e-05, + "loss": 0.3226, + "step": 1086 + }, + { + "epoch": 0.059714599341383096, + "grad_norm": 3.722994089126587, + "learning_rate": 4.9837910387745845e-05, + "loss": 0.4306, + "step": 1088 + }, + { + "epoch": 0.05982436882546652, + "grad_norm": 1.8437087535858154, + "learning_rate": 4.983731457008544e-05, + "loss": 0.4252, + "step": 1090 + }, + { + "epoch": 0.059934138309549946, + "grad_norm": 1.8271889686584473, + "learning_rate": 4.983671766293851e-05, + "loss": 0.5323, + "step": 1092 + }, + { + "epoch": 0.06004390779363337, + "grad_norm": 2.358571767807007, + "learning_rate": 4.9836119666331235e-05, + "loss": 0.4096, + "step": 1094 + }, + { + "epoch": 0.0601536772777168, + "grad_norm": 1.783514380455017, + "learning_rate": 4.9835520580289854e-05, + "loss": 0.3438, + "step": 1096 + }, + { + "epoch": 0.06026344676180022, + "grad_norm": 3.498642921447754, + "learning_rate": 4.983492040484064e-05, + "loss": 0.4575, + "step": 1098 + }, + { + "epoch": 0.06037321624588365, + "grad_norm": 2.7874975204467773, + "learning_rate": 4.983431914000991e-05, + "loss": 0.5707, + "step": 1100 + }, + { + "epoch": 0.06048298572996707, + "grad_norm": 1.6899256706237793, + "learning_rate": 4.983371678582406e-05, + "loss": 0.4684, + "step": 1102 + }, + { + "epoch": 0.06059275521405049, + "grad_norm": 2.1684889793395996, + "learning_rate": 4.98331133423095e-05, + "loss": 0.49, + "step": 1104 + }, + { + "epoch": 0.06070252469813392, + "grad_norm": 2.0974478721618652, + "learning_rate": 4.9832508809492694e-05, + "loss": 0.3741, + "step": 1106 + }, + { + "epoch": 0.06081229418221734, + "grad_norm": 2.3188464641571045, + "learning_rate": 4.9831903187400166e-05, + "loss": 0.3327, + "step": 1108 + }, + { + "epoch": 0.06092206366630077, + "grad_norm": 1.731326699256897, + "learning_rate": 4.9831296476058484e-05, + "loss": 0.426, + "step": 1110 + }, + { + "epoch": 0.06103183315038419, + "grad_norm": 2.965094566345215, + "learning_rate": 4.9830688675494265e-05, + "loss": 0.5083, + "step": 1112 + }, + { + "epoch": 0.06114160263446762, + "grad_norm": 1.8496700525283813, + "learning_rate": 4.983007978573416e-05, + "loss": 0.3974, + "step": 1114 + }, + { + "epoch": 0.06125137211855104, + "grad_norm": 2.4411020278930664, + "learning_rate": 4.982946980680488e-05, + "loss": 0.3526, + "step": 1116 + }, + { + "epoch": 0.061361141602634464, + "grad_norm": 4.100210189819336, + "learning_rate": 4.982885873873319e-05, + "loss": 0.4249, + "step": 1118 + }, + { + "epoch": 0.06147091108671789, + "grad_norm": 2.256807804107666, + "learning_rate": 4.982824658154589e-05, + "loss": 0.4488, + "step": 1120 + }, + { + "epoch": 0.061580680570801315, + "grad_norm": 4.486173152923584, + "learning_rate": 4.982763333526982e-05, + "loss": 0.4881, + "step": 1122 + }, + { + "epoch": 0.06169045005488474, + "grad_norm": 2.0042433738708496, + "learning_rate": 4.982701899993189e-05, + "loss": 0.4491, + "step": 1124 + }, + { + "epoch": 0.061800219538968165, + "grad_norm": 2.1326398849487305, + "learning_rate": 4.982640357555907e-05, + "loss": 0.4606, + "step": 1126 + }, + { + "epoch": 0.061909989023051594, + "grad_norm": 1.7117959260940552, + "learning_rate": 4.9825787062178315e-05, + "loss": 0.4492, + "step": 1128 + }, + { + "epoch": 0.062019758507135016, + "grad_norm": 3.7019777297973633, + "learning_rate": 4.982516945981669e-05, + "loss": 0.2908, + "step": 1130 + }, + { + "epoch": 0.062129527991218444, + "grad_norm": 2.5198380947113037, + "learning_rate": 4.982455076850129e-05, + "loss": 0.4618, + "step": 1132 + }, + { + "epoch": 0.062239297475301866, + "grad_norm": 1.8583816289901733, + "learning_rate": 4.9823930988259236e-05, + "loss": 0.4599, + "step": 1134 + }, + { + "epoch": 0.06234906695938529, + "grad_norm": 1.9440760612487793, + "learning_rate": 4.982331011911774e-05, + "loss": 0.396, + "step": 1136 + }, + { + "epoch": 0.06245883644346872, + "grad_norm": 1.8765228986740112, + "learning_rate": 4.982268816110401e-05, + "loss": 0.5192, + "step": 1138 + }, + { + "epoch": 0.06256860592755215, + "grad_norm": 1.9601404666900635, + "learning_rate": 4.982206511424534e-05, + "loss": 0.4606, + "step": 1140 + }, + { + "epoch": 0.06267837541163557, + "grad_norm": 2.6527223587036133, + "learning_rate": 4.9821440978569066e-05, + "loss": 0.4129, + "step": 1142 + }, + { + "epoch": 0.06278814489571899, + "grad_norm": 2.724247932434082, + "learning_rate": 4.982081575410256e-05, + "loss": 0.3928, + "step": 1144 + }, + { + "epoch": 0.06289791437980241, + "grad_norm": 2.3376028537750244, + "learning_rate": 4.982018944087325e-05, + "loss": 0.4603, + "step": 1146 + }, + { + "epoch": 0.06300768386388585, + "grad_norm": 2.12777042388916, + "learning_rate": 4.98195620389086e-05, + "loss": 0.4027, + "step": 1148 + }, + { + "epoch": 0.06311745334796927, + "grad_norm": 2.1916894912719727, + "learning_rate": 4.981893354823614e-05, + "loss": 0.5265, + "step": 1150 + }, + { + "epoch": 0.06322722283205269, + "grad_norm": 2.466592311859131, + "learning_rate": 4.981830396888344e-05, + "loss": 0.3878, + "step": 1152 + }, + { + "epoch": 0.06333699231613611, + "grad_norm": 1.8875963687896729, + "learning_rate": 4.981767330087811e-05, + "loss": 0.5095, + "step": 1154 + }, + { + "epoch": 0.06344676180021953, + "grad_norm": 4.612142562866211, + "learning_rate": 4.981704154424781e-05, + "loss": 0.4232, + "step": 1156 + }, + { + "epoch": 0.06355653128430297, + "grad_norm": 3.304377555847168, + "learning_rate": 4.981640869902027e-05, + "loss": 0.3963, + "step": 1158 + }, + { + "epoch": 0.06366630076838639, + "grad_norm": 3.227165937423706, + "learning_rate": 4.9815774765223226e-05, + "loss": 0.4422, + "step": 1160 + }, + { + "epoch": 0.06377607025246981, + "grad_norm": 2.4671239852905273, + "learning_rate": 4.981513974288451e-05, + "loss": 0.6368, + "step": 1162 + }, + { + "epoch": 0.06388583973655323, + "grad_norm": 1.9955382347106934, + "learning_rate": 4.9814503632031954e-05, + "loss": 0.6109, + "step": 1164 + }, + { + "epoch": 0.06399560922063666, + "grad_norm": 1.807688593864441, + "learning_rate": 4.981386643269348e-05, + "loss": 0.4232, + "step": 1166 + }, + { + "epoch": 0.06410537870472009, + "grad_norm": 1.7031397819519043, + "learning_rate": 4.981322814489703e-05, + "loss": 0.5035, + "step": 1168 + }, + { + "epoch": 0.06421514818880351, + "grad_norm": 2.2626101970672607, + "learning_rate": 4.98125887686706e-05, + "loss": 0.4632, + "step": 1170 + }, + { + "epoch": 0.06432491767288694, + "grad_norm": 2.9094724655151367, + "learning_rate": 4.9811948304042234e-05, + "loss": 0.5155, + "step": 1172 + }, + { + "epoch": 0.06443468715697036, + "grad_norm": 2.112231492996216, + "learning_rate": 4.9811306751040037e-05, + "loss": 0.3726, + "step": 1174 + }, + { + "epoch": 0.0645444566410538, + "grad_norm": 1.8627220392227173, + "learning_rate": 4.981066410969215e-05, + "loss": 0.3822, + "step": 1176 + }, + { + "epoch": 0.06465422612513722, + "grad_norm": 1.8811994791030884, + "learning_rate": 4.981002038002675e-05, + "loss": 0.527, + "step": 1178 + }, + { + "epoch": 0.06476399560922064, + "grad_norm": 1.9095158576965332, + "learning_rate": 4.980937556207208e-05, + "loss": 0.3965, + "step": 1180 + }, + { + "epoch": 0.06487376509330406, + "grad_norm": 2.149390697479248, + "learning_rate": 4.9808729655856434e-05, + "loss": 0.5163, + "step": 1182 + }, + { + "epoch": 0.06498353457738748, + "grad_norm": 1.8205047845840454, + "learning_rate": 4.980808266140813e-05, + "loss": 0.4399, + "step": 1184 + }, + { + "epoch": 0.06509330406147092, + "grad_norm": 1.8267725706100464, + "learning_rate": 4.980743457875556e-05, + "loss": 0.3473, + "step": 1186 + }, + { + "epoch": 0.06520307354555434, + "grad_norm": 3.346379280090332, + "learning_rate": 4.980678540792715e-05, + "loss": 0.4547, + "step": 1188 + }, + { + "epoch": 0.06531284302963776, + "grad_norm": 1.982531189918518, + "learning_rate": 4.980613514895136e-05, + "loss": 0.4273, + "step": 1190 + }, + { + "epoch": 0.06542261251372118, + "grad_norm": 5.309198379516602, + "learning_rate": 4.980548380185674e-05, + "loss": 0.4724, + "step": 1192 + }, + { + "epoch": 0.0655323819978046, + "grad_norm": 3.4168004989624023, + "learning_rate": 4.980483136667185e-05, + "loss": 0.4338, + "step": 1194 + }, + { + "epoch": 0.06564215148188804, + "grad_norm": 1.6829313039779663, + "learning_rate": 4.9804177843425295e-05, + "loss": 0.4968, + "step": 1196 + }, + { + "epoch": 0.06575192096597146, + "grad_norm": 1.1707305908203125, + "learning_rate": 4.980352323214575e-05, + "loss": 0.4698, + "step": 1198 + }, + { + "epoch": 0.06586169045005488, + "grad_norm": 1.8975781202316284, + "learning_rate": 4.980286753286195e-05, + "loss": 0.4502, + "step": 1200 + }, + { + "epoch": 0.0659714599341383, + "grad_norm": 2.4474830627441406, + "learning_rate": 4.980221074560263e-05, + "loss": 0.5496, + "step": 1202 + }, + { + "epoch": 0.06608122941822174, + "grad_norm": 1.9411165714263916, + "learning_rate": 4.980155287039662e-05, + "loss": 0.4918, + "step": 1204 + }, + { + "epoch": 0.06619099890230516, + "grad_norm": 1.198267936706543, + "learning_rate": 4.980089390727275e-05, + "loss": 0.3033, + "step": 1206 + }, + { + "epoch": 0.06630076838638858, + "grad_norm": 1.876011610031128, + "learning_rate": 4.980023385625996e-05, + "loss": 0.4611, + "step": 1208 + }, + { + "epoch": 0.066410537870472, + "grad_norm": 1.5111809968948364, + "learning_rate": 4.9799572717387175e-05, + "loss": 0.3518, + "step": 1210 + }, + { + "epoch": 0.06652030735455543, + "grad_norm": 1.7499897480010986, + "learning_rate": 4.979891049068342e-05, + "loss": 0.3699, + "step": 1212 + }, + { + "epoch": 0.06663007683863886, + "grad_norm": 2.6900734901428223, + "learning_rate": 4.979824717617771e-05, + "loss": 0.6581, + "step": 1214 + }, + { + "epoch": 0.06673984632272228, + "grad_norm": 2.686737298965454, + "learning_rate": 4.979758277389919e-05, + "loss": 0.4102, + "step": 1216 + }, + { + "epoch": 0.0668496158068057, + "grad_norm": 2.3781299591064453, + "learning_rate": 4.979691728387696e-05, + "loss": 0.48, + "step": 1218 + }, + { + "epoch": 0.06695938529088913, + "grad_norm": 1.7419235706329346, + "learning_rate": 4.9796250706140224e-05, + "loss": 0.4931, + "step": 1220 + }, + { + "epoch": 0.06706915477497256, + "grad_norm": 1.4593462944030762, + "learning_rate": 4.979558304071823e-05, + "loss": 0.3789, + "step": 1222 + }, + { + "epoch": 0.06717892425905599, + "grad_norm": 3.8548173904418945, + "learning_rate": 4.979491428764026e-05, + "loss": 0.5627, + "step": 1224 + }, + { + "epoch": 0.06728869374313941, + "grad_norm": 2.1368348598480225, + "learning_rate": 4.9794244446935646e-05, + "loss": 0.5195, + "step": 1226 + }, + { + "epoch": 0.06739846322722283, + "grad_norm": 1.799569010734558, + "learning_rate": 4.979357351863377e-05, + "loss": 0.4831, + "step": 1228 + }, + { + "epoch": 0.06750823271130625, + "grad_norm": 1.6860439777374268, + "learning_rate": 4.9792901502764075e-05, + "loss": 0.3981, + "step": 1230 + }, + { + "epoch": 0.06761800219538969, + "grad_norm": 2.2666032314300537, + "learning_rate": 4.979222839935602e-05, + "loss": 0.3913, + "step": 1232 + }, + { + "epoch": 0.06772777167947311, + "grad_norm": 3.2870471477508545, + "learning_rate": 4.979155420843915e-05, + "loss": 0.4863, + "step": 1234 + }, + { + "epoch": 0.06783754116355653, + "grad_norm": 1.8008981943130493, + "learning_rate": 4.979087893004302e-05, + "loss": 0.5103, + "step": 1236 + }, + { + "epoch": 0.06794731064763995, + "grad_norm": 1.6409200429916382, + "learning_rate": 4.9790202564197264e-05, + "loss": 0.3363, + "step": 1238 + }, + { + "epoch": 0.06805708013172337, + "grad_norm": 1.7395111322402954, + "learning_rate": 4.9789525110931545e-05, + "loss": 0.3635, + "step": 1240 + }, + { + "epoch": 0.06816684961580681, + "grad_norm": 2.173182725906372, + "learning_rate": 4.978884657027558e-05, + "loss": 0.4143, + "step": 1242 + }, + { + "epoch": 0.06827661909989023, + "grad_norm": 2.09313702583313, + "learning_rate": 4.9788166942259135e-05, + "loss": 0.4964, + "step": 1244 + }, + { + "epoch": 0.06838638858397365, + "grad_norm": 1.7448314428329468, + "learning_rate": 4.9787486226912014e-05, + "loss": 0.4115, + "step": 1246 + }, + { + "epoch": 0.06849615806805708, + "grad_norm": 1.5633306503295898, + "learning_rate": 4.9786804424264085e-05, + "loss": 0.3978, + "step": 1248 + }, + { + "epoch": 0.06860592755214051, + "grad_norm": 1.4984726905822754, + "learning_rate": 4.9786121534345265e-05, + "loss": 0.3653, + "step": 1250 + }, + { + "epoch": 0.06871569703622393, + "grad_norm": 1.5834602117538452, + "learning_rate": 4.978543755718549e-05, + "loss": 0.446, + "step": 1252 + }, + { + "epoch": 0.06882546652030735, + "grad_norm": 1.9671735763549805, + "learning_rate": 4.978475249281477e-05, + "loss": 0.501, + "step": 1254 + }, + { + "epoch": 0.06893523600439078, + "grad_norm": 2.863701105117798, + "learning_rate": 4.978406634126315e-05, + "loss": 0.4077, + "step": 1256 + }, + { + "epoch": 0.0690450054884742, + "grad_norm": 1.948574185371399, + "learning_rate": 4.978337910256073e-05, + "loss": 0.4724, + "step": 1258 + }, + { + "epoch": 0.06915477497255763, + "grad_norm": 1.5324403047561646, + "learning_rate": 4.978269077673767e-05, + "loss": 0.4444, + "step": 1260 + }, + { + "epoch": 0.06926454445664106, + "grad_norm": 3.6917941570281982, + "learning_rate": 4.9782001363824146e-05, + "loss": 0.4385, + "step": 1262 + }, + { + "epoch": 0.06937431394072448, + "grad_norm": 2.8190650939941406, + "learning_rate": 4.9781310863850405e-05, + "loss": 0.448, + "step": 1264 + }, + { + "epoch": 0.0694840834248079, + "grad_norm": 1.62575364112854, + "learning_rate": 4.978061927684673e-05, + "loss": 0.3259, + "step": 1266 + }, + { + "epoch": 0.06959385290889133, + "grad_norm": 1.7630960941314697, + "learning_rate": 4.977992660284347e-05, + "loss": 0.3859, + "step": 1268 + }, + { + "epoch": 0.06970362239297476, + "grad_norm": 3.1869819164276123, + "learning_rate": 4.977923284187101e-05, + "loss": 0.4117, + "step": 1270 + }, + { + "epoch": 0.06981339187705818, + "grad_norm": 3.3737900257110596, + "learning_rate": 4.977853799395976e-05, + "loss": 0.6535, + "step": 1272 + }, + { + "epoch": 0.0699231613611416, + "grad_norm": 3.231808662414551, + "learning_rate": 4.977784205914022e-05, + "loss": 0.5492, + "step": 1274 + }, + { + "epoch": 0.07003293084522502, + "grad_norm": 1.8682773113250732, + "learning_rate": 4.9777145037442906e-05, + "loss": 0.3861, + "step": 1276 + }, + { + "epoch": 0.07014270032930846, + "grad_norm": 1.1819719076156616, + "learning_rate": 4.97764469288984e-05, + "loss": 0.5512, + "step": 1278 + }, + { + "epoch": 0.07025246981339188, + "grad_norm": 1.6344634294509888, + "learning_rate": 4.977574773353732e-05, + "loss": 0.3661, + "step": 1280 + }, + { + "epoch": 0.0703622392974753, + "grad_norm": 2.0616872310638428, + "learning_rate": 4.977504745139034e-05, + "loss": 0.3494, + "step": 1282 + }, + { + "epoch": 0.07047200878155872, + "grad_norm": 2.048205614089966, + "learning_rate": 4.9774346082488176e-05, + "loss": 0.5412, + "step": 1284 + }, + { + "epoch": 0.07058177826564214, + "grad_norm": 1.3545513153076172, + "learning_rate": 4.977364362686159e-05, + "loss": 0.3526, + "step": 1286 + }, + { + "epoch": 0.07069154774972558, + "grad_norm": 2.6548078060150146, + "learning_rate": 4.9772940084541405e-05, + "loss": 0.5047, + "step": 1288 + }, + { + "epoch": 0.070801317233809, + "grad_norm": 1.8332488536834717, + "learning_rate": 4.977223545555847e-05, + "loss": 0.399, + "step": 1290 + }, + { + "epoch": 0.07091108671789242, + "grad_norm": 2.047394037246704, + "learning_rate": 4.97715297399437e-05, + "loss": 0.3638, + "step": 1292 + }, + { + "epoch": 0.07102085620197585, + "grad_norm": 1.642471432685852, + "learning_rate": 4.9770822937728046e-05, + "loss": 0.4132, + "step": 1294 + }, + { + "epoch": 0.07113062568605928, + "grad_norm": 2.076873779296875, + "learning_rate": 4.977011504894252e-05, + "loss": 0.4648, + "step": 1296 + }, + { + "epoch": 0.0712403951701427, + "grad_norm": 1.7554806470870972, + "learning_rate": 4.9769406073618176e-05, + "loss": 0.4357, + "step": 1298 + }, + { + "epoch": 0.07135016465422613, + "grad_norm": 2.143167495727539, + "learning_rate": 4.976869601178609e-05, + "loss": 0.4855, + "step": 1300 + }, + { + "epoch": 0.07145993413830955, + "grad_norm": 2.0658464431762695, + "learning_rate": 4.976798486347743e-05, + "loss": 0.4395, + "step": 1302 + }, + { + "epoch": 0.07156970362239297, + "grad_norm": 2.847214937210083, + "learning_rate": 4.9767272628723396e-05, + "loss": 0.4859, + "step": 1304 + }, + { + "epoch": 0.0716794731064764, + "grad_norm": 3.35766863822937, + "learning_rate": 4.976655930755521e-05, + "loss": 0.4316, + "step": 1306 + }, + { + "epoch": 0.07178924259055983, + "grad_norm": 2.068476676940918, + "learning_rate": 4.9765844900004176e-05, + "loss": 0.5039, + "step": 1308 + }, + { + "epoch": 0.07189901207464325, + "grad_norm": 2.455807685852051, + "learning_rate": 4.976512940610163e-05, + "loss": 0.5164, + "step": 1310 + }, + { + "epoch": 0.07200878155872667, + "grad_norm": 2.29729962348938, + "learning_rate": 4.9764412825878943e-05, + "loss": 0.3241, + "step": 1312 + }, + { + "epoch": 0.0721185510428101, + "grad_norm": 1.607513189315796, + "learning_rate": 4.976369515936756e-05, + "loss": 0.3833, + "step": 1314 + }, + { + "epoch": 0.07222832052689353, + "grad_norm": 3.2942299842834473, + "learning_rate": 4.976297640659897e-05, + "loss": 0.4495, + "step": 1316 + }, + { + "epoch": 0.07233809001097695, + "grad_norm": 2.184696912765503, + "learning_rate": 4.976225656760468e-05, + "loss": 0.4374, + "step": 1318 + }, + { + "epoch": 0.07244785949506037, + "grad_norm": 2.0553860664367676, + "learning_rate": 4.976153564241628e-05, + "loss": 0.4853, + "step": 1320 + }, + { + "epoch": 0.07255762897914379, + "grad_norm": 3.0093307495117188, + "learning_rate": 4.976081363106539e-05, + "loss": 0.5408, + "step": 1322 + }, + { + "epoch": 0.07266739846322723, + "grad_norm": 4.46053409576416, + "learning_rate": 4.9760090533583686e-05, + "loss": 0.5705, + "step": 1324 + }, + { + "epoch": 0.07277716794731065, + "grad_norm": 2.160020589828491, + "learning_rate": 4.975936635000288e-05, + "loss": 0.4394, + "step": 1326 + }, + { + "epoch": 0.07288693743139407, + "grad_norm": 2.6854352951049805, + "learning_rate": 4.975864108035474e-05, + "loss": 0.4479, + "step": 1328 + }, + { + "epoch": 0.0729967069154775, + "grad_norm": 2.377037525177002, + "learning_rate": 4.9757914724671074e-05, + "loss": 0.4973, + "step": 1330 + }, + { + "epoch": 0.07310647639956092, + "grad_norm": 3.012934684753418, + "learning_rate": 4.975718728298375e-05, + "loss": 0.4783, + "step": 1332 + }, + { + "epoch": 0.07321624588364435, + "grad_norm": 1.603089451789856, + "learning_rate": 4.975645875532468e-05, + "loss": 0.4771, + "step": 1334 + }, + { + "epoch": 0.07332601536772777, + "grad_norm": 3.4755430221557617, + "learning_rate": 4.975572914172582e-05, + "loss": 0.4505, + "step": 1336 + }, + { + "epoch": 0.0734357848518112, + "grad_norm": 1.8746482133865356, + "learning_rate": 4.9754998442219166e-05, + "loss": 0.4, + "step": 1338 + }, + { + "epoch": 0.07354555433589462, + "grad_norm": 1.473403811454773, + "learning_rate": 4.975426665683678e-05, + "loss": 0.3476, + "step": 1340 + }, + { + "epoch": 0.07365532381997805, + "grad_norm": 3.7610507011413574, + "learning_rate": 4.9753533785610754e-05, + "loss": 0.5882, + "step": 1342 + }, + { + "epoch": 0.07376509330406147, + "grad_norm": 2.545409917831421, + "learning_rate": 4.975279982857324e-05, + "loss": 0.3901, + "step": 1344 + }, + { + "epoch": 0.0738748627881449, + "grad_norm": 4.424686431884766, + "learning_rate": 4.9752064785756425e-05, + "loss": 0.4014, + "step": 1346 + }, + { + "epoch": 0.07398463227222832, + "grad_norm": 2.0439844131469727, + "learning_rate": 4.9751328657192565e-05, + "loss": 0.6046, + "step": 1348 + }, + { + "epoch": 0.07409440175631174, + "grad_norm": 1.6646015644073486, + "learning_rate": 4.975059144291394e-05, + "loss": 0.3263, + "step": 1350 + }, + { + "epoch": 0.07420417124039518, + "grad_norm": 1.8253958225250244, + "learning_rate": 4.97498531429529e-05, + "loss": 0.4486, + "step": 1352 + }, + { + "epoch": 0.0743139407244786, + "grad_norm": 2.27825927734375, + "learning_rate": 4.974911375734181e-05, + "loss": 0.4548, + "step": 1354 + }, + { + "epoch": 0.07442371020856202, + "grad_norm": 1.9971617460250854, + "learning_rate": 4.974837328611312e-05, + "loss": 0.375, + "step": 1356 + }, + { + "epoch": 0.07453347969264544, + "grad_norm": 2.275268316268921, + "learning_rate": 4.974763172929931e-05, + "loss": 0.447, + "step": 1358 + }, + { + "epoch": 0.07464324917672886, + "grad_norm": 3.1994991302490234, + "learning_rate": 4.9746889086932895e-05, + "loss": 0.4338, + "step": 1360 + }, + { + "epoch": 0.0747530186608123, + "grad_norm": 1.797350287437439, + "learning_rate": 4.974614535904646e-05, + "loss": 0.4615, + "step": 1362 + }, + { + "epoch": 0.07486278814489572, + "grad_norm": 2.0760128498077393, + "learning_rate": 4.974540054567264e-05, + "loss": 0.5006, + "step": 1364 + }, + { + "epoch": 0.07497255762897914, + "grad_norm": 1.2099391222000122, + "learning_rate": 4.974465464684409e-05, + "loss": 0.3643, + "step": 1366 + }, + { + "epoch": 0.07508232711306256, + "grad_norm": 1.3272854089736938, + "learning_rate": 4.9743907662593524e-05, + "loss": 0.5253, + "step": 1368 + }, + { + "epoch": 0.075192096597146, + "grad_norm": 1.445914626121521, + "learning_rate": 4.974315959295373e-05, + "loss": 0.4525, + "step": 1370 + }, + { + "epoch": 0.07530186608122942, + "grad_norm": 2.3311679363250732, + "learning_rate": 4.97424104379575e-05, + "loss": 0.4419, + "step": 1372 + }, + { + "epoch": 0.07541163556531284, + "grad_norm": 4.13897705078125, + "learning_rate": 4.9741660197637705e-05, + "loss": 0.3583, + "step": 1374 + }, + { + "epoch": 0.07552140504939626, + "grad_norm": 2.3738443851470947, + "learning_rate": 4.974090887202726e-05, + "loss": 0.4169, + "step": 1376 + }, + { + "epoch": 0.07563117453347969, + "grad_norm": 2.2207579612731934, + "learning_rate": 4.9740156461159114e-05, + "loss": 0.4876, + "step": 1378 + }, + { + "epoch": 0.07574094401756312, + "grad_norm": 4.05877161026001, + "learning_rate": 4.9739402965066276e-05, + "loss": 0.495, + "step": 1380 + }, + { + "epoch": 0.07585071350164654, + "grad_norm": 2.977750539779663, + "learning_rate": 4.9738648383781795e-05, + "loss": 0.4547, + "step": 1382 + }, + { + "epoch": 0.07596048298572997, + "grad_norm": 2.74615216255188, + "learning_rate": 4.9737892717338774e-05, + "loss": 0.4552, + "step": 1384 + }, + { + "epoch": 0.07607025246981339, + "grad_norm": 1.674535870552063, + "learning_rate": 4.9737135965770355e-05, + "loss": 0.5555, + "step": 1386 + }, + { + "epoch": 0.07618002195389682, + "grad_norm": 1.8581138849258423, + "learning_rate": 4.973637812910973e-05, + "loss": 0.4217, + "step": 1388 + }, + { + "epoch": 0.07628979143798024, + "grad_norm": 5.762228012084961, + "learning_rate": 4.973561920739015e-05, + "loss": 0.5637, + "step": 1390 + }, + { + "epoch": 0.07639956092206367, + "grad_norm": 1.6939054727554321, + "learning_rate": 4.9734859200644905e-05, + "loss": 0.4323, + "step": 1392 + }, + { + "epoch": 0.07650933040614709, + "grad_norm": 2.5716586112976074, + "learning_rate": 4.973409810890733e-05, + "loss": 0.2861, + "step": 1394 + }, + { + "epoch": 0.07661909989023051, + "grad_norm": 2.3523201942443848, + "learning_rate": 4.9733335932210814e-05, + "loss": 0.4497, + "step": 1396 + }, + { + "epoch": 0.07672886937431395, + "grad_norm": 1.7844517230987549, + "learning_rate": 4.973257267058877e-05, + "loss": 0.4485, + "step": 1398 + }, + { + "epoch": 0.07683863885839737, + "grad_norm": 1.9370191097259521, + "learning_rate": 4.9731808324074717e-05, + "loss": 0.5452, + "step": 1400 + }, + { + "epoch": 0.07694840834248079, + "grad_norm": 1.6047096252441406, + "learning_rate": 4.973104289270214e-05, + "loss": 0.4296, + "step": 1402 + }, + { + "epoch": 0.07705817782656421, + "grad_norm": 1.9951426982879639, + "learning_rate": 4.973027637650464e-05, + "loss": 0.4011, + "step": 1404 + }, + { + "epoch": 0.07716794731064763, + "grad_norm": 2.113271713256836, + "learning_rate": 4.972950877551584e-05, + "loss": 0.4596, + "step": 1406 + }, + { + "epoch": 0.07727771679473107, + "grad_norm": 1.967775583267212, + "learning_rate": 4.97287400897694e-05, + "loss": 0.4879, + "step": 1408 + }, + { + "epoch": 0.07738748627881449, + "grad_norm": 1.2790850400924683, + "learning_rate": 4.9727970319299044e-05, + "loss": 0.5067, + "step": 1410 + }, + { + "epoch": 0.07749725576289791, + "grad_norm": 2.1025073528289795, + "learning_rate": 4.972719946413854e-05, + "loss": 0.4168, + "step": 1412 + }, + { + "epoch": 0.07760702524698133, + "grad_norm": 2.4632163047790527, + "learning_rate": 4.972642752432171e-05, + "loss": 0.4523, + "step": 1414 + }, + { + "epoch": 0.07771679473106477, + "grad_norm": 1.719483494758606, + "learning_rate": 4.972565449988239e-05, + "loss": 0.5448, + "step": 1416 + }, + { + "epoch": 0.07782656421514819, + "grad_norm": 2.4091501235961914, + "learning_rate": 4.97248803908545e-05, + "loss": 0.5646, + "step": 1418 + }, + { + "epoch": 0.07793633369923161, + "grad_norm": 1.6323883533477783, + "learning_rate": 4.972410519727201e-05, + "loss": 0.3475, + "step": 1420 + }, + { + "epoch": 0.07804610318331504, + "grad_norm": 2.714904546737671, + "learning_rate": 4.972332891916891e-05, + "loss": 0.5441, + "step": 1422 + }, + { + "epoch": 0.07815587266739846, + "grad_norm": 1.7113326787948608, + "learning_rate": 4.972255155657925e-05, + "loss": 0.516, + "step": 1424 + }, + { + "epoch": 0.07826564215148189, + "grad_norm": 2.082491159439087, + "learning_rate": 4.972177310953714e-05, + "loss": 0.3715, + "step": 1426 + }, + { + "epoch": 0.07837541163556531, + "grad_norm": 2.6524291038513184, + "learning_rate": 4.972099357807671e-05, + "loss": 0.5128, + "step": 1428 + }, + { + "epoch": 0.07848518111964874, + "grad_norm": 2.4822254180908203, + "learning_rate": 4.972021296223217e-05, + "loss": 0.4242, + "step": 1430 + }, + { + "epoch": 0.07859495060373216, + "grad_norm": 3.452951192855835, + "learning_rate": 4.9719431262037755e-05, + "loss": 0.3679, + "step": 1432 + }, + { + "epoch": 0.0787047200878156, + "grad_norm": 3.056638479232788, + "learning_rate": 4.971864847752776e-05, + "loss": 0.4589, + "step": 1434 + }, + { + "epoch": 0.07881448957189902, + "grad_norm": 4.239192485809326, + "learning_rate": 4.9717864608736506e-05, + "loss": 0.4378, + "step": 1436 + }, + { + "epoch": 0.07892425905598244, + "grad_norm": 1.7480167150497437, + "learning_rate": 4.97170796556984e-05, + "loss": 0.4026, + "step": 1438 + }, + { + "epoch": 0.07903402854006586, + "grad_norm": 3.0691394805908203, + "learning_rate": 4.971629361844785e-05, + "loss": 0.6376, + "step": 1440 + }, + { + "epoch": 0.07914379802414928, + "grad_norm": 1.5674787759780884, + "learning_rate": 4.9715506497019355e-05, + "loss": 0.3781, + "step": 1442 + }, + { + "epoch": 0.07925356750823272, + "grad_norm": 1.2227840423583984, + "learning_rate": 4.971471829144743e-05, + "loss": 0.4799, + "step": 1444 + }, + { + "epoch": 0.07936333699231614, + "grad_norm": 1.6871943473815918, + "learning_rate": 4.971392900176666e-05, + "loss": 0.4316, + "step": 1446 + }, + { + "epoch": 0.07947310647639956, + "grad_norm": 1.351342797279358, + "learning_rate": 4.9713138628011654e-05, + "loss": 0.3659, + "step": 1448 + }, + { + "epoch": 0.07958287596048298, + "grad_norm": 2.1624252796173096, + "learning_rate": 4.971234717021709e-05, + "loss": 0.4187, + "step": 1450 + }, + { + "epoch": 0.0796926454445664, + "grad_norm": 2.107962131500244, + "learning_rate": 4.971155462841769e-05, + "loss": 0.4338, + "step": 1452 + }, + { + "epoch": 0.07980241492864984, + "grad_norm": 1.9297070503234863, + "learning_rate": 4.9710761002648196e-05, + "loss": 0.5138, + "step": 1454 + }, + { + "epoch": 0.07991218441273326, + "grad_norm": 2.166930913925171, + "learning_rate": 4.9709966292943455e-05, + "loss": 0.369, + "step": 1456 + }, + { + "epoch": 0.08002195389681668, + "grad_norm": 2.434677839279175, + "learning_rate": 4.9709170499338295e-05, + "loss": 0.4731, + "step": 1458 + }, + { + "epoch": 0.0801317233809001, + "grad_norm": 1.6464452743530273, + "learning_rate": 4.9708373621867656e-05, + "loss": 0.4743, + "step": 1460 + }, + { + "epoch": 0.08024149286498354, + "grad_norm": 2.065507411956787, + "learning_rate": 4.970757566056646e-05, + "loss": 0.4729, + "step": 1462 + }, + { + "epoch": 0.08035126234906696, + "grad_norm": 3.8853867053985596, + "learning_rate": 4.9706776615469716e-05, + "loss": 0.4379, + "step": 1464 + }, + { + "epoch": 0.08046103183315038, + "grad_norm": 1.795507550239563, + "learning_rate": 4.9705976486612496e-05, + "loss": 0.412, + "step": 1466 + }, + { + "epoch": 0.0805708013172338, + "grad_norm": 1.7997970581054688, + "learning_rate": 4.970517527402988e-05, + "loss": 0.48, + "step": 1468 + }, + { + "epoch": 0.08068057080131723, + "grad_norm": 2.423989772796631, + "learning_rate": 4.970437297775702e-05, + "loss": 0.3086, + "step": 1470 + }, + { + "epoch": 0.08079034028540066, + "grad_norm": 2.5824551582336426, + "learning_rate": 4.970356959782909e-05, + "loss": 0.4781, + "step": 1472 + }, + { + "epoch": 0.08090010976948409, + "grad_norm": 2.855727195739746, + "learning_rate": 4.970276513428136e-05, + "loss": 0.3536, + "step": 1474 + }, + { + "epoch": 0.08100987925356751, + "grad_norm": 2.6748769283294678, + "learning_rate": 4.970195958714909e-05, + "loss": 0.4551, + "step": 1476 + }, + { + "epoch": 0.08111964873765093, + "grad_norm": 2.177062749862671, + "learning_rate": 4.9701152956467645e-05, + "loss": 0.371, + "step": 1478 + }, + { + "epoch": 0.08122941822173436, + "grad_norm": 1.297822117805481, + "learning_rate": 4.970034524227238e-05, + "loss": 0.3246, + "step": 1480 + }, + { + "epoch": 0.08133918770581779, + "grad_norm": 2.3642914295196533, + "learning_rate": 4.969953644459874e-05, + "loss": 0.5324, + "step": 1482 + }, + { + "epoch": 0.08144895718990121, + "grad_norm": 1.8671852350234985, + "learning_rate": 4.96987265634822e-05, + "loss": 0.4508, + "step": 1484 + }, + { + "epoch": 0.08155872667398463, + "grad_norm": 1.1008070707321167, + "learning_rate": 4.969791559895828e-05, + "loss": 0.3309, + "step": 1486 + }, + { + "epoch": 0.08166849615806805, + "grad_norm": 2.0505120754241943, + "learning_rate": 4.9697103551062556e-05, + "loss": 0.4008, + "step": 1488 + }, + { + "epoch": 0.08177826564215149, + "grad_norm": 2.554454803466797, + "learning_rate": 4.9696290419830654e-05, + "loss": 0.391, + "step": 1490 + }, + { + "epoch": 0.08188803512623491, + "grad_norm": 1.7010999917984009, + "learning_rate": 4.9695476205298235e-05, + "loss": 0.5289, + "step": 1492 + }, + { + "epoch": 0.08199780461031833, + "grad_norm": 2.066810369491577, + "learning_rate": 4.9694660907501024e-05, + "loss": 0.4367, + "step": 1494 + }, + { + "epoch": 0.08210757409440175, + "grad_norm": 2.53515362739563, + "learning_rate": 4.969384452647477e-05, + "loss": 0.4541, + "step": 1496 + }, + { + "epoch": 0.08221734357848517, + "grad_norm": 2.734351396560669, + "learning_rate": 4.9693027062255296e-05, + "loss": 0.5134, + "step": 1498 + }, + { + "epoch": 0.08232711306256861, + "grad_norm": 2.1988978385925293, + "learning_rate": 4.9692208514878444e-05, + "loss": 0.4124, + "step": 1500 + }, + { + "epoch": 0.08243688254665203, + "grad_norm": 1.3103734254837036, + "learning_rate": 4.969138888438014e-05, + "loss": 0.4038, + "step": 1502 + }, + { + "epoch": 0.08254665203073545, + "grad_norm": 3.4975767135620117, + "learning_rate": 4.969056817079633e-05, + "loss": 0.4945, + "step": 1504 + }, + { + "epoch": 0.08265642151481888, + "grad_norm": 2.1662983894348145, + "learning_rate": 4.9689746374163e-05, + "loss": 0.3863, + "step": 1506 + }, + { + "epoch": 0.08276619099890231, + "grad_norm": 3.261486053466797, + "learning_rate": 4.968892349451621e-05, + "loss": 0.5932, + "step": 1508 + }, + { + "epoch": 0.08287596048298573, + "grad_norm": 2.3572170734405518, + "learning_rate": 4.968809953189206e-05, + "loss": 0.4081, + "step": 1510 + }, + { + "epoch": 0.08298572996706916, + "grad_norm": 1.5817608833312988, + "learning_rate": 4.968727448632669e-05, + "loss": 0.3599, + "step": 1512 + }, + { + "epoch": 0.08309549945115258, + "grad_norm": 2.6833693981170654, + "learning_rate": 4.9686448357856286e-05, + "loss": 0.4666, + "step": 1514 + }, + { + "epoch": 0.083205268935236, + "grad_norm": 1.9109543561935425, + "learning_rate": 4.968562114651709e-05, + "loss": 0.4777, + "step": 1516 + }, + { + "epoch": 0.08331503841931943, + "grad_norm": 1.293790578842163, + "learning_rate": 4.968479285234538e-05, + "loss": 0.3213, + "step": 1518 + }, + { + "epoch": 0.08342480790340286, + "grad_norm": 2.9989542961120605, + "learning_rate": 4.968396347537751e-05, + "loss": 0.4882, + "step": 1520 + }, + { + "epoch": 0.08353457738748628, + "grad_norm": 3.254138946533203, + "learning_rate": 4.9683133015649844e-05, + "loss": 0.551, + "step": 1522 + }, + { + "epoch": 0.0836443468715697, + "grad_norm": 1.6969789266586304, + "learning_rate": 4.96823014731988e-05, + "loss": 0.3694, + "step": 1524 + }, + { + "epoch": 0.08375411635565312, + "grad_norm": 1.629355549812317, + "learning_rate": 4.9681468848060874e-05, + "loss": 0.3003, + "step": 1526 + }, + { + "epoch": 0.08386388583973656, + "grad_norm": 1.305407166481018, + "learning_rate": 4.9680635140272575e-05, + "loss": 0.2909, + "step": 1528 + }, + { + "epoch": 0.08397365532381998, + "grad_norm": 2.323150634765625, + "learning_rate": 4.967980034987048e-05, + "loss": 0.4102, + "step": 1530 + }, + { + "epoch": 0.0840834248079034, + "grad_norm": 4.301831245422363, + "learning_rate": 4.967896447689121e-05, + "loss": 0.4095, + "step": 1532 + }, + { + "epoch": 0.08419319429198682, + "grad_norm": 2.6009533405303955, + "learning_rate": 4.967812752137142e-05, + "loss": 0.5013, + "step": 1534 + }, + { + "epoch": 0.08430296377607026, + "grad_norm": 2.1959621906280518, + "learning_rate": 4.967728948334784e-05, + "loss": 0.3013, + "step": 1536 + }, + { + "epoch": 0.08441273326015368, + "grad_norm": 1.9119952917099, + "learning_rate": 4.967645036285721e-05, + "loss": 0.3981, + "step": 1538 + }, + { + "epoch": 0.0845225027442371, + "grad_norm": 1.6922976970672607, + "learning_rate": 4.967561015993635e-05, + "loss": 0.3424, + "step": 1540 + }, + { + "epoch": 0.08463227222832052, + "grad_norm": 4.207444667816162, + "learning_rate": 4.967476887462212e-05, + "loss": 0.4636, + "step": 1542 + }, + { + "epoch": 0.08474204171240395, + "grad_norm": 1.8900500535964966, + "learning_rate": 4.9673926506951404e-05, + "loss": 0.5147, + "step": 1544 + }, + { + "epoch": 0.08485181119648738, + "grad_norm": 2.2209105491638184, + "learning_rate": 4.967308305696118e-05, + "loss": 0.3398, + "step": 1546 + }, + { + "epoch": 0.0849615806805708, + "grad_norm": 1.4874863624572754, + "learning_rate": 4.967223852468842e-05, + "loss": 0.3868, + "step": 1548 + }, + { + "epoch": 0.08507135016465422, + "grad_norm": 1.4582382440567017, + "learning_rate": 4.9671392910170185e-05, + "loss": 0.3449, + "step": 1550 + }, + { + "epoch": 0.08518111964873765, + "grad_norm": 2.3056488037109375, + "learning_rate": 4.967054621344356e-05, + "loss": 0.4321, + "step": 1552 + }, + { + "epoch": 0.08529088913282108, + "grad_norm": 1.6737662553787231, + "learning_rate": 4.966969843454569e-05, + "loss": 0.4522, + "step": 1554 + }, + { + "epoch": 0.0854006586169045, + "grad_norm": 4.456511974334717, + "learning_rate": 4.966884957351375e-05, + "loss": 0.4641, + "step": 1556 + }, + { + "epoch": 0.08551042810098793, + "grad_norm": 1.4723759889602661, + "learning_rate": 4.9667999630384996e-05, + "loss": 0.6021, + "step": 1558 + }, + { + "epoch": 0.08562019758507135, + "grad_norm": 2.852248430252075, + "learning_rate": 4.96671486051967e-05, + "loss": 0.4766, + "step": 1560 + }, + { + "epoch": 0.08572996706915477, + "grad_norm": 3.519808769226074, + "learning_rate": 4.96662964979862e-05, + "loss": 0.4852, + "step": 1562 + }, + { + "epoch": 0.0858397365532382, + "grad_norm": 1.9337600469589233, + "learning_rate": 4.966544330879085e-05, + "loss": 0.5405, + "step": 1564 + }, + { + "epoch": 0.08594950603732163, + "grad_norm": 2.3185508251190186, + "learning_rate": 4.96645890376481e-05, + "loss": 0.5384, + "step": 1566 + }, + { + "epoch": 0.08605927552140505, + "grad_norm": 1.8977993726730347, + "learning_rate": 4.966373368459541e-05, + "loss": 0.6058, + "step": 1568 + }, + { + "epoch": 0.08616904500548847, + "grad_norm": 1.6487147808074951, + "learning_rate": 4.966287724967031e-05, + "loss": 0.3363, + "step": 1570 + }, + { + "epoch": 0.08627881448957189, + "grad_norm": 1.897972822189331, + "learning_rate": 4.966201973291036e-05, + "loss": 0.2988, + "step": 1572 + }, + { + "epoch": 0.08638858397365533, + "grad_norm": 4.946324348449707, + "learning_rate": 4.966116113435317e-05, + "loss": 0.5141, + "step": 1574 + }, + { + "epoch": 0.08649835345773875, + "grad_norm": 2.835561513900757, + "learning_rate": 4.966030145403642e-05, + "loss": 0.4463, + "step": 1576 + }, + { + "epoch": 0.08660812294182217, + "grad_norm": 2.563922643661499, + "learning_rate": 4.965944069199781e-05, + "loss": 0.3881, + "step": 1578 + }, + { + "epoch": 0.0867178924259056, + "grad_norm": 1.4909789562225342, + "learning_rate": 4.9658578848275076e-05, + "loss": 0.2722, + "step": 1580 + }, + { + "epoch": 0.08682766190998903, + "grad_norm": 5.723266124725342, + "learning_rate": 4.965771592290606e-05, + "loss": 0.548, + "step": 1582 + }, + { + "epoch": 0.08693743139407245, + "grad_norm": 3.943152904510498, + "learning_rate": 4.965685191592859e-05, + "loss": 0.4074, + "step": 1584 + }, + { + "epoch": 0.08704720087815587, + "grad_norm": 1.9588027000427246, + "learning_rate": 4.9655986827380565e-05, + "loss": 0.3558, + "step": 1586 + }, + { + "epoch": 0.0871569703622393, + "grad_norm": 3.840147018432617, + "learning_rate": 4.9655120657299945e-05, + "loss": 0.5859, + "step": 1588 + }, + { + "epoch": 0.08726673984632272, + "grad_norm": 3.581636428833008, + "learning_rate": 4.9654253405724724e-05, + "loss": 0.5179, + "step": 1590 + }, + { + "epoch": 0.08737650933040615, + "grad_norm": 2.2196922302246094, + "learning_rate": 4.965338507269294e-05, + "loss": 0.557, + "step": 1592 + }, + { + "epoch": 0.08748627881448957, + "grad_norm": 2.772043466567993, + "learning_rate": 4.965251565824267e-05, + "loss": 0.5129, + "step": 1594 + }, + { + "epoch": 0.087596048298573, + "grad_norm": 2.0500285625457764, + "learning_rate": 4.965164516241206e-05, + "loss": 0.5416, + "step": 1596 + }, + { + "epoch": 0.08770581778265642, + "grad_norm": 1.8471778631210327, + "learning_rate": 4.965077358523931e-05, + "loss": 0.3489, + "step": 1598 + }, + { + "epoch": 0.08781558726673985, + "grad_norm": 2.879105567932129, + "learning_rate": 4.964990092676263e-05, + "loss": 0.6935, + "step": 1600 + }, + { + "epoch": 0.08792535675082327, + "grad_norm": 1.928931713104248, + "learning_rate": 4.96490271870203e-05, + "loss": 0.3813, + "step": 1602 + }, + { + "epoch": 0.0880351262349067, + "grad_norm": 5.914320945739746, + "learning_rate": 4.964815236605066e-05, + "loss": 0.5438, + "step": 1604 + }, + { + "epoch": 0.08814489571899012, + "grad_norm": 2.446261405944824, + "learning_rate": 4.964727646389208e-05, + "loss": 0.3935, + "step": 1606 + }, + { + "epoch": 0.08825466520307354, + "grad_norm": 2.407395124435425, + "learning_rate": 4.964639948058297e-05, + "loss": 0.5037, + "step": 1608 + }, + { + "epoch": 0.08836443468715698, + "grad_norm": 2.3384554386138916, + "learning_rate": 4.964552141616181e-05, + "loss": 0.3723, + "step": 1610 + }, + { + "epoch": 0.0884742041712404, + "grad_norm": 1.9486784934997559, + "learning_rate": 4.964464227066712e-05, + "loss": 0.3789, + "step": 1612 + }, + { + "epoch": 0.08858397365532382, + "grad_norm": 1.475759744644165, + "learning_rate": 4.964376204413745e-05, + "loss": 0.4699, + "step": 1614 + }, + { + "epoch": 0.08869374313940724, + "grad_norm": 1.6083623170852661, + "learning_rate": 4.964288073661142e-05, + "loss": 0.3199, + "step": 1616 + }, + { + "epoch": 0.08880351262349066, + "grad_norm": 1.771324872970581, + "learning_rate": 4.964199834812768e-05, + "loss": 0.4908, + "step": 1618 + }, + { + "epoch": 0.0889132821075741, + "grad_norm": 3.539027452468872, + "learning_rate": 4.9641114878724956e-05, + "loss": 0.4014, + "step": 1620 + }, + { + "epoch": 0.08902305159165752, + "grad_norm": 1.3294570446014404, + "learning_rate": 4.964023032844198e-05, + "loss": 0.3093, + "step": 1622 + }, + { + "epoch": 0.08913282107574094, + "grad_norm": 2.1849374771118164, + "learning_rate": 4.963934469731756e-05, + "loss": 0.4978, + "step": 1624 + }, + { + "epoch": 0.08924259055982436, + "grad_norm": 2.344160318374634, + "learning_rate": 4.963845798539054e-05, + "loss": 0.5545, + "step": 1626 + }, + { + "epoch": 0.0893523600439078, + "grad_norm": 1.8380173444747925, + "learning_rate": 4.963757019269983e-05, + "loss": 0.3805, + "step": 1628 + }, + { + "epoch": 0.08946212952799122, + "grad_norm": 1.8738735914230347, + "learning_rate": 4.963668131928436e-05, + "loss": 0.4401, + "step": 1630 + }, + { + "epoch": 0.08957189901207464, + "grad_norm": 3.7194414138793945, + "learning_rate": 4.963579136518312e-05, + "loss": 0.3996, + "step": 1632 + }, + { + "epoch": 0.08968166849615807, + "grad_norm": 2.0868451595306396, + "learning_rate": 4.963490033043515e-05, + "loss": 0.3782, + "step": 1634 + }, + { + "epoch": 0.08979143798024149, + "grad_norm": 2.7007710933685303, + "learning_rate": 4.963400821507954e-05, + "loss": 0.4476, + "step": 1636 + }, + { + "epoch": 0.08990120746432492, + "grad_norm": 2.2203407287597656, + "learning_rate": 4.963311501915542e-05, + "loss": 0.4431, + "step": 1638 + }, + { + "epoch": 0.09001097694840834, + "grad_norm": 1.5560448169708252, + "learning_rate": 4.9632220742701965e-05, + "loss": 0.4095, + "step": 1640 + }, + { + "epoch": 0.09012074643249177, + "grad_norm": 1.8507685661315918, + "learning_rate": 4.963132538575841e-05, + "loss": 0.4834, + "step": 1642 + }, + { + "epoch": 0.09023051591657519, + "grad_norm": 2.5851950645446777, + "learning_rate": 4.963042894836403e-05, + "loss": 0.4411, + "step": 1644 + }, + { + "epoch": 0.09034028540065862, + "grad_norm": 1.3569010496139526, + "learning_rate": 4.962953143055813e-05, + "loss": 0.4529, + "step": 1646 + }, + { + "epoch": 0.09045005488474205, + "grad_norm": 2.0030837059020996, + "learning_rate": 4.96286328323801e-05, + "loss": 0.4156, + "step": 1648 + }, + { + "epoch": 0.09055982436882547, + "grad_norm": 1.5840579271316528, + "learning_rate": 4.962773315386935e-05, + "loss": 0.3585, + "step": 1650 + }, + { + "epoch": 0.09066959385290889, + "grad_norm": 1.9359923601150513, + "learning_rate": 4.962683239506534e-05, + "loss": 0.4793, + "step": 1652 + }, + { + "epoch": 0.09077936333699231, + "grad_norm": 2.9567744731903076, + "learning_rate": 4.9625930556007596e-05, + "loss": 0.4659, + "step": 1654 + }, + { + "epoch": 0.09088913282107575, + "grad_norm": 1.6988526582717896, + "learning_rate": 4.962502763673565e-05, + "loss": 0.3861, + "step": 1656 + }, + { + "epoch": 0.09099890230515917, + "grad_norm": 1.7865264415740967, + "learning_rate": 4.9624123637289146e-05, + "loss": 0.3356, + "step": 1658 + }, + { + "epoch": 0.09110867178924259, + "grad_norm": 1.5166332721710205, + "learning_rate": 4.9623218557707694e-05, + "loss": 0.5181, + "step": 1660 + }, + { + "epoch": 0.09121844127332601, + "grad_norm": 1.8882105350494385, + "learning_rate": 4.9622312398031035e-05, + "loss": 0.4352, + "step": 1662 + }, + { + "epoch": 0.09132821075740943, + "grad_norm": 3.2773218154907227, + "learning_rate": 4.96214051582989e-05, + "loss": 0.6857, + "step": 1664 + }, + { + "epoch": 0.09143798024149287, + "grad_norm": 2.8895809650421143, + "learning_rate": 4.962049683855108e-05, + "loss": 0.5483, + "step": 1666 + }, + { + "epoch": 0.09154774972557629, + "grad_norm": 4.0331525802612305, + "learning_rate": 4.961958743882742e-05, + "loss": 0.4941, + "step": 1668 + }, + { + "epoch": 0.09165751920965971, + "grad_norm": 1.631052017211914, + "learning_rate": 4.961867695916782e-05, + "loss": 0.4379, + "step": 1670 + }, + { + "epoch": 0.09176728869374313, + "grad_norm": 2.738257884979248, + "learning_rate": 4.961776539961222e-05, + "loss": 0.4245, + "step": 1672 + }, + { + "epoch": 0.09187705817782657, + "grad_norm": 1.835517168045044, + "learning_rate": 4.961685276020058e-05, + "loss": 0.4191, + "step": 1674 + }, + { + "epoch": 0.09198682766190999, + "grad_norm": 1.965050220489502, + "learning_rate": 4.961593904097297e-05, + "loss": 0.5177, + "step": 1676 + }, + { + "epoch": 0.09209659714599341, + "grad_norm": 1.993556261062622, + "learning_rate": 4.9615024241969446e-05, + "loss": 0.4058, + "step": 1678 + }, + { + "epoch": 0.09220636663007684, + "grad_norm": 2.084379196166992, + "learning_rate": 4.9614108363230135e-05, + "loss": 0.4011, + "step": 1680 + }, + { + "epoch": 0.09231613611416026, + "grad_norm": 2.4940402507781982, + "learning_rate": 4.9613191404795226e-05, + "loss": 0.5717, + "step": 1682 + }, + { + "epoch": 0.0924259055982437, + "grad_norm": 2.2692036628723145, + "learning_rate": 4.961227336670493e-05, + "loss": 0.4874, + "step": 1684 + }, + { + "epoch": 0.09253567508232712, + "grad_norm": 2.415292263031006, + "learning_rate": 4.961135424899952e-05, + "loss": 0.483, + "step": 1686 + }, + { + "epoch": 0.09264544456641054, + "grad_norm": 3.2184865474700928, + "learning_rate": 4.961043405171931e-05, + "loss": 0.4787, + "step": 1688 + }, + { + "epoch": 0.09275521405049396, + "grad_norm": 2.2991814613342285, + "learning_rate": 4.960951277490467e-05, + "loss": 0.4943, + "step": 1690 + }, + { + "epoch": 0.0928649835345774, + "grad_norm": 3.30875825881958, + "learning_rate": 4.9608590418596016e-05, + "loss": 0.5198, + "step": 1692 + }, + { + "epoch": 0.09297475301866082, + "grad_norm": 2.0127665996551514, + "learning_rate": 4.960766698283379e-05, + "loss": 0.573, + "step": 1694 + }, + { + "epoch": 0.09308452250274424, + "grad_norm": 1.8975090980529785, + "learning_rate": 4.960674246765851e-05, + "loss": 0.5668, + "step": 1696 + }, + { + "epoch": 0.09319429198682766, + "grad_norm": 2.643425464630127, + "learning_rate": 4.9605816873110736e-05, + "loss": 0.5439, + "step": 1698 + }, + { + "epoch": 0.09330406147091108, + "grad_norm": 2.4725396633148193, + "learning_rate": 4.960489019923105e-05, + "loss": 0.4228, + "step": 1700 + }, + { + "epoch": 0.09341383095499452, + "grad_norm": 1.237222671508789, + "learning_rate": 4.960396244606012e-05, + "loss": 0.2912, + "step": 1702 + }, + { + "epoch": 0.09352360043907794, + "grad_norm": 2.7197954654693604, + "learning_rate": 4.9603033613638626e-05, + "loss": 0.309, + "step": 1704 + }, + { + "epoch": 0.09363336992316136, + "grad_norm": 4.123082160949707, + "learning_rate": 4.960210370200733e-05, + "loss": 0.3509, + "step": 1706 + }, + { + "epoch": 0.09374313940724478, + "grad_norm": 2.393568992614746, + "learning_rate": 4.9601172711207005e-05, + "loss": 0.5206, + "step": 1708 + }, + { + "epoch": 0.0938529088913282, + "grad_norm": 2.058398485183716, + "learning_rate": 4.9600240641278496e-05, + "loss": 0.5329, + "step": 1710 + }, + { + "epoch": 0.09396267837541164, + "grad_norm": 1.4930353164672852, + "learning_rate": 4.959930749226269e-05, + "loss": 0.4277, + "step": 1712 + }, + { + "epoch": 0.09407244785949506, + "grad_norm": 1.7625492811203003, + "learning_rate": 4.9598373264200515e-05, + "loss": 0.3936, + "step": 1714 + }, + { + "epoch": 0.09418221734357848, + "grad_norm": 1.5839118957519531, + "learning_rate": 4.9597437957132955e-05, + "loss": 0.3292, + "step": 1716 + }, + { + "epoch": 0.0942919868276619, + "grad_norm": 3.546184539794922, + "learning_rate": 4.959650157110103e-05, + "loss": 0.4614, + "step": 1718 + }, + { + "epoch": 0.09440175631174534, + "grad_norm": 1.1406798362731934, + "learning_rate": 4.959556410614582e-05, + "loss": 0.3465, + "step": 1720 + }, + { + "epoch": 0.09451152579582876, + "grad_norm": 1.7085702419281006, + "learning_rate": 4.959462556230844e-05, + "loss": 0.3573, + "step": 1722 + }, + { + "epoch": 0.09462129527991218, + "grad_norm": 2.7848801612854004, + "learning_rate": 4.959368593963007e-05, + "loss": 0.2569, + "step": 1724 + }, + { + "epoch": 0.0947310647639956, + "grad_norm": 4.561282634735107, + "learning_rate": 4.959274523815193e-05, + "loss": 0.3618, + "step": 1726 + }, + { + "epoch": 0.09484083424807903, + "grad_norm": 1.8563063144683838, + "learning_rate": 4.959180345791528e-05, + "loss": 0.5897, + "step": 1728 + }, + { + "epoch": 0.09495060373216246, + "grad_norm": 2.125514507293701, + "learning_rate": 4.959086059896141e-05, + "loss": 0.3624, + "step": 1730 + }, + { + "epoch": 0.09506037321624589, + "grad_norm": 2.247169256210327, + "learning_rate": 4.95899166613317e-05, + "loss": 0.4935, + "step": 1732 + }, + { + "epoch": 0.09517014270032931, + "grad_norm": 1.419965147972107, + "learning_rate": 4.958897164506755e-05, + "loss": 0.3363, + "step": 1734 + }, + { + "epoch": 0.09527991218441273, + "grad_norm": 3.0122644901275635, + "learning_rate": 4.958802555021042e-05, + "loss": 0.5629, + "step": 1736 + }, + { + "epoch": 0.09538968166849615, + "grad_norm": 2.941861152648926, + "learning_rate": 4.95870783768018e-05, + "loss": 0.5183, + "step": 1738 + }, + { + "epoch": 0.09549945115257959, + "grad_norm": 1.476583480834961, + "learning_rate": 4.958613012488324e-05, + "loss": 0.4353, + "step": 1740 + }, + { + "epoch": 0.09560922063666301, + "grad_norm": 4.134450912475586, + "learning_rate": 4.9585180794496345e-05, + "loss": 0.5321, + "step": 1742 + }, + { + "epoch": 0.09571899012074643, + "grad_norm": 1.358006238937378, + "learning_rate": 4.958423038568274e-05, + "loss": 0.4921, + "step": 1744 + }, + { + "epoch": 0.09582875960482985, + "grad_norm": 1.40552818775177, + "learning_rate": 4.958327889848413e-05, + "loss": 0.3531, + "step": 1746 + }, + { + "epoch": 0.09593852908891329, + "grad_norm": 2.9179012775421143, + "learning_rate": 4.9582326332942244e-05, + "loss": 0.4545, + "step": 1748 + }, + { + "epoch": 0.09604829857299671, + "grad_norm": 2.150498628616333, + "learning_rate": 4.958137268909887e-05, + "loss": 0.5332, + "step": 1750 + }, + { + "epoch": 0.09615806805708013, + "grad_norm": 1.8023475408554077, + "learning_rate": 4.958041796699583e-05, + "loss": 0.3612, + "step": 1752 + }, + { + "epoch": 0.09626783754116355, + "grad_norm": 5.3879899978637695, + "learning_rate": 4.9579462166675015e-05, + "loss": 0.5728, + "step": 1754 + }, + { + "epoch": 0.09637760702524698, + "grad_norm": 2.6372292041778564, + "learning_rate": 4.957850528817834e-05, + "loss": 0.3858, + "step": 1756 + }, + { + "epoch": 0.09648737650933041, + "grad_norm": 2.606325626373291, + "learning_rate": 4.95775473315478e-05, + "loss": 0.4955, + "step": 1758 + }, + { + "epoch": 0.09659714599341383, + "grad_norm": 5.73064661026001, + "learning_rate": 4.9576588296825386e-05, + "loss": 0.4086, + "step": 1760 + }, + { + "epoch": 0.09670691547749725, + "grad_norm": 3.024963855743408, + "learning_rate": 4.957562818405319e-05, + "loss": 0.467, + "step": 1762 + }, + { + "epoch": 0.09681668496158068, + "grad_norm": 1.7806618213653564, + "learning_rate": 4.957466699327331e-05, + "loss": 0.4088, + "step": 1764 + }, + { + "epoch": 0.09692645444566411, + "grad_norm": 2.621886968612671, + "learning_rate": 4.957370472452792e-05, + "loss": 0.4832, + "step": 1766 + }, + { + "epoch": 0.09703622392974753, + "grad_norm": 2.4131014347076416, + "learning_rate": 4.957274137785922e-05, + "loss": 0.3996, + "step": 1768 + }, + { + "epoch": 0.09714599341383096, + "grad_norm": 1.9427906274795532, + "learning_rate": 4.957177695330948e-05, + "loss": 0.4101, + "step": 1770 + }, + { + "epoch": 0.09725576289791438, + "grad_norm": 3.6557106971740723, + "learning_rate": 4.9570811450921e-05, + "loss": 0.4699, + "step": 1772 + }, + { + "epoch": 0.0973655323819978, + "grad_norm": 1.6783902645111084, + "learning_rate": 4.956984487073613e-05, + "loss": 0.4305, + "step": 1774 + }, + { + "epoch": 0.09747530186608123, + "grad_norm": 2.319164276123047, + "learning_rate": 4.956887721279726e-05, + "loss": 0.395, + "step": 1776 + }, + { + "epoch": 0.09758507135016466, + "grad_norm": 1.9361863136291504, + "learning_rate": 4.956790847714684e-05, + "loss": 0.427, + "step": 1778 + }, + { + "epoch": 0.09769484083424808, + "grad_norm": 3.3300416469573975, + "learning_rate": 4.9566938663827377e-05, + "loss": 0.3776, + "step": 1780 + }, + { + "epoch": 0.0978046103183315, + "grad_norm": 3.1671109199523926, + "learning_rate": 4.95659677728814e-05, + "loss": 0.5067, + "step": 1782 + }, + { + "epoch": 0.09791437980241492, + "grad_norm": 1.781873106956482, + "learning_rate": 4.95649958043515e-05, + "loss": 0.5593, + "step": 1784 + }, + { + "epoch": 0.09802414928649836, + "grad_norm": 2.4715237617492676, + "learning_rate": 4.9564022758280315e-05, + "loss": 0.4003, + "step": 1786 + }, + { + "epoch": 0.09813391877058178, + "grad_norm": 1.8470726013183594, + "learning_rate": 4.9563048634710516e-05, + "loss": 0.5491, + "step": 1788 + }, + { + "epoch": 0.0982436882546652, + "grad_norm": 2.369525671005249, + "learning_rate": 4.956207343368485e-05, + "loss": 0.3481, + "step": 1790 + }, + { + "epoch": 0.09835345773874862, + "grad_norm": 1.7562495470046997, + "learning_rate": 4.956109715524608e-05, + "loss": 0.61, + "step": 1792 + }, + { + "epoch": 0.09846322722283206, + "grad_norm": 1.8443551063537598, + "learning_rate": 4.956011979943704e-05, + "loss": 0.4013, + "step": 1794 + }, + { + "epoch": 0.09857299670691548, + "grad_norm": 2.8196375370025635, + "learning_rate": 4.9559141366300594e-05, + "loss": 0.4593, + "step": 1796 + }, + { + "epoch": 0.0986827661909989, + "grad_norm": 2.5943007469177246, + "learning_rate": 4.955816185587967e-05, + "loss": 0.4689, + "step": 1798 + }, + { + "epoch": 0.09879253567508232, + "grad_norm": 1.9150199890136719, + "learning_rate": 4.9557181268217227e-05, + "loss": 0.4387, + "step": 1800 + }, + { + "epoch": 0.09890230515916575, + "grad_norm": 1.6930148601531982, + "learning_rate": 4.955619960335627e-05, + "loss": 0.3133, + "step": 1802 + }, + { + "epoch": 0.09901207464324918, + "grad_norm": 1.4063451290130615, + "learning_rate": 4.9555216861339876e-05, + "loss": 0.4853, + "step": 1804 + }, + { + "epoch": 0.0991218441273326, + "grad_norm": 2.667992353439331, + "learning_rate": 4.9554233042211146e-05, + "loss": 0.5413, + "step": 1806 + }, + { + "epoch": 0.09923161361141603, + "grad_norm": 2.4720706939697266, + "learning_rate": 4.955324814601324e-05, + "loss": 0.3566, + "step": 1808 + }, + { + "epoch": 0.09934138309549945, + "grad_norm": 1.4657936096191406, + "learning_rate": 4.955226217278935e-05, + "loss": 0.391, + "step": 1810 + }, + { + "epoch": 0.09945115257958288, + "grad_norm": 1.2721844911575317, + "learning_rate": 4.955127512258273e-05, + "loss": 0.4351, + "step": 1812 + }, + { + "epoch": 0.0995609220636663, + "grad_norm": 2.4900083541870117, + "learning_rate": 4.9550286995436685e-05, + "loss": 0.3322, + "step": 1814 + }, + { + "epoch": 0.09967069154774973, + "grad_norm": 2.235671281814575, + "learning_rate": 4.954929779139455e-05, + "loss": 0.4163, + "step": 1816 + }, + { + "epoch": 0.09978046103183315, + "grad_norm": 2.985668420791626, + "learning_rate": 4.954830751049972e-05, + "loss": 0.6012, + "step": 1818 + }, + { + "epoch": 0.09989023051591657, + "grad_norm": 2.4545223712921143, + "learning_rate": 4.954731615279563e-05, + "loss": 0.459, + "step": 1820 + }, + { + "epoch": 0.1, + "grad_norm": 2.27345871925354, + "learning_rate": 4.954632371832576e-05, + "loss": 0.3658, + "step": 1822 + }, + { + "epoch": 0.10010976948408343, + "grad_norm": 1.6850107908248901, + "learning_rate": 4.9545330207133664e-05, + "loss": 0.4118, + "step": 1824 + }, + { + "epoch": 0.10021953896816685, + "grad_norm": 1.5056796073913574, + "learning_rate": 4.95443356192629e-05, + "loss": 0.3194, + "step": 1826 + }, + { + "epoch": 0.10032930845225027, + "grad_norm": 2.6000680923461914, + "learning_rate": 4.954333995475712e-05, + "loss": 0.4389, + "step": 1828 + }, + { + "epoch": 0.10043907793633369, + "grad_norm": 1.3203067779541016, + "learning_rate": 4.9542343213659974e-05, + "loss": 0.3263, + "step": 1830 + }, + { + "epoch": 0.10054884742041713, + "grad_norm": 2.3095200061798096, + "learning_rate": 4.9541345396015193e-05, + "loss": 0.4443, + "step": 1832 + }, + { + "epoch": 0.10065861690450055, + "grad_norm": 2.9167208671569824, + "learning_rate": 4.954034650186655e-05, + "loss": 0.5253, + "step": 1834 + }, + { + "epoch": 0.10076838638858397, + "grad_norm": 2.0420854091644287, + "learning_rate": 4.953934653125786e-05, + "loss": 0.4921, + "step": 1836 + }, + { + "epoch": 0.1008781558726674, + "grad_norm": 2.4542927742004395, + "learning_rate": 4.953834548423298e-05, + "loss": 0.4754, + "step": 1838 + }, + { + "epoch": 0.10098792535675083, + "grad_norm": 1.2920480966567993, + "learning_rate": 4.953734336083583e-05, + "loss": 0.4006, + "step": 1840 + }, + { + "epoch": 0.10109769484083425, + "grad_norm": 1.5333784818649292, + "learning_rate": 4.9536340161110354e-05, + "loss": 0.3306, + "step": 1842 + }, + { + "epoch": 0.10120746432491767, + "grad_norm": 2.6295199394226074, + "learning_rate": 4.9535335885100575e-05, + "loss": 0.4642, + "step": 1844 + }, + { + "epoch": 0.1013172338090011, + "grad_norm": 1.8022338151931763, + "learning_rate": 4.953433053285054e-05, + "loss": 0.4163, + "step": 1846 + }, + { + "epoch": 0.10142700329308452, + "grad_norm": 2.017718553543091, + "learning_rate": 4.953332410440435e-05, + "loss": 0.4185, + "step": 1848 + }, + { + "epoch": 0.10153677277716795, + "grad_norm": 2.3483431339263916, + "learning_rate": 4.9532316599806124e-05, + "loss": 0.4649, + "step": 1850 + }, + { + "epoch": 0.10164654226125137, + "grad_norm": 2.273658514022827, + "learning_rate": 4.953130801910011e-05, + "loss": 0.5579, + "step": 1852 + }, + { + "epoch": 0.1017563117453348, + "grad_norm": 2.028225898742676, + "learning_rate": 4.9530298362330503e-05, + "loss": 0.374, + "step": 1854 + }, + { + "epoch": 0.10186608122941822, + "grad_norm": 1.5721445083618164, + "learning_rate": 4.952928762954161e-05, + "loss": 0.3917, + "step": 1856 + }, + { + "epoch": 0.10197585071350165, + "grad_norm": 2.9496874809265137, + "learning_rate": 4.952827582077777e-05, + "loss": 0.448, + "step": 1858 + }, + { + "epoch": 0.10208562019758508, + "grad_norm": 1.7123289108276367, + "learning_rate": 4.952726293608335e-05, + "loss": 0.4408, + "step": 1860 + }, + { + "epoch": 0.1021953896816685, + "grad_norm": 1.8911689519882202, + "learning_rate": 4.9526248975502805e-05, + "loss": 0.3902, + "step": 1862 + }, + { + "epoch": 0.10230515916575192, + "grad_norm": 1.952642560005188, + "learning_rate": 4.952523393908059e-05, + "loss": 0.5441, + "step": 1864 + }, + { + "epoch": 0.10241492864983534, + "grad_norm": 1.6522396802902222, + "learning_rate": 4.952421782686124e-05, + "loss": 0.4088, + "step": 1866 + }, + { + "epoch": 0.10252469813391878, + "grad_norm": 2.612327814102173, + "learning_rate": 4.952320063888932e-05, + "loss": 0.3992, + "step": 1868 + }, + { + "epoch": 0.1026344676180022, + "grad_norm": 1.3157440423965454, + "learning_rate": 4.9522182375209455e-05, + "loss": 0.4316, + "step": 1870 + }, + { + "epoch": 0.10274423710208562, + "grad_norm": 2.379434823989868, + "learning_rate": 4.952116303586631e-05, + "loss": 0.6375, + "step": 1872 + }, + { + "epoch": 0.10285400658616904, + "grad_norm": 1.517968773841858, + "learning_rate": 4.9520142620904595e-05, + "loss": 0.2755, + "step": 1874 + }, + { + "epoch": 0.10296377607025246, + "grad_norm": 0.9970090389251709, + "learning_rate": 4.951912113036908e-05, + "loss": 0.4589, + "step": 1876 + }, + { + "epoch": 0.1030735455543359, + "grad_norm": 2.5735723972320557, + "learning_rate": 4.951809856430456e-05, + "loss": 0.3914, + "step": 1878 + }, + { + "epoch": 0.10318331503841932, + "grad_norm": 4.101471424102783, + "learning_rate": 4.951707492275589e-05, + "loss": 0.4302, + "step": 1880 + }, + { + "epoch": 0.10329308452250274, + "grad_norm": 2.042055130004883, + "learning_rate": 4.951605020576798e-05, + "loss": 0.3349, + "step": 1882 + }, + { + "epoch": 0.10340285400658616, + "grad_norm": 3.3276760578155518, + "learning_rate": 4.951502441338578e-05, + "loss": 0.425, + "step": 1884 + }, + { + "epoch": 0.1035126234906696, + "grad_norm": 1.794630527496338, + "learning_rate": 4.951399754565429e-05, + "loss": 0.5021, + "step": 1886 + }, + { + "epoch": 0.10362239297475302, + "grad_norm": 1.070343017578125, + "learning_rate": 4.951296960261853e-05, + "loss": 0.2946, + "step": 1888 + }, + { + "epoch": 0.10373216245883644, + "grad_norm": 2.8769989013671875, + "learning_rate": 4.951194058432361e-05, + "loss": 0.4352, + "step": 1890 + }, + { + "epoch": 0.10384193194291987, + "grad_norm": 1.4287083148956299, + "learning_rate": 4.9510910490814666e-05, + "loss": 0.5044, + "step": 1892 + }, + { + "epoch": 0.10395170142700329, + "grad_norm": 2.712960958480835, + "learning_rate": 4.950987932213689e-05, + "loss": 0.4562, + "step": 1894 + }, + { + "epoch": 0.10406147091108672, + "grad_norm": 1.5597306489944458, + "learning_rate": 4.9508847078335495e-05, + "loss": 0.428, + "step": 1896 + }, + { + "epoch": 0.10417124039517014, + "grad_norm": 2.7124598026275635, + "learning_rate": 4.9507813759455774e-05, + "loss": 0.4322, + "step": 1898 + }, + { + "epoch": 0.10428100987925357, + "grad_norm": 2.713620662689209, + "learning_rate": 4.9506779365543046e-05, + "loss": 0.4937, + "step": 1900 + }, + { + "epoch": 0.10439077936333699, + "grad_norm": 2.210167407989502, + "learning_rate": 4.95057438966427e-05, + "loss": 0.4923, + "step": 1902 + }, + { + "epoch": 0.10450054884742041, + "grad_norm": 1.9898028373718262, + "learning_rate": 4.9504707352800125e-05, + "loss": 0.3454, + "step": 1904 + }, + { + "epoch": 0.10461031833150385, + "grad_norm": 2.872634172439575, + "learning_rate": 4.950366973406083e-05, + "loss": 0.3892, + "step": 1906 + }, + { + "epoch": 0.10472008781558727, + "grad_norm": 1.5743720531463623, + "learning_rate": 4.950263104047031e-05, + "loss": 0.4289, + "step": 1908 + }, + { + "epoch": 0.10482985729967069, + "grad_norm": 1.6129796504974365, + "learning_rate": 4.950159127207411e-05, + "loss": 0.3838, + "step": 1910 + }, + { + "epoch": 0.10493962678375411, + "grad_norm": 2.1703128814697266, + "learning_rate": 4.950055042891786e-05, + "loss": 0.3603, + "step": 1912 + }, + { + "epoch": 0.10504939626783755, + "grad_norm": 1.8392038345336914, + "learning_rate": 4.949950851104722e-05, + "loss": 0.3454, + "step": 1914 + }, + { + "epoch": 0.10515916575192097, + "grad_norm": 2.494269847869873, + "learning_rate": 4.949846551850788e-05, + "loss": 0.5847, + "step": 1916 + }, + { + "epoch": 0.10526893523600439, + "grad_norm": 2.684849739074707, + "learning_rate": 4.94974214513456e-05, + "loss": 0.507, + "step": 1918 + }, + { + "epoch": 0.10537870472008781, + "grad_norm": 1.8811652660369873, + "learning_rate": 4.949637630960617e-05, + "loss": 0.3946, + "step": 1920 + }, + { + "epoch": 0.10548847420417123, + "grad_norm": 3.0134310722351074, + "learning_rate": 4.9495330093335444e-05, + "loss": 0.5087, + "step": 1922 + }, + { + "epoch": 0.10559824368825467, + "grad_norm": 2.75594425201416, + "learning_rate": 4.949428280257932e-05, + "loss": 0.5579, + "step": 1924 + }, + { + "epoch": 0.10570801317233809, + "grad_norm": 1.9982796907424927, + "learning_rate": 4.9493234437383706e-05, + "loss": 0.5162, + "step": 1926 + }, + { + "epoch": 0.10581778265642151, + "grad_norm": 1.8032664060592651, + "learning_rate": 4.9492184997794624e-05, + "loss": 0.6235, + "step": 1928 + }, + { + "epoch": 0.10592755214050494, + "grad_norm": 1.977350115776062, + "learning_rate": 4.9491134483858095e-05, + "loss": 0.512, + "step": 1930 + }, + { + "epoch": 0.10603732162458837, + "grad_norm": 2.2316627502441406, + "learning_rate": 4.949008289562019e-05, + "loss": 0.4666, + "step": 1932 + }, + { + "epoch": 0.10614709110867179, + "grad_norm": 2.236330032348633, + "learning_rate": 4.9489030233127044e-05, + "loss": 0.6299, + "step": 1934 + }, + { + "epoch": 0.10625686059275521, + "grad_norm": 3.3198797702789307, + "learning_rate": 4.948797649642484e-05, + "loss": 0.4603, + "step": 1936 + }, + { + "epoch": 0.10636663007683864, + "grad_norm": 1.7775179147720337, + "learning_rate": 4.948692168555978e-05, + "loss": 0.3887, + "step": 1938 + }, + { + "epoch": 0.10647639956092206, + "grad_norm": 1.451076626777649, + "learning_rate": 4.948586580057816e-05, + "loss": 0.4347, + "step": 1940 + }, + { + "epoch": 0.1065861690450055, + "grad_norm": 2.3668372631073, + "learning_rate": 4.948480884152628e-05, + "loss": 0.4549, + "step": 1942 + }, + { + "epoch": 0.10669593852908892, + "grad_norm": 2.471370220184326, + "learning_rate": 4.94837508084505e-05, + "loss": 0.4871, + "step": 1944 + }, + { + "epoch": 0.10680570801317234, + "grad_norm": 2.61167311668396, + "learning_rate": 4.948269170139724e-05, + "loss": 0.4157, + "step": 1946 + }, + { + "epoch": 0.10691547749725576, + "grad_norm": 2.9495723247528076, + "learning_rate": 4.948163152041295e-05, + "loss": 0.5081, + "step": 1948 + }, + { + "epoch": 0.10702524698133918, + "grad_norm": 1.8446156978607178, + "learning_rate": 4.9480570265544144e-05, + "loss": 0.559, + "step": 1950 + }, + { + "epoch": 0.10713501646542262, + "grad_norm": 2.031792163848877, + "learning_rate": 4.9479507936837364e-05, + "loss": 0.3206, + "step": 1952 + }, + { + "epoch": 0.10724478594950604, + "grad_norm": 2.182684898376465, + "learning_rate": 4.947844453433922e-05, + "loss": 0.5032, + "step": 1954 + }, + { + "epoch": 0.10735455543358946, + "grad_norm": 4.756406307220459, + "learning_rate": 4.9477380058096343e-05, + "loss": 0.432, + "step": 1956 + }, + { + "epoch": 0.10746432491767288, + "grad_norm": 1.8578556776046753, + "learning_rate": 4.947631450815544e-05, + "loss": 0.4234, + "step": 1958 + }, + { + "epoch": 0.10757409440175632, + "grad_norm": 2.6720492839813232, + "learning_rate": 4.947524788456325e-05, + "loss": 0.4983, + "step": 1960 + }, + { + "epoch": 0.10768386388583974, + "grad_norm": 2.448456287384033, + "learning_rate": 4.947418018736655e-05, + "loss": 0.5372, + "step": 1962 + }, + { + "epoch": 0.10779363336992316, + "grad_norm": 1.2243717908859253, + "learning_rate": 4.947311141661218e-05, + "loss": 0.3899, + "step": 1964 + }, + { + "epoch": 0.10790340285400658, + "grad_norm": 1.6027812957763672, + "learning_rate": 4.947204157234702e-05, + "loss": 0.4159, + "step": 1966 + }, + { + "epoch": 0.10801317233809, + "grad_norm": 3.1816442012786865, + "learning_rate": 4.947097065461801e-05, + "loss": 0.5276, + "step": 1968 + }, + { + "epoch": 0.10812294182217344, + "grad_norm": 2.3610422611236572, + "learning_rate": 4.9469898663472105e-05, + "loss": 0.4613, + "step": 1970 + }, + { + "epoch": 0.10823271130625686, + "grad_norm": 2.8902699947357178, + "learning_rate": 4.946882559895635e-05, + "loss": 0.4116, + "step": 1972 + }, + { + "epoch": 0.10834248079034028, + "grad_norm": 1.6339620351791382, + "learning_rate": 4.94677514611178e-05, + "loss": 0.5796, + "step": 1974 + }, + { + "epoch": 0.1084522502744237, + "grad_norm": 2.0883257389068604, + "learning_rate": 4.9466676250003576e-05, + "loss": 0.4449, + "step": 1976 + }, + { + "epoch": 0.10856201975850714, + "grad_norm": 1.8622233867645264, + "learning_rate": 4.946559996566083e-05, + "loss": 0.3934, + "step": 1978 + }, + { + "epoch": 0.10867178924259056, + "grad_norm": 2.068906545639038, + "learning_rate": 4.9464522608136805e-05, + "loss": 0.3589, + "step": 1980 + }, + { + "epoch": 0.10878155872667399, + "grad_norm": 1.8537366390228271, + "learning_rate": 4.946344417747874e-05, + "loss": 0.4437, + "step": 1982 + }, + { + "epoch": 0.10889132821075741, + "grad_norm": 2.081730365753174, + "learning_rate": 4.946236467373392e-05, + "loss": 0.3238, + "step": 1984 + }, + { + "epoch": 0.10900109769484083, + "grad_norm": 1.4451576471328735, + "learning_rate": 4.9461284096949734e-05, + "loss": 0.3083, + "step": 1986 + }, + { + "epoch": 0.10911086717892426, + "grad_norm": 2.0036568641662598, + "learning_rate": 4.946020244717355e-05, + "loss": 0.443, + "step": 1988 + }, + { + "epoch": 0.10922063666300769, + "grad_norm": 2.479008674621582, + "learning_rate": 4.945911972445284e-05, + "loss": 0.5587, + "step": 1990 + }, + { + "epoch": 0.10933040614709111, + "grad_norm": 1.3525069952011108, + "learning_rate": 4.945803592883509e-05, + "loss": 0.3573, + "step": 1992 + }, + { + "epoch": 0.10944017563117453, + "grad_norm": 1.8048330545425415, + "learning_rate": 4.945695106036783e-05, + "loss": 0.3792, + "step": 1994 + }, + { + "epoch": 0.10954994511525795, + "grad_norm": 3.1906509399414062, + "learning_rate": 4.945586511909865e-05, + "loss": 0.4465, + "step": 1996 + }, + { + "epoch": 0.10965971459934139, + "grad_norm": 1.6378921270370483, + "learning_rate": 4.9454778105075195e-05, + "loss": 0.5576, + "step": 1998 + }, + { + "epoch": 0.10976948408342481, + "grad_norm": 1.8467786312103271, + "learning_rate": 4.9453690018345144e-05, + "loss": 0.4402, + "step": 2000 + }, + { + "epoch": 0.10987925356750823, + "grad_norm": 2.1463005542755127, + "learning_rate": 4.9452600858956225e-05, + "loss": 0.4232, + "step": 2002 + }, + { + "epoch": 0.10998902305159165, + "grad_norm": 1.8095157146453857, + "learning_rate": 4.9451510626956196e-05, + "loss": 0.4856, + "step": 2004 + }, + { + "epoch": 0.11009879253567509, + "grad_norm": 1.7149986028671265, + "learning_rate": 4.945041932239292e-05, + "loss": 0.5091, + "step": 2006 + }, + { + "epoch": 0.11020856201975851, + "grad_norm": 1.1926326751708984, + "learning_rate": 4.944932694531422e-05, + "loss": 0.3112, + "step": 2008 + }, + { + "epoch": 0.11031833150384193, + "grad_norm": 1.8622310161590576, + "learning_rate": 4.944823349576805e-05, + "loss": 0.4727, + "step": 2010 + }, + { + "epoch": 0.11042810098792535, + "grad_norm": 2.2021963596343994, + "learning_rate": 4.944713897380235e-05, + "loss": 0.3337, + "step": 2012 + }, + { + "epoch": 0.11053787047200878, + "grad_norm": 1.8463306427001953, + "learning_rate": 4.9446043379465155e-05, + "loss": 0.3557, + "step": 2014 + }, + { + "epoch": 0.11064763995609221, + "grad_norm": 1.580902099609375, + "learning_rate": 4.9444946712804494e-05, + "loss": 0.5661, + "step": 2016 + }, + { + "epoch": 0.11075740944017563, + "grad_norm": 1.9106030464172363, + "learning_rate": 4.9443848973868495e-05, + "loss": 0.4287, + "step": 2018 + }, + { + "epoch": 0.11086717892425905, + "grad_norm": 1.4694464206695557, + "learning_rate": 4.9442750162705295e-05, + "loss": 0.327, + "step": 2020 + }, + { + "epoch": 0.11097694840834248, + "grad_norm": 2.5347683429718018, + "learning_rate": 4.9441650279363116e-05, + "loss": 0.4446, + "step": 2022 + }, + { + "epoch": 0.11108671789242591, + "grad_norm": 1.4557771682739258, + "learning_rate": 4.9440549323890176e-05, + "loss": 0.2781, + "step": 2024 + }, + { + "epoch": 0.11119648737650933, + "grad_norm": 2.126173496246338, + "learning_rate": 4.943944729633478e-05, + "loss": 0.3724, + "step": 2026 + }, + { + "epoch": 0.11130625686059276, + "grad_norm": 1.8603967428207397, + "learning_rate": 4.943834419674529e-05, + "loss": 0.4454, + "step": 2028 + }, + { + "epoch": 0.11141602634467618, + "grad_norm": 1.9281820058822632, + "learning_rate": 4.943724002517005e-05, + "loss": 0.394, + "step": 2030 + }, + { + "epoch": 0.1115257958287596, + "grad_norm": 2.422109603881836, + "learning_rate": 4.943613478165753e-05, + "loss": 0.3971, + "step": 2032 + }, + { + "epoch": 0.11163556531284304, + "grad_norm": 2.7411491870880127, + "learning_rate": 4.94350284662562e-05, + "loss": 0.3238, + "step": 2034 + }, + { + "epoch": 0.11174533479692646, + "grad_norm": 2.238548755645752, + "learning_rate": 4.943392107901458e-05, + "loss": 0.4078, + "step": 2036 + }, + { + "epoch": 0.11185510428100988, + "grad_norm": 2.7631819248199463, + "learning_rate": 4.943281261998125e-05, + "loss": 0.454, + "step": 2038 + }, + { + "epoch": 0.1119648737650933, + "grad_norm": 1.7849088907241821, + "learning_rate": 4.943170308920484e-05, + "loss": 0.5521, + "step": 2040 + }, + { + "epoch": 0.11207464324917672, + "grad_norm": 1.8946006298065186, + "learning_rate": 4.943059248673402e-05, + "loss": 0.4467, + "step": 2042 + }, + { + "epoch": 0.11218441273326016, + "grad_norm": 5.120302677154541, + "learning_rate": 4.942948081261749e-05, + "loss": 0.4872, + "step": 2044 + }, + { + "epoch": 0.11229418221734358, + "grad_norm": 1.979935884475708, + "learning_rate": 4.942836806690403e-05, + "loss": 0.4078, + "step": 2046 + }, + { + "epoch": 0.112403951701427, + "grad_norm": 3.8063747882843018, + "learning_rate": 4.9427254249642444e-05, + "loss": 0.6413, + "step": 2048 + }, + { + "epoch": 0.11251372118551042, + "grad_norm": 1.887818455696106, + "learning_rate": 4.94261393608816e-05, + "loss": 0.3693, + "step": 2050 + }, + { + "epoch": 0.11262349066959386, + "grad_norm": 2.3915693759918213, + "learning_rate": 4.942502340067038e-05, + "loss": 0.3957, + "step": 2052 + }, + { + "epoch": 0.11273326015367728, + "grad_norm": 1.782618761062622, + "learning_rate": 4.9423906369057747e-05, + "loss": 0.4857, + "step": 2054 + }, + { + "epoch": 0.1128430296377607, + "grad_norm": 2.2880051136016846, + "learning_rate": 4.9422788266092715e-05, + "loss": 0.4496, + "step": 2056 + }, + { + "epoch": 0.11295279912184412, + "grad_norm": 2.082711935043335, + "learning_rate": 4.9421669091824304e-05, + "loss": 0.4121, + "step": 2058 + }, + { + "epoch": 0.11306256860592755, + "grad_norm": 1.672890305519104, + "learning_rate": 4.942054884630162e-05, + "loss": 0.3824, + "step": 2060 + }, + { + "epoch": 0.11317233809001098, + "grad_norm": 2.4590983390808105, + "learning_rate": 4.9419427529573805e-05, + "loss": 0.457, + "step": 2062 + }, + { + "epoch": 0.1132821075740944, + "grad_norm": 2.362276077270508, + "learning_rate": 4.941830514169004e-05, + "loss": 0.4944, + "step": 2064 + }, + { + "epoch": 0.11339187705817783, + "grad_norm": 2.6345343589782715, + "learning_rate": 4.9417181682699556e-05, + "loss": 0.4977, + "step": 2066 + }, + { + "epoch": 0.11350164654226125, + "grad_norm": 1.7693243026733398, + "learning_rate": 4.941605715265164e-05, + "loss": 0.3601, + "step": 2068 + }, + { + "epoch": 0.11361141602634467, + "grad_norm": 5.068342685699463, + "learning_rate": 4.941493155159562e-05, + "loss": 0.3849, + "step": 2070 + }, + { + "epoch": 0.1137211855104281, + "grad_norm": 2.370790481567383, + "learning_rate": 4.941380487958086e-05, + "loss": 0.3215, + "step": 2072 + }, + { + "epoch": 0.11383095499451153, + "grad_norm": 2.8701391220092773, + "learning_rate": 4.9412677136656785e-05, + "loss": 0.4112, + "step": 2074 + }, + { + "epoch": 0.11394072447859495, + "grad_norm": 1.4719524383544922, + "learning_rate": 4.941154832287288e-05, + "loss": 0.3161, + "step": 2076 + }, + { + "epoch": 0.11405049396267837, + "grad_norm": 1.78270423412323, + "learning_rate": 4.941041843827863e-05, + "loss": 0.386, + "step": 2078 + }, + { + "epoch": 0.1141602634467618, + "grad_norm": 1.866724967956543, + "learning_rate": 4.940928748292363e-05, + "loss": 0.3696, + "step": 2080 + }, + { + "epoch": 0.11427003293084523, + "grad_norm": 1.9284876585006714, + "learning_rate": 4.9408155456857455e-05, + "loss": 0.3713, + "step": 2082 + }, + { + "epoch": 0.11437980241492865, + "grad_norm": 1.5542187690734863, + "learning_rate": 4.9407022360129796e-05, + "loss": 0.4444, + "step": 2084 + }, + { + "epoch": 0.11448957189901207, + "grad_norm": 2.6232340335845947, + "learning_rate": 4.940588819279033e-05, + "loss": 0.4805, + "step": 2086 + }, + { + "epoch": 0.1145993413830955, + "grad_norm": 2.8205795288085938, + "learning_rate": 4.9404752954888824e-05, + "loss": 0.4936, + "step": 2088 + }, + { + "epoch": 0.11470911086717893, + "grad_norm": 1.3625234365463257, + "learning_rate": 4.940361664647506e-05, + "loss": 0.562, + "step": 2090 + }, + { + "epoch": 0.11481888035126235, + "grad_norm": 1.8624225854873657, + "learning_rate": 4.9402479267598887e-05, + "loss": 0.3826, + "step": 2092 + }, + { + "epoch": 0.11492864983534577, + "grad_norm": 1.8893054723739624, + "learning_rate": 4.940134081831021e-05, + "loss": 0.3802, + "step": 2094 + }, + { + "epoch": 0.1150384193194292, + "grad_norm": 2.102720022201538, + "learning_rate": 4.940020129865895e-05, + "loss": 0.5913, + "step": 2096 + }, + { + "epoch": 0.11514818880351263, + "grad_norm": 3.342316150665283, + "learning_rate": 4.93990607086951e-05, + "loss": 0.3261, + "step": 2098 + }, + { + "epoch": 0.11525795828759605, + "grad_norm": 1.7942569255828857, + "learning_rate": 4.939791904846869e-05, + "loss": 0.4424, + "step": 2100 + }, + { + "epoch": 0.11536772777167947, + "grad_norm": 1.2204077243804932, + "learning_rate": 4.939677631802979e-05, + "loss": 0.3792, + "step": 2102 + }, + { + "epoch": 0.1154774972557629, + "grad_norm": 3.479297399520874, + "learning_rate": 4.939563251742855e-05, + "loss": 0.4106, + "step": 2104 + }, + { + "epoch": 0.11558726673984632, + "grad_norm": 1.8564677238464355, + "learning_rate": 4.939448764671512e-05, + "loss": 0.3371, + "step": 2106 + }, + { + "epoch": 0.11569703622392975, + "grad_norm": 2.380967617034912, + "learning_rate": 4.939334170593972e-05, + "loss": 0.4367, + "step": 2108 + }, + { + "epoch": 0.11580680570801317, + "grad_norm": 1.5568088293075562, + "learning_rate": 4.939219469515263e-05, + "loss": 0.3594, + "step": 2110 + }, + { + "epoch": 0.1159165751920966, + "grad_norm": 2.902057647705078, + "learning_rate": 4.939104661440415e-05, + "loss": 0.3351, + "step": 2112 + }, + { + "epoch": 0.11602634467618002, + "grad_norm": 3.271711587905884, + "learning_rate": 4.938989746374465e-05, + "loss": 0.4725, + "step": 2114 + }, + { + "epoch": 0.11613611416026344, + "grad_norm": 2.176306962966919, + "learning_rate": 4.938874724322454e-05, + "loss": 0.4825, + "step": 2116 + }, + { + "epoch": 0.11624588364434688, + "grad_norm": 2.5621156692504883, + "learning_rate": 4.938759595289426e-05, + "loss": 0.3919, + "step": 2118 + }, + { + "epoch": 0.1163556531284303, + "grad_norm": 2.537994384765625, + "learning_rate": 4.938644359280433e-05, + "loss": 0.6113, + "step": 2120 + }, + { + "epoch": 0.11646542261251372, + "grad_norm": 1.4369235038757324, + "learning_rate": 4.938529016300528e-05, + "loss": 0.3258, + "step": 2122 + }, + { + "epoch": 0.11657519209659714, + "grad_norm": 3.380106210708618, + "learning_rate": 4.938413566354772e-05, + "loss": 0.6367, + "step": 2124 + }, + { + "epoch": 0.11668496158068058, + "grad_norm": 2.2420923709869385, + "learning_rate": 4.938298009448229e-05, + "loss": 0.6045, + "step": 2126 + }, + { + "epoch": 0.116794731064764, + "grad_norm": 2.559889078140259, + "learning_rate": 4.938182345585966e-05, + "loss": 0.4717, + "step": 2128 + }, + { + "epoch": 0.11690450054884742, + "grad_norm": 1.7486873865127563, + "learning_rate": 4.938066574773059e-05, + "loss": 0.4678, + "step": 2130 + }, + { + "epoch": 0.11701427003293084, + "grad_norm": 1.4853971004486084, + "learning_rate": 4.937950697014585e-05, + "loss": 0.3651, + "step": 2132 + }, + { + "epoch": 0.11712403951701426, + "grad_norm": 1.9780151844024658, + "learning_rate": 4.937834712315627e-05, + "loss": 0.4835, + "step": 2134 + }, + { + "epoch": 0.1172338090010977, + "grad_norm": 0.9474751949310303, + "learning_rate": 4.937718620681273e-05, + "loss": 0.3942, + "step": 2136 + }, + { + "epoch": 0.11734357848518112, + "grad_norm": 1.2867741584777832, + "learning_rate": 4.937602422116616e-05, + "loss": 0.4383, + "step": 2138 + }, + { + "epoch": 0.11745334796926454, + "grad_norm": 2.5439412593841553, + "learning_rate": 4.937486116626752e-05, + "loss": 0.5758, + "step": 2140 + }, + { + "epoch": 0.11756311745334797, + "grad_norm": 2.4157605171203613, + "learning_rate": 4.9373697042167824e-05, + "loss": 0.5299, + "step": 2142 + }, + { + "epoch": 0.1176728869374314, + "grad_norm": 1.903255820274353, + "learning_rate": 4.9372531848918145e-05, + "loss": 0.4703, + "step": 2144 + }, + { + "epoch": 0.11778265642151482, + "grad_norm": 2.171814203262329, + "learning_rate": 4.9371365586569595e-05, + "loss": 0.4748, + "step": 2146 + }, + { + "epoch": 0.11789242590559824, + "grad_norm": 1.4433095455169678, + "learning_rate": 4.937019825517333e-05, + "loss": 0.2609, + "step": 2148 + }, + { + "epoch": 0.11800219538968167, + "grad_norm": 2.953599452972412, + "learning_rate": 4.936902985478055e-05, + "loss": 0.4559, + "step": 2150 + }, + { + "epoch": 0.11811196487376509, + "grad_norm": 2.5545365810394287, + "learning_rate": 4.936786038544251e-05, + "loss": 0.446, + "step": 2152 + }, + { + "epoch": 0.11822173435784852, + "grad_norm": 1.3747881650924683, + "learning_rate": 4.9366689847210505e-05, + "loss": 0.2834, + "step": 2154 + }, + { + "epoch": 0.11833150384193195, + "grad_norm": 2.4246153831481934, + "learning_rate": 4.936551824013589e-05, + "loss": 0.4228, + "step": 2156 + }, + { + "epoch": 0.11844127332601537, + "grad_norm": 1.2890535593032837, + "learning_rate": 4.9364345564270053e-05, + "loss": 0.4867, + "step": 2158 + }, + { + "epoch": 0.11855104281009879, + "grad_norm": 2.0845391750335693, + "learning_rate": 4.9363171819664434e-05, + "loss": 0.3288, + "step": 2160 + }, + { + "epoch": 0.11866081229418221, + "grad_norm": 2.8480303287506104, + "learning_rate": 4.9361997006370505e-05, + "loss": 0.4809, + "step": 2162 + }, + { + "epoch": 0.11877058177826565, + "grad_norm": 1.7564014196395874, + "learning_rate": 4.936082112443983e-05, + "loss": 0.4746, + "step": 2164 + }, + { + "epoch": 0.11888035126234907, + "grad_norm": 1.4413477182388306, + "learning_rate": 4.935964417392396e-05, + "loss": 0.4528, + "step": 2166 + }, + { + "epoch": 0.11899012074643249, + "grad_norm": 3.2888224124908447, + "learning_rate": 4.935846615487453e-05, + "loss": 0.444, + "step": 2168 + }, + { + "epoch": 0.11909989023051591, + "grad_norm": 1.3139119148254395, + "learning_rate": 4.935728706734322e-05, + "loss": 0.3609, + "step": 2170 + }, + { + "epoch": 0.11920965971459935, + "grad_norm": 1.2717046737670898, + "learning_rate": 4.935610691138175e-05, + "loss": 0.2741, + "step": 2172 + }, + { + "epoch": 0.11931942919868277, + "grad_norm": 2.6246912479400635, + "learning_rate": 4.9354925687041873e-05, + "loss": 0.5006, + "step": 2174 + }, + { + "epoch": 0.11942919868276619, + "grad_norm": 3.4331774711608887, + "learning_rate": 4.935374339437543e-05, + "loss": 0.4284, + "step": 2176 + }, + { + "epoch": 0.11953896816684961, + "grad_norm": 2.0616726875305176, + "learning_rate": 4.935256003343426e-05, + "loss": 0.3548, + "step": 2178 + }, + { + "epoch": 0.11964873765093303, + "grad_norm": 2.600053071975708, + "learning_rate": 4.935137560427027e-05, + "loss": 0.483, + "step": 2180 + }, + { + "epoch": 0.11975850713501647, + "grad_norm": 1.5614051818847656, + "learning_rate": 4.935019010693543e-05, + "loss": 0.4462, + "step": 2182 + }, + { + "epoch": 0.11986827661909989, + "grad_norm": 2.6832430362701416, + "learning_rate": 4.934900354148173e-05, + "loss": 0.5004, + "step": 2184 + }, + { + "epoch": 0.11997804610318331, + "grad_norm": 1.7385988235473633, + "learning_rate": 4.934781590796122e-05, + "loss": 0.4338, + "step": 2186 + }, + { + "epoch": 0.12008781558726674, + "grad_norm": 2.555136203765869, + "learning_rate": 4.934662720642601e-05, + "loss": 0.3855, + "step": 2188 + }, + { + "epoch": 0.12019758507135017, + "grad_norm": 1.436065912246704, + "learning_rate": 4.934543743692822e-05, + "loss": 0.4627, + "step": 2190 + }, + { + "epoch": 0.1203073545554336, + "grad_norm": 1.8139227628707886, + "learning_rate": 4.934424659952006e-05, + "loss": 0.4323, + "step": 2192 + }, + { + "epoch": 0.12041712403951702, + "grad_norm": 1.3282554149627686, + "learning_rate": 4.934305469425374e-05, + "loss": 0.4151, + "step": 2194 + }, + { + "epoch": 0.12052689352360044, + "grad_norm": 2.8062307834625244, + "learning_rate": 4.934186172118157e-05, + "loss": 0.4236, + "step": 2196 + }, + { + "epoch": 0.12063666300768386, + "grad_norm": 2.075378894805908, + "learning_rate": 4.934066768035587e-05, + "loss": 0.3338, + "step": 2198 + }, + { + "epoch": 0.1207464324917673, + "grad_norm": 2.159003496170044, + "learning_rate": 4.933947257182901e-05, + "loss": 0.4679, + "step": 2200 + }, + { + "epoch": 0.12085620197585072, + "grad_norm": 1.6575664281845093, + "learning_rate": 4.9338276395653416e-05, + "loss": 0.4243, + "step": 2202 + }, + { + "epoch": 0.12096597145993414, + "grad_norm": 2.5847558975219727, + "learning_rate": 4.933707915188156e-05, + "loss": 0.5315, + "step": 2204 + }, + { + "epoch": 0.12107574094401756, + "grad_norm": 2.372537851333618, + "learning_rate": 4.933588084056596e-05, + "loss": 0.5038, + "step": 2206 + }, + { + "epoch": 0.12118551042810098, + "grad_norm": 1.7261955738067627, + "learning_rate": 4.933468146175918e-05, + "loss": 0.595, + "step": 2208 + }, + { + "epoch": 0.12129527991218442, + "grad_norm": 2.596900224685669, + "learning_rate": 4.933348101551383e-05, + "loss": 0.3753, + "step": 2210 + }, + { + "epoch": 0.12140504939626784, + "grad_norm": 1.6070046424865723, + "learning_rate": 4.9332279501882564e-05, + "loss": 0.3959, + "step": 2212 + }, + { + "epoch": 0.12151481888035126, + "grad_norm": 3.0740299224853516, + "learning_rate": 4.9331076920918093e-05, + "loss": 0.4514, + "step": 2214 + }, + { + "epoch": 0.12162458836443468, + "grad_norm": 3.094674587249756, + "learning_rate": 4.932987327267316e-05, + "loss": 0.5484, + "step": 2216 + }, + { + "epoch": 0.12173435784851812, + "grad_norm": 2.28470516204834, + "learning_rate": 4.932866855720057e-05, + "loss": 0.3751, + "step": 2218 + }, + { + "epoch": 0.12184412733260154, + "grad_norm": 4.48421573638916, + "learning_rate": 4.9327462774553166e-05, + "loss": 0.5142, + "step": 2220 + }, + { + "epoch": 0.12195389681668496, + "grad_norm": 2.827080488204956, + "learning_rate": 4.9326255924783835e-05, + "loss": 0.4077, + "step": 2222 + }, + { + "epoch": 0.12206366630076838, + "grad_norm": 1.506633996963501, + "learning_rate": 4.9325048007945526e-05, + "loss": 0.2931, + "step": 2224 + }, + { + "epoch": 0.1221734357848518, + "grad_norm": 2.2288715839385986, + "learning_rate": 4.932383902409121e-05, + "loss": 0.5474, + "step": 2226 + }, + { + "epoch": 0.12228320526893524, + "grad_norm": 2.7006261348724365, + "learning_rate": 4.932262897327393e-05, + "loss": 0.4266, + "step": 2228 + }, + { + "epoch": 0.12239297475301866, + "grad_norm": 1.5879645347595215, + "learning_rate": 4.932141785554676e-05, + "loss": 0.496, + "step": 2230 + }, + { + "epoch": 0.12250274423710208, + "grad_norm": 1.6000926494598389, + "learning_rate": 4.9320205670962814e-05, + "loss": 0.4639, + "step": 2232 + }, + { + "epoch": 0.1226125137211855, + "grad_norm": 2.14624285697937, + "learning_rate": 4.9318992419575295e-05, + "loss": 0.497, + "step": 2234 + }, + { + "epoch": 0.12272228320526893, + "grad_norm": 2.22786545753479, + "learning_rate": 4.93177781014374e-05, + "loss": 0.3937, + "step": 2236 + }, + { + "epoch": 0.12283205268935236, + "grad_norm": 2.0973148345947266, + "learning_rate": 4.9316562716602387e-05, + "loss": 0.4052, + "step": 2238 + }, + { + "epoch": 0.12294182217343579, + "grad_norm": 1.9583170413970947, + "learning_rate": 4.9315346265123594e-05, + "loss": 0.4823, + "step": 2240 + }, + { + "epoch": 0.12305159165751921, + "grad_norm": 2.1264760494232178, + "learning_rate": 4.9314128747054355e-05, + "loss": 0.3603, + "step": 2242 + }, + { + "epoch": 0.12316136114160263, + "grad_norm": 1.6149016618728638, + "learning_rate": 4.93129101624481e-05, + "loss": 0.5182, + "step": 2244 + }, + { + "epoch": 0.12327113062568607, + "grad_norm": 2.166541337966919, + "learning_rate": 4.9311690511358266e-05, + "loss": 0.2602, + "step": 2246 + }, + { + "epoch": 0.12338090010976949, + "grad_norm": 2.5931084156036377, + "learning_rate": 4.931046979383835e-05, + "loss": 0.3924, + "step": 2248 + }, + { + "epoch": 0.12349066959385291, + "grad_norm": 1.722564458847046, + "learning_rate": 4.9309248009941914e-05, + "loss": 0.3376, + "step": 2250 + }, + { + "epoch": 0.12360043907793633, + "grad_norm": 5.468147277832031, + "learning_rate": 4.930802515972255e-05, + "loss": 0.4788, + "step": 2252 + }, + { + "epoch": 0.12371020856201975, + "grad_norm": 2.215975761413574, + "learning_rate": 4.930680124323388e-05, + "loss": 0.6563, + "step": 2254 + }, + { + "epoch": 0.12381997804610319, + "grad_norm": 1.8800575733184814, + "learning_rate": 4.9305576260529607e-05, + "loss": 0.583, + "step": 2256 + }, + { + "epoch": 0.12392974753018661, + "grad_norm": 1.8931972980499268, + "learning_rate": 4.930435021166346e-05, + "loss": 0.414, + "step": 2258 + }, + { + "epoch": 0.12403951701427003, + "grad_norm": 1.493498682975769, + "learning_rate": 4.930312309668922e-05, + "loss": 0.403, + "step": 2260 + }, + { + "epoch": 0.12414928649835345, + "grad_norm": 2.572075843811035, + "learning_rate": 4.9301894915660715e-05, + "loss": 0.5168, + "step": 2262 + }, + { + "epoch": 0.12425905598243689, + "grad_norm": 1.8005114793777466, + "learning_rate": 4.930066566863182e-05, + "loss": 0.4724, + "step": 2264 + }, + { + "epoch": 0.12436882546652031, + "grad_norm": 1.9123156070709229, + "learning_rate": 4.929943535565645e-05, + "loss": 0.4401, + "step": 2266 + }, + { + "epoch": 0.12447859495060373, + "grad_norm": 2.167553424835205, + "learning_rate": 4.929820397678858e-05, + "loss": 0.5312, + "step": 2268 + }, + { + "epoch": 0.12458836443468715, + "grad_norm": 3.724436044692993, + "learning_rate": 4.929697153208222e-05, + "loss": 0.4227, + "step": 2270 + }, + { + "epoch": 0.12469813391877058, + "grad_norm": 3.6483817100524902, + "learning_rate": 4.929573802159143e-05, + "loss": 0.4938, + "step": 2272 + }, + { + "epoch": 0.12480790340285401, + "grad_norm": 1.2744888067245483, + "learning_rate": 4.929450344537032e-05, + "loss": 0.3824, + "step": 2274 + }, + { + "epoch": 0.12491767288693743, + "grad_norm": 2.2032430171966553, + "learning_rate": 4.9293267803473046e-05, + "loss": 0.3367, + "step": 2276 + }, + { + "epoch": 0.12502744237102087, + "grad_norm": 2.6989798545837402, + "learning_rate": 4.92920310959538e-05, + "loss": 0.3879, + "step": 2278 + }, + { + "epoch": 0.1251372118551043, + "grad_norm": 1.8238331079483032, + "learning_rate": 4.929079332286685e-05, + "loss": 0.4262, + "step": 2280 + }, + { + "epoch": 0.1252469813391877, + "grad_norm": 2.450435161590576, + "learning_rate": 4.9289554484266474e-05, + "loss": 0.5157, + "step": 2282 + }, + { + "epoch": 0.12535675082327113, + "grad_norm": 2.324441432952881, + "learning_rate": 4.928831458020702e-05, + "loss": 0.3576, + "step": 2284 + }, + { + "epoch": 0.12546652030735456, + "grad_norm": 2.433436870574951, + "learning_rate": 4.928707361074287e-05, + "loss": 0.4286, + "step": 2286 + }, + { + "epoch": 0.12557628979143798, + "grad_norm": 3.4190268516540527, + "learning_rate": 4.9285831575928465e-05, + "loss": 0.3973, + "step": 2288 + }, + { + "epoch": 0.1256860592755214, + "grad_norm": 3.0218052864074707, + "learning_rate": 4.928458847581829e-05, + "loss": 0.4698, + "step": 2290 + }, + { + "epoch": 0.12579582875960482, + "grad_norm": 2.6072490215301514, + "learning_rate": 4.928334431046686e-05, + "loss": 0.4638, + "step": 2292 + }, + { + "epoch": 0.12590559824368824, + "grad_norm": 3.1420516967773438, + "learning_rate": 4.9282099079928764e-05, + "loss": 0.4214, + "step": 2294 + }, + { + "epoch": 0.1260153677277717, + "grad_norm": 1.7873505353927612, + "learning_rate": 4.9280852784258624e-05, + "loss": 0.4202, + "step": 2296 + }, + { + "epoch": 0.12612513721185512, + "grad_norm": 3.5895750522613525, + "learning_rate": 4.9279605423511095e-05, + "loss": 0.4122, + "step": 2298 + }, + { + "epoch": 0.12623490669593854, + "grad_norm": 1.6021192073822021, + "learning_rate": 4.9278356997740904e-05, + "loss": 0.3765, + "step": 2300 + }, + { + "epoch": 0.12634467618002196, + "grad_norm": 2.3041181564331055, + "learning_rate": 4.927710750700281e-05, + "loss": 0.3911, + "step": 2302 + }, + { + "epoch": 0.12645444566410538, + "grad_norm": 1.8278930187225342, + "learning_rate": 4.927585695135162e-05, + "loss": 0.4546, + "step": 2304 + }, + { + "epoch": 0.1265642151481888, + "grad_norm": 1.9765435457229614, + "learning_rate": 4.92746053308422e-05, + "loss": 0.4711, + "step": 2306 + }, + { + "epoch": 0.12667398463227222, + "grad_norm": 3.042189598083496, + "learning_rate": 4.927335264552943e-05, + "loss": 0.4143, + "step": 2308 + }, + { + "epoch": 0.12678375411635565, + "grad_norm": 1.8807296752929688, + "learning_rate": 4.9272098895468277e-05, + "loss": 0.3495, + "step": 2310 + }, + { + "epoch": 0.12689352360043907, + "grad_norm": 1.949692726135254, + "learning_rate": 4.927084408071373e-05, + "loss": 0.3423, + "step": 2312 + }, + { + "epoch": 0.1270032930845225, + "grad_norm": 1.242095708847046, + "learning_rate": 4.926958820132084e-05, + "loss": 0.502, + "step": 2314 + }, + { + "epoch": 0.12711306256860594, + "grad_norm": 2.138465404510498, + "learning_rate": 4.9268331257344685e-05, + "loss": 0.3644, + "step": 2316 + }, + { + "epoch": 0.12722283205268936, + "grad_norm": 2.2460014820098877, + "learning_rate": 4.9267073248840405e-05, + "loss": 0.3396, + "step": 2318 + }, + { + "epoch": 0.12733260153677278, + "grad_norm": 1.3262128829956055, + "learning_rate": 4.9265814175863186e-05, + "loss": 0.3799, + "step": 2320 + }, + { + "epoch": 0.1274423710208562, + "grad_norm": 2.010472059249878, + "learning_rate": 4.926455403846825e-05, + "loss": 0.3804, + "step": 2322 + }, + { + "epoch": 0.12755214050493963, + "grad_norm": 2.3595287799835205, + "learning_rate": 4.926329283671088e-05, + "loss": 0.6012, + "step": 2324 + }, + { + "epoch": 0.12766190998902305, + "grad_norm": 2.7946531772613525, + "learning_rate": 4.926203057064639e-05, + "loss": 0.4546, + "step": 2326 + }, + { + "epoch": 0.12777167947310647, + "grad_norm": 2.3963847160339355, + "learning_rate": 4.926076724033016e-05, + "loss": 0.4285, + "step": 2328 + }, + { + "epoch": 0.1278814489571899, + "grad_norm": 2.394993782043457, + "learning_rate": 4.9259502845817594e-05, + "loss": 0.4483, + "step": 2330 + }, + { + "epoch": 0.1279912184412733, + "grad_norm": 2.8583664894104004, + "learning_rate": 4.925823738716416e-05, + "loss": 0.4908, + "step": 2332 + }, + { + "epoch": 0.12810098792535676, + "grad_norm": 3.6495726108551025, + "learning_rate": 4.925697086442537e-05, + "loss": 0.421, + "step": 2334 + }, + { + "epoch": 0.12821075740944018, + "grad_norm": 1.5393778085708618, + "learning_rate": 4.925570327765678e-05, + "loss": 0.3105, + "step": 2336 + }, + { + "epoch": 0.1283205268935236, + "grad_norm": 1.3255722522735596, + "learning_rate": 4.9254434626913994e-05, + "loss": 0.3957, + "step": 2338 + }, + { + "epoch": 0.12843029637760703, + "grad_norm": 3.2047102451324463, + "learning_rate": 4.925316491225265e-05, + "loss": 0.459, + "step": 2340 + }, + { + "epoch": 0.12854006586169045, + "grad_norm": 1.6838712692260742, + "learning_rate": 4.925189413372845e-05, + "loss": 0.4415, + "step": 2342 + }, + { + "epoch": 0.12864983534577387, + "grad_norm": 1.7940456867218018, + "learning_rate": 4.925062229139714e-05, + "loss": 0.3992, + "step": 2344 + }, + { + "epoch": 0.1287596048298573, + "grad_norm": 1.8221032619476318, + "learning_rate": 4.924934938531451e-05, + "loss": 0.3342, + "step": 2346 + }, + { + "epoch": 0.12886937431394072, + "grad_norm": 1.7127752304077148, + "learning_rate": 4.924807541553639e-05, + "loss": 0.3812, + "step": 2348 + }, + { + "epoch": 0.12897914379802414, + "grad_norm": 0.9945545792579651, + "learning_rate": 4.924680038211867e-05, + "loss": 0.4628, + "step": 2350 + }, + { + "epoch": 0.1290889132821076, + "grad_norm": 2.8047237396240234, + "learning_rate": 4.9245524285117274e-05, + "loss": 0.3305, + "step": 2352 + }, + { + "epoch": 0.129198682766191, + "grad_norm": 1.738644003868103, + "learning_rate": 4.924424712458818e-05, + "loss": 0.3659, + "step": 2354 + }, + { + "epoch": 0.12930845225027443, + "grad_norm": 1.4172710180282593, + "learning_rate": 4.924296890058741e-05, + "loss": 0.2378, + "step": 2356 + }, + { + "epoch": 0.12941822173435785, + "grad_norm": 1.8157973289489746, + "learning_rate": 4.924168961317103e-05, + "loss": 0.3773, + "step": 2358 + }, + { + "epoch": 0.12952799121844127, + "grad_norm": 3.919949531555176, + "learning_rate": 4.924040926239515e-05, + "loss": 0.4571, + "step": 2360 + }, + { + "epoch": 0.1296377607025247, + "grad_norm": 2.8389244079589844, + "learning_rate": 4.9239127848315946e-05, + "loss": 0.3925, + "step": 2362 + }, + { + "epoch": 0.12974753018660812, + "grad_norm": 1.5892261266708374, + "learning_rate": 4.923784537098963e-05, + "loss": 0.3594, + "step": 2364 + }, + { + "epoch": 0.12985729967069154, + "grad_norm": 1.32912278175354, + "learning_rate": 4.9236561830472446e-05, + "loss": 0.3285, + "step": 2366 + }, + { + "epoch": 0.12996706915477496, + "grad_norm": 2.445984363555908, + "learning_rate": 4.9235277226820695e-05, + "loss": 0.4836, + "step": 2368 + }, + { + "epoch": 0.1300768386388584, + "grad_norm": 2.231541633605957, + "learning_rate": 4.923399156009073e-05, + "loss": 0.4954, + "step": 2370 + }, + { + "epoch": 0.13018660812294183, + "grad_norm": 2.292766809463501, + "learning_rate": 4.923270483033896e-05, + "loss": 0.626, + "step": 2372 + }, + { + "epoch": 0.13029637760702525, + "grad_norm": 4.32710599899292, + "learning_rate": 4.9231417037621806e-05, + "loss": 0.6211, + "step": 2374 + }, + { + "epoch": 0.13040614709110868, + "grad_norm": 2.7705001831054688, + "learning_rate": 4.923012818199576e-05, + "loss": 0.4019, + "step": 2376 + }, + { + "epoch": 0.1305159165751921, + "grad_norm": 2.813234567642212, + "learning_rate": 4.9228838263517374e-05, + "loss": 0.372, + "step": 2378 + }, + { + "epoch": 0.13062568605927552, + "grad_norm": 1.4007803201675415, + "learning_rate": 4.9227547282243214e-05, + "loss": 0.3215, + "step": 2380 + }, + { + "epoch": 0.13073545554335894, + "grad_norm": 1.9918622970581055, + "learning_rate": 4.922625523822992e-05, + "loss": 0.3549, + "step": 2382 + }, + { + "epoch": 0.13084522502744236, + "grad_norm": 1.319931149482727, + "learning_rate": 4.922496213153416e-05, + "loss": 0.3662, + "step": 2384 + }, + { + "epoch": 0.13095499451152579, + "grad_norm": 1.8870978355407715, + "learning_rate": 4.922366796221265e-05, + "loss": 0.3525, + "step": 2386 + }, + { + "epoch": 0.1310647639956092, + "grad_norm": 1.8441921472549438, + "learning_rate": 4.9222372730322176e-05, + "loss": 0.7852, + "step": 2388 + }, + { + "epoch": 0.13117453347969266, + "grad_norm": 2.9106884002685547, + "learning_rate": 4.9221076435919546e-05, + "loss": 0.5173, + "step": 2390 + }, + { + "epoch": 0.13128430296377608, + "grad_norm": 2.7179408073425293, + "learning_rate": 4.92197790790616e-05, + "loss": 0.4981, + "step": 2392 + }, + { + "epoch": 0.1313940724478595, + "grad_norm": 1.667979121208191, + "learning_rate": 4.921848065980529e-05, + "loss": 0.3907, + "step": 2394 + }, + { + "epoch": 0.13150384193194292, + "grad_norm": 1.7973731756210327, + "learning_rate": 4.9217181178207535e-05, + "loss": 0.4248, + "step": 2396 + }, + { + "epoch": 0.13161361141602634, + "grad_norm": 2.061933755874634, + "learning_rate": 4.921588063432535e-05, + "loss": 0.5187, + "step": 2398 + }, + { + "epoch": 0.13172338090010977, + "grad_norm": 1.1178492307662964, + "learning_rate": 4.9214579028215776e-05, + "loss": 0.3305, + "step": 2400 + }, + { + "epoch": 0.1318331503841932, + "grad_norm": 1.6705479621887207, + "learning_rate": 4.9213276359935924e-05, + "loss": 0.3694, + "step": 2402 + }, + { + "epoch": 0.1319429198682766, + "grad_norm": 2.2592437267303467, + "learning_rate": 4.9211972629542926e-05, + "loss": 0.38, + "step": 2404 + }, + { + "epoch": 0.13205268935236003, + "grad_norm": 1.9933950901031494, + "learning_rate": 4.921066783709396e-05, + "loss": 0.3158, + "step": 2406 + }, + { + "epoch": 0.13216245883644348, + "grad_norm": 1.9043179750442505, + "learning_rate": 4.920936198264627e-05, + "loss": 0.343, + "step": 2408 + }, + { + "epoch": 0.1322722283205269, + "grad_norm": 2.170167922973633, + "learning_rate": 4.9208055066257144e-05, + "loss": 0.4062, + "step": 2410 + }, + { + "epoch": 0.13238199780461032, + "grad_norm": 1.5694773197174072, + "learning_rate": 4.9206747087983894e-05, + "loss": 0.4636, + "step": 2412 + }, + { + "epoch": 0.13249176728869375, + "grad_norm": 2.790281295776367, + "learning_rate": 4.920543804788391e-05, + "loss": 0.392, + "step": 2414 + }, + { + "epoch": 0.13260153677277717, + "grad_norm": 2.237722635269165, + "learning_rate": 4.920412794601461e-05, + "loss": 0.4428, + "step": 2416 + }, + { + "epoch": 0.1327113062568606, + "grad_norm": 3.1681368350982666, + "learning_rate": 4.920281678243345e-05, + "loss": 0.4913, + "step": 2418 + }, + { + "epoch": 0.132821075740944, + "grad_norm": 1.5961679220199585, + "learning_rate": 4.9201504557197955e-05, + "loss": 0.3885, + "step": 2420 + }, + { + "epoch": 0.13293084522502743, + "grad_norm": 1.8354734182357788, + "learning_rate": 4.920019127036567e-05, + "loss": 0.5389, + "step": 2422 + }, + { + "epoch": 0.13304061470911085, + "grad_norm": 1.0047056674957275, + "learning_rate": 4.919887692199423e-05, + "loss": 0.2655, + "step": 2424 + }, + { + "epoch": 0.1331503841931943, + "grad_norm": 2.4289603233337402, + "learning_rate": 4.9197561512141265e-05, + "loss": 0.3349, + "step": 2426 + }, + { + "epoch": 0.13326015367727773, + "grad_norm": 3.3829703330993652, + "learning_rate": 4.9196245040864486e-05, + "loss": 0.5458, + "step": 2428 + }, + { + "epoch": 0.13336992316136115, + "grad_norm": 3.2188241481781006, + "learning_rate": 4.919492750822164e-05, + "loss": 0.496, + "step": 2430 + }, + { + "epoch": 0.13347969264544457, + "grad_norm": 9.273036003112793, + "learning_rate": 4.9193608914270515e-05, + "loss": 0.4393, + "step": 2432 + }, + { + "epoch": 0.133589462129528, + "grad_norm": 2.3505423069000244, + "learning_rate": 4.9192289259068954e-05, + "loss": 0.5238, + "step": 2434 + }, + { + "epoch": 0.1336992316136114, + "grad_norm": 1.9848664999008179, + "learning_rate": 4.919096854267484e-05, + "loss": 0.4323, + "step": 2436 + }, + { + "epoch": 0.13380900109769484, + "grad_norm": 2.1456122398376465, + "learning_rate": 4.918964676514611e-05, + "loss": 0.3139, + "step": 2438 + }, + { + "epoch": 0.13391877058177826, + "grad_norm": 4.189302921295166, + "learning_rate": 4.9188323926540746e-05, + "loss": 0.5089, + "step": 2440 + }, + { + "epoch": 0.13402854006586168, + "grad_norm": 2.5437073707580566, + "learning_rate": 4.918700002691677e-05, + "loss": 0.4566, + "step": 2442 + }, + { + "epoch": 0.13413830954994513, + "grad_norm": 3.222627639770508, + "learning_rate": 4.918567506633226e-05, + "loss": 0.5521, + "step": 2444 + }, + { + "epoch": 0.13424807903402855, + "grad_norm": 3.5579702854156494, + "learning_rate": 4.9184349044845336e-05, + "loss": 0.5087, + "step": 2446 + }, + { + "epoch": 0.13435784851811197, + "grad_norm": 1.6745498180389404, + "learning_rate": 4.918302196251415e-05, + "loss": 0.6118, + "step": 2448 + }, + { + "epoch": 0.1344676180021954, + "grad_norm": 4.697577953338623, + "learning_rate": 4.918169381939692e-05, + "loss": 0.4126, + "step": 2450 + }, + { + "epoch": 0.13457738748627882, + "grad_norm": 2.6539642810821533, + "learning_rate": 4.918036461555192e-05, + "loss": 0.3277, + "step": 2452 + }, + { + "epoch": 0.13468715697036224, + "grad_norm": 1.9520505666732788, + "learning_rate": 4.9179034351037434e-05, + "loss": 0.561, + "step": 2454 + }, + { + "epoch": 0.13479692645444566, + "grad_norm": 1.0967049598693848, + "learning_rate": 4.9177703025911825e-05, + "loss": 0.3212, + "step": 2456 + }, + { + "epoch": 0.13490669593852908, + "grad_norm": 2.7454655170440674, + "learning_rate": 4.9176370640233496e-05, + "loss": 0.3431, + "step": 2458 + }, + { + "epoch": 0.1350164654226125, + "grad_norm": 2.6118767261505127, + "learning_rate": 4.917503719406088e-05, + "loss": 0.3961, + "step": 2460 + }, + { + "epoch": 0.13512623490669595, + "grad_norm": 2.151808977127075, + "learning_rate": 4.9173702687452474e-05, + "loss": 0.5565, + "step": 2462 + }, + { + "epoch": 0.13523600439077937, + "grad_norm": 2.3569324016571045, + "learning_rate": 4.917236712046682e-05, + "loss": 0.3931, + "step": 2464 + }, + { + "epoch": 0.1353457738748628, + "grad_norm": 1.58269202709198, + "learning_rate": 4.917103049316249e-05, + "loss": 0.3754, + "step": 2466 + }, + { + "epoch": 0.13545554335894622, + "grad_norm": 1.8546475172042847, + "learning_rate": 4.9169692805598145e-05, + "loss": 0.4795, + "step": 2468 + }, + { + "epoch": 0.13556531284302964, + "grad_norm": 1.9630231857299805, + "learning_rate": 4.9168354057832424e-05, + "loss": 0.4445, + "step": 2470 + }, + { + "epoch": 0.13567508232711306, + "grad_norm": 1.4799460172653198, + "learning_rate": 4.9167014249924075e-05, + "loss": 0.4012, + "step": 2472 + }, + { + "epoch": 0.13578485181119648, + "grad_norm": 2.4023022651672363, + "learning_rate": 4.916567338193187e-05, + "loss": 0.2557, + "step": 2474 + }, + { + "epoch": 0.1358946212952799, + "grad_norm": 2.6374783515930176, + "learning_rate": 4.91643314539146e-05, + "loss": 0.3798, + "step": 2476 + }, + { + "epoch": 0.13600439077936333, + "grad_norm": 2.467451333999634, + "learning_rate": 4.916298846593116e-05, + "loss": 0.5127, + "step": 2478 + }, + { + "epoch": 0.13611416026344675, + "grad_norm": 2.974621057510376, + "learning_rate": 4.916164441804044e-05, + "loss": 0.4966, + "step": 2480 + }, + { + "epoch": 0.1362239297475302, + "grad_norm": 5.232806205749512, + "learning_rate": 4.916029931030141e-05, + "loss": 0.5566, + "step": 2482 + }, + { + "epoch": 0.13633369923161362, + "grad_norm": 2.6433253288269043, + "learning_rate": 4.915895314277306e-05, + "loss": 0.5267, + "step": 2484 + }, + { + "epoch": 0.13644346871569704, + "grad_norm": 1.2294399738311768, + "learning_rate": 4.915760591551445e-05, + "loss": 0.3846, + "step": 2486 + }, + { + "epoch": 0.13655323819978046, + "grad_norm": 2.9727673530578613, + "learning_rate": 4.915625762858467e-05, + "loss": 0.5064, + "step": 2488 + }, + { + "epoch": 0.13666300768386389, + "grad_norm": 2.788783311843872, + "learning_rate": 4.915490828204287e-05, + "loss": 0.4318, + "step": 2490 + }, + { + "epoch": 0.1367727771679473, + "grad_norm": 1.5734070539474487, + "learning_rate": 4.915355787594823e-05, + "loss": 0.351, + "step": 2492 + }, + { + "epoch": 0.13688254665203073, + "grad_norm": 1.6543442010879517, + "learning_rate": 4.915220641035999e-05, + "loss": 0.5436, + "step": 2494 + }, + { + "epoch": 0.13699231613611415, + "grad_norm": 3.7969372272491455, + "learning_rate": 4.9150853885337426e-05, + "loss": 0.476, + "step": 2496 + }, + { + "epoch": 0.13710208562019757, + "grad_norm": 1.800389051437378, + "learning_rate": 4.914950030093988e-05, + "loss": 0.4494, + "step": 2498 + }, + { + "epoch": 0.13721185510428102, + "grad_norm": 1.5155529975891113, + "learning_rate": 4.914814565722671e-05, + "loss": 0.3916, + "step": 2500 + }, + { + "epoch": 0.13732162458836444, + "grad_norm": 1.69076406955719, + "learning_rate": 4.914678995425734e-05, + "loss": 0.51, + "step": 2502 + }, + { + "epoch": 0.13743139407244787, + "grad_norm": 1.3067333698272705, + "learning_rate": 4.914543319209126e-05, + "loss": 0.4159, + "step": 2504 + }, + { + "epoch": 0.1375411635565313, + "grad_norm": 1.3804795742034912, + "learning_rate": 4.9144075370787955e-05, + "loss": 0.3829, + "step": 2506 + }, + { + "epoch": 0.1376509330406147, + "grad_norm": 1.3558374643325806, + "learning_rate": 4.9142716490407e-05, + "loss": 0.3049, + "step": 2508 + }, + { + "epoch": 0.13776070252469813, + "grad_norm": 3.561126947402954, + "learning_rate": 4.914135655100801e-05, + "loss": 0.4345, + "step": 2510 + }, + { + "epoch": 0.13787047200878155, + "grad_norm": 2.04278302192688, + "learning_rate": 4.913999555265062e-05, + "loss": 0.4429, + "step": 2512 + }, + { + "epoch": 0.13798024149286497, + "grad_norm": 2.6968486309051514, + "learning_rate": 4.913863349539454e-05, + "loss": 0.4704, + "step": 2514 + }, + { + "epoch": 0.1380900109769484, + "grad_norm": 2.076338052749634, + "learning_rate": 4.913727037929952e-05, + "loss": 0.4868, + "step": 2516 + }, + { + "epoch": 0.13819978046103185, + "grad_norm": 2.2324113845825195, + "learning_rate": 4.913590620442534e-05, + "loss": 0.3418, + "step": 2518 + }, + { + "epoch": 0.13830954994511527, + "grad_norm": 2.206188201904297, + "learning_rate": 4.913454097083185e-05, + "loss": 0.3578, + "step": 2520 + }, + { + "epoch": 0.1384193194291987, + "grad_norm": 2.1441478729248047, + "learning_rate": 4.913317467857894e-05, + "loss": 0.392, + "step": 2522 + }, + { + "epoch": 0.1385290889132821, + "grad_norm": 1.9933929443359375, + "learning_rate": 4.913180732772652e-05, + "loss": 0.3991, + "step": 2524 + }, + { + "epoch": 0.13863885839736553, + "grad_norm": 1.82801353931427, + "learning_rate": 4.9130438918334606e-05, + "loss": 0.4711, + "step": 2526 + }, + { + "epoch": 0.13874862788144895, + "grad_norm": 1.7010186910629272, + "learning_rate": 4.9129069450463186e-05, + "loss": 0.4733, + "step": 2528 + }, + { + "epoch": 0.13885839736553238, + "grad_norm": 2.6615631580352783, + "learning_rate": 4.912769892417236e-05, + "loss": 0.3829, + "step": 2530 + }, + { + "epoch": 0.1389681668496158, + "grad_norm": 1.4312536716461182, + "learning_rate": 4.9126327339522225e-05, + "loss": 0.406, + "step": 2532 + }, + { + "epoch": 0.13907793633369922, + "grad_norm": 3.4619576930999756, + "learning_rate": 4.9124954696572956e-05, + "loss": 0.4696, + "step": 2534 + }, + { + "epoch": 0.13918770581778267, + "grad_norm": 2.949315309524536, + "learning_rate": 4.912358099538476e-05, + "loss": 0.532, + "step": 2536 + }, + { + "epoch": 0.1392974753018661, + "grad_norm": 2.844975709915161, + "learning_rate": 4.9122206236017896e-05, + "loss": 0.3807, + "step": 2538 + }, + { + "epoch": 0.1394072447859495, + "grad_norm": 3.053830862045288, + "learning_rate": 4.912083041853267e-05, + "loss": 0.5117, + "step": 2540 + }, + { + "epoch": 0.13951701427003294, + "grad_norm": 1.4631602764129639, + "learning_rate": 4.9119453542989435e-05, + "loss": 0.4528, + "step": 2542 + }, + { + "epoch": 0.13962678375411636, + "grad_norm": 2.7190327644348145, + "learning_rate": 4.911807560944858e-05, + "loss": 0.6115, + "step": 2544 + }, + { + "epoch": 0.13973655323819978, + "grad_norm": 1.7714920043945312, + "learning_rate": 4.911669661797054e-05, + "loss": 0.4171, + "step": 2546 + }, + { + "epoch": 0.1398463227222832, + "grad_norm": 2.575167179107666, + "learning_rate": 4.9115316568615824e-05, + "loss": 0.5525, + "step": 2548 + }, + { + "epoch": 0.13995609220636662, + "grad_norm": 2.4822731018066406, + "learning_rate": 4.9113935461444955e-05, + "loss": 0.4311, + "step": 2550 + }, + { + "epoch": 0.14006586169045004, + "grad_norm": 2.2466392517089844, + "learning_rate": 4.911255329651851e-05, + "loss": 0.4382, + "step": 2552 + }, + { + "epoch": 0.14017563117453347, + "grad_norm": 2.30403733253479, + "learning_rate": 4.911117007389714e-05, + "loss": 0.5403, + "step": 2554 + }, + { + "epoch": 0.14028540065861692, + "grad_norm": 4.2551469802856445, + "learning_rate": 4.910978579364151e-05, + "loss": 0.4559, + "step": 2556 + }, + { + "epoch": 0.14039517014270034, + "grad_norm": 3.05833101272583, + "learning_rate": 4.910840045581233e-05, + "loss": 0.4454, + "step": 2558 + }, + { + "epoch": 0.14050493962678376, + "grad_norm": 2.6600539684295654, + "learning_rate": 4.910701406047037e-05, + "loss": 0.4688, + "step": 2560 + }, + { + "epoch": 0.14061470911086718, + "grad_norm": 1.7311935424804688, + "learning_rate": 4.910562660767645e-05, + "loss": 0.4209, + "step": 2562 + }, + { + "epoch": 0.1407244785949506, + "grad_norm": 3.1481597423553467, + "learning_rate": 4.910423809749143e-05, + "loss": 0.4217, + "step": 2564 + }, + { + "epoch": 0.14083424807903402, + "grad_norm": 1.6687304973602295, + "learning_rate": 4.910284852997622e-05, + "loss": 0.3445, + "step": 2566 + }, + { + "epoch": 0.14094401756311745, + "grad_norm": 1.8450651168823242, + "learning_rate": 4.9101457905191774e-05, + "loss": 0.4414, + "step": 2568 + }, + { + "epoch": 0.14105378704720087, + "grad_norm": 1.7147290706634521, + "learning_rate": 4.910006622319908e-05, + "loss": 0.4783, + "step": 2570 + }, + { + "epoch": 0.1411635565312843, + "grad_norm": 1.9136619567871094, + "learning_rate": 4.9098673484059195e-05, + "loss": 0.3491, + "step": 2572 + }, + { + "epoch": 0.14127332601536774, + "grad_norm": 1.5611414909362793, + "learning_rate": 4.909727968783321e-05, + "loss": 0.3613, + "step": 2574 + }, + { + "epoch": 0.14138309549945116, + "grad_norm": 1.3791896104812622, + "learning_rate": 4.909588483458225e-05, + "loss": 0.4587, + "step": 2576 + }, + { + "epoch": 0.14149286498353458, + "grad_norm": 2.1935243606567383, + "learning_rate": 4.909448892436752e-05, + "loss": 0.5291, + "step": 2578 + }, + { + "epoch": 0.141602634467618, + "grad_norm": 2.3396389484405518, + "learning_rate": 4.909309195725025e-05, + "loss": 0.5258, + "step": 2580 + }, + { + "epoch": 0.14171240395170143, + "grad_norm": 1.6880239248275757, + "learning_rate": 4.9091693933291696e-05, + "loss": 0.5089, + "step": 2582 + }, + { + "epoch": 0.14182217343578485, + "grad_norm": 1.5117390155792236, + "learning_rate": 4.909029485255321e-05, + "loss": 0.396, + "step": 2584 + }, + { + "epoch": 0.14193194291986827, + "grad_norm": 1.4338605403900146, + "learning_rate": 4.908889471509614e-05, + "loss": 0.3854, + "step": 2586 + }, + { + "epoch": 0.1420417124039517, + "grad_norm": 2.3558058738708496, + "learning_rate": 4.908749352098192e-05, + "loss": 0.4474, + "step": 2588 + }, + { + "epoch": 0.1421514818880351, + "grad_norm": 1.8985053300857544, + "learning_rate": 4.9086091270272e-05, + "loss": 0.5315, + "step": 2590 + }, + { + "epoch": 0.14226125137211856, + "grad_norm": 1.9606941938400269, + "learning_rate": 4.9084687963027894e-05, + "loss": 0.3851, + "step": 2592 + }, + { + "epoch": 0.14237102085620199, + "grad_norm": 2.1185760498046875, + "learning_rate": 4.908328359931117e-05, + "loss": 0.4277, + "step": 2594 + }, + { + "epoch": 0.1424807903402854, + "grad_norm": 1.556232213973999, + "learning_rate": 4.908187817918341e-05, + "loss": 0.5009, + "step": 2596 + }, + { + "epoch": 0.14259055982436883, + "grad_norm": 2.3968892097473145, + "learning_rate": 4.908047170270628e-05, + "loss": 0.3637, + "step": 2598 + }, + { + "epoch": 0.14270032930845225, + "grad_norm": 1.4780995845794678, + "learning_rate": 4.907906416994146e-05, + "loss": 0.2788, + "step": 2600 + }, + { + "epoch": 0.14281009879253567, + "grad_norm": 5.520054817199707, + "learning_rate": 4.9077655580950696e-05, + "loss": 0.6116, + "step": 2602 + }, + { + "epoch": 0.1429198682766191, + "grad_norm": 2.2772445678710938, + "learning_rate": 4.9076245935795786e-05, + "loss": 0.3661, + "step": 2604 + }, + { + "epoch": 0.14302963776070252, + "grad_norm": 1.7767205238342285, + "learning_rate": 4.907483523453855e-05, + "loss": 0.4406, + "step": 2606 + }, + { + "epoch": 0.14313940724478594, + "grad_norm": 2.0458595752716064, + "learning_rate": 4.907342347724087e-05, + "loss": 0.4233, + "step": 2608 + }, + { + "epoch": 0.1432491767288694, + "grad_norm": 2.495547294616699, + "learning_rate": 4.907201066396469e-05, + "loss": 0.5157, + "step": 2610 + }, + { + "epoch": 0.1433589462129528, + "grad_norm": 2.5903077125549316, + "learning_rate": 4.907059679477197e-05, + "loss": 0.4052, + "step": 2612 + }, + { + "epoch": 0.14346871569703623, + "grad_norm": 1.899183750152588, + "learning_rate": 4.9069181869724725e-05, + "loss": 0.4184, + "step": 2614 + }, + { + "epoch": 0.14357848518111965, + "grad_norm": 2.5485708713531494, + "learning_rate": 4.906776588888502e-05, + "loss": 0.4268, + "step": 2616 + }, + { + "epoch": 0.14368825466520307, + "grad_norm": 3.4886977672576904, + "learning_rate": 4.906634885231497e-05, + "loss": 0.4085, + "step": 2618 + }, + { + "epoch": 0.1437980241492865, + "grad_norm": 2.53035306930542, + "learning_rate": 4.906493076007674e-05, + "loss": 0.4171, + "step": 2620 + }, + { + "epoch": 0.14390779363336992, + "grad_norm": 2.700394630432129, + "learning_rate": 4.9063511612232526e-05, + "loss": 0.4897, + "step": 2622 + }, + { + "epoch": 0.14401756311745334, + "grad_norm": 1.8324885368347168, + "learning_rate": 4.906209140884459e-05, + "loss": 0.3794, + "step": 2624 + }, + { + "epoch": 0.14412733260153676, + "grad_norm": 2.3769705295562744, + "learning_rate": 4.9060670149975214e-05, + "loss": 0.4106, + "step": 2626 + }, + { + "epoch": 0.1442371020856202, + "grad_norm": 3.223423480987549, + "learning_rate": 4.905924783568675e-05, + "loss": 0.4355, + "step": 2628 + }, + { + "epoch": 0.14434687156970363, + "grad_norm": 2.951533079147339, + "learning_rate": 4.905782446604159e-05, + "loss": 0.3686, + "step": 2630 + }, + { + "epoch": 0.14445664105378705, + "grad_norm": 1.8039194345474243, + "learning_rate": 4.905640004110216e-05, + "loss": 0.3905, + "step": 2632 + }, + { + "epoch": 0.14456641053787048, + "grad_norm": 2.1065821647644043, + "learning_rate": 4.9054974560930946e-05, + "loss": 0.292, + "step": 2634 + }, + { + "epoch": 0.1446761800219539, + "grad_norm": 2.281355142593384, + "learning_rate": 4.905354802559049e-05, + "loss": 0.5093, + "step": 2636 + }, + { + "epoch": 0.14478594950603732, + "grad_norm": 2.658552408218384, + "learning_rate": 4.905212043514335e-05, + "loss": 0.4155, + "step": 2638 + }, + { + "epoch": 0.14489571899012074, + "grad_norm": 1.3893420696258545, + "learning_rate": 4.905069178965215e-05, + "loss": 0.3414, + "step": 2640 + }, + { + "epoch": 0.14500548847420416, + "grad_norm": 1.8907029628753662, + "learning_rate": 4.904926208917956e-05, + "loss": 0.4032, + "step": 2642 + }, + { + "epoch": 0.14511525795828759, + "grad_norm": 2.3178043365478516, + "learning_rate": 4.9047831333788295e-05, + "loss": 0.3492, + "step": 2644 + }, + { + "epoch": 0.145225027442371, + "grad_norm": 2.517411708831787, + "learning_rate": 4.904639952354112e-05, + "loss": 0.3745, + "step": 2646 + }, + { + "epoch": 0.14533479692645446, + "grad_norm": 1.5086538791656494, + "learning_rate": 4.904496665850084e-05, + "loss": 0.3874, + "step": 2648 + }, + { + "epoch": 0.14544456641053788, + "grad_norm": 1.4577511548995972, + "learning_rate": 4.9043532738730284e-05, + "loss": 0.3517, + "step": 2650 + }, + { + "epoch": 0.1455543358946213, + "grad_norm": 1.8942824602127075, + "learning_rate": 4.9042097764292385e-05, + "loss": 0.4675, + "step": 2652 + }, + { + "epoch": 0.14566410537870472, + "grad_norm": 3.3922879695892334, + "learning_rate": 4.904066173525006e-05, + "loss": 0.426, + "step": 2654 + }, + { + "epoch": 0.14577387486278814, + "grad_norm": 2.105289936065674, + "learning_rate": 4.9039224651666325e-05, + "loss": 0.4708, + "step": 2656 + }, + { + "epoch": 0.14588364434687157, + "grad_norm": 1.7922981977462769, + "learning_rate": 4.90377865136042e-05, + "loss": 0.3752, + "step": 2658 + }, + { + "epoch": 0.145993413830955, + "grad_norm": 1.5454444885253906, + "learning_rate": 4.903634732112678e-05, + "loss": 0.4424, + "step": 2660 + }, + { + "epoch": 0.1461031833150384, + "grad_norm": 3.394895076751709, + "learning_rate": 4.9034907074297176e-05, + "loss": 0.5356, + "step": 2662 + }, + { + "epoch": 0.14621295279912183, + "grad_norm": 3.2603580951690674, + "learning_rate": 4.903346577317859e-05, + "loss": 0.4136, + "step": 2664 + }, + { + "epoch": 0.14632272228320528, + "grad_norm": 2.17467999458313, + "learning_rate": 4.903202341783422e-05, + "loss": 0.526, + "step": 2666 + }, + { + "epoch": 0.1464324917672887, + "grad_norm": 3.6285147666931152, + "learning_rate": 4.9030580008327353e-05, + "loss": 0.4028, + "step": 2668 + }, + { + "epoch": 0.14654226125137212, + "grad_norm": 6.424526691436768, + "learning_rate": 4.90291355447213e-05, + "loss": 0.3882, + "step": 2670 + }, + { + "epoch": 0.14665203073545555, + "grad_norm": 2.8423922061920166, + "learning_rate": 4.902769002707942e-05, + "loss": 0.5056, + "step": 2672 + }, + { + "epoch": 0.14676180021953897, + "grad_norm": 2.34586763381958, + "learning_rate": 4.902624345546511e-05, + "loss": 0.4318, + "step": 2674 + }, + { + "epoch": 0.1468715697036224, + "grad_norm": 3.190441131591797, + "learning_rate": 4.902479582994185e-05, + "loss": 0.4239, + "step": 2676 + }, + { + "epoch": 0.1469813391877058, + "grad_norm": 2.806947708129883, + "learning_rate": 4.902334715057312e-05, + "loss": 0.5379, + "step": 2678 + }, + { + "epoch": 0.14709110867178923, + "grad_norm": 1.8669596910476685, + "learning_rate": 4.902189741742247e-05, + "loss": 0.329, + "step": 2680 + }, + { + "epoch": 0.14720087815587266, + "grad_norm": 2.1170382499694824, + "learning_rate": 4.9020446630553486e-05, + "loss": 0.4791, + "step": 2682 + }, + { + "epoch": 0.1473106476399561, + "grad_norm": 1.3349443674087524, + "learning_rate": 4.901899479002982e-05, + "loss": 0.3021, + "step": 2684 + }, + { + "epoch": 0.14742041712403953, + "grad_norm": 1.9473098516464233, + "learning_rate": 4.9017541895915146e-05, + "loss": 0.4941, + "step": 2686 + }, + { + "epoch": 0.14753018660812295, + "grad_norm": 1.616746425628662, + "learning_rate": 4.90160879482732e-05, + "loss": 0.332, + "step": 2688 + }, + { + "epoch": 0.14763995609220637, + "grad_norm": 2.0163321495056152, + "learning_rate": 4.9014632947167763e-05, + "loss": 0.4126, + "step": 2690 + }, + { + "epoch": 0.1477497255762898, + "grad_norm": 2.105588674545288, + "learning_rate": 4.9013176892662654e-05, + "loss": 0.3578, + "step": 2692 + }, + { + "epoch": 0.14785949506037321, + "grad_norm": 2.205087423324585, + "learning_rate": 4.901171978482174e-05, + "loss": 0.3366, + "step": 2694 + }, + { + "epoch": 0.14796926454445664, + "grad_norm": 3.1218252182006836, + "learning_rate": 4.9010261623708944e-05, + "loss": 0.3996, + "step": 2696 + }, + { + "epoch": 0.14807903402854006, + "grad_norm": 1.8371303081512451, + "learning_rate": 4.9008802409388225e-05, + "loss": 0.381, + "step": 2698 + }, + { + "epoch": 0.14818880351262348, + "grad_norm": 1.8454713821411133, + "learning_rate": 4.900734214192358e-05, + "loss": 0.4432, + "step": 2700 + }, + { + "epoch": 0.14829857299670693, + "grad_norm": 3.324636697769165, + "learning_rate": 4.900588082137908e-05, + "loss": 0.5445, + "step": 2702 + }, + { + "epoch": 0.14840834248079035, + "grad_norm": 1.8456639051437378, + "learning_rate": 4.9004418447818815e-05, + "loss": 0.494, + "step": 2704 + }, + { + "epoch": 0.14851811196487377, + "grad_norm": 1.2616275548934937, + "learning_rate": 4.900295502130694e-05, + "loss": 0.3499, + "step": 2706 + }, + { + "epoch": 0.1486278814489572, + "grad_norm": 2.34356427192688, + "learning_rate": 4.9001490541907645e-05, + "loss": 0.518, + "step": 2708 + }, + { + "epoch": 0.14873765093304062, + "grad_norm": 2.6533031463623047, + "learning_rate": 4.900002500968517e-05, + "loss": 0.5074, + "step": 2710 + }, + { + "epoch": 0.14884742041712404, + "grad_norm": 2.572266101837158, + "learning_rate": 4.89985584247038e-05, + "loss": 0.3487, + "step": 2712 + }, + { + "epoch": 0.14895718990120746, + "grad_norm": 1.0480130910873413, + "learning_rate": 4.899709078702786e-05, + "loss": 0.4495, + "step": 2714 + }, + { + "epoch": 0.14906695938529088, + "grad_norm": 2.508660078048706, + "learning_rate": 4.899562209672174e-05, + "loss": 0.4898, + "step": 2716 + }, + { + "epoch": 0.1491767288693743, + "grad_norm": 1.4909086227416992, + "learning_rate": 4.899415235384985e-05, + "loss": 0.4685, + "step": 2718 + }, + { + "epoch": 0.14928649835345773, + "grad_norm": 2.622509479522705, + "learning_rate": 4.899268155847667e-05, + "loss": 0.4742, + "step": 2720 + }, + { + "epoch": 0.14939626783754117, + "grad_norm": 2.211092233657837, + "learning_rate": 4.899120971066671e-05, + "loss": 0.431, + "step": 2722 + }, + { + "epoch": 0.1495060373216246, + "grad_norm": 1.6192232370376587, + "learning_rate": 4.898973681048454e-05, + "loss": 0.3512, + "step": 2724 + }, + { + "epoch": 0.14961580680570802, + "grad_norm": 2.7053017616271973, + "learning_rate": 4.898826285799477e-05, + "loss": 0.387, + "step": 2726 + }, + { + "epoch": 0.14972557628979144, + "grad_norm": 1.8292922973632812, + "learning_rate": 4.898678785326205e-05, + "loss": 0.3146, + "step": 2728 + }, + { + "epoch": 0.14983534577387486, + "grad_norm": 2.891996145248413, + "learning_rate": 4.898531179635107e-05, + "loss": 0.373, + "step": 2730 + }, + { + "epoch": 0.14994511525795828, + "grad_norm": 2.2545270919799805, + "learning_rate": 4.8983834687326596e-05, + "loss": 0.5097, + "step": 2732 + }, + { + "epoch": 0.1500548847420417, + "grad_norm": 1.2040090560913086, + "learning_rate": 4.898235652625341e-05, + "loss": 0.442, + "step": 2734 + }, + { + "epoch": 0.15016465422612513, + "grad_norm": 1.8937634229660034, + "learning_rate": 4.898087731319636e-05, + "loss": 0.3635, + "step": 2736 + }, + { + "epoch": 0.15027442371020855, + "grad_norm": 1.9633947610855103, + "learning_rate": 4.8979397048220324e-05, + "loss": 0.5449, + "step": 2738 + }, + { + "epoch": 0.150384193194292, + "grad_norm": 2.092808723449707, + "learning_rate": 4.897791573139023e-05, + "loss": 0.3573, + "step": 2740 + }, + { + "epoch": 0.15049396267837542, + "grad_norm": 1.781663417816162, + "learning_rate": 4.897643336277106e-05, + "loss": 0.3702, + "step": 2742 + }, + { + "epoch": 0.15060373216245884, + "grad_norm": 1.5251768827438354, + "learning_rate": 4.897494994242785e-05, + "loss": 0.3363, + "step": 2744 + }, + { + "epoch": 0.15071350164654226, + "grad_norm": 1.3512837886810303, + "learning_rate": 4.897346547042565e-05, + "loss": 0.4843, + "step": 2746 + }, + { + "epoch": 0.15082327113062569, + "grad_norm": 2.5647542476654053, + "learning_rate": 4.897197994682959e-05, + "loss": 0.4463, + "step": 2748 + }, + { + "epoch": 0.1509330406147091, + "grad_norm": 2.6886281967163086, + "learning_rate": 4.8970493371704826e-05, + "loss": 0.5718, + "step": 2750 + }, + { + "epoch": 0.15104281009879253, + "grad_norm": 2.6101534366607666, + "learning_rate": 4.896900574511657e-05, + "loss": 0.2414, + "step": 2752 + }, + { + "epoch": 0.15115257958287595, + "grad_norm": 3.539294719696045, + "learning_rate": 4.896751706713007e-05, + "loss": 0.557, + "step": 2754 + }, + { + "epoch": 0.15126234906695937, + "grad_norm": 1.7053173780441284, + "learning_rate": 4.896602733781065e-05, + "loss": 0.2756, + "step": 2756 + }, + { + "epoch": 0.15137211855104282, + "grad_norm": 1.9812370538711548, + "learning_rate": 4.896453655722362e-05, + "loss": 0.3992, + "step": 2758 + }, + { + "epoch": 0.15148188803512624, + "grad_norm": 3.0186731815338135, + "learning_rate": 4.89630447254344e-05, + "loss": 0.4945, + "step": 2760 + }, + { + "epoch": 0.15159165751920967, + "grad_norm": 2.3716189861297607, + "learning_rate": 4.896155184250842e-05, + "loss": 0.5826, + "step": 2762 + }, + { + "epoch": 0.1517014270032931, + "grad_norm": 2.001894950866699, + "learning_rate": 4.896005790851116e-05, + "loss": 0.5547, + "step": 2764 + }, + { + "epoch": 0.1518111964873765, + "grad_norm": 2.0245375633239746, + "learning_rate": 4.895856292350817e-05, + "loss": 0.439, + "step": 2766 + }, + { + "epoch": 0.15192096597145993, + "grad_norm": 2.1681723594665527, + "learning_rate": 4.8957066887565e-05, + "loss": 0.3925, + "step": 2768 + }, + { + "epoch": 0.15203073545554335, + "grad_norm": 3.5595481395721436, + "learning_rate": 4.895556980074729e-05, + "loss": 0.4316, + "step": 2770 + }, + { + "epoch": 0.15214050493962678, + "grad_norm": 1.9780160188674927, + "learning_rate": 4.8954071663120715e-05, + "loss": 0.443, + "step": 2772 + }, + { + "epoch": 0.1522502744237102, + "grad_norm": 1.8084383010864258, + "learning_rate": 4.895257247475098e-05, + "loss": 0.3417, + "step": 2774 + }, + { + "epoch": 0.15236004390779365, + "grad_norm": 1.5438579320907593, + "learning_rate": 4.8951072235703855e-05, + "loss": 0.3596, + "step": 2776 + }, + { + "epoch": 0.15246981339187707, + "grad_norm": 4.846376895904541, + "learning_rate": 4.8949570946045143e-05, + "loss": 0.4599, + "step": 2778 + }, + { + "epoch": 0.1525795828759605, + "grad_norm": 2.0844712257385254, + "learning_rate": 4.8948068605840694e-05, + "loss": 0.3718, + "step": 2780 + }, + { + "epoch": 0.1526893523600439, + "grad_norm": 1.5936390161514282, + "learning_rate": 4.8946565215156405e-05, + "loss": 0.4256, + "step": 2782 + }, + { + "epoch": 0.15279912184412733, + "grad_norm": 3.14936900138855, + "learning_rate": 4.894506077405824e-05, + "loss": 0.3095, + "step": 2784 + }, + { + "epoch": 0.15290889132821076, + "grad_norm": 1.7039520740509033, + "learning_rate": 4.894355528261218e-05, + "loss": 0.4548, + "step": 2786 + }, + { + "epoch": 0.15301866081229418, + "grad_norm": 2.505938768386841, + "learning_rate": 4.894204874088425e-05, + "loss": 0.4232, + "step": 2788 + }, + { + "epoch": 0.1531284302963776, + "grad_norm": 2.4030158519744873, + "learning_rate": 4.8940541148940555e-05, + "loss": 0.3886, + "step": 2790 + }, + { + "epoch": 0.15323819978046102, + "grad_norm": 6.354808330535889, + "learning_rate": 4.8939032506847224e-05, + "loss": 0.4927, + "step": 2792 + }, + { + "epoch": 0.15334796926454447, + "grad_norm": 2.0001614093780518, + "learning_rate": 4.8937522814670424e-05, + "loss": 0.4086, + "step": 2794 + }, + { + "epoch": 0.1534577387486279, + "grad_norm": 3.880098819732666, + "learning_rate": 4.893601207247638e-05, + "loss": 0.4835, + "step": 2796 + }, + { + "epoch": 0.15356750823271131, + "grad_norm": 1.7750613689422607, + "learning_rate": 4.8934500280331365e-05, + "loss": 0.4454, + "step": 2798 + }, + { + "epoch": 0.15367727771679474, + "grad_norm": 2.009774923324585, + "learning_rate": 4.893298743830168e-05, + "loss": 0.3668, + "step": 2800 + }, + { + "epoch": 0.15378704720087816, + "grad_norm": 2.9865355491638184, + "learning_rate": 4.893147354645371e-05, + "loss": 0.4022, + "step": 2802 + }, + { + "epoch": 0.15389681668496158, + "grad_norm": 2.2190535068511963, + "learning_rate": 4.892995860485384e-05, + "loss": 0.5967, + "step": 2804 + }, + { + "epoch": 0.154006586169045, + "grad_norm": 1.7039543390274048, + "learning_rate": 4.8928442613568535e-05, + "loss": 0.3773, + "step": 2806 + }, + { + "epoch": 0.15411635565312842, + "grad_norm": 5.573587894439697, + "learning_rate": 4.892692557266429e-05, + "loss": 0.5269, + "step": 2808 + }, + { + "epoch": 0.15422612513721184, + "grad_norm": 2.753667116165161, + "learning_rate": 4.8925407482207634e-05, + "loss": 0.463, + "step": 2810 + }, + { + "epoch": 0.15433589462129527, + "grad_norm": 2.417353868484497, + "learning_rate": 4.892388834226519e-05, + "loss": 0.6106, + "step": 2812 + }, + { + "epoch": 0.15444566410537872, + "grad_norm": 1.9860767126083374, + "learning_rate": 4.8922368152903565e-05, + "loss": 0.4816, + "step": 2814 + }, + { + "epoch": 0.15455543358946214, + "grad_norm": 1.9473891258239746, + "learning_rate": 4.892084691418947e-05, + "loss": 0.3689, + "step": 2816 + }, + { + "epoch": 0.15466520307354556, + "grad_norm": 3.551497459411621, + "learning_rate": 4.891932462618961e-05, + "loss": 0.4426, + "step": 2818 + }, + { + "epoch": 0.15477497255762898, + "grad_norm": 1.096441626548767, + "learning_rate": 4.891780128897077e-05, + "loss": 0.334, + "step": 2820 + }, + { + "epoch": 0.1548847420417124, + "grad_norm": 1.5722484588623047, + "learning_rate": 4.8916276902599764e-05, + "loss": 0.372, + "step": 2822 + }, + { + "epoch": 0.15499451152579583, + "grad_norm": 4.200645446777344, + "learning_rate": 4.891475146714347e-05, + "loss": 0.3814, + "step": 2824 + }, + { + "epoch": 0.15510428100987925, + "grad_norm": 1.758373498916626, + "learning_rate": 4.89132249826688e-05, + "loss": 0.3948, + "step": 2826 + }, + { + "epoch": 0.15521405049396267, + "grad_norm": 2.4200003147125244, + "learning_rate": 4.891169744924271e-05, + "loss": 0.3381, + "step": 2828 + }, + { + "epoch": 0.1553238199780461, + "grad_norm": 2.1771607398986816, + "learning_rate": 4.8910168866932195e-05, + "loss": 0.3724, + "step": 2830 + }, + { + "epoch": 0.15543358946212954, + "grad_norm": 1.9267643690109253, + "learning_rate": 4.8908639235804324e-05, + "loss": 0.4162, + "step": 2832 + }, + { + "epoch": 0.15554335894621296, + "grad_norm": 1.3779077529907227, + "learning_rate": 4.890710855592618e-05, + "loss": 0.4232, + "step": 2834 + }, + { + "epoch": 0.15565312843029638, + "grad_norm": 2.267185688018799, + "learning_rate": 4.890557682736491e-05, + "loss": 0.4314, + "step": 2836 + }, + { + "epoch": 0.1557628979143798, + "grad_norm": 2.543492078781128, + "learning_rate": 4.890404405018772e-05, + "loss": 0.4444, + "step": 2838 + }, + { + "epoch": 0.15587266739846323, + "grad_norm": 2.6621363162994385, + "learning_rate": 4.890251022446181e-05, + "loss": 0.4572, + "step": 2840 + }, + { + "epoch": 0.15598243688254665, + "grad_norm": 3.1390912532806396, + "learning_rate": 4.890097535025449e-05, + "loss": 0.4233, + "step": 2842 + }, + { + "epoch": 0.15609220636663007, + "grad_norm": 3.38101863861084, + "learning_rate": 4.8899439427633076e-05, + "loss": 0.3891, + "step": 2844 + }, + { + "epoch": 0.1562019758507135, + "grad_norm": 2.5746655464172363, + "learning_rate": 4.889790245666495e-05, + "loss": 0.6657, + "step": 2846 + }, + { + "epoch": 0.15631174533479691, + "grad_norm": 1.7942345142364502, + "learning_rate": 4.889636443741752e-05, + "loss": 0.354, + "step": 2848 + }, + { + "epoch": 0.15642151481888036, + "grad_norm": 2.2310404777526855, + "learning_rate": 4.8894825369958255e-05, + "loss": 0.486, + "step": 2850 + }, + { + "epoch": 0.15653128430296379, + "grad_norm": 1.7940536737442017, + "learning_rate": 4.889328525435467e-05, + "loss": 0.3604, + "step": 2852 + }, + { + "epoch": 0.1566410537870472, + "grad_norm": 3.3854525089263916, + "learning_rate": 4.889174409067431e-05, + "loss": 0.4017, + "step": 2854 + }, + { + "epoch": 0.15675082327113063, + "grad_norm": 1.430957555770874, + "learning_rate": 4.8890201878984796e-05, + "loss": 0.4185, + "step": 2856 + }, + { + "epoch": 0.15686059275521405, + "grad_norm": 2.0111987590789795, + "learning_rate": 4.888865861935377e-05, + "loss": 0.507, + "step": 2858 + }, + { + "epoch": 0.15697036223929747, + "grad_norm": 1.4864064455032349, + "learning_rate": 4.8887114311848915e-05, + "loss": 0.2614, + "step": 2860 + }, + { + "epoch": 0.1570801317233809, + "grad_norm": 1.838361382484436, + "learning_rate": 4.888556895653799e-05, + "loss": 0.5939, + "step": 2862 + }, + { + "epoch": 0.15718990120746432, + "grad_norm": 2.4927380084991455, + "learning_rate": 4.888402255348876e-05, + "loss": 0.4696, + "step": 2864 + }, + { + "epoch": 0.15729967069154774, + "grad_norm": 1.130833387374878, + "learning_rate": 4.888247510276908e-05, + "loss": 0.4584, + "step": 2866 + }, + { + "epoch": 0.1574094401756312, + "grad_norm": 2.1088662147521973, + "learning_rate": 4.888092660444682e-05, + "loss": 0.4275, + "step": 2868 + }, + { + "epoch": 0.1575192096597146, + "grad_norm": 1.7862321138381958, + "learning_rate": 4.88793770585899e-05, + "loss": 0.4325, + "step": 2870 + }, + { + "epoch": 0.15762897914379803, + "grad_norm": 1.9409456253051758, + "learning_rate": 4.887782646526631e-05, + "loss": 0.3726, + "step": 2872 + }, + { + "epoch": 0.15773874862788145, + "grad_norm": 1.789082407951355, + "learning_rate": 4.887627482454403e-05, + "loss": 0.4257, + "step": 2874 + }, + { + "epoch": 0.15784851811196488, + "grad_norm": 2.657299518585205, + "learning_rate": 4.8874722136491155e-05, + "loss": 0.4033, + "step": 2876 + }, + { + "epoch": 0.1579582875960483, + "grad_norm": 2.2701995372772217, + "learning_rate": 4.887316840117579e-05, + "loss": 0.5404, + "step": 2878 + }, + { + "epoch": 0.15806805708013172, + "grad_norm": 2.6834213733673096, + "learning_rate": 4.887161361866608e-05, + "loss": 0.3549, + "step": 2880 + }, + { + "epoch": 0.15817782656421514, + "grad_norm": 1.574965000152588, + "learning_rate": 4.887005778903022e-05, + "loss": 0.4718, + "step": 2882 + }, + { + "epoch": 0.15828759604829856, + "grad_norm": 2.3970108032226562, + "learning_rate": 4.8868500912336465e-05, + "loss": 0.3678, + "step": 2884 + }, + { + "epoch": 0.15839736553238198, + "grad_norm": 1.3085768222808838, + "learning_rate": 4.8866942988653115e-05, + "loss": 0.4144, + "step": 2886 + }, + { + "epoch": 0.15850713501646543, + "grad_norm": 4.219538688659668, + "learning_rate": 4.8865384018048494e-05, + "loss": 0.4623, + "step": 2888 + }, + { + "epoch": 0.15861690450054886, + "grad_norm": 1.8667372465133667, + "learning_rate": 4.8863824000590995e-05, + "loss": 0.4024, + "step": 2890 + }, + { + "epoch": 0.15872667398463228, + "grad_norm": 1.5527117252349854, + "learning_rate": 4.886226293634904e-05, + "loss": 0.4115, + "step": 2892 + }, + { + "epoch": 0.1588364434687157, + "grad_norm": 2.490652084350586, + "learning_rate": 4.886070082539112e-05, + "loss": 0.4746, + "step": 2894 + }, + { + "epoch": 0.15894621295279912, + "grad_norm": 1.9998081922531128, + "learning_rate": 4.8859137667785735e-05, + "loss": 0.4138, + "step": 2896 + }, + { + "epoch": 0.15905598243688254, + "grad_norm": 1.6327296495437622, + "learning_rate": 4.8857573463601465e-05, + "loss": 0.4164, + "step": 2898 + }, + { + "epoch": 0.15916575192096596, + "grad_norm": 1.4540300369262695, + "learning_rate": 4.8856008212906925e-05, + "loss": 0.3247, + "step": 2900 + }, + { + "epoch": 0.1592755214050494, + "grad_norm": 2.0166261196136475, + "learning_rate": 4.885444191577078e-05, + "loss": 0.289, + "step": 2902 + }, + { + "epoch": 0.1593852908891328, + "grad_norm": 1.283927083015442, + "learning_rate": 4.885287457226172e-05, + "loss": 0.4375, + "step": 2904 + }, + { + "epoch": 0.15949506037321626, + "grad_norm": 1.5835309028625488, + "learning_rate": 4.88513061824485e-05, + "loss": 0.3873, + "step": 2906 + }, + { + "epoch": 0.15960482985729968, + "grad_norm": 2.1071617603302, + "learning_rate": 4.884973674639993e-05, + "loss": 0.5735, + "step": 2908 + }, + { + "epoch": 0.1597145993413831, + "grad_norm": 2.525240182876587, + "learning_rate": 4.8848166264184844e-05, + "loss": 0.4362, + "step": 2910 + }, + { + "epoch": 0.15982436882546652, + "grad_norm": 1.4718520641326904, + "learning_rate": 4.884659473587213e-05, + "loss": 0.4092, + "step": 2912 + }, + { + "epoch": 0.15993413830954994, + "grad_norm": 2.139051914215088, + "learning_rate": 4.8845022161530726e-05, + "loss": 0.3788, + "step": 2914 + }, + { + "epoch": 0.16004390779363337, + "grad_norm": 2.1244547367095947, + "learning_rate": 4.884344854122961e-05, + "loss": 0.4096, + "step": 2916 + }, + { + "epoch": 0.1601536772777168, + "grad_norm": 1.3620063066482544, + "learning_rate": 4.884187387503781e-05, + "loss": 0.2854, + "step": 2918 + }, + { + "epoch": 0.1602634467618002, + "grad_norm": 2.495741844177246, + "learning_rate": 4.88402981630244e-05, + "loss": 0.5355, + "step": 2920 + }, + { + "epoch": 0.16037321624588363, + "grad_norm": 1.009167194366455, + "learning_rate": 4.8838721405258504e-05, + "loss": 0.3079, + "step": 2922 + }, + { + "epoch": 0.16048298572996708, + "grad_norm": 1.4634437561035156, + "learning_rate": 4.883714360180927e-05, + "loss": 0.4446, + "step": 2924 + }, + { + "epoch": 0.1605927552140505, + "grad_norm": 2.1280524730682373, + "learning_rate": 4.883556475274592e-05, + "loss": 0.5786, + "step": 2926 + }, + { + "epoch": 0.16070252469813393, + "grad_norm": 3.2480928897857666, + "learning_rate": 4.8833984858137715e-05, + "loss": 0.3439, + "step": 2928 + }, + { + "epoch": 0.16081229418221735, + "grad_norm": 1.5592504739761353, + "learning_rate": 4.883240391805394e-05, + "loss": 0.2724, + "step": 2930 + }, + { + "epoch": 0.16092206366630077, + "grad_norm": 1.9153730869293213, + "learning_rate": 4.883082193256397e-05, + "loss": 0.2731, + "step": 2932 + }, + { + "epoch": 0.1610318331503842, + "grad_norm": 2.161949872970581, + "learning_rate": 4.882923890173716e-05, + "loss": 0.3031, + "step": 2934 + }, + { + "epoch": 0.1611416026344676, + "grad_norm": 1.5290963649749756, + "learning_rate": 4.882765482564298e-05, + "loss": 0.3297, + "step": 2936 + }, + { + "epoch": 0.16125137211855103, + "grad_norm": 2.2883799076080322, + "learning_rate": 4.882606970435091e-05, + "loss": 0.4364, + "step": 2938 + }, + { + "epoch": 0.16136114160263446, + "grad_norm": 2.102957248687744, + "learning_rate": 4.882448353793048e-05, + "loss": 0.3898, + "step": 2940 + }, + { + "epoch": 0.1614709110867179, + "grad_norm": 1.5278122425079346, + "learning_rate": 4.8822896326451254e-05, + "loss": 0.3163, + "step": 2942 + }, + { + "epoch": 0.16158068057080133, + "grad_norm": 2.662027597427368, + "learning_rate": 4.8821308069982867e-05, + "loss": 0.3792, + "step": 2944 + }, + { + "epoch": 0.16169045005488475, + "grad_norm": 1.3018760681152344, + "learning_rate": 4.881971876859499e-05, + "loss": 0.3632, + "step": 2946 + }, + { + "epoch": 0.16180021953896817, + "grad_norm": 1.8565620183944702, + "learning_rate": 4.8818128422357335e-05, + "loss": 0.4021, + "step": 2948 + }, + { + "epoch": 0.1619099890230516, + "grad_norm": 2.837235450744629, + "learning_rate": 4.881653703133966e-05, + "loss": 0.4899, + "step": 2950 + }, + { + "epoch": 0.16201975850713501, + "grad_norm": 1.8246214389801025, + "learning_rate": 4.8814944595611776e-05, + "loss": 0.416, + "step": 2952 + }, + { + "epoch": 0.16212952799121844, + "grad_norm": 2.697004556655884, + "learning_rate": 4.8813351115243524e-05, + "loss": 0.4252, + "step": 2954 + }, + { + "epoch": 0.16223929747530186, + "grad_norm": 1.6510181427001953, + "learning_rate": 4.8811756590304815e-05, + "loss": 0.4609, + "step": 2956 + }, + { + "epoch": 0.16234906695938528, + "grad_norm": 2.263984203338623, + "learning_rate": 4.881016102086558e-05, + "loss": 0.4158, + "step": 2958 + }, + { + "epoch": 0.16245883644346873, + "grad_norm": 1.3938826322555542, + "learning_rate": 4.880856440699582e-05, + "loss": 0.5664, + "step": 2960 + }, + { + "epoch": 0.16256860592755215, + "grad_norm": 2.3428242206573486, + "learning_rate": 4.880696674876557e-05, + "loss": 0.392, + "step": 2962 + }, + { + "epoch": 0.16267837541163557, + "grad_norm": 1.7701524496078491, + "learning_rate": 4.880536804624491e-05, + "loss": 0.544, + "step": 2964 + }, + { + "epoch": 0.162788144895719, + "grad_norm": 2.759232521057129, + "learning_rate": 4.8803768299503946e-05, + "loss": 0.4829, + "step": 2966 + }, + { + "epoch": 0.16289791437980242, + "grad_norm": 1.8513339757919312, + "learning_rate": 4.880216750861288e-05, + "loss": 0.4857, + "step": 2968 + }, + { + "epoch": 0.16300768386388584, + "grad_norm": 1.5074750185012817, + "learning_rate": 4.880056567364192e-05, + "loss": 0.4275, + "step": 2970 + }, + { + "epoch": 0.16311745334796926, + "grad_norm": 1.9421806335449219, + "learning_rate": 4.879896279466133e-05, + "loss": 0.3961, + "step": 2972 + }, + { + "epoch": 0.16322722283205268, + "grad_norm": 1.7953379154205322, + "learning_rate": 4.879735887174141e-05, + "loss": 0.3773, + "step": 2974 + }, + { + "epoch": 0.1633369923161361, + "grad_norm": 2.032336711883545, + "learning_rate": 4.8795753904952534e-05, + "loss": 0.4577, + "step": 2976 + }, + { + "epoch": 0.16344676180021953, + "grad_norm": 1.1940827369689941, + "learning_rate": 4.87941478943651e-05, + "loss": 0.3013, + "step": 2978 + }, + { + "epoch": 0.16355653128430298, + "grad_norm": 1.7609816789627075, + "learning_rate": 4.879254084004955e-05, + "loss": 0.5334, + "step": 2980 + }, + { + "epoch": 0.1636663007683864, + "grad_norm": 3.1303744316101074, + "learning_rate": 4.8790932742076365e-05, + "loss": 0.4487, + "step": 2982 + }, + { + "epoch": 0.16377607025246982, + "grad_norm": 1.5713893175125122, + "learning_rate": 4.8789323600516104e-05, + "loss": 0.462, + "step": 2984 + }, + { + "epoch": 0.16388583973655324, + "grad_norm": 1.864857792854309, + "learning_rate": 4.878771341543935e-05, + "loss": 0.3629, + "step": 2986 + }, + { + "epoch": 0.16399560922063666, + "grad_norm": 1.6406718492507935, + "learning_rate": 4.878610218691673e-05, + "loss": 0.4971, + "step": 2988 + }, + { + "epoch": 0.16410537870472008, + "grad_norm": 1.7716403007507324, + "learning_rate": 4.8784489915018905e-05, + "loss": 0.4114, + "step": 2990 + }, + { + "epoch": 0.1642151481888035, + "grad_norm": 1.8556480407714844, + "learning_rate": 4.878287659981662e-05, + "loss": 0.505, + "step": 2992 + }, + { + "epoch": 0.16432491767288693, + "grad_norm": 2.968378782272339, + "learning_rate": 4.8781262241380635e-05, + "loss": 0.4689, + "step": 2994 + }, + { + "epoch": 0.16443468715697035, + "grad_norm": 2.486325263977051, + "learning_rate": 4.8779646839781765e-05, + "loss": 0.472, + "step": 2996 + }, + { + "epoch": 0.1645444566410538, + "grad_norm": 1.7953907251358032, + "learning_rate": 4.877803039509087e-05, + "loss": 0.3632, + "step": 2998 + }, + { + "epoch": 0.16465422612513722, + "grad_norm": 1.4732033014297485, + "learning_rate": 4.877641290737884e-05, + "loss": 0.3527, + "step": 3000 + }, + { + "epoch": 0.16476399560922064, + "grad_norm": 5.348898410797119, + "learning_rate": 4.8774794376716646e-05, + "loss": 0.4575, + "step": 3002 + }, + { + "epoch": 0.16487376509330406, + "grad_norm": 1.279014229774475, + "learning_rate": 4.877317480317528e-05, + "loss": 0.3373, + "step": 3004 + }, + { + "epoch": 0.1649835345773875, + "grad_norm": 2.0928263664245605, + "learning_rate": 4.8771554186825774e-05, + "loss": 0.7855, + "step": 3006 + }, + { + "epoch": 0.1650933040614709, + "grad_norm": 2.0150177478790283, + "learning_rate": 4.8769932527739225e-05, + "loss": 0.4768, + "step": 3008 + }, + { + "epoch": 0.16520307354555433, + "grad_norm": 2.128439426422119, + "learning_rate": 4.876830982598677e-05, + "loss": 0.508, + "step": 3010 + }, + { + "epoch": 0.16531284302963775, + "grad_norm": 2.857706308364868, + "learning_rate": 4.876668608163959e-05, + "loss": 0.4809, + "step": 3012 + }, + { + "epoch": 0.16542261251372117, + "grad_norm": 1.7298556566238403, + "learning_rate": 4.876506129476889e-05, + "loss": 0.4319, + "step": 3014 + }, + { + "epoch": 0.16553238199780462, + "grad_norm": 2.806729316711426, + "learning_rate": 4.8763435465445964e-05, + "loss": 0.5306, + "step": 3016 + }, + { + "epoch": 0.16564215148188804, + "grad_norm": 2.318413257598877, + "learning_rate": 4.876180859374212e-05, + "loss": 0.415, + "step": 3018 + }, + { + "epoch": 0.16575192096597147, + "grad_norm": 2.3591792583465576, + "learning_rate": 4.876018067972872e-05, + "loss": 0.4169, + "step": 3020 + }, + { + "epoch": 0.1658616904500549, + "grad_norm": 1.9438740015029907, + "learning_rate": 4.8758551723477177e-05, + "loss": 0.6821, + "step": 3022 + }, + { + "epoch": 0.1659714599341383, + "grad_norm": 1.1361305713653564, + "learning_rate": 4.8756921725058934e-05, + "loss": 0.4681, + "step": 3024 + }, + { + "epoch": 0.16608122941822173, + "grad_norm": 1.7159416675567627, + "learning_rate": 4.875529068454551e-05, + "loss": 0.3857, + "step": 3026 + }, + { + "epoch": 0.16619099890230515, + "grad_norm": 1.4031691551208496, + "learning_rate": 4.8753658602008425e-05, + "loss": 0.3693, + "step": 3028 + }, + { + "epoch": 0.16630076838638858, + "grad_norm": 2.28609561920166, + "learning_rate": 4.8752025477519295e-05, + "loss": 0.4428, + "step": 3030 + }, + { + "epoch": 0.166410537870472, + "grad_norm": 1.8916451930999756, + "learning_rate": 4.875039131114975e-05, + "loss": 0.4712, + "step": 3032 + }, + { + "epoch": 0.16652030735455545, + "grad_norm": 1.2553579807281494, + "learning_rate": 4.874875610297145e-05, + "loss": 0.2912, + "step": 3034 + }, + { + "epoch": 0.16663007683863887, + "grad_norm": 1.2079182863235474, + "learning_rate": 4.8747119853056156e-05, + "loss": 0.374, + "step": 3036 + }, + { + "epoch": 0.1667398463227223, + "grad_norm": 2.043260097503662, + "learning_rate": 4.874548256147562e-05, + "loss": 0.4261, + "step": 3038 + }, + { + "epoch": 0.1668496158068057, + "grad_norm": 1.6167014837265015, + "learning_rate": 4.874384422830167e-05, + "loss": 0.3647, + "step": 3040 + }, + { + "epoch": 0.16695938529088913, + "grad_norm": 2.378702402114868, + "learning_rate": 4.874220485360618e-05, + "loss": 0.4544, + "step": 3042 + }, + { + "epoch": 0.16706915477497256, + "grad_norm": 3.810681104660034, + "learning_rate": 4.874056443746104e-05, + "loss": 0.3594, + "step": 3044 + }, + { + "epoch": 0.16717892425905598, + "grad_norm": 1.3831862211227417, + "learning_rate": 4.8738922979938226e-05, + "loss": 0.3405, + "step": 3046 + }, + { + "epoch": 0.1672886937431394, + "grad_norm": 2.0596823692321777, + "learning_rate": 4.8737280481109724e-05, + "loss": 0.4198, + "step": 3048 + }, + { + "epoch": 0.16739846322722282, + "grad_norm": 3.7204272747039795, + "learning_rate": 4.87356369410476e-05, + "loss": 0.3487, + "step": 3050 + }, + { + "epoch": 0.16750823271130624, + "grad_norm": 1.9042683839797974, + "learning_rate": 4.8733992359823936e-05, + "loss": 0.5242, + "step": 3052 + }, + { + "epoch": 0.1676180021953897, + "grad_norm": 1.9216721057891846, + "learning_rate": 4.873234673751087e-05, + "loss": 0.3394, + "step": 3054 + }, + { + "epoch": 0.16772777167947311, + "grad_norm": 1.5589194297790527, + "learning_rate": 4.873070007418059e-05, + "loss": 0.3074, + "step": 3056 + }, + { + "epoch": 0.16783754116355654, + "grad_norm": 1.5710655450820923, + "learning_rate": 4.872905236990533e-05, + "loss": 0.3832, + "step": 3058 + }, + { + "epoch": 0.16794731064763996, + "grad_norm": 1.580443263053894, + "learning_rate": 4.8727403624757365e-05, + "loss": 0.3992, + "step": 3060 + }, + { + "epoch": 0.16805708013172338, + "grad_norm": 2.7403082847595215, + "learning_rate": 4.872575383880902e-05, + "loss": 0.3422, + "step": 3062 + }, + { + "epoch": 0.1681668496158068, + "grad_norm": 2.822016716003418, + "learning_rate": 4.872410301213265e-05, + "loss": 0.4108, + "step": 3064 + }, + { + "epoch": 0.16827661909989022, + "grad_norm": 2.12910795211792, + "learning_rate": 4.872245114480069e-05, + "loss": 0.475, + "step": 3066 + }, + { + "epoch": 0.16838638858397365, + "grad_norm": 3.941950559616089, + "learning_rate": 4.872079823688557e-05, + "loss": 0.4861, + "step": 3068 + }, + { + "epoch": 0.16849615806805707, + "grad_norm": 1.900827169418335, + "learning_rate": 4.8719144288459814e-05, + "loss": 0.5452, + "step": 3070 + }, + { + "epoch": 0.16860592755214052, + "grad_norm": 1.7142810821533203, + "learning_rate": 4.871748929959598e-05, + "loss": 0.5603, + "step": 3072 + }, + { + "epoch": 0.16871569703622394, + "grad_norm": 1.7768524885177612, + "learning_rate": 4.8715833270366644e-05, + "loss": 0.4397, + "step": 3074 + }, + { + "epoch": 0.16882546652030736, + "grad_norm": 2.4841678142547607, + "learning_rate": 4.8714176200844464e-05, + "loss": 0.4503, + "step": 3076 + }, + { + "epoch": 0.16893523600439078, + "grad_norm": 1.5232911109924316, + "learning_rate": 4.871251809110211e-05, + "loss": 0.3913, + "step": 3078 + }, + { + "epoch": 0.1690450054884742, + "grad_norm": 1.9659096002578735, + "learning_rate": 4.871085894121233e-05, + "loss": 0.4019, + "step": 3080 + }, + { + "epoch": 0.16915477497255763, + "grad_norm": 1.3290090560913086, + "learning_rate": 4.87091987512479e-05, + "loss": 0.3512, + "step": 3082 + }, + { + "epoch": 0.16926454445664105, + "grad_norm": 1.5036472082138062, + "learning_rate": 4.8707537521281635e-05, + "loss": 0.3542, + "step": 3084 + }, + { + "epoch": 0.16937431394072447, + "grad_norm": 2.0573949813842773, + "learning_rate": 4.870587525138641e-05, + "loss": 0.3583, + "step": 3086 + }, + { + "epoch": 0.1694840834248079, + "grad_norm": 2.6663691997528076, + "learning_rate": 4.870421194163515e-05, + "loss": 0.4463, + "step": 3088 + }, + { + "epoch": 0.16959385290889134, + "grad_norm": 1.775612711906433, + "learning_rate": 4.87025475921008e-05, + "loss": 0.3808, + "step": 3090 + }, + { + "epoch": 0.16970362239297476, + "grad_norm": 1.8235937356948853, + "learning_rate": 4.870088220285638e-05, + "loss": 0.4907, + "step": 3092 + }, + { + "epoch": 0.16981339187705818, + "grad_norm": 1.4459749460220337, + "learning_rate": 4.869921577397493e-05, + "loss": 0.3091, + "step": 3094 + }, + { + "epoch": 0.1699231613611416, + "grad_norm": 2.388127326965332, + "learning_rate": 4.869754830552956e-05, + "loss": 0.473, + "step": 3096 + }, + { + "epoch": 0.17003293084522503, + "grad_norm": 1.7241696119308472, + "learning_rate": 4.8695879797593394e-05, + "loss": 0.4293, + "step": 3098 + }, + { + "epoch": 0.17014270032930845, + "grad_norm": 2.8194515705108643, + "learning_rate": 4.869421025023965e-05, + "loss": 0.3953, + "step": 3100 + }, + { + "epoch": 0.17025246981339187, + "grad_norm": 1.5520384311676025, + "learning_rate": 4.869253966354154e-05, + "loss": 0.4062, + "step": 3102 + }, + { + "epoch": 0.1703622392974753, + "grad_norm": 5.553796291351318, + "learning_rate": 4.8690868037572346e-05, + "loss": 0.5189, + "step": 3104 + }, + { + "epoch": 0.17047200878155871, + "grad_norm": 1.7908719778060913, + "learning_rate": 4.8689195372405395e-05, + "loss": 0.3855, + "step": 3106 + }, + { + "epoch": 0.17058177826564216, + "grad_norm": 2.8682539463043213, + "learning_rate": 4.8687521668114064e-05, + "loss": 0.5787, + "step": 3108 + }, + { + "epoch": 0.1706915477497256, + "grad_norm": 2.455676555633545, + "learning_rate": 4.8685846924771774e-05, + "loss": 0.4004, + "step": 3110 + }, + { + "epoch": 0.170801317233809, + "grad_norm": 2.9044899940490723, + "learning_rate": 4.8684171142451986e-05, + "loss": 0.4419, + "step": 3112 + }, + { + "epoch": 0.17091108671789243, + "grad_norm": 2.2840428352355957, + "learning_rate": 4.868249432122819e-05, + "loss": 0.4173, + "step": 3114 + }, + { + "epoch": 0.17102085620197585, + "grad_norm": 3.066030263900757, + "learning_rate": 4.868081646117395e-05, + "loss": 0.5523, + "step": 3116 + }, + { + "epoch": 0.17113062568605927, + "grad_norm": 1.6596968173980713, + "learning_rate": 4.867913756236288e-05, + "loss": 0.4113, + "step": 3118 + }, + { + "epoch": 0.1712403951701427, + "grad_norm": 2.183272123336792, + "learning_rate": 4.867745762486861e-05, + "loss": 0.3987, + "step": 3120 + }, + { + "epoch": 0.17135016465422612, + "grad_norm": 1.8821039199829102, + "learning_rate": 4.867577664876483e-05, + "loss": 0.3376, + "step": 3122 + }, + { + "epoch": 0.17145993413830954, + "grad_norm": 2.511225938796997, + "learning_rate": 4.867409463412528e-05, + "loss": 0.4722, + "step": 3124 + }, + { + "epoch": 0.171569703622393, + "grad_norm": 2.3839142322540283, + "learning_rate": 4.8672411581023736e-05, + "loss": 0.4301, + "step": 3126 + }, + { + "epoch": 0.1716794731064764, + "grad_norm": 1.376909613609314, + "learning_rate": 4.8670727489534034e-05, + "loss": 0.3194, + "step": 3128 + }, + { + "epoch": 0.17178924259055983, + "grad_norm": 2.1477129459381104, + "learning_rate": 4.866904235973005e-05, + "loss": 0.3727, + "step": 3130 + }, + { + "epoch": 0.17189901207464325, + "grad_norm": 2.7149672508239746, + "learning_rate": 4.866735619168568e-05, + "loss": 0.4296, + "step": 3132 + }, + { + "epoch": 0.17200878155872668, + "grad_norm": 2.736356735229492, + "learning_rate": 4.86656689854749e-05, + "loss": 0.5613, + "step": 3134 + }, + { + "epoch": 0.1721185510428101, + "grad_norm": 2.7425010204315186, + "learning_rate": 4.8663980741171724e-05, + "loss": 0.3656, + "step": 3136 + }, + { + "epoch": 0.17222832052689352, + "grad_norm": 1.8875664472579956, + "learning_rate": 4.86622914588502e-05, + "loss": 0.5047, + "step": 3138 + }, + { + "epoch": 0.17233809001097694, + "grad_norm": 1.5633827447891235, + "learning_rate": 4.866060113858444e-05, + "loss": 0.3995, + "step": 3140 + }, + { + "epoch": 0.17244785949506036, + "grad_norm": 2.942631244659424, + "learning_rate": 4.865890978044857e-05, + "loss": 0.5099, + "step": 3142 + }, + { + "epoch": 0.17255762897914378, + "grad_norm": 1.8425629138946533, + "learning_rate": 4.86572173845168e-05, + "loss": 0.4036, + "step": 3144 + }, + { + "epoch": 0.17266739846322723, + "grad_norm": 3.2554540634155273, + "learning_rate": 4.8655523950863355e-05, + "loss": 0.5395, + "step": 3146 + }, + { + "epoch": 0.17277716794731066, + "grad_norm": 1.471200704574585, + "learning_rate": 4.865382947956253e-05, + "loss": 0.3106, + "step": 3148 + }, + { + "epoch": 0.17288693743139408, + "grad_norm": 2.5688445568084717, + "learning_rate": 4.8652133970688636e-05, + "loss": 0.4428, + "step": 3150 + }, + { + "epoch": 0.1729967069154775, + "grad_norm": 1.9122648239135742, + "learning_rate": 4.865043742431605e-05, + "loss": 0.4568, + "step": 3152 + }, + { + "epoch": 0.17310647639956092, + "grad_norm": 2.2206814289093018, + "learning_rate": 4.864873984051921e-05, + "loss": 0.4961, + "step": 3154 + }, + { + "epoch": 0.17321624588364434, + "grad_norm": 1.289576768875122, + "learning_rate": 4.864704121937256e-05, + "loss": 0.272, + "step": 3156 + }, + { + "epoch": 0.17332601536772776, + "grad_norm": 1.766336441040039, + "learning_rate": 4.864534156095061e-05, + "loss": 0.439, + "step": 3158 + }, + { + "epoch": 0.1734357848518112, + "grad_norm": 1.3995245695114136, + "learning_rate": 4.864364086532792e-05, + "loss": 0.4375, + "step": 3160 + }, + { + "epoch": 0.1735455543358946, + "grad_norm": 2.617487668991089, + "learning_rate": 4.8641939132579094e-05, + "loss": 0.3848, + "step": 3162 + }, + { + "epoch": 0.17365532381997806, + "grad_norm": 2.639082193374634, + "learning_rate": 4.864023636277878e-05, + "loss": 0.4299, + "step": 3164 + }, + { + "epoch": 0.17376509330406148, + "grad_norm": 2.328237295150757, + "learning_rate": 4.863853255600167e-05, + "loss": 0.5509, + "step": 3166 + }, + { + "epoch": 0.1738748627881449, + "grad_norm": 2.1113409996032715, + "learning_rate": 4.863682771232248e-05, + "loss": 0.4219, + "step": 3168 + }, + { + "epoch": 0.17398463227222832, + "grad_norm": 1.4429086446762085, + "learning_rate": 4.863512183181603e-05, + "loss": 0.3113, + "step": 3170 + }, + { + "epoch": 0.17409440175631175, + "grad_norm": 2.0814476013183594, + "learning_rate": 4.863341491455712e-05, + "loss": 0.4744, + "step": 3172 + }, + { + "epoch": 0.17420417124039517, + "grad_norm": 1.6756304502487183, + "learning_rate": 4.8631706960620635e-05, + "loss": 0.376, + "step": 3174 + }, + { + "epoch": 0.1743139407244786, + "grad_norm": 1.3122304677963257, + "learning_rate": 4.862999797008149e-05, + "loss": 0.36, + "step": 3176 + }, + { + "epoch": 0.174423710208562, + "grad_norm": 1.5043530464172363, + "learning_rate": 4.862828794301465e-05, + "loss": 0.3674, + "step": 3178 + }, + { + "epoch": 0.17453347969264543, + "grad_norm": 3.6034953594207764, + "learning_rate": 4.862657687949512e-05, + "loss": 0.5424, + "step": 3180 + }, + { + "epoch": 0.17464324917672888, + "grad_norm": 3.4951000213623047, + "learning_rate": 4.8624864779597975e-05, + "loss": 0.5421, + "step": 3182 + }, + { + "epoch": 0.1747530186608123, + "grad_norm": 2.0396502017974854, + "learning_rate": 4.862315164339829e-05, + "loss": 0.5009, + "step": 3184 + }, + { + "epoch": 0.17486278814489573, + "grad_norm": 2.276747703552246, + "learning_rate": 4.862143747097123e-05, + "loss": 0.4837, + "step": 3186 + }, + { + "epoch": 0.17497255762897915, + "grad_norm": 1.771214246749878, + "learning_rate": 4.861972226239199e-05, + "loss": 0.3752, + "step": 3188 + }, + { + "epoch": 0.17508232711306257, + "grad_norm": 2.080963611602783, + "learning_rate": 4.861800601773579e-05, + "loss": 0.3961, + "step": 3190 + }, + { + "epoch": 0.175192096597146, + "grad_norm": 2.394515037536621, + "learning_rate": 4.861628873707792e-05, + "loss": 0.5628, + "step": 3192 + }, + { + "epoch": 0.1753018660812294, + "grad_norm": 1.6709855794906616, + "learning_rate": 4.861457042049372e-05, + "loss": 0.346, + "step": 3194 + }, + { + "epoch": 0.17541163556531283, + "grad_norm": 2.2394886016845703, + "learning_rate": 4.8612851068058544e-05, + "loss": 0.4071, + "step": 3196 + }, + { + "epoch": 0.17552140504939626, + "grad_norm": 2.6217775344848633, + "learning_rate": 4.861113067984783e-05, + "loss": 0.5139, + "step": 3198 + }, + { + "epoch": 0.1756311745334797, + "grad_norm": 1.6589971780776978, + "learning_rate": 4.860940925593703e-05, + "loss": 0.3621, + "step": 3200 + }, + { + "epoch": 0.17574094401756313, + "grad_norm": 2.2259371280670166, + "learning_rate": 4.8607686796401655e-05, + "loss": 0.5465, + "step": 3202 + }, + { + "epoch": 0.17585071350164655, + "grad_norm": 2.1536202430725098, + "learning_rate": 4.860596330131727e-05, + "loss": 0.5346, + "step": 3204 + }, + { + "epoch": 0.17596048298572997, + "grad_norm": 1.8108298778533936, + "learning_rate": 4.860423877075947e-05, + "loss": 0.3919, + "step": 3206 + }, + { + "epoch": 0.1760702524698134, + "grad_norm": 2.2997491359710693, + "learning_rate": 4.8602513204803896e-05, + "loss": 0.4526, + "step": 3208 + }, + { + "epoch": 0.17618002195389681, + "grad_norm": 2.0965144634246826, + "learning_rate": 4.860078660352625e-05, + "loss": 0.3125, + "step": 3210 + }, + { + "epoch": 0.17628979143798024, + "grad_norm": 1.6151671409606934, + "learning_rate": 4.8599058967002254e-05, + "loss": 0.3882, + "step": 3212 + }, + { + "epoch": 0.17639956092206366, + "grad_norm": 1.7152764797210693, + "learning_rate": 4.859733029530771e-05, + "loss": 0.4169, + "step": 3214 + }, + { + "epoch": 0.17650933040614708, + "grad_norm": 2.443171501159668, + "learning_rate": 4.859560058851844e-05, + "loss": 0.6281, + "step": 3216 + }, + { + "epoch": 0.1766190998902305, + "grad_norm": 2.1533122062683105, + "learning_rate": 4.8593869846710307e-05, + "loss": 0.4284, + "step": 3218 + }, + { + "epoch": 0.17672886937431395, + "grad_norm": 2.0680997371673584, + "learning_rate": 4.859213806995924e-05, + "loss": 0.4486, + "step": 3220 + }, + { + "epoch": 0.17683863885839737, + "grad_norm": 1.8307222127914429, + "learning_rate": 4.8590405258341195e-05, + "loss": 0.4012, + "step": 3222 + }, + { + "epoch": 0.1769484083424808, + "grad_norm": 1.508884072303772, + "learning_rate": 4.858867141193219e-05, + "loss": 0.4331, + "step": 3224 + }, + { + "epoch": 0.17705817782656422, + "grad_norm": 2.6606924533843994, + "learning_rate": 4.858693653080828e-05, + "loss": 0.3882, + "step": 3226 + }, + { + "epoch": 0.17716794731064764, + "grad_norm": 2.5827724933624268, + "learning_rate": 4.8585200615045555e-05, + "loss": 0.4374, + "step": 3228 + }, + { + "epoch": 0.17727771679473106, + "grad_norm": 2.1030287742614746, + "learning_rate": 4.8583463664720176e-05, + "loss": 0.3993, + "step": 3230 + }, + { + "epoch": 0.17738748627881448, + "grad_norm": 3.39654803276062, + "learning_rate": 4.8581725679908317e-05, + "loss": 0.3058, + "step": 3232 + }, + { + "epoch": 0.1774972557628979, + "grad_norm": 2.1271848678588867, + "learning_rate": 4.857998666068624e-05, + "loss": 0.4674, + "step": 3234 + }, + { + "epoch": 0.17760702524698133, + "grad_norm": 2.157992362976074, + "learning_rate": 4.85782466071302e-05, + "loss": 0.6202, + "step": 3236 + }, + { + "epoch": 0.17771679473106478, + "grad_norm": 2.835108995437622, + "learning_rate": 4.857650551931653e-05, + "loss": 0.4046, + "step": 3238 + }, + { + "epoch": 0.1778265642151482, + "grad_norm": 2.501436471939087, + "learning_rate": 4.8574763397321614e-05, + "loss": 0.2931, + "step": 3240 + }, + { + "epoch": 0.17793633369923162, + "grad_norm": 2.1563851833343506, + "learning_rate": 4.857302024122186e-05, + "loss": 0.3827, + "step": 3242 + }, + { + "epoch": 0.17804610318331504, + "grad_norm": 2.0141687393188477, + "learning_rate": 4.857127605109374e-05, + "loss": 0.467, + "step": 3244 + }, + { + "epoch": 0.17815587266739846, + "grad_norm": 1.9657913446426392, + "learning_rate": 4.8569530827013756e-05, + "loss": 0.4113, + "step": 3246 + }, + { + "epoch": 0.17826564215148188, + "grad_norm": 1.8925089836120605, + "learning_rate": 4.856778456905846e-05, + "loss": 0.4488, + "step": 3248 + }, + { + "epoch": 0.1783754116355653, + "grad_norm": 2.6134490966796875, + "learning_rate": 4.856603727730447e-05, + "loss": 0.6492, + "step": 3250 + }, + { + "epoch": 0.17848518111964873, + "grad_norm": 2.33992600440979, + "learning_rate": 4.85642889518284e-05, + "loss": 0.3685, + "step": 3252 + }, + { + "epoch": 0.17859495060373215, + "grad_norm": 1.9827073812484741, + "learning_rate": 4.8562539592706956e-05, + "loss": 0.348, + "step": 3254 + }, + { + "epoch": 0.1787047200878156, + "grad_norm": 4.279092311859131, + "learning_rate": 4.8560789200016884e-05, + "loss": 0.3973, + "step": 3256 + }, + { + "epoch": 0.17881448957189902, + "grad_norm": 1.5967990159988403, + "learning_rate": 4.855903777383495e-05, + "loss": 0.3155, + "step": 3258 + }, + { + "epoch": 0.17892425905598244, + "grad_norm": 1.5897514820098877, + "learning_rate": 4.855728531423798e-05, + "loss": 0.4265, + "step": 3260 + }, + { + "epoch": 0.17903402854006586, + "grad_norm": 2.7456607818603516, + "learning_rate": 4.8555531821302855e-05, + "loss": 0.4232, + "step": 3262 + }, + { + "epoch": 0.1791437980241493, + "grad_norm": 1.869880199432373, + "learning_rate": 4.855377729510648e-05, + "loss": 0.3876, + "step": 3264 + }, + { + "epoch": 0.1792535675082327, + "grad_norm": 1.6606590747833252, + "learning_rate": 4.855202173572584e-05, + "loss": 0.345, + "step": 3266 + }, + { + "epoch": 0.17936333699231613, + "grad_norm": 1.5022339820861816, + "learning_rate": 4.855026514323792e-05, + "loss": 0.3202, + "step": 3268 + }, + { + "epoch": 0.17947310647639955, + "grad_norm": 1.3215463161468506, + "learning_rate": 4.854850751771977e-05, + "loss": 0.4883, + "step": 3270 + }, + { + "epoch": 0.17958287596048297, + "grad_norm": 2.439467191696167, + "learning_rate": 4.85467488592485e-05, + "loss": 0.5053, + "step": 3272 + }, + { + "epoch": 0.17969264544456642, + "grad_norm": 2.95265531539917, + "learning_rate": 4.8544989167901256e-05, + "loss": 0.4639, + "step": 3274 + }, + { + "epoch": 0.17980241492864985, + "grad_norm": 2.244610071182251, + "learning_rate": 4.854322844375522e-05, + "loss": 0.4408, + "step": 3276 + }, + { + "epoch": 0.17991218441273327, + "grad_norm": 2.2163848876953125, + "learning_rate": 4.854146668688763e-05, + "loss": 0.455, + "step": 3278 + }, + { + "epoch": 0.1800219538968167, + "grad_norm": 3.4411754608154297, + "learning_rate": 4.8539703897375755e-05, + "loss": 0.3672, + "step": 3280 + }, + { + "epoch": 0.1801317233809001, + "grad_norm": 2.8960399627685547, + "learning_rate": 4.853794007529693e-05, + "loss": 0.3533, + "step": 3282 + }, + { + "epoch": 0.18024149286498353, + "grad_norm": 2.3136839866638184, + "learning_rate": 4.853617522072853e-05, + "loss": 0.5044, + "step": 3284 + }, + { + "epoch": 0.18035126234906695, + "grad_norm": 3.0554957389831543, + "learning_rate": 4.8534409333747954e-05, + "loss": 0.4248, + "step": 3286 + }, + { + "epoch": 0.18046103183315038, + "grad_norm": 2.457514524459839, + "learning_rate": 4.8532642414432674e-05, + "loss": 0.5874, + "step": 3288 + }, + { + "epoch": 0.1805708013172338, + "grad_norm": 2.7749974727630615, + "learning_rate": 4.8530874462860194e-05, + "loss": 0.3468, + "step": 3290 + }, + { + "epoch": 0.18068057080131725, + "grad_norm": 2.5279529094696045, + "learning_rate": 4.852910547910806e-05, + "loss": 0.4275, + "step": 3292 + }, + { + "epoch": 0.18079034028540067, + "grad_norm": 2.1423869132995605, + "learning_rate": 4.8527335463253874e-05, + "loss": 0.341, + "step": 3294 + }, + { + "epoch": 0.1809001097694841, + "grad_norm": 2.4018473625183105, + "learning_rate": 4.852556441537528e-05, + "loss": 0.4858, + "step": 3296 + }, + { + "epoch": 0.1810098792535675, + "grad_norm": 2.0305228233337402, + "learning_rate": 4.852379233554996e-05, + "loss": 0.578, + "step": 3298 + }, + { + "epoch": 0.18111964873765093, + "grad_norm": 2.4089319705963135, + "learning_rate": 4.852201922385564e-05, + "loss": 0.3952, + "step": 3300 + }, + { + "epoch": 0.18122941822173436, + "grad_norm": 1.5017364025115967, + "learning_rate": 4.852024508037011e-05, + "loss": 0.359, + "step": 3302 + }, + { + "epoch": 0.18133918770581778, + "grad_norm": 1.751083493232727, + "learning_rate": 4.851846990517118e-05, + "loss": 0.4689, + "step": 3304 + }, + { + "epoch": 0.1814489571899012, + "grad_norm": 2.369723320007324, + "learning_rate": 4.851669369833673e-05, + "loss": 0.4406, + "step": 3306 + }, + { + "epoch": 0.18155872667398462, + "grad_norm": 1.4014800786972046, + "learning_rate": 4.8514916459944666e-05, + "loss": 0.4833, + "step": 3308 + }, + { + "epoch": 0.18166849615806804, + "grad_norm": 2.010829448699951, + "learning_rate": 4.851313819007295e-05, + "loss": 0.3839, + "step": 3310 + }, + { + "epoch": 0.1817782656421515, + "grad_norm": 1.8912054300308228, + "learning_rate": 4.851135888879958e-05, + "loss": 0.3603, + "step": 3312 + }, + { + "epoch": 0.18188803512623491, + "grad_norm": 1.4227039813995361, + "learning_rate": 4.8509578556202606e-05, + "loss": 0.4679, + "step": 3314 + }, + { + "epoch": 0.18199780461031834, + "grad_norm": 3.493694305419922, + "learning_rate": 4.8507797192360134e-05, + "loss": 0.3544, + "step": 3316 + }, + { + "epoch": 0.18210757409440176, + "grad_norm": 1.636831521987915, + "learning_rate": 4.850601479735029e-05, + "loss": 0.3693, + "step": 3318 + }, + { + "epoch": 0.18221734357848518, + "grad_norm": 3.0660128593444824, + "learning_rate": 4.8504231371251255e-05, + "loss": 0.4218, + "step": 3320 + }, + { + "epoch": 0.1823271130625686, + "grad_norm": 3.159005641937256, + "learning_rate": 4.850244691414128e-05, + "loss": 0.4808, + "step": 3322 + }, + { + "epoch": 0.18243688254665202, + "grad_norm": 2.195775270462036, + "learning_rate": 4.850066142609862e-05, + "loss": 0.3963, + "step": 3324 + }, + { + "epoch": 0.18254665203073545, + "grad_norm": 3.3864526748657227, + "learning_rate": 4.8498874907201594e-05, + "loss": 0.3962, + "step": 3326 + }, + { + "epoch": 0.18265642151481887, + "grad_norm": 2.3724989891052246, + "learning_rate": 4.849708735752859e-05, + "loss": 0.3223, + "step": 3328 + }, + { + "epoch": 0.18276619099890232, + "grad_norm": 2.429733991622925, + "learning_rate": 4.849529877715799e-05, + "loss": 0.4901, + "step": 3330 + }, + { + "epoch": 0.18287596048298574, + "grad_norm": 3.059931993484497, + "learning_rate": 4.849350916616827e-05, + "loss": 0.3143, + "step": 3332 + }, + { + "epoch": 0.18298572996706916, + "grad_norm": 1.733731985092163, + "learning_rate": 4.849171852463793e-05, + "loss": 0.3604, + "step": 3334 + }, + { + "epoch": 0.18309549945115258, + "grad_norm": 1.4118953943252563, + "learning_rate": 4.8489926852645505e-05, + "loss": 0.429, + "step": 3336 + }, + { + "epoch": 0.183205268935236, + "grad_norm": 1.9299349784851074, + "learning_rate": 4.84881341502696e-05, + "loss": 0.4745, + "step": 3338 + }, + { + "epoch": 0.18331503841931943, + "grad_norm": 1.6707870960235596, + "learning_rate": 4.8486340417588835e-05, + "loss": 0.3583, + "step": 3340 + }, + { + "epoch": 0.18342480790340285, + "grad_norm": 2.102996826171875, + "learning_rate": 4.848454565468191e-05, + "loss": 0.4058, + "step": 3342 + }, + { + "epoch": 0.18353457738748627, + "grad_norm": 2.820221185684204, + "learning_rate": 4.848274986162754e-05, + "loss": 0.525, + "step": 3344 + }, + { + "epoch": 0.1836443468715697, + "grad_norm": 1.765818476676941, + "learning_rate": 4.84809530385045e-05, + "loss": 0.3075, + "step": 3346 + }, + { + "epoch": 0.18375411635565314, + "grad_norm": 2.084256887435913, + "learning_rate": 4.847915518539161e-05, + "loss": 0.3881, + "step": 3348 + }, + { + "epoch": 0.18386388583973656, + "grad_norm": 1.4945929050445557, + "learning_rate": 4.847735630236773e-05, + "loss": 0.5081, + "step": 3350 + }, + { + "epoch": 0.18397365532381998, + "grad_norm": 1.7525147199630737, + "learning_rate": 4.847555638951177e-05, + "loss": 0.4516, + "step": 3352 + }, + { + "epoch": 0.1840834248079034, + "grad_norm": 2.3277363777160645, + "learning_rate": 4.847375544690268e-05, + "loss": 0.4694, + "step": 3354 + }, + { + "epoch": 0.18419319429198683, + "grad_norm": 1.769914984703064, + "learning_rate": 4.8471953474619466e-05, + "loss": 0.322, + "step": 3356 + }, + { + "epoch": 0.18430296377607025, + "grad_norm": 1.490169644355774, + "learning_rate": 4.847015047274116e-05, + "loss": 0.3963, + "step": 3358 + }, + { + "epoch": 0.18441273326015367, + "grad_norm": 2.9485087394714355, + "learning_rate": 4.846834644134686e-05, + "loss": 0.574, + "step": 3360 + }, + { + "epoch": 0.1845225027442371, + "grad_norm": 1.2109450101852417, + "learning_rate": 4.846654138051569e-05, + "loss": 0.293, + "step": 3362 + }, + { + "epoch": 0.18463227222832052, + "grad_norm": 2.2310686111450195, + "learning_rate": 4.846473529032684e-05, + "loss": 0.3526, + "step": 3364 + }, + { + "epoch": 0.18474204171240396, + "grad_norm": 1.8529998064041138, + "learning_rate": 4.8462928170859525e-05, + "loss": 0.4199, + "step": 3366 + }, + { + "epoch": 0.1848518111964874, + "grad_norm": 1.5979466438293457, + "learning_rate": 4.846112002219301e-05, + "loss": 0.2637, + "step": 3368 + }, + { + "epoch": 0.1849615806805708, + "grad_norm": 1.1421658992767334, + "learning_rate": 4.845931084440662e-05, + "loss": 0.2924, + "step": 3370 + }, + { + "epoch": 0.18507135016465423, + "grad_norm": 1.9188330173492432, + "learning_rate": 4.8457500637579726e-05, + "loss": 0.4418, + "step": 3372 + }, + { + "epoch": 0.18518111964873765, + "grad_norm": 2.243230104446411, + "learning_rate": 4.8455689401791706e-05, + "loss": 0.5103, + "step": 3374 + }, + { + "epoch": 0.18529088913282107, + "grad_norm": 2.3322834968566895, + "learning_rate": 4.845387713712203e-05, + "loss": 0.5419, + "step": 3376 + }, + { + "epoch": 0.1854006586169045, + "grad_norm": 1.5294543504714966, + "learning_rate": 4.845206384365018e-05, + "loss": 0.3255, + "step": 3378 + }, + { + "epoch": 0.18551042810098792, + "grad_norm": 1.7108384370803833, + "learning_rate": 4.8450249521455695e-05, + "loss": 0.4679, + "step": 3380 + }, + { + "epoch": 0.18562019758507134, + "grad_norm": 1.651991605758667, + "learning_rate": 4.844843417061816e-05, + "loss": 0.276, + "step": 3382 + }, + { + "epoch": 0.1857299670691548, + "grad_norm": 2.8964931964874268, + "learning_rate": 4.844661779121723e-05, + "loss": 0.4436, + "step": 3384 + }, + { + "epoch": 0.1858397365532382, + "grad_norm": 1.0027337074279785, + "learning_rate": 4.844480038333255e-05, + "loss": 0.2476, + "step": 3386 + }, + { + "epoch": 0.18594950603732163, + "grad_norm": 7.182661056518555, + "learning_rate": 4.844298194704384e-05, + "loss": 0.4591, + "step": 3388 + }, + { + "epoch": 0.18605927552140505, + "grad_norm": 2.3614203929901123, + "learning_rate": 4.844116248243089e-05, + "loss": 0.4474, + "step": 3390 + }, + { + "epoch": 0.18616904500548848, + "grad_norm": 2.6233062744140625, + "learning_rate": 4.84393419895735e-05, + "loss": 0.4239, + "step": 3392 + }, + { + "epoch": 0.1862788144895719, + "grad_norm": 1.554219365119934, + "learning_rate": 4.8437520468551514e-05, + "loss": 0.3415, + "step": 3394 + }, + { + "epoch": 0.18638858397365532, + "grad_norm": 1.8682836294174194, + "learning_rate": 4.843569791944486e-05, + "loss": 0.6398, + "step": 3396 + }, + { + "epoch": 0.18649835345773874, + "grad_norm": 1.6368458271026611, + "learning_rate": 4.843387434233345e-05, + "loss": 0.4912, + "step": 3398 + }, + { + "epoch": 0.18660812294182216, + "grad_norm": 3.3570964336395264, + "learning_rate": 4.843204973729729e-05, + "loss": 0.3656, + "step": 3400 + }, + { + "epoch": 0.18671789242590558, + "grad_norm": 1.5955463647842407, + "learning_rate": 4.843022410441642e-05, + "loss": 0.4071, + "step": 3402 + }, + { + "epoch": 0.18682766190998903, + "grad_norm": 2.387195348739624, + "learning_rate": 4.8428397443770926e-05, + "loss": 0.3634, + "step": 3404 + }, + { + "epoch": 0.18693743139407246, + "grad_norm": 2.0995588302612305, + "learning_rate": 4.842656975544092e-05, + "loss": 0.3963, + "step": 3406 + }, + { + "epoch": 0.18704720087815588, + "grad_norm": 2.612987756729126, + "learning_rate": 4.8424741039506575e-05, + "loss": 0.4189, + "step": 3408 + }, + { + "epoch": 0.1871569703622393, + "grad_norm": 2.7765398025512695, + "learning_rate": 4.842291129604812e-05, + "loss": 0.4255, + "step": 3410 + }, + { + "epoch": 0.18726673984632272, + "grad_norm": 2.7147183418273926, + "learning_rate": 4.842108052514581e-05, + "loss": 0.4742, + "step": 3412 + }, + { + "epoch": 0.18737650933040614, + "grad_norm": 2.9730162620544434, + "learning_rate": 4.841924872687995e-05, + "loss": 0.4255, + "step": 3414 + }, + { + "epoch": 0.18748627881448957, + "grad_norm": 3.0310564041137695, + "learning_rate": 4.8417415901330886e-05, + "loss": 0.3628, + "step": 3416 + }, + { + "epoch": 0.187596048298573, + "grad_norm": 1.5199607610702515, + "learning_rate": 4.8415582048579035e-05, + "loss": 0.3836, + "step": 3418 + }, + { + "epoch": 0.1877058177826564, + "grad_norm": 4.735464096069336, + "learning_rate": 4.841374716870481e-05, + "loss": 0.551, + "step": 3420 + }, + { + "epoch": 0.18781558726673986, + "grad_norm": 1.755795955657959, + "learning_rate": 4.8411911261788726e-05, + "loss": 0.4782, + "step": 3422 + }, + { + "epoch": 0.18792535675082328, + "grad_norm": 1.955367922782898, + "learning_rate": 4.841007432791129e-05, + "loss": 0.3552, + "step": 3424 + }, + { + "epoch": 0.1880351262349067, + "grad_norm": 1.7440489530563354, + "learning_rate": 4.840823636715309e-05, + "loss": 0.3705, + "step": 3426 + }, + { + "epoch": 0.18814489571899012, + "grad_norm": 2.389270782470703, + "learning_rate": 4.840639737959476e-05, + "loss": 0.3653, + "step": 3428 + }, + { + "epoch": 0.18825466520307355, + "grad_norm": 3.381563186645508, + "learning_rate": 4.840455736531695e-05, + "loss": 0.4379, + "step": 3430 + }, + { + "epoch": 0.18836443468715697, + "grad_norm": 2.1786224842071533, + "learning_rate": 4.840271632440038e-05, + "loss": 0.518, + "step": 3432 + }, + { + "epoch": 0.1884742041712404, + "grad_norm": 2.0637190341949463, + "learning_rate": 4.8400874256925796e-05, + "loss": 0.4629, + "step": 3434 + }, + { + "epoch": 0.1885839736553238, + "grad_norm": 1.600815773010254, + "learning_rate": 4.839903116297401e-05, + "loss": 0.482, + "step": 3436 + }, + { + "epoch": 0.18869374313940723, + "grad_norm": 1.3976746797561646, + "learning_rate": 4.839718704262587e-05, + "loss": 0.3245, + "step": 3438 + }, + { + "epoch": 0.18880351262349068, + "grad_norm": 1.9523875713348389, + "learning_rate": 4.839534189596228e-05, + "loss": 0.37, + "step": 3440 + }, + { + "epoch": 0.1889132821075741, + "grad_norm": 1.495551347732544, + "learning_rate": 4.839349572306414e-05, + "loss": 0.2887, + "step": 3442 + }, + { + "epoch": 0.18902305159165753, + "grad_norm": 1.226603388786316, + "learning_rate": 4.839164852401247e-05, + "loss": 0.3655, + "step": 3444 + }, + { + "epoch": 0.18913282107574095, + "grad_norm": 2.3480401039123535, + "learning_rate": 4.838980029888828e-05, + "loss": 0.4019, + "step": 3446 + }, + { + "epoch": 0.18924259055982437, + "grad_norm": 2.136697292327881, + "learning_rate": 4.838795104777265e-05, + "loss": 0.4307, + "step": 3448 + }, + { + "epoch": 0.1893523600439078, + "grad_norm": 7.632808208465576, + "learning_rate": 4.838610077074669e-05, + "loss": 0.4947, + "step": 3450 + }, + { + "epoch": 0.1894621295279912, + "grad_norm": 1.140358328819275, + "learning_rate": 4.838424946789156e-05, + "loss": 0.3476, + "step": 3452 + }, + { + "epoch": 0.18957189901207464, + "grad_norm": 2.4868338108062744, + "learning_rate": 4.8382397139288474e-05, + "loss": 0.4275, + "step": 3454 + }, + { + "epoch": 0.18968166849615806, + "grad_norm": 1.2881617546081543, + "learning_rate": 4.8380543785018677e-05, + "loss": 0.2934, + "step": 3456 + }, + { + "epoch": 0.1897914379802415, + "grad_norm": 1.3311318159103394, + "learning_rate": 4.837868940516348e-05, + "loss": 0.3849, + "step": 3458 + }, + { + "epoch": 0.18990120746432493, + "grad_norm": 2.299922466278076, + "learning_rate": 4.837683399980421e-05, + "loss": 0.4253, + "step": 3460 + }, + { + "epoch": 0.19001097694840835, + "grad_norm": 2.5410804748535156, + "learning_rate": 4.837497756902226e-05, + "loss": 0.4588, + "step": 3462 + }, + { + "epoch": 0.19012074643249177, + "grad_norm": 2.263634443283081, + "learning_rate": 4.837312011289907e-05, + "loss": 0.5177, + "step": 3464 + }, + { + "epoch": 0.1902305159165752, + "grad_norm": 2.652181625366211, + "learning_rate": 4.83712616315161e-05, + "loss": 0.39, + "step": 3466 + }, + { + "epoch": 0.19034028540065862, + "grad_norm": 2.214254140853882, + "learning_rate": 4.836940212495489e-05, + "loss": 0.3704, + "step": 3468 + }, + { + "epoch": 0.19045005488474204, + "grad_norm": 3.990338087081909, + "learning_rate": 4.8367541593297e-05, + "loss": 0.4579, + "step": 3470 + }, + { + "epoch": 0.19055982436882546, + "grad_norm": 2.7707247734069824, + "learning_rate": 4.8365680036624026e-05, + "loss": 0.4806, + "step": 3472 + }, + { + "epoch": 0.19066959385290888, + "grad_norm": 1.5153566598892212, + "learning_rate": 4.8363817455017655e-05, + "loss": 0.2963, + "step": 3474 + }, + { + "epoch": 0.1907793633369923, + "grad_norm": 2.0539379119873047, + "learning_rate": 4.836195384855957e-05, + "loss": 0.4556, + "step": 3476 + }, + { + "epoch": 0.19088913282107575, + "grad_norm": 2.3366811275482178, + "learning_rate": 4.8360089217331525e-05, + "loss": 0.4439, + "step": 3478 + }, + { + "epoch": 0.19099890230515917, + "grad_norm": 1.1428523063659668, + "learning_rate": 4.8358223561415304e-05, + "loss": 0.364, + "step": 3480 + }, + { + "epoch": 0.1911086717892426, + "grad_norm": 1.3581838607788086, + "learning_rate": 4.8356356880892754e-05, + "loss": 0.3633, + "step": 3482 + }, + { + "epoch": 0.19121844127332602, + "grad_norm": 3.4152746200561523, + "learning_rate": 4.835448917584574e-05, + "loss": 0.4926, + "step": 3484 + }, + { + "epoch": 0.19132821075740944, + "grad_norm": 2.888129234313965, + "learning_rate": 4.835262044635621e-05, + "loss": 0.3294, + "step": 3486 + }, + { + "epoch": 0.19143798024149286, + "grad_norm": 1.5096670389175415, + "learning_rate": 4.835075069250613e-05, + "loss": 0.5189, + "step": 3488 + }, + { + "epoch": 0.19154774972557628, + "grad_norm": 2.980353832244873, + "learning_rate": 4.8348879914377504e-05, + "loss": 0.4303, + "step": 3490 + }, + { + "epoch": 0.1916575192096597, + "grad_norm": 1.5787807703018188, + "learning_rate": 4.834700811205241e-05, + "loss": 0.2143, + "step": 3492 + }, + { + "epoch": 0.19176728869374313, + "grad_norm": 2.6501283645629883, + "learning_rate": 4.834513528561293e-05, + "loss": 0.4096, + "step": 3494 + }, + { + "epoch": 0.19187705817782658, + "grad_norm": 1.3578392267227173, + "learning_rate": 4.8343261435141244e-05, + "loss": 0.3766, + "step": 3496 + }, + { + "epoch": 0.19198682766191, + "grad_norm": 1.8643882274627686, + "learning_rate": 4.8341386560719534e-05, + "loss": 0.3041, + "step": 3498 + }, + { + "epoch": 0.19209659714599342, + "grad_norm": 1.373505711555481, + "learning_rate": 4.8339510662430046e-05, + "loss": 0.3752, + "step": 3500 + }, + { + "epoch": 0.19220636663007684, + "grad_norm": 2.4860196113586426, + "learning_rate": 4.8337633740355056e-05, + "loss": 0.5049, + "step": 3502 + }, + { + "epoch": 0.19231613611416026, + "grad_norm": 2.417325973510742, + "learning_rate": 4.833575579457691e-05, + "loss": 0.3401, + "step": 3504 + }, + { + "epoch": 0.19242590559824369, + "grad_norm": 1.9478113651275635, + "learning_rate": 4.8333876825177975e-05, + "loss": 0.487, + "step": 3506 + }, + { + "epoch": 0.1925356750823271, + "grad_norm": 2.3389906883239746, + "learning_rate": 4.8331996832240675e-05, + "loss": 0.3888, + "step": 3508 + }, + { + "epoch": 0.19264544456641053, + "grad_norm": 3.0102243423461914, + "learning_rate": 4.8330115815847465e-05, + "loss": 0.3698, + "step": 3510 + }, + { + "epoch": 0.19275521405049395, + "grad_norm": 2.0829660892486572, + "learning_rate": 4.832823377608087e-05, + "loss": 0.4182, + "step": 3512 + }, + { + "epoch": 0.1928649835345774, + "grad_norm": 1.2830010652542114, + "learning_rate": 4.832635071302344e-05, + "loss": 0.3203, + "step": 3514 + }, + { + "epoch": 0.19297475301866082, + "grad_norm": 1.6332365274429321, + "learning_rate": 4.8324466626757775e-05, + "loss": 0.4752, + "step": 3516 + }, + { + "epoch": 0.19308452250274424, + "grad_norm": 1.3601250648498535, + "learning_rate": 4.832258151736652e-05, + "loss": 0.2799, + "step": 3518 + }, + { + "epoch": 0.19319429198682767, + "grad_norm": 1.4841405153274536, + "learning_rate": 4.832069538493237e-05, + "loss": 0.3575, + "step": 3520 + }, + { + "epoch": 0.1933040614709111, + "grad_norm": 5.129024505615234, + "learning_rate": 4.8318808229538045e-05, + "loss": 0.4769, + "step": 3522 + }, + { + "epoch": 0.1934138309549945, + "grad_norm": 1.500874638557434, + "learning_rate": 4.8316920051266343e-05, + "loss": 0.3531, + "step": 3524 + }, + { + "epoch": 0.19352360043907793, + "grad_norm": 2.026702880859375, + "learning_rate": 4.831503085020008e-05, + "loss": 0.5537, + "step": 3526 + }, + { + "epoch": 0.19363336992316135, + "grad_norm": 3.9163124561309814, + "learning_rate": 4.8313140626422125e-05, + "loss": 0.4275, + "step": 3528 + }, + { + "epoch": 0.19374313940724477, + "grad_norm": 2.433232307434082, + "learning_rate": 4.83112493800154e-05, + "loss": 0.4308, + "step": 3530 + }, + { + "epoch": 0.19385290889132822, + "grad_norm": 2.304945468902588, + "learning_rate": 4.8309357111062856e-05, + "loss": 0.4102, + "step": 3532 + }, + { + "epoch": 0.19396267837541165, + "grad_norm": 1.6091928482055664, + "learning_rate": 4.83074638196475e-05, + "loss": 0.3544, + "step": 3534 + }, + { + "epoch": 0.19407244785949507, + "grad_norm": 2.2606542110443115, + "learning_rate": 4.830556950585238e-05, + "loss": 0.4555, + "step": 3536 + }, + { + "epoch": 0.1941822173435785, + "grad_norm": 1.67505943775177, + "learning_rate": 4.8303674169760594e-05, + "loss": 0.4453, + "step": 3538 + }, + { + "epoch": 0.1942919868276619, + "grad_norm": 1.7176194190979004, + "learning_rate": 4.8301777811455276e-05, + "loss": 0.3884, + "step": 3540 + }, + { + "epoch": 0.19440175631174533, + "grad_norm": 1.9634898900985718, + "learning_rate": 4.8299880431019614e-05, + "loss": 0.3484, + "step": 3542 + }, + { + "epoch": 0.19451152579582875, + "grad_norm": 2.477541923522949, + "learning_rate": 4.8297982028536826e-05, + "loss": 0.3891, + "step": 3544 + }, + { + "epoch": 0.19462129527991218, + "grad_norm": 1.4411896467208862, + "learning_rate": 4.82960826040902e-05, + "loss": 0.4925, + "step": 3546 + }, + { + "epoch": 0.1947310647639956, + "grad_norm": 1.6899806261062622, + "learning_rate": 4.8294182157763044e-05, + "loss": 0.5113, + "step": 3548 + }, + { + "epoch": 0.19484083424807905, + "grad_norm": 1.3991827964782715, + "learning_rate": 4.8292280689638725e-05, + "loss": 0.4595, + "step": 3550 + }, + { + "epoch": 0.19495060373216247, + "grad_norm": 2.8396389484405518, + "learning_rate": 4.829037819980065e-05, + "loss": 0.4447, + "step": 3552 + }, + { + "epoch": 0.1950603732162459, + "grad_norm": 1.757897973060608, + "learning_rate": 4.828847468833228e-05, + "loss": 0.2781, + "step": 3554 + }, + { + "epoch": 0.1951701427003293, + "grad_norm": 2.9618940353393555, + "learning_rate": 4.828657015531709e-05, + "loss": 0.3768, + "step": 3556 + }, + { + "epoch": 0.19527991218441274, + "grad_norm": 2.0119779109954834, + "learning_rate": 4.828466460083864e-05, + "loss": 0.4531, + "step": 3558 + }, + { + "epoch": 0.19538968166849616, + "grad_norm": 1.5942295789718628, + "learning_rate": 4.828275802498051e-05, + "loss": 0.369, + "step": 3560 + }, + { + "epoch": 0.19549945115257958, + "grad_norm": 2.053464889526367, + "learning_rate": 4.8280850427826344e-05, + "loss": 0.4262, + "step": 3562 + }, + { + "epoch": 0.195609220636663, + "grad_norm": 1.9995553493499756, + "learning_rate": 4.82789418094598e-05, + "loss": 0.3428, + "step": 3564 + }, + { + "epoch": 0.19571899012074642, + "grad_norm": 2.53166127204895, + "learning_rate": 4.827703216996461e-05, + "loss": 0.4014, + "step": 3566 + }, + { + "epoch": 0.19582875960482984, + "grad_norm": 1.6071888208389282, + "learning_rate": 4.827512150942454e-05, + "loss": 0.4801, + "step": 3568 + }, + { + "epoch": 0.1959385290889133, + "grad_norm": 1.4288541078567505, + "learning_rate": 4.827320982792339e-05, + "loss": 0.6373, + "step": 3570 + }, + { + "epoch": 0.19604829857299672, + "grad_norm": 4.021005153656006, + "learning_rate": 4.827129712554504e-05, + "loss": 0.358, + "step": 3572 + }, + { + "epoch": 0.19615806805708014, + "grad_norm": 2.1209285259246826, + "learning_rate": 4.826938340237337e-05, + "loss": 0.4097, + "step": 3574 + }, + { + "epoch": 0.19626783754116356, + "grad_norm": 3.8589234352111816, + "learning_rate": 4.8267468658492335e-05, + "loss": 0.3823, + "step": 3576 + }, + { + "epoch": 0.19637760702524698, + "grad_norm": 1.778193712234497, + "learning_rate": 4.826555289398591e-05, + "loss": 0.3148, + "step": 3578 + }, + { + "epoch": 0.1964873765093304, + "grad_norm": 1.6296138763427734, + "learning_rate": 4.8263636108938156e-05, + "loss": 0.5069, + "step": 3580 + }, + { + "epoch": 0.19659714599341382, + "grad_norm": 1.8278764486312866, + "learning_rate": 4.826171830343313e-05, + "loss": 0.4111, + "step": 3582 + }, + { + "epoch": 0.19670691547749725, + "grad_norm": 1.8989523649215698, + "learning_rate": 4.8259799477554965e-05, + "loss": 0.4632, + "step": 3584 + }, + { + "epoch": 0.19681668496158067, + "grad_norm": 3.9177401065826416, + "learning_rate": 4.8257879631387825e-05, + "loss": 0.5309, + "step": 3586 + }, + { + "epoch": 0.19692645444566412, + "grad_norm": 3.382319211959839, + "learning_rate": 4.825595876501593e-05, + "loss": 0.4055, + "step": 3588 + }, + { + "epoch": 0.19703622392974754, + "grad_norm": 1.975528597831726, + "learning_rate": 4.825403687852354e-05, + "loss": 0.4804, + "step": 3590 + }, + { + "epoch": 0.19714599341383096, + "grad_norm": 1.371712565422058, + "learning_rate": 4.825211397199495e-05, + "loss": 0.3338, + "step": 3592 + }, + { + "epoch": 0.19725576289791438, + "grad_norm": 2.2633841037750244, + "learning_rate": 4.825019004551452e-05, + "loss": 0.3263, + "step": 3594 + }, + { + "epoch": 0.1973655323819978, + "grad_norm": 2.5708770751953125, + "learning_rate": 4.8248265099166634e-05, + "loss": 0.4108, + "step": 3596 + }, + { + "epoch": 0.19747530186608123, + "grad_norm": 1.4257043600082397, + "learning_rate": 4.8246339133035726e-05, + "loss": 0.3592, + "step": 3598 + }, + { + "epoch": 0.19758507135016465, + "grad_norm": 2.09321928024292, + "learning_rate": 4.8244412147206284e-05, + "loss": 0.3935, + "step": 3600 + }, + { + "epoch": 0.19769484083424807, + "grad_norm": 2.771012306213379, + "learning_rate": 4.824248414176284e-05, + "loss": 0.3306, + "step": 3602 + }, + { + "epoch": 0.1978046103183315, + "grad_norm": 1.7103081941604614, + "learning_rate": 4.8240555116789964e-05, + "loss": 0.3174, + "step": 3604 + }, + { + "epoch": 0.19791437980241494, + "grad_norm": 1.2474560737609863, + "learning_rate": 4.823862507237226e-05, + "loss": 0.2998, + "step": 3606 + }, + { + "epoch": 0.19802414928649836, + "grad_norm": 2.0913825035095215, + "learning_rate": 4.8236694008594405e-05, + "loss": 0.4375, + "step": 3608 + }, + { + "epoch": 0.19813391877058179, + "grad_norm": 1.8185617923736572, + "learning_rate": 4.823476192554109e-05, + "loss": 0.3099, + "step": 3610 + }, + { + "epoch": 0.1982436882546652, + "grad_norm": 1.5044342279434204, + "learning_rate": 4.8232828823297085e-05, + "loss": 0.3818, + "step": 3612 + }, + { + "epoch": 0.19835345773874863, + "grad_norm": 2.8721306324005127, + "learning_rate": 4.823089470194717e-05, + "loss": 0.6648, + "step": 3614 + }, + { + "epoch": 0.19846322722283205, + "grad_norm": 3.6556382179260254, + "learning_rate": 4.822895956157619e-05, + "loss": 0.4468, + "step": 3616 + }, + { + "epoch": 0.19857299670691547, + "grad_norm": 2.454815626144409, + "learning_rate": 4.8227023402269025e-05, + "loss": 0.3342, + "step": 3618 + }, + { + "epoch": 0.1986827661909989, + "grad_norm": 1.950376272201538, + "learning_rate": 4.8225086224110615e-05, + "loss": 0.4189, + "step": 3620 + }, + { + "epoch": 0.19879253567508232, + "grad_norm": 2.0487842559814453, + "learning_rate": 4.822314802718593e-05, + "loss": 0.544, + "step": 3622 + }, + { + "epoch": 0.19890230515916577, + "grad_norm": 1.545840859413147, + "learning_rate": 4.822120881157998e-05, + "loss": 0.3874, + "step": 3624 + }, + { + "epoch": 0.1990120746432492, + "grad_norm": 1.4182488918304443, + "learning_rate": 4.821926857737783e-05, + "loss": 0.3491, + "step": 3626 + }, + { + "epoch": 0.1991218441273326, + "grad_norm": 1.9558861255645752, + "learning_rate": 4.8217327324664595e-05, + "loss": 0.4272, + "step": 3628 + }, + { + "epoch": 0.19923161361141603, + "grad_norm": 1.2986009120941162, + "learning_rate": 4.821538505352543e-05, + "loss": 0.332, + "step": 3630 + }, + { + "epoch": 0.19934138309549945, + "grad_norm": 2.43734073638916, + "learning_rate": 4.821344176404554e-05, + "loss": 0.2974, + "step": 3632 + }, + { + "epoch": 0.19945115257958287, + "grad_norm": 1.9493643045425415, + "learning_rate": 4.821149745631014e-05, + "loss": 0.2972, + "step": 3634 + }, + { + "epoch": 0.1995609220636663, + "grad_norm": 3.8157925605773926, + "learning_rate": 4.820955213040454e-05, + "loss": 0.4696, + "step": 3636 + }, + { + "epoch": 0.19967069154774972, + "grad_norm": 1.8774893283843994, + "learning_rate": 4.820760578641406e-05, + "loss": 0.336, + "step": 3638 + }, + { + "epoch": 0.19978046103183314, + "grad_norm": 4.233694076538086, + "learning_rate": 4.820565842442408e-05, + "loss": 0.4987, + "step": 3640 + }, + { + "epoch": 0.19989023051591656, + "grad_norm": 1.7048786878585815, + "learning_rate": 4.8203710044520026e-05, + "loss": 0.3143, + "step": 3642 + }, + { + "epoch": 0.2, + "grad_norm": 1.8102807998657227, + "learning_rate": 4.8201760646787366e-05, + "loss": 0.3657, + "step": 3644 + }, + { + "epoch": 0.20010976948408343, + "grad_norm": 2.705460786819458, + "learning_rate": 4.819981023131159e-05, + "loss": 0.5458, + "step": 3646 + }, + { + "epoch": 0.20021953896816685, + "grad_norm": 2.1958765983581543, + "learning_rate": 4.819785879817827e-05, + "loss": 0.5337, + "step": 3648 + }, + { + "epoch": 0.20032930845225028, + "grad_norm": 2.6541380882263184, + "learning_rate": 4.8195906347473e-05, + "loss": 0.4535, + "step": 3650 + }, + { + "epoch": 0.2004390779363337, + "grad_norm": 2.433911085128784, + "learning_rate": 4.819395287928143e-05, + "loss": 0.2571, + "step": 3652 + }, + { + "epoch": 0.20054884742041712, + "grad_norm": 1.6709914207458496, + "learning_rate": 4.819199839368924e-05, + "loss": 0.3548, + "step": 3654 + }, + { + "epoch": 0.20065861690450054, + "grad_norm": 2.8162145614624023, + "learning_rate": 4.819004289078217e-05, + "loss": 0.3334, + "step": 3656 + }, + { + "epoch": 0.20076838638858396, + "grad_norm": 1.7834858894348145, + "learning_rate": 4.8188086370645994e-05, + "loss": 0.3144, + "step": 3658 + }, + { + "epoch": 0.20087815587266739, + "grad_norm": 2.0256688594818115, + "learning_rate": 4.818612883336654e-05, + "loss": 0.4641, + "step": 3660 + }, + { + "epoch": 0.20098792535675084, + "grad_norm": 2.887218952178955, + "learning_rate": 4.8184170279029664e-05, + "loss": 0.5313, + "step": 3662 + }, + { + "epoch": 0.20109769484083426, + "grad_norm": 1.3462895154953003, + "learning_rate": 4.8182210707721284e-05, + "loss": 0.3212, + "step": 3664 + }, + { + "epoch": 0.20120746432491768, + "grad_norm": 2.30818247795105, + "learning_rate": 4.818025011952737e-05, + "loss": 0.4361, + "step": 3666 + }, + { + "epoch": 0.2013172338090011, + "grad_norm": 1.301153302192688, + "learning_rate": 4.81782885145339e-05, + "loss": 0.3004, + "step": 3668 + }, + { + "epoch": 0.20142700329308452, + "grad_norm": 1.3619123697280884, + "learning_rate": 4.8176325892826926e-05, + "loss": 0.3439, + "step": 3670 + }, + { + "epoch": 0.20153677277716794, + "grad_norm": 2.237260580062866, + "learning_rate": 4.817436225449255e-05, + "loss": 0.4328, + "step": 3672 + }, + { + "epoch": 0.20164654226125137, + "grad_norm": 2.3982551097869873, + "learning_rate": 4.81723975996169e-05, + "loss": 0.4188, + "step": 3674 + }, + { + "epoch": 0.2017563117453348, + "grad_norm": 1.6313750743865967, + "learning_rate": 4.8170431928286155e-05, + "loss": 0.4842, + "step": 3676 + }, + { + "epoch": 0.2018660812294182, + "grad_norm": 1.974703311920166, + "learning_rate": 4.816846524058653e-05, + "loss": 0.3717, + "step": 3678 + }, + { + "epoch": 0.20197585071350166, + "grad_norm": 1.750830888748169, + "learning_rate": 4.81664975366043e-05, + "loss": 0.5423, + "step": 3680 + }, + { + "epoch": 0.20208562019758508, + "grad_norm": 2.301332473754883, + "learning_rate": 4.816452881642579e-05, + "loss": 0.4657, + "step": 3682 + }, + { + "epoch": 0.2021953896816685, + "grad_norm": 3.01485013961792, + "learning_rate": 4.8162559080137346e-05, + "loss": 0.4348, + "step": 3684 + }, + { + "epoch": 0.20230515916575192, + "grad_norm": 2.3994626998901367, + "learning_rate": 4.8160588327825374e-05, + "loss": 0.4455, + "step": 3686 + }, + { + "epoch": 0.20241492864983535, + "grad_norm": 1.8185398578643799, + "learning_rate": 4.815861655957632e-05, + "loss": 0.4072, + "step": 3688 + }, + { + "epoch": 0.20252469813391877, + "grad_norm": 1.6445631980895996, + "learning_rate": 4.8156643775476664e-05, + "loss": 0.4494, + "step": 3690 + }, + { + "epoch": 0.2026344676180022, + "grad_norm": 2.0141730308532715, + "learning_rate": 4.8154669975612966e-05, + "loss": 0.3527, + "step": 3692 + }, + { + "epoch": 0.2027442371020856, + "grad_norm": 2.0265259742736816, + "learning_rate": 4.815269516007179e-05, + "loss": 0.4771, + "step": 3694 + }, + { + "epoch": 0.20285400658616903, + "grad_norm": 1.7260929346084595, + "learning_rate": 4.8150719328939755e-05, + "loss": 0.5373, + "step": 3696 + }, + { + "epoch": 0.20296377607025248, + "grad_norm": 2.7216787338256836, + "learning_rate": 4.8148742482303545e-05, + "loss": 0.4225, + "step": 3698 + }, + { + "epoch": 0.2030735455543359, + "grad_norm": 1.528950810432434, + "learning_rate": 4.814676462024988e-05, + "loss": 0.4551, + "step": 3700 + }, + { + "epoch": 0.20318331503841933, + "grad_norm": 2.1860251426696777, + "learning_rate": 4.814478574286549e-05, + "loss": 0.3992, + "step": 3702 + }, + { + "epoch": 0.20329308452250275, + "grad_norm": 2.0621109008789062, + "learning_rate": 4.814280585023721e-05, + "loss": 0.362, + "step": 3704 + }, + { + "epoch": 0.20340285400658617, + "grad_norm": 1.36505925655365, + "learning_rate": 4.8140824942451856e-05, + "loss": 0.2527, + "step": 3706 + }, + { + "epoch": 0.2035126234906696, + "grad_norm": 2.4241554737091064, + "learning_rate": 4.813884301959635e-05, + "loss": 0.4438, + "step": 3708 + }, + { + "epoch": 0.203622392974753, + "grad_norm": 1.8774529695510864, + "learning_rate": 4.813686008175762e-05, + "loss": 0.3533, + "step": 3710 + }, + { + "epoch": 0.20373216245883644, + "grad_norm": 1.9272782802581787, + "learning_rate": 4.813487612902264e-05, + "loss": 0.4657, + "step": 3712 + }, + { + "epoch": 0.20384193194291986, + "grad_norm": 3.1594297885894775, + "learning_rate": 4.8132891161478446e-05, + "loss": 0.4394, + "step": 3714 + }, + { + "epoch": 0.2039517014270033, + "grad_norm": 1.7088334560394287, + "learning_rate": 4.813090517921209e-05, + "loss": 0.4083, + "step": 3716 + }, + { + "epoch": 0.20406147091108673, + "grad_norm": 1.5016498565673828, + "learning_rate": 4.812891818231071e-05, + "loss": 0.4759, + "step": 3718 + }, + { + "epoch": 0.20417124039517015, + "grad_norm": 2.5058116912841797, + "learning_rate": 4.812693017086145e-05, + "loss": 0.4311, + "step": 3720 + }, + { + "epoch": 0.20428100987925357, + "grad_norm": 1.6748089790344238, + "learning_rate": 4.8124941144951525e-05, + "loss": 0.3223, + "step": 3722 + }, + { + "epoch": 0.204390779363337, + "grad_norm": 2.3235507011413574, + "learning_rate": 4.812295110466817e-05, + "loss": 0.3989, + "step": 3724 + }, + { + "epoch": 0.20450054884742042, + "grad_norm": 2.36348557472229, + "learning_rate": 4.81209600500987e-05, + "loss": 0.3208, + "step": 3726 + }, + { + "epoch": 0.20461031833150384, + "grad_norm": 2.820863962173462, + "learning_rate": 4.811896798133042e-05, + "loss": 0.3187, + "step": 3728 + }, + { + "epoch": 0.20472008781558726, + "grad_norm": 2.1075658798217773, + "learning_rate": 4.8116974898450736e-05, + "loss": 0.4755, + "step": 3730 + }, + { + "epoch": 0.20482985729967068, + "grad_norm": 2.490832567214966, + "learning_rate": 4.811498080154707e-05, + "loss": 0.4737, + "step": 3732 + }, + { + "epoch": 0.2049396267837541, + "grad_norm": 2.6346843242645264, + "learning_rate": 4.811298569070689e-05, + "loss": 0.3948, + "step": 3734 + }, + { + "epoch": 0.20504939626783755, + "grad_norm": 1.4356262683868408, + "learning_rate": 4.8110989566017716e-05, + "loss": 0.4507, + "step": 3736 + }, + { + "epoch": 0.20515916575192097, + "grad_norm": 1.8413069248199463, + "learning_rate": 4.81089924275671e-05, + "loss": 0.4311, + "step": 3738 + }, + { + "epoch": 0.2052689352360044, + "grad_norm": 1.6541643142700195, + "learning_rate": 4.810699427544265e-05, + "loss": 0.3863, + "step": 3740 + }, + { + "epoch": 0.20537870472008782, + "grad_norm": 2.6789865493774414, + "learning_rate": 4.810499510973202e-05, + "loss": 0.3426, + "step": 3742 + }, + { + "epoch": 0.20548847420417124, + "grad_norm": 2.0632622241973877, + "learning_rate": 4.810299493052289e-05, + "loss": 0.3555, + "step": 3744 + }, + { + "epoch": 0.20559824368825466, + "grad_norm": 1.7498737573623657, + "learning_rate": 4.810099373790302e-05, + "loss": 0.402, + "step": 3746 + }, + { + "epoch": 0.20570801317233808, + "grad_norm": 2.1032161712646484, + "learning_rate": 4.809899153196017e-05, + "loss": 0.4475, + "step": 3748 + }, + { + "epoch": 0.2058177826564215, + "grad_norm": 1.3025320768356323, + "learning_rate": 4.8096988312782174e-05, + "loss": 0.3811, + "step": 3750 + }, + { + "epoch": 0.20592755214050493, + "grad_norm": 2.0664937496185303, + "learning_rate": 4.8094984080456904e-05, + "loss": 0.3649, + "step": 3752 + }, + { + "epoch": 0.20603732162458838, + "grad_norm": 1.606792688369751, + "learning_rate": 4.8092978835072274e-05, + "loss": 0.4454, + "step": 3754 + }, + { + "epoch": 0.2061470911086718, + "grad_norm": 1.7955405712127686, + "learning_rate": 4.809097257671625e-05, + "loss": 0.4171, + "step": 3756 + }, + { + "epoch": 0.20625686059275522, + "grad_norm": 3.0128273963928223, + "learning_rate": 4.808896530547683e-05, + "loss": 0.4184, + "step": 3758 + }, + { + "epoch": 0.20636663007683864, + "grad_norm": 2.522442102432251, + "learning_rate": 4.808695702144206e-05, + "loss": 0.3803, + "step": 3760 + }, + { + "epoch": 0.20647639956092206, + "grad_norm": 2.440786838531494, + "learning_rate": 4.8084947724700044e-05, + "loss": 0.4434, + "step": 3762 + }, + { + "epoch": 0.20658616904500549, + "grad_norm": 2.3531742095947266, + "learning_rate": 4.808293741533891e-05, + "loss": 0.3838, + "step": 3764 + }, + { + "epoch": 0.2066959385290889, + "grad_norm": 1.3632075786590576, + "learning_rate": 4.808092609344684e-05, + "loss": 0.3903, + "step": 3766 + }, + { + "epoch": 0.20680570801317233, + "grad_norm": 2.584995746612549, + "learning_rate": 4.8078913759112066e-05, + "loss": 0.2946, + "step": 3768 + }, + { + "epoch": 0.20691547749725575, + "grad_norm": 1.3086150884628296, + "learning_rate": 4.8076900412422856e-05, + "loss": 0.2933, + "step": 3770 + }, + { + "epoch": 0.2070252469813392, + "grad_norm": 2.2789454460144043, + "learning_rate": 4.807488605346753e-05, + "loss": 0.4337, + "step": 3772 + }, + { + "epoch": 0.20713501646542262, + "grad_norm": 2.707874059677124, + "learning_rate": 4.807287068233444e-05, + "loss": 0.5223, + "step": 3774 + }, + { + "epoch": 0.20724478594950604, + "grad_norm": 1.2104742527008057, + "learning_rate": 4.8070854299111994e-05, + "loss": 0.3069, + "step": 3776 + }, + { + "epoch": 0.20735455543358947, + "grad_norm": 1.7294856309890747, + "learning_rate": 4.806883690388864e-05, + "loss": 0.3458, + "step": 3778 + }, + { + "epoch": 0.2074643249176729, + "grad_norm": 1.784811019897461, + "learning_rate": 4.8066818496752875e-05, + "loss": 0.4811, + "step": 3780 + }, + { + "epoch": 0.2075740944017563, + "grad_norm": 1.5444071292877197, + "learning_rate": 4.8064799077793225e-05, + "loss": 0.376, + "step": 3782 + }, + { + "epoch": 0.20768386388583973, + "grad_norm": 3.3225960731506348, + "learning_rate": 4.8062778647098284e-05, + "loss": 0.4611, + "step": 3784 + }, + { + "epoch": 0.20779363336992315, + "grad_norm": 2.21529221534729, + "learning_rate": 4.806075720475667e-05, + "loss": 0.388, + "step": 3786 + }, + { + "epoch": 0.20790340285400657, + "grad_norm": 1.4696484804153442, + "learning_rate": 4.805873475085706e-05, + "loss": 0.313, + "step": 3788 + }, + { + "epoch": 0.20801317233809002, + "grad_norm": 1.4269322156906128, + "learning_rate": 4.805671128548816e-05, + "loss": 0.6155, + "step": 3790 + }, + { + "epoch": 0.20812294182217345, + "grad_norm": 1.923394799232483, + "learning_rate": 4.805468680873874e-05, + "loss": 0.4415, + "step": 3792 + }, + { + "epoch": 0.20823271130625687, + "grad_norm": 3.7704172134399414, + "learning_rate": 4.805266132069759e-05, + "loss": 0.3462, + "step": 3794 + }, + { + "epoch": 0.2083424807903403, + "grad_norm": 2.027111530303955, + "learning_rate": 4.8050634821453565e-05, + "loss": 0.3813, + "step": 3796 + }, + { + "epoch": 0.2084522502744237, + "grad_norm": 1.6994842290878296, + "learning_rate": 4.804860731109557e-05, + "loss": 0.3095, + "step": 3798 + }, + { + "epoch": 0.20856201975850713, + "grad_norm": 2.8365349769592285, + "learning_rate": 4.8046578789712515e-05, + "loss": 0.3846, + "step": 3800 + }, + { + "epoch": 0.20867178924259056, + "grad_norm": 1.171102523803711, + "learning_rate": 4.80445492573934e-05, + "loss": 0.452, + "step": 3802 + }, + { + "epoch": 0.20878155872667398, + "grad_norm": 1.35848867893219, + "learning_rate": 4.804251871422725e-05, + "loss": 0.4041, + "step": 3804 + }, + { + "epoch": 0.2088913282107574, + "grad_norm": 1.5057244300842285, + "learning_rate": 4.8040487160303126e-05, + "loss": 0.3728, + "step": 3806 + }, + { + "epoch": 0.20900109769484082, + "grad_norm": 2.4035701751708984, + "learning_rate": 4.803845459571014e-05, + "loss": 0.4243, + "step": 3808 + }, + { + "epoch": 0.20911086717892427, + "grad_norm": 1.116283655166626, + "learning_rate": 4.803642102053746e-05, + "loss": 0.2748, + "step": 3810 + }, + { + "epoch": 0.2092206366630077, + "grad_norm": 1.9219353199005127, + "learning_rate": 4.803438643487429e-05, + "loss": 0.4286, + "step": 3812 + }, + { + "epoch": 0.2093304061470911, + "grad_norm": 1.9614331722259521, + "learning_rate": 4.803235083880987e-05, + "loss": 0.4271, + "step": 3814 + }, + { + "epoch": 0.20944017563117454, + "grad_norm": 2.060427188873291, + "learning_rate": 4.803031423243349e-05, + "loss": 0.4892, + "step": 3816 + }, + { + "epoch": 0.20954994511525796, + "grad_norm": 1.7710754871368408, + "learning_rate": 4.802827661583449e-05, + "loss": 0.3767, + "step": 3818 + }, + { + "epoch": 0.20965971459934138, + "grad_norm": 2.314626693725586, + "learning_rate": 4.802623798910224e-05, + "loss": 0.665, + "step": 3820 + }, + { + "epoch": 0.2097694840834248, + "grad_norm": 1.4753789901733398, + "learning_rate": 4.802419835232618e-05, + "loss": 0.4758, + "step": 3822 + }, + { + "epoch": 0.20987925356750822, + "grad_norm": 1.702632188796997, + "learning_rate": 4.802215770559577e-05, + "loss": 0.342, + "step": 3824 + }, + { + "epoch": 0.20998902305159164, + "grad_norm": 2.333897829055786, + "learning_rate": 4.802011604900053e-05, + "loss": 0.3829, + "step": 3826 + }, + { + "epoch": 0.2100987925356751, + "grad_norm": 1.586760401725769, + "learning_rate": 4.801807338263e-05, + "loss": 0.278, + "step": 3828 + }, + { + "epoch": 0.21020856201975852, + "grad_norm": 1.6597449779510498, + "learning_rate": 4.801602970657379e-05, + "loss": 0.393, + "step": 3830 + }, + { + "epoch": 0.21031833150384194, + "grad_norm": 2.5260119438171387, + "learning_rate": 4.801398502092156e-05, + "loss": 0.3541, + "step": 3832 + }, + { + "epoch": 0.21042810098792536, + "grad_norm": 3.59124755859375, + "learning_rate": 4.801193932576299e-05, + "loss": 0.4364, + "step": 3834 + }, + { + "epoch": 0.21053787047200878, + "grad_norm": 1.408016562461853, + "learning_rate": 4.80098926211878e-05, + "loss": 0.2563, + "step": 3836 + }, + { + "epoch": 0.2106476399560922, + "grad_norm": 1.9735534191131592, + "learning_rate": 4.800784490728578e-05, + "loss": 0.2826, + "step": 3838 + }, + { + "epoch": 0.21075740944017562, + "grad_norm": 19.963489532470703, + "learning_rate": 4.800579618414676e-05, + "loss": 0.36, + "step": 3840 + }, + { + "epoch": 0.21086717892425905, + "grad_norm": 5.782192707061768, + "learning_rate": 4.8003746451860597e-05, + "loss": 0.504, + "step": 3842 + }, + { + "epoch": 0.21097694840834247, + "grad_norm": 2.6737377643585205, + "learning_rate": 4.800169571051721e-05, + "loss": 0.4592, + "step": 3844 + }, + { + "epoch": 0.21108671789242592, + "grad_norm": 1.0823606252670288, + "learning_rate": 4.799964396020654e-05, + "loss": 0.3168, + "step": 3846 + }, + { + "epoch": 0.21119648737650934, + "grad_norm": 3.2044732570648193, + "learning_rate": 4.799759120101861e-05, + "loss": 0.309, + "step": 3848 + }, + { + "epoch": 0.21130625686059276, + "grad_norm": 1.7933378219604492, + "learning_rate": 4.7995537433043446e-05, + "loss": 0.2748, + "step": 3850 + }, + { + "epoch": 0.21141602634467618, + "grad_norm": 2.0902388095855713, + "learning_rate": 4.7993482656371135e-05, + "loss": 0.4149, + "step": 3852 + }, + { + "epoch": 0.2115257958287596, + "grad_norm": 2.554020643234253, + "learning_rate": 4.799142687109183e-05, + "loss": 0.5548, + "step": 3854 + }, + { + "epoch": 0.21163556531284303, + "grad_norm": 2.739316701889038, + "learning_rate": 4.798937007729568e-05, + "loss": 0.4685, + "step": 3856 + }, + { + "epoch": 0.21174533479692645, + "grad_norm": 1.5771307945251465, + "learning_rate": 4.7987312275072926e-05, + "loss": 0.512, + "step": 3858 + }, + { + "epoch": 0.21185510428100987, + "grad_norm": 1.8573580980300903, + "learning_rate": 4.7985253464513825e-05, + "loss": 0.31, + "step": 3860 + }, + { + "epoch": 0.2119648737650933, + "grad_norm": 2.09100341796875, + "learning_rate": 4.798319364570869e-05, + "loss": 0.3873, + "step": 3862 + }, + { + "epoch": 0.21207464324917674, + "grad_norm": 1.3432930707931519, + "learning_rate": 4.7981132818747876e-05, + "loss": 0.564, + "step": 3864 + }, + { + "epoch": 0.21218441273326016, + "grad_norm": 1.9967769384384155, + "learning_rate": 4.797907098372177e-05, + "loss": 0.3831, + "step": 3866 + }, + { + "epoch": 0.21229418221734359, + "grad_norm": 2.2984330654144287, + "learning_rate": 4.797700814072083e-05, + "loss": 0.3723, + "step": 3868 + }, + { + "epoch": 0.212403951701427, + "grad_norm": 1.9177026748657227, + "learning_rate": 4.797494428983553e-05, + "loss": 0.2575, + "step": 3870 + }, + { + "epoch": 0.21251372118551043, + "grad_norm": 2.3893096446990967, + "learning_rate": 4.797287943115641e-05, + "loss": 0.4549, + "step": 3872 + }, + { + "epoch": 0.21262349066959385, + "grad_norm": 5.052048683166504, + "learning_rate": 4.7970813564774044e-05, + "loss": 0.3422, + "step": 3874 + }, + { + "epoch": 0.21273326015367727, + "grad_norm": 1.8911099433898926, + "learning_rate": 4.7968746690779044e-05, + "loss": 0.4868, + "step": 3876 + }, + { + "epoch": 0.2128430296377607, + "grad_norm": 2.3677427768707275, + "learning_rate": 4.796667880926208e-05, + "loss": 0.4504, + "step": 3878 + }, + { + "epoch": 0.21295279912184412, + "grad_norm": 5.085298538208008, + "learning_rate": 4.796460992031385e-05, + "loss": 0.5884, + "step": 3880 + }, + { + "epoch": 0.21306256860592757, + "grad_norm": 2.8999130725860596, + "learning_rate": 4.796254002402512e-05, + "loss": 0.3625, + "step": 3882 + }, + { + "epoch": 0.213172338090011, + "grad_norm": 1.3241032361984253, + "learning_rate": 4.7960469120486674e-05, + "loss": 0.292, + "step": 3884 + }, + { + "epoch": 0.2132821075740944, + "grad_norm": 1.6120060682296753, + "learning_rate": 4.795839720978935e-05, + "loss": 0.3575, + "step": 3886 + }, + { + "epoch": 0.21339187705817783, + "grad_norm": 2.2628676891326904, + "learning_rate": 4.795632429202405e-05, + "loss": 0.4175, + "step": 3888 + }, + { + "epoch": 0.21350164654226125, + "grad_norm": 1.7551523447036743, + "learning_rate": 4.795425036728168e-05, + "loss": 0.3646, + "step": 3890 + }, + { + "epoch": 0.21361141602634467, + "grad_norm": 3.193352699279785, + "learning_rate": 4.7952175435653226e-05, + "loss": 0.4669, + "step": 3892 + }, + { + "epoch": 0.2137211855104281, + "grad_norm": 2.6284165382385254, + "learning_rate": 4.79500994972297e-05, + "loss": 0.4588, + "step": 3894 + }, + { + "epoch": 0.21383095499451152, + "grad_norm": 2.196856737136841, + "learning_rate": 4.794802255210217e-05, + "loss": 0.3463, + "step": 3896 + }, + { + "epoch": 0.21394072447859494, + "grad_norm": 1.7800096273422241, + "learning_rate": 4.7945944600361725e-05, + "loss": 0.4586, + "step": 3898 + }, + { + "epoch": 0.21405049396267836, + "grad_norm": 2.5874698162078857, + "learning_rate": 4.794386564209953e-05, + "loss": 0.5314, + "step": 3900 + }, + { + "epoch": 0.2141602634467618, + "grad_norm": 2.142383098602295, + "learning_rate": 4.7941785677406774e-05, + "loss": 0.4438, + "step": 3902 + }, + { + "epoch": 0.21427003293084523, + "grad_norm": 1.9092521667480469, + "learning_rate": 4.793970470637469e-05, + "loss": 0.272, + "step": 3904 + }, + { + "epoch": 0.21437980241492866, + "grad_norm": 1.7980530261993408, + "learning_rate": 4.793762272909457e-05, + "loss": 0.4067, + "step": 3906 + }, + { + "epoch": 0.21448957189901208, + "grad_norm": 4.077652931213379, + "learning_rate": 4.793553974565773e-05, + "loss": 0.3709, + "step": 3908 + }, + { + "epoch": 0.2145993413830955, + "grad_norm": 2.087402582168579, + "learning_rate": 4.7933455756155536e-05, + "loss": 0.4408, + "step": 3910 + }, + { + "epoch": 0.21470911086717892, + "grad_norm": 1.5463011264801025, + "learning_rate": 4.793137076067942e-05, + "loss": 0.422, + "step": 3912 + }, + { + "epoch": 0.21481888035126234, + "grad_norm": 1.7472026348114014, + "learning_rate": 4.7929284759320814e-05, + "loss": 0.4186, + "step": 3914 + }, + { + "epoch": 0.21492864983534576, + "grad_norm": 2.933457612991333, + "learning_rate": 4.792719775217124e-05, + "loss": 0.4377, + "step": 3916 + }, + { + "epoch": 0.21503841931942919, + "grad_norm": 2.0735394954681396, + "learning_rate": 4.792510973932225e-05, + "loss": 0.3287, + "step": 3918 + }, + { + "epoch": 0.21514818880351264, + "grad_norm": 1.8683826923370361, + "learning_rate": 4.7923020720865414e-05, + "loss": 0.2518, + "step": 3920 + }, + { + "epoch": 0.21525795828759606, + "grad_norm": 1.7409896850585938, + "learning_rate": 4.792093069689237e-05, + "loss": 0.3252, + "step": 3922 + }, + { + "epoch": 0.21536772777167948, + "grad_norm": 1.6012794971466064, + "learning_rate": 4.791883966749482e-05, + "loss": 0.4045, + "step": 3924 + }, + { + "epoch": 0.2154774972557629, + "grad_norm": 1.5863546133041382, + "learning_rate": 4.791674763276446e-05, + "loss": 0.3089, + "step": 3926 + }, + { + "epoch": 0.21558726673984632, + "grad_norm": 2.0202274322509766, + "learning_rate": 4.7914654592793065e-05, + "loss": 0.5092, + "step": 3928 + }, + { + "epoch": 0.21569703622392974, + "grad_norm": 3.4373133182525635, + "learning_rate": 4.791256054767245e-05, + "loss": 0.5044, + "step": 3930 + }, + { + "epoch": 0.21580680570801317, + "grad_norm": 1.785767912864685, + "learning_rate": 4.7910465497494474e-05, + "loss": 0.3561, + "step": 3932 + }, + { + "epoch": 0.2159165751920966, + "grad_norm": 1.9329453706741333, + "learning_rate": 4.790836944235102e-05, + "loss": 0.3983, + "step": 3934 + }, + { + "epoch": 0.21602634467618, + "grad_norm": 1.3855924606323242, + "learning_rate": 4.790627238233405e-05, + "loss": 0.3154, + "step": 3936 + }, + { + "epoch": 0.21613611416026346, + "grad_norm": 4.303253650665283, + "learning_rate": 4.790417431753553e-05, + "loss": 0.3688, + "step": 3938 + }, + { + "epoch": 0.21624588364434688, + "grad_norm": 2.1491472721099854, + "learning_rate": 4.7902075248047515e-05, + "loss": 0.36, + "step": 3940 + }, + { + "epoch": 0.2163556531284303, + "grad_norm": 2.081239938735962, + "learning_rate": 4.789997517396207e-05, + "loss": 0.45, + "step": 3942 + }, + { + "epoch": 0.21646542261251372, + "grad_norm": 2.7920095920562744, + "learning_rate": 4.789787409537131e-05, + "loss": 0.4153, + "step": 3944 + }, + { + "epoch": 0.21657519209659715, + "grad_norm": 1.4285731315612793, + "learning_rate": 4.7895772012367406e-05, + "loss": 0.3397, + "step": 3946 + }, + { + "epoch": 0.21668496158068057, + "grad_norm": 2.284701347351074, + "learning_rate": 4.7893668925042565e-05, + "loss": 0.391, + "step": 3948 + }, + { + "epoch": 0.216794731064764, + "grad_norm": 1.86039137840271, + "learning_rate": 4.7891564833489035e-05, + "loss": 0.3894, + "step": 3950 + }, + { + "epoch": 0.2169045005488474, + "grad_norm": 2.5603251457214355, + "learning_rate": 4.78894597377991e-05, + "loss": 0.3286, + "step": 3952 + }, + { + "epoch": 0.21701427003293083, + "grad_norm": 1.6881811618804932, + "learning_rate": 4.7887353638065125e-05, + "loss": 0.3793, + "step": 3954 + }, + { + "epoch": 0.21712403951701428, + "grad_norm": 1.6470414400100708, + "learning_rate": 4.788524653437948e-05, + "loss": 0.2897, + "step": 3956 + }, + { + "epoch": 0.2172338090010977, + "grad_norm": 1.9241209030151367, + "learning_rate": 4.788313842683459e-05, + "loss": 0.2713, + "step": 3958 + }, + { + "epoch": 0.21734357848518113, + "grad_norm": 2.5899369716644287, + "learning_rate": 4.788102931552294e-05, + "loss": 0.4792, + "step": 3960 + }, + { + "epoch": 0.21745334796926455, + "grad_norm": 2.3465380668640137, + "learning_rate": 4.7878919200537034e-05, + "loss": 0.4169, + "step": 3962 + }, + { + "epoch": 0.21756311745334797, + "grad_norm": 1.6018109321594238, + "learning_rate": 4.7876808081969436e-05, + "loss": 0.3928, + "step": 3964 + }, + { + "epoch": 0.2176728869374314, + "grad_norm": 2.3752284049987793, + "learning_rate": 4.787469595991275e-05, + "loss": 0.4564, + "step": 3966 + }, + { + "epoch": 0.21778265642151481, + "grad_norm": 1.4495984315872192, + "learning_rate": 4.787258283445962e-05, + "loss": 0.4359, + "step": 3968 + }, + { + "epoch": 0.21789242590559824, + "grad_norm": 2.399146318435669, + "learning_rate": 4.787046870570274e-05, + "loss": 0.3971, + "step": 3970 + }, + { + "epoch": 0.21800219538968166, + "grad_norm": 1.8427530527114868, + "learning_rate": 4.786835357373486e-05, + "loss": 0.3175, + "step": 3972 + }, + { + "epoch": 0.21811196487376508, + "grad_norm": 1.8244068622589111, + "learning_rate": 4.786623743864873e-05, + "loss": 0.4481, + "step": 3974 + }, + { + "epoch": 0.21822173435784853, + "grad_norm": 2.5982697010040283, + "learning_rate": 4.7864120300537206e-05, + "loss": 0.3416, + "step": 3976 + }, + { + "epoch": 0.21833150384193195, + "grad_norm": 2.14022159576416, + "learning_rate": 4.7862002159493135e-05, + "loss": 0.3613, + "step": 3978 + }, + { + "epoch": 0.21844127332601537, + "grad_norm": 1.3728787899017334, + "learning_rate": 4.785988301560944e-05, + "loss": 0.2958, + "step": 3980 + }, + { + "epoch": 0.2185510428100988, + "grad_norm": 2.334949254989624, + "learning_rate": 4.785776286897907e-05, + "loss": 0.357, + "step": 3982 + }, + { + "epoch": 0.21866081229418222, + "grad_norm": 1.8699477910995483, + "learning_rate": 4.7855641719695023e-05, + "loss": 0.3472, + "step": 3984 + }, + { + "epoch": 0.21877058177826564, + "grad_norm": 2.376932144165039, + "learning_rate": 4.7853519567850356e-05, + "loss": 0.4471, + "step": 3986 + }, + { + "epoch": 0.21888035126234906, + "grad_norm": 1.6721129417419434, + "learning_rate": 4.785139641353815e-05, + "loss": 0.3526, + "step": 3988 + }, + { + "epoch": 0.21899012074643248, + "grad_norm": 1.086011528968811, + "learning_rate": 4.784927225685153e-05, + "loss": 0.2787, + "step": 3990 + }, + { + "epoch": 0.2190998902305159, + "grad_norm": 1.6585849523544312, + "learning_rate": 4.784714709788368e-05, + "loss": 0.5352, + "step": 3992 + }, + { + "epoch": 0.21920965971459935, + "grad_norm": 2.6000874042510986, + "learning_rate": 4.784502093672782e-05, + "loss": 0.4412, + "step": 3994 + }, + { + "epoch": 0.21931942919868277, + "grad_norm": 2.6029272079467773, + "learning_rate": 4.784289377347721e-05, + "loss": 0.4663, + "step": 3996 + }, + { + "epoch": 0.2194291986827662, + "grad_norm": 1.5476576089859009, + "learning_rate": 4.784076560822516e-05, + "loss": 0.5119, + "step": 3998 + }, + { + "epoch": 0.21953896816684962, + "grad_norm": 1.966074824333191, + "learning_rate": 4.783863644106502e-05, + "loss": 0.355, + "step": 4000 + }, + { + "epoch": 0.21964873765093304, + "grad_norm": 1.939609169960022, + "learning_rate": 4.783650627209019e-05, + "loss": 0.348, + "step": 4002 + }, + { + "epoch": 0.21975850713501646, + "grad_norm": 1.3605542182922363, + "learning_rate": 4.783437510139411e-05, + "loss": 0.3031, + "step": 4004 + }, + { + "epoch": 0.21986827661909988, + "grad_norm": 2.7810864448547363, + "learning_rate": 4.783224292907025e-05, + "loss": 0.3714, + "step": 4006 + }, + { + "epoch": 0.2199780461031833, + "grad_norm": 1.4949496984481812, + "learning_rate": 4.783010975521216e-05, + "loss": 0.4407, + "step": 4008 + }, + { + "epoch": 0.22008781558726673, + "grad_norm": 2.7977726459503174, + "learning_rate": 4.782797557991339e-05, + "loss": 0.3809, + "step": 4010 + }, + { + "epoch": 0.22019758507135018, + "grad_norm": 1.828149437904358, + "learning_rate": 4.782584040326757e-05, + "loss": 0.432, + "step": 4012 + }, + { + "epoch": 0.2203073545554336, + "grad_norm": 1.9441677331924438, + "learning_rate": 4.782370422536835e-05, + "loss": 0.3544, + "step": 4014 + }, + { + "epoch": 0.22041712403951702, + "grad_norm": 1.6075091361999512, + "learning_rate": 4.782156704630944e-05, + "loss": 0.3356, + "step": 4016 + }, + { + "epoch": 0.22052689352360044, + "grad_norm": 2.6955912113189697, + "learning_rate": 4.781942886618459e-05, + "loss": 0.5356, + "step": 4018 + }, + { + "epoch": 0.22063666300768386, + "grad_norm": 2.2693288326263428, + "learning_rate": 4.7817289685087577e-05, + "loss": 0.3197, + "step": 4020 + }, + { + "epoch": 0.22074643249176729, + "grad_norm": 3.23866605758667, + "learning_rate": 4.7815149503112244e-05, + "loss": 0.4685, + "step": 4022 + }, + { + "epoch": 0.2208562019758507, + "grad_norm": 1.3522919416427612, + "learning_rate": 4.781300832035247e-05, + "loss": 0.2655, + "step": 4024 + }, + { + "epoch": 0.22096597145993413, + "grad_norm": 4.513492107391357, + "learning_rate": 4.781086613690218e-05, + "loss": 0.3732, + "step": 4026 + }, + { + "epoch": 0.22107574094401755, + "grad_norm": 1.9705297946929932, + "learning_rate": 4.7808722952855344e-05, + "loss": 0.4141, + "step": 4028 + }, + { + "epoch": 0.221185510428101, + "grad_norm": 4.527685165405273, + "learning_rate": 4.780657876830597e-05, + "loss": 0.4104, + "step": 4030 + }, + { + "epoch": 0.22129527991218442, + "grad_norm": 4.537426948547363, + "learning_rate": 4.78044335833481e-05, + "loss": 0.5031, + "step": 4032 + }, + { + "epoch": 0.22140504939626784, + "grad_norm": 2.2985329627990723, + "learning_rate": 4.780228739807584e-05, + "loss": 0.5262, + "step": 4034 + }, + { + "epoch": 0.22151481888035127, + "grad_norm": 1.571162462234497, + "learning_rate": 4.780014021258334e-05, + "loss": 0.2677, + "step": 4036 + }, + { + "epoch": 0.2216245883644347, + "grad_norm": 1.5710617303848267, + "learning_rate": 4.779799202696479e-05, + "loss": 0.3295, + "step": 4038 + }, + { + "epoch": 0.2217343578485181, + "grad_norm": 1.3701198101043701, + "learning_rate": 4.77958428413144e-05, + "loss": 0.3328, + "step": 4040 + }, + { + "epoch": 0.22184412733260153, + "grad_norm": 5.833156108856201, + "learning_rate": 4.779369265572645e-05, + "loss": 0.5627, + "step": 4042 + }, + { + "epoch": 0.22195389681668495, + "grad_norm": 2.4641380310058594, + "learning_rate": 4.779154147029527e-05, + "loss": 0.3831, + "step": 4044 + }, + { + "epoch": 0.22206366630076838, + "grad_norm": 1.4880268573760986, + "learning_rate": 4.778938928511522e-05, + "loss": 0.309, + "step": 4046 + }, + { + "epoch": 0.22217343578485182, + "grad_norm": 4.7036824226379395, + "learning_rate": 4.7787236100280685e-05, + "loss": 0.3814, + "step": 4048 + }, + { + "epoch": 0.22228320526893525, + "grad_norm": 2.285555362701416, + "learning_rate": 4.7785081915886134e-05, + "loss": 0.4063, + "step": 4050 + }, + { + "epoch": 0.22239297475301867, + "grad_norm": 1.418087363243103, + "learning_rate": 4.778292673202606e-05, + "loss": 0.4187, + "step": 4052 + }, + { + "epoch": 0.2225027442371021, + "grad_norm": 1.0518293380737305, + "learning_rate": 4.7780770548794984e-05, + "loss": 0.2783, + "step": 4054 + }, + { + "epoch": 0.2226125137211855, + "grad_norm": 3.1337034702301025, + "learning_rate": 4.7778613366287505e-05, + "loss": 0.4749, + "step": 4056 + }, + { + "epoch": 0.22272228320526893, + "grad_norm": 1.8895395994186401, + "learning_rate": 4.7776455184598236e-05, + "loss": 0.2279, + "step": 4058 + }, + { + "epoch": 0.22283205268935236, + "grad_norm": 1.1558701992034912, + "learning_rate": 4.777429600382185e-05, + "loss": 0.344, + "step": 4060 + }, + { + "epoch": 0.22294182217343578, + "grad_norm": 2.14320969581604, + "learning_rate": 4.777213582405306e-05, + "loss": 0.4516, + "step": 4062 + }, + { + "epoch": 0.2230515916575192, + "grad_norm": 1.581064224243164, + "learning_rate": 4.776997464538662e-05, + "loss": 0.3869, + "step": 4064 + }, + { + "epoch": 0.22316136114160262, + "grad_norm": 1.5755053758621216, + "learning_rate": 4.776781246791733e-05, + "loss": 0.4525, + "step": 4066 + }, + { + "epoch": 0.22327113062568607, + "grad_norm": 2.412735939025879, + "learning_rate": 4.776564929174003e-05, + "loss": 0.2801, + "step": 4068 + }, + { + "epoch": 0.2233809001097695, + "grad_norm": 4.126567840576172, + "learning_rate": 4.776348511694961e-05, + "loss": 0.3705, + "step": 4070 + }, + { + "epoch": 0.22349066959385291, + "grad_norm": 1.6538273096084595, + "learning_rate": 4.776131994364102e-05, + "loss": 0.5479, + "step": 4072 + }, + { + "epoch": 0.22360043907793634, + "grad_norm": 1.4322946071624756, + "learning_rate": 4.77591537719092e-05, + "loss": 0.3071, + "step": 4074 + }, + { + "epoch": 0.22371020856201976, + "grad_norm": 2.3544397354125977, + "learning_rate": 4.775698660184919e-05, + "loss": 0.4946, + "step": 4076 + }, + { + "epoch": 0.22381997804610318, + "grad_norm": 2.0867416858673096, + "learning_rate": 4.775481843355606e-05, + "loss": 0.3646, + "step": 4078 + }, + { + "epoch": 0.2239297475301866, + "grad_norm": 2.2534782886505127, + "learning_rate": 4.775264926712489e-05, + "loss": 0.429, + "step": 4080 + }, + { + "epoch": 0.22403951701427002, + "grad_norm": 1.9833158254623413, + "learning_rate": 4.775047910265086e-05, + "loss": 0.4149, + "step": 4082 + }, + { + "epoch": 0.22414928649835344, + "grad_norm": 1.6948546171188354, + "learning_rate": 4.774830794022915e-05, + "loss": 0.5237, + "step": 4084 + }, + { + "epoch": 0.2242590559824369, + "grad_norm": 2.2782955169677734, + "learning_rate": 4.7746135779954995e-05, + "loss": 0.3239, + "step": 4086 + }, + { + "epoch": 0.22436882546652032, + "grad_norm": 1.8417588472366333, + "learning_rate": 4.7743962621923674e-05, + "loss": 0.3208, + "step": 4088 + }, + { + "epoch": 0.22447859495060374, + "grad_norm": 2.2538936138153076, + "learning_rate": 4.774178846623053e-05, + "loss": 0.4123, + "step": 4090 + }, + { + "epoch": 0.22458836443468716, + "grad_norm": 1.4287444353103638, + "learning_rate": 4.773961331297092e-05, + "loss": 0.3596, + "step": 4092 + }, + { + "epoch": 0.22469813391877058, + "grad_norm": 2.2594645023345947, + "learning_rate": 4.773743716224025e-05, + "loss": 0.3484, + "step": 4094 + }, + { + "epoch": 0.224807903402854, + "grad_norm": 1.5188583135604858, + "learning_rate": 4.7735260014133986e-05, + "loss": 0.3955, + "step": 4096 + }, + { + "epoch": 0.22491767288693743, + "grad_norm": 1.9417874813079834, + "learning_rate": 4.7733081868747626e-05, + "loss": 0.4446, + "step": 4098 + }, + { + "epoch": 0.22502744237102085, + "grad_norm": 1.6921424865722656, + "learning_rate": 4.773090272617672e-05, + "loss": 0.5171, + "step": 4100 + }, + { + "epoch": 0.22513721185510427, + "grad_norm": 1.8400663137435913, + "learning_rate": 4.772872258651684e-05, + "loss": 0.5714, + "step": 4102 + }, + { + "epoch": 0.22524698133918772, + "grad_norm": 1.6464987993240356, + "learning_rate": 4.772654144986364e-05, + "loss": 0.3956, + "step": 4104 + }, + { + "epoch": 0.22535675082327114, + "grad_norm": 2.679349422454834, + "learning_rate": 4.772435931631278e-05, + "loss": 0.3868, + "step": 4106 + }, + { + "epoch": 0.22546652030735456, + "grad_norm": 2.001236915588379, + "learning_rate": 4.7722176185959974e-05, + "loss": 0.324, + "step": 4108 + }, + { + "epoch": 0.22557628979143798, + "grad_norm": 2.3367698192596436, + "learning_rate": 4.7719992058901006e-05, + "loss": 0.331, + "step": 4110 + }, + { + "epoch": 0.2256860592755214, + "grad_norm": 2.0561118125915527, + "learning_rate": 4.7717806935231665e-05, + "loss": 0.3152, + "step": 4112 + }, + { + "epoch": 0.22579582875960483, + "grad_norm": 2.3733482360839844, + "learning_rate": 4.77156208150478e-05, + "loss": 0.4983, + "step": 4114 + }, + { + "epoch": 0.22590559824368825, + "grad_norm": 1.3838893175125122, + "learning_rate": 4.771343369844532e-05, + "loss": 0.4045, + "step": 4116 + }, + { + "epoch": 0.22601536772777167, + "grad_norm": 4.320215702056885, + "learning_rate": 4.771124558552015e-05, + "loss": 0.3894, + "step": 4118 + }, + { + "epoch": 0.2261251372118551, + "grad_norm": 2.358649730682373, + "learning_rate": 4.770905647636828e-05, + "loss": 0.4491, + "step": 4120 + }, + { + "epoch": 0.22623490669593854, + "grad_norm": 1.3792657852172852, + "learning_rate": 4.7706866371085726e-05, + "loss": 0.3783, + "step": 4122 + }, + { + "epoch": 0.22634467618002196, + "grad_norm": 1.6789357662200928, + "learning_rate": 4.7704675269768565e-05, + "loss": 0.2903, + "step": 4124 + }, + { + "epoch": 0.22645444566410539, + "grad_norm": 1.3651833534240723, + "learning_rate": 4.77024831725129e-05, + "loss": 0.2474, + "step": 4126 + }, + { + "epoch": 0.2265642151481888, + "grad_norm": 3.135831594467163, + "learning_rate": 4.7700290079414896e-05, + "loss": 0.3687, + "step": 4128 + }, + { + "epoch": 0.22667398463227223, + "grad_norm": 3.4429593086242676, + "learning_rate": 4.769809599057075e-05, + "loss": 0.4652, + "step": 4130 + }, + { + "epoch": 0.22678375411635565, + "grad_norm": 1.8342524766921997, + "learning_rate": 4.76959009060767e-05, + "loss": 0.4016, + "step": 4132 + }, + { + "epoch": 0.22689352360043907, + "grad_norm": 1.8908830881118774, + "learning_rate": 4.769370482602904e-05, + "loss": 0.4879, + "step": 4134 + }, + { + "epoch": 0.2270032930845225, + "grad_norm": 1.8179155588150024, + "learning_rate": 4.769150775052411e-05, + "loss": 0.2367, + "step": 4136 + }, + { + "epoch": 0.22711306256860592, + "grad_norm": 3.057727098464966, + "learning_rate": 4.7689309679658257e-05, + "loss": 0.4046, + "step": 4138 + }, + { + "epoch": 0.22722283205268934, + "grad_norm": 2.4411890506744385, + "learning_rate": 4.7687110613527926e-05, + "loss": 0.3497, + "step": 4140 + }, + { + "epoch": 0.2273326015367728, + "grad_norm": 1.9157335758209229, + "learning_rate": 4.768491055222957e-05, + "loss": 0.2644, + "step": 4142 + }, + { + "epoch": 0.2274423710208562, + "grad_norm": 2.253709077835083, + "learning_rate": 4.768270949585968e-05, + "loss": 0.3871, + "step": 4144 + }, + { + "epoch": 0.22755214050493963, + "grad_norm": 1.6458079814910889, + "learning_rate": 4.768050744451483e-05, + "loss": 0.437, + "step": 4146 + }, + { + "epoch": 0.22766190998902305, + "grad_norm": 2.5520501136779785, + "learning_rate": 4.76783043982916e-05, + "loss": 0.5435, + "step": 4148 + }, + { + "epoch": 0.22777167947310648, + "grad_norm": 1.8270812034606934, + "learning_rate": 4.7676100357286624e-05, + "loss": 0.413, + "step": 4150 + }, + { + "epoch": 0.2278814489571899, + "grad_norm": 2.6614770889282227, + "learning_rate": 4.767389532159659e-05, + "loss": 0.3896, + "step": 4152 + }, + { + "epoch": 0.22799121844127332, + "grad_norm": 1.6737784147262573, + "learning_rate": 4.767168929131821e-05, + "loss": 0.4098, + "step": 4154 + }, + { + "epoch": 0.22810098792535674, + "grad_norm": 1.6383023262023926, + "learning_rate": 4.7669482266548264e-05, + "loss": 0.3638, + "step": 4156 + }, + { + "epoch": 0.22821075740944016, + "grad_norm": 1.6260226964950562, + "learning_rate": 4.766727424738356e-05, + "loss": 0.3876, + "step": 4158 + }, + { + "epoch": 0.2283205268935236, + "grad_norm": 2.4016144275665283, + "learning_rate": 4.7665065233920945e-05, + "loss": 0.5129, + "step": 4160 + }, + { + "epoch": 0.22843029637760703, + "grad_norm": 2.8564164638519287, + "learning_rate": 4.7662855226257324e-05, + "loss": 0.3189, + "step": 4162 + }, + { + "epoch": 0.22854006586169046, + "grad_norm": 2.618034601211548, + "learning_rate": 4.766064422448964e-05, + "loss": 0.3982, + "step": 4164 + }, + { + "epoch": 0.22864983534577388, + "grad_norm": 1.7198436260223389, + "learning_rate": 4.7658432228714866e-05, + "loss": 0.2662, + "step": 4166 + }, + { + "epoch": 0.2287596048298573, + "grad_norm": 2.234762191772461, + "learning_rate": 4.7656219239030046e-05, + "loss": 0.3434, + "step": 4168 + }, + { + "epoch": 0.22886937431394072, + "grad_norm": 1.5690256357192993, + "learning_rate": 4.7654005255532244e-05, + "loss": 0.3178, + "step": 4170 + }, + { + "epoch": 0.22897914379802414, + "grad_norm": 2.323016881942749, + "learning_rate": 4.765179027831858e-05, + "loss": 0.2883, + "step": 4172 + }, + { + "epoch": 0.22908891328210756, + "grad_norm": 1.8699212074279785, + "learning_rate": 4.764957430748622e-05, + "loss": 0.5148, + "step": 4174 + }, + { + "epoch": 0.229198682766191, + "grad_norm": 1.9301018714904785, + "learning_rate": 4.764735734313236e-05, + "loss": 0.5136, + "step": 4176 + }, + { + "epoch": 0.22930845225027444, + "grad_norm": 3.007248878479004, + "learning_rate": 4.764513938535424e-05, + "loss": 0.3826, + "step": 4178 + }, + { + "epoch": 0.22941822173435786, + "grad_norm": 3.2082653045654297, + "learning_rate": 4.764292043424916e-05, + "loss": 0.4079, + "step": 4180 + }, + { + "epoch": 0.22952799121844128, + "grad_norm": 1.9841172695159912, + "learning_rate": 4.7640700489914444e-05, + "loss": 0.422, + "step": 4182 + }, + { + "epoch": 0.2296377607025247, + "grad_norm": 2.3655149936676025, + "learning_rate": 4.763847955244749e-05, + "loss": 0.3367, + "step": 4184 + }, + { + "epoch": 0.22974753018660812, + "grad_norm": 3.082956075668335, + "learning_rate": 4.7636257621945704e-05, + "loss": 0.6911, + "step": 4186 + }, + { + "epoch": 0.22985729967069155, + "grad_norm": 1.588028907775879, + "learning_rate": 4.7634034698506545e-05, + "loss": 0.3559, + "step": 4188 + }, + { + "epoch": 0.22996706915477497, + "grad_norm": 1.7267999649047852, + "learning_rate": 4.7631810782227535e-05, + "loss": 0.418, + "step": 4190 + }, + { + "epoch": 0.2300768386388584, + "grad_norm": 2.290015697479248, + "learning_rate": 4.7629585873206226e-05, + "loss": 0.3626, + "step": 4192 + }, + { + "epoch": 0.2301866081229418, + "grad_norm": 1.6633563041687012, + "learning_rate": 4.76273599715402e-05, + "loss": 0.2511, + "step": 4194 + }, + { + "epoch": 0.23029637760702526, + "grad_norm": 1.4218146800994873, + "learning_rate": 4.762513307732711e-05, + "loss": 0.2773, + "step": 4196 + }, + { + "epoch": 0.23040614709110868, + "grad_norm": 2.2498321533203125, + "learning_rate": 4.762290519066464e-05, + "loss": 0.3394, + "step": 4198 + }, + { + "epoch": 0.2305159165751921, + "grad_norm": 1.4671210050582886, + "learning_rate": 4.762067631165049e-05, + "loss": 0.2636, + "step": 4200 + }, + { + "epoch": 0.23062568605927553, + "grad_norm": 1.4234280586242676, + "learning_rate": 4.7618446440382455e-05, + "loss": 0.4741, + "step": 4202 + }, + { + "epoch": 0.23073545554335895, + "grad_norm": 2.23885178565979, + "learning_rate": 4.761621557695834e-05, + "loss": 0.3673, + "step": 4204 + }, + { + "epoch": 0.23084522502744237, + "grad_norm": 1.142303228378296, + "learning_rate": 4.761398372147601e-05, + "loss": 0.2797, + "step": 4206 + }, + { + "epoch": 0.2309549945115258, + "grad_norm": 1.7091857194900513, + "learning_rate": 4.7611750874033356e-05, + "loss": 0.2451, + "step": 4208 + }, + { + "epoch": 0.2310647639956092, + "grad_norm": 1.8580242395401, + "learning_rate": 4.760951703472832e-05, + "loss": 0.3352, + "step": 4210 + }, + { + "epoch": 0.23117453347969263, + "grad_norm": 1.9839634895324707, + "learning_rate": 4.76072822036589e-05, + "loss": 0.4609, + "step": 4212 + }, + { + "epoch": 0.23128430296377608, + "grad_norm": 1.955080270767212, + "learning_rate": 4.760504638092311e-05, + "loss": 0.3697, + "step": 4214 + }, + { + "epoch": 0.2313940724478595, + "grad_norm": 2.071753978729248, + "learning_rate": 4.760280956661903e-05, + "loss": 0.5747, + "step": 4216 + }, + { + "epoch": 0.23150384193194293, + "grad_norm": 2.189081907272339, + "learning_rate": 4.760057176084479e-05, + "loss": 0.447, + "step": 4218 + }, + { + "epoch": 0.23161361141602635, + "grad_norm": 1.1173670291900635, + "learning_rate": 4.7598332963698545e-05, + "loss": 0.3512, + "step": 4220 + }, + { + "epoch": 0.23172338090010977, + "grad_norm": 2.141491413116455, + "learning_rate": 4.75960931752785e-05, + "loss": 0.3287, + "step": 4222 + }, + { + "epoch": 0.2318331503841932, + "grad_norm": 1.631437063217163, + "learning_rate": 4.759385239568289e-05, + "loss": 0.411, + "step": 4224 + }, + { + "epoch": 0.23194291986827661, + "grad_norm": 2.6236684322357178, + "learning_rate": 4.759161062501002e-05, + "loss": 0.4725, + "step": 4226 + }, + { + "epoch": 0.23205268935236004, + "grad_norm": 1.7024621963500977, + "learning_rate": 4.7589367863358225e-05, + "loss": 0.3462, + "step": 4228 + }, + { + "epoch": 0.23216245883644346, + "grad_norm": 2.057704210281372, + "learning_rate": 4.7587124110825875e-05, + "loss": 0.4206, + "step": 4230 + }, + { + "epoch": 0.23227222832052688, + "grad_norm": 1.1480622291564941, + "learning_rate": 4.7584879367511395e-05, + "loss": 0.3152, + "step": 4232 + }, + { + "epoch": 0.23238199780461033, + "grad_norm": 1.9713517427444458, + "learning_rate": 4.7582633633513266e-05, + "loss": 0.3183, + "step": 4234 + }, + { + "epoch": 0.23249176728869375, + "grad_norm": 1.6149277687072754, + "learning_rate": 4.758038690892997e-05, + "loss": 0.3509, + "step": 4236 + }, + { + "epoch": 0.23260153677277717, + "grad_norm": 2.6468122005462646, + "learning_rate": 4.7578139193860076e-05, + "loss": 0.4139, + "step": 4238 + }, + { + "epoch": 0.2327113062568606, + "grad_norm": 2.0688791275024414, + "learning_rate": 4.7575890488402185e-05, + "loss": 0.2725, + "step": 4240 + }, + { + "epoch": 0.23282107574094402, + "grad_norm": 3.1904189586639404, + "learning_rate": 4.7573640792654913e-05, + "loss": 0.4847, + "step": 4242 + }, + { + "epoch": 0.23293084522502744, + "grad_norm": 2.8937160968780518, + "learning_rate": 4.757139010671697e-05, + "loss": 0.4759, + "step": 4244 + }, + { + "epoch": 0.23304061470911086, + "grad_norm": 3.619690179824829, + "learning_rate": 4.756913843068707e-05, + "loss": 0.4769, + "step": 4246 + }, + { + "epoch": 0.23315038419319428, + "grad_norm": 2.4387266635894775, + "learning_rate": 4.756688576466398e-05, + "loss": 0.384, + "step": 4248 + }, + { + "epoch": 0.2332601536772777, + "grad_norm": 1.9038208723068237, + "learning_rate": 4.756463210874652e-05, + "loss": 0.2984, + "step": 4250 + }, + { + "epoch": 0.23336992316136115, + "grad_norm": 2.239190101623535, + "learning_rate": 4.7562377463033536e-05, + "loss": 0.2985, + "step": 4252 + }, + { + "epoch": 0.23347969264544458, + "grad_norm": 1.7674202919006348, + "learning_rate": 4.7560121827623936e-05, + "loss": 0.4636, + "step": 4254 + }, + { + "epoch": 0.233589462129528, + "grad_norm": 1.5209506750106812, + "learning_rate": 4.7557865202616656e-05, + "loss": 0.3958, + "step": 4256 + }, + { + "epoch": 0.23369923161361142, + "grad_norm": 1.5390881299972534, + "learning_rate": 4.75556075881107e-05, + "loss": 0.4403, + "step": 4258 + }, + { + "epoch": 0.23380900109769484, + "grad_norm": 1.694185495376587, + "learning_rate": 4.755334898420507e-05, + "loss": 0.3601, + "step": 4260 + }, + { + "epoch": 0.23391877058177826, + "grad_norm": 2.2951207160949707, + "learning_rate": 4.755108939099887e-05, + "loss": 0.4647, + "step": 4262 + }, + { + "epoch": 0.23402854006586168, + "grad_norm": 1.635766863822937, + "learning_rate": 4.7548828808591195e-05, + "loss": 0.4687, + "step": 4264 + }, + { + "epoch": 0.2341383095499451, + "grad_norm": 2.3402421474456787, + "learning_rate": 4.754656723708121e-05, + "loss": 0.3766, + "step": 4266 + }, + { + "epoch": 0.23424807903402853, + "grad_norm": 1.31655752658844, + "learning_rate": 4.754430467656812e-05, + "loss": 0.331, + "step": 4268 + }, + { + "epoch": 0.23435784851811198, + "grad_norm": 2.9529051780700684, + "learning_rate": 4.7542041127151184e-05, + "loss": 0.4003, + "step": 4270 + }, + { + "epoch": 0.2344676180021954, + "grad_norm": 2.210334062576294, + "learning_rate": 4.753977658892967e-05, + "loss": 0.3558, + "step": 4272 + }, + { + "epoch": 0.23457738748627882, + "grad_norm": 1.711230993270874, + "learning_rate": 4.753751106200293e-05, + "loss": 0.2501, + "step": 4274 + }, + { + "epoch": 0.23468715697036224, + "grad_norm": 1.7573057413101196, + "learning_rate": 4.7535244546470325e-05, + "loss": 0.4794, + "step": 4276 + }, + { + "epoch": 0.23479692645444566, + "grad_norm": 1.5393537282943726, + "learning_rate": 4.753297704243129e-05, + "loss": 0.251, + "step": 4278 + }, + { + "epoch": 0.2349066959385291, + "grad_norm": 2.472729206085205, + "learning_rate": 4.7530708549985287e-05, + "loss": 0.493, + "step": 4280 + }, + { + "epoch": 0.2350164654226125, + "grad_norm": 2.3834095001220703, + "learning_rate": 4.7528439069231815e-05, + "loss": 0.4327, + "step": 4282 + }, + { + "epoch": 0.23512623490669593, + "grad_norm": 1.1464766263961792, + "learning_rate": 4.7526168600270435e-05, + "loss": 0.3377, + "step": 4284 + }, + { + "epoch": 0.23523600439077935, + "grad_norm": 2.078918218612671, + "learning_rate": 4.7523897143200724e-05, + "loss": 0.3427, + "step": 4286 + }, + { + "epoch": 0.2353457738748628, + "grad_norm": 3.0025389194488525, + "learning_rate": 4.752162469812234e-05, + "loss": 0.4368, + "step": 4288 + }, + { + "epoch": 0.23545554335894622, + "grad_norm": 2.1669163703918457, + "learning_rate": 4.751935126513496e-05, + "loss": 0.3936, + "step": 4290 + }, + { + "epoch": 0.23556531284302965, + "grad_norm": 2.9495880603790283, + "learning_rate": 4.7517076844338285e-05, + "loss": 0.4401, + "step": 4292 + }, + { + "epoch": 0.23567508232711307, + "grad_norm": 2.412649154663086, + "learning_rate": 4.751480143583211e-05, + "loss": 0.4409, + "step": 4294 + }, + { + "epoch": 0.2357848518111965, + "grad_norm": 1.6544116735458374, + "learning_rate": 4.751252503971624e-05, + "loss": 0.3703, + "step": 4296 + }, + { + "epoch": 0.2358946212952799, + "grad_norm": 1.1507232189178467, + "learning_rate": 4.751024765609051e-05, + "loss": 0.4637, + "step": 4298 + }, + { + "epoch": 0.23600439077936333, + "grad_norm": 2.2043566703796387, + "learning_rate": 4.7507969285054845e-05, + "loss": 0.4702, + "step": 4300 + }, + { + "epoch": 0.23611416026344675, + "grad_norm": 2.6112916469573975, + "learning_rate": 4.750568992670916e-05, + "loss": 0.4301, + "step": 4302 + }, + { + "epoch": 0.23622392974753018, + "grad_norm": 1.7292306423187256, + "learning_rate": 4.750340958115346e-05, + "loss": 0.5817, + "step": 4304 + }, + { + "epoch": 0.2363336992316136, + "grad_norm": 1.6004762649536133, + "learning_rate": 4.7501128248487755e-05, + "loss": 0.3563, + "step": 4306 + }, + { + "epoch": 0.23644346871569705, + "grad_norm": 1.8907843828201294, + "learning_rate": 4.749884592881212e-05, + "loss": 0.3821, + "step": 4308 + }, + { + "epoch": 0.23655323819978047, + "grad_norm": 2.1436381340026855, + "learning_rate": 4.749656262222668e-05, + "loss": 0.3684, + "step": 4310 + }, + { + "epoch": 0.2366630076838639, + "grad_norm": 2.2730937004089355, + "learning_rate": 4.7494278328831584e-05, + "loss": 0.4038, + "step": 4312 + }, + { + "epoch": 0.2367727771679473, + "grad_norm": 1.4591721296310425, + "learning_rate": 4.749199304872703e-05, + "loss": 0.4447, + "step": 4314 + }, + { + "epoch": 0.23688254665203073, + "grad_norm": 2.5017693042755127, + "learning_rate": 4.748970678201326e-05, + "loss": 0.3756, + "step": 4316 + }, + { + "epoch": 0.23699231613611416, + "grad_norm": 2.546053647994995, + "learning_rate": 4.748741952879057e-05, + "loss": 0.6148, + "step": 4318 + }, + { + "epoch": 0.23710208562019758, + "grad_norm": 1.3538914918899536, + "learning_rate": 4.7485131289159276e-05, + "loss": 0.28, + "step": 4320 + }, + { + "epoch": 0.237211855104281, + "grad_norm": 1.4684221744537354, + "learning_rate": 4.748284206321977e-05, + "loss": 0.4142, + "step": 4322 + }, + { + "epoch": 0.23732162458836442, + "grad_norm": 1.4833909273147583, + "learning_rate": 4.7480551851072454e-05, + "loss": 0.3874, + "step": 4324 + }, + { + "epoch": 0.23743139407244787, + "grad_norm": 2.5113627910614014, + "learning_rate": 4.74782606528178e-05, + "loss": 0.3709, + "step": 4326 + }, + { + "epoch": 0.2375411635565313, + "grad_norm": 2.6678335666656494, + "learning_rate": 4.7475968468556295e-05, + "loss": 0.433, + "step": 4328 + }, + { + "epoch": 0.23765093304061471, + "grad_norm": 2.020110845565796, + "learning_rate": 4.7473675298388495e-05, + "loss": 0.3374, + "step": 4330 + }, + { + "epoch": 0.23776070252469814, + "grad_norm": 1.9699070453643799, + "learning_rate": 4.747138114241499e-05, + "loss": 0.4145, + "step": 4332 + }, + { + "epoch": 0.23787047200878156, + "grad_norm": 3.611104965209961, + "learning_rate": 4.7469086000736415e-05, + "loss": 0.4654, + "step": 4334 + }, + { + "epoch": 0.23798024149286498, + "grad_norm": 2.0138306617736816, + "learning_rate": 4.7466789873453444e-05, + "loss": 0.379, + "step": 4336 + }, + { + "epoch": 0.2380900109769484, + "grad_norm": 1.7838205099105835, + "learning_rate": 4.746449276066679e-05, + "loss": 0.3651, + "step": 4338 + }, + { + "epoch": 0.23819978046103182, + "grad_norm": 1.5425498485565186, + "learning_rate": 4.746219466247722e-05, + "loss": 0.2934, + "step": 4340 + }, + { + "epoch": 0.23830954994511525, + "grad_norm": 2.9235551357269287, + "learning_rate": 4.7459895578985545e-05, + "loss": 0.4182, + "step": 4342 + }, + { + "epoch": 0.2384193194291987, + "grad_norm": 1.5947749614715576, + "learning_rate": 4.745759551029261e-05, + "loss": 0.3807, + "step": 4344 + }, + { + "epoch": 0.23852908891328212, + "grad_norm": 2.100144624710083, + "learning_rate": 4.745529445649931e-05, + "loss": 0.3712, + "step": 4346 + }, + { + "epoch": 0.23863885839736554, + "grad_norm": 1.7437595129013062, + "learning_rate": 4.745299241770658e-05, + "loss": 0.338, + "step": 4348 + }, + { + "epoch": 0.23874862788144896, + "grad_norm": 1.4482954740524292, + "learning_rate": 4.745068939401539e-05, + "loss": 0.2784, + "step": 4350 + }, + { + "epoch": 0.23885839736553238, + "grad_norm": 1.9753851890563965, + "learning_rate": 4.744838538552677e-05, + "loss": 0.2949, + "step": 4352 + }, + { + "epoch": 0.2389681668496158, + "grad_norm": 1.705527901649475, + "learning_rate": 4.744608039234179e-05, + "loss": 0.3071, + "step": 4354 + }, + { + "epoch": 0.23907793633369923, + "grad_norm": 1.4556300640106201, + "learning_rate": 4.744377441456155e-05, + "loss": 0.3587, + "step": 4356 + }, + { + "epoch": 0.23918770581778265, + "grad_norm": 1.4701536893844604, + "learning_rate": 4.74414674522872e-05, + "loss": 0.3228, + "step": 4358 + }, + { + "epoch": 0.23929747530186607, + "grad_norm": 2.395641803741455, + "learning_rate": 4.743915950561994e-05, + "loss": 0.3732, + "step": 4360 + }, + { + "epoch": 0.23940724478594952, + "grad_norm": 2.48983097076416, + "learning_rate": 4.743685057466101e-05, + "loss": 0.2817, + "step": 4362 + }, + { + "epoch": 0.23951701427003294, + "grad_norm": 2.475370168685913, + "learning_rate": 4.743454065951168e-05, + "loss": 0.3965, + "step": 4364 + }, + { + "epoch": 0.23962678375411636, + "grad_norm": 2.4753880500793457, + "learning_rate": 4.743222976027328e-05, + "loss": 0.3493, + "step": 4366 + }, + { + "epoch": 0.23973655323819978, + "grad_norm": 2.030813694000244, + "learning_rate": 4.742991787704719e-05, + "loss": 0.4095, + "step": 4368 + }, + { + "epoch": 0.2398463227222832, + "grad_norm": 1.590721607208252, + "learning_rate": 4.742760500993481e-05, + "loss": 0.3942, + "step": 4370 + }, + { + "epoch": 0.23995609220636663, + "grad_norm": 2.079615354537964, + "learning_rate": 4.7425291159037575e-05, + "loss": 0.4477, + "step": 4372 + }, + { + "epoch": 0.24006586169045005, + "grad_norm": 1.522128939628601, + "learning_rate": 4.742297632445701e-05, + "loss": 0.3867, + "step": 4374 + }, + { + "epoch": 0.24017563117453347, + "grad_norm": 1.1746517419815063, + "learning_rate": 4.742066050629465e-05, + "loss": 0.243, + "step": 4376 + }, + { + "epoch": 0.2402854006586169, + "grad_norm": 2.5982208251953125, + "learning_rate": 4.7418343704652066e-05, + "loss": 0.3881, + "step": 4378 + }, + { + "epoch": 0.24039517014270034, + "grad_norm": 1.3545206785202026, + "learning_rate": 4.7416025919630904e-05, + "loss": 0.3852, + "step": 4380 + }, + { + "epoch": 0.24050493962678376, + "grad_norm": 1.468131184577942, + "learning_rate": 4.7413707151332806e-05, + "loss": 0.2743, + "step": 4382 + }, + { + "epoch": 0.2406147091108672, + "grad_norm": 1.5574736595153809, + "learning_rate": 4.741138739985951e-05, + "loss": 0.4097, + "step": 4384 + }, + { + "epoch": 0.2407244785949506, + "grad_norm": 2.315612554550171, + "learning_rate": 4.740906666531275e-05, + "loss": 0.5513, + "step": 4386 + }, + { + "epoch": 0.24083424807903403, + "grad_norm": 1.7469784021377563, + "learning_rate": 4.740674494779435e-05, + "loss": 0.238, + "step": 4388 + }, + { + "epoch": 0.24094401756311745, + "grad_norm": 1.538866639137268, + "learning_rate": 4.740442224740612e-05, + "loss": 0.4722, + "step": 4390 + }, + { + "epoch": 0.24105378704720087, + "grad_norm": 1.336159348487854, + "learning_rate": 4.7402098564249974e-05, + "loss": 0.5148, + "step": 4392 + }, + { + "epoch": 0.2411635565312843, + "grad_norm": 1.2958937883377075, + "learning_rate": 4.739977389842783e-05, + "loss": 0.3399, + "step": 4394 + }, + { + "epoch": 0.24127332601536772, + "grad_norm": 2.1434805393218994, + "learning_rate": 4.739744825004165e-05, + "loss": 0.2852, + "step": 4396 + }, + { + "epoch": 0.24138309549945114, + "grad_norm": 1.9049288034439087, + "learning_rate": 4.7395121619193465e-05, + "loss": 0.4167, + "step": 4398 + }, + { + "epoch": 0.2414928649835346, + "grad_norm": 2.3075180053710938, + "learning_rate": 4.7392794005985326e-05, + "loss": 0.4469, + "step": 4400 + }, + { + "epoch": 0.241602634467618, + "grad_norm": 3.368734121322632, + "learning_rate": 4.7390465410519326e-05, + "loss": 0.3988, + "step": 4402 + }, + { + "epoch": 0.24171240395170143, + "grad_norm": 2.025636672973633, + "learning_rate": 4.738813583289762e-05, + "loss": 0.5882, + "step": 4404 + }, + { + "epoch": 0.24182217343578485, + "grad_norm": 2.319378137588501, + "learning_rate": 4.738580527322238e-05, + "loss": 0.3368, + "step": 4406 + }, + { + "epoch": 0.24193194291986828, + "grad_norm": 1.7699884176254272, + "learning_rate": 4.738347373159585e-05, + "loss": 0.3482, + "step": 4408 + }, + { + "epoch": 0.2420417124039517, + "grad_norm": 1.6302613019943237, + "learning_rate": 4.7381141208120296e-05, + "loss": 0.4267, + "step": 4410 + }, + { + "epoch": 0.24215148188803512, + "grad_norm": 2.3956563472747803, + "learning_rate": 4.737880770289803e-05, + "loss": 0.4739, + "step": 4412 + }, + { + "epoch": 0.24226125137211854, + "grad_norm": 2.2124574184417725, + "learning_rate": 4.7376473216031425e-05, + "loss": 0.4377, + "step": 4414 + }, + { + "epoch": 0.24237102085620196, + "grad_norm": 1.8437236547470093, + "learning_rate": 4.737413774762287e-05, + "loss": 0.2883, + "step": 4416 + }, + { + "epoch": 0.2424807903402854, + "grad_norm": 1.3682926893234253, + "learning_rate": 4.737180129777482e-05, + "loss": 0.3368, + "step": 4418 + }, + { + "epoch": 0.24259055982436883, + "grad_norm": 2.102620840072632, + "learning_rate": 4.736946386658976e-05, + "loss": 0.4907, + "step": 4420 + }, + { + "epoch": 0.24270032930845226, + "grad_norm": 2.168203353881836, + "learning_rate": 4.736712545417021e-05, + "loss": 0.4532, + "step": 4422 + }, + { + "epoch": 0.24281009879253568, + "grad_norm": 2.7998058795928955, + "learning_rate": 4.736478606061875e-05, + "loss": 0.4947, + "step": 4424 + }, + { + "epoch": 0.2429198682766191, + "grad_norm": 1.805408000946045, + "learning_rate": 4.7362445686038014e-05, + "loss": 0.4063, + "step": 4426 + }, + { + "epoch": 0.24302963776070252, + "grad_norm": 1.8013697862625122, + "learning_rate": 4.736010433053064e-05, + "loss": 0.4144, + "step": 4428 + }, + { + "epoch": 0.24313940724478594, + "grad_norm": 2.179189682006836, + "learning_rate": 4.735776199419935e-05, + "loss": 0.3767, + "step": 4430 + }, + { + "epoch": 0.24324917672886937, + "grad_norm": 3.640501022338867, + "learning_rate": 4.735541867714687e-05, + "loss": 0.4264, + "step": 4432 + }, + { + "epoch": 0.2433589462129528, + "grad_norm": 1.518078088760376, + "learning_rate": 4.7353074379476006e-05, + "loss": 0.2849, + "step": 4434 + }, + { + "epoch": 0.24346871569703624, + "grad_norm": 4.833668231964111, + "learning_rate": 4.735072910128957e-05, + "loss": 0.398, + "step": 4436 + }, + { + "epoch": 0.24357848518111966, + "grad_norm": 1.5846948623657227, + "learning_rate": 4.7348382842690466e-05, + "loss": 0.3748, + "step": 4438 + }, + { + "epoch": 0.24368825466520308, + "grad_norm": 2.0966594219207764, + "learning_rate": 4.73460356037816e-05, + "loss": 0.4369, + "step": 4440 + }, + { + "epoch": 0.2437980241492865, + "grad_norm": 2.5105972290039062, + "learning_rate": 4.734368738466592e-05, + "loss": 0.3568, + "step": 4442 + }, + { + "epoch": 0.24390779363336992, + "grad_norm": 1.7211599349975586, + "learning_rate": 4.734133818544645e-05, + "loss": 0.3467, + "step": 4444 + }, + { + "epoch": 0.24401756311745335, + "grad_norm": 2.2398667335510254, + "learning_rate": 4.733898800622623e-05, + "loss": 0.3607, + "step": 4446 + }, + { + "epoch": 0.24412733260153677, + "grad_norm": 2.4085965156555176, + "learning_rate": 4.733663684710835e-05, + "loss": 0.3858, + "step": 4448 + }, + { + "epoch": 0.2442371020856202, + "grad_norm": 2.21409010887146, + "learning_rate": 4.733428470819594e-05, + "loss": 0.5762, + "step": 4450 + }, + { + "epoch": 0.2443468715697036, + "grad_norm": 2.1346523761749268, + "learning_rate": 4.733193158959218e-05, + "loss": 0.3913, + "step": 4452 + }, + { + "epoch": 0.24445664105378706, + "grad_norm": 2.054082155227661, + "learning_rate": 4.732957749140029e-05, + "loss": 0.3186, + "step": 4454 + }, + { + "epoch": 0.24456641053787048, + "grad_norm": 2.4580936431884766, + "learning_rate": 4.7327222413723536e-05, + "loss": 0.5189, + "step": 4456 + }, + { + "epoch": 0.2446761800219539, + "grad_norm": 2.473982095718384, + "learning_rate": 4.732486635666521e-05, + "loss": 0.4203, + "step": 4458 + }, + { + "epoch": 0.24478594950603733, + "grad_norm": 1.814021348953247, + "learning_rate": 4.7322509320328675e-05, + "loss": 0.4364, + "step": 4460 + }, + { + "epoch": 0.24489571899012075, + "grad_norm": 1.9363151788711548, + "learning_rate": 4.732015130481731e-05, + "loss": 0.3964, + "step": 4462 + }, + { + "epoch": 0.24500548847420417, + "grad_norm": 1.9631251096725464, + "learning_rate": 4.731779231023456e-05, + "loss": 0.2878, + "step": 4464 + }, + { + "epoch": 0.2451152579582876, + "grad_norm": 1.4907934665679932, + "learning_rate": 4.73154323366839e-05, + "loss": 0.3656, + "step": 4466 + }, + { + "epoch": 0.245225027442371, + "grad_norm": 1.377186894416809, + "learning_rate": 4.7313071384268836e-05, + "loss": 0.2998, + "step": 4468 + }, + { + "epoch": 0.24533479692645443, + "grad_norm": 1.3729411363601685, + "learning_rate": 4.731070945309295e-05, + "loss": 0.4028, + "step": 4470 + }, + { + "epoch": 0.24544456641053786, + "grad_norm": 2.6836516857147217, + "learning_rate": 4.730834654325984e-05, + "loss": 0.2901, + "step": 4472 + }, + { + "epoch": 0.2455543358946213, + "grad_norm": 1.783523440361023, + "learning_rate": 4.730598265487315e-05, + "loss": 0.4134, + "step": 4474 + }, + { + "epoch": 0.24566410537870473, + "grad_norm": 2.967195749282837, + "learning_rate": 4.730361778803658e-05, + "loss": 0.3531, + "step": 4476 + }, + { + "epoch": 0.24577387486278815, + "grad_norm": 2.6810340881347656, + "learning_rate": 4.730125194285386e-05, + "loss": 0.467, + "step": 4478 + }, + { + "epoch": 0.24588364434687157, + "grad_norm": 1.5032267570495605, + "learning_rate": 4.7298885119428773e-05, + "loss": 0.4112, + "step": 4480 + }, + { + "epoch": 0.245993413830955, + "grad_norm": 2.5889246463775635, + "learning_rate": 4.729651731786513e-05, + "loss": 0.3876, + "step": 4482 + }, + { + "epoch": 0.24610318331503842, + "grad_norm": 2.3496322631835938, + "learning_rate": 4.72941485382668e-05, + "loss": 0.3225, + "step": 4484 + }, + { + "epoch": 0.24621295279912184, + "grad_norm": 3.0398716926574707, + "learning_rate": 4.72917787807377e-05, + "loss": 0.5036, + "step": 4486 + }, + { + "epoch": 0.24632272228320526, + "grad_norm": 3.474410057067871, + "learning_rate": 4.728940804538176e-05, + "loss": 0.3537, + "step": 4488 + }, + { + "epoch": 0.24643249176728868, + "grad_norm": 2.334526300430298, + "learning_rate": 4.7287036332302967e-05, + "loss": 0.5207, + "step": 4490 + }, + { + "epoch": 0.24654226125137213, + "grad_norm": 1.9698269367218018, + "learning_rate": 4.7284663641605384e-05, + "loss": 0.4345, + "step": 4492 + }, + { + "epoch": 0.24665203073545555, + "grad_norm": 1.5127743482589722, + "learning_rate": 4.728228997339308e-05, + "loss": 0.3125, + "step": 4494 + }, + { + "epoch": 0.24676180021953897, + "grad_norm": 1.237381935119629, + "learning_rate": 4.7279915327770155e-05, + "loss": 0.4122, + "step": 4496 + }, + { + "epoch": 0.2468715697036224, + "grad_norm": 2.0708181858062744, + "learning_rate": 4.72775397048408e-05, + "loss": 0.3723, + "step": 4498 + }, + { + "epoch": 0.24698133918770582, + "grad_norm": 2.8110599517822266, + "learning_rate": 4.72751631047092e-05, + "loss": 0.4361, + "step": 4500 + }, + { + "epoch": 0.24709110867178924, + "grad_norm": 1.8859621286392212, + "learning_rate": 4.7272785527479615e-05, + "loss": 0.3322, + "step": 4502 + }, + { + "epoch": 0.24720087815587266, + "grad_norm": 1.754880666732788, + "learning_rate": 4.727040697325634e-05, + "loss": 0.4017, + "step": 4504 + }, + { + "epoch": 0.24731064763995608, + "grad_norm": 3.0061144828796387, + "learning_rate": 4.72680274421437e-05, + "loss": 0.4058, + "step": 4506 + }, + { + "epoch": 0.2474204171240395, + "grad_norm": 5.510927677154541, + "learning_rate": 4.726564693424608e-05, + "loss": 0.3673, + "step": 4508 + }, + { + "epoch": 0.24753018660812295, + "grad_norm": 3.9008500576019287, + "learning_rate": 4.72632654496679e-05, + "loss": 0.4125, + "step": 4510 + }, + { + "epoch": 0.24763995609220638, + "grad_norm": 2.4861838817596436, + "learning_rate": 4.7260882988513624e-05, + "loss": 0.4068, + "step": 4512 + }, + { + "epoch": 0.2477497255762898, + "grad_norm": 8.481901168823242, + "learning_rate": 4.725849955088776e-05, + "loss": 0.3577, + "step": 4514 + }, + { + "epoch": 0.24785949506037322, + "grad_norm": 1.5691986083984375, + "learning_rate": 4.725611513689485e-05, + "loss": 0.3451, + "step": 4516 + }, + { + "epoch": 0.24796926454445664, + "grad_norm": 1.7091432809829712, + "learning_rate": 4.725372974663948e-05, + "loss": 0.3357, + "step": 4518 + }, + { + "epoch": 0.24807903402854006, + "grad_norm": 2.0054755210876465, + "learning_rate": 4.725134338022631e-05, + "loss": 0.4576, + "step": 4520 + }, + { + "epoch": 0.24818880351262348, + "grad_norm": 1.7354460954666138, + "learning_rate": 4.7248956037760004e-05, + "loss": 0.3216, + "step": 4522 + }, + { + "epoch": 0.2482985729967069, + "grad_norm": 1.6417115926742554, + "learning_rate": 4.724656771934528e-05, + "loss": 0.3489, + "step": 4524 + }, + { + "epoch": 0.24840834248079033, + "grad_norm": 2.7124717235565186, + "learning_rate": 4.72441784250869e-05, + "loss": 0.4412, + "step": 4526 + }, + { + "epoch": 0.24851811196487378, + "grad_norm": 2.195239305496216, + "learning_rate": 4.724178815508967e-05, + "loss": 0.3044, + "step": 4528 + }, + { + "epoch": 0.2486278814489572, + "grad_norm": 1.772429347038269, + "learning_rate": 4.723939690945846e-05, + "loss": 0.3742, + "step": 4530 + }, + { + "epoch": 0.24873765093304062, + "grad_norm": 2.324903726577759, + "learning_rate": 4.7237004688298125e-05, + "loss": 0.2795, + "step": 4532 + }, + { + "epoch": 0.24884742041712404, + "grad_norm": 1.4281996488571167, + "learning_rate": 4.7234611491713624e-05, + "loss": 0.4084, + "step": 4534 + }, + { + "epoch": 0.24895718990120747, + "grad_norm": 2.248044013977051, + "learning_rate": 4.723221731980993e-05, + "loss": 0.4593, + "step": 4536 + }, + { + "epoch": 0.2490669593852909, + "grad_norm": 1.7696675062179565, + "learning_rate": 4.722982217269206e-05, + "loss": 0.5189, + "step": 4538 + }, + { + "epoch": 0.2491767288693743, + "grad_norm": 2.3435308933258057, + "learning_rate": 4.7227426050465084e-05, + "loss": 0.5441, + "step": 4540 + }, + { + "epoch": 0.24928649835345773, + "grad_norm": 1.6365221738815308, + "learning_rate": 4.722502895323409e-05, + "loss": 0.4534, + "step": 4542 + }, + { + "epoch": 0.24939626783754115, + "grad_norm": 1.5713003873825073, + "learning_rate": 4.722263088110426e-05, + "loss": 0.3382, + "step": 4544 + }, + { + "epoch": 0.2495060373216246, + "grad_norm": 1.825334072113037, + "learning_rate": 4.722023183418075e-05, + "loss": 0.3851, + "step": 4546 + }, + { + "epoch": 0.24961580680570802, + "grad_norm": 1.8909341096878052, + "learning_rate": 4.7217831812568815e-05, + "loss": 0.3972, + "step": 4548 + }, + { + "epoch": 0.24972557628979145, + "grad_norm": 1.5297553539276123, + "learning_rate": 4.7215430816373726e-05, + "loss": 0.3029, + "step": 4550 + }, + { + "epoch": 0.24983534577387487, + "grad_norm": 2.494837999343872, + "learning_rate": 4.721302884570079e-05, + "loss": 0.4098, + "step": 4552 + }, + { + "epoch": 0.2499451152579583, + "grad_norm": 1.855993628501892, + "learning_rate": 4.721062590065539e-05, + "loss": 0.3763, + "step": 4554 + }, + { + "epoch": 0.25005488474204174, + "grad_norm": 2.3331544399261475, + "learning_rate": 4.720822198134293e-05, + "loss": 0.4356, + "step": 4556 + }, + { + "epoch": 0.25016465422612516, + "grad_norm": 1.1785588264465332, + "learning_rate": 4.7205817087868834e-05, + "loss": 0.3246, + "step": 4558 + }, + { + "epoch": 0.2502744237102086, + "grad_norm": 2.336494207382202, + "learning_rate": 4.720341122033862e-05, + "loss": 0.4142, + "step": 4560 + }, + { + "epoch": 0.250384193194292, + "grad_norm": 1.8330886363983154, + "learning_rate": 4.72010043788578e-05, + "loss": 0.3769, + "step": 4562 + }, + { + "epoch": 0.2504939626783754, + "grad_norm": 2.281031608581543, + "learning_rate": 4.719859656353196e-05, + "loss": 0.4173, + "step": 4564 + }, + { + "epoch": 0.25060373216245885, + "grad_norm": 1.4052140712738037, + "learning_rate": 4.719618777446672e-05, + "loss": 0.4569, + "step": 4566 + }, + { + "epoch": 0.25071350164654227, + "grad_norm": 1.4162375926971436, + "learning_rate": 4.719377801176774e-05, + "loss": 0.379, + "step": 4568 + }, + { + "epoch": 0.2508232711306257, + "grad_norm": 1.869537353515625, + "learning_rate": 4.719136727554072e-05, + "loss": 0.4257, + "step": 4570 + }, + { + "epoch": 0.2509330406147091, + "grad_norm": 2.372753143310547, + "learning_rate": 4.718895556589141e-05, + "loss": 0.3955, + "step": 4572 + }, + { + "epoch": 0.25104281009879253, + "grad_norm": 2.2255842685699463, + "learning_rate": 4.7186542882925604e-05, + "loss": 0.4151, + "step": 4574 + }, + { + "epoch": 0.25115257958287596, + "grad_norm": 2.1957828998565674, + "learning_rate": 4.718412922674913e-05, + "loss": 0.3245, + "step": 4576 + }, + { + "epoch": 0.2512623490669594, + "grad_norm": 3.628544569015503, + "learning_rate": 4.718171459746785e-05, + "loss": 0.4935, + "step": 4578 + }, + { + "epoch": 0.2513721185510428, + "grad_norm": 2.240464687347412, + "learning_rate": 4.71792989951877e-05, + "loss": 0.4092, + "step": 4580 + }, + { + "epoch": 0.2514818880351262, + "grad_norm": 1.7942688465118408, + "learning_rate": 4.717688242001464e-05, + "loss": 0.4864, + "step": 4582 + }, + { + "epoch": 0.25159165751920964, + "grad_norm": 1.6869524717330933, + "learning_rate": 4.717446487205466e-05, + "loss": 0.3804, + "step": 4584 + }, + { + "epoch": 0.25170142700329307, + "grad_norm": 1.4527387619018555, + "learning_rate": 4.717204635141381e-05, + "loss": 0.3735, + "step": 4586 + }, + { + "epoch": 0.2518111964873765, + "grad_norm": 2.5535004138946533, + "learning_rate": 4.716962685819819e-05, + "loss": 0.4134, + "step": 4588 + }, + { + "epoch": 0.2519209659714599, + "grad_norm": 2.0283989906311035, + "learning_rate": 4.716720639251392e-05, + "loss": 0.4802, + "step": 4590 + }, + { + "epoch": 0.2520307354555434, + "grad_norm": 3.015749216079712, + "learning_rate": 4.7164784954467166e-05, + "loss": 0.4286, + "step": 4592 + }, + { + "epoch": 0.2521405049396268, + "grad_norm": 2.4181034564971924, + "learning_rate": 4.716236254416415e-05, + "loss": 0.5148, + "step": 4594 + }, + { + "epoch": 0.25225027442371023, + "grad_norm": 1.490950584411621, + "learning_rate": 4.715993916171114e-05, + "loss": 0.3836, + "step": 4596 + }, + { + "epoch": 0.25236004390779365, + "grad_norm": 2.2827372550964355, + "learning_rate": 4.715751480721443e-05, + "loss": 0.4249, + "step": 4598 + }, + { + "epoch": 0.2524698133918771, + "grad_norm": 1.4199413061141968, + "learning_rate": 4.715508948078037e-05, + "loss": 0.4293, + "step": 4600 + }, + { + "epoch": 0.2525795828759605, + "grad_norm": 1.5931822061538696, + "learning_rate": 4.715266318251534e-05, + "loss": 0.3252, + "step": 4602 + }, + { + "epoch": 0.2526893523600439, + "grad_norm": 1.918935775756836, + "learning_rate": 4.715023591252576e-05, + "loss": 0.3766, + "step": 4604 + }, + { + "epoch": 0.25279912184412734, + "grad_norm": 2.8270232677459717, + "learning_rate": 4.714780767091813e-05, + "loss": 0.4877, + "step": 4606 + }, + { + "epoch": 0.25290889132821076, + "grad_norm": 3.7373414039611816, + "learning_rate": 4.714537845779894e-05, + "loss": 0.3692, + "step": 4608 + }, + { + "epoch": 0.2530186608122942, + "grad_norm": 1.2964197397232056, + "learning_rate": 4.7142948273274754e-05, + "loss": 0.4837, + "step": 4610 + }, + { + "epoch": 0.2531284302963776, + "grad_norm": 1.6061490774154663, + "learning_rate": 4.714051711745217e-05, + "loss": 0.378, + "step": 4612 + }, + { + "epoch": 0.253238199780461, + "grad_norm": 2.475186824798584, + "learning_rate": 4.713808499043784e-05, + "loss": 0.318, + "step": 4614 + }, + { + "epoch": 0.25334796926454445, + "grad_norm": 1.425100326538086, + "learning_rate": 4.713565189233844e-05, + "loss": 0.2807, + "step": 4616 + }, + { + "epoch": 0.25345773874862787, + "grad_norm": 1.9389842748641968, + "learning_rate": 4.7133217823260694e-05, + "loss": 0.344, + "step": 4618 + }, + { + "epoch": 0.2535675082327113, + "grad_norm": 1.5721851587295532, + "learning_rate": 4.713078278331138e-05, + "loss": 0.3515, + "step": 4620 + }, + { + "epoch": 0.2536772777167947, + "grad_norm": 2.002131700515747, + "learning_rate": 4.712834677259732e-05, + "loss": 0.3394, + "step": 4622 + }, + { + "epoch": 0.25378704720087814, + "grad_norm": 2.537458896636963, + "learning_rate": 4.712590979122534e-05, + "loss": 0.2645, + "step": 4624 + }, + { + "epoch": 0.25389681668496156, + "grad_norm": 2.6696951389312744, + "learning_rate": 4.7123471839302367e-05, + "loss": 0.4756, + "step": 4626 + }, + { + "epoch": 0.254006586169045, + "grad_norm": 3.3623530864715576, + "learning_rate": 4.712103291693533e-05, + "loss": 0.4365, + "step": 4628 + }, + { + "epoch": 0.25411635565312846, + "grad_norm": 1.3535360097885132, + "learning_rate": 4.7118593024231214e-05, + "loss": 0.3885, + "step": 4630 + }, + { + "epoch": 0.2542261251372119, + "grad_norm": 1.939564824104309, + "learning_rate": 4.7116152161297045e-05, + "loss": 0.5224, + "step": 4632 + }, + { + "epoch": 0.2543358946212953, + "grad_norm": 3.009676694869995, + "learning_rate": 4.711371032823988e-05, + "loss": 0.3286, + "step": 4634 + }, + { + "epoch": 0.2544456641053787, + "grad_norm": 1.5756648778915405, + "learning_rate": 4.7111267525166845e-05, + "loss": 0.2553, + "step": 4636 + }, + { + "epoch": 0.25455543358946214, + "grad_norm": 1.739640474319458, + "learning_rate": 4.710882375218509e-05, + "loss": 0.4991, + "step": 4638 + }, + { + "epoch": 0.25466520307354557, + "grad_norm": 2.287449359893799, + "learning_rate": 4.710637900940181e-05, + "loss": 0.389, + "step": 4640 + }, + { + "epoch": 0.254774972557629, + "grad_norm": 2.3112242221832275, + "learning_rate": 4.710393329692424e-05, + "loss": 0.3924, + "step": 4642 + }, + { + "epoch": 0.2548847420417124, + "grad_norm": 2.3861656188964844, + "learning_rate": 4.710148661485966e-05, + "loss": 0.3046, + "step": 4644 + }, + { + "epoch": 0.25499451152579583, + "grad_norm": 3.738415479660034, + "learning_rate": 4.70990389633154e-05, + "loss": 0.4463, + "step": 4646 + }, + { + "epoch": 0.25510428100987925, + "grad_norm": 1.6863044500350952, + "learning_rate": 4.709659034239883e-05, + "loss": 0.4411, + "step": 4648 + }, + { + "epoch": 0.2552140504939627, + "grad_norm": 1.785526990890503, + "learning_rate": 4.709414075221734e-05, + "loss": 0.4034, + "step": 4650 + }, + { + "epoch": 0.2553238199780461, + "grad_norm": 2.898036479949951, + "learning_rate": 4.709169019287839e-05, + "loss": 0.3881, + "step": 4652 + }, + { + "epoch": 0.2554335894621295, + "grad_norm": 1.293798804283142, + "learning_rate": 4.708923866448949e-05, + "loss": 0.3115, + "step": 4654 + }, + { + "epoch": 0.25554335894621294, + "grad_norm": 2.298365354537964, + "learning_rate": 4.708678616715815e-05, + "loss": 0.3135, + "step": 4656 + }, + { + "epoch": 0.25565312843029636, + "grad_norm": 1.8629719018936157, + "learning_rate": 4.7084332700991965e-05, + "loss": 0.4807, + "step": 4658 + }, + { + "epoch": 0.2557628979143798, + "grad_norm": 1.9690532684326172, + "learning_rate": 4.7081878266098545e-05, + "loss": 0.3745, + "step": 4660 + }, + { + "epoch": 0.2558726673984632, + "grad_norm": 1.5357521772384644, + "learning_rate": 4.7079422862585565e-05, + "loss": 0.3466, + "step": 4662 + }, + { + "epoch": 0.2559824368825466, + "grad_norm": 3.864891767501831, + "learning_rate": 4.707696649056073e-05, + "loss": 0.4721, + "step": 4664 + }, + { + "epoch": 0.2560922063666301, + "grad_norm": 1.8430882692337036, + "learning_rate": 4.7074509150131775e-05, + "loss": 0.4034, + "step": 4666 + }, + { + "epoch": 0.2562019758507135, + "grad_norm": 1.5162454843521118, + "learning_rate": 4.707205084140651e-05, + "loss": 0.3514, + "step": 4668 + }, + { + "epoch": 0.25631174533479695, + "grad_norm": 2.9395906925201416, + "learning_rate": 4.706959156449276e-05, + "loss": 0.409, + "step": 4670 + }, + { + "epoch": 0.25642151481888037, + "grad_norm": 2.323711633682251, + "learning_rate": 4.706713131949839e-05, + "loss": 0.3116, + "step": 4672 + }, + { + "epoch": 0.2565312843029638, + "grad_norm": 1.721876859664917, + "learning_rate": 4.7064670106531335e-05, + "loss": 0.4991, + "step": 4674 + }, + { + "epoch": 0.2566410537870472, + "grad_norm": 2.779493570327759, + "learning_rate": 4.7062207925699544e-05, + "loss": 0.5255, + "step": 4676 + }, + { + "epoch": 0.25675082327113063, + "grad_norm": 1.8664764165878296, + "learning_rate": 4.7059744777111035e-05, + "loss": 0.4207, + "step": 4678 + }, + { + "epoch": 0.25686059275521406, + "grad_norm": 2.356752872467041, + "learning_rate": 4.7057280660873835e-05, + "loss": 0.4127, + "step": 4680 + }, + { + "epoch": 0.2569703622392975, + "grad_norm": 1.6722084283828735, + "learning_rate": 4.7054815577096046e-05, + "loss": 0.5193, + "step": 4682 + }, + { + "epoch": 0.2570801317233809, + "grad_norm": 2.5197699069976807, + "learning_rate": 4.705234952588579e-05, + "loss": 0.4378, + "step": 4684 + }, + { + "epoch": 0.2571899012074643, + "grad_norm": 1.9991236925125122, + "learning_rate": 4.704988250735125e-05, + "loss": 0.4358, + "step": 4686 + }, + { + "epoch": 0.25729967069154774, + "grad_norm": 2.098748207092285, + "learning_rate": 4.7047414521600644e-05, + "loss": 0.3586, + "step": 4688 + }, + { + "epoch": 0.25740944017563117, + "grad_norm": 3.147103786468506, + "learning_rate": 4.704494556874221e-05, + "loss": 0.3902, + "step": 4690 + }, + { + "epoch": 0.2575192096597146, + "grad_norm": 1.8642932176589966, + "learning_rate": 4.7042475648884254e-05, + "loss": 0.4094, + "step": 4692 + }, + { + "epoch": 0.257628979143798, + "grad_norm": 1.5092785358428955, + "learning_rate": 4.7040004762135134e-05, + "loss": 0.5186, + "step": 4694 + }, + { + "epoch": 0.25773874862788143, + "grad_norm": 1.9215115308761597, + "learning_rate": 4.703753290860323e-05, + "loss": 0.3435, + "step": 4696 + }, + { + "epoch": 0.25784851811196485, + "grad_norm": 1.2888684272766113, + "learning_rate": 4.7035060088396965e-05, + "loss": 0.4403, + "step": 4698 + }, + { + "epoch": 0.2579582875960483, + "grad_norm": 1.474329948425293, + "learning_rate": 4.70325863016248e-05, + "loss": 0.3665, + "step": 4700 + }, + { + "epoch": 0.2580680570801317, + "grad_norm": 1.9854449033737183, + "learning_rate": 4.703011154839527e-05, + "loss": 0.3121, + "step": 4702 + }, + { + "epoch": 0.2581778265642152, + "grad_norm": 1.5036336183547974, + "learning_rate": 4.702763582881692e-05, + "loss": 0.3282, + "step": 4704 + }, + { + "epoch": 0.2582875960482986, + "grad_norm": 2.0923993587493896, + "learning_rate": 4.702515914299833e-05, + "loss": 0.3615, + "step": 4706 + }, + { + "epoch": 0.258397365532382, + "grad_norm": 2.612125873565674, + "learning_rate": 4.702268149104816e-05, + "loss": 0.5367, + "step": 4708 + }, + { + "epoch": 0.25850713501646544, + "grad_norm": 2.1035878658294678, + "learning_rate": 4.702020287307509e-05, + "loss": 0.3987, + "step": 4710 + }, + { + "epoch": 0.25861690450054886, + "grad_norm": 1.888472318649292, + "learning_rate": 4.701772328918784e-05, + "loss": 0.3375, + "step": 4712 + }, + { + "epoch": 0.2587266739846323, + "grad_norm": 2.3855948448181152, + "learning_rate": 4.7015242739495174e-05, + "loss": 0.3769, + "step": 4714 + }, + { + "epoch": 0.2588364434687157, + "grad_norm": 1.1757752895355225, + "learning_rate": 4.701276122410591e-05, + "loss": 0.4461, + "step": 4716 + }, + { + "epoch": 0.2589462129527991, + "grad_norm": 5.113138198852539, + "learning_rate": 4.7010278743128885e-05, + "loss": 0.4054, + "step": 4718 + }, + { + "epoch": 0.25905598243688255, + "grad_norm": 2.50069260597229, + "learning_rate": 4.7007795296673006e-05, + "loss": 0.4281, + "step": 4720 + }, + { + "epoch": 0.25916575192096597, + "grad_norm": 1.7445752620697021, + "learning_rate": 4.700531088484721e-05, + "loss": 0.4204, + "step": 4722 + }, + { + "epoch": 0.2592755214050494, + "grad_norm": 1.457191824913025, + "learning_rate": 4.7002825507760465e-05, + "loss": 0.2439, + "step": 4724 + }, + { + "epoch": 0.2593852908891328, + "grad_norm": 2.349210500717163, + "learning_rate": 4.700033916552179e-05, + "loss": 0.4672, + "step": 4726 + }, + { + "epoch": 0.25949506037321624, + "grad_norm": 2.4366371631622314, + "learning_rate": 4.699785185824026e-05, + "loss": 0.2818, + "step": 4728 + }, + { + "epoch": 0.25960482985729966, + "grad_norm": 2.022494316101074, + "learning_rate": 4.6995363586024975e-05, + "loss": 0.3749, + "step": 4730 + }, + { + "epoch": 0.2597145993413831, + "grad_norm": 1.7914493083953857, + "learning_rate": 4.6992874348985093e-05, + "loss": 0.436, + "step": 4732 + }, + { + "epoch": 0.2598243688254665, + "grad_norm": 1.8793193101882935, + "learning_rate": 4.699038414722979e-05, + "loss": 0.2585, + "step": 4734 + }, + { + "epoch": 0.2599341383095499, + "grad_norm": 3.7166292667388916, + "learning_rate": 4.6987892980868296e-05, + "loss": 0.5426, + "step": 4736 + }, + { + "epoch": 0.26004390779363334, + "grad_norm": 2.679379463195801, + "learning_rate": 4.6985400850009894e-05, + "loss": 0.377, + "step": 4738 + }, + { + "epoch": 0.2601536772777168, + "grad_norm": 2.0310747623443604, + "learning_rate": 4.6982907754763906e-05, + "loss": 0.3756, + "step": 4740 + }, + { + "epoch": 0.26026344676180024, + "grad_norm": 1.9372042417526245, + "learning_rate": 4.698041369523969e-05, + "loss": 0.4174, + "step": 4742 + }, + { + "epoch": 0.26037321624588367, + "grad_norm": 1.9384922981262207, + "learning_rate": 4.697791867154663e-05, + "loss": 0.3963, + "step": 4744 + }, + { + "epoch": 0.2604829857299671, + "grad_norm": 1.3601526021957397, + "learning_rate": 4.697542268379419e-05, + "loss": 0.3379, + "step": 4746 + }, + { + "epoch": 0.2605927552140505, + "grad_norm": 2.373607635498047, + "learning_rate": 4.697292573209185e-05, + "loss": 0.4306, + "step": 4748 + }, + { + "epoch": 0.26070252469813393, + "grad_norm": 1.2488517761230469, + "learning_rate": 4.697042781654913e-05, + "loss": 0.2986, + "step": 4750 + }, + { + "epoch": 0.26081229418221735, + "grad_norm": 1.3470221757888794, + "learning_rate": 4.696792893727562e-05, + "loss": 0.3457, + "step": 4752 + }, + { + "epoch": 0.2609220636663008, + "grad_norm": 1.2644425630569458, + "learning_rate": 4.696542909438092e-05, + "loss": 0.2725, + "step": 4754 + }, + { + "epoch": 0.2610318331503842, + "grad_norm": 3.311817169189453, + "learning_rate": 4.696292828797468e-05, + "loss": 0.3507, + "step": 4756 + }, + { + "epoch": 0.2611416026344676, + "grad_norm": 1.5684577226638794, + "learning_rate": 4.6960426518166615e-05, + "loss": 0.3048, + "step": 4758 + }, + { + "epoch": 0.26125137211855104, + "grad_norm": 1.552272915840149, + "learning_rate": 4.6957923785066445e-05, + "loss": 0.4088, + "step": 4760 + }, + { + "epoch": 0.26136114160263446, + "grad_norm": 1.6488155126571655, + "learning_rate": 4.695542008878397e-05, + "loss": 0.3777, + "step": 4762 + }, + { + "epoch": 0.2614709110867179, + "grad_norm": 1.6034860610961914, + "learning_rate": 4.6952915429429e-05, + "loss": 0.2736, + "step": 4764 + }, + { + "epoch": 0.2615806805708013, + "grad_norm": 1.3857738971710205, + "learning_rate": 4.695040980711141e-05, + "loss": 0.3562, + "step": 4766 + }, + { + "epoch": 0.2616904500548847, + "grad_norm": 3.665613889694214, + "learning_rate": 4.694790322194111e-05, + "loss": 0.6663, + "step": 4768 + }, + { + "epoch": 0.26180021953896815, + "grad_norm": 1.9810984134674072, + "learning_rate": 4.6945395674028046e-05, + "loss": 0.4377, + "step": 4770 + }, + { + "epoch": 0.26190998902305157, + "grad_norm": 3.321455240249634, + "learning_rate": 4.694288716348221e-05, + "loss": 0.5615, + "step": 4772 + }, + { + "epoch": 0.262019758507135, + "grad_norm": 2.822528600692749, + "learning_rate": 4.694037769041365e-05, + "loss": 0.4772, + "step": 4774 + }, + { + "epoch": 0.2621295279912184, + "grad_norm": 2.6785247325897217, + "learning_rate": 4.693786725493242e-05, + "loss": 0.5176, + "step": 4776 + }, + { + "epoch": 0.2622392974753019, + "grad_norm": 1.4879597425460815, + "learning_rate": 4.6935355857148663e-05, + "loss": 0.3168, + "step": 4778 + }, + { + "epoch": 0.2623490669593853, + "grad_norm": 1.6412516832351685, + "learning_rate": 4.693284349717254e-05, + "loss": 0.3665, + "step": 4780 + }, + { + "epoch": 0.26245883644346873, + "grad_norm": 2.242095947265625, + "learning_rate": 4.693033017511424e-05, + "loss": 0.3118, + "step": 4782 + }, + { + "epoch": 0.26256860592755216, + "grad_norm": 1.6707425117492676, + "learning_rate": 4.692781589108402e-05, + "loss": 0.4204, + "step": 4784 + }, + { + "epoch": 0.2626783754116356, + "grad_norm": 1.3135994672775269, + "learning_rate": 4.692530064519217e-05, + "loss": 0.428, + "step": 4786 + }, + { + "epoch": 0.262788144895719, + "grad_norm": 1.8831923007965088, + "learning_rate": 4.692278443754901e-05, + "loss": 0.4209, + "step": 4788 + }, + { + "epoch": 0.2628979143798024, + "grad_norm": 2.152506113052368, + "learning_rate": 4.692026726826493e-05, + "loss": 0.3562, + "step": 4790 + }, + { + "epoch": 0.26300768386388584, + "grad_norm": 2.0633046627044678, + "learning_rate": 4.691774913745033e-05, + "loss": 0.4421, + "step": 4792 + }, + { + "epoch": 0.26311745334796927, + "grad_norm": 2.742335557937622, + "learning_rate": 4.691523004521567e-05, + "loss": 0.4045, + "step": 4794 + }, + { + "epoch": 0.2632272228320527, + "grad_norm": 1.4854223728179932, + "learning_rate": 4.691270999167147e-05, + "loss": 0.3011, + "step": 4796 + }, + { + "epoch": 0.2633369923161361, + "grad_norm": 1.4618455171585083, + "learning_rate": 4.6910188976928246e-05, + "loss": 0.3643, + "step": 4798 + }, + { + "epoch": 0.26344676180021953, + "grad_norm": 2.0567007064819336, + "learning_rate": 4.690766700109659e-05, + "loss": 0.4995, + "step": 4800 + }, + { + "epoch": 0.26355653128430295, + "grad_norm": 2.3695528507232666, + "learning_rate": 4.690514406428713e-05, + "loss": 0.4756, + "step": 4802 + }, + { + "epoch": 0.2636663007683864, + "grad_norm": 2.0661203861236572, + "learning_rate": 4.690262016661054e-05, + "loss": 0.2713, + "step": 4804 + }, + { + "epoch": 0.2637760702524698, + "grad_norm": 2.290130376815796, + "learning_rate": 4.690009530817753e-05, + "loss": 0.3926, + "step": 4806 + }, + { + "epoch": 0.2638858397365532, + "grad_norm": 2.294353723526001, + "learning_rate": 4.689756948909884e-05, + "loss": 0.4138, + "step": 4808 + }, + { + "epoch": 0.26399560922063664, + "grad_norm": 1.5947372913360596, + "learning_rate": 4.689504270948527e-05, + "loss": 0.2949, + "step": 4810 + }, + { + "epoch": 0.26410537870472006, + "grad_norm": 2.2821764945983887, + "learning_rate": 4.6892514969447664e-05, + "loss": 0.3024, + "step": 4812 + }, + { + "epoch": 0.26421514818880354, + "grad_norm": 1.744213342666626, + "learning_rate": 4.6889986269096894e-05, + "loss": 0.4461, + "step": 4814 + }, + { + "epoch": 0.26432491767288696, + "grad_norm": 3.483344316482544, + "learning_rate": 4.688745660854388e-05, + "loss": 0.5341, + "step": 4816 + }, + { + "epoch": 0.2644346871569704, + "grad_norm": 1.7360198497772217, + "learning_rate": 4.68849259878996e-05, + "loss": 0.295, + "step": 4818 + }, + { + "epoch": 0.2645444566410538, + "grad_norm": 1.7672652006149292, + "learning_rate": 4.6882394407275044e-05, + "loss": 0.3443, + "step": 4820 + }, + { + "epoch": 0.2646542261251372, + "grad_norm": 2.0433177947998047, + "learning_rate": 4.687986186678126e-05, + "loss": 0.2894, + "step": 4822 + }, + { + "epoch": 0.26476399560922065, + "grad_norm": 1.6798961162567139, + "learning_rate": 4.6877328366529346e-05, + "loss": 0.3888, + "step": 4824 + }, + { + "epoch": 0.26487376509330407, + "grad_norm": 1.9383891820907593, + "learning_rate": 4.6874793906630424e-05, + "loss": 0.3131, + "step": 4826 + }, + { + "epoch": 0.2649835345773875, + "grad_norm": 3.7728660106658936, + "learning_rate": 4.687225848719568e-05, + "loss": 0.4415, + "step": 4828 + }, + { + "epoch": 0.2650933040614709, + "grad_norm": 1.4867984056472778, + "learning_rate": 4.6869722108336323e-05, + "loss": 0.3032, + "step": 4830 + }, + { + "epoch": 0.26520307354555434, + "grad_norm": 2.869490623474121, + "learning_rate": 4.686718477016361e-05, + "loss": 0.3183, + "step": 4832 + }, + { + "epoch": 0.26531284302963776, + "grad_norm": 2.190741539001465, + "learning_rate": 4.6864646472788845e-05, + "loss": 0.3211, + "step": 4834 + }, + { + "epoch": 0.2654226125137212, + "grad_norm": 1.4252122640609741, + "learning_rate": 4.686210721632336e-05, + "loss": 0.2923, + "step": 4836 + }, + { + "epoch": 0.2655323819978046, + "grad_norm": 2.8772904872894287, + "learning_rate": 4.685956700087856e-05, + "loss": 0.4077, + "step": 4838 + }, + { + "epoch": 0.265642151481888, + "grad_norm": 1.760129451751709, + "learning_rate": 4.685702582656584e-05, + "loss": 0.2909, + "step": 4840 + }, + { + "epoch": 0.26575192096597144, + "grad_norm": 2.4548532962799072, + "learning_rate": 4.6854483693496696e-05, + "loss": 0.3155, + "step": 4842 + }, + { + "epoch": 0.26586169045005487, + "grad_norm": 1.8641449213027954, + "learning_rate": 4.6851940601782635e-05, + "loss": 0.5974, + "step": 4844 + }, + { + "epoch": 0.2659714599341383, + "grad_norm": 2.3503122329711914, + "learning_rate": 4.68493965515352e-05, + "loss": 0.3434, + "step": 4846 + }, + { + "epoch": 0.2660812294182217, + "grad_norm": 2.506556510925293, + "learning_rate": 4.684685154286599e-05, + "loss": 0.3892, + "step": 4848 + }, + { + "epoch": 0.26619099890230513, + "grad_norm": 1.5786720514297485, + "learning_rate": 4.684430557588664e-05, + "loss": 0.5216, + "step": 4850 + }, + { + "epoch": 0.2663007683863886, + "grad_norm": 2.8884167671203613, + "learning_rate": 4.6841758650708824e-05, + "loss": 0.6222, + "step": 4852 + }, + { + "epoch": 0.26641053787047203, + "grad_norm": 2.419877290725708, + "learning_rate": 4.683921076744427e-05, + "loss": 0.4035, + "step": 4854 + }, + { + "epoch": 0.26652030735455545, + "grad_norm": 1.5553789138793945, + "learning_rate": 4.6836661926204736e-05, + "loss": 0.2853, + "step": 4856 + }, + { + "epoch": 0.2666300768386389, + "grad_norm": 2.778242826461792, + "learning_rate": 4.6834112127102036e-05, + "loss": 0.3902, + "step": 4858 + }, + { + "epoch": 0.2667398463227223, + "grad_norm": 2.0836665630340576, + "learning_rate": 4.683156137024801e-05, + "loss": 0.4226, + "step": 4860 + }, + { + "epoch": 0.2668496158068057, + "grad_norm": 2.6553308963775635, + "learning_rate": 4.6829009655754544e-05, + "loss": 0.4387, + "step": 4862 + }, + { + "epoch": 0.26695938529088914, + "grad_norm": 1.2762423753738403, + "learning_rate": 4.682645698373357e-05, + "loss": 0.3214, + "step": 4864 + }, + { + "epoch": 0.26706915477497256, + "grad_norm": 3.1089305877685547, + "learning_rate": 4.682390335429706e-05, + "loss": 0.722, + "step": 4866 + }, + { + "epoch": 0.267178924259056, + "grad_norm": 1.202528953552246, + "learning_rate": 4.682134876755704e-05, + "loss": 0.32, + "step": 4868 + }, + { + "epoch": 0.2672886937431394, + "grad_norm": 1.8145580291748047, + "learning_rate": 4.681879322362555e-05, + "loss": 0.3599, + "step": 4870 + }, + { + "epoch": 0.2673984632272228, + "grad_norm": 1.52018404006958, + "learning_rate": 4.681623672261469e-05, + "loss": 0.3643, + "step": 4872 + }, + { + "epoch": 0.26750823271130625, + "grad_norm": 2.9365739822387695, + "learning_rate": 4.6813679264636625e-05, + "loss": 0.2805, + "step": 4874 + }, + { + "epoch": 0.26761800219538967, + "grad_norm": 3.086212635040283, + "learning_rate": 4.68111208498035e-05, + "loss": 0.3583, + "step": 4876 + }, + { + "epoch": 0.2677277716794731, + "grad_norm": 1.6019089221954346, + "learning_rate": 4.6808561478227576e-05, + "loss": 0.3027, + "step": 4878 + }, + { + "epoch": 0.2678375411635565, + "grad_norm": 1.062882661819458, + "learning_rate": 4.68060011500211e-05, + "loss": 0.364, + "step": 4880 + }, + { + "epoch": 0.26794731064763994, + "grad_norm": 1.3273016214370728, + "learning_rate": 4.680343986529637e-05, + "loss": 0.4016, + "step": 4882 + }, + { + "epoch": 0.26805708013172336, + "grad_norm": 1.5129411220550537, + "learning_rate": 4.680087762416576e-05, + "loss": 0.3998, + "step": 4884 + }, + { + "epoch": 0.2681668496158068, + "grad_norm": 2.599259376525879, + "learning_rate": 4.679831442674165e-05, + "loss": 0.4595, + "step": 4886 + }, + { + "epoch": 0.26827661909989026, + "grad_norm": 1.556980013847351, + "learning_rate": 4.679575027313649e-05, + "loss": 0.3706, + "step": 4888 + }, + { + "epoch": 0.2683863885839737, + "grad_norm": 1.671441674232483, + "learning_rate": 4.6793185163462726e-05, + "loss": 0.3192, + "step": 4890 + }, + { + "epoch": 0.2684961580680571, + "grad_norm": 2.6737468242645264, + "learning_rate": 4.67906190978329e-05, + "loss": 0.4074, + "step": 4892 + }, + { + "epoch": 0.2686059275521405, + "grad_norm": 1.47415030002594, + "learning_rate": 4.678805207635957e-05, + "loss": 0.3148, + "step": 4894 + }, + { + "epoch": 0.26871569703622394, + "grad_norm": 2.0890209674835205, + "learning_rate": 4.678548409915532e-05, + "loss": 0.4033, + "step": 4896 + }, + { + "epoch": 0.26882546652030737, + "grad_norm": 2.3564929962158203, + "learning_rate": 4.678291516633282e-05, + "loss": 0.3244, + "step": 4898 + }, + { + "epoch": 0.2689352360043908, + "grad_norm": 2.3734793663024902, + "learning_rate": 4.678034527800474e-05, + "loss": 0.3991, + "step": 4900 + }, + { + "epoch": 0.2690450054884742, + "grad_norm": 2.5375282764434814, + "learning_rate": 4.677777443428381e-05, + "loss": 0.3232, + "step": 4902 + }, + { + "epoch": 0.26915477497255763, + "grad_norm": 2.461303472518921, + "learning_rate": 4.67752026352828e-05, + "loss": 0.2763, + "step": 4904 + }, + { + "epoch": 0.26926454445664105, + "grad_norm": 2.7253170013427734, + "learning_rate": 4.677262988111453e-05, + "loss": 0.5334, + "step": 4906 + }, + { + "epoch": 0.2693743139407245, + "grad_norm": 1.989006757736206, + "learning_rate": 4.6770056171891846e-05, + "loss": 0.3488, + "step": 4908 + }, + { + "epoch": 0.2694840834248079, + "grad_norm": 2.776144504547119, + "learning_rate": 4.6767481507727646e-05, + "loss": 0.5355, + "step": 4910 + }, + { + "epoch": 0.2695938529088913, + "grad_norm": 1.5577946901321411, + "learning_rate": 4.676490588873486e-05, + "loss": 0.3957, + "step": 4912 + }, + { + "epoch": 0.26970362239297474, + "grad_norm": 1.7412978410720825, + "learning_rate": 4.676232931502648e-05, + "loss": 0.3658, + "step": 4914 + }, + { + "epoch": 0.26981339187705816, + "grad_norm": 1.2808969020843506, + "learning_rate": 4.675975178671551e-05, + "loss": 0.4401, + "step": 4916 + }, + { + "epoch": 0.2699231613611416, + "grad_norm": 1.1340497732162476, + "learning_rate": 4.6757173303915035e-05, + "loss": 0.3376, + "step": 4918 + }, + { + "epoch": 0.270032930845225, + "grad_norm": 2.394625663757324, + "learning_rate": 4.675459386673815e-05, + "loss": 0.39, + "step": 4920 + }, + { + "epoch": 0.2701427003293084, + "grad_norm": 2.1911871433258057, + "learning_rate": 4.675201347529799e-05, + "loss": 0.3621, + "step": 4922 + }, + { + "epoch": 0.2702524698133919, + "grad_norm": 1.2999705076217651, + "learning_rate": 4.674943212970776e-05, + "loss": 0.5528, + "step": 4924 + }, + { + "epoch": 0.2703622392974753, + "grad_norm": 2.1126561164855957, + "learning_rate": 4.674684983008067e-05, + "loss": 0.3747, + "step": 4926 + }, + { + "epoch": 0.27047200878155875, + "grad_norm": 5.744015216827393, + "learning_rate": 4.674426657653003e-05, + "loss": 0.356, + "step": 4928 + }, + { + "epoch": 0.27058177826564217, + "grad_norm": 1.0003608465194702, + "learning_rate": 4.6741682369169116e-05, + "loss": 0.4084, + "step": 4930 + }, + { + "epoch": 0.2706915477497256, + "grad_norm": 2.353253126144409, + "learning_rate": 4.6739097208111306e-05, + "loss": 0.5086, + "step": 4932 + }, + { + "epoch": 0.270801317233809, + "grad_norm": 1.794062852859497, + "learning_rate": 4.673651109346998e-05, + "loss": 0.264, + "step": 4934 + }, + { + "epoch": 0.27091108671789244, + "grad_norm": 1.2812281847000122, + "learning_rate": 4.6733924025358597e-05, + "loss": 0.3361, + "step": 4936 + }, + { + "epoch": 0.27102085620197586, + "grad_norm": 1.9027810096740723, + "learning_rate": 4.673133600389063e-05, + "loss": 0.3903, + "step": 4938 + }, + { + "epoch": 0.2711306256860593, + "grad_norm": 3.208103895187378, + "learning_rate": 4.67287470291796e-05, + "loss": 0.3794, + "step": 4940 + }, + { + "epoch": 0.2712403951701427, + "grad_norm": 3.2382125854492188, + "learning_rate": 4.672615710133907e-05, + "loss": 0.4465, + "step": 4942 + }, + { + "epoch": 0.2713501646542261, + "grad_norm": 1.5088406801223755, + "learning_rate": 4.6723566220482664e-05, + "loss": 0.4327, + "step": 4944 + }, + { + "epoch": 0.27145993413830954, + "grad_norm": 2.5356733798980713, + "learning_rate": 4.672097438672401e-05, + "loss": 0.3778, + "step": 4946 + }, + { + "epoch": 0.27156970362239297, + "grad_norm": 2.5118110179901123, + "learning_rate": 4.671838160017681e-05, + "loss": 0.4809, + "step": 4948 + }, + { + "epoch": 0.2716794731064764, + "grad_norm": 3.01845121383667, + "learning_rate": 4.671578786095478e-05, + "loss": 0.5428, + "step": 4950 + }, + { + "epoch": 0.2717892425905598, + "grad_norm": 2.08864688873291, + "learning_rate": 4.6713193169171724e-05, + "loss": 0.4389, + "step": 4952 + }, + { + "epoch": 0.27189901207464323, + "grad_norm": 1.4552528858184814, + "learning_rate": 4.671059752494143e-05, + "loss": 0.4628, + "step": 4954 + }, + { + "epoch": 0.27200878155872665, + "grad_norm": 1.3443033695220947, + "learning_rate": 4.670800092837777e-05, + "loss": 0.3897, + "step": 4956 + }, + { + "epoch": 0.2721185510428101, + "grad_norm": 2.3445942401885986, + "learning_rate": 4.6705403379594634e-05, + "loss": 0.4415, + "step": 4958 + }, + { + "epoch": 0.2722283205268935, + "grad_norm": 4.476299285888672, + "learning_rate": 4.670280487870598e-05, + "loss": 0.3471, + "step": 4960 + }, + { + "epoch": 0.272338090010977, + "grad_norm": 2.669999361038208, + "learning_rate": 4.670020542582578e-05, + "loss": 0.3535, + "step": 4962 + }, + { + "epoch": 0.2724478594950604, + "grad_norm": 1.7698752880096436, + "learning_rate": 4.669760502106805e-05, + "loss": 0.4258, + "step": 4964 + }, + { + "epoch": 0.2725576289791438, + "grad_norm": 1.6142781972885132, + "learning_rate": 4.6695003664546876e-05, + "loss": 0.4454, + "step": 4966 + }, + { + "epoch": 0.27266739846322724, + "grad_norm": 6.760419845581055, + "learning_rate": 4.669240135637635e-05, + "loss": 0.4188, + "step": 4968 + }, + { + "epoch": 0.27277716794731066, + "grad_norm": 4.211716175079346, + "learning_rate": 4.6689798096670626e-05, + "loss": 0.3824, + "step": 4970 + }, + { + "epoch": 0.2728869374313941, + "grad_norm": 1.785603642463684, + "learning_rate": 4.66871938855439e-05, + "loss": 0.4372, + "step": 4972 + }, + { + "epoch": 0.2729967069154775, + "grad_norm": 4.8777642250061035, + "learning_rate": 4.668458872311041e-05, + "loss": 0.3481, + "step": 4974 + }, + { + "epoch": 0.2731064763995609, + "grad_norm": 2.4677581787109375, + "learning_rate": 4.6681982609484416e-05, + "loss": 0.5691, + "step": 4976 + }, + { + "epoch": 0.27321624588364435, + "grad_norm": 1.8335014581680298, + "learning_rate": 4.667937554478025e-05, + "loss": 0.5004, + "step": 4978 + }, + { + "epoch": 0.27332601536772777, + "grad_norm": 1.6653109788894653, + "learning_rate": 4.667676752911225e-05, + "loss": 0.2838, + "step": 4980 + }, + { + "epoch": 0.2734357848518112, + "grad_norm": 1.6297502517700195, + "learning_rate": 4.6674158562594845e-05, + "loss": 0.3545, + "step": 4982 + }, + { + "epoch": 0.2735455543358946, + "grad_norm": 2.2819221019744873, + "learning_rate": 4.6671548645342456e-05, + "loss": 0.322, + "step": 4984 + }, + { + "epoch": 0.27365532381997804, + "grad_norm": 1.6334096193313599, + "learning_rate": 4.666893777746957e-05, + "loss": 0.2312, + "step": 4986 + }, + { + "epoch": 0.27376509330406146, + "grad_norm": 1.1485649347305298, + "learning_rate": 4.666632595909072e-05, + "loss": 0.3443, + "step": 4988 + }, + { + "epoch": 0.2738748627881449, + "grad_norm": 3.060880422592163, + "learning_rate": 4.666371319032047e-05, + "loss": 0.6143, + "step": 4990 + }, + { + "epoch": 0.2739846322722283, + "grad_norm": 2.9461028575897217, + "learning_rate": 4.666109947127343e-05, + "loss": 0.6196, + "step": 4992 + }, + { + "epoch": 0.2740944017563117, + "grad_norm": 1.5640993118286133, + "learning_rate": 4.665848480206424e-05, + "loss": 0.3791, + "step": 4994 + }, + { + "epoch": 0.27420417124039514, + "grad_norm": 1.1234089136123657, + "learning_rate": 4.665586918280761e-05, + "loss": 0.2937, + "step": 4996 + }, + { + "epoch": 0.2743139407244786, + "grad_norm": 1.7399846315383911, + "learning_rate": 4.665325261361826e-05, + "loss": 0.338, + "step": 4998 + }, + { + "epoch": 0.27442371020856204, + "grad_norm": 1.7686043977737427, + "learning_rate": 4.665063509461097e-05, + "loss": 0.3218, + "step": 5000 + }, + { + "epoch": 0.27453347969264547, + "grad_norm": 1.5894302129745483, + "learning_rate": 4.664801662590055e-05, + "loss": 0.3733, + "step": 5002 + }, + { + "epoch": 0.2746432491767289, + "grad_norm": 2.3869364261627197, + "learning_rate": 4.6645397207601884e-05, + "loss": 0.4407, + "step": 5004 + }, + { + "epoch": 0.2747530186608123, + "grad_norm": 1.193390965461731, + "learning_rate": 4.664277683982984e-05, + "loss": 0.3528, + "step": 5006 + }, + { + "epoch": 0.27486278814489573, + "grad_norm": 1.6554293632507324, + "learning_rate": 4.6640155522699374e-05, + "loss": 0.4081, + "step": 5008 + }, + { + "epoch": 0.27497255762897915, + "grad_norm": 1.6681755781173706, + "learning_rate": 4.663753325632548e-05, + "loss": 0.4427, + "step": 5010 + }, + { + "epoch": 0.2750823271130626, + "grad_norm": 1.7224030494689941, + "learning_rate": 4.663491004082316e-05, + "loss": 0.3345, + "step": 5012 + }, + { + "epoch": 0.275192096597146, + "grad_norm": 3.1266226768493652, + "learning_rate": 4.6632285876307514e-05, + "loss": 0.3771, + "step": 5014 + }, + { + "epoch": 0.2753018660812294, + "grad_norm": 2.8438141345977783, + "learning_rate": 4.662966076289362e-05, + "loss": 0.4087, + "step": 5016 + }, + { + "epoch": 0.27541163556531284, + "grad_norm": 2.7767374515533447, + "learning_rate": 4.6627034700696634e-05, + "loss": 0.6495, + "step": 5018 + }, + { + "epoch": 0.27552140504939626, + "grad_norm": 1.9211156368255615, + "learning_rate": 4.662440768983177e-05, + "loss": 0.4286, + "step": 5020 + }, + { + "epoch": 0.2756311745334797, + "grad_norm": 1.8659816980361938, + "learning_rate": 4.662177973041424e-05, + "loss": 0.3554, + "step": 5022 + }, + { + "epoch": 0.2757409440175631, + "grad_norm": 1.9950428009033203, + "learning_rate": 4.661915082255932e-05, + "loss": 0.2785, + "step": 5024 + }, + { + "epoch": 0.2758507135016465, + "grad_norm": 2.3656623363494873, + "learning_rate": 4.661652096638234e-05, + "loss": 0.4731, + "step": 5026 + }, + { + "epoch": 0.27596048298572995, + "grad_norm": 1.6787320375442505, + "learning_rate": 4.661389016199864e-05, + "loss": 0.3894, + "step": 5028 + }, + { + "epoch": 0.27607025246981337, + "grad_norm": 2.6948201656341553, + "learning_rate": 4.661125840952364e-05, + "loss": 0.5489, + "step": 5030 + }, + { + "epoch": 0.2761800219538968, + "grad_norm": 2.0787813663482666, + "learning_rate": 4.660862570907277e-05, + "loss": 0.4278, + "step": 5032 + }, + { + "epoch": 0.2762897914379802, + "grad_norm": 3.3339693546295166, + "learning_rate": 4.660599206076151e-05, + "loss": 0.31, + "step": 5034 + }, + { + "epoch": 0.2763995609220637, + "grad_norm": 2.2401158809661865, + "learning_rate": 4.660335746470539e-05, + "loss": 0.4137, + "step": 5036 + }, + { + "epoch": 0.2765093304061471, + "grad_norm": 1.3709335327148438, + "learning_rate": 4.660072192101999e-05, + "loss": 0.4398, + "step": 5038 + }, + { + "epoch": 0.27661909989023054, + "grad_norm": 2.4560275077819824, + "learning_rate": 4.659808542982088e-05, + "loss": 0.5048, + "step": 5040 + }, + { + "epoch": 0.27672886937431396, + "grad_norm": 1.8032898902893066, + "learning_rate": 4.659544799122375e-05, + "loss": 0.3665, + "step": 5042 + }, + { + "epoch": 0.2768386388583974, + "grad_norm": 2.1860618591308594, + "learning_rate": 4.6592809605344276e-05, + "loss": 0.5021, + "step": 5044 + }, + { + "epoch": 0.2769484083424808, + "grad_norm": 1.28166663646698, + "learning_rate": 4.6590170272298187e-05, + "loss": 0.3115, + "step": 5046 + }, + { + "epoch": 0.2770581778265642, + "grad_norm": 3.624868631362915, + "learning_rate": 4.658752999220125e-05, + "loss": 0.6279, + "step": 5048 + }, + { + "epoch": 0.27716794731064764, + "grad_norm": 1.2606903314590454, + "learning_rate": 4.6584888765169296e-05, + "loss": 0.3261, + "step": 5050 + }, + { + "epoch": 0.27727771679473107, + "grad_norm": 2.2374062538146973, + "learning_rate": 4.6582246591318175e-05, + "loss": 0.3287, + "step": 5052 + }, + { + "epoch": 0.2773874862788145, + "grad_norm": 1.3106646537780762, + "learning_rate": 4.657960347076379e-05, + "loss": 0.2742, + "step": 5054 + }, + { + "epoch": 0.2774972557628979, + "grad_norm": 1.7318843603134155, + "learning_rate": 4.657695940362207e-05, + "loss": 0.4133, + "step": 5056 + }, + { + "epoch": 0.27760702524698133, + "grad_norm": 1.3177231550216675, + "learning_rate": 4.657431439000901e-05, + "loss": 0.3213, + "step": 5058 + }, + { + "epoch": 0.27771679473106475, + "grad_norm": 1.5283461809158325, + "learning_rate": 4.6571668430040625e-05, + "loss": 0.4518, + "step": 5060 + }, + { + "epoch": 0.2778265642151482, + "grad_norm": 1.254949927330017, + "learning_rate": 4.656902152383299e-05, + "loss": 0.3772, + "step": 5062 + }, + { + "epoch": 0.2779363336992316, + "grad_norm": 1.7956005334854126, + "learning_rate": 4.6566373671502196e-05, + "loss": 0.4395, + "step": 5064 + }, + { + "epoch": 0.278046103183315, + "grad_norm": 1.3647994995117188, + "learning_rate": 4.6563724873164397e-05, + "loss": 0.302, + "step": 5066 + }, + { + "epoch": 0.27815587266739844, + "grad_norm": 1.1879534721374512, + "learning_rate": 4.656107512893579e-05, + "loss": 0.4634, + "step": 5068 + }, + { + "epoch": 0.27826564215148186, + "grad_norm": 1.632498860359192, + "learning_rate": 4.65584244389326e-05, + "loss": 0.3325, + "step": 5070 + }, + { + "epoch": 0.27837541163556534, + "grad_norm": 2.089772939682007, + "learning_rate": 4.65557728032711e-05, + "loss": 0.4001, + "step": 5072 + }, + { + "epoch": 0.27848518111964876, + "grad_norm": 3.0594189167022705, + "learning_rate": 4.6553120222067605e-05, + "loss": 0.2632, + "step": 5074 + }, + { + "epoch": 0.2785949506037322, + "grad_norm": 2.1679129600524902, + "learning_rate": 4.655046669543845e-05, + "loss": 0.3697, + "step": 5076 + }, + { + "epoch": 0.2787047200878156, + "grad_norm": 2.0419795513153076, + "learning_rate": 4.654781222350007e-05, + "loss": 0.3083, + "step": 5078 + }, + { + "epoch": 0.278814489571899, + "grad_norm": 2.9475533962249756, + "learning_rate": 4.654515680636888e-05, + "loss": 0.2927, + "step": 5080 + }, + { + "epoch": 0.27892425905598245, + "grad_norm": 1.522895097732544, + "learning_rate": 4.654250044416136e-05, + "loss": 0.3617, + "step": 5082 + }, + { + "epoch": 0.27903402854006587, + "grad_norm": 2.2824013233184814, + "learning_rate": 4.6539843136994036e-05, + "loss": 0.3916, + "step": 5084 + }, + { + "epoch": 0.2791437980241493, + "grad_norm": 2.213261127471924, + "learning_rate": 4.653718488498346e-05, + "loss": 0.3443, + "step": 5086 + }, + { + "epoch": 0.2792535675082327, + "grad_norm": 1.4255458116531372, + "learning_rate": 4.653452568824625e-05, + "loss": 0.3474, + "step": 5088 + }, + { + "epoch": 0.27936333699231614, + "grad_norm": 1.976433515548706, + "learning_rate": 4.6531865546899045e-05, + "loss": 0.4542, + "step": 5090 + }, + { + "epoch": 0.27947310647639956, + "grad_norm": 2.200190544128418, + "learning_rate": 4.652920446105853e-05, + "loss": 0.4353, + "step": 5092 + }, + { + "epoch": 0.279582875960483, + "grad_norm": 2.1255643367767334, + "learning_rate": 4.6526542430841436e-05, + "loss": 0.353, + "step": 5094 + }, + { + "epoch": 0.2796926454445664, + "grad_norm": 1.914396047592163, + "learning_rate": 4.652387945636454e-05, + "loss": 0.4017, + "step": 5096 + }, + { + "epoch": 0.2798024149286498, + "grad_norm": 2.1788501739501953, + "learning_rate": 4.652121553774464e-05, + "loss": 0.4876, + "step": 5098 + }, + { + "epoch": 0.27991218441273324, + "grad_norm": 1.8762497901916504, + "learning_rate": 4.65185506750986e-05, + "loss": 0.4428, + "step": 5100 + }, + { + "epoch": 0.28002195389681667, + "grad_norm": 2.878673553466797, + "learning_rate": 4.65158848685433e-05, + "loss": 0.4693, + "step": 5102 + }, + { + "epoch": 0.2801317233809001, + "grad_norm": 2.183546781539917, + "learning_rate": 4.651321811819568e-05, + "loss": 0.4895, + "step": 5104 + }, + { + "epoch": 0.2802414928649835, + "grad_norm": 1.7452061176300049, + "learning_rate": 4.6510550424172726e-05, + "loss": 0.3419, + "step": 5106 + }, + { + "epoch": 0.28035126234906693, + "grad_norm": 2.0205183029174805, + "learning_rate": 4.650788178659146e-05, + "loss": 0.2544, + "step": 5108 + }, + { + "epoch": 0.2804610318331504, + "grad_norm": 1.7092475891113281, + "learning_rate": 4.6505212205568916e-05, + "loss": 0.4333, + "step": 5110 + }, + { + "epoch": 0.28057080131723383, + "grad_norm": 1.503493070602417, + "learning_rate": 4.650254168122222e-05, + "loss": 0.3222, + "step": 5112 + }, + { + "epoch": 0.28068057080131725, + "grad_norm": 1.7057902812957764, + "learning_rate": 4.64998702136685e-05, + "loss": 0.3917, + "step": 5114 + }, + { + "epoch": 0.2807903402854007, + "grad_norm": 2.2710580825805664, + "learning_rate": 4.649719780302495e-05, + "loss": 0.3494, + "step": 5116 + }, + { + "epoch": 0.2809001097694841, + "grad_norm": 1.6810355186462402, + "learning_rate": 4.6494524449408786e-05, + "loss": 0.4347, + "step": 5118 + }, + { + "epoch": 0.2810098792535675, + "grad_norm": 2.48734974861145, + "learning_rate": 4.649185015293728e-05, + "loss": 0.3111, + "step": 5120 + }, + { + "epoch": 0.28111964873765094, + "grad_norm": 2.315481662750244, + "learning_rate": 4.648917491372774e-05, + "loss": 0.4669, + "step": 5122 + }, + { + "epoch": 0.28122941822173436, + "grad_norm": 1.201666235923767, + "learning_rate": 4.648649873189751e-05, + "loss": 0.3601, + "step": 5124 + }, + { + "epoch": 0.2813391877058178, + "grad_norm": 1.4723236560821533, + "learning_rate": 4.648382160756398e-05, + "loss": 0.295, + "step": 5126 + }, + { + "epoch": 0.2814489571899012, + "grad_norm": 1.3640230894088745, + "learning_rate": 4.648114354084459e-05, + "loss": 0.4437, + "step": 5128 + }, + { + "epoch": 0.2815587266739846, + "grad_norm": 2.5573999881744385, + "learning_rate": 4.647846453185681e-05, + "loss": 0.3898, + "step": 5130 + }, + { + "epoch": 0.28166849615806805, + "grad_norm": 1.8792080879211426, + "learning_rate": 4.6475784580718155e-05, + "loss": 0.4236, + "step": 5132 + }, + { + "epoch": 0.28177826564215147, + "grad_norm": 1.3642044067382812, + "learning_rate": 4.647310368754617e-05, + "loss": 0.3634, + "step": 5134 + }, + { + "epoch": 0.2818880351262349, + "grad_norm": 1.352081537246704, + "learning_rate": 4.647042185245847e-05, + "loss": 0.2695, + "step": 5136 + }, + { + "epoch": 0.2819978046103183, + "grad_norm": 1.6134836673736572, + "learning_rate": 4.646773907557268e-05, + "loss": 0.592, + "step": 5138 + }, + { + "epoch": 0.28210757409440174, + "grad_norm": 1.5316119194030762, + "learning_rate": 4.646505535700649e-05, + "loss": 0.4239, + "step": 5140 + }, + { + "epoch": 0.28221734357848516, + "grad_norm": 1.9974910020828247, + "learning_rate": 4.646237069687761e-05, + "loss": 0.4943, + "step": 5142 + }, + { + "epoch": 0.2823271130625686, + "grad_norm": 2.4622867107391357, + "learning_rate": 4.645968509530381e-05, + "loss": 0.4236, + "step": 5144 + }, + { + "epoch": 0.28243688254665206, + "grad_norm": 2.17594051361084, + "learning_rate": 4.6456998552402884e-05, + "loss": 0.3456, + "step": 5146 + }, + { + "epoch": 0.2825466520307355, + "grad_norm": 2.71008563041687, + "learning_rate": 4.64543110682927e-05, + "loss": 0.4104, + "step": 5148 + }, + { + "epoch": 0.2826564215148189, + "grad_norm": 2.6877880096435547, + "learning_rate": 4.645162264309112e-05, + "loss": 0.6268, + "step": 5150 + }, + { + "epoch": 0.2827661909989023, + "grad_norm": 3.6680097579956055, + "learning_rate": 4.6448933276916076e-05, + "loss": 0.2842, + "step": 5152 + }, + { + "epoch": 0.28287596048298574, + "grad_norm": 2.4233009815216064, + "learning_rate": 4.6446242969885546e-05, + "loss": 0.3642, + "step": 5154 + }, + { + "epoch": 0.28298572996706917, + "grad_norm": 2.0302693843841553, + "learning_rate": 4.644355172211753e-05, + "loss": 0.3756, + "step": 5156 + }, + { + "epoch": 0.2830954994511526, + "grad_norm": 2.1701619625091553, + "learning_rate": 4.644085953373008e-05, + "loss": 0.4232, + "step": 5158 + }, + { + "epoch": 0.283205268935236, + "grad_norm": 2.1602017879486084, + "learning_rate": 4.643816640484131e-05, + "loss": 0.3556, + "step": 5160 + }, + { + "epoch": 0.28331503841931943, + "grad_norm": 1.6955937147140503, + "learning_rate": 4.6435472335569324e-05, + "loss": 0.3109, + "step": 5162 + }, + { + "epoch": 0.28342480790340285, + "grad_norm": 2.2151601314544678, + "learning_rate": 4.6432777326032316e-05, + "loss": 0.2819, + "step": 5164 + }, + { + "epoch": 0.2835345773874863, + "grad_norm": 3.420269012451172, + "learning_rate": 4.643008137634849e-05, + "loss": 0.4196, + "step": 5166 + }, + { + "epoch": 0.2836443468715697, + "grad_norm": 2.3938751220703125, + "learning_rate": 4.6427384486636113e-05, + "loss": 0.3666, + "step": 5168 + }, + { + "epoch": 0.2837541163556531, + "grad_norm": 2.1297707557678223, + "learning_rate": 4.6424686657013484e-05, + "loss": 0.3283, + "step": 5170 + }, + { + "epoch": 0.28386388583973654, + "grad_norm": 2.2806806564331055, + "learning_rate": 4.642198788759894e-05, + "loss": 0.4072, + "step": 5172 + }, + { + "epoch": 0.28397365532381996, + "grad_norm": 2.3277506828308105, + "learning_rate": 4.641928817851086e-05, + "loss": 0.4106, + "step": 5174 + }, + { + "epoch": 0.2840834248079034, + "grad_norm": 1.728433609008789, + "learning_rate": 4.6416587529867664e-05, + "loss": 0.3595, + "step": 5176 + }, + { + "epoch": 0.2841931942919868, + "grad_norm": 1.961196780204773, + "learning_rate": 4.641388594178782e-05, + "loss": 0.3403, + "step": 5178 + }, + { + "epoch": 0.2843029637760702, + "grad_norm": 1.6812809705734253, + "learning_rate": 4.6411183414389837e-05, + "loss": 0.5739, + "step": 5180 + }, + { + "epoch": 0.28441273326015365, + "grad_norm": 1.0600223541259766, + "learning_rate": 4.640847994779226e-05, + "loss": 0.2887, + "step": 5182 + }, + { + "epoch": 0.2845225027442371, + "grad_norm": 2.154599905014038, + "learning_rate": 4.640577554211366e-05, + "loss": 0.3817, + "step": 5184 + }, + { + "epoch": 0.28463227222832055, + "grad_norm": 1.4455523490905762, + "learning_rate": 4.6403070197472695e-05, + "loss": 0.3163, + "step": 5186 + }, + { + "epoch": 0.28474204171240397, + "grad_norm": 1.711634635925293, + "learning_rate": 4.640036391398801e-05, + "loss": 0.4308, + "step": 5188 + }, + { + "epoch": 0.2848518111964874, + "grad_norm": 2.017641544342041, + "learning_rate": 4.639765669177833e-05, + "loss": 0.367, + "step": 5190 + }, + { + "epoch": 0.2849615806805708, + "grad_norm": 6.528948783874512, + "learning_rate": 4.6394948530962396e-05, + "loss": 0.3057, + "step": 5192 + }, + { + "epoch": 0.28507135016465424, + "grad_norm": 1.8255685567855835, + "learning_rate": 4.6392239431659014e-05, + "loss": 0.407, + "step": 5194 + }, + { + "epoch": 0.28518111964873766, + "grad_norm": 2.0191519260406494, + "learning_rate": 4.6389529393987e-05, + "loss": 0.3346, + "step": 5196 + }, + { + "epoch": 0.2852908891328211, + "grad_norm": 3.8364617824554443, + "learning_rate": 4.6386818418065244e-05, + "loss": 0.3307, + "step": 5198 + }, + { + "epoch": 0.2854006586169045, + "grad_norm": 2.5395500659942627, + "learning_rate": 4.638410650401267e-05, + "loss": 0.2786, + "step": 5200 + }, + { + "epoch": 0.2855104281009879, + "grad_norm": 1.8829036951065063, + "learning_rate": 4.638139365194821e-05, + "loss": 0.3834, + "step": 5202 + }, + { + "epoch": 0.28562019758507134, + "grad_norm": 2.3882975578308105, + "learning_rate": 4.637867986199089e-05, + "loss": 0.3524, + "step": 5204 + }, + { + "epoch": 0.28572996706915477, + "grad_norm": 2.119128465652466, + "learning_rate": 4.637596513425974e-05, + "loss": 0.2889, + "step": 5206 + }, + { + "epoch": 0.2858397365532382, + "grad_norm": 3.4326934814453125, + "learning_rate": 4.6373249468873833e-05, + "loss": 0.3965, + "step": 5208 + }, + { + "epoch": 0.2859495060373216, + "grad_norm": 1.9750043153762817, + "learning_rate": 4.6370532865952296e-05, + "loss": 0.3649, + "step": 5210 + }, + { + "epoch": 0.28605927552140503, + "grad_norm": 2.2871830463409424, + "learning_rate": 4.6367815325614306e-05, + "loss": 0.4147, + "step": 5212 + }, + { + "epoch": 0.28616904500548845, + "grad_norm": 2.233435869216919, + "learning_rate": 4.6365096847979046e-05, + "loss": 0.2989, + "step": 5214 + }, + { + "epoch": 0.2862788144895719, + "grad_norm": 1.5967321395874023, + "learning_rate": 4.636237743316578e-05, + "loss": 0.317, + "step": 5216 + }, + { + "epoch": 0.2863885839736553, + "grad_norm": 1.318194031715393, + "learning_rate": 4.6359657081293775e-05, + "loss": 0.3484, + "step": 5218 + }, + { + "epoch": 0.2864983534577388, + "grad_norm": 1.2868449687957764, + "learning_rate": 4.635693579248238e-05, + "loss": 0.2814, + "step": 5220 + }, + { + "epoch": 0.2866081229418222, + "grad_norm": 2.5087194442749023, + "learning_rate": 4.6354213566850955e-05, + "loss": 0.4442, + "step": 5222 + }, + { + "epoch": 0.2867178924259056, + "grad_norm": 1.3336997032165527, + "learning_rate": 4.635149040451891e-05, + "loss": 0.3016, + "step": 5224 + }, + { + "epoch": 0.28682766190998904, + "grad_norm": 1.6989197731018066, + "learning_rate": 4.634876630560569e-05, + "loss": 0.3054, + "step": 5226 + }, + { + "epoch": 0.28693743139407246, + "grad_norm": 1.3276766538619995, + "learning_rate": 4.6346041270230804e-05, + "loss": 0.3149, + "step": 5228 + }, + { + "epoch": 0.2870472008781559, + "grad_norm": 4.1111741065979, + "learning_rate": 4.6343315298513765e-05, + "loss": 0.2705, + "step": 5230 + }, + { + "epoch": 0.2871569703622393, + "grad_norm": 1.753036618232727, + "learning_rate": 4.634058839057417e-05, + "loss": 0.2759, + "step": 5232 + }, + { + "epoch": 0.2872667398463227, + "grad_norm": 1.8832874298095703, + "learning_rate": 4.633786054653162e-05, + "loss": 0.2553, + "step": 5234 + }, + { + "epoch": 0.28737650933040615, + "grad_norm": 2.1354451179504395, + "learning_rate": 4.633513176650577e-05, + "loss": 0.3871, + "step": 5236 + }, + { + "epoch": 0.28748627881448957, + "grad_norm": 2.5656309127807617, + "learning_rate": 4.633240205061632e-05, + "loss": 0.328, + "step": 5238 + }, + { + "epoch": 0.287596048298573, + "grad_norm": 1.5373066663742065, + "learning_rate": 4.632967139898301e-05, + "loss": 0.4027, + "step": 5240 + }, + { + "epoch": 0.2877058177826564, + "grad_norm": 1.6504428386688232, + "learning_rate": 4.6326939811725624e-05, + "loss": 0.2016, + "step": 5242 + }, + { + "epoch": 0.28781558726673984, + "grad_norm": 2.525275945663452, + "learning_rate": 4.6324207288963974e-05, + "loss": 0.4935, + "step": 5244 + }, + { + "epoch": 0.28792535675082326, + "grad_norm": 1.7025080919265747, + "learning_rate": 4.632147383081793e-05, + "loss": 0.3403, + "step": 5246 + }, + { + "epoch": 0.2880351262349067, + "grad_norm": 2.0041182041168213, + "learning_rate": 4.63187394374074e-05, + "loss": 0.4375, + "step": 5248 + }, + { + "epoch": 0.2881448957189901, + "grad_norm": 2.7189455032348633, + "learning_rate": 4.6316004108852305e-05, + "loss": 0.3724, + "step": 5250 + }, + { + "epoch": 0.2882546652030735, + "grad_norm": 2.0608959197998047, + "learning_rate": 4.6313267845272656e-05, + "loss": 0.3518, + "step": 5252 + }, + { + "epoch": 0.28836443468715695, + "grad_norm": 2.042259931564331, + "learning_rate": 4.631053064678846e-05, + "loss": 0.3068, + "step": 5254 + }, + { + "epoch": 0.2884742041712404, + "grad_norm": 2.360116481781006, + "learning_rate": 4.63077925135198e-05, + "loss": 0.4463, + "step": 5256 + }, + { + "epoch": 0.28858397365532384, + "grad_norm": 2.0600154399871826, + "learning_rate": 4.630505344558677e-05, + "loss": 0.3829, + "step": 5258 + }, + { + "epoch": 0.28869374313940727, + "grad_norm": 2.1825990676879883, + "learning_rate": 4.6302313443109526e-05, + "loss": 0.3497, + "step": 5260 + }, + { + "epoch": 0.2888035126234907, + "grad_norm": 2.330157995223999, + "learning_rate": 4.629957250620826e-05, + "loss": 0.287, + "step": 5262 + }, + { + "epoch": 0.2889132821075741, + "grad_norm": 1.6980255842208862, + "learning_rate": 4.629683063500319e-05, + "loss": 0.3756, + "step": 5264 + }, + { + "epoch": 0.28902305159165753, + "grad_norm": 2.773566246032715, + "learning_rate": 4.6294087829614595e-05, + "loss": 0.3495, + "step": 5266 + }, + { + "epoch": 0.28913282107574095, + "grad_norm": 2.302861213684082, + "learning_rate": 4.6291344090162804e-05, + "loss": 0.4127, + "step": 5268 + }, + { + "epoch": 0.2892425905598244, + "grad_norm": 1.8646020889282227, + "learning_rate": 4.6288599416768155e-05, + "loss": 0.4102, + "step": 5270 + }, + { + "epoch": 0.2893523600439078, + "grad_norm": 1.80588960647583, + "learning_rate": 4.6285853809551036e-05, + "loss": 0.4411, + "step": 5272 + }, + { + "epoch": 0.2894621295279912, + "grad_norm": 1.0934592485427856, + "learning_rate": 4.6283107268631895e-05, + "loss": 0.3284, + "step": 5274 + }, + { + "epoch": 0.28957189901207464, + "grad_norm": 4.069481372833252, + "learning_rate": 4.628035979413121e-05, + "loss": 0.367, + "step": 5276 + }, + { + "epoch": 0.28968166849615806, + "grad_norm": 1.8416523933410645, + "learning_rate": 4.627761138616949e-05, + "loss": 0.3324, + "step": 5278 + }, + { + "epoch": 0.2897914379802415, + "grad_norm": 1.9438947439193726, + "learning_rate": 4.6274862044867304e-05, + "loss": 0.4962, + "step": 5280 + }, + { + "epoch": 0.2899012074643249, + "grad_norm": 3.1091723442077637, + "learning_rate": 4.627211177034524e-05, + "loss": 0.4895, + "step": 5282 + }, + { + "epoch": 0.2900109769484083, + "grad_norm": 1.8128358125686646, + "learning_rate": 4.626936056272394e-05, + "loss": 0.3703, + "step": 5284 + }, + { + "epoch": 0.29012074643249175, + "grad_norm": 2.0967230796813965, + "learning_rate": 4.62666084221241e-05, + "loss": 0.4513, + "step": 5286 + }, + { + "epoch": 0.29023051591657517, + "grad_norm": 2.8992698192596436, + "learning_rate": 4.626385534866642e-05, + "loss": 0.3713, + "step": 5288 + }, + { + "epoch": 0.2903402854006586, + "grad_norm": 1.7371035814285278, + "learning_rate": 4.626110134247168e-05, + "loss": 0.2798, + "step": 5290 + }, + { + "epoch": 0.290450054884742, + "grad_norm": 1.5077639818191528, + "learning_rate": 4.625834640366068e-05, + "loss": 0.3785, + "step": 5292 + }, + { + "epoch": 0.2905598243688255, + "grad_norm": 1.860842227935791, + "learning_rate": 4.625559053235427e-05, + "loss": 0.3462, + "step": 5294 + }, + { + "epoch": 0.2906695938529089, + "grad_norm": 2.364990711212158, + "learning_rate": 4.625283372867333e-05, + "loss": 0.391, + "step": 5296 + }, + { + "epoch": 0.29077936333699234, + "grad_norm": 3.2872836589813232, + "learning_rate": 4.625007599273879e-05, + "loss": 0.4052, + "step": 5298 + }, + { + "epoch": 0.29088913282107576, + "grad_norm": 2.379528045654297, + "learning_rate": 4.6247317324671605e-05, + "loss": 0.2976, + "step": 5300 + }, + { + "epoch": 0.2909989023051592, + "grad_norm": 1.9431992769241333, + "learning_rate": 4.624455772459279e-05, + "loss": 0.3919, + "step": 5302 + }, + { + "epoch": 0.2911086717892426, + "grad_norm": 1.4238536357879639, + "learning_rate": 4.624179719262342e-05, + "loss": 0.4261, + "step": 5304 + }, + { + "epoch": 0.291218441273326, + "grad_norm": 1.7025543451309204, + "learning_rate": 4.623903572888454e-05, + "loss": 0.2739, + "step": 5306 + }, + { + "epoch": 0.29132821075740944, + "grad_norm": 1.420092225074768, + "learning_rate": 4.623627333349732e-05, + "loss": 0.3207, + "step": 5308 + }, + { + "epoch": 0.29143798024149287, + "grad_norm": 1.8426563739776611, + "learning_rate": 4.6233510006582914e-05, + "loss": 0.3606, + "step": 5310 + }, + { + "epoch": 0.2915477497255763, + "grad_norm": 1.035178542137146, + "learning_rate": 4.623074574826254e-05, + "loss": 0.2904, + "step": 5312 + }, + { + "epoch": 0.2916575192096597, + "grad_norm": 1.4558377265930176, + "learning_rate": 4.622798055865746e-05, + "loss": 0.2773, + "step": 5314 + }, + { + "epoch": 0.29176728869374313, + "grad_norm": 1.6469902992248535, + "learning_rate": 4.622521443788894e-05, + "loss": 0.3866, + "step": 5316 + }, + { + "epoch": 0.29187705817782655, + "grad_norm": 3.0068001747131348, + "learning_rate": 4.622244738607835e-05, + "loss": 0.3863, + "step": 5318 + }, + { + "epoch": 0.29198682766191, + "grad_norm": 1.6398367881774902, + "learning_rate": 4.621967940334705e-05, + "loss": 0.3239, + "step": 5320 + }, + { + "epoch": 0.2920965971459934, + "grad_norm": 1.1634970903396606, + "learning_rate": 4.6216910489816455e-05, + "loss": 0.3188, + "step": 5322 + }, + { + "epoch": 0.2922063666300768, + "grad_norm": 1.470943808555603, + "learning_rate": 4.621414064560803e-05, + "loss": 0.3485, + "step": 5324 + }, + { + "epoch": 0.29231613611416024, + "grad_norm": 3.681824207305908, + "learning_rate": 4.621136987084327e-05, + "loss": 0.4187, + "step": 5326 + }, + { + "epoch": 0.29242590559824366, + "grad_norm": 1.9648160934448242, + "learning_rate": 4.6208598165643715e-05, + "loss": 0.4148, + "step": 5328 + }, + { + "epoch": 0.29253567508232714, + "grad_norm": 1.39838445186615, + "learning_rate": 4.6205825530130944e-05, + "loss": 0.351, + "step": 5330 + }, + { + "epoch": 0.29264544456641056, + "grad_norm": 1.8846259117126465, + "learning_rate": 4.620305196442659e-05, + "loss": 0.3903, + "step": 5332 + }, + { + "epoch": 0.292755214050494, + "grad_norm": 2.070674419403076, + "learning_rate": 4.62002774686523e-05, + "loss": 0.3614, + "step": 5334 + }, + { + "epoch": 0.2928649835345774, + "grad_norm": 1.962031364440918, + "learning_rate": 4.619750204292978e-05, + "loss": 0.346, + "step": 5336 + }, + { + "epoch": 0.2929747530186608, + "grad_norm": 2.5104870796203613, + "learning_rate": 4.619472568738078e-05, + "loss": 0.579, + "step": 5338 + }, + { + "epoch": 0.29308452250274425, + "grad_norm": 2.8968944549560547, + "learning_rate": 4.619194840212708e-05, + "loss": 0.3078, + "step": 5340 + }, + { + "epoch": 0.29319429198682767, + "grad_norm": 2.758690595626831, + "learning_rate": 4.618917018729051e-05, + "loss": 0.3797, + "step": 5342 + }, + { + "epoch": 0.2933040614709111, + "grad_norm": 1.317427635192871, + "learning_rate": 4.618639104299294e-05, + "loss": 0.2443, + "step": 5344 + }, + { + "epoch": 0.2934138309549945, + "grad_norm": 2.666754722595215, + "learning_rate": 4.618361096935626e-05, + "loss": 0.3427, + "step": 5346 + }, + { + "epoch": 0.29352360043907794, + "grad_norm": 2.8820319175720215, + "learning_rate": 4.618082996650243e-05, + "loss": 0.5212, + "step": 5348 + }, + { + "epoch": 0.29363336992316136, + "grad_norm": 3.694446325302124, + "learning_rate": 4.617804803455344e-05, + "loss": 0.383, + "step": 5350 + }, + { + "epoch": 0.2937431394072448, + "grad_norm": 1.455602765083313, + "learning_rate": 4.61752651736313e-05, + "loss": 0.4315, + "step": 5352 + }, + { + "epoch": 0.2938529088913282, + "grad_norm": 4.168053150177002, + "learning_rate": 4.617248138385811e-05, + "loss": 0.3885, + "step": 5354 + }, + { + "epoch": 0.2939626783754116, + "grad_norm": 2.0924758911132812, + "learning_rate": 4.616969666535596e-05, + "loss": 0.5825, + "step": 5356 + }, + { + "epoch": 0.29407244785949505, + "grad_norm": 1.4998453855514526, + "learning_rate": 4.6166911018247004e-05, + "loss": 0.3786, + "step": 5358 + }, + { + "epoch": 0.29418221734357847, + "grad_norm": 2.205134391784668, + "learning_rate": 4.616412444265345e-05, + "loss": 0.4698, + "step": 5360 + }, + { + "epoch": 0.2942919868276619, + "grad_norm": 1.465649962425232, + "learning_rate": 4.616133693869751e-05, + "loss": 0.3554, + "step": 5362 + }, + { + "epoch": 0.2944017563117453, + "grad_norm": 2.451044797897339, + "learning_rate": 4.6158548506501464e-05, + "loss": 0.5051, + "step": 5364 + }, + { + "epoch": 0.29451152579582873, + "grad_norm": 2.3961081504821777, + "learning_rate": 4.615575914618763e-05, + "loss": 0.5436, + "step": 5366 + }, + { + "epoch": 0.2946212952799122, + "grad_norm": 1.9824758768081665, + "learning_rate": 4.6152968857878366e-05, + "loss": 0.471, + "step": 5368 + }, + { + "epoch": 0.29473106476399563, + "grad_norm": 1.7763067483901978, + "learning_rate": 4.6150177641696055e-05, + "loss": 0.3034, + "step": 5370 + }, + { + "epoch": 0.29484083424807905, + "grad_norm": 2.2930848598480225, + "learning_rate": 4.614738549776315e-05, + "loss": 0.3184, + "step": 5372 + }, + { + "epoch": 0.2949506037321625, + "grad_norm": 3.1483540534973145, + "learning_rate": 4.6144592426202114e-05, + "loss": 0.3159, + "step": 5374 + }, + { + "epoch": 0.2950603732162459, + "grad_norm": 2.5750505924224854, + "learning_rate": 4.614179842713547e-05, + "loss": 0.3213, + "step": 5376 + }, + { + "epoch": 0.2951701427003293, + "grad_norm": 1.188085913658142, + "learning_rate": 4.613900350068578e-05, + "loss": 0.2546, + "step": 5378 + }, + { + "epoch": 0.29527991218441274, + "grad_norm": 1.3436830043792725, + "learning_rate": 4.613620764697564e-05, + "loss": 0.4251, + "step": 5380 + }, + { + "epoch": 0.29538968166849616, + "grad_norm": 2.723936080932617, + "learning_rate": 4.613341086612769e-05, + "loss": 0.392, + "step": 5382 + }, + { + "epoch": 0.2954994511525796, + "grad_norm": 1.7387757301330566, + "learning_rate": 4.613061315826461e-05, + "loss": 0.3824, + "step": 5384 + }, + { + "epoch": 0.295609220636663, + "grad_norm": 1.5200152397155762, + "learning_rate": 4.6127814523509114e-05, + "loss": 0.3106, + "step": 5386 + }, + { + "epoch": 0.29571899012074643, + "grad_norm": 1.7627485990524292, + "learning_rate": 4.612501496198398e-05, + "loss": 0.3849, + "step": 5388 + }, + { + "epoch": 0.29582875960482985, + "grad_norm": 2.439096450805664, + "learning_rate": 4.6122214473812005e-05, + "loss": 0.3051, + "step": 5390 + }, + { + "epoch": 0.29593852908891327, + "grad_norm": 2.4600894451141357, + "learning_rate": 4.611941305911602e-05, + "loss": 0.3499, + "step": 5392 + }, + { + "epoch": 0.2960482985729967, + "grad_norm": 1.8222928047180176, + "learning_rate": 4.6116610718018925e-05, + "loss": 0.4222, + "step": 5394 + }, + { + "epoch": 0.2961580680570801, + "grad_norm": 1.6992032527923584, + "learning_rate": 4.611380745064363e-05, + "loss": 0.2597, + "step": 5396 + }, + { + "epoch": 0.29626783754116354, + "grad_norm": 3.3589417934417725, + "learning_rate": 4.611100325711312e-05, + "loss": 0.3319, + "step": 5398 + }, + { + "epoch": 0.29637760702524696, + "grad_norm": 2.0518698692321777, + "learning_rate": 4.610819813755038e-05, + "loss": 0.468, + "step": 5400 + }, + { + "epoch": 0.2964873765093304, + "grad_norm": 1.358229637145996, + "learning_rate": 4.6105392092078464e-05, + "loss": 0.3693, + "step": 5402 + }, + { + "epoch": 0.29659714599341386, + "grad_norm": 1.4449390172958374, + "learning_rate": 4.610258512082046e-05, + "loss": 0.261, + "step": 5404 + }, + { + "epoch": 0.2967069154774973, + "grad_norm": 1.2498023509979248, + "learning_rate": 4.60997772238995e-05, + "loss": 0.3036, + "step": 5406 + }, + { + "epoch": 0.2968166849615807, + "grad_norm": 1.8005753755569458, + "learning_rate": 4.6096968401438745e-05, + "loss": 0.4, + "step": 5408 + }, + { + "epoch": 0.2969264544456641, + "grad_norm": 1.7074607610702515, + "learning_rate": 4.60941586535614e-05, + "loss": 0.283, + "step": 5410 + }, + { + "epoch": 0.29703622392974754, + "grad_norm": 1.6943377256393433, + "learning_rate": 4.609134798039073e-05, + "loss": 0.3612, + "step": 5412 + }, + { + "epoch": 0.29714599341383097, + "grad_norm": 1.8173588514328003, + "learning_rate": 4.6088536382050006e-05, + "loss": 0.4386, + "step": 5414 + }, + { + "epoch": 0.2972557628979144, + "grad_norm": 1.5733003616333008, + "learning_rate": 4.608572385866257e-05, + "loss": 0.4028, + "step": 5416 + }, + { + "epoch": 0.2973655323819978, + "grad_norm": 2.0327939987182617, + "learning_rate": 4.608291041035179e-05, + "loss": 0.3217, + "step": 5418 + }, + { + "epoch": 0.29747530186608123, + "grad_norm": 1.9989784955978394, + "learning_rate": 4.6080096037241086e-05, + "loss": 0.3677, + "step": 5420 + }, + { + "epoch": 0.29758507135016465, + "grad_norm": 1.5722163915634155, + "learning_rate": 4.6077280739453893e-05, + "loss": 0.2975, + "step": 5422 + }, + { + "epoch": 0.2976948408342481, + "grad_norm": 1.2504435777664185, + "learning_rate": 4.607446451711372e-05, + "loss": 0.3869, + "step": 5424 + }, + { + "epoch": 0.2978046103183315, + "grad_norm": 3.5008623600006104, + "learning_rate": 4.607164737034409e-05, + "loss": 0.4123, + "step": 5426 + }, + { + "epoch": 0.2979143798024149, + "grad_norm": 1.7499910593032837, + "learning_rate": 4.606882929926858e-05, + "loss": 0.4843, + "step": 5428 + }, + { + "epoch": 0.29802414928649834, + "grad_norm": 1.4702811241149902, + "learning_rate": 4.606601030401081e-05, + "loss": 0.3573, + "step": 5430 + }, + { + "epoch": 0.29813391877058176, + "grad_norm": 2.4298317432403564, + "learning_rate": 4.606319038469443e-05, + "loss": 0.3905, + "step": 5432 + }, + { + "epoch": 0.2982436882546652, + "grad_norm": 2.2422409057617188, + "learning_rate": 4.606036954144313e-05, + "loss": 0.3726, + "step": 5434 + }, + { + "epoch": 0.2983534577387486, + "grad_norm": 2.5327041149139404, + "learning_rate": 4.605754777438065e-05, + "loss": 0.437, + "step": 5436 + }, + { + "epoch": 0.29846322722283203, + "grad_norm": 1.1960959434509277, + "learning_rate": 4.6054725083630775e-05, + "loss": 0.3117, + "step": 5438 + }, + { + "epoch": 0.29857299670691545, + "grad_norm": 3.0272958278656006, + "learning_rate": 4.605190146931731e-05, + "loss": 0.449, + "step": 5440 + }, + { + "epoch": 0.2986827661909989, + "grad_norm": 1.30680251121521, + "learning_rate": 4.604907693156412e-05, + "loss": 0.331, + "step": 5442 + }, + { + "epoch": 0.29879253567508235, + "grad_norm": 1.7740246057510376, + "learning_rate": 4.60462514704951e-05, + "loss": 0.4983, + "step": 5444 + }, + { + "epoch": 0.29890230515916577, + "grad_norm": 1.7680453062057495, + "learning_rate": 4.604342508623419e-05, + "loss": 0.3411, + "step": 5446 + }, + { + "epoch": 0.2990120746432492, + "grad_norm": 1.8141471147537231, + "learning_rate": 4.604059777890537e-05, + "loss": 0.401, + "step": 5448 + }, + { + "epoch": 0.2991218441273326, + "grad_norm": 1.6284024715423584, + "learning_rate": 4.6037769548632656e-05, + "loss": 0.3912, + "step": 5450 + }, + { + "epoch": 0.29923161361141604, + "grad_norm": 1.646665096282959, + "learning_rate": 4.603494039554011e-05, + "loss": 0.3129, + "step": 5452 + }, + { + "epoch": 0.29934138309549946, + "grad_norm": 1.7306952476501465, + "learning_rate": 4.603211031975184e-05, + "loss": 0.3743, + "step": 5454 + }, + { + "epoch": 0.2994511525795829, + "grad_norm": 2.4959757328033447, + "learning_rate": 4.602927932139197e-05, + "loss": 0.3955, + "step": 5456 + }, + { + "epoch": 0.2995609220636663, + "grad_norm": 1.343945026397705, + "learning_rate": 4.6026447400584695e-05, + "loss": 0.335, + "step": 5458 + }, + { + "epoch": 0.2996706915477497, + "grad_norm": 1.8081806898117065, + "learning_rate": 4.602361455745423e-05, + "loss": 0.4335, + "step": 5460 + }, + { + "epoch": 0.29978046103183315, + "grad_norm": 1.274646520614624, + "learning_rate": 4.602078079212484e-05, + "loss": 0.3859, + "step": 5462 + }, + { + "epoch": 0.29989023051591657, + "grad_norm": 1.1183141469955444, + "learning_rate": 4.6017946104720836e-05, + "loss": 0.3231, + "step": 5464 + }, + { + "epoch": 0.3, + "grad_norm": 1.4162373542785645, + "learning_rate": 4.6015110495366545e-05, + "loss": 0.2883, + "step": 5466 + }, + { + "epoch": 0.3001097694840834, + "grad_norm": 2.8769636154174805, + "learning_rate": 4.6012273964186365e-05, + "loss": 0.3495, + "step": 5468 + }, + { + "epoch": 0.30021953896816683, + "grad_norm": 1.6873334646224976, + "learning_rate": 4.6009436511304715e-05, + "loss": 0.3063, + "step": 5470 + }, + { + "epoch": 0.30032930845225025, + "grad_norm": 1.8258212804794312, + "learning_rate": 4.6006598136846056e-05, + "loss": 0.2632, + "step": 5472 + }, + { + "epoch": 0.3004390779363337, + "grad_norm": 2.513005495071411, + "learning_rate": 4.60037588409349e-05, + "loss": 0.3876, + "step": 5474 + }, + { + "epoch": 0.3005488474204171, + "grad_norm": 4.46135139465332, + "learning_rate": 4.600091862369579e-05, + "loss": 0.3128, + "step": 5476 + }, + { + "epoch": 0.3006586169045006, + "grad_norm": 1.4418209791183472, + "learning_rate": 4.5998077485253296e-05, + "loss": 0.4823, + "step": 5478 + }, + { + "epoch": 0.300768386388584, + "grad_norm": 5.590057373046875, + "learning_rate": 4.599523542573207e-05, + "loss": 0.4431, + "step": 5480 + }, + { + "epoch": 0.3008781558726674, + "grad_norm": 2.0439181327819824, + "learning_rate": 4.599239244525677e-05, + "loss": 0.3334, + "step": 5482 + }, + { + "epoch": 0.30098792535675084, + "grad_norm": 2.638273000717163, + "learning_rate": 4.59895485439521e-05, + "loss": 0.4517, + "step": 5484 + }, + { + "epoch": 0.30109769484083426, + "grad_norm": 1.2933578491210938, + "learning_rate": 4.59867037219428e-05, + "loss": 0.3665, + "step": 5486 + }, + { + "epoch": 0.3012074643249177, + "grad_norm": 2.076843500137329, + "learning_rate": 4.598385797935368e-05, + "loss": 0.2662, + "step": 5488 + }, + { + "epoch": 0.3013172338090011, + "grad_norm": 2.476914405822754, + "learning_rate": 4.598101131630954e-05, + "loss": 0.4242, + "step": 5490 + }, + { + "epoch": 0.30142700329308453, + "grad_norm": 1.8163065910339355, + "learning_rate": 4.597816373293528e-05, + "loss": 0.447, + "step": 5492 + }, + { + "epoch": 0.30153677277716795, + "grad_norm": 4.010210990905762, + "learning_rate": 4.5975315229355774e-05, + "loss": 0.4418, + "step": 5494 + }, + { + "epoch": 0.30164654226125137, + "grad_norm": 2.7196238040924072, + "learning_rate": 4.5972465805695996e-05, + "loss": 0.4496, + "step": 5496 + }, + { + "epoch": 0.3017563117453348, + "grad_norm": 1.8319209814071655, + "learning_rate": 4.596961546208093e-05, + "loss": 0.3772, + "step": 5498 + }, + { + "epoch": 0.3018660812294182, + "grad_norm": 2.7837724685668945, + "learning_rate": 4.5966764198635606e-05, + "loss": 0.4523, + "step": 5500 + }, + { + "epoch": 0.30197585071350164, + "grad_norm": 4.224636554718018, + "learning_rate": 4.596391201548509e-05, + "loss": 0.5097, + "step": 5502 + }, + { + "epoch": 0.30208562019758506, + "grad_norm": 2.6815133094787598, + "learning_rate": 4.596105891275449e-05, + "loss": 0.4474, + "step": 5504 + }, + { + "epoch": 0.3021953896816685, + "grad_norm": 1.4605159759521484, + "learning_rate": 4.595820489056898e-05, + "loss": 0.3334, + "step": 5506 + }, + { + "epoch": 0.3023051591657519, + "grad_norm": 3.943458080291748, + "learning_rate": 4.595534994905372e-05, + "loss": 0.3496, + "step": 5508 + }, + { + "epoch": 0.3024149286498353, + "grad_norm": 1.4381780624389648, + "learning_rate": 4.5952494088333964e-05, + "loss": 0.402, + "step": 5510 + }, + { + "epoch": 0.30252469813391875, + "grad_norm": 3.3935163021087646, + "learning_rate": 4.594963730853497e-05, + "loss": 0.352, + "step": 5512 + }, + { + "epoch": 0.3026344676180022, + "grad_norm": 2.9393017292022705, + "learning_rate": 4.594677960978206e-05, + "loss": 0.3759, + "step": 5514 + }, + { + "epoch": 0.30274423710208564, + "grad_norm": 2.135578155517578, + "learning_rate": 4.5943920992200585e-05, + "loss": 0.3067, + "step": 5516 + }, + { + "epoch": 0.30285400658616907, + "grad_norm": 1.9016053676605225, + "learning_rate": 4.594106145591594e-05, + "loss": 0.1949, + "step": 5518 + }, + { + "epoch": 0.3029637760702525, + "grad_norm": 1.7622814178466797, + "learning_rate": 4.593820100105355e-05, + "loss": 0.3267, + "step": 5520 + }, + { + "epoch": 0.3030735455543359, + "grad_norm": 1.7016198635101318, + "learning_rate": 4.5935339627738896e-05, + "loss": 0.3307, + "step": 5522 + }, + { + "epoch": 0.30318331503841933, + "grad_norm": 2.994224786758423, + "learning_rate": 4.593247733609748e-05, + "loss": 0.4858, + "step": 5524 + }, + { + "epoch": 0.30329308452250275, + "grad_norm": 3.2511146068573, + "learning_rate": 4.592961412625487e-05, + "loss": 0.393, + "step": 5526 + }, + { + "epoch": 0.3034028540065862, + "grad_norm": 1.2849351167678833, + "learning_rate": 4.592674999833666e-05, + "loss": 0.3864, + "step": 5528 + }, + { + "epoch": 0.3035126234906696, + "grad_norm": 3.0167906284332275, + "learning_rate": 4.592388495246848e-05, + "loss": 0.2743, + "step": 5530 + }, + { + "epoch": 0.303622392974753, + "grad_norm": 1.4064664840698242, + "learning_rate": 4.5921018988776e-05, + "loss": 0.3826, + "step": 5532 + }, + { + "epoch": 0.30373216245883644, + "grad_norm": 1.149636149406433, + "learning_rate": 4.5918152107384945e-05, + "loss": 0.2985, + "step": 5534 + }, + { + "epoch": 0.30384193194291986, + "grad_norm": 2.5770368576049805, + "learning_rate": 4.591528430842107e-05, + "loss": 0.367, + "step": 5536 + }, + { + "epoch": 0.3039517014270033, + "grad_norm": 1.7994356155395508, + "learning_rate": 4.5912415592010164e-05, + "loss": 0.309, + "step": 5538 + }, + { + "epoch": 0.3040614709110867, + "grad_norm": 1.4809250831604004, + "learning_rate": 4.590954595827806e-05, + "loss": 0.3586, + "step": 5540 + }, + { + "epoch": 0.30417124039517013, + "grad_norm": 1.7871793508529663, + "learning_rate": 4.5906675407350644e-05, + "loss": 0.4006, + "step": 5542 + }, + { + "epoch": 0.30428100987925355, + "grad_norm": 1.474789023399353, + "learning_rate": 4.590380393935383e-05, + "loss": 0.3486, + "step": 5544 + }, + { + "epoch": 0.30439077936333697, + "grad_norm": 2.5736758708953857, + "learning_rate": 4.5900931554413575e-05, + "loss": 0.2763, + "step": 5546 + }, + { + "epoch": 0.3045005488474204, + "grad_norm": 1.0010662078857422, + "learning_rate": 4.589805825265587e-05, + "loss": 0.2985, + "step": 5548 + }, + { + "epoch": 0.3046103183315038, + "grad_norm": 1.9489102363586426, + "learning_rate": 4.5895184034206765e-05, + "loss": 0.3267, + "step": 5550 + }, + { + "epoch": 0.3047200878155873, + "grad_norm": 1.5026044845581055, + "learning_rate": 4.589230889919232e-05, + "loss": 0.3188, + "step": 5552 + }, + { + "epoch": 0.3048298572996707, + "grad_norm": 3.182844400405884, + "learning_rate": 4.588943284773866e-05, + "loss": 0.4342, + "step": 5554 + }, + { + "epoch": 0.30493962678375414, + "grad_norm": 1.454245924949646, + "learning_rate": 4.588655587997195e-05, + "loss": 0.392, + "step": 5556 + }, + { + "epoch": 0.30504939626783756, + "grad_norm": 1.3679101467132568, + "learning_rate": 4.588367799601838e-05, + "loss": 0.3289, + "step": 5558 + }, + { + "epoch": 0.305159165751921, + "grad_norm": 1.5380369424819946, + "learning_rate": 4.588079919600419e-05, + "loss": 0.3638, + "step": 5560 + }, + { + "epoch": 0.3052689352360044, + "grad_norm": 3.011547803878784, + "learning_rate": 4.5877919480055654e-05, + "loss": 0.4667, + "step": 5562 + }, + { + "epoch": 0.3053787047200878, + "grad_norm": 1.1638596057891846, + "learning_rate": 4.587503884829909e-05, + "loss": 0.2733, + "step": 5564 + }, + { + "epoch": 0.30548847420417125, + "grad_norm": 2.5310957431793213, + "learning_rate": 4.587215730086087e-05, + "loss": 0.2799, + "step": 5566 + }, + { + "epoch": 0.30559824368825467, + "grad_norm": 2.4314610958099365, + "learning_rate": 4.5869274837867394e-05, + "loss": 0.5165, + "step": 5568 + }, + { + "epoch": 0.3057080131723381, + "grad_norm": 3.0826010704040527, + "learning_rate": 4.586639145944508e-05, + "loss": 0.3172, + "step": 5570 + }, + { + "epoch": 0.3058177826564215, + "grad_norm": 1.9000836610794067, + "learning_rate": 4.5863507165720415e-05, + "loss": 0.3488, + "step": 5572 + }, + { + "epoch": 0.30592755214050493, + "grad_norm": 2.1104824542999268, + "learning_rate": 4.586062195681993e-05, + "loss": 0.567, + "step": 5574 + }, + { + "epoch": 0.30603732162458835, + "grad_norm": 1.4803249835968018, + "learning_rate": 4.5857735832870166e-05, + "loss": 0.3033, + "step": 5576 + }, + { + "epoch": 0.3061470911086718, + "grad_norm": 1.4719065427780151, + "learning_rate": 4.585484879399774e-05, + "loss": 0.3459, + "step": 5578 + }, + { + "epoch": 0.3062568605927552, + "grad_norm": 1.628990888595581, + "learning_rate": 4.585196084032928e-05, + "loss": 0.552, + "step": 5580 + }, + { + "epoch": 0.3063666300768386, + "grad_norm": 2.738065004348755, + "learning_rate": 4.584907197199148e-05, + "loss": 0.4189, + "step": 5582 + }, + { + "epoch": 0.30647639956092204, + "grad_norm": 1.786090612411499, + "learning_rate": 4.5846182189111035e-05, + "loss": 0.314, + "step": 5584 + }, + { + "epoch": 0.30658616904500546, + "grad_norm": 1.197013258934021, + "learning_rate": 4.584329149181473e-05, + "loss": 0.2687, + "step": 5586 + }, + { + "epoch": 0.30669593852908894, + "grad_norm": 1.7458593845367432, + "learning_rate": 4.5840399880229354e-05, + "loss": 0.2689, + "step": 5588 + }, + { + "epoch": 0.30680570801317236, + "grad_norm": 5.751514911651611, + "learning_rate": 4.5837507354481745e-05, + "loss": 0.5973, + "step": 5590 + }, + { + "epoch": 0.3069154774972558, + "grad_norm": 2.4815220832824707, + "learning_rate": 4.583461391469879e-05, + "loss": 0.3314, + "step": 5592 + }, + { + "epoch": 0.3070252469813392, + "grad_norm": 4.5059919357299805, + "learning_rate": 4.5831719561007406e-05, + "loss": 0.3935, + "step": 5594 + }, + { + "epoch": 0.30713501646542263, + "grad_norm": 1.3458720445632935, + "learning_rate": 4.5828824293534555e-05, + "loss": 0.5069, + "step": 5596 + }, + { + "epoch": 0.30724478594950605, + "grad_norm": 2.811023235321045, + "learning_rate": 4.5825928112407236e-05, + "loss": 0.4391, + "step": 5598 + }, + { + "epoch": 0.30735455543358947, + "grad_norm": 2.617219924926758, + "learning_rate": 4.5823031017752485e-05, + "loss": 0.434, + "step": 5600 + }, + { + "epoch": 0.3074643249176729, + "grad_norm": 1.2396986484527588, + "learning_rate": 4.58201330096974e-05, + "loss": 0.4047, + "step": 5602 + }, + { + "epoch": 0.3075740944017563, + "grad_norm": 2.0130834579467773, + "learning_rate": 4.581723408836908e-05, + "loss": 0.3942, + "step": 5604 + }, + { + "epoch": 0.30768386388583974, + "grad_norm": 6.871053695678711, + "learning_rate": 4.5814334253894696e-05, + "loss": 0.3493, + "step": 5606 + }, + { + "epoch": 0.30779363336992316, + "grad_norm": 2.031081438064575, + "learning_rate": 4.5811433506401456e-05, + "loss": 0.3264, + "step": 5608 + }, + { + "epoch": 0.3079034028540066, + "grad_norm": 1.4248323440551758, + "learning_rate": 4.580853184601659e-05, + "loss": 0.4918, + "step": 5610 + }, + { + "epoch": 0.30801317233809, + "grad_norm": 1.798354983329773, + "learning_rate": 4.580562927286738e-05, + "loss": 0.2379, + "step": 5612 + }, + { + "epoch": 0.3081229418221734, + "grad_norm": 2.267484188079834, + "learning_rate": 4.580272578708115e-05, + "loss": 0.3524, + "step": 5614 + }, + { + "epoch": 0.30823271130625685, + "grad_norm": 2.6271800994873047, + "learning_rate": 4.579982138878527e-05, + "loss": 0.3583, + "step": 5616 + }, + { + "epoch": 0.30834248079034027, + "grad_norm": 3.8782756328582764, + "learning_rate": 4.579691607810712e-05, + "loss": 0.3022, + "step": 5618 + }, + { + "epoch": 0.3084522502744237, + "grad_norm": 2.428311824798584, + "learning_rate": 4.579400985517416e-05, + "loss": 0.3687, + "step": 5620 + }, + { + "epoch": 0.3085620197585071, + "grad_norm": 1.2806674242019653, + "learning_rate": 4.5791102720113864e-05, + "loss": 0.2689, + "step": 5622 + }, + { + "epoch": 0.30867178924259053, + "grad_norm": 2.439091444015503, + "learning_rate": 4.5788194673053756e-05, + "loss": 0.3665, + "step": 5624 + }, + { + "epoch": 0.308781558726674, + "grad_norm": 1.869637370109558, + "learning_rate": 4.57852857141214e-05, + "loss": 0.3717, + "step": 5626 + }, + { + "epoch": 0.30889132821075743, + "grad_norm": 2.263754367828369, + "learning_rate": 4.578237584344438e-05, + "loss": 0.2548, + "step": 5628 + }, + { + "epoch": 0.30900109769484085, + "grad_norm": 2.6071882247924805, + "learning_rate": 4.577946506115035e-05, + "loss": 0.37, + "step": 5630 + }, + { + "epoch": 0.3091108671789243, + "grad_norm": 2.456960678100586, + "learning_rate": 4.5776553367367e-05, + "loss": 0.5054, + "step": 5632 + }, + { + "epoch": 0.3092206366630077, + "grad_norm": 1.8399455547332764, + "learning_rate": 4.577364076222204e-05, + "loss": 0.3873, + "step": 5634 + }, + { + "epoch": 0.3093304061470911, + "grad_norm": 2.021223545074463, + "learning_rate": 4.577072724584323e-05, + "loss": 0.3315, + "step": 5636 + }, + { + "epoch": 0.30944017563117454, + "grad_norm": 2.265575647354126, + "learning_rate": 4.576781281835838e-05, + "loss": 0.359, + "step": 5638 + }, + { + "epoch": 0.30954994511525796, + "grad_norm": 3.500415563583374, + "learning_rate": 4.5764897479895317e-05, + "loss": 0.3692, + "step": 5640 + }, + { + "epoch": 0.3096597145993414, + "grad_norm": 2.1575372219085693, + "learning_rate": 4.576198123058193e-05, + "loss": 0.5198, + "step": 5642 + }, + { + "epoch": 0.3097694840834248, + "grad_norm": 2.316368818283081, + "learning_rate": 4.575906407054615e-05, + "loss": 0.3158, + "step": 5644 + }, + { + "epoch": 0.30987925356750823, + "grad_norm": 2.4189231395721436, + "learning_rate": 4.575614599991592e-05, + "loss": 0.4306, + "step": 5646 + }, + { + "epoch": 0.30998902305159165, + "grad_norm": 2.4953320026397705, + "learning_rate": 4.575322701881926e-05, + "loss": 0.301, + "step": 5648 + }, + { + "epoch": 0.31009879253567507, + "grad_norm": 1.7791945934295654, + "learning_rate": 4.575030712738419e-05, + "loss": 0.445, + "step": 5650 + }, + { + "epoch": 0.3102085620197585, + "grad_norm": 3.1498732566833496, + "learning_rate": 4.574738632573881e-05, + "loss": 0.4902, + "step": 5652 + }, + { + "epoch": 0.3103183315038419, + "grad_norm": 2.739542007446289, + "learning_rate": 4.574446461401122e-05, + "loss": 0.5053, + "step": 5654 + }, + { + "epoch": 0.31042810098792534, + "grad_norm": 1.5240960121154785, + "learning_rate": 4.574154199232959e-05, + "loss": 0.3595, + "step": 5656 + }, + { + "epoch": 0.31053787047200876, + "grad_norm": 3.919666051864624, + "learning_rate": 4.5738618460822134e-05, + "loss": 0.4332, + "step": 5658 + }, + { + "epoch": 0.3106476399560922, + "grad_norm": 2.070885181427002, + "learning_rate": 4.573569401961708e-05, + "loss": 0.4334, + "step": 5660 + }, + { + "epoch": 0.31075740944017566, + "grad_norm": 2.1189136505126953, + "learning_rate": 4.573276866884271e-05, + "loss": 0.4202, + "step": 5662 + }, + { + "epoch": 0.3108671789242591, + "grad_norm": 1.3891681432724, + "learning_rate": 4.5729842408627334e-05, + "loss": 0.4918, + "step": 5664 + }, + { + "epoch": 0.3109769484083425, + "grad_norm": 1.5822113752365112, + "learning_rate": 4.5726915239099334e-05, + "loss": 0.3803, + "step": 5666 + }, + { + "epoch": 0.3110867178924259, + "grad_norm": 1.2920863628387451, + "learning_rate": 4.572398716038709e-05, + "loss": 0.3527, + "step": 5668 + }, + { + "epoch": 0.31119648737650935, + "grad_norm": 1.357405185699463, + "learning_rate": 4.572105817261905e-05, + "loss": 0.351, + "step": 5670 + }, + { + "epoch": 0.31130625686059277, + "grad_norm": 2.382232189178467, + "learning_rate": 4.57181282759237e-05, + "loss": 0.346, + "step": 5672 + }, + { + "epoch": 0.3114160263446762, + "grad_norm": 1.46821928024292, + "learning_rate": 4.571519747042955e-05, + "loss": 0.3335, + "step": 5674 + }, + { + "epoch": 0.3115257958287596, + "grad_norm": 2.190321683883667, + "learning_rate": 4.571226575626516e-05, + "loss": 0.3609, + "step": 5676 + }, + { + "epoch": 0.31163556531284303, + "grad_norm": 2.2142090797424316, + "learning_rate": 4.570933313355913e-05, + "loss": 0.4234, + "step": 5678 + }, + { + "epoch": 0.31174533479692645, + "grad_norm": 1.930377721786499, + "learning_rate": 4.5706399602440106e-05, + "loss": 0.3717, + "step": 5680 + }, + { + "epoch": 0.3118551042810099, + "grad_norm": 1.5256797075271606, + "learning_rate": 4.5703465163036764e-05, + "loss": 0.3003, + "step": 5682 + }, + { + "epoch": 0.3119648737650933, + "grad_norm": 1.804304599761963, + "learning_rate": 4.570052981547782e-05, + "loss": 0.4002, + "step": 5684 + }, + { + "epoch": 0.3120746432491767, + "grad_norm": 2.4427528381347656, + "learning_rate": 4.5697593559892026e-05, + "loss": 0.4624, + "step": 5686 + }, + { + "epoch": 0.31218441273326014, + "grad_norm": 1.9109646081924438, + "learning_rate": 4.5694656396408195e-05, + "loss": 0.488, + "step": 5688 + }, + { + "epoch": 0.31229418221734356, + "grad_norm": 2.1416518688201904, + "learning_rate": 4.569171832515516e-05, + "loss": 0.3518, + "step": 5690 + }, + { + "epoch": 0.312403951701427, + "grad_norm": 1.6507787704467773, + "learning_rate": 4.56887793462618e-05, + "loss": 0.306, + "step": 5692 + }, + { + "epoch": 0.3125137211855104, + "grad_norm": 1.512976884841919, + "learning_rate": 4.5685839459857035e-05, + "loss": 0.3776, + "step": 5694 + }, + { + "epoch": 0.31262349066959383, + "grad_norm": 2.870779514312744, + "learning_rate": 4.568289866606981e-05, + "loss": 0.4649, + "step": 5696 + }, + { + "epoch": 0.31273326015367725, + "grad_norm": 2.0973012447357178, + "learning_rate": 4.567995696502914e-05, + "loss": 0.4592, + "step": 5698 + }, + { + "epoch": 0.31284302963776073, + "grad_norm": 1.2871760129928589, + "learning_rate": 4.567701435686404e-05, + "loss": 0.3821, + "step": 5700 + }, + { + "epoch": 0.31295279912184415, + "grad_norm": 1.6614818572998047, + "learning_rate": 4.567407084170362e-05, + "loss": 0.2595, + "step": 5702 + }, + { + "epoch": 0.31306256860592757, + "grad_norm": 1.4438036680221558, + "learning_rate": 4.567112641967697e-05, + "loss": 0.3356, + "step": 5704 + }, + { + "epoch": 0.313172338090011, + "grad_norm": 1.2574738264083862, + "learning_rate": 4.566818109091325e-05, + "loss": 0.4015, + "step": 5706 + }, + { + "epoch": 0.3132821075740944, + "grad_norm": 1.589101791381836, + "learning_rate": 4.5665234855541675e-05, + "loss": 0.3189, + "step": 5708 + }, + { + "epoch": 0.31339187705817784, + "grad_norm": 2.2452473640441895, + "learning_rate": 4.566228771369146e-05, + "loss": 0.4191, + "step": 5710 + }, + { + "epoch": 0.31350164654226126, + "grad_norm": 2.1529221534729004, + "learning_rate": 4.565933966549189e-05, + "loss": 0.2981, + "step": 5712 + }, + { + "epoch": 0.3136114160263447, + "grad_norm": 1.8043906688690186, + "learning_rate": 4.5656390711072285e-05, + "loss": 0.4197, + "step": 5714 + }, + { + "epoch": 0.3137211855104281, + "grad_norm": 1.8023546934127808, + "learning_rate": 4.5653440850561986e-05, + "loss": 0.4162, + "step": 5716 + }, + { + "epoch": 0.3138309549945115, + "grad_norm": 3.01224684715271, + "learning_rate": 4.56504900840904e-05, + "loss": 0.4124, + "step": 5718 + }, + { + "epoch": 0.31394072447859495, + "grad_norm": 1.381068229675293, + "learning_rate": 4.564753841178697e-05, + "loss": 0.3495, + "step": 5720 + }, + { + "epoch": 0.31405049396267837, + "grad_norm": 1.592443823814392, + "learning_rate": 4.564458583378115e-05, + "loss": 0.4974, + "step": 5722 + }, + { + "epoch": 0.3141602634467618, + "grad_norm": 3.042708396911621, + "learning_rate": 4.564163235020247e-05, + "loss": 0.537, + "step": 5724 + }, + { + "epoch": 0.3142700329308452, + "grad_norm": 2.8122713565826416, + "learning_rate": 4.563867796118049e-05, + "loss": 0.3467, + "step": 5726 + }, + { + "epoch": 0.31437980241492863, + "grad_norm": 1.498807430267334, + "learning_rate": 4.5635722666844775e-05, + "loss": 0.3096, + "step": 5728 + }, + { + "epoch": 0.31448957189901205, + "grad_norm": 2.3261382579803467, + "learning_rate": 4.563276646732499e-05, + "loss": 0.3784, + "step": 5730 + }, + { + "epoch": 0.3145993413830955, + "grad_norm": 1.9380112886428833, + "learning_rate": 4.56298093627508e-05, + "loss": 0.4092, + "step": 5732 + }, + { + "epoch": 0.3147091108671789, + "grad_norm": 2.100959062576294, + "learning_rate": 4.562685135325191e-05, + "loss": 0.2961, + "step": 5734 + }, + { + "epoch": 0.3148188803512624, + "grad_norm": 2.142310380935669, + "learning_rate": 4.5623892438958074e-05, + "loss": 0.2625, + "step": 5736 + }, + { + "epoch": 0.3149286498353458, + "grad_norm": 2.2872326374053955, + "learning_rate": 4.5620932619999084e-05, + "loss": 0.3844, + "step": 5738 + }, + { + "epoch": 0.3150384193194292, + "grad_norm": 1.5456137657165527, + "learning_rate": 4.561797189650478e-05, + "loss": 0.3061, + "step": 5740 + }, + { + "epoch": 0.31514818880351264, + "grad_norm": 1.7517894506454468, + "learning_rate": 4.561501026860503e-05, + "loss": 0.4775, + "step": 5742 + }, + { + "epoch": 0.31525795828759606, + "grad_norm": 1.3008642196655273, + "learning_rate": 4.561204773642974e-05, + "loss": 0.2411, + "step": 5744 + }, + { + "epoch": 0.3153677277716795, + "grad_norm": 1.5336843729019165, + "learning_rate": 4.5609084300108875e-05, + "loss": 0.36, + "step": 5746 + }, + { + "epoch": 0.3154774972557629, + "grad_norm": 1.4759721755981445, + "learning_rate": 4.560611995977242e-05, + "loss": 0.4507, + "step": 5748 + }, + { + "epoch": 0.31558726673984633, + "grad_norm": 2.579155683517456, + "learning_rate": 4.5603154715550386e-05, + "loss": 0.3077, + "step": 5750 + }, + { + "epoch": 0.31569703622392975, + "grad_norm": 1.1508653163909912, + "learning_rate": 4.5600188567572876e-05, + "loss": 0.2848, + "step": 5752 + }, + { + "epoch": 0.31580680570801317, + "grad_norm": 1.3202242851257324, + "learning_rate": 4.559722151596998e-05, + "loss": 0.2866, + "step": 5754 + }, + { + "epoch": 0.3159165751920966, + "grad_norm": 3.110534191131592, + "learning_rate": 4.5594253560871854e-05, + "loss": 0.4276, + "step": 5756 + }, + { + "epoch": 0.31602634467618, + "grad_norm": 2.068772792816162, + "learning_rate": 4.559128470240868e-05, + "loss": 0.3595, + "step": 5758 + }, + { + "epoch": 0.31613611416026344, + "grad_norm": 1.414602279663086, + "learning_rate": 4.558831494071069e-05, + "loss": 0.3047, + "step": 5760 + }, + { + "epoch": 0.31624588364434686, + "grad_norm": 2.2304234504699707, + "learning_rate": 4.558534427590815e-05, + "loss": 0.387, + "step": 5762 + }, + { + "epoch": 0.3163556531284303, + "grad_norm": 3.099419355392456, + "learning_rate": 4.5582372708131385e-05, + "loss": 0.3945, + "step": 5764 + }, + { + "epoch": 0.3164654226125137, + "grad_norm": 4.255032539367676, + "learning_rate": 4.557940023751071e-05, + "loss": 0.3056, + "step": 5766 + }, + { + "epoch": 0.3165751920965971, + "grad_norm": 2.060134172439575, + "learning_rate": 4.557642686417654e-05, + "loss": 0.3977, + "step": 5768 + }, + { + "epoch": 0.31668496158068055, + "grad_norm": 1.456932783126831, + "learning_rate": 4.5573452588259295e-05, + "loss": 0.4217, + "step": 5770 + }, + { + "epoch": 0.31679473106476397, + "grad_norm": 2.415884017944336, + "learning_rate": 4.557047740988944e-05, + "loss": 0.2436, + "step": 5772 + }, + { + "epoch": 0.31690450054884745, + "grad_norm": 3.3950350284576416, + "learning_rate": 4.556750132919747e-05, + "loss": 0.4384, + "step": 5774 + }, + { + "epoch": 0.31701427003293087, + "grad_norm": 3.5558993816375732, + "learning_rate": 4.556452434631395e-05, + "loss": 0.3815, + "step": 5776 + }, + { + "epoch": 0.3171240395170143, + "grad_norm": 1.7007932662963867, + "learning_rate": 4.5561546461369454e-05, + "loss": 0.4473, + "step": 5778 + }, + { + "epoch": 0.3172338090010977, + "grad_norm": 2.560790777206421, + "learning_rate": 4.555856767449461e-05, + "loss": 0.3631, + "step": 5780 + }, + { + "epoch": 0.31734357848518113, + "grad_norm": 2.12422776222229, + "learning_rate": 4.5555587985820074e-05, + "loss": 0.3177, + "step": 5782 + }, + { + "epoch": 0.31745334796926455, + "grad_norm": 2.1433517932891846, + "learning_rate": 4.555260739547657e-05, + "loss": 0.387, + "step": 5784 + }, + { + "epoch": 0.317563117453348, + "grad_norm": 2.0021328926086426, + "learning_rate": 4.554962590359481e-05, + "loss": 0.2878, + "step": 5786 + }, + { + "epoch": 0.3176728869374314, + "grad_norm": 2.2429139614105225, + "learning_rate": 4.55466435103056e-05, + "loss": 0.2612, + "step": 5788 + }, + { + "epoch": 0.3177826564215148, + "grad_norm": 1.3645507097244263, + "learning_rate": 4.554366021573976e-05, + "loss": 0.3357, + "step": 5790 + }, + { + "epoch": 0.31789242590559824, + "grad_norm": 1.945850133895874, + "learning_rate": 4.5540676020028145e-05, + "loss": 0.2932, + "step": 5792 + }, + { + "epoch": 0.31800219538968166, + "grad_norm": 1.766969919204712, + "learning_rate": 4.553769092330166e-05, + "loss": 0.2604, + "step": 5794 + }, + { + "epoch": 0.3181119648737651, + "grad_norm": 2.192885398864746, + "learning_rate": 4.553470492569125e-05, + "loss": 0.4254, + "step": 5796 + }, + { + "epoch": 0.3182217343578485, + "grad_norm": 2.3550937175750732, + "learning_rate": 4.553171802732789e-05, + "loss": 0.4728, + "step": 5798 + }, + { + "epoch": 0.31833150384193193, + "grad_norm": 1.4743833541870117, + "learning_rate": 4.5528730228342605e-05, + "loss": 0.3157, + "step": 5800 + }, + { + "epoch": 0.31844127332601535, + "grad_norm": 1.8152598142623901, + "learning_rate": 4.552574152886645e-05, + "loss": 0.2933, + "step": 5802 + }, + { + "epoch": 0.3185510428100988, + "grad_norm": 2.56325364112854, + "learning_rate": 4.552275192903052e-05, + "loss": 0.3264, + "step": 5804 + }, + { + "epoch": 0.3186608122941822, + "grad_norm": 1.7938536405563354, + "learning_rate": 4.551976142896596e-05, + "loss": 0.4934, + "step": 5806 + }, + { + "epoch": 0.3187705817782656, + "grad_norm": 2.039762258529663, + "learning_rate": 4.5516770028803954e-05, + "loss": 0.4063, + "step": 5808 + }, + { + "epoch": 0.3188803512623491, + "grad_norm": 2.4776198863983154, + "learning_rate": 4.551377772867571e-05, + "loss": 0.4188, + "step": 5810 + }, + { + "epoch": 0.3189901207464325, + "grad_norm": 2.3531179428100586, + "learning_rate": 4.551078452871248e-05, + "loss": 0.3685, + "step": 5812 + }, + { + "epoch": 0.31909989023051594, + "grad_norm": 1.246846079826355, + "learning_rate": 4.550779042904557e-05, + "loss": 0.4172, + "step": 5814 + }, + { + "epoch": 0.31920965971459936, + "grad_norm": 2.0533933639526367, + "learning_rate": 4.550479542980632e-05, + "loss": 0.3128, + "step": 5816 + }, + { + "epoch": 0.3193194291986828, + "grad_norm": 1.7418731451034546, + "learning_rate": 4.550179953112609e-05, + "loss": 0.2921, + "step": 5818 + }, + { + "epoch": 0.3194291986827662, + "grad_norm": 1.8458967208862305, + "learning_rate": 4.549880273313631e-05, + "loss": 0.3213, + "step": 5820 + }, + { + "epoch": 0.3195389681668496, + "grad_norm": 1.910467267036438, + "learning_rate": 4.5495805035968434e-05, + "loss": 0.436, + "step": 5822 + }, + { + "epoch": 0.31964873765093305, + "grad_norm": 2.37796688079834, + "learning_rate": 4.5492806439753935e-05, + "loss": 0.3549, + "step": 5824 + }, + { + "epoch": 0.31975850713501647, + "grad_norm": 1.6273505687713623, + "learning_rate": 4.5489806944624366e-05, + "loss": 0.3073, + "step": 5826 + }, + { + "epoch": 0.3198682766190999, + "grad_norm": 3.6044821739196777, + "learning_rate": 4.54868065507113e-05, + "loss": 0.5216, + "step": 5828 + }, + { + "epoch": 0.3199780461031833, + "grad_norm": 2.335916757583618, + "learning_rate": 4.548380525814634e-05, + "loss": 0.5353, + "step": 5830 + }, + { + "epoch": 0.32008781558726673, + "grad_norm": 1.6388972997665405, + "learning_rate": 4.548080306706114e-05, + "loss": 0.4377, + "step": 5832 + }, + { + "epoch": 0.32019758507135015, + "grad_norm": 2.088719606399536, + "learning_rate": 4.54777999775874e-05, + "loss": 0.3463, + "step": 5834 + }, + { + "epoch": 0.3203073545554336, + "grad_norm": 2.123539924621582, + "learning_rate": 4.547479598985683e-05, + "loss": 0.3634, + "step": 5836 + }, + { + "epoch": 0.320417124039517, + "grad_norm": 1.3670226335525513, + "learning_rate": 4.5471791104001215e-05, + "loss": 0.4275, + "step": 5838 + }, + { + "epoch": 0.3205268935236004, + "grad_norm": 1.4074115753173828, + "learning_rate": 4.5468785320152365e-05, + "loss": 0.2538, + "step": 5840 + }, + { + "epoch": 0.32063666300768384, + "grad_norm": 3.3862829208374023, + "learning_rate": 4.5465778638442127e-05, + "loss": 0.3927, + "step": 5842 + }, + { + "epoch": 0.32074643249176726, + "grad_norm": 2.988105297088623, + "learning_rate": 4.546277105900237e-05, + "loss": 0.5085, + "step": 5844 + }, + { + "epoch": 0.32085620197585074, + "grad_norm": 1.0238031148910522, + "learning_rate": 4.5459762581965056e-05, + "loss": 0.2135, + "step": 5846 + }, + { + "epoch": 0.32096597145993416, + "grad_norm": 1.3352628946304321, + "learning_rate": 4.545675320746212e-05, + "loss": 0.4335, + "step": 5848 + }, + { + "epoch": 0.3210757409440176, + "grad_norm": 1.6750520467758179, + "learning_rate": 4.545374293562559e-05, + "loss": 0.4043, + "step": 5850 + }, + { + "epoch": 0.321185510428101, + "grad_norm": 1.4310013055801392, + "learning_rate": 4.54507317665875e-05, + "loss": 0.3796, + "step": 5852 + }, + { + "epoch": 0.32129527991218443, + "grad_norm": 2.833601474761963, + "learning_rate": 4.544771970047993e-05, + "loss": 0.3615, + "step": 5854 + }, + { + "epoch": 0.32140504939626785, + "grad_norm": 2.255695104598999, + "learning_rate": 4.5444706737435014e-05, + "loss": 0.2758, + "step": 5856 + }, + { + "epoch": 0.32151481888035127, + "grad_norm": 1.933944821357727, + "learning_rate": 4.544169287758491e-05, + "loss": 0.3992, + "step": 5858 + }, + { + "epoch": 0.3216245883644347, + "grad_norm": 2.2703566551208496, + "learning_rate": 4.543867812106183e-05, + "loss": 0.4224, + "step": 5860 + }, + { + "epoch": 0.3217343578485181, + "grad_norm": 3.597686767578125, + "learning_rate": 4.5435662467998e-05, + "loss": 0.4352, + "step": 5862 + }, + { + "epoch": 0.32184412733260154, + "grad_norm": 1.6664149761199951, + "learning_rate": 4.543264591852572e-05, + "loss": 0.3191, + "step": 5864 + }, + { + "epoch": 0.32195389681668496, + "grad_norm": 2.5312795639038086, + "learning_rate": 4.542962847277729e-05, + "loss": 0.3198, + "step": 5866 + }, + { + "epoch": 0.3220636663007684, + "grad_norm": 2.430854558944702, + "learning_rate": 4.5426610130885087e-05, + "loss": 0.4366, + "step": 5868 + }, + { + "epoch": 0.3221734357848518, + "grad_norm": 1.9703330993652344, + "learning_rate": 4.5423590892981506e-05, + "loss": 0.3279, + "step": 5870 + }, + { + "epoch": 0.3222832052689352, + "grad_norm": 1.5143684148788452, + "learning_rate": 4.542057075919897e-05, + "loss": 0.4796, + "step": 5872 + }, + { + "epoch": 0.32239297475301865, + "grad_norm": 1.5479339361190796, + "learning_rate": 4.5417549729669984e-05, + "loss": 0.3134, + "step": 5874 + }, + { + "epoch": 0.32250274423710207, + "grad_norm": 1.1734957695007324, + "learning_rate": 4.541452780452705e-05, + "loss": 0.2795, + "step": 5876 + }, + { + "epoch": 0.3226125137211855, + "grad_norm": 4.55381965637207, + "learning_rate": 4.541150498390272e-05, + "loss": 0.3977, + "step": 5878 + }, + { + "epoch": 0.3227222832052689, + "grad_norm": 1.5096983909606934, + "learning_rate": 4.5408481267929605e-05, + "loss": 0.2741, + "step": 5880 + }, + { + "epoch": 0.32283205268935233, + "grad_norm": 2.6083943843841553, + "learning_rate": 4.540545665674032e-05, + "loss": 0.4296, + "step": 5882 + }, + { + "epoch": 0.3229418221734358, + "grad_norm": 2.2786011695861816, + "learning_rate": 4.540243115046756e-05, + "loss": 0.3322, + "step": 5884 + }, + { + "epoch": 0.32305159165751923, + "grad_norm": 1.6701875925064087, + "learning_rate": 4.5399404749244026e-05, + "loss": 0.3755, + "step": 5886 + }, + { + "epoch": 0.32316136114160265, + "grad_norm": 2.2576394081115723, + "learning_rate": 4.5396377453202466e-05, + "loss": 0.4127, + "step": 5888 + }, + { + "epoch": 0.3232711306256861, + "grad_norm": 3.1528005599975586, + "learning_rate": 4.539334926247569e-05, + "loss": 0.3552, + "step": 5890 + }, + { + "epoch": 0.3233809001097695, + "grad_norm": 1.0059279203414917, + "learning_rate": 4.539032017719651e-05, + "loss": 0.2119, + "step": 5892 + }, + { + "epoch": 0.3234906695938529, + "grad_norm": 1.9846247434616089, + "learning_rate": 4.538729019749781e-05, + "loss": 0.2942, + "step": 5894 + }, + { + "epoch": 0.32360043907793634, + "grad_norm": 2.390550374984741, + "learning_rate": 4.5384259323512504e-05, + "loss": 0.4034, + "step": 5896 + }, + { + "epoch": 0.32371020856201976, + "grad_norm": 2.0593206882476807, + "learning_rate": 4.5381227555373516e-05, + "loss": 0.3646, + "step": 5898 + }, + { + "epoch": 0.3238199780461032, + "grad_norm": 2.23244309425354, + "learning_rate": 4.537819489321386e-05, + "loss": 0.3786, + "step": 5900 + }, + { + "epoch": 0.3239297475301866, + "grad_norm": 1.9732602834701538, + "learning_rate": 4.537516133716655e-05, + "loss": 0.3792, + "step": 5902 + }, + { + "epoch": 0.32403951701427003, + "grad_norm": 1.9312447309494019, + "learning_rate": 4.5372126887364655e-05, + "loss": 0.4097, + "step": 5904 + }, + { + "epoch": 0.32414928649835345, + "grad_norm": 1.4954513311386108, + "learning_rate": 4.536909154394129e-05, + "loss": 0.3495, + "step": 5906 + }, + { + "epoch": 0.3242590559824369, + "grad_norm": 1.200068712234497, + "learning_rate": 4.5366055307029585e-05, + "loss": 0.3174, + "step": 5908 + }, + { + "epoch": 0.3243688254665203, + "grad_norm": 1.9755651950836182, + "learning_rate": 4.536301817676274e-05, + "loss": 0.526, + "step": 5910 + }, + { + "epoch": 0.3244785949506037, + "grad_norm": 1.7840911149978638, + "learning_rate": 4.5359980153273964e-05, + "loss": 0.4321, + "step": 5912 + }, + { + "epoch": 0.32458836443468714, + "grad_norm": 1.2653611898422241, + "learning_rate": 4.5356941236696525e-05, + "loss": 0.2985, + "step": 5914 + }, + { + "epoch": 0.32469813391877056, + "grad_norm": 9.037978172302246, + "learning_rate": 4.5353901427163725e-05, + "loss": 0.3956, + "step": 5916 + }, + { + "epoch": 0.324807903402854, + "grad_norm": 2.968306064605713, + "learning_rate": 4.535086072480891e-05, + "loss": 0.4097, + "step": 5918 + }, + { + "epoch": 0.32491767288693746, + "grad_norm": 4.9635772705078125, + "learning_rate": 4.534781912976546e-05, + "loss": 0.4265, + "step": 5920 + }, + { + "epoch": 0.3250274423710209, + "grad_norm": 2.080228090286255, + "learning_rate": 4.5344776642166775e-05, + "loss": 0.3367, + "step": 5922 + }, + { + "epoch": 0.3251372118551043, + "grad_norm": 3.5157763957977295, + "learning_rate": 4.534173326214634e-05, + "loss": 0.509, + "step": 5924 + }, + { + "epoch": 0.3252469813391877, + "grad_norm": 3.673520088195801, + "learning_rate": 4.533868898983764e-05, + "loss": 0.4298, + "step": 5926 + }, + { + "epoch": 0.32535675082327115, + "grad_norm": 2.550692319869995, + "learning_rate": 4.533564382537421e-05, + "loss": 0.4218, + "step": 5928 + }, + { + "epoch": 0.32546652030735457, + "grad_norm": 1.9975769519805908, + "learning_rate": 4.533259776888963e-05, + "loss": 0.3449, + "step": 5930 + }, + { + "epoch": 0.325576289791438, + "grad_norm": 2.282149076461792, + "learning_rate": 4.532955082051751e-05, + "loss": 0.3639, + "step": 5932 + }, + { + "epoch": 0.3256860592755214, + "grad_norm": 1.833769679069519, + "learning_rate": 4.5326502980391515e-05, + "loss": 0.328, + "step": 5934 + }, + { + "epoch": 0.32579582875960483, + "grad_norm": 2.0050230026245117, + "learning_rate": 4.5323454248645324e-05, + "loss": 0.3734, + "step": 5936 + }, + { + "epoch": 0.32590559824368825, + "grad_norm": 2.2653510570526123, + "learning_rate": 4.5320404625412684e-05, + "loss": 0.4177, + "step": 5938 + }, + { + "epoch": 0.3260153677277717, + "grad_norm": 2.1147656440734863, + "learning_rate": 4.531735411082735e-05, + "loss": 0.4153, + "step": 5940 + }, + { + "epoch": 0.3261251372118551, + "grad_norm": 1.3345718383789062, + "learning_rate": 4.531430270502315e-05, + "loss": 0.295, + "step": 5942 + }, + { + "epoch": 0.3262349066959385, + "grad_norm": 1.5834379196166992, + "learning_rate": 4.531125040813392e-05, + "loss": 0.4378, + "step": 5944 + }, + { + "epoch": 0.32634467618002194, + "grad_norm": 1.6762309074401855, + "learning_rate": 4.530819722029355e-05, + "loss": 0.3338, + "step": 5946 + }, + { + "epoch": 0.32645444566410536, + "grad_norm": 2.88928484916687, + "learning_rate": 4.5305143141635976e-05, + "loss": 0.2273, + "step": 5948 + }, + { + "epoch": 0.3265642151481888, + "grad_norm": 5.289597988128662, + "learning_rate": 4.5302088172295156e-05, + "loss": 0.3831, + "step": 5950 + }, + { + "epoch": 0.3266739846322722, + "grad_norm": 2.9898407459259033, + "learning_rate": 4.529903231240511e-05, + "loss": 0.405, + "step": 5952 + }, + { + "epoch": 0.32678375411635563, + "grad_norm": 2.3065505027770996, + "learning_rate": 4.5295975562099866e-05, + "loss": 0.3164, + "step": 5954 + }, + { + "epoch": 0.32689352360043905, + "grad_norm": 2.2842493057250977, + "learning_rate": 4.529291792151351e-05, + "loss": 0.3804, + "step": 5956 + }, + { + "epoch": 0.32700329308452253, + "grad_norm": 1.7544355392456055, + "learning_rate": 4.528985939078018e-05, + "loss": 0.4689, + "step": 5958 + }, + { + "epoch": 0.32711306256860595, + "grad_norm": 1.7439783811569214, + "learning_rate": 4.528679997003403e-05, + "loss": 0.2894, + "step": 5960 + }, + { + "epoch": 0.32722283205268937, + "grad_norm": 3.531177282333374, + "learning_rate": 4.5283739659409256e-05, + "loss": 0.4214, + "step": 5962 + }, + { + "epoch": 0.3273326015367728, + "grad_norm": 2.0504608154296875, + "learning_rate": 4.5280678459040095e-05, + "loss": 0.3304, + "step": 5964 + }, + { + "epoch": 0.3274423710208562, + "grad_norm": 2.006199836730957, + "learning_rate": 4.527761636906084e-05, + "loss": 0.412, + "step": 5966 + }, + { + "epoch": 0.32755214050493964, + "grad_norm": 1.6758108139038086, + "learning_rate": 4.52745533896058e-05, + "loss": 0.3726, + "step": 5968 + }, + { + "epoch": 0.32766190998902306, + "grad_norm": 1.6266685724258423, + "learning_rate": 4.527148952080934e-05, + "loss": 0.2772, + "step": 5970 + }, + { + "epoch": 0.3277716794731065, + "grad_norm": 1.1417325735092163, + "learning_rate": 4.526842476280585e-05, + "loss": 0.2562, + "step": 5972 + }, + { + "epoch": 0.3278814489571899, + "grad_norm": 2.6897385120391846, + "learning_rate": 4.526535911572977e-05, + "loss": 0.4881, + "step": 5974 + }, + { + "epoch": 0.3279912184412733, + "grad_norm": 1.5535308122634888, + "learning_rate": 4.5262292579715556e-05, + "loss": 0.4576, + "step": 5976 + }, + { + "epoch": 0.32810098792535675, + "grad_norm": 1.8410979509353638, + "learning_rate": 4.525922515489775e-05, + "loss": 0.389, + "step": 5978 + }, + { + "epoch": 0.32821075740944017, + "grad_norm": 4.114858627319336, + "learning_rate": 4.5256156841410886e-05, + "loss": 0.397, + "step": 5980 + }, + { + "epoch": 0.3283205268935236, + "grad_norm": 1.500614881515503, + "learning_rate": 4.5253087639389556e-05, + "loss": 0.4264, + "step": 5982 + }, + { + "epoch": 0.328430296377607, + "grad_norm": 1.9911799430847168, + "learning_rate": 4.5250017548968404e-05, + "loss": 0.473, + "step": 5984 + }, + { + "epoch": 0.32854006586169043, + "grad_norm": 1.5804150104522705, + "learning_rate": 4.5246946570282084e-05, + "loss": 0.4738, + "step": 5986 + }, + { + "epoch": 0.32864983534577386, + "grad_norm": 1.6984422206878662, + "learning_rate": 4.524387470346531e-05, + "loss": 0.3615, + "step": 5988 + }, + { + "epoch": 0.3287596048298573, + "grad_norm": 2.104107141494751, + "learning_rate": 4.524080194865283e-05, + "loss": 0.4374, + "step": 5990 + }, + { + "epoch": 0.3288693743139407, + "grad_norm": 2.14615797996521, + "learning_rate": 4.523772830597942e-05, + "loss": 0.2393, + "step": 5992 + }, + { + "epoch": 0.3289791437980242, + "grad_norm": 3.949063539505005, + "learning_rate": 4.5234653775579925e-05, + "loss": 0.5209, + "step": 5994 + }, + { + "epoch": 0.3290889132821076, + "grad_norm": 1.9280880689620972, + "learning_rate": 4.52315783575892e-05, + "loss": 0.2935, + "step": 5996 + }, + { + "epoch": 0.329198682766191, + "grad_norm": 1.4678467512130737, + "learning_rate": 4.5228502052142136e-05, + "loss": 0.4478, + "step": 5998 + }, + { + "epoch": 0.32930845225027444, + "grad_norm": 2.050224542617798, + "learning_rate": 4.522542485937369e-05, + "loss": 0.3008, + "step": 6000 + }, + { + "epoch": 0.32941822173435786, + "grad_norm": 1.8808414936065674, + "learning_rate": 4.5222346779418835e-05, + "loss": 0.2801, + "step": 6002 + }, + { + "epoch": 0.3295279912184413, + "grad_norm": 2.197317600250244, + "learning_rate": 4.521926781241259e-05, + "loss": 0.5298, + "step": 6004 + }, + { + "epoch": 0.3296377607025247, + "grad_norm": 2.1016366481781006, + "learning_rate": 4.521618795849002e-05, + "loss": 0.3184, + "step": 6006 + }, + { + "epoch": 0.32974753018660813, + "grad_norm": 4.548704624176025, + "learning_rate": 4.521310721778622e-05, + "loss": 0.3514, + "step": 6008 + }, + { + "epoch": 0.32985729967069155, + "grad_norm": 2.95377779006958, + "learning_rate": 4.5210025590436334e-05, + "loss": 0.4803, + "step": 6010 + }, + { + "epoch": 0.329967069154775, + "grad_norm": 1.9477492570877075, + "learning_rate": 4.520694307657551e-05, + "loss": 0.3405, + "step": 6012 + }, + { + "epoch": 0.3300768386388584, + "grad_norm": 2.3428385257720947, + "learning_rate": 4.5203859676339e-05, + "loss": 0.3077, + "step": 6014 + }, + { + "epoch": 0.3301866081229418, + "grad_norm": 1.1337162256240845, + "learning_rate": 4.5200775389862026e-05, + "loss": 0.2618, + "step": 6016 + }, + { + "epoch": 0.33029637760702524, + "grad_norm": 2.2881369590759277, + "learning_rate": 4.51976902172799e-05, + "loss": 0.3066, + "step": 6018 + }, + { + "epoch": 0.33040614709110866, + "grad_norm": 2.2653067111968994, + "learning_rate": 4.519460415872794e-05, + "loss": 0.3907, + "step": 6020 + }, + { + "epoch": 0.3305159165751921, + "grad_norm": 4.139437198638916, + "learning_rate": 4.519151721434152e-05, + "loss": 0.4949, + "step": 6022 + }, + { + "epoch": 0.3306256860592755, + "grad_norm": 1.4624977111816406, + "learning_rate": 4.518842938425605e-05, + "loss": 0.3046, + "step": 6024 + }, + { + "epoch": 0.3307354555433589, + "grad_norm": 2.057217597961426, + "learning_rate": 4.518534066860698e-05, + "loss": 0.2955, + "step": 6026 + }, + { + "epoch": 0.33084522502744235, + "grad_norm": 4.967296600341797, + "learning_rate": 4.518225106752979e-05, + "loss": 0.3819, + "step": 6028 + }, + { + "epoch": 0.33095499451152577, + "grad_norm": 1.4017413854599, + "learning_rate": 4.5179160581160005e-05, + "loss": 0.2622, + "step": 6030 + }, + { + "epoch": 0.33106476399560925, + "grad_norm": 2.4802420139312744, + "learning_rate": 4.51760692096332e-05, + "loss": 0.3852, + "step": 6032 + }, + { + "epoch": 0.33117453347969267, + "grad_norm": 1.4737839698791504, + "learning_rate": 4.5172976953084966e-05, + "loss": 0.3742, + "step": 6034 + }, + { + "epoch": 0.3312843029637761, + "grad_norm": 2.3513567447662354, + "learning_rate": 4.516988381165095e-05, + "loss": 0.2753, + "step": 6036 + }, + { + "epoch": 0.3313940724478595, + "grad_norm": 2.168196201324463, + "learning_rate": 4.5166789785466824e-05, + "loss": 0.3896, + "step": 6038 + }, + { + "epoch": 0.33150384193194293, + "grad_norm": 2.19977068901062, + "learning_rate": 4.516369487466832e-05, + "loss": 0.6081, + "step": 6040 + }, + { + "epoch": 0.33161361141602635, + "grad_norm": 2.3046622276306152, + "learning_rate": 4.516059907939118e-05, + "loss": 0.5377, + "step": 6042 + }, + { + "epoch": 0.3317233809001098, + "grad_norm": 4.620467662811279, + "learning_rate": 4.515750239977122e-05, + "loss": 0.4479, + "step": 6044 + }, + { + "epoch": 0.3318331503841932, + "grad_norm": 1.4552867412567139, + "learning_rate": 4.5154404835944256e-05, + "loss": 0.4327, + "step": 6046 + }, + { + "epoch": 0.3319429198682766, + "grad_norm": 2.091726779937744, + "learning_rate": 4.5151306388046175e-05, + "loss": 0.3439, + "step": 6048 + }, + { + "epoch": 0.33205268935236004, + "grad_norm": 1.499635934829712, + "learning_rate": 4.5148207056212896e-05, + "loss": 0.4444, + "step": 6050 + }, + { + "epoch": 0.33216245883644346, + "grad_norm": 1.6715131998062134, + "learning_rate": 4.514510684058036e-05, + "loss": 0.3934, + "step": 6052 + }, + { + "epoch": 0.3322722283205269, + "grad_norm": 1.6542991399765015, + "learning_rate": 4.514200574128455e-05, + "loss": 0.2739, + "step": 6054 + }, + { + "epoch": 0.3323819978046103, + "grad_norm": 1.7867778539657593, + "learning_rate": 4.5138903758461515e-05, + "loss": 0.2502, + "step": 6056 + }, + { + "epoch": 0.33249176728869373, + "grad_norm": 2.869964599609375, + "learning_rate": 4.513580089224732e-05, + "loss": 0.3412, + "step": 6058 + }, + { + "epoch": 0.33260153677277715, + "grad_norm": 2.0080032348632812, + "learning_rate": 4.513269714277805e-05, + "loss": 0.3143, + "step": 6060 + }, + { + "epoch": 0.3327113062568606, + "grad_norm": 1.400646448135376, + "learning_rate": 4.512959251018987e-05, + "loss": 0.3563, + "step": 6062 + }, + { + "epoch": 0.332821075740944, + "grad_norm": 1.5356860160827637, + "learning_rate": 4.512648699461897e-05, + "loss": 0.2302, + "step": 6064 + }, + { + "epoch": 0.3329308452250274, + "grad_norm": 2.4388506412506104, + "learning_rate": 4.5123380596201556e-05, + "loss": 0.3801, + "step": 6066 + }, + { + "epoch": 0.3330406147091109, + "grad_norm": 5.421361446380615, + "learning_rate": 4.5120273315073897e-05, + "loss": 0.2502, + "step": 6068 + }, + { + "epoch": 0.3331503841931943, + "grad_norm": 1.5495022535324097, + "learning_rate": 4.5117165151372296e-05, + "loss": 0.253, + "step": 6070 + }, + { + "epoch": 0.33326015367727774, + "grad_norm": 1.7170186042785645, + "learning_rate": 4.511405610523309e-05, + "loss": 0.2665, + "step": 6072 + }, + { + "epoch": 0.33336992316136116, + "grad_norm": 3.384371519088745, + "learning_rate": 4.5110946176792664e-05, + "loss": 0.3098, + "step": 6074 + }, + { + "epoch": 0.3334796926454446, + "grad_norm": 1.8864940404891968, + "learning_rate": 4.5107835366187425e-05, + "loss": 0.5316, + "step": 6076 + }, + { + "epoch": 0.333589462129528, + "grad_norm": 1.7956823110580444, + "learning_rate": 4.510472367355383e-05, + "loss": 0.332, + "step": 6078 + }, + { + "epoch": 0.3336992316136114, + "grad_norm": 1.7911691665649414, + "learning_rate": 4.510161109902837e-05, + "loss": 0.3957, + "step": 6080 + }, + { + "epoch": 0.33380900109769485, + "grad_norm": 2.6595890522003174, + "learning_rate": 4.509849764274759e-05, + "loss": 0.3766, + "step": 6082 + }, + { + "epoch": 0.33391877058177827, + "grad_norm": 1.097749948501587, + "learning_rate": 4.509538330484805e-05, + "loss": 0.229, + "step": 6084 + }, + { + "epoch": 0.3340285400658617, + "grad_norm": 1.982761263847351, + "learning_rate": 4.5092268085466364e-05, + "loss": 0.4331, + "step": 6086 + }, + { + "epoch": 0.3341383095499451, + "grad_norm": 1.3931185007095337, + "learning_rate": 4.508915198473919e-05, + "loss": 0.3622, + "step": 6088 + }, + { + "epoch": 0.33424807903402853, + "grad_norm": 3.4685206413269043, + "learning_rate": 4.5086035002803195e-05, + "loss": 0.4036, + "step": 6090 + }, + { + "epoch": 0.33435784851811196, + "grad_norm": 1.6063061952590942, + "learning_rate": 4.5082917139795125e-05, + "loss": 0.2833, + "step": 6092 + }, + { + "epoch": 0.3344676180021954, + "grad_norm": 2.9833409786224365, + "learning_rate": 4.507979839585172e-05, + "loss": 0.4119, + "step": 6094 + }, + { + "epoch": 0.3345773874862788, + "grad_norm": 1.6101008653640747, + "learning_rate": 4.507667877110982e-05, + "loss": 0.2841, + "step": 6096 + }, + { + "epoch": 0.3346871569703622, + "grad_norm": 2.1228911876678467, + "learning_rate": 4.507355826570624e-05, + "loss": 0.4091, + "step": 6098 + }, + { + "epoch": 0.33479692645444564, + "grad_norm": 2.441441535949707, + "learning_rate": 4.5070436879777865e-05, + "loss": 0.4411, + "step": 6100 + }, + { + "epoch": 0.33490669593852906, + "grad_norm": 3.6565165519714355, + "learning_rate": 4.506731461346162e-05, + "loss": 0.353, + "step": 6102 + }, + { + "epoch": 0.3350164654226125, + "grad_norm": 1.8668426275253296, + "learning_rate": 4.506419146689446e-05, + "loss": 0.3232, + "step": 6104 + }, + { + "epoch": 0.33512623490669596, + "grad_norm": 1.7002127170562744, + "learning_rate": 4.506106744021338e-05, + "loss": 0.4204, + "step": 6106 + }, + { + "epoch": 0.3352360043907794, + "grad_norm": 1.7696278095245361, + "learning_rate": 4.505794253355542e-05, + "loss": 0.3628, + "step": 6108 + }, + { + "epoch": 0.3353457738748628, + "grad_norm": 3.412606954574585, + "learning_rate": 4.5054816747057647e-05, + "loss": 0.3387, + "step": 6110 + }, + { + "epoch": 0.33545554335894623, + "grad_norm": 2.480647325515747, + "learning_rate": 4.5051690080857176e-05, + "loss": 0.3477, + "step": 6112 + }, + { + "epoch": 0.33556531284302965, + "grad_norm": 1.6552315950393677, + "learning_rate": 4.5048562535091154e-05, + "loss": 0.3613, + "step": 6114 + }, + { + "epoch": 0.3356750823271131, + "grad_norm": 1.3850690126419067, + "learning_rate": 4.5045434109896786e-05, + "loss": 0.3047, + "step": 6116 + }, + { + "epoch": 0.3357848518111965, + "grad_norm": 2.5462613105773926, + "learning_rate": 4.5042304805411285e-05, + "loss": 0.4313, + "step": 6118 + }, + { + "epoch": 0.3358946212952799, + "grad_norm": 1.608770489692688, + "learning_rate": 4.503917462177192e-05, + "loss": 0.3699, + "step": 6120 + }, + { + "epoch": 0.33600439077936334, + "grad_norm": 2.2181520462036133, + "learning_rate": 4.503604355911599e-05, + "loss": 0.473, + "step": 6122 + }, + { + "epoch": 0.33611416026344676, + "grad_norm": 1.5982508659362793, + "learning_rate": 4.503291161758087e-05, + "loss": 0.4131, + "step": 6124 + }, + { + "epoch": 0.3362239297475302, + "grad_norm": 1.6959283351898193, + "learning_rate": 4.5029778797303894e-05, + "loss": 0.2829, + "step": 6126 + }, + { + "epoch": 0.3363336992316136, + "grad_norm": 1.4618877172470093, + "learning_rate": 4.5026645098422515e-05, + "loss": 0.2966, + "step": 6128 + }, + { + "epoch": 0.336443468715697, + "grad_norm": 1.758219838142395, + "learning_rate": 4.50235105210742e-05, + "loss": 0.4096, + "step": 6130 + }, + { + "epoch": 0.33655323819978045, + "grad_norm": 2.0478005409240723, + "learning_rate": 4.502037506539642e-05, + "loss": 0.4457, + "step": 6132 + }, + { + "epoch": 0.33666300768386387, + "grad_norm": 1.7590384483337402, + "learning_rate": 4.501723873152672e-05, + "loss": 0.3221, + "step": 6134 + }, + { + "epoch": 0.3367727771679473, + "grad_norm": 2.49336314201355, + "learning_rate": 4.501410151960268e-05, + "loss": 0.4038, + "step": 6136 + }, + { + "epoch": 0.3368825466520307, + "grad_norm": 1.7995789051055908, + "learning_rate": 4.5010963429761924e-05, + "loss": 0.344, + "step": 6138 + }, + { + "epoch": 0.33699231613611413, + "grad_norm": 1.3092906475067139, + "learning_rate": 4.5007824462142076e-05, + "loss": 0.2533, + "step": 6140 + }, + { + "epoch": 0.3371020856201976, + "grad_norm": 3.05808424949646, + "learning_rate": 4.500468461688086e-05, + "loss": 0.3905, + "step": 6142 + }, + { + "epoch": 0.33721185510428103, + "grad_norm": 1.171627402305603, + "learning_rate": 4.5001543894115975e-05, + "loss": 0.1873, + "step": 6144 + }, + { + "epoch": 0.33732162458836445, + "grad_norm": 1.263147234916687, + "learning_rate": 4.499840229398521e-05, + "loss": 0.374, + "step": 6146 + }, + { + "epoch": 0.3374313940724479, + "grad_norm": 1.8000704050064087, + "learning_rate": 4.4995259816626356e-05, + "loss": 0.4024, + "step": 6148 + }, + { + "epoch": 0.3375411635565313, + "grad_norm": 1.3361409902572632, + "learning_rate": 4.499211646217727e-05, + "loss": 0.3254, + "step": 6150 + }, + { + "epoch": 0.3376509330406147, + "grad_norm": 2.0787675380706787, + "learning_rate": 4.498897223077582e-05, + "loss": 0.4157, + "step": 6152 + }, + { + "epoch": 0.33776070252469814, + "grad_norm": 2.0534298419952393, + "learning_rate": 4.498582712255994e-05, + "loss": 0.3638, + "step": 6154 + }, + { + "epoch": 0.33787047200878156, + "grad_norm": 2.682570457458496, + "learning_rate": 4.4982681137667594e-05, + "loss": 0.312, + "step": 6156 + }, + { + "epoch": 0.337980241492865, + "grad_norm": 2.211566209793091, + "learning_rate": 4.497953427623677e-05, + "loss": 0.4314, + "step": 6158 + }, + { + "epoch": 0.3380900109769484, + "grad_norm": 0.9271048307418823, + "learning_rate": 4.4976386538405495e-05, + "loss": 0.2743, + "step": 6160 + }, + { + "epoch": 0.33819978046103183, + "grad_norm": 1.9390267133712769, + "learning_rate": 4.497323792431187e-05, + "loss": 0.4305, + "step": 6162 + }, + { + "epoch": 0.33830954994511525, + "grad_norm": 2.318986177444458, + "learning_rate": 4.497008843409399e-05, + "loss": 0.3835, + "step": 6164 + }, + { + "epoch": 0.3384193194291987, + "grad_norm": 3.0749099254608154, + "learning_rate": 4.4966938067890004e-05, + "loss": 0.3675, + "step": 6166 + }, + { + "epoch": 0.3385290889132821, + "grad_norm": 1.9882687330245972, + "learning_rate": 4.496378682583813e-05, + "loss": 0.3282, + "step": 6168 + }, + { + "epoch": 0.3386388583973655, + "grad_norm": 1.7802263498306274, + "learning_rate": 4.4960634708076566e-05, + "loss": 0.2973, + "step": 6170 + }, + { + "epoch": 0.33874862788144894, + "grad_norm": 2.32855224609375, + "learning_rate": 4.4957481714743585e-05, + "loss": 0.3893, + "step": 6172 + }, + { + "epoch": 0.33885839736553236, + "grad_norm": 1.4420382976531982, + "learning_rate": 4.495432784597751e-05, + "loss": 0.2899, + "step": 6174 + }, + { + "epoch": 0.3389681668496158, + "grad_norm": 2.5031049251556396, + "learning_rate": 4.4951173101916675e-05, + "loss": 0.2376, + "step": 6176 + }, + { + "epoch": 0.33907793633369926, + "grad_norm": 2.747556447982788, + "learning_rate": 4.4948017482699456e-05, + "loss": 0.317, + "step": 6178 + }, + { + "epoch": 0.3391877058177827, + "grad_norm": 1.7603529691696167, + "learning_rate": 4.4944860988464276e-05, + "loss": 0.4059, + "step": 6180 + }, + { + "epoch": 0.3392974753018661, + "grad_norm": 1.5245331525802612, + "learning_rate": 4.494170361934961e-05, + "loss": 0.2856, + "step": 6182 + }, + { + "epoch": 0.3394072447859495, + "grad_norm": 1.9339715242385864, + "learning_rate": 4.4938545375493934e-05, + "loss": 0.458, + "step": 6184 + }, + { + "epoch": 0.33951701427003295, + "grad_norm": 1.824879765510559, + "learning_rate": 4.493538625703579e-05, + "loss": 0.3702, + "step": 6186 + }, + { + "epoch": 0.33962678375411637, + "grad_norm": 2.7182047367095947, + "learning_rate": 4.4932226264113764e-05, + "loss": 0.3343, + "step": 6188 + }, + { + "epoch": 0.3397365532381998, + "grad_norm": 1.802304744720459, + "learning_rate": 4.492906539686646e-05, + "loss": 0.3813, + "step": 6190 + }, + { + "epoch": 0.3398463227222832, + "grad_norm": 2.243642568588257, + "learning_rate": 4.492590365543253e-05, + "loss": 0.4033, + "step": 6192 + }, + { + "epoch": 0.33995609220636663, + "grad_norm": 2.83305287361145, + "learning_rate": 4.492274103995066e-05, + "loss": 0.5308, + "step": 6194 + }, + { + "epoch": 0.34006586169045006, + "grad_norm": 1.5558570623397827, + "learning_rate": 4.491957755055959e-05, + "loss": 0.3013, + "step": 6196 + }, + { + "epoch": 0.3401756311745335, + "grad_norm": Infinity, + "learning_rate": 4.491799547819145e-05, + "loss": 0.488, + "step": 6198 + }, + { + "epoch": 0.3402854006586169, + "grad_norm": 1.5384591817855835, + "learning_rate": 4.4914830678196764e-05, + "loss": 0.2883, + "step": 6200 + }, + { + "epoch": 0.3403951701427003, + "grad_norm": 2.544976234436035, + "learning_rate": 4.491166500463986e-05, + "loss": 0.2802, + "step": 6202 + }, + { + "epoch": 0.34050493962678374, + "grad_norm": 3.882084369659424, + "learning_rate": 4.4908498457659586e-05, + "loss": 0.4401, + "step": 6204 + }, + { + "epoch": 0.34061470911086716, + "grad_norm": 1.5975689888000488, + "learning_rate": 4.4905331037394855e-05, + "loss": 0.3781, + "step": 6206 + }, + { + "epoch": 0.3407244785949506, + "grad_norm": 2.4044394493103027, + "learning_rate": 4.49021627439846e-05, + "loss": 0.2595, + "step": 6208 + }, + { + "epoch": 0.340834248079034, + "grad_norm": 2.22904372215271, + "learning_rate": 4.4898993577567805e-05, + "loss": 0.3551, + "step": 6210 + }, + { + "epoch": 0.34094401756311743, + "grad_norm": 2.9457786083221436, + "learning_rate": 4.489582353828349e-05, + "loss": 0.5044, + "step": 6212 + }, + { + "epoch": 0.34105378704720085, + "grad_norm": 1.684760570526123, + "learning_rate": 4.489265262627069e-05, + "loss": 0.2295, + "step": 6214 + }, + { + "epoch": 0.34116355653128433, + "grad_norm": 1.4637792110443115, + "learning_rate": 4.488948084166851e-05, + "loss": 0.3298, + "step": 6216 + }, + { + "epoch": 0.34127332601536775, + "grad_norm": 8.264935493469238, + "learning_rate": 4.4886308184616075e-05, + "loss": 0.424, + "step": 6218 + }, + { + "epoch": 0.3413830954994512, + "grad_norm": 2.6386971473693848, + "learning_rate": 4.4883134655252555e-05, + "loss": 0.3276, + "step": 6220 + }, + { + "epoch": 0.3414928649835346, + "grad_norm": 3.055664539337158, + "learning_rate": 4.487996025371716e-05, + "loss": 0.3555, + "step": 6222 + }, + { + "epoch": 0.341602634467618, + "grad_norm": 1.5945584774017334, + "learning_rate": 4.4876784980149135e-05, + "loss": 0.4088, + "step": 6224 + }, + { + "epoch": 0.34171240395170144, + "grad_norm": 2.4317972660064697, + "learning_rate": 4.487360883468775e-05, + "loss": 0.2983, + "step": 6226 + }, + { + "epoch": 0.34182217343578486, + "grad_norm": 1.6614909172058105, + "learning_rate": 4.4870431817472346e-05, + "loss": 0.305, + "step": 6228 + }, + { + "epoch": 0.3419319429198683, + "grad_norm": 2.76859974861145, + "learning_rate": 4.486725392864227e-05, + "loss": 0.351, + "step": 6230 + }, + { + "epoch": 0.3420417124039517, + "grad_norm": 1.3955104351043701, + "learning_rate": 4.486407516833692e-05, + "loss": 0.2937, + "step": 6232 + }, + { + "epoch": 0.3421514818880351, + "grad_norm": 1.1365389823913574, + "learning_rate": 4.486089553669574e-05, + "loss": 0.2417, + "step": 6234 + }, + { + "epoch": 0.34226125137211855, + "grad_norm": 1.5208218097686768, + "learning_rate": 4.485771503385818e-05, + "loss": 0.299, + "step": 6236 + }, + { + "epoch": 0.34237102085620197, + "grad_norm": 3.042067766189575, + "learning_rate": 4.4854533659963796e-05, + "loss": 0.366, + "step": 6238 + }, + { + "epoch": 0.3424807903402854, + "grad_norm": 1.3791027069091797, + "learning_rate": 4.48513514151521e-05, + "loss": 0.3221, + "step": 6240 + }, + { + "epoch": 0.3425905598243688, + "grad_norm": 1.8722023963928223, + "learning_rate": 4.4848168299562696e-05, + "loss": 0.4187, + "step": 6242 + }, + { + "epoch": 0.34270032930845223, + "grad_norm": 2.0743696689605713, + "learning_rate": 4.484498431333521e-05, + "loss": 0.3538, + "step": 6244 + }, + { + "epoch": 0.34281009879253566, + "grad_norm": 1.4411135911941528, + "learning_rate": 4.484179945660931e-05, + "loss": 0.3851, + "step": 6246 + }, + { + "epoch": 0.3429198682766191, + "grad_norm": 2.0070266723632812, + "learning_rate": 4.48386137295247e-05, + "loss": 0.3389, + "step": 6248 + }, + { + "epoch": 0.3430296377607025, + "grad_norm": 2.790045976638794, + "learning_rate": 4.4835427132221106e-05, + "loss": 0.4808, + "step": 6250 + }, + { + "epoch": 0.343139407244786, + "grad_norm": 1.8527288436889648, + "learning_rate": 4.483223966483833e-05, + "loss": 0.3276, + "step": 6252 + }, + { + "epoch": 0.3432491767288694, + "grad_norm": 2.07973313331604, + "learning_rate": 4.482905132751618e-05, + "loss": 0.3856, + "step": 6254 + }, + { + "epoch": 0.3433589462129528, + "grad_norm": 1.6252590417861938, + "learning_rate": 4.482586212039451e-05, + "loss": 0.4439, + "step": 6256 + }, + { + "epoch": 0.34346871569703624, + "grad_norm": 2.175919771194458, + "learning_rate": 4.4822672043613215e-05, + "loss": 0.3796, + "step": 6258 + }, + { + "epoch": 0.34357848518111966, + "grad_norm": 2.1437313556671143, + "learning_rate": 4.481948109731223e-05, + "loss": 0.3303, + "step": 6260 + }, + { + "epoch": 0.3436882546652031, + "grad_norm": 1.9727102518081665, + "learning_rate": 4.481628928163152e-05, + "loss": 0.4202, + "step": 6262 + }, + { + "epoch": 0.3437980241492865, + "grad_norm": 2.0729403495788574, + "learning_rate": 4.4813096596711105e-05, + "loss": 0.3864, + "step": 6264 + }, + { + "epoch": 0.34390779363336993, + "grad_norm": 2.481593608856201, + "learning_rate": 4.480990304269102e-05, + "loss": 0.3554, + "step": 6266 + }, + { + "epoch": 0.34401756311745335, + "grad_norm": 4.082510948181152, + "learning_rate": 4.4806708619711355e-05, + "loss": 0.4908, + "step": 6268 + }, + { + "epoch": 0.3441273326015368, + "grad_norm": 2.2893483638763428, + "learning_rate": 4.480351332791224e-05, + "loss": 0.3199, + "step": 6270 + }, + { + "epoch": 0.3442371020856202, + "grad_norm": 1.9199700355529785, + "learning_rate": 4.480031716743382e-05, + "loss": 0.3132, + "step": 6272 + }, + { + "epoch": 0.3443468715697036, + "grad_norm": 1.4906516075134277, + "learning_rate": 4.47971201384163e-05, + "loss": 0.2759, + "step": 6274 + }, + { + "epoch": 0.34445664105378704, + "grad_norm": 1.3763362169265747, + "learning_rate": 4.4793922240999933e-05, + "loss": 0.5651, + "step": 6276 + }, + { + "epoch": 0.34456641053787046, + "grad_norm": 2.1010684967041016, + "learning_rate": 4.479072347532498e-05, + "loss": 0.384, + "step": 6278 + }, + { + "epoch": 0.3446761800219539, + "grad_norm": 2.3462421894073486, + "learning_rate": 4.4787523841531753e-05, + "loss": 0.3038, + "step": 6280 + }, + { + "epoch": 0.3447859495060373, + "grad_norm": 1.0658035278320312, + "learning_rate": 4.47843233397606e-05, + "loss": 0.2299, + "step": 6282 + }, + { + "epoch": 0.3448957189901207, + "grad_norm": 1.5347139835357666, + "learning_rate": 4.478112197015193e-05, + "loss": 0.3168, + "step": 6284 + }, + { + "epoch": 0.34500548847420415, + "grad_norm": 3.523449182510376, + "learning_rate": 4.4777919732846166e-05, + "loss": 0.368, + "step": 6286 + }, + { + "epoch": 0.34511525795828757, + "grad_norm": 2.0001373291015625, + "learning_rate": 4.477471662798375e-05, + "loss": 0.4515, + "step": 6288 + }, + { + "epoch": 0.34522502744237105, + "grad_norm": 1.3472172021865845, + "learning_rate": 4.477151265570521e-05, + "loss": 0.4031, + "step": 6290 + }, + { + "epoch": 0.34533479692645447, + "grad_norm": 1.3509502410888672, + "learning_rate": 4.476830781615108e-05, + "loss": 0.2964, + "step": 6292 + }, + { + "epoch": 0.3454445664105379, + "grad_norm": 1.3662561178207397, + "learning_rate": 4.4765102109461934e-05, + "loss": 0.3943, + "step": 6294 + }, + { + "epoch": 0.3455543358946213, + "grad_norm": 2.6632883548736572, + "learning_rate": 4.47618955357784e-05, + "loss": 0.355, + "step": 6296 + }, + { + "epoch": 0.34566410537870473, + "grad_norm": 2.638113498687744, + "learning_rate": 4.475868809524114e-05, + "loss": 0.4147, + "step": 6298 + }, + { + "epoch": 0.34577387486278816, + "grad_norm": 2.2687885761260986, + "learning_rate": 4.4755479787990825e-05, + "loss": 0.4625, + "step": 6300 + }, + { + "epoch": 0.3458836443468716, + "grad_norm": 1.73666250705719, + "learning_rate": 4.4752270614168214e-05, + "loss": 0.351, + "step": 6302 + }, + { + "epoch": 0.345993413830955, + "grad_norm": 1.9557325839996338, + "learning_rate": 4.474906057391406e-05, + "loss": 0.2864, + "step": 6304 + }, + { + "epoch": 0.3461031833150384, + "grad_norm": 1.3597663640975952, + "learning_rate": 4.474584966736916e-05, + "loss": 0.4175, + "step": 6306 + }, + { + "epoch": 0.34621295279912184, + "grad_norm": 3.2001402378082275, + "learning_rate": 4.47426378946744e-05, + "loss": 0.3086, + "step": 6308 + }, + { + "epoch": 0.34632272228320526, + "grad_norm": 1.3115712404251099, + "learning_rate": 4.473942525597062e-05, + "loss": 0.2565, + "step": 6310 + }, + { + "epoch": 0.3464324917672887, + "grad_norm": 2.647202730178833, + "learning_rate": 4.473621175139877e-05, + "loss": 0.3806, + "step": 6312 + }, + { + "epoch": 0.3465422612513721, + "grad_norm": 1.5884943008422852, + "learning_rate": 4.4732997381099807e-05, + "loss": 0.3077, + "step": 6314 + }, + { + "epoch": 0.34665203073545553, + "grad_norm": 1.918850302696228, + "learning_rate": 4.4729782145214716e-05, + "loss": 0.4463, + "step": 6316 + }, + { + "epoch": 0.34676180021953895, + "grad_norm": 1.8649673461914062, + "learning_rate": 4.472656604388454e-05, + "loss": 0.2848, + "step": 6318 + }, + { + "epoch": 0.3468715697036224, + "grad_norm": 1.8987736701965332, + "learning_rate": 4.472334907725035e-05, + "loss": 0.329, + "step": 6320 + }, + { + "epoch": 0.3469813391877058, + "grad_norm": 2.1078569889068604, + "learning_rate": 4.472013124545327e-05, + "loss": 0.413, + "step": 6322 + }, + { + "epoch": 0.3470911086717892, + "grad_norm": 1.2076135873794556, + "learning_rate": 4.471691254863444e-05, + "loss": 0.2198, + "step": 6324 + }, + { + "epoch": 0.3472008781558727, + "grad_norm": 1.812496304512024, + "learning_rate": 4.471369298693505e-05, + "loss": 0.2909, + "step": 6326 + }, + { + "epoch": 0.3473106476399561, + "grad_norm": 2.9403157234191895, + "learning_rate": 4.471047256049632e-05, + "loss": 0.3216, + "step": 6328 + }, + { + "epoch": 0.34742041712403954, + "grad_norm": 1.39503014087677, + "learning_rate": 4.470725126945953e-05, + "loss": 0.2549, + "step": 6330 + }, + { + "epoch": 0.34753018660812296, + "grad_norm": 2.6846261024475098, + "learning_rate": 4.4704029113965956e-05, + "loss": 0.4032, + "step": 6332 + }, + { + "epoch": 0.3476399560922064, + "grad_norm": 1.732485055923462, + "learning_rate": 4.4700806094156955e-05, + "loss": 0.4637, + "step": 6334 + }, + { + "epoch": 0.3477497255762898, + "grad_norm": 1.4977720975875854, + "learning_rate": 4.469758221017391e-05, + "loss": 0.3771, + "step": 6336 + }, + { + "epoch": 0.3478594950603732, + "grad_norm": 2.430778980255127, + "learning_rate": 4.469435746215822e-05, + "loss": 0.406, + "step": 6338 + }, + { + "epoch": 0.34796926454445665, + "grad_norm": 3.3771557807922363, + "learning_rate": 4.469113185025135e-05, + "loss": 0.3292, + "step": 6340 + }, + { + "epoch": 0.34807903402854007, + "grad_norm": 2.98762845993042, + "learning_rate": 4.468790537459479e-05, + "loss": 0.2838, + "step": 6342 + }, + { + "epoch": 0.3481888035126235, + "grad_norm": 1.251626968383789, + "learning_rate": 4.468467803533006e-05, + "loss": 0.3432, + "step": 6344 + }, + { + "epoch": 0.3482985729967069, + "grad_norm": 2.1305460929870605, + "learning_rate": 4.468144983259873e-05, + "loss": 0.3534, + "step": 6346 + }, + { + "epoch": 0.34840834248079033, + "grad_norm": 2.5244150161743164, + "learning_rate": 4.467822076654242e-05, + "loss": 0.2859, + "step": 6348 + }, + { + "epoch": 0.34851811196487376, + "grad_norm": 1.9378371238708496, + "learning_rate": 4.467499083730275e-05, + "loss": 0.475, + "step": 6350 + }, + { + "epoch": 0.3486278814489572, + "grad_norm": 1.3213738203048706, + "learning_rate": 4.467176004502141e-05, + "loss": 0.3544, + "step": 6352 + }, + { + "epoch": 0.3487376509330406, + "grad_norm": 1.9753730297088623, + "learning_rate": 4.466852838984013e-05, + "loss": 0.2993, + "step": 6354 + }, + { + "epoch": 0.348847420417124, + "grad_norm": 1.566986083984375, + "learning_rate": 4.4665295871900656e-05, + "loss": 0.3474, + "step": 6356 + }, + { + "epoch": 0.34895718990120744, + "grad_norm": 1.0888259410858154, + "learning_rate": 4.4662062491344775e-05, + "loss": 0.2928, + "step": 6358 + }, + { + "epoch": 0.34906695938529086, + "grad_norm": 0.799623966217041, + "learning_rate": 4.4658828248314326e-05, + "loss": 0.293, + "step": 6360 + }, + { + "epoch": 0.3491767288693743, + "grad_norm": 2.8863370418548584, + "learning_rate": 4.465559314295118e-05, + "loss": 0.424, + "step": 6362 + }, + { + "epoch": 0.34928649835345776, + "grad_norm": 1.821646809577942, + "learning_rate": 4.465235717539725e-05, + "loss": 0.4007, + "step": 6364 + }, + { + "epoch": 0.3493962678375412, + "grad_norm": 0.9853725433349609, + "learning_rate": 4.464912034579447e-05, + "loss": 0.3081, + "step": 6366 + }, + { + "epoch": 0.3495060373216246, + "grad_norm": 2.7946512699127197, + "learning_rate": 4.464588265428482e-05, + "loss": 0.4183, + "step": 6368 + }, + { + "epoch": 0.34961580680570803, + "grad_norm": 1.9444077014923096, + "learning_rate": 4.4642644101010345e-05, + "loss": 0.3912, + "step": 6370 + }, + { + "epoch": 0.34972557628979145, + "grad_norm": 1.6569031476974487, + "learning_rate": 4.463940468611308e-05, + "loss": 0.2991, + "step": 6372 + }, + { + "epoch": 0.3498353457738749, + "grad_norm": 1.7672420740127563, + "learning_rate": 4.4636164409735125e-05, + "loss": 0.5042, + "step": 6374 + }, + { + "epoch": 0.3499451152579583, + "grad_norm": 1.3125030994415283, + "learning_rate": 4.463292327201862e-05, + "loss": 0.2912, + "step": 6376 + }, + { + "epoch": 0.3500548847420417, + "grad_norm": 2.522571325302124, + "learning_rate": 4.462968127310574e-05, + "loss": 0.3049, + "step": 6378 + }, + { + "epoch": 0.35016465422612514, + "grad_norm": 1.3451695442199707, + "learning_rate": 4.4626438413138695e-05, + "loss": 0.3694, + "step": 6380 + }, + { + "epoch": 0.35027442371020856, + "grad_norm": 2.9338364601135254, + "learning_rate": 4.4623194692259726e-05, + "loss": 0.3493, + "step": 6382 + }, + { + "epoch": 0.350384193194292, + "grad_norm": 1.476245641708374, + "learning_rate": 4.461995011061112e-05, + "loss": 0.289, + "step": 6384 + }, + { + "epoch": 0.3504939626783754, + "grad_norm": 2.1083245277404785, + "learning_rate": 4.46167046683352e-05, + "loss": 0.4795, + "step": 6386 + }, + { + "epoch": 0.3506037321624588, + "grad_norm": 1.502083420753479, + "learning_rate": 4.461345836557433e-05, + "loss": 0.331, + "step": 6388 + }, + { + "epoch": 0.35071350164654225, + "grad_norm": 3.0905399322509766, + "learning_rate": 4.461021120247091e-05, + "loss": 0.3561, + "step": 6390 + }, + { + "epoch": 0.35082327113062567, + "grad_norm": 2.249783515930176, + "learning_rate": 4.460696317916737e-05, + "loss": 0.4282, + "step": 6392 + }, + { + "epoch": 0.3509330406147091, + "grad_norm": 1.1806585788726807, + "learning_rate": 4.46037142958062e-05, + "loss": 0.2833, + "step": 6394 + }, + { + "epoch": 0.3510428100987925, + "grad_norm": 2.0121991634368896, + "learning_rate": 4.460046455252989e-05, + "loss": 0.3621, + "step": 6396 + }, + { + "epoch": 0.35115257958287593, + "grad_norm": 2.0166893005371094, + "learning_rate": 4.4597213949480995e-05, + "loss": 0.4921, + "step": 6398 + }, + { + "epoch": 0.3512623490669594, + "grad_norm": 2.342435121536255, + "learning_rate": 4.4593962486802116e-05, + "loss": 0.3609, + "step": 6400 + }, + { + "epoch": 0.35137211855104283, + "grad_norm": 2.0374886989593506, + "learning_rate": 4.459071016463587e-05, + "loss": 0.3949, + "step": 6402 + }, + { + "epoch": 0.35148188803512626, + "grad_norm": 1.939738392829895, + "learning_rate": 4.458745698312492e-05, + "loss": 0.2862, + "step": 6404 + }, + { + "epoch": 0.3515916575192097, + "grad_norm": 2.310133934020996, + "learning_rate": 4.458420294241196e-05, + "loss": 0.3196, + "step": 6406 + }, + { + "epoch": 0.3517014270032931, + "grad_norm": 1.4627829790115356, + "learning_rate": 4.458094804263974e-05, + "loss": 0.3307, + "step": 6408 + }, + { + "epoch": 0.3518111964873765, + "grad_norm": 2.312147855758667, + "learning_rate": 4.4577692283951014e-05, + "loss": 0.3357, + "step": 6410 + }, + { + "epoch": 0.35192096597145994, + "grad_norm": 2.508704662322998, + "learning_rate": 4.457443566648862e-05, + "loss": 0.5225, + "step": 6412 + }, + { + "epoch": 0.35203073545554336, + "grad_norm": 3.3970694541931152, + "learning_rate": 4.457117819039541e-05, + "loss": 0.3605, + "step": 6414 + }, + { + "epoch": 0.3521405049396268, + "grad_norm": 1.572051763534546, + "learning_rate": 4.456791985581426e-05, + "loss": 0.2943, + "step": 6416 + }, + { + "epoch": 0.3522502744237102, + "grad_norm": 2.1088225841522217, + "learning_rate": 4.4564660662888094e-05, + "loss": 0.3409, + "step": 6418 + }, + { + "epoch": 0.35236004390779363, + "grad_norm": 2.418727397918701, + "learning_rate": 4.456140061175989e-05, + "loss": 0.4047, + "step": 6420 + }, + { + "epoch": 0.35246981339187705, + "grad_norm": 2.7876343727111816, + "learning_rate": 4.455813970257264e-05, + "loss": 0.2626, + "step": 6422 + }, + { + "epoch": 0.3525795828759605, + "grad_norm": 1.1519627571105957, + "learning_rate": 4.455487793546939e-05, + "loss": 0.2714, + "step": 6424 + }, + { + "epoch": 0.3526893523600439, + "grad_norm": 1.7364413738250732, + "learning_rate": 4.4551615310593195e-05, + "loss": 0.3491, + "step": 6426 + }, + { + "epoch": 0.3527991218441273, + "grad_norm": 1.3367091417312622, + "learning_rate": 4.454835182808721e-05, + "loss": 0.2682, + "step": 6428 + }, + { + "epoch": 0.35290889132821074, + "grad_norm": 1.2389658689498901, + "learning_rate": 4.454508748809455e-05, + "loss": 0.3319, + "step": 6430 + }, + { + "epoch": 0.35301866081229416, + "grad_norm": 2.0277817249298096, + "learning_rate": 4.454182229075842e-05, + "loss": 0.386, + "step": 6432 + }, + { + "epoch": 0.3531284302963776, + "grad_norm": 1.5673171281814575, + "learning_rate": 4.453855623622205e-05, + "loss": 0.3455, + "step": 6434 + }, + { + "epoch": 0.353238199780461, + "grad_norm": 1.7606981992721558, + "learning_rate": 4.45352893246287e-05, + "loss": 0.3829, + "step": 6436 + }, + { + "epoch": 0.3533479692645445, + "grad_norm": 1.5264078378677368, + "learning_rate": 4.453202155612168e-05, + "loss": 0.2808, + "step": 6438 + }, + { + "epoch": 0.3534577387486279, + "grad_norm": 2.1834123134613037, + "learning_rate": 4.452875293084433e-05, + "loss": 0.3436, + "step": 6440 + }, + { + "epoch": 0.3535675082327113, + "grad_norm": 1.1772949695587158, + "learning_rate": 4.452548344894002e-05, + "loss": 0.2855, + "step": 6442 + }, + { + "epoch": 0.35367727771679475, + "grad_norm": 2.4893689155578613, + "learning_rate": 4.4522213110552166e-05, + "loss": 0.3114, + "step": 6444 + }, + { + "epoch": 0.35378704720087817, + "grad_norm": 1.8599752187728882, + "learning_rate": 4.451894191582423e-05, + "loss": 0.5091, + "step": 6446 + }, + { + "epoch": 0.3538968166849616, + "grad_norm": 1.8630733489990234, + "learning_rate": 4.451566986489969e-05, + "loss": 0.4001, + "step": 6448 + }, + { + "epoch": 0.354006586169045, + "grad_norm": 1.3179906606674194, + "learning_rate": 4.451239695792209e-05, + "loss": 0.2971, + "step": 6450 + }, + { + "epoch": 0.35411635565312843, + "grad_norm": 2.3322184085845947, + "learning_rate": 4.4509123195034974e-05, + "loss": 0.3088, + "step": 6452 + }, + { + "epoch": 0.35422612513721186, + "grad_norm": 2.6914350986480713, + "learning_rate": 4.450584857638197e-05, + "loss": 0.5148, + "step": 6454 + }, + { + "epoch": 0.3543358946212953, + "grad_norm": 2.7133305072784424, + "learning_rate": 4.45025731021067e-05, + "loss": 0.2767, + "step": 6456 + }, + { + "epoch": 0.3544456641053787, + "grad_norm": 2.761319637298584, + "learning_rate": 4.4499296772352856e-05, + "loss": 0.3474, + "step": 6458 + }, + { + "epoch": 0.3545554335894621, + "grad_norm": 2.131495952606201, + "learning_rate": 4.4496019587264145e-05, + "loss": 0.4418, + "step": 6460 + }, + { + "epoch": 0.35466520307354554, + "grad_norm": 1.3824834823608398, + "learning_rate": 4.449274154698432e-05, + "loss": 0.386, + "step": 6462 + }, + { + "epoch": 0.35477497255762896, + "grad_norm": 2.3911588191986084, + "learning_rate": 4.448946265165718e-05, + "loss": 0.292, + "step": 6464 + }, + { + "epoch": 0.3548847420417124, + "grad_norm": 2.2914459705352783, + "learning_rate": 4.448618290142654e-05, + "loss": 0.3809, + "step": 6466 + }, + { + "epoch": 0.3549945115257958, + "grad_norm": 2.713111400604248, + "learning_rate": 4.448290229643628e-05, + "loss": 0.3092, + "step": 6468 + }, + { + "epoch": 0.35510428100987923, + "grad_norm": 1.6738253831863403, + "learning_rate": 4.4479620836830295e-05, + "loss": 0.3333, + "step": 6470 + }, + { + "epoch": 0.35521405049396265, + "grad_norm": 1.8141306638717651, + "learning_rate": 4.4476338522752526e-05, + "loss": 0.378, + "step": 6472 + }, + { + "epoch": 0.35532381997804613, + "grad_norm": 1.9256813526153564, + "learning_rate": 4.447305535434696e-05, + "loss": 0.3102, + "step": 6474 + }, + { + "epoch": 0.35543358946212955, + "grad_norm": 1.769178032875061, + "learning_rate": 4.4469771331757604e-05, + "loss": 0.5182, + "step": 6476 + }, + { + "epoch": 0.355543358946213, + "grad_norm": 1.779021978378296, + "learning_rate": 4.4466486455128516e-05, + "loss": 0.3835, + "step": 6478 + }, + { + "epoch": 0.3556531284302964, + "grad_norm": 4.129996299743652, + "learning_rate": 4.446320072460378e-05, + "loss": 0.3611, + "step": 6480 + }, + { + "epoch": 0.3557628979143798, + "grad_norm": 2.988287925720215, + "learning_rate": 4.445991414032754e-05, + "loss": 0.4922, + "step": 6482 + }, + { + "epoch": 0.35587266739846324, + "grad_norm": 3.2447690963745117, + "learning_rate": 4.445662670244394e-05, + "loss": 0.3708, + "step": 6484 + }, + { + "epoch": 0.35598243688254666, + "grad_norm": 2.4939920902252197, + "learning_rate": 4.4453338411097196e-05, + "loss": 0.2697, + "step": 6486 + }, + { + "epoch": 0.3560922063666301, + "grad_norm": 1.2998415231704712, + "learning_rate": 4.445004926643155e-05, + "loss": 0.3349, + "step": 6488 + }, + { + "epoch": 0.3562019758507135, + "grad_norm": 2.6774723529815674, + "learning_rate": 4.444675926859128e-05, + "loss": 0.2669, + "step": 6490 + }, + { + "epoch": 0.3563117453347969, + "grad_norm": 1.9337279796600342, + "learning_rate": 4.444346841772068e-05, + "loss": 0.3055, + "step": 6492 + }, + { + "epoch": 0.35642151481888035, + "grad_norm": 2.2823829650878906, + "learning_rate": 4.444017671396414e-05, + "loss": 0.4433, + "step": 6494 + }, + { + "epoch": 0.35653128430296377, + "grad_norm": 3.356008291244507, + "learning_rate": 4.4436884157466025e-05, + "loss": 0.3968, + "step": 6496 + }, + { + "epoch": 0.3566410537870472, + "grad_norm": 2.1480698585510254, + "learning_rate": 4.443359074837077e-05, + "loss": 0.2903, + "step": 6498 + }, + { + "epoch": 0.3567508232711306, + "grad_norm": 2.763657569885254, + "learning_rate": 4.443029648682284e-05, + "loss": 0.441, + "step": 6500 + }, + { + "epoch": 0.35686059275521403, + "grad_norm": 2.645225763320923, + "learning_rate": 4.4427001372966736e-05, + "loss": 0.4856, + "step": 6502 + }, + { + "epoch": 0.35697036223929746, + "grad_norm": 1.0411440134048462, + "learning_rate": 4.442370540694699e-05, + "loss": 0.344, + "step": 6504 + }, + { + "epoch": 0.3570801317233809, + "grad_norm": 1.8054146766662598, + "learning_rate": 4.44204085889082e-05, + "loss": 0.3549, + "step": 6506 + }, + { + "epoch": 0.3571899012074643, + "grad_norm": 2.123670816421509, + "learning_rate": 4.441711091899496e-05, + "loss": 0.4015, + "step": 6508 + }, + { + "epoch": 0.3572996706915478, + "grad_norm": 1.4355716705322266, + "learning_rate": 4.441381239735194e-05, + "loss": 0.374, + "step": 6510 + }, + { + "epoch": 0.3574094401756312, + "grad_norm": 2.12677001953125, + "learning_rate": 4.441051302412381e-05, + "loss": 0.3874, + "step": 6512 + }, + { + "epoch": 0.3575192096597146, + "grad_norm": 1.5566542148590088, + "learning_rate": 4.4407212799455313e-05, + "loss": 0.3134, + "step": 6514 + }, + { + "epoch": 0.35762897914379804, + "grad_norm": 2.8016836643218994, + "learning_rate": 4.44039117234912e-05, + "loss": 0.4728, + "step": 6516 + }, + { + "epoch": 0.35773874862788146, + "grad_norm": 2.240091323852539, + "learning_rate": 4.440060979637628e-05, + "loss": 0.3545, + "step": 6518 + }, + { + "epoch": 0.3578485181119649, + "grad_norm": 2.4489145278930664, + "learning_rate": 4.43973070182554e-05, + "loss": 0.3665, + "step": 6520 + }, + { + "epoch": 0.3579582875960483, + "grad_norm": 1.8581966161727905, + "learning_rate": 4.4394003389273415e-05, + "loss": 0.3675, + "step": 6522 + }, + { + "epoch": 0.35806805708013173, + "grad_norm": 1.3462873697280884, + "learning_rate": 4.4390698909575254e-05, + "loss": 0.3192, + "step": 6524 + }, + { + "epoch": 0.35817782656421515, + "grad_norm": 2.064490795135498, + "learning_rate": 4.4387393579305865e-05, + "loss": 0.4579, + "step": 6526 + }, + { + "epoch": 0.3582875960482986, + "grad_norm": 1.3941328525543213, + "learning_rate": 4.438408739861023e-05, + "loss": 0.2497, + "step": 6528 + }, + { + "epoch": 0.358397365532382, + "grad_norm": 1.8542602062225342, + "learning_rate": 4.4380780367633386e-05, + "loss": 0.3812, + "step": 6530 + }, + { + "epoch": 0.3585071350164654, + "grad_norm": 1.4028321504592896, + "learning_rate": 4.437747248652038e-05, + "loss": 0.3917, + "step": 6532 + }, + { + "epoch": 0.35861690450054884, + "grad_norm": 2.3571462631225586, + "learning_rate": 4.437416375541633e-05, + "loss": 0.4068, + "step": 6534 + }, + { + "epoch": 0.35872667398463226, + "grad_norm": 5.078784465789795, + "learning_rate": 4.437085417446636e-05, + "loss": 0.472, + "step": 6536 + }, + { + "epoch": 0.3588364434687157, + "grad_norm": 1.0798213481903076, + "learning_rate": 4.436754374381564e-05, + "loss": 0.3748, + "step": 6538 + }, + { + "epoch": 0.3589462129527991, + "grad_norm": 1.5372111797332764, + "learning_rate": 4.43642324636094e-05, + "loss": 0.3394, + "step": 6540 + }, + { + "epoch": 0.3590559824368825, + "grad_norm": 1.6680786609649658, + "learning_rate": 4.436092033399288e-05, + "loss": 0.2915, + "step": 6542 + }, + { + "epoch": 0.35916575192096595, + "grad_norm": 2.133972406387329, + "learning_rate": 4.435760735511136e-05, + "loss": 0.3773, + "step": 6544 + }, + { + "epoch": 0.35927552140504937, + "grad_norm": 1.256148338317871, + "learning_rate": 4.4354293527110166e-05, + "loss": 0.3082, + "step": 6546 + }, + { + "epoch": 0.35938529088913285, + "grad_norm": 2.518049955368042, + "learning_rate": 4.435097885013467e-05, + "loss": 0.3174, + "step": 6548 + }, + { + "epoch": 0.35949506037321627, + "grad_norm": 1.5609753131866455, + "learning_rate": 4.434766332433026e-05, + "loss": 0.3727, + "step": 6550 + }, + { + "epoch": 0.3596048298572997, + "grad_norm": 1.7466692924499512, + "learning_rate": 4.434434694984237e-05, + "loss": 0.4221, + "step": 6552 + }, + { + "epoch": 0.3597145993413831, + "grad_norm": 2.477464199066162, + "learning_rate": 4.4341029726816475e-05, + "loss": 0.2738, + "step": 6554 + }, + { + "epoch": 0.35982436882546653, + "grad_norm": 1.4153783321380615, + "learning_rate": 4.433771165539808e-05, + "loss": 0.2435, + "step": 6556 + }, + { + "epoch": 0.35993413830954996, + "grad_norm": 1.412514090538025, + "learning_rate": 4.433439273573274e-05, + "loss": 0.3397, + "step": 6558 + }, + { + "epoch": 0.3600439077936334, + "grad_norm": 1.3139595985412598, + "learning_rate": 4.433107296796603e-05, + "loss": 0.3191, + "step": 6560 + }, + { + "epoch": 0.3601536772777168, + "grad_norm": 1.6589447259902954, + "learning_rate": 4.432775235224359e-05, + "loss": 0.4119, + "step": 6562 + }, + { + "epoch": 0.3602634467618002, + "grad_norm": 1.5935890674591064, + "learning_rate": 4.432443088871107e-05, + "loss": 0.4021, + "step": 6564 + }, + { + "epoch": 0.36037321624588364, + "grad_norm": 2.1166484355926514, + "learning_rate": 4.4321108577514145e-05, + "loss": 0.3938, + "step": 6566 + }, + { + "epoch": 0.36048298572996706, + "grad_norm": 4.089608192443848, + "learning_rate": 4.431778541879857e-05, + "loss": 0.2629, + "step": 6568 + }, + { + "epoch": 0.3605927552140505, + "grad_norm": 1.5465751886367798, + "learning_rate": 4.431446141271011e-05, + "loss": 0.551, + "step": 6570 + }, + { + "epoch": 0.3607025246981339, + "grad_norm": 2.115731954574585, + "learning_rate": 4.4311136559394574e-05, + "loss": 0.3174, + "step": 6572 + }, + { + "epoch": 0.36081229418221733, + "grad_norm": 1.5934311151504517, + "learning_rate": 4.43078108589978e-05, + "loss": 0.2935, + "step": 6574 + }, + { + "epoch": 0.36092206366630075, + "grad_norm": 2.8315672874450684, + "learning_rate": 4.430448431166567e-05, + "loss": 0.3193, + "step": 6576 + }, + { + "epoch": 0.3610318331503842, + "grad_norm": 1.485571026802063, + "learning_rate": 4.430115691754412e-05, + "loss": 0.3342, + "step": 6578 + }, + { + "epoch": 0.3611416026344676, + "grad_norm": 1.8536274433135986, + "learning_rate": 4.429782867677908e-05, + "loss": 0.4033, + "step": 6580 + }, + { + "epoch": 0.361251372118551, + "grad_norm": 2.037135124206543, + "learning_rate": 4.429449958951656e-05, + "loss": 0.281, + "step": 6582 + }, + { + "epoch": 0.3613611416026345, + "grad_norm": 2.804363489151001, + "learning_rate": 4.4291169655902574e-05, + "loss": 0.3297, + "step": 6584 + }, + { + "epoch": 0.3614709110867179, + "grad_norm": 1.173848032951355, + "learning_rate": 4.428783887608321e-05, + "loss": 0.2667, + "step": 6586 + }, + { + "epoch": 0.36158068057080134, + "grad_norm": 1.750144600868225, + "learning_rate": 4.4284507250204566e-05, + "loss": 0.2368, + "step": 6588 + }, + { + "epoch": 0.36169045005488476, + "grad_norm": 1.587090253829956, + "learning_rate": 4.428117477841277e-05, + "loss": 0.2558, + "step": 6590 + }, + { + "epoch": 0.3618002195389682, + "grad_norm": 1.4268505573272705, + "learning_rate": 4.427784146085402e-05, + "loss": 0.2748, + "step": 6592 + }, + { + "epoch": 0.3619099890230516, + "grad_norm": 1.3130604028701782, + "learning_rate": 4.4274507297674513e-05, + "loss": 0.3318, + "step": 6594 + }, + { + "epoch": 0.362019758507135, + "grad_norm": 1.4703688621520996, + "learning_rate": 4.4271172289020524e-05, + "loss": 0.3971, + "step": 6596 + }, + { + "epoch": 0.36212952799121845, + "grad_norm": 2.1529195308685303, + "learning_rate": 4.4267836435038326e-05, + "loss": 0.4123, + "step": 6598 + }, + { + "epoch": 0.36223929747530187, + "grad_norm": 1.7184844017028809, + "learning_rate": 4.4264499735874245e-05, + "loss": 0.4012, + "step": 6600 + }, + { + "epoch": 0.3623490669593853, + "grad_norm": 1.5183699131011963, + "learning_rate": 4.4261162191674644e-05, + "loss": 0.2399, + "step": 6602 + }, + { + "epoch": 0.3624588364434687, + "grad_norm": 2.5965425968170166, + "learning_rate": 4.425782380258594e-05, + "loss": 0.4385, + "step": 6604 + }, + { + "epoch": 0.36256860592755213, + "grad_norm": 1.102054238319397, + "learning_rate": 4.4254484568754554e-05, + "loss": 0.3122, + "step": 6606 + }, + { + "epoch": 0.36267837541163556, + "grad_norm": 2.3679258823394775, + "learning_rate": 4.425114449032697e-05, + "loss": 0.561, + "step": 6608 + }, + { + "epoch": 0.362788144895719, + "grad_norm": 2.175365686416626, + "learning_rate": 4.424780356744971e-05, + "loss": 0.2988, + "step": 6610 + }, + { + "epoch": 0.3628979143798024, + "grad_norm": 1.8241618871688843, + "learning_rate": 4.42444618002693e-05, + "loss": 0.3238, + "step": 6612 + }, + { + "epoch": 0.3630076838638858, + "grad_norm": 1.594374179840088, + "learning_rate": 4.4241119188932336e-05, + "loss": 0.4705, + "step": 6614 + }, + { + "epoch": 0.36311745334796924, + "grad_norm": 1.941961646080017, + "learning_rate": 4.423777573358545e-05, + "loss": 0.5068, + "step": 6616 + }, + { + "epoch": 0.36322722283205267, + "grad_norm": 1.2006685733795166, + "learning_rate": 4.4234431434375296e-05, + "loss": 0.2923, + "step": 6618 + }, + { + "epoch": 0.3633369923161361, + "grad_norm": 1.203403115272522, + "learning_rate": 4.4231086291448566e-05, + "loss": 0.3075, + "step": 6620 + }, + { + "epoch": 0.36344676180021956, + "grad_norm": 1.3676036596298218, + "learning_rate": 4.4227740304952006e-05, + "loss": 0.2692, + "step": 6622 + }, + { + "epoch": 0.363556531284303, + "grad_norm": 2.4291324615478516, + "learning_rate": 4.422439347503238e-05, + "loss": 0.4055, + "step": 6624 + }, + { + "epoch": 0.3636663007683864, + "grad_norm": 2.0787158012390137, + "learning_rate": 4.4221045801836494e-05, + "loss": 0.3838, + "step": 6626 + }, + { + "epoch": 0.36377607025246983, + "grad_norm": 2.091001272201538, + "learning_rate": 4.421769728551119e-05, + "loss": 0.2444, + "step": 6628 + }, + { + "epoch": 0.36388583973655325, + "grad_norm": 1.6253015995025635, + "learning_rate": 4.421434792620337e-05, + "loss": 0.3472, + "step": 6630 + }, + { + "epoch": 0.3639956092206367, + "grad_norm": 2.3374526500701904, + "learning_rate": 4.421099772405993e-05, + "loss": 0.2872, + "step": 6632 + }, + { + "epoch": 0.3641053787047201, + "grad_norm": 2.7636396884918213, + "learning_rate": 4.4207646679227846e-05, + "loss": 0.3935, + "step": 6634 + }, + { + "epoch": 0.3642151481888035, + "grad_norm": 2.285900115966797, + "learning_rate": 4.4204294791854094e-05, + "loss": 0.3147, + "step": 6636 + }, + { + "epoch": 0.36432491767288694, + "grad_norm": 2.744236946105957, + "learning_rate": 4.420094206208571e-05, + "loss": 0.4464, + "step": 6638 + }, + { + "epoch": 0.36443468715697036, + "grad_norm": 1.991303563117981, + "learning_rate": 4.4197588490069776e-05, + "loss": 0.4877, + "step": 6640 + }, + { + "epoch": 0.3645444566410538, + "grad_norm": 1.189886450767517, + "learning_rate": 4.419423407595338e-05, + "loss": 0.3407, + "step": 6642 + }, + { + "epoch": 0.3646542261251372, + "grad_norm": 1.6632541418075562, + "learning_rate": 4.419087881988366e-05, + "loss": 0.2916, + "step": 6644 + }, + { + "epoch": 0.3647639956092206, + "grad_norm": 1.1954714059829712, + "learning_rate": 4.4187522722007805e-05, + "loss": 0.3757, + "step": 6646 + }, + { + "epoch": 0.36487376509330405, + "grad_norm": 1.6919116973876953, + "learning_rate": 4.418416578247302e-05, + "loss": 0.4193, + "step": 6648 + }, + { + "epoch": 0.36498353457738747, + "grad_norm": 1.7742640972137451, + "learning_rate": 4.418080800142657e-05, + "loss": 0.4229, + "step": 6650 + }, + { + "epoch": 0.3650933040614709, + "grad_norm": 1.3688569068908691, + "learning_rate": 4.4177449379015737e-05, + "loss": 0.3059, + "step": 6652 + }, + { + "epoch": 0.3652030735455543, + "grad_norm": 1.696321725845337, + "learning_rate": 4.417408991538784e-05, + "loss": 0.2973, + "step": 6654 + }, + { + "epoch": 0.36531284302963773, + "grad_norm": 1.2302676439285278, + "learning_rate": 4.417072961069024e-05, + "loss": 0.3933, + "step": 6656 + }, + { + "epoch": 0.3654226125137212, + "grad_norm": 2.0657172203063965, + "learning_rate": 4.416736846507036e-05, + "loss": 0.3522, + "step": 6658 + }, + { + "epoch": 0.36553238199780463, + "grad_norm": 1.3816159963607788, + "learning_rate": 4.4164006478675614e-05, + "loss": 0.3212, + "step": 6660 + }, + { + "epoch": 0.36564215148188806, + "grad_norm": 4.561327934265137, + "learning_rate": 4.416064365165348e-05, + "loss": 0.3573, + "step": 6662 + }, + { + "epoch": 0.3657519209659715, + "grad_norm": 1.579689621925354, + "learning_rate": 4.415727998415147e-05, + "loss": 0.479, + "step": 6664 + }, + { + "epoch": 0.3658616904500549, + "grad_norm": 2.206653118133545, + "learning_rate": 4.4153915476317125e-05, + "loss": 0.3143, + "step": 6666 + }, + { + "epoch": 0.3659714599341383, + "grad_norm": 2.554774284362793, + "learning_rate": 4.415055012829804e-05, + "loss": 0.3986, + "step": 6668 + }, + { + "epoch": 0.36608122941822174, + "grad_norm": 1.6855424642562866, + "learning_rate": 4.414718394024183e-05, + "loss": 0.3215, + "step": 6670 + }, + { + "epoch": 0.36619099890230516, + "grad_norm": 1.6710102558135986, + "learning_rate": 4.414381691229615e-05, + "loss": 0.2995, + "step": 6672 + }, + { + "epoch": 0.3663007683863886, + "grad_norm": 1.9824143648147583, + "learning_rate": 4.41404490446087e-05, + "loss": 0.437, + "step": 6674 + }, + { + "epoch": 0.366410537870472, + "grad_norm": 1.7623497247695923, + "learning_rate": 4.4137080337327205e-05, + "loss": 0.303, + "step": 6676 + }, + { + "epoch": 0.36652030735455543, + "grad_norm": 2.4740898609161377, + "learning_rate": 4.413371079059944e-05, + "loss": 0.4047, + "step": 6678 + }, + { + "epoch": 0.36663007683863885, + "grad_norm": 1.8774124383926392, + "learning_rate": 4.41303404045732e-05, + "loss": 0.4683, + "step": 6680 + }, + { + "epoch": 0.3667398463227223, + "grad_norm": 1.7245914936065674, + "learning_rate": 4.4126969179396335e-05, + "loss": 0.3905, + "step": 6682 + }, + { + "epoch": 0.3668496158068057, + "grad_norm": 2.9703195095062256, + "learning_rate": 4.412359711521673e-05, + "loss": 0.3832, + "step": 6684 + }, + { + "epoch": 0.3669593852908891, + "grad_norm": 2.040583372116089, + "learning_rate": 4.412022421218228e-05, + "loss": 0.1948, + "step": 6686 + }, + { + "epoch": 0.36706915477497254, + "grad_norm": 1.9151432514190674, + "learning_rate": 4.4116850470440955e-05, + "loss": 0.3562, + "step": 6688 + }, + { + "epoch": 0.36717892425905596, + "grad_norm": 1.6606963872909546, + "learning_rate": 4.411347589014073e-05, + "loss": 0.2952, + "step": 6690 + }, + { + "epoch": 0.3672886937431394, + "grad_norm": 1.6172014474868774, + "learning_rate": 4.411010047142964e-05, + "loss": 0.2631, + "step": 6692 + }, + { + "epoch": 0.3673984632272228, + "grad_norm": 2.0872268676757812, + "learning_rate": 4.4106724214455754e-05, + "loss": 0.3645, + "step": 6694 + }, + { + "epoch": 0.3675082327113063, + "grad_norm": 2.278298854827881, + "learning_rate": 4.410334711936715e-05, + "loss": 0.3732, + "step": 6696 + }, + { + "epoch": 0.3676180021953897, + "grad_norm": 1.502493977546692, + "learning_rate": 4.409996918631199e-05, + "loss": 0.3963, + "step": 6698 + }, + { + "epoch": 0.3677277716794731, + "grad_norm": 2.1080245971679688, + "learning_rate": 4.409659041543842e-05, + "loss": 0.2564, + "step": 6700 + }, + { + "epoch": 0.36783754116355655, + "grad_norm": 2.369624376296997, + "learning_rate": 4.409321080689467e-05, + "loss": 0.3142, + "step": 6702 + }, + { + "epoch": 0.36794731064763997, + "grad_norm": 1.1082921028137207, + "learning_rate": 4.4089830360828976e-05, + "loss": 0.204, + "step": 6704 + }, + { + "epoch": 0.3680570801317234, + "grad_norm": 1.4967761039733887, + "learning_rate": 4.408644907738964e-05, + "loss": 0.2634, + "step": 6706 + }, + { + "epoch": 0.3681668496158068, + "grad_norm": 1.4883028268814087, + "learning_rate": 4.4083066956724946e-05, + "loss": 0.3092, + "step": 6708 + }, + { + "epoch": 0.36827661909989023, + "grad_norm": 1.8226351737976074, + "learning_rate": 4.4079683998983276e-05, + "loss": 0.4027, + "step": 6710 + }, + { + "epoch": 0.36838638858397366, + "grad_norm": 2.716966390609741, + "learning_rate": 4.407630020431302e-05, + "loss": 0.4356, + "step": 6712 + }, + { + "epoch": 0.3684961580680571, + "grad_norm": 2.750580310821533, + "learning_rate": 4.4072915572862596e-05, + "loss": 0.2683, + "step": 6714 + }, + { + "epoch": 0.3686059275521405, + "grad_norm": 1.3697038888931274, + "learning_rate": 4.406953010478049e-05, + "loss": 0.4042, + "step": 6716 + }, + { + "epoch": 0.3687156970362239, + "grad_norm": 2.935755491256714, + "learning_rate": 4.4066143800215186e-05, + "loss": 0.2766, + "step": 6718 + }, + { + "epoch": 0.36882546652030734, + "grad_norm": 1.512320876121521, + "learning_rate": 4.406275665931524e-05, + "loss": 0.401, + "step": 6720 + }, + { + "epoch": 0.36893523600439077, + "grad_norm": 2.0303449630737305, + "learning_rate": 4.4059368682229216e-05, + "loss": 0.466, + "step": 6722 + }, + { + "epoch": 0.3690450054884742, + "grad_norm": 2.1034905910491943, + "learning_rate": 4.4055979869105734e-05, + "loss": 0.4369, + "step": 6724 + }, + { + "epoch": 0.3691547749725576, + "grad_norm": 2.164449453353882, + "learning_rate": 4.4052590220093446e-05, + "loss": 0.3997, + "step": 6726 + }, + { + "epoch": 0.36926454445664103, + "grad_norm": 1.1818972826004028, + "learning_rate": 4.404919973534104e-05, + "loss": 0.3116, + "step": 6728 + }, + { + "epoch": 0.36937431394072445, + "grad_norm": 2.1597626209259033, + "learning_rate": 4.4045808414997226e-05, + "loss": 0.2596, + "step": 6730 + }, + { + "epoch": 0.36948408342480793, + "grad_norm": 1.5921685695648193, + "learning_rate": 4.404241625921077e-05, + "loss": 0.3538, + "step": 6732 + }, + { + "epoch": 0.36959385290889135, + "grad_norm": 1.4609589576721191, + "learning_rate": 4.4039023268130474e-05, + "loss": 0.2708, + "step": 6734 + }, + { + "epoch": 0.3697036223929748, + "grad_norm": 1.8147037029266357, + "learning_rate": 4.4035629441905174e-05, + "loss": 0.2733, + "step": 6736 + }, + { + "epoch": 0.3698133918770582, + "grad_norm": 2.117738962173462, + "learning_rate": 4.403223478068373e-05, + "loss": 0.3386, + "step": 6738 + }, + { + "epoch": 0.3699231613611416, + "grad_norm": 1.5615261793136597, + "learning_rate": 4.402883928461505e-05, + "loss": 0.2971, + "step": 6740 + }, + { + "epoch": 0.37003293084522504, + "grad_norm": 3.243530750274658, + "learning_rate": 4.4025442953848086e-05, + "loss": 0.3709, + "step": 6742 + }, + { + "epoch": 0.37014270032930846, + "grad_norm": 3.0915820598602295, + "learning_rate": 4.4022045788531806e-05, + "loss": 0.3594, + "step": 6744 + }, + { + "epoch": 0.3702524698133919, + "grad_norm": 2.080981492996216, + "learning_rate": 4.4018647788815235e-05, + "loss": 0.3055, + "step": 6746 + }, + { + "epoch": 0.3703622392974753, + "grad_norm": 1.773810625076294, + "learning_rate": 4.401524895484743e-05, + "loss": 0.3731, + "step": 6748 + }, + { + "epoch": 0.3704720087815587, + "grad_norm": 1.645552396774292, + "learning_rate": 4.401184928677746e-05, + "loss": 0.3901, + "step": 6750 + }, + { + "epoch": 0.37058177826564215, + "grad_norm": 1.8153647184371948, + "learning_rate": 4.400844878475447e-05, + "loss": 0.4239, + "step": 6752 + }, + { + "epoch": 0.37069154774972557, + "grad_norm": 1.7976208925247192, + "learning_rate": 4.400504744892763e-05, + "loss": 0.3436, + "step": 6754 + }, + { + "epoch": 0.370801317233809, + "grad_norm": 1.8197546005249023, + "learning_rate": 4.4001645279446116e-05, + "loss": 0.3376, + "step": 6756 + }, + { + "epoch": 0.3709110867178924, + "grad_norm": 2.4988720417022705, + "learning_rate": 4.399824227645917e-05, + "loss": 0.4514, + "step": 6758 + }, + { + "epoch": 0.37102085620197583, + "grad_norm": 1.7157279253005981, + "learning_rate": 4.399483844011607e-05, + "loss": 0.3006, + "step": 6760 + }, + { + "epoch": 0.37113062568605926, + "grad_norm": 2.121267557144165, + "learning_rate": 4.3991433770566124e-05, + "loss": 0.2612, + "step": 6762 + }, + { + "epoch": 0.3712403951701427, + "grad_norm": 1.8800967931747437, + "learning_rate": 4.398802826795868e-05, + "loss": 0.4612, + "step": 6764 + }, + { + "epoch": 0.3713501646542261, + "grad_norm": 1.3163204193115234, + "learning_rate": 4.398462193244312e-05, + "loss": 0.2962, + "step": 6766 + }, + { + "epoch": 0.3714599341383096, + "grad_norm": 2.374640703201294, + "learning_rate": 4.3981214764168856e-05, + "loss": 0.4547, + "step": 6768 + }, + { + "epoch": 0.371569703622393, + "grad_norm": 1.8124887943267822, + "learning_rate": 4.397780676328535e-05, + "loss": 0.4355, + "step": 6770 + }, + { + "epoch": 0.3716794731064764, + "grad_norm": 1.4233076572418213, + "learning_rate": 4.397439792994209e-05, + "loss": 0.309, + "step": 6772 + }, + { + "epoch": 0.37178924259055984, + "grad_norm": 2.9687561988830566, + "learning_rate": 4.39709882642886e-05, + "loss": 0.3605, + "step": 6774 + }, + { + "epoch": 0.37189901207464326, + "grad_norm": 4.213799953460693, + "learning_rate": 4.3967577766474455e-05, + "loss": 0.4792, + "step": 6776 + }, + { + "epoch": 0.3720087815587267, + "grad_norm": 2.5986480712890625, + "learning_rate": 4.3964166436649254e-05, + "loss": 0.3632, + "step": 6778 + }, + { + "epoch": 0.3721185510428101, + "grad_norm": 3.3811264038085938, + "learning_rate": 4.3960754274962624e-05, + "loss": 0.2632, + "step": 6780 + }, + { + "epoch": 0.37222832052689353, + "grad_norm": 3.437883138656616, + "learning_rate": 4.395734128156425e-05, + "loss": 0.3173, + "step": 6782 + }, + { + "epoch": 0.37233809001097695, + "grad_norm": 1.7955284118652344, + "learning_rate": 4.395392745660384e-05, + "loss": 0.3798, + "step": 6784 + }, + { + "epoch": 0.3724478594950604, + "grad_norm": 3.2665257453918457, + "learning_rate": 4.3950512800231136e-05, + "loss": 0.3385, + "step": 6786 + }, + { + "epoch": 0.3725576289791438, + "grad_norm": 2.6805880069732666, + "learning_rate": 4.3947097312595934e-05, + "loss": 0.3598, + "step": 6788 + }, + { + "epoch": 0.3726673984632272, + "grad_norm": 2.6998536586761475, + "learning_rate": 4.394368099384803e-05, + "loss": 0.379, + "step": 6790 + }, + { + "epoch": 0.37277716794731064, + "grad_norm": 2.327284574508667, + "learning_rate": 4.3940263844137317e-05, + "loss": 0.3183, + "step": 6792 + }, + { + "epoch": 0.37288693743139406, + "grad_norm": 1.4484586715698242, + "learning_rate": 4.3936845863613664e-05, + "loss": 0.3195, + "step": 6794 + }, + { + "epoch": 0.3729967069154775, + "grad_norm": 2.08255934715271, + "learning_rate": 4.393342705242699e-05, + "loss": 0.3581, + "step": 6796 + }, + { + "epoch": 0.3731064763995609, + "grad_norm": 2.0793979167938232, + "learning_rate": 4.3930007410727286e-05, + "loss": 0.3101, + "step": 6798 + }, + { + "epoch": 0.3732162458836443, + "grad_norm": 1.5870450735092163, + "learning_rate": 4.392658693866454e-05, + "loss": 0.4186, + "step": 6800 + }, + { + "epoch": 0.37332601536772775, + "grad_norm": 1.8471176624298096, + "learning_rate": 4.392316563638879e-05, + "loss": 0.3398, + "step": 6802 + }, + { + "epoch": 0.37343578485181117, + "grad_norm": 2.002993106842041, + "learning_rate": 4.391974350405013e-05, + "loss": 0.3667, + "step": 6804 + }, + { + "epoch": 0.37354555433589465, + "grad_norm": 2.817134141921997, + "learning_rate": 4.391632054179864e-05, + "loss": 0.5443, + "step": 6806 + }, + { + "epoch": 0.37365532381997807, + "grad_norm": 2.040130138397217, + "learning_rate": 4.3912896749784485e-05, + "loss": 0.4021, + "step": 6808 + }, + { + "epoch": 0.3737650933040615, + "grad_norm": 2.197392702102661, + "learning_rate": 4.390947212815786e-05, + "loss": 0.3686, + "step": 6810 + }, + { + "epoch": 0.3738748627881449, + "grad_norm": 2.054661750793457, + "learning_rate": 4.3906046677068965e-05, + "loss": 0.2718, + "step": 6812 + }, + { + "epoch": 0.37398463227222833, + "grad_norm": 2.3386733531951904, + "learning_rate": 4.390262039666807e-05, + "loss": 0.5156, + "step": 6814 + }, + { + "epoch": 0.37409440175631176, + "grad_norm": 1.6450068950653076, + "learning_rate": 4.389919328710546e-05, + "loss": 0.3643, + "step": 6816 + }, + { + "epoch": 0.3742041712403952, + "grad_norm": 2.4167890548706055, + "learning_rate": 4.389576534853147e-05, + "loss": 0.3659, + "step": 6818 + }, + { + "epoch": 0.3743139407244786, + "grad_norm": 1.4051777124404907, + "learning_rate": 4.389233658109647e-05, + "loss": 0.3116, + "step": 6820 + }, + { + "epoch": 0.374423710208562, + "grad_norm": 3.100503444671631, + "learning_rate": 4.388890698495086e-05, + "loss": 0.3701, + "step": 6822 + }, + { + "epoch": 0.37453347969264544, + "grad_norm": 1.9208966493606567, + "learning_rate": 4.3885476560245074e-05, + "loss": 0.3514, + "step": 6824 + }, + { + "epoch": 0.37464324917672887, + "grad_norm": 1.7673033475875854, + "learning_rate": 4.3882045307129594e-05, + "loss": 0.3049, + "step": 6826 + }, + { + "epoch": 0.3747530186608123, + "grad_norm": 1.1555935144424438, + "learning_rate": 4.387861322575493e-05, + "loss": 0.3466, + "step": 6828 + }, + { + "epoch": 0.3748627881448957, + "grad_norm": 2.110043525695801, + "learning_rate": 4.387518031627162e-05, + "loss": 0.3149, + "step": 6830 + }, + { + "epoch": 0.37497255762897913, + "grad_norm": 1.8304097652435303, + "learning_rate": 4.387174657883026e-05, + "loss": 0.3827, + "step": 6832 + }, + { + "epoch": 0.37508232711306255, + "grad_norm": 1.2392441034317017, + "learning_rate": 4.386831201358147e-05, + "loss": 0.3028, + "step": 6834 + }, + { + "epoch": 0.375192096597146, + "grad_norm": 2.0278396606445312, + "learning_rate": 4.3864876620675896e-05, + "loss": 0.3522, + "step": 6836 + }, + { + "epoch": 0.3753018660812294, + "grad_norm": 1.7215709686279297, + "learning_rate": 4.386144040026425e-05, + "loss": 0.3273, + "step": 6838 + }, + { + "epoch": 0.3754116355653128, + "grad_norm": 3.1434128284454346, + "learning_rate": 4.385800335249725e-05, + "loss": 0.4094, + "step": 6840 + }, + { + "epoch": 0.3755214050493963, + "grad_norm": 1.5640748739242554, + "learning_rate": 4.3854565477525655e-05, + "loss": 0.3615, + "step": 6842 + }, + { + "epoch": 0.3756311745334797, + "grad_norm": 1.454296588897705, + "learning_rate": 4.385112677550027e-05, + "loss": 0.3268, + "step": 6844 + }, + { + "epoch": 0.37574094401756314, + "grad_norm": 1.5069221258163452, + "learning_rate": 4.3847687246571956e-05, + "loss": 0.294, + "step": 6846 + }, + { + "epoch": 0.37585071350164656, + "grad_norm": 1.4778636693954468, + "learning_rate": 4.384424689089155e-05, + "loss": 0.4044, + "step": 6848 + }, + { + "epoch": 0.37596048298573, + "grad_norm": 2.393035411834717, + "learning_rate": 4.384080570860999e-05, + "loss": 0.3899, + "step": 6850 + }, + { + "epoch": 0.3760702524698134, + "grad_norm": 1.79377019405365, + "learning_rate": 4.383736369987822e-05, + "loss": 0.3025, + "step": 6852 + }, + { + "epoch": 0.3761800219538968, + "grad_norm": 2.871631383895874, + "learning_rate": 4.383392086484722e-05, + "loss": 0.2475, + "step": 6854 + }, + { + "epoch": 0.37628979143798025, + "grad_norm": 2.2197132110595703, + "learning_rate": 4.3830477203668005e-05, + "loss": 0.3937, + "step": 6856 + }, + { + "epoch": 0.37639956092206367, + "grad_norm": 1.4932997226715088, + "learning_rate": 4.382703271649163e-05, + "loss": 0.3066, + "step": 6858 + }, + { + "epoch": 0.3765093304061471, + "grad_norm": 1.1876462697982788, + "learning_rate": 4.38235874034692e-05, + "loss": 0.3224, + "step": 6860 + }, + { + "epoch": 0.3766190998902305, + "grad_norm": 2.3512752056121826, + "learning_rate": 4.3820141264751826e-05, + "loss": 0.4282, + "step": 6862 + }, + { + "epoch": 0.37672886937431393, + "grad_norm": 2.078582525253296, + "learning_rate": 4.3816694300490685e-05, + "loss": 0.259, + "step": 6864 + }, + { + "epoch": 0.37683863885839736, + "grad_norm": 4.495780944824219, + "learning_rate": 4.381324651083697e-05, + "loss": 0.3314, + "step": 6866 + }, + { + "epoch": 0.3769484083424808, + "grad_norm": 2.1423847675323486, + "learning_rate": 4.380979789594193e-05, + "loss": 0.2913, + "step": 6868 + }, + { + "epoch": 0.3770581778265642, + "grad_norm": 1.322531819343567, + "learning_rate": 4.3806348455956825e-05, + "loss": 0.2478, + "step": 6870 + }, + { + "epoch": 0.3771679473106476, + "grad_norm": 2.103938341140747, + "learning_rate": 4.3802898191032975e-05, + "loss": 0.257, + "step": 6872 + }, + { + "epoch": 0.37727771679473104, + "grad_norm": 1.978139042854309, + "learning_rate": 4.3799447101321723e-05, + "loss": 0.2503, + "step": 6874 + }, + { + "epoch": 0.37738748627881447, + "grad_norm": 2.6775403022766113, + "learning_rate": 4.379599518697444e-05, + "loss": 0.284, + "step": 6876 + }, + { + "epoch": 0.3774972557628979, + "grad_norm": 1.2046157121658325, + "learning_rate": 4.3792542448142547e-05, + "loss": 0.2935, + "step": 6878 + }, + { + "epoch": 0.37760702524698136, + "grad_norm": 3.889631986618042, + "learning_rate": 4.3789088884977514e-05, + "loss": 0.312, + "step": 6880 + }, + { + "epoch": 0.3777167947310648, + "grad_norm": 2.2482943534851074, + "learning_rate": 4.3785634497630816e-05, + "loss": 0.3892, + "step": 6882 + }, + { + "epoch": 0.3778265642151482, + "grad_norm": 1.8426278829574585, + "learning_rate": 4.378217928625399e-05, + "loss": 0.3527, + "step": 6884 + }, + { + "epoch": 0.37793633369923163, + "grad_norm": 1.7001817226409912, + "learning_rate": 4.377872325099858e-05, + "loss": 0.2623, + "step": 6886 + }, + { + "epoch": 0.37804610318331505, + "grad_norm": 1.3066107034683228, + "learning_rate": 4.37752663920162e-05, + "loss": 0.2693, + "step": 6888 + }, + { + "epoch": 0.3781558726673985, + "grad_norm": 1.8795005083084106, + "learning_rate": 4.377180870945849e-05, + "loss": 0.3959, + "step": 6890 + }, + { + "epoch": 0.3782656421514819, + "grad_norm": 1.8544446229934692, + "learning_rate": 4.37683502034771e-05, + "loss": 0.426, + "step": 6892 + }, + { + "epoch": 0.3783754116355653, + "grad_norm": 3.344998836517334, + "learning_rate": 4.376489087422376e-05, + "loss": 0.4567, + "step": 6894 + }, + { + "epoch": 0.37848518111964874, + "grad_norm": 2.1099209785461426, + "learning_rate": 4.3761430721850205e-05, + "loss": 0.3532, + "step": 6896 + }, + { + "epoch": 0.37859495060373216, + "grad_norm": 1.7813183069229126, + "learning_rate": 4.37579697465082e-05, + "loss": 0.396, + "step": 6898 + }, + { + "epoch": 0.3787047200878156, + "grad_norm": 2.3529231548309326, + "learning_rate": 4.375450794834958e-05, + "loss": 0.3001, + "step": 6900 + }, + { + "epoch": 0.378814489571899, + "grad_norm": 1.4901988506317139, + "learning_rate": 4.375104532752619e-05, + "loss": 0.333, + "step": 6902 + }, + { + "epoch": 0.3789242590559824, + "grad_norm": 1.8098708391189575, + "learning_rate": 4.3747581884189913e-05, + "loss": 0.3629, + "step": 6904 + }, + { + "epoch": 0.37903402854006585, + "grad_norm": 2.032824993133545, + "learning_rate": 4.374411761849268e-05, + "loss": 0.3246, + "step": 6906 + }, + { + "epoch": 0.37914379802414927, + "grad_norm": 2.462883472442627, + "learning_rate": 4.374065253058645e-05, + "loss": 0.3456, + "step": 6908 + }, + { + "epoch": 0.3792535675082327, + "grad_norm": 5.522491931915283, + "learning_rate": 4.37371866206232e-05, + "loss": 0.2328, + "step": 6910 + }, + { + "epoch": 0.3793633369923161, + "grad_norm": 2.0067737102508545, + "learning_rate": 4.373371988875499e-05, + "loss": 0.4834, + "step": 6912 + }, + { + "epoch": 0.37947310647639954, + "grad_norm": 4.198498725891113, + "learning_rate": 4.373025233513387e-05, + "loss": 0.2854, + "step": 6914 + }, + { + "epoch": 0.379582875960483, + "grad_norm": 1.527751088142395, + "learning_rate": 4.3726783959911956e-05, + "loss": 0.288, + "step": 6916 + }, + { + "epoch": 0.37969264544456643, + "grad_norm": 2.6436767578125, + "learning_rate": 4.372331476324137e-05, + "loss": 0.4019, + "step": 6918 + }, + { + "epoch": 0.37980241492864986, + "grad_norm": 2.868116617202759, + "learning_rate": 4.371984474527431e-05, + "loss": 0.3525, + "step": 6920 + }, + { + "epoch": 0.3799121844127333, + "grad_norm": 2.0747690200805664, + "learning_rate": 4.371637390616297e-05, + "loss": 0.4293, + "step": 6922 + }, + { + "epoch": 0.3800219538968167, + "grad_norm": 2.4652442932128906, + "learning_rate": 4.371290224605961e-05, + "loss": 0.2964, + "step": 6924 + }, + { + "epoch": 0.3801317233809001, + "grad_norm": 2.305387496948242, + "learning_rate": 4.3709429765116504e-05, + "loss": 0.3604, + "step": 6926 + }, + { + "epoch": 0.38024149286498354, + "grad_norm": 3.717564344406128, + "learning_rate": 4.370595646348598e-05, + "loss": 0.2946, + "step": 6928 + }, + { + "epoch": 0.38035126234906697, + "grad_norm": 3.122875452041626, + "learning_rate": 4.370248234132039e-05, + "loss": 0.4689, + "step": 6930 + }, + { + "epoch": 0.3804610318331504, + "grad_norm": 1.6742208003997803, + "learning_rate": 4.369900739877213e-05, + "loss": 0.3697, + "step": 6932 + }, + { + "epoch": 0.3805708013172338, + "grad_norm": 1.4755542278289795, + "learning_rate": 4.369553163599362e-05, + "loss": 0.4021, + "step": 6934 + }, + { + "epoch": 0.38068057080131723, + "grad_norm": 1.826159954071045, + "learning_rate": 4.369205505313733e-05, + "loss": 0.3467, + "step": 6936 + }, + { + "epoch": 0.38079034028540065, + "grad_norm": 1.8032242059707642, + "learning_rate": 4.3688577650355764e-05, + "loss": 0.3451, + "step": 6938 + }, + { + "epoch": 0.3809001097694841, + "grad_norm": 2.9075026512145996, + "learning_rate": 4.3685099427801454e-05, + "loss": 0.4066, + "step": 6940 + }, + { + "epoch": 0.3810098792535675, + "grad_norm": 2.1799569129943848, + "learning_rate": 4.368162038562696e-05, + "loss": 0.4268, + "step": 6942 + }, + { + "epoch": 0.3811196487376509, + "grad_norm": 3.319582939147949, + "learning_rate": 4.3678140523984916e-05, + "loss": 0.4151, + "step": 6944 + }, + { + "epoch": 0.38122941822173434, + "grad_norm": 1.5719448328018188, + "learning_rate": 4.367465984302794e-05, + "loss": 0.373, + "step": 6946 + }, + { + "epoch": 0.38133918770581776, + "grad_norm": 1.5576294660568237, + "learning_rate": 4.3671178342908735e-05, + "loss": 0.3267, + "step": 6948 + }, + { + "epoch": 0.3814489571899012, + "grad_norm": 1.6317062377929688, + "learning_rate": 4.3667696023779994e-05, + "loss": 0.3156, + "step": 6950 + }, + { + "epoch": 0.3815587266739846, + "grad_norm": 1.9825810194015503, + "learning_rate": 4.366421288579448e-05, + "loss": 0.4079, + "step": 6952 + }, + { + "epoch": 0.3816684961580681, + "grad_norm": 1.93692147731781, + "learning_rate": 4.366072892910498e-05, + "loss": 0.3634, + "step": 6954 + }, + { + "epoch": 0.3817782656421515, + "grad_norm": 1.5452299118041992, + "learning_rate": 4.365724415386432e-05, + "loss": 0.3684, + "step": 6956 + }, + { + "epoch": 0.3818880351262349, + "grad_norm": 2.3162789344787598, + "learning_rate": 4.365375856022535e-05, + "loss": 0.2496, + "step": 6958 + }, + { + "epoch": 0.38199780461031835, + "grad_norm": 2.0358705520629883, + "learning_rate": 4.365027214834098e-05, + "loss": 0.4243, + "step": 6960 + }, + { + "epoch": 0.38210757409440177, + "grad_norm": 2.0502593517303467, + "learning_rate": 4.364678491836413e-05, + "loss": 0.2588, + "step": 6962 + }, + { + "epoch": 0.3822173435784852, + "grad_norm": 1.504127025604248, + "learning_rate": 4.364329687044777e-05, + "loss": 0.3013, + "step": 6964 + }, + { + "epoch": 0.3823271130625686, + "grad_norm": 2.594912528991699, + "learning_rate": 4.36398080047449e-05, + "loss": 0.3784, + "step": 6966 + }, + { + "epoch": 0.38243688254665203, + "grad_norm": 1.4995540380477905, + "learning_rate": 4.363631832140856e-05, + "loss": 0.4594, + "step": 6968 + }, + { + "epoch": 0.38254665203073546, + "grad_norm": 1.79916512966156, + "learning_rate": 4.3632827820591825e-05, + "loss": 0.2806, + "step": 6970 + }, + { + "epoch": 0.3826564215148189, + "grad_norm": 2.8299291133880615, + "learning_rate": 4.362933650244781e-05, + "loss": 0.3722, + "step": 6972 + }, + { + "epoch": 0.3827661909989023, + "grad_norm": 1.4090065956115723, + "learning_rate": 4.3625844367129665e-05, + "loss": 0.3026, + "step": 6974 + }, + { + "epoch": 0.3828759604829857, + "grad_norm": 1.7683827877044678, + "learning_rate": 4.3622351414790554e-05, + "loss": 0.2166, + "step": 6976 + }, + { + "epoch": 0.38298572996706914, + "grad_norm": 1.2072985172271729, + "learning_rate": 4.361885764558371e-05, + "loss": 0.245, + "step": 6978 + }, + { + "epoch": 0.38309549945115257, + "grad_norm": 1.417519211769104, + "learning_rate": 4.3615363059662387e-05, + "loss": 0.3317, + "step": 6980 + }, + { + "epoch": 0.383205268935236, + "grad_norm": 2.9435813426971436, + "learning_rate": 4.361186765717986e-05, + "loss": 0.2599, + "step": 6982 + }, + { + "epoch": 0.3833150384193194, + "grad_norm": 2.0755488872528076, + "learning_rate": 4.3608371438289475e-05, + "loss": 0.2833, + "step": 6984 + }, + { + "epoch": 0.38342480790340283, + "grad_norm": 1.3693913221359253, + "learning_rate": 4.360487440314458e-05, + "loss": 0.4089, + "step": 6986 + }, + { + "epoch": 0.38353457738748625, + "grad_norm": 1.8759344816207886, + "learning_rate": 4.360137655189857e-05, + "loss": 0.3131, + "step": 6988 + }, + { + "epoch": 0.38364434687156973, + "grad_norm": 1.6978245973587036, + "learning_rate": 4.359787788470489e-05, + "loss": 0.396, + "step": 6990 + }, + { + "epoch": 0.38375411635565315, + "grad_norm": 1.316044807434082, + "learning_rate": 4.3594378401717e-05, + "loss": 0.2874, + "step": 6992 + }, + { + "epoch": 0.3838638858397366, + "grad_norm": 1.530831217765808, + "learning_rate": 4.3590878103088405e-05, + "loss": 0.2932, + "step": 6994 + }, + { + "epoch": 0.38397365532382, + "grad_norm": 1.6670317649841309, + "learning_rate": 4.3587376988972653e-05, + "loss": 0.3928, + "step": 6996 + }, + { + "epoch": 0.3840834248079034, + "grad_norm": 3.0372183322906494, + "learning_rate": 4.3583875059523315e-05, + "loss": 0.457, + "step": 6998 + }, + { + "epoch": 0.38419319429198684, + "grad_norm": 2.89052677154541, + "learning_rate": 4.358037231489399e-05, + "loss": 0.5719, + "step": 7000 + }, + { + "epoch": 0.38430296377607026, + "grad_norm": 2.5248618125915527, + "learning_rate": 4.357686875523835e-05, + "loss": 0.3802, + "step": 7002 + }, + { + "epoch": 0.3844127332601537, + "grad_norm": 1.7738367319107056, + "learning_rate": 4.357336438071006e-05, + "loss": 0.32, + "step": 7004 + }, + { + "epoch": 0.3845225027442371, + "grad_norm": 3.810863494873047, + "learning_rate": 4.3569859191462845e-05, + "loss": 0.3559, + "step": 7006 + }, + { + "epoch": 0.3846322722283205, + "grad_norm": 1.2852082252502441, + "learning_rate": 4.356635318765046e-05, + "loss": 0.3, + "step": 7008 + }, + { + "epoch": 0.38474204171240395, + "grad_norm": 1.9520033597946167, + "learning_rate": 4.35628463694267e-05, + "loss": 0.3667, + "step": 7010 + }, + { + "epoch": 0.38485181119648737, + "grad_norm": 1.4640014171600342, + "learning_rate": 4.3559338736945375e-05, + "loss": 0.4526, + "step": 7012 + }, + { + "epoch": 0.3849615806805708, + "grad_norm": 1.7434964179992676, + "learning_rate": 4.355583029036037e-05, + "loss": 0.2846, + "step": 7014 + }, + { + "epoch": 0.3850713501646542, + "grad_norm": 1.5576337575912476, + "learning_rate": 4.355232102982556e-05, + "loss": 0.3596, + "step": 7016 + }, + { + "epoch": 0.38518111964873764, + "grad_norm": 1.6285455226898193, + "learning_rate": 4.35488109554949e-05, + "loss": 0.307, + "step": 7018 + }, + { + "epoch": 0.38529088913282106, + "grad_norm": 2.6630454063415527, + "learning_rate": 4.354530006752234e-05, + "loss": 0.3247, + "step": 7020 + }, + { + "epoch": 0.3854006586169045, + "grad_norm": 1.9492852687835693, + "learning_rate": 4.3541788366061894e-05, + "loss": 0.4558, + "step": 7022 + }, + { + "epoch": 0.3855104281009879, + "grad_norm": 1.2760319709777832, + "learning_rate": 4.353827585126762e-05, + "loss": 0.3638, + "step": 7024 + }, + { + "epoch": 0.3856201975850713, + "grad_norm": 1.3432798385620117, + "learning_rate": 4.353476252329356e-05, + "loss": 0.3502, + "step": 7026 + }, + { + "epoch": 0.3857299670691548, + "grad_norm": 1.6356483697891235, + "learning_rate": 4.353124838229384e-05, + "loss": 0.3693, + "step": 7028 + }, + { + "epoch": 0.3858397365532382, + "grad_norm": 1.635949969291687, + "learning_rate": 4.3527733428422614e-05, + "loss": 0.4482, + "step": 7030 + }, + { + "epoch": 0.38594950603732164, + "grad_norm": 1.3747456073760986, + "learning_rate": 4.352421766183406e-05, + "loss": 0.3481, + "step": 7032 + }, + { + "epoch": 0.38605927552140507, + "grad_norm": 1.7761266231536865, + "learning_rate": 4.352070108268239e-05, + "loss": 0.3172, + "step": 7034 + }, + { + "epoch": 0.3861690450054885, + "grad_norm": 1.5570636987686157, + "learning_rate": 4.3517183691121874e-05, + "loss": 0.389, + "step": 7036 + }, + { + "epoch": 0.3862788144895719, + "grad_norm": 2.2630527019500732, + "learning_rate": 4.351366548730679e-05, + "loss": 0.3918, + "step": 7038 + }, + { + "epoch": 0.38638858397365533, + "grad_norm": 2.8517098426818848, + "learning_rate": 4.351014647139147e-05, + "loss": 0.4064, + "step": 7040 + }, + { + "epoch": 0.38649835345773875, + "grad_norm": 2.236799716949463, + "learning_rate": 4.3506626643530265e-05, + "loss": 0.234, + "step": 7042 + }, + { + "epoch": 0.3866081229418222, + "grad_norm": 2.2903943061828613, + "learning_rate": 4.350310600387759e-05, + "loss": 0.3692, + "step": 7044 + }, + { + "epoch": 0.3867178924259056, + "grad_norm": 2.4909260272979736, + "learning_rate": 4.3499584552587865e-05, + "loss": 0.3285, + "step": 7046 + }, + { + "epoch": 0.386827661909989, + "grad_norm": 1.4437428712844849, + "learning_rate": 4.3496062289815556e-05, + "loss": 0.3345, + "step": 7048 + }, + { + "epoch": 0.38693743139407244, + "grad_norm": 2.0147011280059814, + "learning_rate": 4.349253921571517e-05, + "loss": 0.288, + "step": 7050 + }, + { + "epoch": 0.38704720087815586, + "grad_norm": 3.9263193607330322, + "learning_rate": 4.348901533044125e-05, + "loss": 0.2881, + "step": 7052 + }, + { + "epoch": 0.3871569703622393, + "grad_norm": 1.3922299146652222, + "learning_rate": 4.3485490634148375e-05, + "loss": 0.4127, + "step": 7054 + }, + { + "epoch": 0.3872667398463227, + "grad_norm": 2.909266233444214, + "learning_rate": 4.348196512699114e-05, + "loss": 0.3965, + "step": 7056 + }, + { + "epoch": 0.3873765093304061, + "grad_norm": 2.10323429107666, + "learning_rate": 4.34784388091242e-05, + "loss": 0.2713, + "step": 7058 + }, + { + "epoch": 0.38748627881448955, + "grad_norm": 1.42953360080719, + "learning_rate": 4.3474911680702235e-05, + "loss": 0.2792, + "step": 7060 + }, + { + "epoch": 0.38759604829857297, + "grad_norm": 1.3961962461471558, + "learning_rate": 4.3471383741879965e-05, + "loss": 0.3113, + "step": 7062 + }, + { + "epoch": 0.38770581778265645, + "grad_norm": 1.2368838787078857, + "learning_rate": 4.3467854992812154e-05, + "loss": 0.3399, + "step": 7064 + }, + { + "epoch": 0.38781558726673987, + "grad_norm": 1.7609059810638428, + "learning_rate": 4.3464325433653566e-05, + "loss": 0.2454, + "step": 7066 + }, + { + "epoch": 0.3879253567508233, + "grad_norm": 2.22249174118042, + "learning_rate": 4.346079506455903e-05, + "loss": 0.404, + "step": 7068 + }, + { + "epoch": 0.3880351262349067, + "grad_norm": 1.9410943984985352, + "learning_rate": 4.345726388568342e-05, + "loss": 0.4815, + "step": 7070 + }, + { + "epoch": 0.38814489571899013, + "grad_norm": 1.2974598407745361, + "learning_rate": 4.345373189718161e-05, + "loss": 0.4092, + "step": 7072 + }, + { + "epoch": 0.38825466520307356, + "grad_norm": 1.5719081163406372, + "learning_rate": 4.3450199099208554e-05, + "loss": 0.3992, + "step": 7074 + }, + { + "epoch": 0.388364434687157, + "grad_norm": 1.2639501094818115, + "learning_rate": 4.344666549191921e-05, + "loss": 0.2506, + "step": 7076 + }, + { + "epoch": 0.3884742041712404, + "grad_norm": 1.5234674215316772, + "learning_rate": 4.344313107546857e-05, + "loss": 0.2715, + "step": 7078 + }, + { + "epoch": 0.3885839736553238, + "grad_norm": 1.7129502296447754, + "learning_rate": 4.343959585001168e-05, + "loss": 0.4536, + "step": 7080 + }, + { + "epoch": 0.38869374313940724, + "grad_norm": 1.7456997632980347, + "learning_rate": 4.3436059815703615e-05, + "loss": 0.3427, + "step": 7082 + }, + { + "epoch": 0.38880351262349067, + "grad_norm": 1.3463435173034668, + "learning_rate": 4.343252297269946e-05, + "loss": 0.3316, + "step": 7084 + }, + { + "epoch": 0.3889132821075741, + "grad_norm": 2.5337467193603516, + "learning_rate": 4.342898532115439e-05, + "loss": 0.3218, + "step": 7086 + }, + { + "epoch": 0.3890230515916575, + "grad_norm": 2.170685052871704, + "learning_rate": 4.342544686122356e-05, + "loss": 0.3786, + "step": 7088 + }, + { + "epoch": 0.38913282107574093, + "grad_norm": 1.2269796133041382, + "learning_rate": 4.34219075930622e-05, + "loss": 0.3953, + "step": 7090 + }, + { + "epoch": 0.38924259055982435, + "grad_norm": 1.9804043769836426, + "learning_rate": 4.3418367516825556e-05, + "loss": 0.3296, + "step": 7092 + }, + { + "epoch": 0.3893523600439078, + "grad_norm": 1.546819806098938, + "learning_rate": 4.34148266326689e-05, + "loss": 0.345, + "step": 7094 + }, + { + "epoch": 0.3894621295279912, + "grad_norm": 2.6554884910583496, + "learning_rate": 4.3411284940747566e-05, + "loss": 0.5186, + "step": 7096 + }, + { + "epoch": 0.3895718990120746, + "grad_norm": 2.9768106937408447, + "learning_rate": 4.3407742441216904e-05, + "loss": 0.5558, + "step": 7098 + }, + { + "epoch": 0.3896816684961581, + "grad_norm": 2.034788131713867, + "learning_rate": 4.340419913423232e-05, + "loss": 0.385, + "step": 7100 + }, + { + "epoch": 0.3897914379802415, + "grad_norm": 2.0706894397735596, + "learning_rate": 4.340065501994921e-05, + "loss": 0.3421, + "step": 7102 + }, + { + "epoch": 0.38990120746432494, + "grad_norm": 1.7257450819015503, + "learning_rate": 4.339711009852306e-05, + "loss": 0.4766, + "step": 7104 + }, + { + "epoch": 0.39001097694840836, + "grad_norm": 1.8471473455429077, + "learning_rate": 4.339356437010937e-05, + "loss": 0.3789, + "step": 7106 + }, + { + "epoch": 0.3901207464324918, + "grad_norm": 1.9061880111694336, + "learning_rate": 4.3390017834863675e-05, + "loss": 0.428, + "step": 7108 + }, + { + "epoch": 0.3902305159165752, + "grad_norm": 1.5069500207901, + "learning_rate": 4.338647049294152e-05, + "loss": 0.3433, + "step": 7110 + }, + { + "epoch": 0.3903402854006586, + "grad_norm": 1.6856905221939087, + "learning_rate": 4.338292234449852e-05, + "loss": 0.5172, + "step": 7112 + }, + { + "epoch": 0.39045005488474205, + "grad_norm": 1.3364838361740112, + "learning_rate": 4.337937338969033e-05, + "loss": 0.3841, + "step": 7114 + }, + { + "epoch": 0.39055982436882547, + "grad_norm": 2.6391327381134033, + "learning_rate": 4.33758236286726e-05, + "loss": 0.4995, + "step": 7116 + }, + { + "epoch": 0.3906695938529089, + "grad_norm": 2.4522464275360107, + "learning_rate": 4.337227306160106e-05, + "loss": 0.3001, + "step": 7118 + }, + { + "epoch": 0.3907793633369923, + "grad_norm": 1.4327472448349, + "learning_rate": 4.3368721688631445e-05, + "loss": 0.2676, + "step": 7120 + }, + { + "epoch": 0.39088913282107574, + "grad_norm": 1.3339340686798096, + "learning_rate": 4.336516950991953e-05, + "loss": 0.2774, + "step": 7122 + }, + { + "epoch": 0.39099890230515916, + "grad_norm": 1.7036477327346802, + "learning_rate": 4.336161652562115e-05, + "loss": 0.3167, + "step": 7124 + }, + { + "epoch": 0.3911086717892426, + "grad_norm": 1.6052663326263428, + "learning_rate": 4.335806273589214e-05, + "loss": 0.3405, + "step": 7126 + }, + { + "epoch": 0.391218441273326, + "grad_norm": 1.602963924407959, + "learning_rate": 4.3354508140888394e-05, + "loss": 0.3332, + "step": 7128 + }, + { + "epoch": 0.3913282107574094, + "grad_norm": 2.069967269897461, + "learning_rate": 4.335095274076583e-05, + "loss": 0.362, + "step": 7130 + }, + { + "epoch": 0.39143798024149284, + "grad_norm": 4.235011577606201, + "learning_rate": 4.334739653568041e-05, + "loss": 0.293, + "step": 7132 + }, + { + "epoch": 0.39154774972557627, + "grad_norm": 2.7581722736358643, + "learning_rate": 4.334383952578812e-05, + "loss": 0.295, + "step": 7134 + }, + { + "epoch": 0.3916575192096597, + "grad_norm": 1.9626786708831787, + "learning_rate": 4.334028171124499e-05, + "loss": 0.3511, + "step": 7136 + }, + { + "epoch": 0.39176728869374317, + "grad_norm": 2.2093701362609863, + "learning_rate": 4.333672309220709e-05, + "loss": 0.3519, + "step": 7138 + }, + { + "epoch": 0.3918770581778266, + "grad_norm": 1.6518646478652954, + "learning_rate": 4.333316366883051e-05, + "loss": 0.3285, + "step": 7140 + }, + { + "epoch": 0.39198682766191, + "grad_norm": 3.061817169189453, + "learning_rate": 4.332960344127138e-05, + "loss": 0.4965, + "step": 7142 + }, + { + "epoch": 0.39209659714599343, + "grad_norm": 1.213983416557312, + "learning_rate": 4.332604240968588e-05, + "loss": 0.2836, + "step": 7144 + }, + { + "epoch": 0.39220636663007685, + "grad_norm": 2.5872912406921387, + "learning_rate": 4.332248057423022e-05, + "loss": 0.2611, + "step": 7146 + }, + { + "epoch": 0.3923161361141603, + "grad_norm": 1.6954683065414429, + "learning_rate": 4.331891793506062e-05, + "loss": 0.2724, + "step": 7148 + }, + { + "epoch": 0.3924259055982437, + "grad_norm": 2.3582639694213867, + "learning_rate": 4.331535449233337e-05, + "loss": 0.3634, + "step": 7150 + }, + { + "epoch": 0.3925356750823271, + "grad_norm": 1.3274763822555542, + "learning_rate": 4.331179024620478e-05, + "loss": 0.4223, + "step": 7152 + }, + { + "epoch": 0.39264544456641054, + "grad_norm": 1.8896440267562866, + "learning_rate": 4.330822519683117e-05, + "loss": 0.2856, + "step": 7154 + }, + { + "epoch": 0.39275521405049396, + "grad_norm": 3.2560932636260986, + "learning_rate": 4.330465934436896e-05, + "loss": 0.4179, + "step": 7156 + }, + { + "epoch": 0.3928649835345774, + "grad_norm": 1.4557716846466064, + "learning_rate": 4.330109268897454e-05, + "loss": 0.2936, + "step": 7158 + }, + { + "epoch": 0.3929747530186608, + "grad_norm": 2.452613115310669, + "learning_rate": 4.329752523080437e-05, + "loss": 0.3626, + "step": 7160 + }, + { + "epoch": 0.3930845225027442, + "grad_norm": 1.1408787965774536, + "learning_rate": 4.329395697001494e-05, + "loss": 0.3827, + "step": 7162 + }, + { + "epoch": 0.39319429198682765, + "grad_norm": 1.3577815294265747, + "learning_rate": 4.3290387906762755e-05, + "loss": 0.2667, + "step": 7164 + }, + { + "epoch": 0.39330406147091107, + "grad_norm": 2.3560256958007812, + "learning_rate": 4.328681804120438e-05, + "loss": 0.4651, + "step": 7166 + }, + { + "epoch": 0.3934138309549945, + "grad_norm": 3.506398916244507, + "learning_rate": 4.3283247373496415e-05, + "loss": 0.3768, + "step": 7168 + }, + { + "epoch": 0.3935236004390779, + "grad_norm": 1.6538928747177124, + "learning_rate": 4.327967590379548e-05, + "loss": 0.3476, + "step": 7170 + }, + { + "epoch": 0.39363336992316134, + "grad_norm": 1.6591105461120605, + "learning_rate": 4.3276103632258235e-05, + "loss": 0.3284, + "step": 7172 + }, + { + "epoch": 0.3937431394072448, + "grad_norm": 1.690680980682373, + "learning_rate": 4.3272530559041384e-05, + "loss": 0.2204, + "step": 7174 + }, + { + "epoch": 0.39385290889132823, + "grad_norm": 2.0052270889282227, + "learning_rate": 4.326895668430166e-05, + "loss": 0.4766, + "step": 7176 + }, + { + "epoch": 0.39396267837541166, + "grad_norm": 1.7338272333145142, + "learning_rate": 4.3265382008195814e-05, + "loss": 0.3426, + "step": 7178 + }, + { + "epoch": 0.3940724478594951, + "grad_norm": 2.5364418029785156, + "learning_rate": 4.326180653088067e-05, + "loss": 0.4152, + "step": 7180 + }, + { + "epoch": 0.3941822173435785, + "grad_norm": 2.561950922012329, + "learning_rate": 4.325823025251305e-05, + "loss": 0.3978, + "step": 7182 + }, + { + "epoch": 0.3942919868276619, + "grad_norm": 2.645805835723877, + "learning_rate": 4.325465317324985e-05, + "loss": 0.3062, + "step": 7184 + }, + { + "epoch": 0.39440175631174534, + "grad_norm": 2.7862257957458496, + "learning_rate": 4.325107529324795e-05, + "loss": 0.4161, + "step": 7186 + }, + { + "epoch": 0.39451152579582877, + "grad_norm": 2.1830878257751465, + "learning_rate": 4.324749661266431e-05, + "loss": 0.4193, + "step": 7188 + }, + { + "epoch": 0.3946212952799122, + "grad_norm": 1.7341114282608032, + "learning_rate": 4.32439171316559e-05, + "loss": 0.2944, + "step": 7190 + }, + { + "epoch": 0.3947310647639956, + "grad_norm": 1.9390015602111816, + "learning_rate": 4.324033685037974e-05, + "loss": 0.2688, + "step": 7192 + }, + { + "epoch": 0.39484083424807903, + "grad_norm": 1.5105228424072266, + "learning_rate": 4.323675576899288e-05, + "loss": 0.428, + "step": 7194 + }, + { + "epoch": 0.39495060373216245, + "grad_norm": 2.00042462348938, + "learning_rate": 4.32331738876524e-05, + "loss": 0.2572, + "step": 7196 + }, + { + "epoch": 0.3950603732162459, + "grad_norm": 1.7595399618148804, + "learning_rate": 4.322959120651542e-05, + "loss": 0.3383, + "step": 7198 + }, + { + "epoch": 0.3951701427003293, + "grad_norm": 1.9721094369888306, + "learning_rate": 4.322600772573909e-05, + "loss": 0.4963, + "step": 7200 + }, + { + "epoch": 0.3952799121844127, + "grad_norm": 1.0900721549987793, + "learning_rate": 4.3222423445480605e-05, + "loss": 0.2646, + "step": 7202 + }, + { + "epoch": 0.39538968166849614, + "grad_norm": 1.7146446704864502, + "learning_rate": 4.3218838365897184e-05, + "loss": 0.3238, + "step": 7204 + }, + { + "epoch": 0.39549945115257956, + "grad_norm": 1.379347801208496, + "learning_rate": 4.32152524871461e-05, + "loss": 0.4754, + "step": 7206 + }, + { + "epoch": 0.395609220636663, + "grad_norm": 2.2776851654052734, + "learning_rate": 4.321166580938462e-05, + "loss": 0.4479, + "step": 7208 + }, + { + "epoch": 0.3957189901207464, + "grad_norm": 1.7167357206344604, + "learning_rate": 4.32080783327701e-05, + "loss": 0.3286, + "step": 7210 + }, + { + "epoch": 0.3958287596048299, + "grad_norm": 2.4218037128448486, + "learning_rate": 4.320449005745989e-05, + "loss": 0.3564, + "step": 7212 + }, + { + "epoch": 0.3959385290889133, + "grad_norm": 1.4840348958969116, + "learning_rate": 4.3200900983611394e-05, + "loss": 0.265, + "step": 7214 + }, + { + "epoch": 0.3960482985729967, + "grad_norm": 1.0150549411773682, + "learning_rate": 4.3197311111382045e-05, + "loss": 0.2667, + "step": 7216 + }, + { + "epoch": 0.39615806805708015, + "grad_norm": 1.6647956371307373, + "learning_rate": 4.319372044092931e-05, + "loss": 0.4004, + "step": 7218 + }, + { + "epoch": 0.39626783754116357, + "grad_norm": 1.3846112489700317, + "learning_rate": 4.3190128972410705e-05, + "loss": 0.2713, + "step": 7220 + }, + { + "epoch": 0.396377607025247, + "grad_norm": 2.440682888031006, + "learning_rate": 4.3186536705983756e-05, + "loss": 0.34, + "step": 7222 + }, + { + "epoch": 0.3964873765093304, + "grad_norm": 2.6227004528045654, + "learning_rate": 4.318294364180604e-05, + "loss": 0.5001, + "step": 7224 + }, + { + "epoch": 0.39659714599341384, + "grad_norm": 2.18155837059021, + "learning_rate": 4.317934978003517e-05, + "loss": 0.3592, + "step": 7226 + }, + { + "epoch": 0.39670691547749726, + "grad_norm": 2.4382731914520264, + "learning_rate": 4.3175755120828794e-05, + "loss": 0.4265, + "step": 7228 + }, + { + "epoch": 0.3968166849615807, + "grad_norm": 2.450343132019043, + "learning_rate": 4.317215966434458e-05, + "loss": 0.3731, + "step": 7230 + }, + { + "epoch": 0.3969264544456641, + "grad_norm": 1.4867384433746338, + "learning_rate": 4.316856341074025e-05, + "loss": 0.4074, + "step": 7232 + }, + { + "epoch": 0.3970362239297475, + "grad_norm": 2.5054759979248047, + "learning_rate": 4.316496636017355e-05, + "loss": 0.4193, + "step": 7234 + }, + { + "epoch": 0.39714599341383094, + "grad_norm": 2.1247775554656982, + "learning_rate": 4.316136851280228e-05, + "loss": 0.2621, + "step": 7236 + }, + { + "epoch": 0.39725576289791437, + "grad_norm": 1.9133936166763306, + "learning_rate": 4.3157769868784236e-05, + "loss": 0.4384, + "step": 7238 + }, + { + "epoch": 0.3973655323819978, + "grad_norm": 1.2418276071548462, + "learning_rate": 4.315417042827728e-05, + "loss": 0.3483, + "step": 7240 + }, + { + "epoch": 0.3974753018660812, + "grad_norm": 2.235707998275757, + "learning_rate": 4.31505701914393e-05, + "loss": 0.3598, + "step": 7242 + }, + { + "epoch": 0.39758507135016463, + "grad_norm": 1.7483556270599365, + "learning_rate": 4.314696915842823e-05, + "loss": 0.2954, + "step": 7244 + }, + { + "epoch": 0.39769484083424805, + "grad_norm": 1.3424301147460938, + "learning_rate": 4.314336732940202e-05, + "loss": 0.327, + "step": 7246 + }, + { + "epoch": 0.39780461031833153, + "grad_norm": 1.525708556175232, + "learning_rate": 4.313976470451867e-05, + "loss": 0.3508, + "step": 7248 + }, + { + "epoch": 0.39791437980241495, + "grad_norm": 2.3810629844665527, + "learning_rate": 4.313616128393621e-05, + "loss": 0.3184, + "step": 7250 + }, + { + "epoch": 0.3980241492864984, + "grad_norm": 1.2531286478042603, + "learning_rate": 4.313255706781268e-05, + "loss": 0.3424, + "step": 7252 + }, + { + "epoch": 0.3981339187705818, + "grad_norm": 1.1966544389724731, + "learning_rate": 4.312895205630621e-05, + "loss": 0.2909, + "step": 7254 + }, + { + "epoch": 0.3982436882546652, + "grad_norm": 2.7529008388519287, + "learning_rate": 4.3125346249574915e-05, + "loss": 0.3496, + "step": 7256 + }, + { + "epoch": 0.39835345773874864, + "grad_norm": 1.9575462341308594, + "learning_rate": 4.312173964777697e-05, + "loss": 0.3923, + "step": 7258 + }, + { + "epoch": 0.39846322722283206, + "grad_norm": 1.2455494403839111, + "learning_rate": 4.3118132251070585e-05, + "loss": 0.2275, + "step": 7260 + }, + { + "epoch": 0.3985729967069155, + "grad_norm": 1.1132760047912598, + "learning_rate": 4.311452405961398e-05, + "loss": 0.2232, + "step": 7262 + }, + { + "epoch": 0.3986827661909989, + "grad_norm": 1.171661615371704, + "learning_rate": 4.3110915073565444e-05, + "loss": 0.3017, + "step": 7264 + }, + { + "epoch": 0.3987925356750823, + "grad_norm": 2.563126802444458, + "learning_rate": 4.310730529308328e-05, + "loss": 0.348, + "step": 7266 + }, + { + "epoch": 0.39890230515916575, + "grad_norm": 1.0029234886169434, + "learning_rate": 4.3103694718325826e-05, + "loss": 0.2445, + "step": 7268 + }, + { + "epoch": 0.39901207464324917, + "grad_norm": 2.710447072982788, + "learning_rate": 4.310008334945147e-05, + "loss": 0.3495, + "step": 7270 + }, + { + "epoch": 0.3991218441273326, + "grad_norm": 1.563930869102478, + "learning_rate": 4.3096471186618615e-05, + "loss": 0.373, + "step": 7272 + }, + { + "epoch": 0.399231613611416, + "grad_norm": 1.999718427658081, + "learning_rate": 4.309285822998571e-05, + "loss": 0.4279, + "step": 7274 + }, + { + "epoch": 0.39934138309549944, + "grad_norm": 3.8929996490478516, + "learning_rate": 4.3089244479711236e-05, + "loss": 0.4847, + "step": 7276 + }, + { + "epoch": 0.39945115257958286, + "grad_norm": 2.0028867721557617, + "learning_rate": 4.308562993595371e-05, + "loss": 0.3146, + "step": 7278 + }, + { + "epoch": 0.3995609220636663, + "grad_norm": 1.7225685119628906, + "learning_rate": 4.308201459887169e-05, + "loss": 0.461, + "step": 7280 + }, + { + "epoch": 0.3996706915477497, + "grad_norm": 2.125669479370117, + "learning_rate": 4.307839846862376e-05, + "loss": 0.2931, + "step": 7282 + }, + { + "epoch": 0.3997804610318331, + "grad_norm": 1.4570298194885254, + "learning_rate": 4.307478154536854e-05, + "loss": 0.415, + "step": 7284 + }, + { + "epoch": 0.3998902305159166, + "grad_norm": 1.5333250761032104, + "learning_rate": 4.307116382926468e-05, + "loss": 0.3805, + "step": 7286 + }, + { + "epoch": 0.4, + "grad_norm": 2.892404079437256, + "learning_rate": 4.3067545320470874e-05, + "loss": 0.3871, + "step": 7288 + }, + { + "epoch": 0.40010976948408344, + "grad_norm": 2.0604212284088135, + "learning_rate": 4.3063926019145856e-05, + "loss": 0.3233, + "step": 7290 + }, + { + "epoch": 0.40021953896816687, + "grad_norm": 1.316688060760498, + "learning_rate": 4.3060305925448376e-05, + "loss": 0.2701, + "step": 7292 + }, + { + "epoch": 0.4003293084522503, + "grad_norm": 2.210956335067749, + "learning_rate": 4.305668503953724e-05, + "loss": 0.3224, + "step": 7294 + }, + { + "epoch": 0.4004390779363337, + "grad_norm": 1.700216293334961, + "learning_rate": 4.3053063361571255e-05, + "loss": 0.2883, + "step": 7296 + }, + { + "epoch": 0.40054884742041713, + "grad_norm": 2.4671738147735596, + "learning_rate": 4.304944089170931e-05, + "loss": 0.3667, + "step": 7298 + }, + { + "epoch": 0.40065861690450055, + "grad_norm": 2.057154655456543, + "learning_rate": 4.3045817630110303e-05, + "loss": 0.3103, + "step": 7300 + }, + { + "epoch": 0.400768386388584, + "grad_norm": 2.8139846324920654, + "learning_rate": 4.3042193576933155e-05, + "loss": 0.4884, + "step": 7302 + }, + { + "epoch": 0.4008781558726674, + "grad_norm": 2.941321849822998, + "learning_rate": 4.3038568732336837e-05, + "loss": 0.3348, + "step": 7304 + }, + { + "epoch": 0.4009879253567508, + "grad_norm": 2.6969692707061768, + "learning_rate": 4.3034943096480354e-05, + "loss": 0.3504, + "step": 7306 + }, + { + "epoch": 0.40109769484083424, + "grad_norm": 3.0557429790496826, + "learning_rate": 4.303131666952275e-05, + "loss": 0.3641, + "step": 7308 + }, + { + "epoch": 0.40120746432491766, + "grad_norm": 1.681152582168579, + "learning_rate": 4.30276894516231e-05, + "loss": 0.3812, + "step": 7310 + }, + { + "epoch": 0.4013172338090011, + "grad_norm": 2.6056602001190186, + "learning_rate": 4.30240614429405e-05, + "loss": 0.3359, + "step": 7312 + }, + { + "epoch": 0.4014270032930845, + "grad_norm": 2.8840818405151367, + "learning_rate": 4.30204326436341e-05, + "loss": 0.4121, + "step": 7314 + }, + { + "epoch": 0.4015367727771679, + "grad_norm": 2.0778403282165527, + "learning_rate": 4.301680305386306e-05, + "loss": 0.4333, + "step": 7316 + }, + { + "epoch": 0.40164654226125135, + "grad_norm": 1.5349204540252686, + "learning_rate": 4.301317267378663e-05, + "loss": 0.4105, + "step": 7318 + }, + { + "epoch": 0.40175631174533477, + "grad_norm": 1.538578748703003, + "learning_rate": 4.300954150356402e-05, + "loss": 0.3133, + "step": 7320 + }, + { + "epoch": 0.40186608122941825, + "grad_norm": 1.1984609365463257, + "learning_rate": 4.3005909543354525e-05, + "loss": 0.2909, + "step": 7322 + }, + { + "epoch": 0.40197585071350167, + "grad_norm": 3.1351537704467773, + "learning_rate": 4.300227679331745e-05, + "loss": 0.4296, + "step": 7324 + }, + { + "epoch": 0.4020856201975851, + "grad_norm": 3.4231104850769043, + "learning_rate": 4.299864325361217e-05, + "loss": 0.4209, + "step": 7326 + }, + { + "epoch": 0.4021953896816685, + "grad_norm": 2.6165270805358887, + "learning_rate": 4.299500892439805e-05, + "loss": 0.4142, + "step": 7328 + }, + { + "epoch": 0.40230515916575194, + "grad_norm": 1.4018962383270264, + "learning_rate": 4.2991373805834514e-05, + "loss": 0.3112, + "step": 7330 + }, + { + "epoch": 0.40241492864983536, + "grad_norm": 0.9606845378875732, + "learning_rate": 4.298773789808101e-05, + "loss": 0.2151, + "step": 7332 + }, + { + "epoch": 0.4025246981339188, + "grad_norm": 2.750009775161743, + "learning_rate": 4.298410120129704e-05, + "loss": 0.2242, + "step": 7334 + }, + { + "epoch": 0.4026344676180022, + "grad_norm": 2.2859046459198, + "learning_rate": 4.2980463715642116e-05, + "loss": 0.3192, + "step": 7336 + }, + { + "epoch": 0.4027442371020856, + "grad_norm": 1.7908763885498047, + "learning_rate": 4.29768254412758e-05, + "loss": 0.3493, + "step": 7338 + }, + { + "epoch": 0.40285400658616904, + "grad_norm": 2.0190587043762207, + "learning_rate": 4.297318637835769e-05, + "loss": 0.3126, + "step": 7340 + }, + { + "epoch": 0.40296377607025247, + "grad_norm": 1.6913291215896606, + "learning_rate": 4.29695465270474e-05, + "loss": 0.2847, + "step": 7342 + }, + { + "epoch": 0.4030735455543359, + "grad_norm": 2.2146189212799072, + "learning_rate": 4.29659058875046e-05, + "loss": 0.4306, + "step": 7344 + }, + { + "epoch": 0.4031833150384193, + "grad_norm": 1.4736039638519287, + "learning_rate": 4.296226445988899e-05, + "loss": 0.3388, + "step": 7346 + }, + { + "epoch": 0.40329308452250273, + "grad_norm": 1.3321088552474976, + "learning_rate": 4.2958622244360304e-05, + "loss": 0.3258, + "step": 7348 + }, + { + "epoch": 0.40340285400658615, + "grad_norm": 1.727677822113037, + "learning_rate": 4.295497924107828e-05, + "loss": 0.308, + "step": 7350 + }, + { + "epoch": 0.4035126234906696, + "grad_norm": 1.6267075538635254, + "learning_rate": 4.2951335450202754e-05, + "loss": 0.3607, + "step": 7352 + }, + { + "epoch": 0.403622392974753, + "grad_norm": 2.2350499629974365, + "learning_rate": 4.294769087189354e-05, + "loss": 0.4108, + "step": 7354 + }, + { + "epoch": 0.4037321624588364, + "grad_norm": 1.5267630815505981, + "learning_rate": 4.294404550631051e-05, + "loss": 0.4339, + "step": 7356 + }, + { + "epoch": 0.40384193194291984, + "grad_norm": 3.191586494445801, + "learning_rate": 4.294039935361358e-05, + "loss": 0.3428, + "step": 7358 + }, + { + "epoch": 0.4039517014270033, + "grad_norm": 1.7007145881652832, + "learning_rate": 4.2936752413962674e-05, + "loss": 0.4206, + "step": 7360 + }, + { + "epoch": 0.40406147091108674, + "grad_norm": 1.2684215307235718, + "learning_rate": 4.293310468751776e-05, + "loss": 0.3084, + "step": 7362 + }, + { + "epoch": 0.40417124039517016, + "grad_norm": 1.6771223545074463, + "learning_rate": 4.292945617443886e-05, + "loss": 0.312, + "step": 7364 + }, + { + "epoch": 0.4042810098792536, + "grad_norm": 2.864893913269043, + "learning_rate": 4.292580687488601e-05, + "loss": 0.4219, + "step": 7366 + }, + { + "epoch": 0.404390779363337, + "grad_norm": 1.797911524772644, + "learning_rate": 4.292215678901929e-05, + "loss": 0.4018, + "step": 7368 + }, + { + "epoch": 0.4045005488474204, + "grad_norm": 3.052030324935913, + "learning_rate": 4.291850591699879e-05, + "loss": 0.2834, + "step": 7370 + }, + { + "epoch": 0.40461031833150385, + "grad_norm": 1.4886103868484497, + "learning_rate": 4.291485425898468e-05, + "loss": 0.3286, + "step": 7372 + }, + { + "epoch": 0.40472008781558727, + "grad_norm": 1.7092124223709106, + "learning_rate": 4.291120181513713e-05, + "loss": 0.3721, + "step": 7374 + }, + { + "epoch": 0.4048298572996707, + "grad_norm": 1.4506382942199707, + "learning_rate": 4.290754858561637e-05, + "loss": 0.3345, + "step": 7376 + }, + { + "epoch": 0.4049396267837541, + "grad_norm": 1.5511270761489868, + "learning_rate": 4.290389457058261e-05, + "loss": 0.2825, + "step": 7378 + }, + { + "epoch": 0.40504939626783754, + "grad_norm": 1.9216530323028564, + "learning_rate": 4.290023977019616e-05, + "loss": 0.3161, + "step": 7380 + }, + { + "epoch": 0.40515916575192096, + "grad_norm": 6.055116176605225, + "learning_rate": 4.289658418461735e-05, + "loss": 0.2649, + "step": 7382 + }, + { + "epoch": 0.4052689352360044, + "grad_norm": 2.0336270332336426, + "learning_rate": 4.28929278140065e-05, + "loss": 0.3309, + "step": 7384 + }, + { + "epoch": 0.4053787047200878, + "grad_norm": 2.4339041709899902, + "learning_rate": 4.2889270658524024e-05, + "loss": 0.5057, + "step": 7386 + }, + { + "epoch": 0.4054884742041712, + "grad_norm": 2.8725392818450928, + "learning_rate": 4.2885612718330324e-05, + "loss": 0.3986, + "step": 7388 + }, + { + "epoch": 0.40559824368825464, + "grad_norm": 1.5812710523605347, + "learning_rate": 4.288195399358587e-05, + "loss": 0.2888, + "step": 7390 + }, + { + "epoch": 0.40570801317233807, + "grad_norm": 2.2590789794921875, + "learning_rate": 4.287829448445113e-05, + "loss": 0.3261, + "step": 7392 + }, + { + "epoch": 0.4058177826564215, + "grad_norm": 1.9850302934646606, + "learning_rate": 4.287463419108665e-05, + "loss": 0.3343, + "step": 7394 + }, + { + "epoch": 0.40592755214050497, + "grad_norm": 1.1052722930908203, + "learning_rate": 4.287097311365299e-05, + "loss": 0.2768, + "step": 7396 + }, + { + "epoch": 0.4060373216245884, + "grad_norm": 2.008995294570923, + "learning_rate": 4.2867311252310724e-05, + "loss": 0.3391, + "step": 7398 + }, + { + "epoch": 0.4061470911086718, + "grad_norm": 2.6853420734405518, + "learning_rate": 4.286364860722048e-05, + "loss": 0.4941, + "step": 7400 + }, + { + "epoch": 0.40625686059275523, + "grad_norm": 1.1858325004577637, + "learning_rate": 4.285998517854294e-05, + "loss": 0.3211, + "step": 7402 + }, + { + "epoch": 0.40636663007683865, + "grad_norm": 2.2573108673095703, + "learning_rate": 4.285632096643879e-05, + "loss": 0.4191, + "step": 7404 + }, + { + "epoch": 0.4064763995609221, + "grad_norm": 1.532334566116333, + "learning_rate": 4.285265597106875e-05, + "loss": 0.332, + "step": 7406 + }, + { + "epoch": 0.4065861690450055, + "grad_norm": 1.3227511644363403, + "learning_rate": 4.2848990192593597e-05, + "loss": 0.4113, + "step": 7408 + }, + { + "epoch": 0.4066959385290889, + "grad_norm": 1.7148106098175049, + "learning_rate": 4.2845323631174127e-05, + "loss": 0.3841, + "step": 7410 + }, + { + "epoch": 0.40680570801317234, + "grad_norm": 1.4250690937042236, + "learning_rate": 4.284165628697118e-05, + "loss": 0.3348, + "step": 7412 + }, + { + "epoch": 0.40691547749725576, + "grad_norm": 1.923003077507019, + "learning_rate": 4.2837988160145605e-05, + "loss": 0.2795, + "step": 7414 + }, + { + "epoch": 0.4070252469813392, + "grad_norm": 1.3567312955856323, + "learning_rate": 4.283431925085832e-05, + "loss": 0.2724, + "step": 7416 + }, + { + "epoch": 0.4071350164654226, + "grad_norm": 2.6487362384796143, + "learning_rate": 4.283064955927025e-05, + "loss": 0.4434, + "step": 7418 + }, + { + "epoch": 0.407244785949506, + "grad_norm": 1.8581430912017822, + "learning_rate": 4.282697908554238e-05, + "loss": 0.3269, + "step": 7420 + }, + { + "epoch": 0.40735455543358945, + "grad_norm": 1.3697861433029175, + "learning_rate": 4.2823307829835705e-05, + "loss": 0.296, + "step": 7422 + }, + { + "epoch": 0.40746432491767287, + "grad_norm": 1.5164517164230347, + "learning_rate": 4.2819635792311266e-05, + "loss": 0.3672, + "step": 7424 + }, + { + "epoch": 0.4075740944017563, + "grad_norm": 3.4996397495269775, + "learning_rate": 4.281596297313013e-05, + "loss": 0.3093, + "step": 7426 + }, + { + "epoch": 0.4076838638858397, + "grad_norm": 1.4879331588745117, + "learning_rate": 4.281228937245343e-05, + "loss": 0.3558, + "step": 7428 + }, + { + "epoch": 0.40779363336992314, + "grad_norm": 1.7895996570587158, + "learning_rate": 4.280861499044227e-05, + "loss": 0.2845, + "step": 7430 + }, + { + "epoch": 0.4079034028540066, + "grad_norm": 1.1996694803237915, + "learning_rate": 4.280493982725786e-05, + "loss": 0.3358, + "step": 7432 + }, + { + "epoch": 0.40801317233809004, + "grad_norm": 2.3962268829345703, + "learning_rate": 4.28012638830614e-05, + "loss": 0.3999, + "step": 7434 + }, + { + "epoch": 0.40812294182217346, + "grad_norm": 1.3897401094436646, + "learning_rate": 4.279758715801412e-05, + "loss": 0.2647, + "step": 7436 + }, + { + "epoch": 0.4082327113062569, + "grad_norm": 1.5840359926223755, + "learning_rate": 4.279390965227732e-05, + "loss": 0.2936, + "step": 7438 + }, + { + "epoch": 0.4083424807903403, + "grad_norm": 2.5547664165496826, + "learning_rate": 4.279023136601231e-05, + "loss": 0.4156, + "step": 7440 + }, + { + "epoch": 0.4084522502744237, + "grad_norm": 2.1354727745056152, + "learning_rate": 4.2786552299380424e-05, + "loss": 0.262, + "step": 7442 + }, + { + "epoch": 0.40856201975850714, + "grad_norm": 2.276221513748169, + "learning_rate": 4.2782872452543056e-05, + "loss": 0.2843, + "step": 7444 + }, + { + "epoch": 0.40867178924259057, + "grad_norm": 1.7189621925354004, + "learning_rate": 4.2779191825661616e-05, + "loss": 0.3254, + "step": 7446 + }, + { + "epoch": 0.408781558726674, + "grad_norm": 2.176020860671997, + "learning_rate": 4.277551041889756e-05, + "loss": 0.2873, + "step": 7448 + }, + { + "epoch": 0.4088913282107574, + "grad_norm": 1.9367698431015015, + "learning_rate": 4.277182823241236e-05, + "loss": 0.4208, + "step": 7450 + }, + { + "epoch": 0.40900109769484083, + "grad_norm": 1.4643090963363647, + "learning_rate": 4.276814526636755e-05, + "loss": 0.4503, + "step": 7452 + }, + { + "epoch": 0.40911086717892425, + "grad_norm": 1.658757209777832, + "learning_rate": 4.276446152092468e-05, + "loss": 0.3193, + "step": 7454 + }, + { + "epoch": 0.4092206366630077, + "grad_norm": 1.4495841264724731, + "learning_rate": 4.2760776996245336e-05, + "loss": 0.3146, + "step": 7456 + }, + { + "epoch": 0.4093304061470911, + "grad_norm": 1.2602355480194092, + "learning_rate": 4.2757091692491135e-05, + "loss": 0.2865, + "step": 7458 + }, + { + "epoch": 0.4094401756311745, + "grad_norm": 2.149757146835327, + "learning_rate": 4.275340560982374e-05, + "loss": 0.4627, + "step": 7460 + }, + { + "epoch": 0.40954994511525794, + "grad_norm": 1.6619254350662231, + "learning_rate": 4.2749718748404835e-05, + "loss": 0.3695, + "step": 7462 + }, + { + "epoch": 0.40965971459934136, + "grad_norm": 2.202972173690796, + "learning_rate": 4.2746031108396146e-05, + "loss": 0.3289, + "step": 7464 + }, + { + "epoch": 0.4097694840834248, + "grad_norm": 1.5845158100128174, + "learning_rate": 4.274234268995942e-05, + "loss": 0.2453, + "step": 7466 + }, + { + "epoch": 0.4098792535675082, + "grad_norm": 3.441934585571289, + "learning_rate": 4.2738653493256467e-05, + "loss": 0.3177, + "step": 7468 + }, + { + "epoch": 0.4099890230515917, + "grad_norm": 1.5031096935272217, + "learning_rate": 4.27349635184491e-05, + "loss": 0.3309, + "step": 7470 + }, + { + "epoch": 0.4100987925356751, + "grad_norm": 1.027192234992981, + "learning_rate": 4.2731272765699196e-05, + "loss": 0.3318, + "step": 7472 + }, + { + "epoch": 0.4102085620197585, + "grad_norm": 3.1997666358947754, + "learning_rate": 4.272758123516863e-05, + "loss": 0.3202, + "step": 7474 + }, + { + "epoch": 0.41031833150384195, + "grad_norm": 2.6507792472839355, + "learning_rate": 4.272388892701934e-05, + "loss": 0.3008, + "step": 7476 + }, + { + "epoch": 0.41042810098792537, + "grad_norm": 1.8576916456222534, + "learning_rate": 4.272019584141329e-05, + "loss": 0.3699, + "step": 7478 + }, + { + "epoch": 0.4105378704720088, + "grad_norm": 1.8565027713775635, + "learning_rate": 4.271650197851247e-05, + "loss": 0.2683, + "step": 7480 + }, + { + "epoch": 0.4106476399560922, + "grad_norm": 1.1790794134140015, + "learning_rate": 4.2712807338478914e-05, + "loss": 0.2412, + "step": 7482 + }, + { + "epoch": 0.41075740944017564, + "grad_norm": 1.1220759153366089, + "learning_rate": 4.27091119214747e-05, + "loss": 0.3846, + "step": 7484 + }, + { + "epoch": 0.41086717892425906, + "grad_norm": 1.287249207496643, + "learning_rate": 4.27054157276619e-05, + "loss": 0.2835, + "step": 7486 + }, + { + "epoch": 0.4109769484083425, + "grad_norm": 2.6381123065948486, + "learning_rate": 4.270171875720267e-05, + "loss": 0.3956, + "step": 7488 + }, + { + "epoch": 0.4110867178924259, + "grad_norm": 1.6566697359085083, + "learning_rate": 4.269802101025917e-05, + "loss": 0.2472, + "step": 7490 + }, + { + "epoch": 0.4111964873765093, + "grad_norm": 2.614971160888672, + "learning_rate": 4.269432248699361e-05, + "loss": 0.3532, + "step": 7492 + }, + { + "epoch": 0.41130625686059274, + "grad_norm": 2.204822540283203, + "learning_rate": 4.2690623187568206e-05, + "loss": 0.3432, + "step": 7494 + }, + { + "epoch": 0.41141602634467617, + "grad_norm": 1.2829772233963013, + "learning_rate": 4.268692311214525e-05, + "loss": 0.2478, + "step": 7496 + }, + { + "epoch": 0.4115257958287596, + "grad_norm": 1.1310069561004639, + "learning_rate": 4.268322226088702e-05, + "loss": 0.299, + "step": 7498 + }, + { + "epoch": 0.411635565312843, + "grad_norm": 1.3226715326309204, + "learning_rate": 4.267952063395587e-05, + "loss": 0.2951, + "step": 7500 + }, + { + "epoch": 0.41174533479692643, + "grad_norm": 1.4642552137374878, + "learning_rate": 4.2675818231514174e-05, + "loss": 0.3105, + "step": 7502 + }, + { + "epoch": 0.41185510428100985, + "grad_norm": 1.647803544998169, + "learning_rate": 4.267211505372433e-05, + "loss": 0.4062, + "step": 7504 + }, + { + "epoch": 0.41196487376509333, + "grad_norm": 1.4673935174942017, + "learning_rate": 4.266841110074878e-05, + "loss": 0.3667, + "step": 7506 + }, + { + "epoch": 0.41207464324917675, + "grad_norm": 1.3372071981430054, + "learning_rate": 4.2664706372749996e-05, + "loss": 0.3337, + "step": 7508 + }, + { + "epoch": 0.4121844127332602, + "grad_norm": 1.622452974319458, + "learning_rate": 4.266100086989049e-05, + "loss": 0.2194, + "step": 7510 + }, + { + "epoch": 0.4122941822173436, + "grad_norm": 1.972111701965332, + "learning_rate": 4.2657294592332796e-05, + "loss": 0.3695, + "step": 7512 + }, + { + "epoch": 0.412403951701427, + "grad_norm": 2.0394985675811768, + "learning_rate": 4.2653587540239496e-05, + "loss": 0.3036, + "step": 7514 + }, + { + "epoch": 0.41251372118551044, + "grad_norm": 1.7810065746307373, + "learning_rate": 4.26498797137732e-05, + "loss": 0.2596, + "step": 7516 + }, + { + "epoch": 0.41262349066959386, + "grad_norm": 1.4884634017944336, + "learning_rate": 4.264617111309654e-05, + "loss": 0.4033, + "step": 7518 + }, + { + "epoch": 0.4127332601536773, + "grad_norm": 1.6890630722045898, + "learning_rate": 4.264246173837222e-05, + "loss": 0.2895, + "step": 7520 + }, + { + "epoch": 0.4128430296377607, + "grad_norm": 2.235481023788452, + "learning_rate": 4.263875158976293e-05, + "loss": 0.3849, + "step": 7522 + }, + { + "epoch": 0.4129527991218441, + "grad_norm": 2.0821726322174072, + "learning_rate": 4.2635040667431405e-05, + "loss": 0.283, + "step": 7524 + }, + { + "epoch": 0.41306256860592755, + "grad_norm": 4.29846715927124, + "learning_rate": 4.2631328971540444e-05, + "loss": 0.3238, + "step": 7526 + }, + { + "epoch": 0.41317233809001097, + "grad_norm": 1.64116370677948, + "learning_rate": 4.262761650225286e-05, + "loss": 0.4049, + "step": 7528 + }, + { + "epoch": 0.4132821075740944, + "grad_norm": 1.5821001529693604, + "learning_rate": 4.262390325973149e-05, + "loss": 0.316, + "step": 7530 + }, + { + "epoch": 0.4133918770581778, + "grad_norm": 3.4382545948028564, + "learning_rate": 4.262018924413922e-05, + "loss": 0.392, + "step": 7532 + }, + { + "epoch": 0.41350164654226124, + "grad_norm": 2.325072765350342, + "learning_rate": 4.261647445563897e-05, + "loss": 0.3179, + "step": 7534 + }, + { + "epoch": 0.41361141602634466, + "grad_norm": 1.982776403427124, + "learning_rate": 4.261275889439368e-05, + "loss": 0.3959, + "step": 7536 + }, + { + "epoch": 0.4137211855104281, + "grad_norm": 1.437077283859253, + "learning_rate": 4.2609042560566336e-05, + "loss": 0.3432, + "step": 7538 + }, + { + "epoch": 0.4138309549945115, + "grad_norm": 1.4414743185043335, + "learning_rate": 4.260532545431996e-05, + "loss": 0.1996, + "step": 7540 + }, + { + "epoch": 0.4139407244785949, + "grad_norm": 2.174528121948242, + "learning_rate": 4.2601607575817586e-05, + "loss": 0.4376, + "step": 7542 + }, + { + "epoch": 0.4140504939626784, + "grad_norm": 2.7817647457122803, + "learning_rate": 4.2597888925222316e-05, + "loss": 0.3265, + "step": 7544 + }, + { + "epoch": 0.4141602634467618, + "grad_norm": 2.0446438789367676, + "learning_rate": 4.259416950269727e-05, + "loss": 0.2866, + "step": 7546 + }, + { + "epoch": 0.41427003293084524, + "grad_norm": 2.5019419193267822, + "learning_rate": 4.259044930840558e-05, + "loss": 0.3756, + "step": 7548 + }, + { + "epoch": 0.41437980241492867, + "grad_norm": 1.7266042232513428, + "learning_rate": 4.258672834251045e-05, + "loss": 0.3443, + "step": 7550 + }, + { + "epoch": 0.4144895718990121, + "grad_norm": 2.014463424682617, + "learning_rate": 4.2583006605175095e-05, + "loss": 0.3939, + "step": 7552 + }, + { + "epoch": 0.4145993413830955, + "grad_norm": 1.8582427501678467, + "learning_rate": 4.257928409656276e-05, + "loss": 0.3568, + "step": 7554 + }, + { + "epoch": 0.41470911086717893, + "grad_norm": 1.7949765920639038, + "learning_rate": 4.2575560816836755e-05, + "loss": 0.28, + "step": 7556 + }, + { + "epoch": 0.41481888035126235, + "grad_norm": 2.189824104309082, + "learning_rate": 4.257183676616038e-05, + "loss": 0.4396, + "step": 7558 + }, + { + "epoch": 0.4149286498353458, + "grad_norm": 1.589207410812378, + "learning_rate": 4.2568111944697e-05, + "loss": 0.2746, + "step": 7560 + }, + { + "epoch": 0.4150384193194292, + "grad_norm": 1.1600195169448853, + "learning_rate": 4.2564386352609994e-05, + "loss": 0.3383, + "step": 7562 + }, + { + "epoch": 0.4151481888035126, + "grad_norm": 1.9548778533935547, + "learning_rate": 4.256065999006279e-05, + "loss": 0.363, + "step": 7564 + }, + { + "epoch": 0.41525795828759604, + "grad_norm": 6.327892780303955, + "learning_rate": 4.255693285721886e-05, + "loss": 0.2994, + "step": 7566 + }, + { + "epoch": 0.41536772777167946, + "grad_norm": 2.359828472137451, + "learning_rate": 4.2553204954241666e-05, + "loss": 0.3432, + "step": 7568 + }, + { + "epoch": 0.4154774972557629, + "grad_norm": 1.8766496181488037, + "learning_rate": 4.254947628129474e-05, + "loss": 0.3684, + "step": 7570 + }, + { + "epoch": 0.4155872667398463, + "grad_norm": 1.9545187950134277, + "learning_rate": 4.2545746838541666e-05, + "loss": 0.4354, + "step": 7572 + }, + { + "epoch": 0.41569703622392973, + "grad_norm": 3.691470146179199, + "learning_rate": 4.2542016626146e-05, + "loss": 0.3667, + "step": 7574 + }, + { + "epoch": 0.41580680570801315, + "grad_norm": 1.481495976448059, + "learning_rate": 4.25382856442714e-05, + "loss": 0.3341, + "step": 7576 + }, + { + "epoch": 0.41591657519209657, + "grad_norm": 1.5000406503677368, + "learning_rate": 4.2534553893081496e-05, + "loss": 0.3084, + "step": 7578 + }, + { + "epoch": 0.41602634467618005, + "grad_norm": 4.090287685394287, + "learning_rate": 4.253082137273999e-05, + "loss": 0.4885, + "step": 7580 + }, + { + "epoch": 0.41613611416026347, + "grad_norm": 2.157860517501831, + "learning_rate": 4.252708808341063e-05, + "loss": 0.2904, + "step": 7582 + }, + { + "epoch": 0.4162458836443469, + "grad_norm": 1.2205047607421875, + "learning_rate": 4.252335402525715e-05, + "loss": 0.2645, + "step": 7584 + }, + { + "epoch": 0.4163556531284303, + "grad_norm": 1.719239592552185, + "learning_rate": 4.251961919844334e-05, + "loss": 0.3224, + "step": 7586 + }, + { + "epoch": 0.41646542261251374, + "grad_norm": 1.4501562118530273, + "learning_rate": 4.251588360313305e-05, + "loss": 0.4133, + "step": 7588 + }, + { + "epoch": 0.41657519209659716, + "grad_norm": 1.509788155555725, + "learning_rate": 4.251214723949014e-05, + "loss": 0.3537, + "step": 7590 + }, + { + "epoch": 0.4166849615806806, + "grad_norm": 2.271115779876709, + "learning_rate": 4.250841010767849e-05, + "loss": 0.4184, + "step": 7592 + }, + { + "epoch": 0.416794731064764, + "grad_norm": 3.5765881538391113, + "learning_rate": 4.250467220786204e-05, + "loss": 0.3384, + "step": 7594 + }, + { + "epoch": 0.4169045005488474, + "grad_norm": 1.6465004682540894, + "learning_rate": 4.2500933540204744e-05, + "loss": 0.4229, + "step": 7596 + }, + { + "epoch": 0.41701427003293084, + "grad_norm": 1.6346662044525146, + "learning_rate": 4.249719410487061e-05, + "loss": 0.2617, + "step": 7598 + }, + { + "epoch": 0.41712403951701427, + "grad_norm": 4.6785712242126465, + "learning_rate": 4.249345390202365e-05, + "loss": 0.4832, + "step": 7600 + }, + { + "epoch": 0.4172338090010977, + "grad_norm": 1.5276368856430054, + "learning_rate": 4.2489712931827945e-05, + "loss": 0.337, + "step": 7602 + }, + { + "epoch": 0.4173435784851811, + "grad_norm": 1.6124552488327026, + "learning_rate": 4.248597119444759e-05, + "loss": 0.3517, + "step": 7604 + }, + { + "epoch": 0.41745334796926453, + "grad_norm": 1.7135039567947388, + "learning_rate": 4.248222869004671e-05, + "loss": 0.3051, + "step": 7606 + }, + { + "epoch": 0.41756311745334795, + "grad_norm": 2.06292986869812, + "learning_rate": 4.247848541878947e-05, + "loss": 0.4559, + "step": 7608 + }, + { + "epoch": 0.4176728869374314, + "grad_norm": 2.40722393989563, + "learning_rate": 4.247474138084007e-05, + "loss": 0.3246, + "step": 7610 + }, + { + "epoch": 0.4177826564215148, + "grad_norm": 1.9286662340164185, + "learning_rate": 4.2470996576362745e-05, + "loss": 0.3302, + "step": 7612 + }, + { + "epoch": 0.4178924259055982, + "grad_norm": 2.568880081176758, + "learning_rate": 4.246725100552176e-05, + "loss": 0.3964, + "step": 7614 + }, + { + "epoch": 0.41800219538968164, + "grad_norm": 3.2130024433135986, + "learning_rate": 4.24635046684814e-05, + "loss": 0.4286, + "step": 7616 + }, + { + "epoch": 0.4181119648737651, + "grad_norm": 2.1286065578460693, + "learning_rate": 4.245975756540602e-05, + "loss": 0.4003, + "step": 7618 + }, + { + "epoch": 0.41822173435784854, + "grad_norm": 1.1618084907531738, + "learning_rate": 4.2456009696459965e-05, + "loss": 0.3078, + "step": 7620 + }, + { + "epoch": 0.41833150384193196, + "grad_norm": 2.3361666202545166, + "learning_rate": 4.2452261061807655e-05, + "loss": 0.4318, + "step": 7622 + }, + { + "epoch": 0.4184412733260154, + "grad_norm": 2.232983112335205, + "learning_rate": 4.2448511661613514e-05, + "loss": 0.2479, + "step": 7624 + }, + { + "epoch": 0.4185510428100988, + "grad_norm": 1.7118287086486816, + "learning_rate": 4.244476149604201e-05, + "loss": 0.302, + "step": 7626 + }, + { + "epoch": 0.4186608122941822, + "grad_norm": 2.325563669204712, + "learning_rate": 4.2441010565257635e-05, + "loss": 0.3697, + "step": 7628 + }, + { + "epoch": 0.41877058177826565, + "grad_norm": 1.236142635345459, + "learning_rate": 4.2437258869424934e-05, + "loss": 0.3489, + "step": 7630 + }, + { + "epoch": 0.41888035126234907, + "grad_norm": 2.7722630500793457, + "learning_rate": 4.243350640870847e-05, + "loss": 0.3382, + "step": 7632 + }, + { + "epoch": 0.4189901207464325, + "grad_norm": 1.0040308237075806, + "learning_rate": 4.242975318327286e-05, + "loss": 0.3402, + "step": 7634 + }, + { + "epoch": 0.4190998902305159, + "grad_norm": 2.5777361392974854, + "learning_rate": 4.2425999193282714e-05, + "loss": 0.3606, + "step": 7636 + }, + { + "epoch": 0.41920965971459934, + "grad_norm": 2.757239818572998, + "learning_rate": 4.242224443890271e-05, + "loss": 0.423, + "step": 7638 + }, + { + "epoch": 0.41931942919868276, + "grad_norm": 1.8768852949142456, + "learning_rate": 4.241848892029756e-05, + "loss": 0.2473, + "step": 7640 + }, + { + "epoch": 0.4194291986827662, + "grad_norm": 2.42932391166687, + "learning_rate": 4.241473263763198e-05, + "loss": 0.4044, + "step": 7642 + }, + { + "epoch": 0.4195389681668496, + "grad_norm": 1.5743058919906616, + "learning_rate": 4.2410975591070754e-05, + "loss": 0.3931, + "step": 7644 + }, + { + "epoch": 0.419648737650933, + "grad_norm": 2.4301376342773438, + "learning_rate": 4.2407217780778677e-05, + "loss": 0.3717, + "step": 7646 + }, + { + "epoch": 0.41975850713501645, + "grad_norm": 1.8958086967468262, + "learning_rate": 4.240345920692059e-05, + "loss": 0.3475, + "step": 7648 + }, + { + "epoch": 0.41986827661909987, + "grad_norm": 1.3042714595794678, + "learning_rate": 4.239969986966137e-05, + "loss": 0.3233, + "step": 7650 + }, + { + "epoch": 0.4199780461031833, + "grad_norm": 1.7498273849487305, + "learning_rate": 4.2395939769165905e-05, + "loss": 0.4276, + "step": 7652 + }, + { + "epoch": 0.42008781558726677, + "grad_norm": 2.156116485595703, + "learning_rate": 4.239217890559914e-05, + "loss": 0.4411, + "step": 7654 + }, + { + "epoch": 0.4201975850713502, + "grad_norm": 1.546905279159546, + "learning_rate": 4.2388417279126035e-05, + "loss": 0.3453, + "step": 7656 + }, + { + "epoch": 0.4203073545554336, + "grad_norm": 1.5905815362930298, + "learning_rate": 4.2384654889911614e-05, + "loss": 0.2997, + "step": 7658 + }, + { + "epoch": 0.42041712403951703, + "grad_norm": 3.2865607738494873, + "learning_rate": 4.238089173812089e-05, + "loss": 0.4024, + "step": 7660 + }, + { + "epoch": 0.42052689352360045, + "grad_norm": 1.157072901725769, + "learning_rate": 4.237712782391894e-05, + "loss": 0.3713, + "step": 7662 + }, + { + "epoch": 0.4206366630076839, + "grad_norm": 1.7525235414505005, + "learning_rate": 4.237336314747088e-05, + "loss": 0.4427, + "step": 7664 + }, + { + "epoch": 0.4207464324917673, + "grad_norm": 2.3579001426696777, + "learning_rate": 4.236959770894183e-05, + "loss": 0.2835, + "step": 7666 + }, + { + "epoch": 0.4208562019758507, + "grad_norm": 1.2954939603805542, + "learning_rate": 4.236583150849698e-05, + "loss": 0.2577, + "step": 7668 + }, + { + "epoch": 0.42096597145993414, + "grad_norm": 1.3232803344726562, + "learning_rate": 4.236206454630153e-05, + "loss": 0.3884, + "step": 7670 + }, + { + "epoch": 0.42107574094401756, + "grad_norm": 1.9522467851638794, + "learning_rate": 4.2358296822520694e-05, + "loss": 0.2378, + "step": 7672 + }, + { + "epoch": 0.421185510428101, + "grad_norm": 1.5981470346450806, + "learning_rate": 4.2354528337319764e-05, + "loss": 0.3957, + "step": 7674 + }, + { + "epoch": 0.4212952799121844, + "grad_norm": 2.0944125652313232, + "learning_rate": 4.2350759090864046e-05, + "loss": 0.2976, + "step": 7676 + }, + { + "epoch": 0.42140504939626783, + "grad_norm": 2.327275514602661, + "learning_rate": 4.2346989083318866e-05, + "loss": 0.6141, + "step": 7678 + }, + { + "epoch": 0.42151481888035125, + "grad_norm": 2.3798606395721436, + "learning_rate": 4.23432183148496e-05, + "loss": 0.3289, + "step": 7680 + }, + { + "epoch": 0.42162458836443467, + "grad_norm": 1.7254877090454102, + "learning_rate": 4.233944678562166e-05, + "loss": 0.3837, + "step": 7682 + }, + { + "epoch": 0.4217343578485181, + "grad_norm": 2.4543282985687256, + "learning_rate": 4.233567449580047e-05, + "loss": 0.2551, + "step": 7684 + }, + { + "epoch": 0.4218441273326015, + "grad_norm": 1.7611807584762573, + "learning_rate": 4.233190144555152e-05, + "loss": 0.3037, + "step": 7686 + }, + { + "epoch": 0.42195389681668494, + "grad_norm": 1.8014805316925049, + "learning_rate": 4.2328127635040285e-05, + "loss": 0.301, + "step": 7688 + }, + { + "epoch": 0.42206366630076836, + "grad_norm": 1.944178581237793, + "learning_rate": 4.2324353064432335e-05, + "loss": 0.4, + "step": 7690 + }, + { + "epoch": 0.42217343578485184, + "grad_norm": 3.553910970687866, + "learning_rate": 4.232057773389322e-05, + "loss": 0.3621, + "step": 7692 + }, + { + "epoch": 0.42228320526893526, + "grad_norm": 1.465686559677124, + "learning_rate": 4.231680164358855e-05, + "loss": 0.3313, + "step": 7694 + }, + { + "epoch": 0.4223929747530187, + "grad_norm": 1.6650315523147583, + "learning_rate": 4.2313024793683965e-05, + "loss": 0.3204, + "step": 7696 + }, + { + "epoch": 0.4225027442371021, + "grad_norm": 1.4763799905776978, + "learning_rate": 4.2309247184345134e-05, + "loss": 0.2942, + "step": 7698 + }, + { + "epoch": 0.4226125137211855, + "grad_norm": 4.151755332946777, + "learning_rate": 4.230546881573777e-05, + "loss": 0.2463, + "step": 7700 + }, + { + "epoch": 0.42272228320526894, + "grad_norm": 1.655250906944275, + "learning_rate": 4.2301689688027596e-05, + "loss": 0.3365, + "step": 7702 + }, + { + "epoch": 0.42283205268935237, + "grad_norm": 1.165658712387085, + "learning_rate": 4.22979098013804e-05, + "loss": 0.3123, + "step": 7704 + }, + { + "epoch": 0.4229418221734358, + "grad_norm": 1.5283666849136353, + "learning_rate": 4.229412915596196e-05, + "loss": 0.297, + "step": 7706 + }, + { + "epoch": 0.4230515916575192, + "grad_norm": 1.717162013053894, + "learning_rate": 4.229034775193814e-05, + "loss": 0.3409, + "step": 7708 + }, + { + "epoch": 0.42316136114160263, + "grad_norm": 2.5472638607025146, + "learning_rate": 4.2286565589474806e-05, + "loss": 0.3651, + "step": 7710 + }, + { + "epoch": 0.42327113062568605, + "grad_norm": 1.7895523309707642, + "learning_rate": 4.228278266873785e-05, + "loss": 0.3668, + "step": 7712 + }, + { + "epoch": 0.4233809001097695, + "grad_norm": 1.679622769355774, + "learning_rate": 4.227899898989323e-05, + "loss": 0.3146, + "step": 7714 + }, + { + "epoch": 0.4234906695938529, + "grad_norm": 1.8119345903396606, + "learning_rate": 4.227521455310689e-05, + "loss": 0.4429, + "step": 7716 + }, + { + "epoch": 0.4236004390779363, + "grad_norm": 2.3437788486480713, + "learning_rate": 4.2271429358544855e-05, + "loss": 0.3856, + "step": 7718 + }, + { + "epoch": 0.42371020856201974, + "grad_norm": 1.6301014423370361, + "learning_rate": 4.2267643406373156e-05, + "loss": 0.3026, + "step": 7720 + }, + { + "epoch": 0.42381997804610316, + "grad_norm": 1.5685787200927734, + "learning_rate": 4.226385669675785e-05, + "loss": 0.5023, + "step": 7722 + }, + { + "epoch": 0.4239297475301866, + "grad_norm": 1.2605865001678467, + "learning_rate": 4.226006922986507e-05, + "loss": 0.226, + "step": 7724 + }, + { + "epoch": 0.42403951701427, + "grad_norm": 1.867928385734558, + "learning_rate": 4.225628100586093e-05, + "loss": 0.353, + "step": 7726 + }, + { + "epoch": 0.4241492864983535, + "grad_norm": 1.650245189666748, + "learning_rate": 4.2252492024911605e-05, + "loss": 0.3546, + "step": 7728 + }, + { + "epoch": 0.4242590559824369, + "grad_norm": 2.0350847244262695, + "learning_rate": 4.224870228718331e-05, + "loss": 0.3343, + "step": 7730 + }, + { + "epoch": 0.4243688254665203, + "grad_norm": 2.018437147140503, + "learning_rate": 4.2244911792842255e-05, + "loss": 0.2046, + "step": 7732 + }, + { + "epoch": 0.42447859495060375, + "grad_norm": 1.818974256515503, + "learning_rate": 4.2241120542054736e-05, + "loss": 0.3486, + "step": 7734 + }, + { + "epoch": 0.42458836443468717, + "grad_norm": 1.1903029680252075, + "learning_rate": 4.223732853498704e-05, + "loss": 0.2489, + "step": 7736 + }, + { + "epoch": 0.4246981339187706, + "grad_norm": 1.7895044088363647, + "learning_rate": 4.223353577180551e-05, + "loss": 0.4527, + "step": 7738 + }, + { + "epoch": 0.424807903402854, + "grad_norm": 1.5405800342559814, + "learning_rate": 4.2229742252676516e-05, + "loss": 0.437, + "step": 7740 + }, + { + "epoch": 0.42491767288693744, + "grad_norm": 1.1937223672866821, + "learning_rate": 4.222594797776646e-05, + "loss": 0.3875, + "step": 7742 + }, + { + "epoch": 0.42502744237102086, + "grad_norm": 1.7386127710342407, + "learning_rate": 4.222215294724177e-05, + "loss": 0.3361, + "step": 7744 + }, + { + "epoch": 0.4251372118551043, + "grad_norm": 1.7856451272964478, + "learning_rate": 4.221835716126892e-05, + "loss": 0.3224, + "step": 7746 + }, + { + "epoch": 0.4252469813391877, + "grad_norm": 0.96915602684021, + "learning_rate": 4.2214560620014414e-05, + "loss": 0.3222, + "step": 7748 + }, + { + "epoch": 0.4253567508232711, + "grad_norm": 1.6827123165130615, + "learning_rate": 4.221076332364479e-05, + "loss": 0.3511, + "step": 7750 + }, + { + "epoch": 0.42546652030735455, + "grad_norm": 1.6015338897705078, + "learning_rate": 4.2206965272326604e-05, + "loss": 0.3433, + "step": 7752 + }, + { + "epoch": 0.42557628979143797, + "grad_norm": 1.1634753942489624, + "learning_rate": 4.2203166466226466e-05, + "loss": 0.2369, + "step": 7754 + }, + { + "epoch": 0.4256860592755214, + "grad_norm": 1.8322612047195435, + "learning_rate": 4.219936690551101e-05, + "loss": 0.3406, + "step": 7756 + }, + { + "epoch": 0.4257958287596048, + "grad_norm": 2.563279867172241, + "learning_rate": 4.2195566590346904e-05, + "loss": 0.2768, + "step": 7758 + }, + { + "epoch": 0.42590559824368823, + "grad_norm": 2.4313747882843018, + "learning_rate": 4.219176552090084e-05, + "loss": 0.4055, + "step": 7760 + }, + { + "epoch": 0.42601536772777165, + "grad_norm": 1.1698187589645386, + "learning_rate": 4.2187963697339575e-05, + "loss": 0.3815, + "step": 7762 + }, + { + "epoch": 0.42612513721185513, + "grad_norm": 2.0686075687408447, + "learning_rate": 4.218416111982985e-05, + "loss": 0.2998, + "step": 7764 + }, + { + "epoch": 0.42623490669593855, + "grad_norm": 1.8002547025680542, + "learning_rate": 4.218035778853846e-05, + "loss": 0.3538, + "step": 7766 + }, + { + "epoch": 0.426344676180022, + "grad_norm": 1.8520643711090088, + "learning_rate": 4.217655370363226e-05, + "loss": 0.4706, + "step": 7768 + }, + { + "epoch": 0.4264544456641054, + "grad_norm": 1.7323797941207886, + "learning_rate": 4.217274886527811e-05, + "loss": 0.3094, + "step": 7770 + }, + { + "epoch": 0.4265642151481888, + "grad_norm": 1.8216111660003662, + "learning_rate": 4.216894327364291e-05, + "loss": 0.3233, + "step": 7772 + }, + { + "epoch": 0.42667398463227224, + "grad_norm": 2.302581310272217, + "learning_rate": 4.216513692889358e-05, + "loss": 0.3443, + "step": 7774 + }, + { + "epoch": 0.42678375411635566, + "grad_norm": 2.482020378112793, + "learning_rate": 4.2161329831197095e-05, + "loss": 0.3212, + "step": 7776 + }, + { + "epoch": 0.4268935236004391, + "grad_norm": 1.1064728498458862, + "learning_rate": 4.215752198072045e-05, + "loss": 0.2933, + "step": 7778 + }, + { + "epoch": 0.4270032930845225, + "grad_norm": 1.9270957708358765, + "learning_rate": 4.2153713377630684e-05, + "loss": 0.3611, + "step": 7780 + }, + { + "epoch": 0.42711306256860593, + "grad_norm": 2.654589891433716, + "learning_rate": 4.2149904022094846e-05, + "loss": 0.2743, + "step": 7782 + }, + { + "epoch": 0.42722283205268935, + "grad_norm": 1.8898169994354248, + "learning_rate": 4.214609391428004e-05, + "loss": 0.3335, + "step": 7784 + }, + { + "epoch": 0.42733260153677277, + "grad_norm": 1.50973641872406, + "learning_rate": 4.21422830543534e-05, + "loss": 0.3137, + "step": 7786 + }, + { + "epoch": 0.4274423710208562, + "grad_norm": 1.6722608804702759, + "learning_rate": 4.213847144248209e-05, + "loss": 0.3783, + "step": 7788 + }, + { + "epoch": 0.4275521405049396, + "grad_norm": 1.0398106575012207, + "learning_rate": 4.213465907883329e-05, + "loss": 0.3122, + "step": 7790 + }, + { + "epoch": 0.42766190998902304, + "grad_norm": 2.3363611698150635, + "learning_rate": 4.213084596357425e-05, + "loss": 0.405, + "step": 7792 + }, + { + "epoch": 0.42777167947310646, + "grad_norm": 1.3312653303146362, + "learning_rate": 4.212703209687222e-05, + "loss": 0.2404, + "step": 7794 + }, + { + "epoch": 0.4278814489571899, + "grad_norm": 1.9150793552398682, + "learning_rate": 4.212321747889451e-05, + "loss": 0.3728, + "step": 7796 + }, + { + "epoch": 0.4279912184412733, + "grad_norm": 3.2769179344177246, + "learning_rate": 4.211940210980842e-05, + "loss": 0.5991, + "step": 7798 + }, + { + "epoch": 0.4281009879253567, + "grad_norm": 1.973179578781128, + "learning_rate": 4.211558598978133e-05, + "loss": 0.2365, + "step": 7800 + }, + { + "epoch": 0.4282107574094402, + "grad_norm": 1.9615057706832886, + "learning_rate": 4.211176911898063e-05, + "loss": 0.4717, + "step": 7802 + }, + { + "epoch": 0.4283205268935236, + "grad_norm": 2.402641773223877, + "learning_rate": 4.210795149757375e-05, + "loss": 0.3081, + "step": 7804 + }, + { + "epoch": 0.42843029637760704, + "grad_norm": 1.2212549448013306, + "learning_rate": 4.210413312572815e-05, + "loss": 0.2381, + "step": 7806 + }, + { + "epoch": 0.42854006586169047, + "grad_norm": 3.139164447784424, + "learning_rate": 4.2100314003611314e-05, + "loss": 0.3766, + "step": 7808 + }, + { + "epoch": 0.4286498353457739, + "grad_norm": 1.3097903728485107, + "learning_rate": 4.209649413139077e-05, + "loss": 0.3549, + "step": 7810 + }, + { + "epoch": 0.4287596048298573, + "grad_norm": 1.3590033054351807, + "learning_rate": 4.2092673509234084e-05, + "loss": 0.261, + "step": 7812 + }, + { + "epoch": 0.42886937431394073, + "grad_norm": 1.658568024635315, + "learning_rate": 4.2088852137308837e-05, + "loss": 0.3855, + "step": 7814 + }, + { + "epoch": 0.42897914379802415, + "grad_norm": 1.6307673454284668, + "learning_rate": 4.208503001578266e-05, + "loss": 0.2703, + "step": 7816 + }, + { + "epoch": 0.4290889132821076, + "grad_norm": 1.7280590534210205, + "learning_rate": 4.208120714482321e-05, + "loss": 0.2915, + "step": 7818 + }, + { + "epoch": 0.429198682766191, + "grad_norm": 1.9089504480361938, + "learning_rate": 4.207738352459818e-05, + "loss": 0.3172, + "step": 7820 + }, + { + "epoch": 0.4293084522502744, + "grad_norm": 1.268791913986206, + "learning_rate": 4.207355915527528e-05, + "loss": 0.2685, + "step": 7822 + }, + { + "epoch": 0.42941822173435784, + "grad_norm": 1.350105881690979, + "learning_rate": 4.206973403702227e-05, + "loss": 0.3468, + "step": 7824 + }, + { + "epoch": 0.42952799121844126, + "grad_norm": 4.190103054046631, + "learning_rate": 4.2065908170006955e-05, + "loss": 0.373, + "step": 7826 + }, + { + "epoch": 0.4296377607025247, + "grad_norm": 1.8791165351867676, + "learning_rate": 4.206208155439713e-05, + "loss": 0.4597, + "step": 7828 + }, + { + "epoch": 0.4297475301866081, + "grad_norm": 2.351524591445923, + "learning_rate": 4.205825419036067e-05, + "loss": 0.2873, + "step": 7830 + }, + { + "epoch": 0.42985729967069153, + "grad_norm": 2.0265793800354004, + "learning_rate": 4.2054426078065456e-05, + "loss": 0.4054, + "step": 7832 + }, + { + "epoch": 0.42996706915477495, + "grad_norm": 4.558137893676758, + "learning_rate": 4.20505972176794e-05, + "loss": 0.4151, + "step": 7834 + }, + { + "epoch": 0.43007683863885837, + "grad_norm": 4.777888298034668, + "learning_rate": 4.2046767609370466e-05, + "loss": 0.2945, + "step": 7836 + }, + { + "epoch": 0.43018660812294185, + "grad_norm": 1.342150330543518, + "learning_rate": 4.204293725330664e-05, + "loss": 0.4071, + "step": 7838 + }, + { + "epoch": 0.43029637760702527, + "grad_norm": 4.47722053527832, + "learning_rate": 4.2039106149655914e-05, + "loss": 0.2958, + "step": 7840 + }, + { + "epoch": 0.4304061470911087, + "grad_norm": 3.914350986480713, + "learning_rate": 4.2035274298586374e-05, + "loss": 0.4474, + "step": 7842 + }, + { + "epoch": 0.4305159165751921, + "grad_norm": 1.4721407890319824, + "learning_rate": 4.203144170026608e-05, + "loss": 0.3212, + "step": 7844 + }, + { + "epoch": 0.43062568605927554, + "grad_norm": 1.4315251111984253, + "learning_rate": 4.202760835486317e-05, + "loss": 0.3964, + "step": 7846 + }, + { + "epoch": 0.43073545554335896, + "grad_norm": 2.0530147552490234, + "learning_rate": 4.202377426254578e-05, + "loss": 0.4335, + "step": 7848 + }, + { + "epoch": 0.4308452250274424, + "grad_norm": 2.4497928619384766, + "learning_rate": 4.201993942348209e-05, + "loss": 0.5157, + "step": 7850 + }, + { + "epoch": 0.4309549945115258, + "grad_norm": 2.649733543395996, + "learning_rate": 4.2016103837840316e-05, + "loss": 0.3983, + "step": 7852 + }, + { + "epoch": 0.4310647639956092, + "grad_norm": 2.6875076293945312, + "learning_rate": 4.201226750578871e-05, + "loss": 0.4876, + "step": 7854 + }, + { + "epoch": 0.43117453347969265, + "grad_norm": 1.9488316774368286, + "learning_rate": 4.200843042749555e-05, + "loss": 0.474, + "step": 7856 + }, + { + "epoch": 0.43128430296377607, + "grad_norm": 1.9400895833969116, + "learning_rate": 4.2004592603129145e-05, + "loss": 0.3752, + "step": 7858 + }, + { + "epoch": 0.4313940724478595, + "grad_norm": 1.692544937133789, + "learning_rate": 4.2000754032857845e-05, + "loss": 0.2952, + "step": 7860 + }, + { + "epoch": 0.4315038419319429, + "grad_norm": 2.0635151863098145, + "learning_rate": 4.199691471685003e-05, + "loss": 0.3386, + "step": 7862 + }, + { + "epoch": 0.43161361141602633, + "grad_norm": 1.6195954084396362, + "learning_rate": 4.1993074655274126e-05, + "loss": 0.4574, + "step": 7864 + }, + { + "epoch": 0.43172338090010975, + "grad_norm": 2.5134835243225098, + "learning_rate": 4.1989233848298534e-05, + "loss": 0.3078, + "step": 7866 + }, + { + "epoch": 0.4318331503841932, + "grad_norm": 1.5438605546951294, + "learning_rate": 4.1985392296091776e-05, + "loss": 0.2426, + "step": 7868 + }, + { + "epoch": 0.4319429198682766, + "grad_norm": 2.3296749591827393, + "learning_rate": 4.1981549998822334e-05, + "loss": 0.3681, + "step": 7870 + }, + { + "epoch": 0.43205268935236, + "grad_norm": 1.6539576053619385, + "learning_rate": 4.1977706956658756e-05, + "loss": 0.2977, + "step": 7872 + }, + { + "epoch": 0.43216245883644344, + "grad_norm": 1.6881834268569946, + "learning_rate": 4.197386316976963e-05, + "loss": 0.1916, + "step": 7874 + }, + { + "epoch": 0.4322722283205269, + "grad_norm": 1.7619667053222656, + "learning_rate": 4.197001863832355e-05, + "loss": 0.3225, + "step": 7876 + }, + { + "epoch": 0.43238199780461034, + "grad_norm": 2.2447028160095215, + "learning_rate": 4.196617336248915e-05, + "loss": 0.3272, + "step": 7878 + }, + { + "epoch": 0.43249176728869376, + "grad_norm": 2.0817089080810547, + "learning_rate": 4.1962327342435116e-05, + "loss": 0.4931, + "step": 7880 + }, + { + "epoch": 0.4326015367727772, + "grad_norm": 1.454208254814148, + "learning_rate": 4.1958480578330156e-05, + "loss": 0.441, + "step": 7882 + }, + { + "epoch": 0.4327113062568606, + "grad_norm": 1.8457770347595215, + "learning_rate": 4.1954633070343e-05, + "loss": 0.2563, + "step": 7884 + }, + { + "epoch": 0.43282107574094403, + "grad_norm": 1.1455903053283691, + "learning_rate": 4.195078481864241e-05, + "loss": 0.2944, + "step": 7886 + }, + { + "epoch": 0.43293084522502745, + "grad_norm": 3.0371592044830322, + "learning_rate": 4.1946935823397196e-05, + "loss": 0.3527, + "step": 7888 + }, + { + "epoch": 0.43304061470911087, + "grad_norm": 11.217233657836914, + "learning_rate": 4.1943086084776204e-05, + "loss": 0.3241, + "step": 7890 + }, + { + "epoch": 0.4331503841931943, + "grad_norm": 3.1570098400115967, + "learning_rate": 4.193923560294829e-05, + "loss": 0.6491, + "step": 7892 + }, + { + "epoch": 0.4332601536772777, + "grad_norm": 4.74833345413208, + "learning_rate": 4.1935384378082366e-05, + "loss": 0.4836, + "step": 7894 + }, + { + "epoch": 0.43336992316136114, + "grad_norm": 1.4935686588287354, + "learning_rate": 4.1931532410347365e-05, + "loss": 0.3218, + "step": 7896 + }, + { + "epoch": 0.43347969264544456, + "grad_norm": 1.2940423488616943, + "learning_rate": 4.192767969991224e-05, + "loss": 0.4327, + "step": 7898 + }, + { + "epoch": 0.433589462129528, + "grad_norm": 2.324782133102417, + "learning_rate": 4.1923826246945996e-05, + "loss": 0.3292, + "step": 7900 + }, + { + "epoch": 0.4336992316136114, + "grad_norm": 2.1690986156463623, + "learning_rate": 4.191997205161766e-05, + "loss": 0.3356, + "step": 7902 + }, + { + "epoch": 0.4338090010976948, + "grad_norm": 2.1970748901367188, + "learning_rate": 4.191611711409631e-05, + "loss": 0.3599, + "step": 7904 + }, + { + "epoch": 0.43391877058177825, + "grad_norm": 1.8048360347747803, + "learning_rate": 4.191226143455103e-05, + "loss": 0.3343, + "step": 7906 + }, + { + "epoch": 0.43402854006586167, + "grad_norm": 1.7132940292358398, + "learning_rate": 4.190840501315095e-05, + "loss": 0.3158, + "step": 7908 + }, + { + "epoch": 0.4341383095499451, + "grad_norm": 1.474027156829834, + "learning_rate": 4.1904547850065244e-05, + "loss": 0.2821, + "step": 7910 + }, + { + "epoch": 0.43424807903402857, + "grad_norm": 2.981661796569824, + "learning_rate": 4.19006899454631e-05, + "loss": 0.2742, + "step": 7912 + }, + { + "epoch": 0.434357848518112, + "grad_norm": 1.5247362852096558, + "learning_rate": 4.189683129951374e-05, + "loss": 0.3723, + "step": 7914 + }, + { + "epoch": 0.4344676180021954, + "grad_norm": 2.231654644012451, + "learning_rate": 4.189297191238641e-05, + "loss": 0.5059, + "step": 7916 + }, + { + "epoch": 0.43457738748627883, + "grad_norm": 1.5328501462936401, + "learning_rate": 4.188911178425042e-05, + "loss": 0.3175, + "step": 7918 + }, + { + "epoch": 0.43468715697036225, + "grad_norm": 3.561424493789673, + "learning_rate": 4.1885250915275105e-05, + "loss": 0.4142, + "step": 7920 + }, + { + "epoch": 0.4347969264544457, + "grad_norm": 2.7428839206695557, + "learning_rate": 4.18813893056298e-05, + "loss": 0.3363, + "step": 7922 + }, + { + "epoch": 0.4349066959385291, + "grad_norm": 1.9694489240646362, + "learning_rate": 4.18775269554839e-05, + "loss": 0.273, + "step": 7924 + }, + { + "epoch": 0.4350164654226125, + "grad_norm": 1.7594256401062012, + "learning_rate": 4.187366386500683e-05, + "loss": 0.2575, + "step": 7926 + }, + { + "epoch": 0.43512623490669594, + "grad_norm": 1.1492619514465332, + "learning_rate": 4.186980003436803e-05, + "loss": 0.2753, + "step": 7928 + }, + { + "epoch": 0.43523600439077936, + "grad_norm": 2.389436721801758, + "learning_rate": 4.186593546373702e-05, + "loss": 0.3294, + "step": 7930 + }, + { + "epoch": 0.4353457738748628, + "grad_norm": 1.3733434677124023, + "learning_rate": 4.186207015328328e-05, + "loss": 0.3485, + "step": 7932 + }, + { + "epoch": 0.4354555433589462, + "grad_norm": 1.9226398468017578, + "learning_rate": 4.185820410317639e-05, + "loss": 0.2848, + "step": 7934 + }, + { + "epoch": 0.43556531284302963, + "grad_norm": 1.389822244644165, + "learning_rate": 4.185433731358591e-05, + "loss": 0.2449, + "step": 7936 + }, + { + "epoch": 0.43567508232711305, + "grad_norm": 1.5267139673233032, + "learning_rate": 4.185046978468148e-05, + "loss": 0.3153, + "step": 7938 + }, + { + "epoch": 0.43578485181119647, + "grad_norm": 1.282420039176941, + "learning_rate": 4.1846601516632746e-05, + "loss": 0.3336, + "step": 7940 + }, + { + "epoch": 0.4358946212952799, + "grad_norm": 2.0457041263580322, + "learning_rate": 4.184273250960937e-05, + "loss": 0.2847, + "step": 7942 + }, + { + "epoch": 0.4360043907793633, + "grad_norm": 1.5845451354980469, + "learning_rate": 4.1838862763781074e-05, + "loss": 0.3338, + "step": 7944 + }, + { + "epoch": 0.43611416026344674, + "grad_norm": 1.8971558809280396, + "learning_rate": 4.183499227931761e-05, + "loss": 0.297, + "step": 7946 + }, + { + "epoch": 0.43622392974753016, + "grad_norm": 1.9371988773345947, + "learning_rate": 4.1831121056388756e-05, + "loss": 0.3339, + "step": 7948 + }, + { + "epoch": 0.43633369923161364, + "grad_norm": 1.7558894157409668, + "learning_rate": 4.182724909516432e-05, + "loss": 0.4666, + "step": 7950 + }, + { + "epoch": 0.43644346871569706, + "grad_norm": 2.1155333518981934, + "learning_rate": 4.182337639581415e-05, + "loss": 0.5106, + "step": 7952 + }, + { + "epoch": 0.4365532381997805, + "grad_norm": 1.6390222311019897, + "learning_rate": 4.181950295850811e-05, + "loss": 0.3509, + "step": 7954 + }, + { + "epoch": 0.4366630076838639, + "grad_norm": 2.9488508701324463, + "learning_rate": 4.1815628783416114e-05, + "loss": 0.3936, + "step": 7956 + }, + { + "epoch": 0.4367727771679473, + "grad_norm": 1.4722539186477661, + "learning_rate": 4.181175387070812e-05, + "loss": 0.3224, + "step": 7958 + }, + { + "epoch": 0.43688254665203075, + "grad_norm": 2.086639642715454, + "learning_rate": 4.180787822055407e-05, + "loss": 0.2168, + "step": 7960 + }, + { + "epoch": 0.43699231613611417, + "grad_norm": 1.4724841117858887, + "learning_rate": 4.1804001833123986e-05, + "loss": 0.2813, + "step": 7962 + }, + { + "epoch": 0.4371020856201976, + "grad_norm": 1.548899531364441, + "learning_rate": 4.18001247085879e-05, + "loss": 0.2991, + "step": 7964 + }, + { + "epoch": 0.437211855104281, + "grad_norm": 2.9270081520080566, + "learning_rate": 4.1796246847115886e-05, + "loss": 0.3576, + "step": 7966 + }, + { + "epoch": 0.43732162458836443, + "grad_norm": 1.6981335878372192, + "learning_rate": 4.179236824887804e-05, + "loss": 0.3569, + "step": 7968 + }, + { + "epoch": 0.43743139407244785, + "grad_norm": 1.172302007675171, + "learning_rate": 4.178848891404451e-05, + "loss": 0.366, + "step": 7970 + }, + { + "epoch": 0.4375411635565313, + "grad_norm": 1.397572636604309, + "learning_rate": 4.178460884278545e-05, + "loss": 0.4696, + "step": 7972 + }, + { + "epoch": 0.4376509330406147, + "grad_norm": 2.0505149364471436, + "learning_rate": 4.178072803527106e-05, + "loss": 0.384, + "step": 7974 + }, + { + "epoch": 0.4377607025246981, + "grad_norm": 2.8183324337005615, + "learning_rate": 4.177684649167158e-05, + "loss": 0.2883, + "step": 7976 + }, + { + "epoch": 0.43787047200878154, + "grad_norm": 1.8589441776275635, + "learning_rate": 4.177296421215726e-05, + "loss": 0.2376, + "step": 7978 + }, + { + "epoch": 0.43798024149286496, + "grad_norm": 2.1090688705444336, + "learning_rate": 4.176908119689841e-05, + "loss": 0.2377, + "step": 7980 + }, + { + "epoch": 0.4380900109769484, + "grad_norm": 1.6613849401474, + "learning_rate": 4.176519744606534e-05, + "loss": 0.3887, + "step": 7982 + }, + { + "epoch": 0.4381997804610318, + "grad_norm": 1.9688262939453125, + "learning_rate": 4.176131295982843e-05, + "loss": 0.3681, + "step": 7984 + }, + { + "epoch": 0.4383095499451153, + "grad_norm": 1.9048680067062378, + "learning_rate": 4.1757427738358066e-05, + "loss": 0.2841, + "step": 7986 + }, + { + "epoch": 0.4384193194291987, + "grad_norm": 3.7823586463928223, + "learning_rate": 4.175354178182467e-05, + "loss": 0.3332, + "step": 7988 + }, + { + "epoch": 0.43852908891328213, + "grad_norm": 1.5592975616455078, + "learning_rate": 4.17496550903987e-05, + "loss": 0.3955, + "step": 7990 + }, + { + "epoch": 0.43863885839736555, + "grad_norm": 1.8208184242248535, + "learning_rate": 4.174576766425064e-05, + "loss": 0.5552, + "step": 7992 + }, + { + "epoch": 0.43874862788144897, + "grad_norm": 1.25648033618927, + "learning_rate": 4.174187950355102e-05, + "loss": 0.3453, + "step": 7994 + }, + { + "epoch": 0.4388583973655324, + "grad_norm": 1.3149182796478271, + "learning_rate": 4.173799060847039e-05, + "loss": 0.3576, + "step": 7996 + }, + { + "epoch": 0.4389681668496158, + "grad_norm": 2.133411169052124, + "learning_rate": 4.173410097917934e-05, + "loss": 0.3752, + "step": 7998 + }, + { + "epoch": 0.43907793633369924, + "grad_norm": 1.099307656288147, + "learning_rate": 4.173021061584849e-05, + "loss": 0.2744, + "step": 8000 + }, + { + "epoch": 0.43918770581778266, + "grad_norm": 2.6556429862976074, + "learning_rate": 4.172631951864847e-05, + "loss": 0.388, + "step": 8002 + }, + { + "epoch": 0.4392974753018661, + "grad_norm": 2.4755587577819824, + "learning_rate": 4.172242768775e-05, + "loss": 0.3085, + "step": 8004 + }, + { + "epoch": 0.4394072447859495, + "grad_norm": 1.361464023590088, + "learning_rate": 4.1718535123323755e-05, + "loss": 0.3019, + "step": 8006 + }, + { + "epoch": 0.4395170142700329, + "grad_norm": 1.417433261871338, + "learning_rate": 4.171464182554051e-05, + "loss": 0.2732, + "step": 8008 + }, + { + "epoch": 0.43962678375411635, + "grad_norm": 1.5179953575134277, + "learning_rate": 4.171074779457103e-05, + "loss": 0.3617, + "step": 8010 + }, + { + "epoch": 0.43973655323819977, + "grad_norm": 1.7160853147506714, + "learning_rate": 4.1706853030586126e-05, + "loss": 0.3217, + "step": 8012 + }, + { + "epoch": 0.4398463227222832, + "grad_norm": 2.1673169136047363, + "learning_rate": 4.170295753375665e-05, + "loss": 0.4072, + "step": 8014 + }, + { + "epoch": 0.4399560922063666, + "grad_norm": 1.2768348455429077, + "learning_rate": 4.169906130425348e-05, + "loss": 0.3393, + "step": 8016 + }, + { + "epoch": 0.44006586169045003, + "grad_norm": 1.2571042776107788, + "learning_rate": 4.169516434224751e-05, + "loss": 0.375, + "step": 8018 + }, + { + "epoch": 0.44017563117453345, + "grad_norm": 1.526087760925293, + "learning_rate": 4.169126664790969e-05, + "loss": 0.2802, + "step": 8020 + }, + { + "epoch": 0.44028540065861693, + "grad_norm": 1.561037540435791, + "learning_rate": 4.168736822141099e-05, + "loss": 0.2721, + "step": 8022 + }, + { + "epoch": 0.44039517014270035, + "grad_norm": 2.4253201484680176, + "learning_rate": 4.168346906292241e-05, + "loss": 0.3661, + "step": 8024 + }, + { + "epoch": 0.4405049396267838, + "grad_norm": 1.5154519081115723, + "learning_rate": 4.1679569172614996e-05, + "loss": 0.3132, + "step": 8026 + }, + { + "epoch": 0.4406147091108672, + "grad_norm": 1.9764845371246338, + "learning_rate": 4.16756685506598e-05, + "loss": 0.3671, + "step": 8028 + }, + { + "epoch": 0.4407244785949506, + "grad_norm": 2.371915817260742, + "learning_rate": 4.167176719722794e-05, + "loss": 0.3451, + "step": 8030 + }, + { + "epoch": 0.44083424807903404, + "grad_norm": 1.6682357788085938, + "learning_rate": 4.166786511249055e-05, + "loss": 0.4076, + "step": 8032 + }, + { + "epoch": 0.44094401756311746, + "grad_norm": 1.171905279159546, + "learning_rate": 4.1663962296618763e-05, + "loss": 0.2664, + "step": 8034 + }, + { + "epoch": 0.4410537870472009, + "grad_norm": 3.3888394832611084, + "learning_rate": 4.166005874978382e-05, + "loss": 0.1965, + "step": 8036 + }, + { + "epoch": 0.4411635565312843, + "grad_norm": 2.1584789752960205, + "learning_rate": 4.1656154472156904e-05, + "loss": 0.3276, + "step": 8038 + }, + { + "epoch": 0.44127332601536773, + "grad_norm": 3.2963268756866455, + "learning_rate": 4.165224946390932e-05, + "loss": 0.3655, + "step": 8040 + }, + { + "epoch": 0.44138309549945115, + "grad_norm": 1.561259150505066, + "learning_rate": 4.164834372521233e-05, + "loss": 0.3207, + "step": 8042 + }, + { + "epoch": 0.44149286498353457, + "grad_norm": 1.8454620838165283, + "learning_rate": 4.164443725623728e-05, + "loss": 0.3679, + "step": 8044 + }, + { + "epoch": 0.441602634467618, + "grad_norm": 1.7081199884414673, + "learning_rate": 4.1640530057155504e-05, + "loss": 0.4024, + "step": 8046 + }, + { + "epoch": 0.4417124039517014, + "grad_norm": 1.7062314748764038, + "learning_rate": 4.1636622128138406e-05, + "loss": 0.4075, + "step": 8048 + }, + { + "epoch": 0.44182217343578484, + "grad_norm": 1.4814456701278687, + "learning_rate": 4.163271346935741e-05, + "loss": 0.5065, + "step": 8050 + }, + { + "epoch": 0.44193194291986826, + "grad_norm": 1.4701204299926758, + "learning_rate": 4.162880408098396e-05, + "loss": 0.3099, + "step": 8052 + }, + { + "epoch": 0.4420417124039517, + "grad_norm": 1.0737287998199463, + "learning_rate": 4.1624893963189546e-05, + "loss": 0.3352, + "step": 8054 + }, + { + "epoch": 0.4421514818880351, + "grad_norm": 1.0517948865890503, + "learning_rate": 4.162098311614567e-05, + "loss": 0.3921, + "step": 8056 + }, + { + "epoch": 0.4422612513721185, + "grad_norm": 1.2894278764724731, + "learning_rate": 4.161707154002391e-05, + "loss": 0.3016, + "step": 8058 + }, + { + "epoch": 0.442371020856202, + "grad_norm": 1.9769237041473389, + "learning_rate": 4.1613159234995825e-05, + "loss": 0.3333, + "step": 8060 + }, + { + "epoch": 0.4424807903402854, + "grad_norm": 1.3987324237823486, + "learning_rate": 4.160924620123304e-05, + "loss": 0.2715, + "step": 8062 + }, + { + "epoch": 0.44259055982436885, + "grad_norm": 2.943328380584717, + "learning_rate": 4.1605332438907185e-05, + "loss": 0.3358, + "step": 8064 + }, + { + "epoch": 0.44270032930845227, + "grad_norm": 2.1190576553344727, + "learning_rate": 4.160141794818995e-05, + "loss": 0.2966, + "step": 8066 + }, + { + "epoch": 0.4428100987925357, + "grad_norm": 2.2758259773254395, + "learning_rate": 4.159750272925304e-05, + "loss": 0.4676, + "step": 8068 + }, + { + "epoch": 0.4429198682766191, + "grad_norm": 2.3947970867156982, + "learning_rate": 4.159358678226819e-05, + "loss": 0.3461, + "step": 8070 + }, + { + "epoch": 0.44302963776070253, + "grad_norm": 2.3870885372161865, + "learning_rate": 4.1589670107407177e-05, + "loss": 0.3883, + "step": 8072 + }, + { + "epoch": 0.44313940724478595, + "grad_norm": 1.5564202070236206, + "learning_rate": 4.158575270484181e-05, + "loss": 0.2758, + "step": 8074 + }, + { + "epoch": 0.4432491767288694, + "grad_norm": 1.5493837594985962, + "learning_rate": 4.1581834574743915e-05, + "loss": 0.2345, + "step": 8076 + }, + { + "epoch": 0.4433589462129528, + "grad_norm": 3.097712993621826, + "learning_rate": 4.157791571728538e-05, + "loss": 0.3821, + "step": 8078 + }, + { + "epoch": 0.4434687156970362, + "grad_norm": 1.2836641073226929, + "learning_rate": 4.157399613263808e-05, + "loss": 0.325, + "step": 8080 + }, + { + "epoch": 0.44357848518111964, + "grad_norm": 1.9375379085540771, + "learning_rate": 4.157007582097397e-05, + "loss": 0.2982, + "step": 8082 + }, + { + "epoch": 0.44368825466520306, + "grad_norm": 1.739993929862976, + "learning_rate": 4.1566154782465e-05, + "loss": 0.2762, + "step": 8084 + }, + { + "epoch": 0.4437980241492865, + "grad_norm": 1.465742588043213, + "learning_rate": 4.156223301728316e-05, + "loss": 0.2406, + "step": 8086 + }, + { + "epoch": 0.4439077936333699, + "grad_norm": 1.6446067094802856, + "learning_rate": 4.15583105256005e-05, + "loss": 0.4751, + "step": 8088 + }, + { + "epoch": 0.44401756311745333, + "grad_norm": 1.3954198360443115, + "learning_rate": 4.1554387307589065e-05, + "loss": 0.3184, + "step": 8090 + }, + { + "epoch": 0.44412733260153675, + "grad_norm": 1.3687361478805542, + "learning_rate": 4.155046336342095e-05, + "loss": 0.2769, + "step": 8092 + }, + { + "epoch": 0.4442371020856202, + "grad_norm": 1.1972544193267822, + "learning_rate": 4.1546538693268275e-05, + "loss": 0.2636, + "step": 8094 + }, + { + "epoch": 0.44434687156970365, + "grad_norm": 1.7646287679672241, + "learning_rate": 4.1542613297303204e-05, + "loss": 0.4088, + "step": 8096 + }, + { + "epoch": 0.44445664105378707, + "grad_norm": 1.9297147989273071, + "learning_rate": 4.15386871756979e-05, + "loss": 0.3378, + "step": 8098 + }, + { + "epoch": 0.4445664105378705, + "grad_norm": 1.2237987518310547, + "learning_rate": 4.153476032862461e-05, + "loss": 0.2272, + "step": 8100 + }, + { + "epoch": 0.4446761800219539, + "grad_norm": 1.1341092586517334, + "learning_rate": 4.153083275625558e-05, + "loss": 0.1801, + "step": 8102 + }, + { + "epoch": 0.44478594950603734, + "grad_norm": 1.8853418827056885, + "learning_rate": 4.152690445876308e-05, + "loss": 0.2969, + "step": 8104 + }, + { + "epoch": 0.44489571899012076, + "grad_norm": 1.9641450643539429, + "learning_rate": 4.1522975436319445e-05, + "loss": 0.5042, + "step": 8106 + }, + { + "epoch": 0.4450054884742042, + "grad_norm": 1.6478806734085083, + "learning_rate": 4.151904568909699e-05, + "loss": 0.4836, + "step": 8108 + }, + { + "epoch": 0.4451152579582876, + "grad_norm": 1.7227188348770142, + "learning_rate": 4.151511521726812e-05, + "loss": 0.2591, + "step": 8110 + }, + { + "epoch": 0.445225027442371, + "grad_norm": 1.9803643226623535, + "learning_rate": 4.151118402100523e-05, + "loss": 0.3259, + "step": 8112 + }, + { + "epoch": 0.44533479692645445, + "grad_norm": 1.6446415185928345, + "learning_rate": 4.150725210048078e-05, + "loss": 0.413, + "step": 8114 + }, + { + "epoch": 0.44544456641053787, + "grad_norm": 1.5506232976913452, + "learning_rate": 4.1503319455867215e-05, + "loss": 0.298, + "step": 8116 + }, + { + "epoch": 0.4455543358946213, + "grad_norm": 2.2520973682403564, + "learning_rate": 4.1499386087337065e-05, + "loss": 0.3267, + "step": 8118 + }, + { + "epoch": 0.4456641053787047, + "grad_norm": 1.4548380374908447, + "learning_rate": 4.149545199506285e-05, + "loss": 0.2646, + "step": 8120 + }, + { + "epoch": 0.44577387486278813, + "grad_norm": 1.707119345664978, + "learning_rate": 4.1491517179217156e-05, + "loss": 0.3935, + "step": 8122 + }, + { + "epoch": 0.44588364434687155, + "grad_norm": 2.3085503578186035, + "learning_rate": 4.148758163997257e-05, + "loss": 0.4172, + "step": 8124 + }, + { + "epoch": 0.445993413830955, + "grad_norm": 1.9080380201339722, + "learning_rate": 4.148364537750172e-05, + "loss": 0.2614, + "step": 8126 + }, + { + "epoch": 0.4461031833150384, + "grad_norm": 1.533050775527954, + "learning_rate": 4.147970839197729e-05, + "loss": 0.3279, + "step": 8128 + }, + { + "epoch": 0.4462129527991218, + "grad_norm": 1.1994918584823608, + "learning_rate": 4.147577068357195e-05, + "loss": 0.3108, + "step": 8130 + }, + { + "epoch": 0.44632272228320524, + "grad_norm": 0.9514561295509338, + "learning_rate": 4.147183225245845e-05, + "loss": 0.2884, + "step": 8132 + }, + { + "epoch": 0.4464324917672887, + "grad_norm": 1.6372915506362915, + "learning_rate": 4.146789309880953e-05, + "loss": 0.2672, + "step": 8134 + }, + { + "epoch": 0.44654226125137214, + "grad_norm": 1.8051453828811646, + "learning_rate": 4.1463953222798e-05, + "loss": 0.3014, + "step": 8136 + }, + { + "epoch": 0.44665203073545556, + "grad_norm": 2.5924148559570312, + "learning_rate": 4.1460012624596666e-05, + "loss": 0.3987, + "step": 8138 + }, + { + "epoch": 0.446761800219539, + "grad_norm": 1.6784242391586304, + "learning_rate": 4.145607130437839e-05, + "loss": 0.4356, + "step": 8140 + }, + { + "epoch": 0.4468715697036224, + "grad_norm": 1.6642569303512573, + "learning_rate": 4.145212926231605e-05, + "loss": 0.2418, + "step": 8142 + }, + { + "epoch": 0.44698133918770583, + "grad_norm": 1.4972224235534668, + "learning_rate": 4.144818649858257e-05, + "loss": 0.2644, + "step": 8144 + }, + { + "epoch": 0.44709110867178925, + "grad_norm": 3.222050428390503, + "learning_rate": 4.14442430133509e-05, + "loss": 0.4694, + "step": 8146 + }, + { + "epoch": 0.44720087815587267, + "grad_norm": 2.7782816886901855, + "learning_rate": 4.144029880679402e-05, + "loss": 0.5119, + "step": 8148 + }, + { + "epoch": 0.4473106476399561, + "grad_norm": 1.1535011529922485, + "learning_rate": 4.143635387908493e-05, + "loss": 0.2459, + "step": 8150 + }, + { + "epoch": 0.4474204171240395, + "grad_norm": 1.4962058067321777, + "learning_rate": 4.1432408230396704e-05, + "loss": 0.2753, + "step": 8152 + }, + { + "epoch": 0.44753018660812294, + "grad_norm": 1.1594903469085693, + "learning_rate": 4.1428461860902385e-05, + "loss": 0.2494, + "step": 8154 + }, + { + "epoch": 0.44763995609220636, + "grad_norm": 1.3228449821472168, + "learning_rate": 4.1424514770775094e-05, + "loss": 0.2741, + "step": 8156 + }, + { + "epoch": 0.4477497255762898, + "grad_norm": 2.5028085708618164, + "learning_rate": 4.1420566960187965e-05, + "loss": 0.39, + "step": 8158 + }, + { + "epoch": 0.4478594950603732, + "grad_norm": 1.362625002861023, + "learning_rate": 4.141661842931418e-05, + "loss": 0.3279, + "step": 8160 + }, + { + "epoch": 0.4479692645444566, + "grad_norm": 1.9790849685668945, + "learning_rate": 4.141266917832693e-05, + "loss": 0.512, + "step": 8162 + }, + { + "epoch": 0.44807903402854005, + "grad_norm": 1.361454963684082, + "learning_rate": 4.1408719207399453e-05, + "loss": 0.2689, + "step": 8164 + }, + { + "epoch": 0.44818880351262347, + "grad_norm": 2.7069530487060547, + "learning_rate": 4.140476851670502e-05, + "loss": 0.2961, + "step": 8166 + }, + { + "epoch": 0.4482985729967069, + "grad_norm": 1.9177216291427612, + "learning_rate": 4.140081710641691e-05, + "loss": 0.2492, + "step": 8168 + }, + { + "epoch": 0.44840834248079037, + "grad_norm": 2.589583396911621, + "learning_rate": 4.139686497670846e-05, + "loss": 0.3616, + "step": 8170 + }, + { + "epoch": 0.4485181119648738, + "grad_norm": 3.175029754638672, + "learning_rate": 4.1392912127753034e-05, + "loss": 0.3859, + "step": 8172 + }, + { + "epoch": 0.4486278814489572, + "grad_norm": 2.005680561065674, + "learning_rate": 4.1388958559724025e-05, + "loss": 0.3421, + "step": 8174 + }, + { + "epoch": 0.44873765093304063, + "grad_norm": 1.5853904485702515, + "learning_rate": 4.138500427279485e-05, + "loss": 0.2915, + "step": 8176 + }, + { + "epoch": 0.44884742041712405, + "grad_norm": 1.6868375539779663, + "learning_rate": 4.138104926713896e-05, + "loss": 0.3042, + "step": 8178 + }, + { + "epoch": 0.4489571899012075, + "grad_norm": 1.3494675159454346, + "learning_rate": 4.137709354292986e-05, + "loss": 0.3736, + "step": 8180 + }, + { + "epoch": 0.4490669593852909, + "grad_norm": 1.814879298210144, + "learning_rate": 4.137313710034104e-05, + "loss": 0.3988, + "step": 8182 + }, + { + "epoch": 0.4491767288693743, + "grad_norm": 1.6310409307479858, + "learning_rate": 4.136917993954607e-05, + "loss": 0.3128, + "step": 8184 + }, + { + "epoch": 0.44928649835345774, + "grad_norm": 1.7927110195159912, + "learning_rate": 4.136522206071852e-05, + "loss": 0.3217, + "step": 8186 + }, + { + "epoch": 0.44939626783754116, + "grad_norm": 1.7329261302947998, + "learning_rate": 4.136126346403201e-05, + "loss": 0.3501, + "step": 8188 + }, + { + "epoch": 0.4495060373216246, + "grad_norm": 2.063162326812744, + "learning_rate": 4.1357304149660167e-05, + "loss": 0.505, + "step": 8190 + }, + { + "epoch": 0.449615806805708, + "grad_norm": 1.9618182182312012, + "learning_rate": 4.135334411777669e-05, + "loss": 0.4253, + "step": 8192 + }, + { + "epoch": 0.44972557628979143, + "grad_norm": 2.5535597801208496, + "learning_rate": 4.1349383368555265e-05, + "loss": 0.341, + "step": 8194 + }, + { + "epoch": 0.44983534577387485, + "grad_norm": 1.7530525922775269, + "learning_rate": 4.1345421902169645e-05, + "loss": 0.4301, + "step": 8196 + }, + { + "epoch": 0.4499451152579583, + "grad_norm": 1.2699799537658691, + "learning_rate": 4.134145971879359e-05, + "loss": 0.3678, + "step": 8198 + }, + { + "epoch": 0.4500548847420417, + "grad_norm": 1.28138267993927, + "learning_rate": 4.1337496818600895e-05, + "loss": 0.3703, + "step": 8200 + }, + { + "epoch": 0.4501646542261251, + "grad_norm": 1.403622031211853, + "learning_rate": 4.133353320176541e-05, + "loss": 0.3703, + "step": 8202 + }, + { + "epoch": 0.45027442371020854, + "grad_norm": 1.792397379875183, + "learning_rate": 4.132956886846099e-05, + "loss": 0.2507, + "step": 8204 + }, + { + "epoch": 0.45038419319429196, + "grad_norm": 1.6993627548217773, + "learning_rate": 4.132560381886152e-05, + "loss": 0.3481, + "step": 8206 + }, + { + "epoch": 0.45049396267837544, + "grad_norm": 1.9247430562973022, + "learning_rate": 4.132163805314094e-05, + "loss": 0.2116, + "step": 8208 + }, + { + "epoch": 0.45060373216245886, + "grad_norm": 1.9154212474822998, + "learning_rate": 4.131767157147321e-05, + "loss": 0.2714, + "step": 8210 + }, + { + "epoch": 0.4507135016465423, + "grad_norm": 1.7298506498336792, + "learning_rate": 4.13137043740323e-05, + "loss": 0.2462, + "step": 8212 + }, + { + "epoch": 0.4508232711306257, + "grad_norm": 1.252168893814087, + "learning_rate": 4.1309736460992254e-05, + "loss": 0.3329, + "step": 8214 + }, + { + "epoch": 0.4509330406147091, + "grad_norm": 2.018092393875122, + "learning_rate": 4.130576783252712e-05, + "loss": 0.304, + "step": 8216 + }, + { + "epoch": 0.45104281009879255, + "grad_norm": 1.9197202920913696, + "learning_rate": 4.130179848881096e-05, + "loss": 0.3279, + "step": 8218 + }, + { + "epoch": 0.45115257958287597, + "grad_norm": 1.6852003335952759, + "learning_rate": 4.129782843001792e-05, + "loss": 0.2381, + "step": 8220 + }, + { + "epoch": 0.4512623490669594, + "grad_norm": 1.7096372842788696, + "learning_rate": 4.129385765632211e-05, + "loss": 0.238, + "step": 8222 + }, + { + "epoch": 0.4513721185510428, + "grad_norm": 1.9552769660949707, + "learning_rate": 4.128988616789774e-05, + "loss": 0.3408, + "step": 8224 + }, + { + "epoch": 0.45148188803512623, + "grad_norm": 2.2095038890838623, + "learning_rate": 4.128591396491901e-05, + "loss": 0.3115, + "step": 8226 + }, + { + "epoch": 0.45159165751920965, + "grad_norm": 1.1816524267196655, + "learning_rate": 4.1281941047560155e-05, + "loss": 0.2929, + "step": 8228 + }, + { + "epoch": 0.4517014270032931, + "grad_norm": 1.4566391706466675, + "learning_rate": 4.127796741599545e-05, + "loss": 0.3635, + "step": 8230 + }, + { + "epoch": 0.4518111964873765, + "grad_norm": 1.5483719110488892, + "learning_rate": 4.12739930703992e-05, + "loss": 0.2412, + "step": 8232 + }, + { + "epoch": 0.4519209659714599, + "grad_norm": 2.0436291694641113, + "learning_rate": 4.127001801094573e-05, + "loss": 0.2799, + "step": 8234 + }, + { + "epoch": 0.45203073545554334, + "grad_norm": 1.8315601348876953, + "learning_rate": 4.126604223780941e-05, + "loss": 0.2335, + "step": 8236 + }, + { + "epoch": 0.45214050493962676, + "grad_norm": 1.644853115081787, + "learning_rate": 4.1262065751164644e-05, + "loss": 0.4031, + "step": 8238 + }, + { + "epoch": 0.4522502744237102, + "grad_norm": 1.5766499042510986, + "learning_rate": 4.125808855118586e-05, + "loss": 0.4216, + "step": 8240 + }, + { + "epoch": 0.4523600439077936, + "grad_norm": 1.7784594297409058, + "learning_rate": 4.1254110638047506e-05, + "loss": 0.2463, + "step": 8242 + }, + { + "epoch": 0.4524698133918771, + "grad_norm": 2.0272486209869385, + "learning_rate": 4.1250132011924083e-05, + "loss": 0.3286, + "step": 8244 + }, + { + "epoch": 0.4525795828759605, + "grad_norm": 1.2000620365142822, + "learning_rate": 4.1246152672990106e-05, + "loss": 0.3049, + "step": 8246 + }, + { + "epoch": 0.45268935236004393, + "grad_norm": 2.5974459648132324, + "learning_rate": 4.124217262142014e-05, + "loss": 0.2913, + "step": 8248 + }, + { + "epoch": 0.45279912184412735, + "grad_norm": 1.4029228687286377, + "learning_rate": 4.1238191857388755e-05, + "loss": 0.2867, + "step": 8250 + }, + { + "epoch": 0.45290889132821077, + "grad_norm": 1.5795823335647583, + "learning_rate": 4.123421038107057e-05, + "loss": 0.4393, + "step": 8252 + }, + { + "epoch": 0.4530186608122942, + "grad_norm": 2.029484510421753, + "learning_rate": 4.1230228192640236e-05, + "loss": 0.3487, + "step": 8254 + }, + { + "epoch": 0.4531284302963776, + "grad_norm": 2.0175745487213135, + "learning_rate": 4.122624529227244e-05, + "loss": 0.3403, + "step": 8256 + }, + { + "epoch": 0.45323819978046104, + "grad_norm": 1.8778767585754395, + "learning_rate": 4.122226168014187e-05, + "loss": 0.2782, + "step": 8258 + }, + { + "epoch": 0.45334796926454446, + "grad_norm": 1.386878252029419, + "learning_rate": 4.121827735642329e-05, + "loss": 0.4033, + "step": 8260 + }, + { + "epoch": 0.4534577387486279, + "grad_norm": 3.1564748287200928, + "learning_rate": 4.1214292321291456e-05, + "loss": 0.3427, + "step": 8262 + }, + { + "epoch": 0.4535675082327113, + "grad_norm": 2.8281500339508057, + "learning_rate": 4.121030657492118e-05, + "loss": 0.4376, + "step": 8264 + }, + { + "epoch": 0.4536772777167947, + "grad_norm": 1.2453105449676514, + "learning_rate": 4.1206320117487285e-05, + "loss": 0.3053, + "step": 8266 + }, + { + "epoch": 0.45378704720087815, + "grad_norm": 1.0891954898834229, + "learning_rate": 4.120233294916465e-05, + "loss": 0.2941, + "step": 8268 + }, + { + "epoch": 0.45389681668496157, + "grad_norm": 1.631539225578308, + "learning_rate": 4.1198345070128166e-05, + "loss": 0.396, + "step": 8270 + }, + { + "epoch": 0.454006586169045, + "grad_norm": 3.411970853805542, + "learning_rate": 4.119435648055276e-05, + "loss": 0.3254, + "step": 8272 + }, + { + "epoch": 0.4541163556531284, + "grad_norm": 3.224733352661133, + "learning_rate": 4.119036718061339e-05, + "loss": 0.2814, + "step": 8274 + }, + { + "epoch": 0.45422612513721183, + "grad_norm": 1.4496906995773315, + "learning_rate": 4.118637717048506e-05, + "loss": 0.3022, + "step": 8276 + }, + { + "epoch": 0.45433589462129526, + "grad_norm": 1.779305338859558, + "learning_rate": 4.118238645034277e-05, + "loss": 0.345, + "step": 8278 + }, + { + "epoch": 0.4544456641053787, + "grad_norm": 1.915695309638977, + "learning_rate": 4.117839502036158e-05, + "loss": 0.4216, + "step": 8280 + }, + { + "epoch": 0.45455543358946215, + "grad_norm": 1.2897106409072876, + "learning_rate": 4.117440288071658e-05, + "loss": 0.169, + "step": 8282 + }, + { + "epoch": 0.4546652030735456, + "grad_norm": 1.6605645418167114, + "learning_rate": 4.117041003158288e-05, + "loss": 0.3441, + "step": 8284 + }, + { + "epoch": 0.454774972557629, + "grad_norm": 2.972477912902832, + "learning_rate": 4.1166416473135626e-05, + "loss": 0.3852, + "step": 8286 + }, + { + "epoch": 0.4548847420417124, + "grad_norm": 1.9087257385253906, + "learning_rate": 4.116242220555e-05, + "loss": 0.4661, + "step": 8288 + }, + { + "epoch": 0.45499451152579584, + "grad_norm": 1.911405086517334, + "learning_rate": 4.11584272290012e-05, + "loss": 0.4472, + "step": 8290 + }, + { + "epoch": 0.45510428100987926, + "grad_norm": 1.2949843406677246, + "learning_rate": 4.115443154366448e-05, + "loss": 0.2705, + "step": 8292 + }, + { + "epoch": 0.4552140504939627, + "grad_norm": 1.823637843132019, + "learning_rate": 4.1150435149715093e-05, + "loss": 0.2981, + "step": 8294 + }, + { + "epoch": 0.4553238199780461, + "grad_norm": 2.5768535137176514, + "learning_rate": 4.114643804732835e-05, + "loss": 0.3641, + "step": 8296 + }, + { + "epoch": 0.45543358946212953, + "grad_norm": 2.6330454349517822, + "learning_rate": 4.114244023667959e-05, + "loss": 0.3642, + "step": 8298 + }, + { + "epoch": 0.45554335894621295, + "grad_norm": 2.8892695903778076, + "learning_rate": 4.113844171794416e-05, + "loss": 0.1947, + "step": 8300 + }, + { + "epoch": 0.4556531284302964, + "grad_norm": 3.302645206451416, + "learning_rate": 4.113444249129748e-05, + "loss": 0.2323, + "step": 8302 + }, + { + "epoch": 0.4557628979143798, + "grad_norm": 1.6377012729644775, + "learning_rate": 4.113044255691495e-05, + "loss": 0.3274, + "step": 8304 + }, + { + "epoch": 0.4558726673984632, + "grad_norm": 1.597448468208313, + "learning_rate": 4.1126441914972036e-05, + "loss": 0.2252, + "step": 8306 + }, + { + "epoch": 0.45598243688254664, + "grad_norm": 1.7439767122268677, + "learning_rate": 4.1122440565644225e-05, + "loss": 0.3134, + "step": 8308 + }, + { + "epoch": 0.45609220636663006, + "grad_norm": 3.1026933193206787, + "learning_rate": 4.111843850910704e-05, + "loss": 0.3741, + "step": 8310 + }, + { + "epoch": 0.4562019758507135, + "grad_norm": 1.381437063217163, + "learning_rate": 4.111443574553602e-05, + "loss": 0.3285, + "step": 8312 + }, + { + "epoch": 0.4563117453347969, + "grad_norm": 1.4108035564422607, + "learning_rate": 4.1110432275106767e-05, + "loss": 0.2325, + "step": 8314 + }, + { + "epoch": 0.4564215148188803, + "grad_norm": 1.6775527000427246, + "learning_rate": 4.1106428097994875e-05, + "loss": 0.323, + "step": 8316 + }, + { + "epoch": 0.4565312843029638, + "grad_norm": 4.843459606170654, + "learning_rate": 4.1102423214375986e-05, + "loss": 0.3294, + "step": 8318 + }, + { + "epoch": 0.4566410537870472, + "grad_norm": 1.5268449783325195, + "learning_rate": 4.109841762442579e-05, + "loss": 0.272, + "step": 8320 + }, + { + "epoch": 0.45675082327113065, + "grad_norm": 2.1699917316436768, + "learning_rate": 4.109441132831997e-05, + "loss": 0.3348, + "step": 8322 + }, + { + "epoch": 0.45686059275521407, + "grad_norm": 1.5189071893692017, + "learning_rate": 4.109040432623428e-05, + "loss": 0.3362, + "step": 8324 + }, + { + "epoch": 0.4569703622392975, + "grad_norm": 1.8002145290374756, + "learning_rate": 4.1086396618344476e-05, + "loss": 0.473, + "step": 8326 + }, + { + "epoch": 0.4570801317233809, + "grad_norm": 1.7756340503692627, + "learning_rate": 4.108238820482636e-05, + "loss": 0.2598, + "step": 8328 + }, + { + "epoch": 0.45718990120746433, + "grad_norm": 1.5088090896606445, + "learning_rate": 4.107837908585576e-05, + "loss": 0.3155, + "step": 8330 + }, + { + "epoch": 0.45729967069154775, + "grad_norm": 2.088538885116577, + "learning_rate": 4.107436926160854e-05, + "loss": 0.3751, + "step": 8332 + }, + { + "epoch": 0.4574094401756312, + "grad_norm": 1.3308888673782349, + "learning_rate": 4.1070358732260583e-05, + "loss": 0.3819, + "step": 8334 + }, + { + "epoch": 0.4575192096597146, + "grad_norm": 2.0622482299804688, + "learning_rate": 4.1066347497987826e-05, + "loss": 0.2604, + "step": 8336 + }, + { + "epoch": 0.457628979143798, + "grad_norm": 1.6706221103668213, + "learning_rate": 4.1062335558966194e-05, + "loss": 0.3356, + "step": 8338 + }, + { + "epoch": 0.45773874862788144, + "grad_norm": 1.3325977325439453, + "learning_rate": 4.1058322915371694e-05, + "loss": 0.3421, + "step": 8340 + }, + { + "epoch": 0.45784851811196486, + "grad_norm": 2.130091905593872, + "learning_rate": 4.105430956738032e-05, + "loss": 0.2736, + "step": 8342 + }, + { + "epoch": 0.4579582875960483, + "grad_norm": 6.228786468505859, + "learning_rate": 4.1050295515168144e-05, + "loss": 0.3749, + "step": 8344 + }, + { + "epoch": 0.4580680570801317, + "grad_norm": 2.1798110008239746, + "learning_rate": 4.1046280758911216e-05, + "loss": 0.4769, + "step": 8346 + }, + { + "epoch": 0.45817782656421513, + "grad_norm": 1.4577704668045044, + "learning_rate": 4.1042265298785664e-05, + "loss": 0.2133, + "step": 8348 + }, + { + "epoch": 0.45828759604829855, + "grad_norm": 1.4751864671707153, + "learning_rate": 4.103824913496761e-05, + "loss": 0.4977, + "step": 8350 + }, + { + "epoch": 0.458397365532382, + "grad_norm": 2.5764830112457275, + "learning_rate": 4.1034232267633235e-05, + "loss": 0.3334, + "step": 8352 + }, + { + "epoch": 0.45850713501646545, + "grad_norm": 3.250229835510254, + "learning_rate": 4.103021469695872e-05, + "loss": 0.3087, + "step": 8354 + }, + { + "epoch": 0.45861690450054887, + "grad_norm": 2.636441707611084, + "learning_rate": 4.102619642312031e-05, + "loss": 0.4133, + "step": 8356 + }, + { + "epoch": 0.4587266739846323, + "grad_norm": 1.079585313796997, + "learning_rate": 4.102217744629427e-05, + "loss": 0.3022, + "step": 8358 + }, + { + "epoch": 0.4588364434687157, + "grad_norm": 2.0278425216674805, + "learning_rate": 4.101815776665689e-05, + "loss": 0.3251, + "step": 8360 + }, + { + "epoch": 0.45894621295279914, + "grad_norm": 1.649854063987732, + "learning_rate": 4.101413738438448e-05, + "loss": 0.3708, + "step": 8362 + }, + { + "epoch": 0.45905598243688256, + "grad_norm": 1.2142541408538818, + "learning_rate": 4.101011629965341e-05, + "loss": 0.2142, + "step": 8364 + }, + { + "epoch": 0.459165751920966, + "grad_norm": 1.5252249240875244, + "learning_rate": 4.1006094512640044e-05, + "loss": 0.4364, + "step": 8366 + }, + { + "epoch": 0.4592755214050494, + "grad_norm": 1.2813236713409424, + "learning_rate": 4.100207202352082e-05, + "loss": 0.3325, + "step": 8368 + }, + { + "epoch": 0.4593852908891328, + "grad_norm": 1.7682527303695679, + "learning_rate": 4.0998048832472174e-05, + "loss": 0.3427, + "step": 8370 + }, + { + "epoch": 0.45949506037321625, + "grad_norm": 4.178377628326416, + "learning_rate": 4.0994024939670576e-05, + "loss": 0.2615, + "step": 8372 + }, + { + "epoch": 0.45960482985729967, + "grad_norm": 1.2886794805526733, + "learning_rate": 4.0990000345292546e-05, + "loss": 0.2612, + "step": 8374 + }, + { + "epoch": 0.4597145993413831, + "grad_norm": 1.6768825054168701, + "learning_rate": 4.098597504951462e-05, + "loss": 0.2349, + "step": 8376 + }, + { + "epoch": 0.4598243688254665, + "grad_norm": 1.8071656227111816, + "learning_rate": 4.098194905251336e-05, + "loss": 0.4609, + "step": 8378 + }, + { + "epoch": 0.45993413830954993, + "grad_norm": 1.6221179962158203, + "learning_rate": 4.097792235446538e-05, + "loss": 0.3507, + "step": 8380 + }, + { + "epoch": 0.46004390779363336, + "grad_norm": 1.4188587665557861, + "learning_rate": 4.097389495554729e-05, + "loss": 0.4461, + "step": 8382 + }, + { + "epoch": 0.4601536772777168, + "grad_norm": 1.585821270942688, + "learning_rate": 4.096986685593577e-05, + "loss": 0.3007, + "step": 8384 + }, + { + "epoch": 0.4602634467618002, + "grad_norm": 1.8804212808609009, + "learning_rate": 4.0965838055807495e-05, + "loss": 0.2901, + "step": 8386 + }, + { + "epoch": 0.4603732162458836, + "grad_norm": 1.1470831632614136, + "learning_rate": 4.096180855533921e-05, + "loss": 0.2508, + "step": 8388 + }, + { + "epoch": 0.46048298572996704, + "grad_norm": 1.4300342798233032, + "learning_rate": 4.0957778354707646e-05, + "loss": 0.2673, + "step": 8390 + }, + { + "epoch": 0.4605927552140505, + "grad_norm": 2.9391555786132812, + "learning_rate": 4.09537474540896e-05, + "loss": 0.3954, + "step": 8392 + }, + { + "epoch": 0.46070252469813394, + "grad_norm": 1.7882627248764038, + "learning_rate": 4.0949715853661895e-05, + "loss": 0.2939, + "step": 8394 + }, + { + "epoch": 0.46081229418221736, + "grad_norm": 2.0725677013397217, + "learning_rate": 4.094568355360136e-05, + "loss": 0.3042, + "step": 8396 + }, + { + "epoch": 0.4609220636663008, + "grad_norm": 1.2323737144470215, + "learning_rate": 4.094165055408488e-05, + "loss": 0.2202, + "step": 8398 + }, + { + "epoch": 0.4610318331503842, + "grad_norm": 1.027429223060608, + "learning_rate": 4.093761685528935e-05, + "loss": 0.4931, + "step": 8400 + }, + { + "epoch": 0.46114160263446763, + "grad_norm": 2.7206332683563232, + "learning_rate": 4.093358245739173e-05, + "loss": 0.3352, + "step": 8402 + }, + { + "epoch": 0.46125137211855105, + "grad_norm": 2.2153265476226807, + "learning_rate": 4.092954736056897e-05, + "loss": 0.4805, + "step": 8404 + }, + { + "epoch": 0.4613611416026345, + "grad_norm": 2.280742883682251, + "learning_rate": 4.0925511564998084e-05, + "loss": 0.3411, + "step": 8406 + }, + { + "epoch": 0.4614709110867179, + "grad_norm": 1.6092712879180908, + "learning_rate": 4.0921475070856084e-05, + "loss": 0.2294, + "step": 8408 + }, + { + "epoch": 0.4615806805708013, + "grad_norm": 2.343315839767456, + "learning_rate": 4.091743787832005e-05, + "loss": 0.4021, + "step": 8410 + }, + { + "epoch": 0.46169045005488474, + "grad_norm": 1.9625953435897827, + "learning_rate": 4.091339998756706e-05, + "loss": 0.3098, + "step": 8412 + }, + { + "epoch": 0.46180021953896816, + "grad_norm": 1.154941439628601, + "learning_rate": 4.090936139877424e-05, + "loss": 0.2907, + "step": 8414 + }, + { + "epoch": 0.4619099890230516, + "grad_norm": 2.180675983428955, + "learning_rate": 4.090532211211874e-05, + "loss": 0.2665, + "step": 8416 + }, + { + "epoch": 0.462019758507135, + "grad_norm": 1.9459776878356934, + "learning_rate": 4.090128212777774e-05, + "loss": 0.3171, + "step": 8418 + }, + { + "epoch": 0.4621295279912184, + "grad_norm": 2.139481544494629, + "learning_rate": 4.089724144592846e-05, + "loss": 0.3437, + "step": 8420 + }, + { + "epoch": 0.46223929747530185, + "grad_norm": 1.9970202445983887, + "learning_rate": 4.0893200066748144e-05, + "loss": 0.2833, + "step": 8422 + }, + { + "epoch": 0.46234906695938527, + "grad_norm": 1.4440422058105469, + "learning_rate": 4.088915799041406e-05, + "loss": 0.265, + "step": 8424 + }, + { + "epoch": 0.4624588364434687, + "grad_norm": 1.3536992073059082, + "learning_rate": 4.088511521710352e-05, + "loss": 0.2572, + "step": 8426 + }, + { + "epoch": 0.46256860592755217, + "grad_norm": 3.8492283821105957, + "learning_rate": 4.088107174699387e-05, + "loss": 0.3371, + "step": 8428 + }, + { + "epoch": 0.4626783754116356, + "grad_norm": 1.1407679319381714, + "learning_rate": 4.087702758026245e-05, + "loss": 0.2595, + "step": 8430 + }, + { + "epoch": 0.462788144895719, + "grad_norm": 1.3482921123504639, + "learning_rate": 4.087298271708667e-05, + "loss": 0.33, + "step": 8432 + }, + { + "epoch": 0.46289791437980243, + "grad_norm": 1.1302157640457153, + "learning_rate": 4.086893715764397e-05, + "loss": 0.28, + "step": 8434 + }, + { + "epoch": 0.46300768386388585, + "grad_norm": 2.9117794036865234, + "learning_rate": 4.086489090211179e-05, + "loss": 0.3262, + "step": 8436 + }, + { + "epoch": 0.4631174533479693, + "grad_norm": 1.282164454460144, + "learning_rate": 4.086084395066763e-05, + "loss": 0.2129, + "step": 8438 + }, + { + "epoch": 0.4632272228320527, + "grad_norm": 1.6204880475997925, + "learning_rate": 4.0856796303489e-05, + "loss": 0.4624, + "step": 8440 + }, + { + "epoch": 0.4633369923161361, + "grad_norm": 1.2469539642333984, + "learning_rate": 4.0852747960753454e-05, + "loss": 0.2205, + "step": 8442 + }, + { + "epoch": 0.46344676180021954, + "grad_norm": 2.8859519958496094, + "learning_rate": 4.0848698922638575e-05, + "loss": 0.3867, + "step": 8444 + }, + { + "epoch": 0.46355653128430296, + "grad_norm": 1.2002332210540771, + "learning_rate": 4.084464918932197e-05, + "loss": 0.3468, + "step": 8446 + }, + { + "epoch": 0.4636663007683864, + "grad_norm": 1.144768238067627, + "learning_rate": 4.084059876098128e-05, + "loss": 0.3043, + "step": 8448 + }, + { + "epoch": 0.4637760702524698, + "grad_norm": 2.388307809829712, + "learning_rate": 4.0836547637794176e-05, + "loss": 0.3785, + "step": 8450 + }, + { + "epoch": 0.46388583973655323, + "grad_norm": 2.6816978454589844, + "learning_rate": 4.083249581993837e-05, + "loss": 0.2581, + "step": 8452 + }, + { + "epoch": 0.46399560922063665, + "grad_norm": 2.7059357166290283, + "learning_rate": 4.0828443307591586e-05, + "loss": 0.5075, + "step": 8454 + }, + { + "epoch": 0.4641053787047201, + "grad_norm": 1.2787951231002808, + "learning_rate": 4.0824390100931586e-05, + "loss": 0.3443, + "step": 8456 + }, + { + "epoch": 0.4642151481888035, + "grad_norm": 1.0309239625930786, + "learning_rate": 4.0820336200136163e-05, + "loss": 0.2975, + "step": 8458 + }, + { + "epoch": 0.4643249176728869, + "grad_norm": 2.059414863586426, + "learning_rate": 4.081628160538315e-05, + "loss": 0.3168, + "step": 8460 + }, + { + "epoch": 0.46443468715697034, + "grad_norm": 1.6726640462875366, + "learning_rate": 4.081222631685038e-05, + "loss": 0.3256, + "step": 8462 + }, + { + "epoch": 0.46454445664105376, + "grad_norm": 1.112519383430481, + "learning_rate": 4.080817033471577e-05, + "loss": 0.2643, + "step": 8464 + }, + { + "epoch": 0.46465422612513724, + "grad_norm": 1.7732957601547241, + "learning_rate": 4.0804113659157204e-05, + "loss": 0.2773, + "step": 8466 + }, + { + "epoch": 0.46476399560922066, + "grad_norm": 1.3896464109420776, + "learning_rate": 4.080005629035265e-05, + "loss": 0.2362, + "step": 8468 + }, + { + "epoch": 0.4648737650933041, + "grad_norm": 1.524220585823059, + "learning_rate": 4.079599822848008e-05, + "loss": 0.3312, + "step": 8470 + }, + { + "epoch": 0.4649835345773875, + "grad_norm": 1.8381911516189575, + "learning_rate": 4.0791939473717485e-05, + "loss": 0.3249, + "step": 8472 + }, + { + "epoch": 0.4650933040614709, + "grad_norm": 3.4486083984375, + "learning_rate": 4.078788002624292e-05, + "loss": 0.3623, + "step": 8474 + }, + { + "epoch": 0.46520307354555435, + "grad_norm": 2.0371644496917725, + "learning_rate": 4.0783819886234445e-05, + "loss": 0.3915, + "step": 8476 + }, + { + "epoch": 0.46531284302963777, + "grad_norm": 1.349343180656433, + "learning_rate": 4.077975905387016e-05, + "loss": 0.3186, + "step": 8478 + }, + { + "epoch": 0.4654226125137212, + "grad_norm": 1.8479667901992798, + "learning_rate": 4.077569752932819e-05, + "loss": 0.3139, + "step": 8480 + }, + { + "epoch": 0.4655323819978046, + "grad_norm": 1.7775824069976807, + "learning_rate": 4.077163531278669e-05, + "loss": 0.496, + "step": 8482 + }, + { + "epoch": 0.46564215148188803, + "grad_norm": 2.075563907623291, + "learning_rate": 4.076757240442386e-05, + "loss": 0.3145, + "step": 8484 + }, + { + "epoch": 0.46575192096597146, + "grad_norm": 2.5856363773345947, + "learning_rate": 4.0763508804417905e-05, + "loss": 0.3778, + "step": 8486 + }, + { + "epoch": 0.4658616904500549, + "grad_norm": 1.5743770599365234, + "learning_rate": 4.0759444512947095e-05, + "loss": 0.302, + "step": 8488 + }, + { + "epoch": 0.4659714599341383, + "grad_norm": 1.8871400356292725, + "learning_rate": 4.0755379530189684e-05, + "loss": 0.3681, + "step": 8490 + }, + { + "epoch": 0.4660812294182217, + "grad_norm": 1.2275429964065552, + "learning_rate": 4.0751313856324e-05, + "loss": 0.4335, + "step": 8492 + }, + { + "epoch": 0.46619099890230514, + "grad_norm": 1.3829360008239746, + "learning_rate": 4.074724749152837e-05, + "loss": 0.2692, + "step": 8494 + }, + { + "epoch": 0.46630076838638856, + "grad_norm": 2.379483938217163, + "learning_rate": 4.0743180435981185e-05, + "loss": 0.3739, + "step": 8496 + }, + { + "epoch": 0.466410537870472, + "grad_norm": 1.8819562196731567, + "learning_rate": 4.073911268986083e-05, + "loss": 0.4107, + "step": 8498 + }, + { + "epoch": 0.4665203073545554, + "grad_norm": 1.4660289287567139, + "learning_rate": 4.073504425334574e-05, + "loss": 0.3697, + "step": 8500 + }, + { + "epoch": 0.4666300768386389, + "grad_norm": 1.5944684743881226, + "learning_rate": 4.073097512661438e-05, + "loss": 0.327, + "step": 8502 + }, + { + "epoch": 0.4667398463227223, + "grad_norm": 2.6113839149475098, + "learning_rate": 4.072690530984523e-05, + "loss": 0.2914, + "step": 8504 + }, + { + "epoch": 0.46684961580680573, + "grad_norm": 1.2178751230239868, + "learning_rate": 4.0722834803216836e-05, + "loss": 0.3, + "step": 8506 + }, + { + "epoch": 0.46695938529088915, + "grad_norm": 1.2338194847106934, + "learning_rate": 4.071876360690772e-05, + "loss": 0.2642, + "step": 8508 + }, + { + "epoch": 0.4670691547749726, + "grad_norm": 2.1794180870056152, + "learning_rate": 4.071469172109649e-05, + "loss": 0.346, + "step": 8510 + }, + { + "epoch": 0.467178924259056, + "grad_norm": 1.5034236907958984, + "learning_rate": 4.0710619145961735e-05, + "loss": 0.3492, + "step": 8512 + }, + { + "epoch": 0.4672886937431394, + "grad_norm": 1.5526858568191528, + "learning_rate": 4.070654588168212e-05, + "loss": 0.3077, + "step": 8514 + }, + { + "epoch": 0.46739846322722284, + "grad_norm": 1.172701120376587, + "learning_rate": 4.070247192843632e-05, + "loss": 0.2851, + "step": 8516 + }, + { + "epoch": 0.46750823271130626, + "grad_norm": 1.5362259149551392, + "learning_rate": 4.0698397286403024e-05, + "loss": 0.2822, + "step": 8518 + }, + { + "epoch": 0.4676180021953897, + "grad_norm": 1.547666311264038, + "learning_rate": 4.069432195576096e-05, + "loss": 0.2753, + "step": 8520 + }, + { + "epoch": 0.4677277716794731, + "grad_norm": 2.4164469242095947, + "learning_rate": 4.069024593668891e-05, + "loss": 0.3447, + "step": 8522 + }, + { + "epoch": 0.4678375411635565, + "grad_norm": 1.1644399166107178, + "learning_rate": 4.0686169229365665e-05, + "loss": 0.2573, + "step": 8524 + }, + { + "epoch": 0.46794731064763995, + "grad_norm": 1.391152262687683, + "learning_rate": 4.068209183397004e-05, + "loss": 0.4399, + "step": 8526 + }, + { + "epoch": 0.46805708013172337, + "grad_norm": 2.48172926902771, + "learning_rate": 4.06780137506809e-05, + "loss": 0.3447, + "step": 8528 + }, + { + "epoch": 0.4681668496158068, + "grad_norm": 1.8817487955093384, + "learning_rate": 4.067393497967712e-05, + "loss": 0.3918, + "step": 8530 + }, + { + "epoch": 0.4682766190998902, + "grad_norm": 1.6532938480377197, + "learning_rate": 4.066985552113762e-05, + "loss": 0.3675, + "step": 8532 + }, + { + "epoch": 0.46838638858397363, + "grad_norm": 2.93994140625, + "learning_rate": 4.0665775375241346e-05, + "loss": 0.1979, + "step": 8534 + }, + { + "epoch": 0.46849615806805706, + "grad_norm": 1.3949158191680908, + "learning_rate": 4.066169454216727e-05, + "loss": 0.2379, + "step": 8536 + }, + { + "epoch": 0.4686059275521405, + "grad_norm": 1.3840385675430298, + "learning_rate": 4.0657613022094405e-05, + "loss": 0.3665, + "step": 8538 + }, + { + "epoch": 0.46871569703622395, + "grad_norm": 1.4185101985931396, + "learning_rate": 4.0653530815201776e-05, + "loss": 0.3345, + "step": 8540 + }, + { + "epoch": 0.4688254665203074, + "grad_norm": 2.7573587894439697, + "learning_rate": 4.064944792166845e-05, + "loss": 0.3412, + "step": 8542 + }, + { + "epoch": 0.4689352360043908, + "grad_norm": 1.683763027191162, + "learning_rate": 4.064536434167353e-05, + "loss": 0.2677, + "step": 8544 + }, + { + "epoch": 0.4690450054884742, + "grad_norm": 1.7363033294677734, + "learning_rate": 4.064128007539614e-05, + "loss": 0.3169, + "step": 8546 + }, + { + "epoch": 0.46915477497255764, + "grad_norm": 2.090158462524414, + "learning_rate": 4.063719512301544e-05, + "loss": 0.3434, + "step": 8548 + }, + { + "epoch": 0.46926454445664106, + "grad_norm": 2.1588051319122314, + "learning_rate": 4.06331094847106e-05, + "loss": 0.3446, + "step": 8550 + }, + { + "epoch": 0.4693743139407245, + "grad_norm": 1.793947458267212, + "learning_rate": 4.062902316066084e-05, + "loss": 0.2815, + "step": 8552 + }, + { + "epoch": 0.4694840834248079, + "grad_norm": 2.706084728240967, + "learning_rate": 4.0624936151045426e-05, + "loss": 0.3868, + "step": 8554 + }, + { + "epoch": 0.46959385290889133, + "grad_norm": 2.0594396591186523, + "learning_rate": 4.062084845604362e-05, + "loss": 0.268, + "step": 8556 + }, + { + "epoch": 0.46970362239297475, + "grad_norm": 1.7215648889541626, + "learning_rate": 4.0616760075834715e-05, + "loss": 0.3956, + "step": 8558 + }, + { + "epoch": 0.4698133918770582, + "grad_norm": 1.8097460269927979, + "learning_rate": 4.0612671010598075e-05, + "loss": 0.3208, + "step": 8560 + }, + { + "epoch": 0.4699231613611416, + "grad_norm": 1.3564112186431885, + "learning_rate": 4.060858126051305e-05, + "loss": 0.2601, + "step": 8562 + }, + { + "epoch": 0.470032930845225, + "grad_norm": 3.881680727005005, + "learning_rate": 4.060449082575904e-05, + "loss": 0.4318, + "step": 8564 + }, + { + "epoch": 0.47014270032930844, + "grad_norm": 1.9430588483810425, + "learning_rate": 4.060039970651547e-05, + "loss": 0.2449, + "step": 8566 + }, + { + "epoch": 0.47025246981339186, + "grad_norm": 1.223751187324524, + "learning_rate": 4.0596307902961796e-05, + "loss": 0.3139, + "step": 8568 + }, + { + "epoch": 0.4703622392974753, + "grad_norm": 1.361564040184021, + "learning_rate": 4.05922154152775e-05, + "loss": 0.4194, + "step": 8570 + }, + { + "epoch": 0.4704720087815587, + "grad_norm": 1.8475648164749146, + "learning_rate": 4.0588122243642114e-05, + "loss": 0.4143, + "step": 8572 + }, + { + "epoch": 0.4705817782656421, + "grad_norm": 1.3537644147872925, + "learning_rate": 4.058402838823517e-05, + "loss": 0.3757, + "step": 8574 + }, + { + "epoch": 0.4706915477497256, + "grad_norm": 1.9378342628479004, + "learning_rate": 4.057993384923626e-05, + "loss": 0.3353, + "step": 8576 + }, + { + "epoch": 0.470801317233809, + "grad_norm": 1.7111073732376099, + "learning_rate": 4.057583862682498e-05, + "loss": 0.2384, + "step": 8578 + }, + { + "epoch": 0.47091108671789245, + "grad_norm": 2.4817163944244385, + "learning_rate": 4.057174272118096e-05, + "loss": 0.4403, + "step": 8580 + }, + { + "epoch": 0.47102085620197587, + "grad_norm": 1.127567172050476, + "learning_rate": 4.0567646132483875e-05, + "loss": 0.2322, + "step": 8582 + }, + { + "epoch": 0.4711306256860593, + "grad_norm": 2.643256425857544, + "learning_rate": 4.0563548860913415e-05, + "loss": 0.4011, + "step": 8584 + }, + { + "epoch": 0.4712403951701427, + "grad_norm": 1.4234589338302612, + "learning_rate": 4.0559450906649315e-05, + "loss": 0.3417, + "step": 8586 + }, + { + "epoch": 0.47135016465422613, + "grad_norm": 3.975149393081665, + "learning_rate": 4.055535226987133e-05, + "loss": 0.4005, + "step": 8588 + }, + { + "epoch": 0.47145993413830956, + "grad_norm": 1.4527102708816528, + "learning_rate": 4.055125295075924e-05, + "loss": 0.3233, + "step": 8590 + }, + { + "epoch": 0.471569703622393, + "grad_norm": 1.2908766269683838, + "learning_rate": 4.054715294949287e-05, + "loss": 0.316, + "step": 8592 + }, + { + "epoch": 0.4716794731064764, + "grad_norm": 1.6417427062988281, + "learning_rate": 4.054305226625207e-05, + "loss": 0.3117, + "step": 8594 + }, + { + "epoch": 0.4717892425905598, + "grad_norm": 1.3254948854446411, + "learning_rate": 4.053895090121669e-05, + "loss": 0.2787, + "step": 8596 + }, + { + "epoch": 0.47189901207464324, + "grad_norm": 1.576761245727539, + "learning_rate": 4.053484885456666e-05, + "loss": 0.3374, + "step": 8598 + }, + { + "epoch": 0.47200878155872666, + "grad_norm": 1.8205922842025757, + "learning_rate": 4.0530746126481915e-05, + "loss": 0.4846, + "step": 8600 + }, + { + "epoch": 0.4721185510428101, + "grad_norm": 0.9689183831214905, + "learning_rate": 4.052664271714242e-05, + "loss": 0.2863, + "step": 8602 + }, + { + "epoch": 0.4722283205268935, + "grad_norm": 1.9561008214950562, + "learning_rate": 4.0522538626728156e-05, + "loss": 0.4049, + "step": 8604 + }, + { + "epoch": 0.47233809001097693, + "grad_norm": 1.9191542863845825, + "learning_rate": 4.0518433855419155e-05, + "loss": 0.2692, + "step": 8606 + }, + { + "epoch": 0.47244785949506035, + "grad_norm": 1.6041009426116943, + "learning_rate": 4.051432840339549e-05, + "loss": 0.2841, + "step": 8608 + }, + { + "epoch": 0.4725576289791438, + "grad_norm": 2.1041462421417236, + "learning_rate": 4.051022227083723e-05, + "loss": 0.3297, + "step": 8610 + }, + { + "epoch": 0.4726673984632272, + "grad_norm": 2.0920350551605225, + "learning_rate": 4.050611545792448e-05, + "loss": 0.3815, + "step": 8612 + }, + { + "epoch": 0.4727771679473107, + "grad_norm": 1.0190370082855225, + "learning_rate": 4.050200796483741e-05, + "loss": 0.225, + "step": 8614 + }, + { + "epoch": 0.4728869374313941, + "grad_norm": 2.43933367729187, + "learning_rate": 4.0497899791756175e-05, + "loss": 0.2967, + "step": 8616 + }, + { + "epoch": 0.4729967069154775, + "grad_norm": 1.5258729457855225, + "learning_rate": 4.049379093886099e-05, + "loss": 0.3514, + "step": 8618 + }, + { + "epoch": 0.47310647639956094, + "grad_norm": 1.676794409751892, + "learning_rate": 4.048968140633208e-05, + "loss": 0.2805, + "step": 8620 + }, + { + "epoch": 0.47321624588364436, + "grad_norm": 2.102303981781006, + "learning_rate": 4.048557119434972e-05, + "loss": 0.3358, + "step": 8622 + }, + { + "epoch": 0.4733260153677278, + "grad_norm": 2.7345967292785645, + "learning_rate": 4.048146030309421e-05, + "loss": 0.2691, + "step": 8624 + }, + { + "epoch": 0.4734357848518112, + "grad_norm": 1.5964179039001465, + "learning_rate": 4.047734873274586e-05, + "loss": 0.5003, + "step": 8626 + }, + { + "epoch": 0.4735455543358946, + "grad_norm": 1.714919924736023, + "learning_rate": 4.047323648348501e-05, + "loss": 0.2376, + "step": 8628 + }, + { + "epoch": 0.47365532381997805, + "grad_norm": 1.9093276262283325, + "learning_rate": 4.046912355549208e-05, + "loss": 0.3542, + "step": 8630 + }, + { + "epoch": 0.47376509330406147, + "grad_norm": 1.653114914894104, + "learning_rate": 4.0465009948947454e-05, + "loss": 0.5583, + "step": 8632 + }, + { + "epoch": 0.4738748627881449, + "grad_norm": 2.0079030990600586, + "learning_rate": 4.0460895664031585e-05, + "loss": 0.3437, + "step": 8634 + }, + { + "epoch": 0.4739846322722283, + "grad_norm": 1.5226585865020752, + "learning_rate": 4.0456780700924955e-05, + "loss": 0.238, + "step": 8636 + }, + { + "epoch": 0.47409440175631173, + "grad_norm": 2.128345489501953, + "learning_rate": 4.0452665059808045e-05, + "loss": 0.2866, + "step": 8638 + }, + { + "epoch": 0.47420417124039516, + "grad_norm": 1.6698248386383057, + "learning_rate": 4.044854874086141e-05, + "loss": 0.2654, + "step": 8640 + }, + { + "epoch": 0.4743139407244786, + "grad_norm": 1.389648675918579, + "learning_rate": 4.04444317442656e-05, + "loss": 0.3105, + "step": 8642 + }, + { + "epoch": 0.474423710208562, + "grad_norm": 1.5086030960083008, + "learning_rate": 4.0440314070201194e-05, + "loss": 0.324, + "step": 8644 + }, + { + "epoch": 0.4745334796926454, + "grad_norm": 2.5787734985351562, + "learning_rate": 4.043619571884884e-05, + "loss": 0.4567, + "step": 8646 + }, + { + "epoch": 0.47464324917672884, + "grad_norm": 2.0992650985717773, + "learning_rate": 4.0432076690389176e-05, + "loss": 0.2951, + "step": 8648 + }, + { + "epoch": 0.4747530186608123, + "grad_norm": 1.7718474864959717, + "learning_rate": 4.042795698500288e-05, + "loss": 0.2216, + "step": 8650 + }, + { + "epoch": 0.47486278814489574, + "grad_norm": 1.1252772808074951, + "learning_rate": 4.042383660287067e-05, + "loss": 0.2659, + "step": 8652 + }, + { + "epoch": 0.47497255762897916, + "grad_norm": 1.5651158094406128, + "learning_rate": 4.041971554417328e-05, + "loss": 0.3089, + "step": 8654 + }, + { + "epoch": 0.4750823271130626, + "grad_norm": 1.360022783279419, + "learning_rate": 4.0415593809091476e-05, + "loss": 0.4196, + "step": 8656 + }, + { + "epoch": 0.475192096597146, + "grad_norm": 1.7036468982696533, + "learning_rate": 4.041147139780607e-05, + "loss": 0.382, + "step": 8658 + }, + { + "epoch": 0.47530186608122943, + "grad_norm": 1.8930476903915405, + "learning_rate": 4.0407348310497884e-05, + "loss": 0.3669, + "step": 8660 + }, + { + "epoch": 0.47541163556531285, + "grad_norm": 1.6027714014053345, + "learning_rate": 4.0403224547347776e-05, + "loss": 0.381, + "step": 8662 + }, + { + "epoch": 0.4755214050493963, + "grad_norm": 2.1036148071289062, + "learning_rate": 4.039910010853664e-05, + "loss": 0.256, + "step": 8664 + }, + { + "epoch": 0.4756311745334797, + "grad_norm": 2.3946340084075928, + "learning_rate": 4.039497499424538e-05, + "loss": 0.3594, + "step": 8666 + }, + { + "epoch": 0.4757409440175631, + "grad_norm": 1.5805374383926392, + "learning_rate": 4.0390849204654965e-05, + "loss": 0.3899, + "step": 8668 + }, + { + "epoch": 0.47585071350164654, + "grad_norm": 1.7196357250213623, + "learning_rate": 4.038672273994635e-05, + "loss": 0.3497, + "step": 8670 + }, + { + "epoch": 0.47596048298572996, + "grad_norm": 1.0855778455734253, + "learning_rate": 4.0382595600300566e-05, + "loss": 0.2663, + "step": 8672 + }, + { + "epoch": 0.4760702524698134, + "grad_norm": 1.4036169052124023, + "learning_rate": 4.037846778589862e-05, + "loss": 0.3617, + "step": 8674 + }, + { + "epoch": 0.4761800219538968, + "grad_norm": 1.3233675956726074, + "learning_rate": 4.037433929692161e-05, + "loss": 0.2828, + "step": 8676 + }, + { + "epoch": 0.4762897914379802, + "grad_norm": 1.508034586906433, + "learning_rate": 4.037021013355061e-05, + "loss": 0.2364, + "step": 8678 + }, + { + "epoch": 0.47639956092206365, + "grad_norm": 1.4808440208435059, + "learning_rate": 4.0366080295966754e-05, + "loss": 0.4176, + "step": 8680 + }, + { + "epoch": 0.47650933040614707, + "grad_norm": 1.7735154628753662, + "learning_rate": 4.036194978435119e-05, + "loss": 0.2757, + "step": 8682 + }, + { + "epoch": 0.4766190998902305, + "grad_norm": 3.694556713104248, + "learning_rate": 4.035781859888512e-05, + "loss": 0.3189, + "step": 8684 + }, + { + "epoch": 0.47672886937431397, + "grad_norm": 2.148141384124756, + "learning_rate": 4.0353686739749734e-05, + "loss": 0.3795, + "step": 8686 + }, + { + "epoch": 0.4768386388583974, + "grad_norm": 2.0570671558380127, + "learning_rate": 4.034955420712629e-05, + "loss": 0.3257, + "step": 8688 + }, + { + "epoch": 0.4769484083424808, + "grad_norm": 1.6955592632293701, + "learning_rate": 4.034542100119606e-05, + "loss": 0.3154, + "step": 8690 + }, + { + "epoch": 0.47705817782656423, + "grad_norm": 1.2767672538757324, + "learning_rate": 4.0341287122140346e-05, + "loss": 0.2216, + "step": 8692 + }, + { + "epoch": 0.47716794731064766, + "grad_norm": 1.4313441514968872, + "learning_rate": 4.033715257014048e-05, + "loss": 0.3256, + "step": 8694 + }, + { + "epoch": 0.4772777167947311, + "grad_norm": 1.9365800619125366, + "learning_rate": 4.033301734537782e-05, + "loss": 0.2742, + "step": 8696 + }, + { + "epoch": 0.4773874862788145, + "grad_norm": 1.6155753135681152, + "learning_rate": 4.032888144803376e-05, + "loss": 0.2564, + "step": 8698 + }, + { + "epoch": 0.4774972557628979, + "grad_norm": 1.4080674648284912, + "learning_rate": 4.032474487828972e-05, + "loss": 0.5036, + "step": 8700 + }, + { + "epoch": 0.47760702524698134, + "grad_norm": 1.5543922185897827, + "learning_rate": 4.032060763632716e-05, + "loss": 0.3452, + "step": 8702 + }, + { + "epoch": 0.47771679473106476, + "grad_norm": 2.416071891784668, + "learning_rate": 4.031646972232754e-05, + "loss": 0.2985, + "step": 8704 + }, + { + "epoch": 0.4778265642151482, + "grad_norm": 1.2470059394836426, + "learning_rate": 4.0312331136472385e-05, + "loss": 0.2627, + "step": 8706 + }, + { + "epoch": 0.4779363336992316, + "grad_norm": 1.6356616020202637, + "learning_rate": 4.0308191878943237e-05, + "loss": 0.2846, + "step": 8708 + }, + { + "epoch": 0.47804610318331503, + "grad_norm": 2.639033555984497, + "learning_rate": 4.030405194992164e-05, + "loss": 0.3755, + "step": 8710 + }, + { + "epoch": 0.47815587266739845, + "grad_norm": 1.5863823890686035, + "learning_rate": 4.029991134958922e-05, + "loss": 0.3246, + "step": 8712 + }, + { + "epoch": 0.4782656421514819, + "grad_norm": 1.2642024755477905, + "learning_rate": 4.0295770078127594e-05, + "loss": 0.2377, + "step": 8714 + }, + { + "epoch": 0.4783754116355653, + "grad_norm": 1.6965131759643555, + "learning_rate": 4.0291628135718404e-05, + "loss": 0.4143, + "step": 8716 + }, + { + "epoch": 0.4784851811196487, + "grad_norm": 2.1172831058502197, + "learning_rate": 4.0287485522543355e-05, + "loss": 0.3853, + "step": 8718 + }, + { + "epoch": 0.47859495060373214, + "grad_norm": 2.2036190032958984, + "learning_rate": 4.0283342238784154e-05, + "loss": 0.2983, + "step": 8720 + }, + { + "epoch": 0.47870472008781556, + "grad_norm": 2.996178150177002, + "learning_rate": 4.027919828462255e-05, + "loss": 0.3624, + "step": 8722 + }, + { + "epoch": 0.47881448957189904, + "grad_norm": 3.82488751411438, + "learning_rate": 4.027505366024032e-05, + "loss": 0.2996, + "step": 8724 + }, + { + "epoch": 0.47892425905598246, + "grad_norm": 1.5920789241790771, + "learning_rate": 4.027090836581925e-05, + "loss": 0.3758, + "step": 8726 + }, + { + "epoch": 0.4790340285400659, + "grad_norm": 1.532612681388855, + "learning_rate": 4.0266762401541195e-05, + "loss": 0.3625, + "step": 8728 + }, + { + "epoch": 0.4791437980241493, + "grad_norm": 1.6033190488815308, + "learning_rate": 4.0262615767587997e-05, + "loss": 0.2546, + "step": 8730 + }, + { + "epoch": 0.4792535675082327, + "grad_norm": 3.070883274078369, + "learning_rate": 4.025846846414156e-05, + "loss": 0.4727, + "step": 8732 + }, + { + "epoch": 0.47936333699231615, + "grad_norm": 1.5297569036483765, + "learning_rate": 4.025432049138381e-05, + "loss": 0.3465, + "step": 8734 + }, + { + "epoch": 0.47947310647639957, + "grad_norm": 1.9064351320266724, + "learning_rate": 4.0250171849496685e-05, + "loss": 0.323, + "step": 8736 + }, + { + "epoch": 0.479582875960483, + "grad_norm": 1.3727929592132568, + "learning_rate": 4.0246022538662174e-05, + "loss": 0.3064, + "step": 8738 + }, + { + "epoch": 0.4796926454445664, + "grad_norm": 1.274724006652832, + "learning_rate": 4.0241872559062286e-05, + "loss": 0.373, + "step": 8740 + }, + { + "epoch": 0.47980241492864983, + "grad_norm": 1.864495038986206, + "learning_rate": 4.023772191087905e-05, + "loss": 0.3346, + "step": 8742 + }, + { + "epoch": 0.47991218441273326, + "grad_norm": 1.669333815574646, + "learning_rate": 4.023357059429454e-05, + "loss": 0.2667, + "step": 8744 + }, + { + "epoch": 0.4800219538968167, + "grad_norm": 1.9146604537963867, + "learning_rate": 4.022941860949085e-05, + "loss": 0.3177, + "step": 8746 + }, + { + "epoch": 0.4801317233809001, + "grad_norm": 1.375328540802002, + "learning_rate": 4.0225265956650124e-05, + "loss": 0.3636, + "step": 8748 + }, + { + "epoch": 0.4802414928649835, + "grad_norm": 2.130614757537842, + "learning_rate": 4.0221112635954485e-05, + "loss": 0.4269, + "step": 8750 + }, + { + "epoch": 0.48035126234906694, + "grad_norm": 1.4790095090866089, + "learning_rate": 4.021695864758615e-05, + "loss": 0.2927, + "step": 8752 + }, + { + "epoch": 0.48046103183315036, + "grad_norm": 1.8386095762252808, + "learning_rate": 4.021280399172731e-05, + "loss": 0.4135, + "step": 8754 + }, + { + "epoch": 0.4805708013172338, + "grad_norm": 2.1053874492645264, + "learning_rate": 4.020864866856022e-05, + "loss": 0.4609, + "step": 8756 + }, + { + "epoch": 0.4806805708013172, + "grad_norm": 2.984914541244507, + "learning_rate": 4.020449267826716e-05, + "loss": 0.3263, + "step": 8758 + }, + { + "epoch": 0.4807903402854007, + "grad_norm": 1.4272053241729736, + "learning_rate": 4.020033602103042e-05, + "loss": 0.2899, + "step": 8760 + }, + { + "epoch": 0.4809001097694841, + "grad_norm": 2.0961692333221436, + "learning_rate": 4.019617869703232e-05, + "loss": 0.3441, + "step": 8762 + }, + { + "epoch": 0.48100987925356753, + "grad_norm": 1.4326351881027222, + "learning_rate": 4.0192020706455245e-05, + "loss": 0.4114, + "step": 8764 + }, + { + "epoch": 0.48111964873765095, + "grad_norm": 2.8796331882476807, + "learning_rate": 4.0187862049481576e-05, + "loss": 0.5424, + "step": 8766 + }, + { + "epoch": 0.4812294182217344, + "grad_norm": 4.235195636749268, + "learning_rate": 4.018370272629373e-05, + "loss": 0.3307, + "step": 8768 + }, + { + "epoch": 0.4813391877058178, + "grad_norm": 2.121422052383423, + "learning_rate": 4.017954273707416e-05, + "loss": 0.1981, + "step": 8770 + }, + { + "epoch": 0.4814489571899012, + "grad_norm": 1.1243647336959839, + "learning_rate": 4.017538208200534e-05, + "loss": 0.2962, + "step": 8772 + }, + { + "epoch": 0.48155872667398464, + "grad_norm": 2.156158208847046, + "learning_rate": 4.017122076126977e-05, + "loss": 0.394, + "step": 8774 + }, + { + "epoch": 0.48166849615806806, + "grad_norm": 1.7912416458129883, + "learning_rate": 4.0167058775049996e-05, + "loss": 0.3989, + "step": 8776 + }, + { + "epoch": 0.4817782656421515, + "grad_norm": 1.953059196472168, + "learning_rate": 4.0162896123528585e-05, + "loss": 0.2744, + "step": 8778 + }, + { + "epoch": 0.4818880351262349, + "grad_norm": 1.9908020496368408, + "learning_rate": 4.015873280688811e-05, + "loss": 0.4705, + "step": 8780 + }, + { + "epoch": 0.4819978046103183, + "grad_norm": 3.3449676036834717, + "learning_rate": 4.0154568825311224e-05, + "loss": 0.3801, + "step": 8782 + }, + { + "epoch": 0.48210757409440175, + "grad_norm": 1.6360225677490234, + "learning_rate": 4.015040417898057e-05, + "loss": 0.2672, + "step": 8784 + }, + { + "epoch": 0.48221734357848517, + "grad_norm": 1.8550375699996948, + "learning_rate": 4.014623886807882e-05, + "loss": 0.3228, + "step": 8786 + }, + { + "epoch": 0.4823271130625686, + "grad_norm": 1.4569674730300903, + "learning_rate": 4.014207289278869e-05, + "loss": 0.326, + "step": 8788 + }, + { + "epoch": 0.482436882546652, + "grad_norm": 1.885279655456543, + "learning_rate": 4.013790625329292e-05, + "loss": 0.3915, + "step": 8790 + }, + { + "epoch": 0.48254665203073543, + "grad_norm": 1.781617283821106, + "learning_rate": 4.0133738949774285e-05, + "loss": 0.3025, + "step": 8792 + }, + { + "epoch": 0.48265642151481886, + "grad_norm": 2.2281839847564697, + "learning_rate": 4.012957098241558e-05, + "loss": 0.3484, + "step": 8794 + }, + { + "epoch": 0.4827661909989023, + "grad_norm": 1.321531057357788, + "learning_rate": 4.012540235139962e-05, + "loss": 0.2924, + "step": 8796 + }, + { + "epoch": 0.48287596048298576, + "grad_norm": 1.9724355936050415, + "learning_rate": 4.0121233056909286e-05, + "loss": 0.4099, + "step": 8798 + }, + { + "epoch": 0.4829857299670692, + "grad_norm": 1.4479936361312866, + "learning_rate": 4.011706309912745e-05, + "loss": 0.2295, + "step": 8800 + }, + { + "epoch": 0.4830954994511526, + "grad_norm": 2.117464542388916, + "learning_rate": 4.0112892478237016e-05, + "loss": 0.4635, + "step": 8802 + }, + { + "epoch": 0.483205268935236, + "grad_norm": 1.6428061723709106, + "learning_rate": 4.010872119442095e-05, + "loss": 0.2944, + "step": 8804 + }, + { + "epoch": 0.48331503841931944, + "grad_norm": 0.9920347929000854, + "learning_rate": 4.010454924786222e-05, + "loss": 0.3203, + "step": 8806 + }, + { + "epoch": 0.48342480790340286, + "grad_norm": 1.9047728776931763, + "learning_rate": 4.0100376638743806e-05, + "loss": 0.3697, + "step": 8808 + }, + { + "epoch": 0.4835345773874863, + "grad_norm": 2.4276199340820312, + "learning_rate": 4.0096203367248765e-05, + "loss": 0.3905, + "step": 8810 + }, + { + "epoch": 0.4836443468715697, + "grad_norm": 1.8146473169326782, + "learning_rate": 4.0092029433560144e-05, + "loss": 0.3138, + "step": 8812 + }, + { + "epoch": 0.48375411635565313, + "grad_norm": 1.1564838886260986, + "learning_rate": 4.0087854837861044e-05, + "loss": 0.3202, + "step": 8814 + }, + { + "epoch": 0.48386388583973655, + "grad_norm": 2.4166035652160645, + "learning_rate": 4.008367958033457e-05, + "loss": 0.4272, + "step": 8816 + }, + { + "epoch": 0.48397365532382, + "grad_norm": 1.6866629123687744, + "learning_rate": 4.0079503661163876e-05, + "loss": 0.3758, + "step": 8818 + }, + { + "epoch": 0.4840834248079034, + "grad_norm": 1.4119205474853516, + "learning_rate": 4.007532708053213e-05, + "loss": 0.3369, + "step": 8820 + }, + { + "epoch": 0.4841931942919868, + "grad_norm": 2.0401699542999268, + "learning_rate": 4.007114983862256e-05, + "loss": 0.2896, + "step": 8822 + }, + { + "epoch": 0.48430296377607024, + "grad_norm": 1.9813860654830933, + "learning_rate": 4.006697193561837e-05, + "loss": 0.3232, + "step": 8824 + }, + { + "epoch": 0.48441273326015366, + "grad_norm": 2.4315907955169678, + "learning_rate": 4.006279337170283e-05, + "loss": 0.2341, + "step": 8826 + }, + { + "epoch": 0.4845225027442371, + "grad_norm": 1.683624029159546, + "learning_rate": 4.005861414705926e-05, + "loss": 0.206, + "step": 8828 + }, + { + "epoch": 0.4846322722283205, + "grad_norm": 1.274879813194275, + "learning_rate": 4.0054434261870956e-05, + "loss": 0.2168, + "step": 8830 + }, + { + "epoch": 0.4847420417124039, + "grad_norm": 1.1499210596084595, + "learning_rate": 4.005025371632127e-05, + "loss": 0.2218, + "step": 8832 + }, + { + "epoch": 0.4848518111964874, + "grad_norm": 1.815457820892334, + "learning_rate": 4.004607251059359e-05, + "loss": 0.4559, + "step": 8834 + }, + { + "epoch": 0.4849615806805708, + "grad_norm": 3.6823201179504395, + "learning_rate": 4.004189064487131e-05, + "loss": 0.2717, + "step": 8836 + }, + { + "epoch": 0.48507135016465425, + "grad_norm": 1.6230652332305908, + "learning_rate": 4.003770811933788e-05, + "loss": 0.3341, + "step": 8838 + }, + { + "epoch": 0.48518111964873767, + "grad_norm": 1.6046652793884277, + "learning_rate": 4.003352493417676e-05, + "loss": 0.2001, + "step": 8840 + }, + { + "epoch": 0.4852908891328211, + "grad_norm": 1.370919942855835, + "learning_rate": 4.002934108957146e-05, + "loss": 0.266, + "step": 8842 + }, + { + "epoch": 0.4854006586169045, + "grad_norm": 2.5116124153137207, + "learning_rate": 4.002515658570548e-05, + "loss": 0.3375, + "step": 8844 + }, + { + "epoch": 0.48551042810098793, + "grad_norm": 1.630244255065918, + "learning_rate": 4.0020971422762386e-05, + "loss": 0.3569, + "step": 8846 + }, + { + "epoch": 0.48562019758507136, + "grad_norm": 1.6948027610778809, + "learning_rate": 4.001678560092577e-05, + "loss": 0.3426, + "step": 8848 + }, + { + "epoch": 0.4857299670691548, + "grad_norm": 1.3448436260223389, + "learning_rate": 4.001259912037921e-05, + "loss": 0.4067, + "step": 8850 + }, + { + "epoch": 0.4858397365532382, + "grad_norm": 1.688605546951294, + "learning_rate": 4.0008411981306383e-05, + "loss": 0.3479, + "step": 8852 + }, + { + "epoch": 0.4859495060373216, + "grad_norm": 2.192854642868042, + "learning_rate": 4.000422418389094e-05, + "loss": 0.3937, + "step": 8854 + }, + { + "epoch": 0.48605927552140504, + "grad_norm": 1.4297267198562622, + "learning_rate": 4.000003572831656e-05, + "loss": 0.4063, + "step": 8856 + }, + { + "epoch": 0.48616904500548846, + "grad_norm": 1.3423888683319092, + "learning_rate": 3.9995846614767006e-05, + "loss": 0.2729, + "step": 8858 + }, + { + "epoch": 0.4862788144895719, + "grad_norm": 2.2756447792053223, + "learning_rate": 3.9991656843426015e-05, + "loss": 0.2291, + "step": 8860 + }, + { + "epoch": 0.4863885839736553, + "grad_norm": 1.3635532855987549, + "learning_rate": 3.998746641447737e-05, + "loss": 0.3774, + "step": 8862 + }, + { + "epoch": 0.48649835345773873, + "grad_norm": 1.815836787223816, + "learning_rate": 3.9983275328104886e-05, + "loss": 0.2566, + "step": 8864 + }, + { + "epoch": 0.48660812294182215, + "grad_norm": 0.7939592599868774, + "learning_rate": 3.99790835844924e-05, + "loss": 0.2637, + "step": 8866 + }, + { + "epoch": 0.4867178924259056, + "grad_norm": 1.3544139862060547, + "learning_rate": 3.9974891183823786e-05, + "loss": 0.4222, + "step": 8868 + }, + { + "epoch": 0.486827661909989, + "grad_norm": 2.2878994941711426, + "learning_rate": 3.9970698126282944e-05, + "loss": 0.2898, + "step": 8870 + }, + { + "epoch": 0.4869374313940725, + "grad_norm": 2.7084484100341797, + "learning_rate": 3.996650441205381e-05, + "loss": 0.423, + "step": 8872 + }, + { + "epoch": 0.4870472008781559, + "grad_norm": 1.4077540636062622, + "learning_rate": 3.9962310041320316e-05, + "loss": 0.3022, + "step": 8874 + }, + { + "epoch": 0.4871569703622393, + "grad_norm": 1.509132981300354, + "learning_rate": 3.995811501426648e-05, + "loss": 0.2145, + "step": 8876 + }, + { + "epoch": 0.48726673984632274, + "grad_norm": 1.7894909381866455, + "learning_rate": 3.9953919331076294e-05, + "loss": 0.25, + "step": 8878 + }, + { + "epoch": 0.48737650933040616, + "grad_norm": 1.6637341976165771, + "learning_rate": 3.994972299193381e-05, + "loss": 0.2486, + "step": 8880 + }, + { + "epoch": 0.4874862788144896, + "grad_norm": 2.4301412105560303, + "learning_rate": 3.994552599702309e-05, + "loss": 0.374, + "step": 8882 + }, + { + "epoch": 0.487596048298573, + "grad_norm": 1.7023228406906128, + "learning_rate": 3.994132834652825e-05, + "loss": 0.3342, + "step": 8884 + }, + { + "epoch": 0.4877058177826564, + "grad_norm": 1.3097496032714844, + "learning_rate": 3.9937130040633414e-05, + "loss": 0.2649, + "step": 8886 + }, + { + "epoch": 0.48781558726673985, + "grad_norm": 2.1472132205963135, + "learning_rate": 3.993293107952274e-05, + "loss": 0.3379, + "step": 8888 + }, + { + "epoch": 0.48792535675082327, + "grad_norm": 1.9425266981124878, + "learning_rate": 3.9928731463380406e-05, + "loss": 0.4276, + "step": 8890 + }, + { + "epoch": 0.4880351262349067, + "grad_norm": 2.944364070892334, + "learning_rate": 3.9924531192390644e-05, + "loss": 0.3018, + "step": 8892 + }, + { + "epoch": 0.4881448957189901, + "grad_norm": 3.026873826980591, + "learning_rate": 3.992033026673768e-05, + "loss": 0.3072, + "step": 8894 + }, + { + "epoch": 0.48825466520307353, + "grad_norm": 1.3403549194335938, + "learning_rate": 3.9916128686605814e-05, + "loss": 0.2473, + "step": 8896 + }, + { + "epoch": 0.48836443468715696, + "grad_norm": 2.253572702407837, + "learning_rate": 3.9911926452179314e-05, + "loss": 0.2899, + "step": 8898 + }, + { + "epoch": 0.4884742041712404, + "grad_norm": 2.144195556640625, + "learning_rate": 3.990772356364253e-05, + "loss": 0.3451, + "step": 8900 + }, + { + "epoch": 0.4885839736553238, + "grad_norm": 1.3330267667770386, + "learning_rate": 3.990352002117983e-05, + "loss": 0.2671, + "step": 8902 + }, + { + "epoch": 0.4886937431394072, + "grad_norm": 1.5990145206451416, + "learning_rate": 3.989931582497558e-05, + "loss": 0.2546, + "step": 8904 + }, + { + "epoch": 0.48880351262349064, + "grad_norm": 1.4141170978546143, + "learning_rate": 3.989511097521421e-05, + "loss": 0.2465, + "step": 8906 + }, + { + "epoch": 0.4889132821075741, + "grad_norm": 1.9576119184494019, + "learning_rate": 3.989090547208017e-05, + "loss": 0.3454, + "step": 8908 + }, + { + "epoch": 0.48902305159165754, + "grad_norm": 1.1365801095962524, + "learning_rate": 3.988669931575792e-05, + "loss": 0.3198, + "step": 8910 + }, + { + "epoch": 0.48913282107574096, + "grad_norm": 1.6862620115280151, + "learning_rate": 3.9882492506431974e-05, + "loss": 0.3001, + "step": 8912 + }, + { + "epoch": 0.4892425905598244, + "grad_norm": 1.723815679550171, + "learning_rate": 3.987828504428685e-05, + "loss": 0.4384, + "step": 8914 + }, + { + "epoch": 0.4893523600439078, + "grad_norm": 2.6691977977752686, + "learning_rate": 3.987407692950713e-05, + "loss": 0.3725, + "step": 8916 + }, + { + "epoch": 0.48946212952799123, + "grad_norm": 1.1726206541061401, + "learning_rate": 3.986986816227738e-05, + "loss": 0.2904, + "step": 8918 + }, + { + "epoch": 0.48957189901207465, + "grad_norm": 1.4826600551605225, + "learning_rate": 3.986565874278223e-05, + "loss": 0.3635, + "step": 8920 + }, + { + "epoch": 0.4896816684961581, + "grad_norm": 3.401667356491089, + "learning_rate": 3.9861448671206325e-05, + "loss": 0.3196, + "step": 8922 + }, + { + "epoch": 0.4897914379802415, + "grad_norm": 1.8330165147781372, + "learning_rate": 3.985723794773434e-05, + "loss": 0.2546, + "step": 8924 + }, + { + "epoch": 0.4899012074643249, + "grad_norm": 1.343770980834961, + "learning_rate": 3.985302657255097e-05, + "loss": 0.2165, + "step": 8926 + }, + { + "epoch": 0.49001097694840834, + "grad_norm": 1.5090328454971313, + "learning_rate": 3.984881454584095e-05, + "loss": 0.264, + "step": 8928 + }, + { + "epoch": 0.49012074643249176, + "grad_norm": 2.0968899726867676, + "learning_rate": 3.984460186778904e-05, + "loss": 0.294, + "step": 8930 + }, + { + "epoch": 0.4902305159165752, + "grad_norm": 1.8710025548934937, + "learning_rate": 3.984038853858003e-05, + "loss": 0.3864, + "step": 8932 + }, + { + "epoch": 0.4903402854006586, + "grad_norm": 2.460542917251587, + "learning_rate": 3.9836174558398744e-05, + "loss": 0.2719, + "step": 8934 + }, + { + "epoch": 0.490450054884742, + "grad_norm": 2.5591256618499756, + "learning_rate": 3.9831959927430017e-05, + "loss": 0.283, + "step": 8936 + }, + { + "epoch": 0.49055982436882545, + "grad_norm": 1.4945439100265503, + "learning_rate": 3.982774464585873e-05, + "loss": 0.2929, + "step": 8938 + }, + { + "epoch": 0.49066959385290887, + "grad_norm": 1.5153741836547852, + "learning_rate": 3.982352871386979e-05, + "loss": 0.3503, + "step": 8940 + }, + { + "epoch": 0.4907793633369923, + "grad_norm": 1.1660202741622925, + "learning_rate": 3.981931213164812e-05, + "loss": 0.3433, + "step": 8942 + }, + { + "epoch": 0.4908891328210757, + "grad_norm": 1.0889536142349243, + "learning_rate": 3.981509489937868e-05, + "loss": 0.1912, + "step": 8944 + }, + { + "epoch": 0.4909989023051592, + "grad_norm": 2.175877809524536, + "learning_rate": 3.981087701724645e-05, + "loss": 0.3311, + "step": 8946 + }, + { + "epoch": 0.4911086717892426, + "grad_norm": 1.8597362041473389, + "learning_rate": 3.980665848543647e-05, + "loss": 0.3938, + "step": 8948 + }, + { + "epoch": 0.49121844127332603, + "grad_norm": 1.302565097808838, + "learning_rate": 3.980243930413378e-05, + "loss": 0.3297, + "step": 8950 + }, + { + "epoch": 0.49132821075740946, + "grad_norm": 1.2603464126586914, + "learning_rate": 3.9798219473523435e-05, + "loss": 0.357, + "step": 8952 + }, + { + "epoch": 0.4914379802414929, + "grad_norm": 1.2566109895706177, + "learning_rate": 3.979399899379055e-05, + "loss": 0.3074, + "step": 8954 + }, + { + "epoch": 0.4915477497255763, + "grad_norm": 2.1465933322906494, + "learning_rate": 3.978977786512026e-05, + "loss": 0.1924, + "step": 8956 + }, + { + "epoch": 0.4916575192096597, + "grad_norm": 1.3968206644058228, + "learning_rate": 3.9785556087697726e-05, + "loss": 0.3747, + "step": 8958 + }, + { + "epoch": 0.49176728869374314, + "grad_norm": 1.5458095073699951, + "learning_rate": 3.978133366170812e-05, + "loss": 0.3116, + "step": 8960 + }, + { + "epoch": 0.49187705817782656, + "grad_norm": 2.411190986633301, + "learning_rate": 3.9777110587336674e-05, + "loss": 0.2598, + "step": 8962 + }, + { + "epoch": 0.49198682766191, + "grad_norm": 1.990957498550415, + "learning_rate": 3.9772886864768626e-05, + "loss": 0.2545, + "step": 8964 + }, + { + "epoch": 0.4920965971459934, + "grad_norm": 1.4192179441452026, + "learning_rate": 3.976866249418925e-05, + "loss": 0.3276, + "step": 8966 + }, + { + "epoch": 0.49220636663007683, + "grad_norm": 1.52523934841156, + "learning_rate": 3.976443747578385e-05, + "loss": 0.2441, + "step": 8968 + }, + { + "epoch": 0.49231613611416025, + "grad_norm": 1.4756760597229004, + "learning_rate": 3.976021180973775e-05, + "loss": 0.2807, + "step": 8970 + }, + { + "epoch": 0.4924259055982437, + "grad_norm": 1.668246865272522, + "learning_rate": 3.975598549623632e-05, + "loss": 0.3363, + "step": 8972 + }, + { + "epoch": 0.4925356750823271, + "grad_norm": 1.4362684488296509, + "learning_rate": 3.9751758535464935e-05, + "loss": 0.366, + "step": 8974 + }, + { + "epoch": 0.4926454445664105, + "grad_norm": 1.2815130949020386, + "learning_rate": 3.974753092760901e-05, + "loss": 0.1896, + "step": 8976 + }, + { + "epoch": 0.49275521405049394, + "grad_norm": 1.5012887716293335, + "learning_rate": 3.9743302672854e-05, + "loss": 0.3313, + "step": 8978 + }, + { + "epoch": 0.49286498353457736, + "grad_norm": 1.5740617513656616, + "learning_rate": 3.973907377138537e-05, + "loss": 0.2322, + "step": 8980 + }, + { + "epoch": 0.49297475301866084, + "grad_norm": 3.2669107913970947, + "learning_rate": 3.973484422338862e-05, + "loss": 0.3815, + "step": 8982 + }, + { + "epoch": 0.49308452250274426, + "grad_norm": 2.2934257984161377, + "learning_rate": 3.973061402904928e-05, + "loss": 0.4492, + "step": 8984 + }, + { + "epoch": 0.4931942919868277, + "grad_norm": 1.9124375581741333, + "learning_rate": 3.972638318855291e-05, + "loss": 0.338, + "step": 8986 + }, + { + "epoch": 0.4933040614709111, + "grad_norm": 3.289844036102295, + "learning_rate": 3.972215170208508e-05, + "loss": 0.2312, + "step": 8988 + }, + { + "epoch": 0.4934138309549945, + "grad_norm": 1.5759061574935913, + "learning_rate": 3.971791956983143e-05, + "loss": 0.2969, + "step": 8990 + }, + { + "epoch": 0.49352360043907795, + "grad_norm": 2.008754253387451, + "learning_rate": 3.9713686791977575e-05, + "loss": 0.3635, + "step": 8992 + }, + { + "epoch": 0.49363336992316137, + "grad_norm": 2.551917314529419, + "learning_rate": 3.97094533687092e-05, + "loss": 0.2957, + "step": 8994 + }, + { + "epoch": 0.4937431394072448, + "grad_norm": 2.476706027984619, + "learning_rate": 3.9705219300212016e-05, + "loss": 0.3935, + "step": 8996 + }, + { + "epoch": 0.4938529088913282, + "grad_norm": 1.2736437320709229, + "learning_rate": 3.970098458667172e-05, + "loss": 0.28, + "step": 8998 + }, + { + "epoch": 0.49396267837541163, + "grad_norm": 1.5236448049545288, + "learning_rate": 3.9696749228274095e-05, + "loss": 0.3951, + "step": 9000 + }, + { + "epoch": 0.49407244785949506, + "grad_norm": 1.5954474210739136, + "learning_rate": 3.969251322520491e-05, + "loss": 0.2784, + "step": 9002 + }, + { + "epoch": 0.4941822173435785, + "grad_norm": 2.0640676021575928, + "learning_rate": 3.968827657764997e-05, + "loss": 0.3364, + "step": 9004 + }, + { + "epoch": 0.4942919868276619, + "grad_norm": 1.752331256866455, + "learning_rate": 3.968403928579513e-05, + "loss": 0.3062, + "step": 9006 + }, + { + "epoch": 0.4944017563117453, + "grad_norm": 1.500508189201355, + "learning_rate": 3.967980134982626e-05, + "loss": 0.3113, + "step": 9008 + }, + { + "epoch": 0.49451152579582874, + "grad_norm": 2.111781120300293, + "learning_rate": 3.9675562769929245e-05, + "loss": 0.4271, + "step": 9010 + }, + { + "epoch": 0.49462129527991217, + "grad_norm": 2.3780670166015625, + "learning_rate": 3.967132354629002e-05, + "loss": 0.2116, + "step": 9012 + }, + { + "epoch": 0.4947310647639956, + "grad_norm": 1.4984153509140015, + "learning_rate": 3.966708367909453e-05, + "loss": 0.2768, + "step": 9014 + }, + { + "epoch": 0.494840834248079, + "grad_norm": 3.1834304332733154, + "learning_rate": 3.966284316852876e-05, + "loss": 0.3691, + "step": 9016 + }, + { + "epoch": 0.4949506037321625, + "grad_norm": 1.7665456533432007, + "learning_rate": 3.965860201477872e-05, + "loss": 0.3273, + "step": 9018 + }, + { + "epoch": 0.4950603732162459, + "grad_norm": 2.2442715167999268, + "learning_rate": 3.965436021803044e-05, + "loss": 0.2354, + "step": 9020 + }, + { + "epoch": 0.49517014270032933, + "grad_norm": 2.744109869003296, + "learning_rate": 3.9650117778469996e-05, + "loss": 0.4547, + "step": 9022 + }, + { + "epoch": 0.49527991218441275, + "grad_norm": 2.7365989685058594, + "learning_rate": 3.964587469628348e-05, + "loss": 0.2597, + "step": 9024 + }, + { + "epoch": 0.4953896816684962, + "grad_norm": 1.8460122346878052, + "learning_rate": 3.964163097165702e-05, + "loss": 0.3454, + "step": 9026 + }, + { + "epoch": 0.4954994511525796, + "grad_norm": 1.066847801208496, + "learning_rate": 3.9637386604776757e-05, + "loss": 0.2018, + "step": 9028 + }, + { + "epoch": 0.495609220636663, + "grad_norm": 2.6569416522979736, + "learning_rate": 3.963314159582887e-05, + "loss": 0.3965, + "step": 9030 + }, + { + "epoch": 0.49571899012074644, + "grad_norm": 1.6528611183166504, + "learning_rate": 3.962889594499957e-05, + "loss": 0.2671, + "step": 9032 + }, + { + "epoch": 0.49582875960482986, + "grad_norm": 1.6137902736663818, + "learning_rate": 3.962464965247509e-05, + "loss": 0.408, + "step": 9034 + }, + { + "epoch": 0.4959385290889133, + "grad_norm": 2.8055763244628906, + "learning_rate": 3.962040271844169e-05, + "loss": 0.3275, + "step": 9036 + }, + { + "epoch": 0.4960482985729967, + "grad_norm": 2.0987329483032227, + "learning_rate": 3.9616155143085676e-05, + "loss": 0.4238, + "step": 9038 + }, + { + "epoch": 0.4961580680570801, + "grad_norm": 1.7685799598693848, + "learning_rate": 3.961190692659335e-05, + "loss": 0.3504, + "step": 9040 + }, + { + "epoch": 0.49626783754116355, + "grad_norm": 1.4062660932540894, + "learning_rate": 3.960765806915107e-05, + "loss": 0.313, + "step": 9042 + }, + { + "epoch": 0.49637760702524697, + "grad_norm": 1.2293471097946167, + "learning_rate": 3.9603408570945204e-05, + "loss": 0.2983, + "step": 9044 + }, + { + "epoch": 0.4964873765093304, + "grad_norm": 2.0662965774536133, + "learning_rate": 3.959915843216216e-05, + "loss": 0.3419, + "step": 9046 + }, + { + "epoch": 0.4965971459934138, + "grad_norm": 1.5634279251098633, + "learning_rate": 3.959490765298838e-05, + "loss": 0.3753, + "step": 9048 + }, + { + "epoch": 0.49670691547749724, + "grad_norm": 3.502274990081787, + "learning_rate": 3.9590656233610304e-05, + "loss": 0.3354, + "step": 9050 + }, + { + "epoch": 0.49681668496158066, + "grad_norm": 1.5983493328094482, + "learning_rate": 3.958640417421444e-05, + "loss": 0.2402, + "step": 9052 + }, + { + "epoch": 0.4969264544456641, + "grad_norm": 1.270450234413147, + "learning_rate": 3.9582151474987286e-05, + "loss": 0.3354, + "step": 9054 + }, + { + "epoch": 0.49703622392974756, + "grad_norm": 1.1145856380462646, + "learning_rate": 3.9577898136115397e-05, + "loss": 0.3034, + "step": 9056 + }, + { + "epoch": 0.497145993413831, + "grad_norm": 2.519165515899658, + "learning_rate": 3.957364415778535e-05, + "loss": 0.3037, + "step": 9058 + }, + { + "epoch": 0.4972557628979144, + "grad_norm": 1.8645052909851074, + "learning_rate": 3.956938954018373e-05, + "loss": 0.2673, + "step": 9060 + }, + { + "epoch": 0.4973655323819978, + "grad_norm": 1.3837823867797852, + "learning_rate": 3.9565134283497175e-05, + "loss": 0.418, + "step": 9062 + }, + { + "epoch": 0.49747530186608124, + "grad_norm": 1.8457646369934082, + "learning_rate": 3.956087838791235e-05, + "loss": 0.4235, + "step": 9064 + }, + { + "epoch": 0.49758507135016466, + "grad_norm": 2.076090097427368, + "learning_rate": 3.9556621853615915e-05, + "loss": 0.3491, + "step": 9066 + }, + { + "epoch": 0.4976948408342481, + "grad_norm": 2.8077497482299805, + "learning_rate": 3.9552364680794606e-05, + "loss": 0.2274, + "step": 9068 + }, + { + "epoch": 0.4978046103183315, + "grad_norm": 1.332563042640686, + "learning_rate": 3.9548106869635145e-05, + "loss": 0.3289, + "step": 9070 + }, + { + "epoch": 0.49791437980241493, + "grad_norm": 2.299010753631592, + "learning_rate": 3.954384842032433e-05, + "loss": 0.3693, + "step": 9072 + }, + { + "epoch": 0.49802414928649835, + "grad_norm": 2.0859735012054443, + "learning_rate": 3.953958933304892e-05, + "loss": 0.2626, + "step": 9074 + }, + { + "epoch": 0.4981339187705818, + "grad_norm": 1.5658577680587769, + "learning_rate": 3.953532960799577e-05, + "loss": 0.3402, + "step": 9076 + }, + { + "epoch": 0.4982436882546652, + "grad_norm": 2.085752487182617, + "learning_rate": 3.953106924535171e-05, + "loss": 0.3106, + "step": 9078 + }, + { + "epoch": 0.4983534577387486, + "grad_norm": 1.973351001739502, + "learning_rate": 3.952680824530364e-05, + "loss": 0.3275, + "step": 9080 + }, + { + "epoch": 0.49846322722283204, + "grad_norm": 1.8107407093048096, + "learning_rate": 3.9522546608038446e-05, + "loss": 0.2075, + "step": 9082 + }, + { + "epoch": 0.49857299670691546, + "grad_norm": 2.092576026916504, + "learning_rate": 3.9518284333743084e-05, + "loss": 0.3416, + "step": 9084 + }, + { + "epoch": 0.4986827661909989, + "grad_norm": 1.1496124267578125, + "learning_rate": 3.9514021422604516e-05, + "loss": 0.382, + "step": 9086 + }, + { + "epoch": 0.4987925356750823, + "grad_norm": 1.1865239143371582, + "learning_rate": 3.9509757874809714e-05, + "loss": 0.2489, + "step": 9088 + }, + { + "epoch": 0.4989023051591657, + "grad_norm": 2.585958957672119, + "learning_rate": 3.950549369054573e-05, + "loss": 0.3069, + "step": 9090 + }, + { + "epoch": 0.4990120746432492, + "grad_norm": 1.7479274272918701, + "learning_rate": 3.950122886999959e-05, + "loss": 0.2333, + "step": 9092 + }, + { + "epoch": 0.4991218441273326, + "grad_norm": 2.0563361644744873, + "learning_rate": 3.949696341335838e-05, + "loss": 0.3839, + "step": 9094 + }, + { + "epoch": 0.49923161361141605, + "grad_norm": 1.3892794847488403, + "learning_rate": 3.949269732080919e-05, + "loss": 0.2424, + "step": 9096 + }, + { + "epoch": 0.49934138309549947, + "grad_norm": 2.397681951522827, + "learning_rate": 3.948843059253916e-05, + "loss": 0.2456, + "step": 9098 + }, + { + "epoch": 0.4994511525795829, + "grad_norm": 1.6688917875289917, + "learning_rate": 3.948416322873546e-05, + "loss": 0.2818, + "step": 9100 + }, + { + "epoch": 0.4995609220636663, + "grad_norm": 2.0586278438568115, + "learning_rate": 3.947989522958526e-05, + "loss": 0.1734, + "step": 9102 + }, + { + "epoch": 0.49967069154774973, + "grad_norm": 1.5538146495819092, + "learning_rate": 3.947562659527579e-05, + "loss": 0.2869, + "step": 9104 + }, + { + "epoch": 0.49978046103183316, + "grad_norm": 1.5382153987884521, + "learning_rate": 3.947135732599428e-05, + "loss": 0.2752, + "step": 9106 + }, + { + "epoch": 0.4998902305159166, + "grad_norm": 2.06135630607605, + "learning_rate": 3.946708742192802e-05, + "loss": 0.3313, + "step": 9108 + }, + { + "epoch": 0.5, + "grad_norm": 2.3237695693969727, + "learning_rate": 3.9462816883264295e-05, + "loss": 0.3159, + "step": 9110 + }, + { + "epoch": 0.5001097694840835, + "grad_norm": 1.2069194316864014, + "learning_rate": 3.945854571019042e-05, + "loss": 0.3274, + "step": 9112 + }, + { + "epoch": 0.5002195389681668, + "grad_norm": 2.002253532409668, + "learning_rate": 3.945427390289378e-05, + "loss": 0.4175, + "step": 9114 + }, + { + "epoch": 0.5003293084522503, + "grad_norm": 2.0508222579956055, + "learning_rate": 3.945000146156173e-05, + "loss": 0.3211, + "step": 9116 + }, + { + "epoch": 0.5004390779363337, + "grad_norm": 1.7116293907165527, + "learning_rate": 3.94457283863817e-05, + "loss": 0.3099, + "step": 9118 + }, + { + "epoch": 0.5005488474204172, + "grad_norm": 1.7231932878494263, + "learning_rate": 3.944145467754111e-05, + "loss": 0.2729, + "step": 9120 + }, + { + "epoch": 0.5006586169045005, + "grad_norm": 2.9138824939727783, + "learning_rate": 3.943718033522744e-05, + "loss": 0.44, + "step": 9122 + }, + { + "epoch": 0.500768386388584, + "grad_norm": 1.8123257160186768, + "learning_rate": 3.943290535962818e-05, + "loss": 0.3682, + "step": 9124 + }, + { + "epoch": 0.5008781558726674, + "grad_norm": 1.672720193862915, + "learning_rate": 3.942862975093085e-05, + "loss": 0.3941, + "step": 9126 + }, + { + "epoch": 0.5009879253567509, + "grad_norm": 3.0158519744873047, + "learning_rate": 3.9424353509322984e-05, + "loss": 0.2652, + "step": 9128 + }, + { + "epoch": 0.5010976948408342, + "grad_norm": 1.7462236881256104, + "learning_rate": 3.9420076634992194e-05, + "loss": 0.3332, + "step": 9130 + }, + { + "epoch": 0.5012074643249177, + "grad_norm": 1.4158501625061035, + "learning_rate": 3.941579912812606e-05, + "loss": 0.2181, + "step": 9132 + }, + { + "epoch": 0.5013172338090011, + "grad_norm": 1.523685097694397, + "learning_rate": 3.9411520988912215e-05, + "loss": 0.2665, + "step": 9134 + }, + { + "epoch": 0.5014270032930845, + "grad_norm": 0.9697153568267822, + "learning_rate": 3.940724221753832e-05, + "loss": 0.2918, + "step": 9136 + }, + { + "epoch": 0.5015367727771679, + "grad_norm": 2.823272228240967, + "learning_rate": 3.940296281419208e-05, + "loss": 0.3913, + "step": 9138 + }, + { + "epoch": 0.5016465422612514, + "grad_norm": 1.766518235206604, + "learning_rate": 3.939868277906119e-05, + "loss": 0.3093, + "step": 9140 + }, + { + "epoch": 0.5017563117453347, + "grad_norm": 2.657521963119507, + "learning_rate": 3.9394402112333397e-05, + "loss": 0.4508, + "step": 9142 + }, + { + "epoch": 0.5018660812294182, + "grad_norm": 1.7411141395568848, + "learning_rate": 3.939012081419648e-05, + "loss": 0.2605, + "step": 9144 + }, + { + "epoch": 0.5019758507135016, + "grad_norm": 1.210057258605957, + "learning_rate": 3.9385838884838225e-05, + "loss": 0.3449, + "step": 9146 + }, + { + "epoch": 0.5020856201975851, + "grad_norm": 1.0393311977386475, + "learning_rate": 3.938155632444648e-05, + "loss": 0.2393, + "step": 9148 + }, + { + "epoch": 0.5021953896816685, + "grad_norm": 2.2382848262786865, + "learning_rate": 3.9377273133209076e-05, + "loss": 0.2481, + "step": 9150 + }, + { + "epoch": 0.5023051591657519, + "grad_norm": 2.0688281059265137, + "learning_rate": 3.937298931131391e-05, + "loss": 0.3838, + "step": 9152 + }, + { + "epoch": 0.5024149286498354, + "grad_norm": 3.8967885971069336, + "learning_rate": 3.936870485894888e-05, + "loss": 0.4148, + "step": 9154 + }, + { + "epoch": 0.5025246981339188, + "grad_norm": 2.770108938217163, + "learning_rate": 3.936441977630193e-05, + "loss": 0.3767, + "step": 9156 + }, + { + "epoch": 0.5026344676180022, + "grad_norm": 1.577045202255249, + "learning_rate": 3.936013406356103e-05, + "loss": 0.3086, + "step": 9158 + }, + { + "epoch": 0.5027442371020856, + "grad_norm": 1.5019563436508179, + "learning_rate": 3.935584772091416e-05, + "loss": 0.2596, + "step": 9160 + }, + { + "epoch": 0.5028540065861691, + "grad_norm": 1.684006929397583, + "learning_rate": 3.935156074854935e-05, + "loss": 0.2083, + "step": 9162 + }, + { + "epoch": 0.5029637760702524, + "grad_norm": 1.7911007404327393, + "learning_rate": 3.934727314665464e-05, + "loss": 0.218, + "step": 9164 + }, + { + "epoch": 0.5030735455543359, + "grad_norm": 1.7399333715438843, + "learning_rate": 3.9342984915418114e-05, + "loss": 0.3633, + "step": 9166 + }, + { + "epoch": 0.5031833150384193, + "grad_norm": 1.2983802556991577, + "learning_rate": 3.933869605502787e-05, + "loss": 0.3474, + "step": 9168 + }, + { + "epoch": 0.5032930845225028, + "grad_norm": 1.9569716453552246, + "learning_rate": 3.933440656567203e-05, + "loss": 0.3651, + "step": 9170 + }, + { + "epoch": 0.5034028540065861, + "grad_norm": 2.1600146293640137, + "learning_rate": 3.933011644753877e-05, + "loss": 0.2831, + "step": 9172 + }, + { + "epoch": 0.5035126234906696, + "grad_norm": 1.9271916151046753, + "learning_rate": 3.932582570081627e-05, + "loss": 0.2902, + "step": 9174 + }, + { + "epoch": 0.503622392974753, + "grad_norm": 2.9710958003997803, + "learning_rate": 3.9321534325692726e-05, + "loss": 0.3287, + "step": 9176 + }, + { + "epoch": 0.5037321624588365, + "grad_norm": 1.392683506011963, + "learning_rate": 3.9317242322356405e-05, + "loss": 0.3168, + "step": 9178 + }, + { + "epoch": 0.5038419319429198, + "grad_norm": 1.800252079963684, + "learning_rate": 3.9312949690995555e-05, + "loss": 0.4125, + "step": 9180 + }, + { + "epoch": 0.5039517014270033, + "grad_norm": 1.555355429649353, + "learning_rate": 3.930865643179848e-05, + "loss": 0.3567, + "step": 9182 + }, + { + "epoch": 0.5040614709110868, + "grad_norm": 2.083906650543213, + "learning_rate": 3.9304362544953506e-05, + "loss": 0.4353, + "step": 9184 + }, + { + "epoch": 0.5041712403951701, + "grad_norm": 1.144890546798706, + "learning_rate": 3.930006803064898e-05, + "loss": 0.2998, + "step": 9186 + }, + { + "epoch": 0.5042810098792536, + "grad_norm": 1.787492275238037, + "learning_rate": 3.929577288907328e-05, + "loss": 0.256, + "step": 9188 + }, + { + "epoch": 0.504390779363337, + "grad_norm": 1.537794589996338, + "learning_rate": 3.929147712041481e-05, + "loss": 0.3792, + "step": 9190 + }, + { + "epoch": 0.5045005488474205, + "grad_norm": 3.6948206424713135, + "learning_rate": 3.928718072486201e-05, + "loss": 0.2363, + "step": 9192 + }, + { + "epoch": 0.5046103183315038, + "grad_norm": 3.5552122592926025, + "learning_rate": 3.9282883702603336e-05, + "loss": 0.3182, + "step": 9194 + }, + { + "epoch": 0.5047200878155873, + "grad_norm": 1.3749727010726929, + "learning_rate": 3.927858605382728e-05, + "loss": 0.2993, + "step": 9196 + }, + { + "epoch": 0.5048298572996707, + "grad_norm": 1.8382585048675537, + "learning_rate": 3.9274287778722365e-05, + "loss": 0.2978, + "step": 9198 + }, + { + "epoch": 0.5049396267837541, + "grad_norm": 1.540824294090271, + "learning_rate": 3.9269988877477116e-05, + "loss": 0.2403, + "step": 9200 + }, + { + "epoch": 0.5050493962678375, + "grad_norm": 1.8642491102218628, + "learning_rate": 3.926568935028012e-05, + "loss": 0.2413, + "step": 9202 + }, + { + "epoch": 0.505159165751921, + "grad_norm": 1.3207536935806274, + "learning_rate": 3.926138919731995e-05, + "loss": 0.2363, + "step": 9204 + }, + { + "epoch": 0.5052689352360044, + "grad_norm": 2.552870988845825, + "learning_rate": 3.925708841878527e-05, + "loss": 0.3106, + "step": 9206 + }, + { + "epoch": 0.5053787047200878, + "grad_norm": 2.6266486644744873, + "learning_rate": 3.925278701486471e-05, + "loss": 0.4917, + "step": 9208 + }, + { + "epoch": 0.5054884742041712, + "grad_norm": 1.429050326347351, + "learning_rate": 3.924848498574695e-05, + "loss": 0.3343, + "step": 9210 + }, + { + "epoch": 0.5055982436882547, + "grad_norm": 1.4113214015960693, + "learning_rate": 3.924418233162071e-05, + "loss": 0.2764, + "step": 9212 + }, + { + "epoch": 0.505708013172338, + "grad_norm": 0.9579571485519409, + "learning_rate": 3.9239879052674715e-05, + "loss": 0.2272, + "step": 9214 + }, + { + "epoch": 0.5058177826564215, + "grad_norm": 1.5395785570144653, + "learning_rate": 3.9235575149097734e-05, + "loss": 0.2932, + "step": 9216 + }, + { + "epoch": 0.5059275521405049, + "grad_norm": 1.2899045944213867, + "learning_rate": 3.923127062107855e-05, + "loss": 0.3383, + "step": 9218 + }, + { + "epoch": 0.5060373216245884, + "grad_norm": 1.5656850337982178, + "learning_rate": 3.922696546880599e-05, + "loss": 0.3589, + "step": 9220 + }, + { + "epoch": 0.5061470911086718, + "grad_norm": 2.4308176040649414, + "learning_rate": 3.922265969246889e-05, + "loss": 0.4345, + "step": 9222 + }, + { + "epoch": 0.5062568605927552, + "grad_norm": 1.2825429439544678, + "learning_rate": 3.921835329225613e-05, + "loss": 0.3782, + "step": 9224 + }, + { + "epoch": 0.5063666300768387, + "grad_norm": 2.113664388656616, + "learning_rate": 3.92140462683566e-05, + "loss": 0.2752, + "step": 9226 + }, + { + "epoch": 0.506476399560922, + "grad_norm": 1.6880452632904053, + "learning_rate": 3.920973862095924e-05, + "loss": 0.2039, + "step": 9228 + }, + { + "epoch": 0.5065861690450055, + "grad_norm": 2.353731632232666, + "learning_rate": 3.9205430350253e-05, + "loss": 0.2658, + "step": 9230 + }, + { + "epoch": 0.5066959385290889, + "grad_norm": 1.2638362646102905, + "learning_rate": 3.920112145642686e-05, + "loss": 0.2749, + "step": 9232 + }, + { + "epoch": 0.5068057080131724, + "grad_norm": 2.4901251792907715, + "learning_rate": 3.9196811939669824e-05, + "loss": 0.3457, + "step": 9234 + }, + { + "epoch": 0.5069154774972557, + "grad_norm": 1.3946045637130737, + "learning_rate": 3.919250180017094e-05, + "loss": 0.247, + "step": 9236 + }, + { + "epoch": 0.5070252469813392, + "grad_norm": 1.9349784851074219, + "learning_rate": 3.918819103811927e-05, + "loss": 0.356, + "step": 9238 + }, + { + "epoch": 0.5071350164654226, + "grad_norm": 1.7667731046676636, + "learning_rate": 3.918387965370389e-05, + "loss": 0.3838, + "step": 9240 + }, + { + "epoch": 0.5072447859495061, + "grad_norm": 2.6078238487243652, + "learning_rate": 3.917956764711394e-05, + "loss": 0.3896, + "step": 9242 + }, + { + "epoch": 0.5073545554335894, + "grad_norm": 1.6675304174423218, + "learning_rate": 3.917525501853855e-05, + "loss": 0.4524, + "step": 9244 + }, + { + "epoch": 0.5074643249176729, + "grad_norm": 1.1668113470077515, + "learning_rate": 3.91709417681669e-05, + "loss": 0.2866, + "step": 9246 + }, + { + "epoch": 0.5075740944017563, + "grad_norm": 2.4589760303497314, + "learning_rate": 3.9166627896188186e-05, + "loss": 0.391, + "step": 9248 + }, + { + "epoch": 0.5076838638858397, + "grad_norm": 2.2585060596466064, + "learning_rate": 3.916231340279164e-05, + "loss": 0.3693, + "step": 9250 + }, + { + "epoch": 0.5077936333699231, + "grad_norm": 1.993480920791626, + "learning_rate": 3.915799828816652e-05, + "loss": 0.4105, + "step": 9252 + }, + { + "epoch": 0.5079034028540066, + "grad_norm": 1.3512362241744995, + "learning_rate": 3.9153682552502104e-05, + "loss": 0.3133, + "step": 9254 + }, + { + "epoch": 0.50801317233809, + "grad_norm": 1.7119044065475464, + "learning_rate": 3.914936619598769e-05, + "loss": 0.3978, + "step": 9256 + }, + { + "epoch": 0.5081229418221734, + "grad_norm": 2.325655937194824, + "learning_rate": 3.914504921881263e-05, + "loss": 0.2826, + "step": 9258 + }, + { + "epoch": 0.5082327113062569, + "grad_norm": 2.6731815338134766, + "learning_rate": 3.9140731621166293e-05, + "loss": 0.2749, + "step": 9260 + }, + { + "epoch": 0.5083424807903403, + "grad_norm": 1.530858039855957, + "learning_rate": 3.913641340323805e-05, + "loss": 0.2419, + "step": 9262 + }, + { + "epoch": 0.5084522502744238, + "grad_norm": 1.822560429573059, + "learning_rate": 3.9132094565217335e-05, + "loss": 0.2494, + "step": 9264 + }, + { + "epoch": 0.5085620197585071, + "grad_norm": 1.6461443901062012, + "learning_rate": 3.912777510729358e-05, + "loss": 0.3065, + "step": 9266 + }, + { + "epoch": 0.5086717892425906, + "grad_norm": 3.941760301589966, + "learning_rate": 3.912345502965627e-05, + "loss": 0.236, + "step": 9268 + }, + { + "epoch": 0.508781558726674, + "grad_norm": 1.8550776243209839, + "learning_rate": 3.911913433249491e-05, + "loss": 0.2988, + "step": 9270 + }, + { + "epoch": 0.5088913282107574, + "grad_norm": 1.7434182167053223, + "learning_rate": 3.9114813015999005e-05, + "loss": 0.2241, + "step": 9272 + }, + { + "epoch": 0.5090010976948408, + "grad_norm": 1.396728754043579, + "learning_rate": 3.911049108035813e-05, + "loss": 0.2374, + "step": 9274 + }, + { + "epoch": 0.5091108671789243, + "grad_norm": 2.655158281326294, + "learning_rate": 3.9106168525761855e-05, + "loss": 0.2481, + "step": 9276 + }, + { + "epoch": 0.5092206366630077, + "grad_norm": 1.2200071811676025, + "learning_rate": 3.9101845352399786e-05, + "loss": 0.2349, + "step": 9278 + }, + { + "epoch": 0.5093304061470911, + "grad_norm": 5.009362697601318, + "learning_rate": 3.909752156046157e-05, + "loss": 0.3601, + "step": 9280 + }, + { + "epoch": 0.5094401756311745, + "grad_norm": 2.7002675533294678, + "learning_rate": 3.909319715013686e-05, + "loss": 0.4482, + "step": 9282 + }, + { + "epoch": 0.509549945115258, + "grad_norm": 1.8398932218551636, + "learning_rate": 3.908887212161535e-05, + "loss": 0.3136, + "step": 9284 + }, + { + "epoch": 0.5096597145993413, + "grad_norm": 1.8170849084854126, + "learning_rate": 3.908454647508676e-05, + "loss": 0.3406, + "step": 9286 + }, + { + "epoch": 0.5097694840834248, + "grad_norm": 1.5674043893814087, + "learning_rate": 3.908022021074083e-05, + "loss": 0.307, + "step": 9288 + }, + { + "epoch": 0.5098792535675082, + "grad_norm": 1.6644489765167236, + "learning_rate": 3.907589332876733e-05, + "loss": 0.2312, + "step": 9290 + }, + { + "epoch": 0.5099890230515917, + "grad_norm": 3.028019428253174, + "learning_rate": 3.907156582935606e-05, + "loss": 0.3454, + "step": 9292 + }, + { + "epoch": 0.510098792535675, + "grad_norm": 1.4496361017227173, + "learning_rate": 3.906723771269684e-05, + "loss": 0.3506, + "step": 9294 + }, + { + "epoch": 0.5102085620197585, + "grad_norm": 1.9574733972549438, + "learning_rate": 3.9062908978979535e-05, + "loss": 0.282, + "step": 9296 + }, + { + "epoch": 0.510318331503842, + "grad_norm": 1.5862140655517578, + "learning_rate": 3.905857962839402e-05, + "loss": 0.4692, + "step": 9298 + }, + { + "epoch": 0.5104281009879253, + "grad_norm": 1.3485631942749023, + "learning_rate": 3.905424966113019e-05, + "loss": 0.2826, + "step": 9300 + }, + { + "epoch": 0.5105378704720088, + "grad_norm": 1.2469545602798462, + "learning_rate": 3.9049919077378e-05, + "loss": 0.364, + "step": 9302 + }, + { + "epoch": 0.5106476399560922, + "grad_norm": 1.3336845636367798, + "learning_rate": 3.904558787732738e-05, + "loss": 0.3191, + "step": 9304 + }, + { + "epoch": 0.5107574094401757, + "grad_norm": 1.8429250717163086, + "learning_rate": 3.904125606116835e-05, + "loss": 0.3027, + "step": 9306 + }, + { + "epoch": 0.510867178924259, + "grad_norm": 2.1182854175567627, + "learning_rate": 3.9036923629090904e-05, + "loss": 0.3025, + "step": 9308 + }, + { + "epoch": 0.5109769484083425, + "grad_norm": 1.3212482929229736, + "learning_rate": 3.90325905812851e-05, + "loss": 0.3043, + "step": 9310 + }, + { + "epoch": 0.5110867178924259, + "grad_norm": 1.5945608615875244, + "learning_rate": 3.902825691794099e-05, + "loss": 0.2737, + "step": 9312 + }, + { + "epoch": 0.5111964873765094, + "grad_norm": 0.73200523853302, + "learning_rate": 3.9023922639248675e-05, + "loss": 0.318, + "step": 9314 + }, + { + "epoch": 0.5113062568605927, + "grad_norm": 1.1192060708999634, + "learning_rate": 3.9019587745398276e-05, + "loss": 0.303, + "step": 9316 + }, + { + "epoch": 0.5114160263446762, + "grad_norm": 1.7532641887664795, + "learning_rate": 3.9015252236579956e-05, + "loss": 0.3374, + "step": 9318 + }, + { + "epoch": 0.5115257958287596, + "grad_norm": 2.0140581130981445, + "learning_rate": 3.9010916112983875e-05, + "loss": 0.4634, + "step": 9320 + }, + { + "epoch": 0.511635565312843, + "grad_norm": 3.0120346546173096, + "learning_rate": 3.900657937480025e-05, + "loss": 0.3232, + "step": 9322 + }, + { + "epoch": 0.5117453347969264, + "grad_norm": 1.1700172424316406, + "learning_rate": 3.90022420222193e-05, + "loss": 0.2569, + "step": 9324 + }, + { + "epoch": 0.5118551042810099, + "grad_norm": 1.342064619064331, + "learning_rate": 3.899790405543129e-05, + "loss": 0.1682, + "step": 9326 + }, + { + "epoch": 0.5119648737650933, + "grad_norm": 1.4612730741500854, + "learning_rate": 3.8993565474626506e-05, + "loss": 0.3256, + "step": 9328 + }, + { + "epoch": 0.5120746432491767, + "grad_norm": 1.3451868295669556, + "learning_rate": 3.898922627999525e-05, + "loss": 0.4079, + "step": 9330 + }, + { + "epoch": 0.5121844127332602, + "grad_norm": 1.3885900974273682, + "learning_rate": 3.898488647172786e-05, + "loss": 0.3024, + "step": 9332 + }, + { + "epoch": 0.5122941822173436, + "grad_norm": 2.0588011741638184, + "learning_rate": 3.8980546050014724e-05, + "loss": 0.2789, + "step": 9334 + }, + { + "epoch": 0.512403951701427, + "grad_norm": 2.1542322635650635, + "learning_rate": 3.897620501504621e-05, + "loss": 0.2566, + "step": 9336 + }, + { + "epoch": 0.5125137211855104, + "grad_norm": 3.02168869972229, + "learning_rate": 3.897186336701274e-05, + "loss": 0.3215, + "step": 9338 + }, + { + "epoch": 0.5126234906695939, + "grad_norm": 2.515211343765259, + "learning_rate": 3.896752110610477e-05, + "loss": 0.3372, + "step": 9340 + }, + { + "epoch": 0.5127332601536773, + "grad_norm": 2.599571466445923, + "learning_rate": 3.8963178232512766e-05, + "loss": 0.3699, + "step": 9342 + }, + { + "epoch": 0.5128430296377607, + "grad_norm": 1.5379271507263184, + "learning_rate": 3.895883474642723e-05, + "loss": 0.3322, + "step": 9344 + }, + { + "epoch": 0.5129527991218441, + "grad_norm": 1.7449613809585571, + "learning_rate": 3.895449064803869e-05, + "loss": 0.2981, + "step": 9346 + }, + { + "epoch": 0.5130625686059276, + "grad_norm": 3.127086877822876, + "learning_rate": 3.8950145937537694e-05, + "loss": 0.3866, + "step": 9348 + }, + { + "epoch": 0.513172338090011, + "grad_norm": 1.846182942390442, + "learning_rate": 3.894580061511483e-05, + "loss": 0.2787, + "step": 9350 + }, + { + "epoch": 0.5132821075740944, + "grad_norm": 1.7905396223068237, + "learning_rate": 3.89414546809607e-05, + "loss": 0.2757, + "step": 9352 + }, + { + "epoch": 0.5133918770581778, + "grad_norm": 1.647354245185852, + "learning_rate": 3.893710813526593e-05, + "loss": 0.2549, + "step": 9354 + }, + { + "epoch": 0.5135016465422613, + "grad_norm": 1.9293339252471924, + "learning_rate": 3.89327609782212e-05, + "loss": 0.4314, + "step": 9356 + }, + { + "epoch": 0.5136114160263446, + "grad_norm": 2.236392021179199, + "learning_rate": 3.8928413210017185e-05, + "loss": 0.3395, + "step": 9358 + }, + { + "epoch": 0.5137211855104281, + "grad_norm": 3.435504913330078, + "learning_rate": 3.89240648308446e-05, + "loss": 0.3048, + "step": 9360 + }, + { + "epoch": 0.5138309549945115, + "grad_norm": 1.8901835680007935, + "learning_rate": 3.8919715840894195e-05, + "loss": 0.4191, + "step": 9362 + }, + { + "epoch": 0.513940724478595, + "grad_norm": 2.2826550006866455, + "learning_rate": 3.891536624035672e-05, + "loss": 0.3303, + "step": 9364 + }, + { + "epoch": 0.5140504939626783, + "grad_norm": 1.9564404487609863, + "learning_rate": 3.891101602942299e-05, + "loss": 0.214, + "step": 9366 + }, + { + "epoch": 0.5141602634467618, + "grad_norm": 1.9400849342346191, + "learning_rate": 3.890666520828382e-05, + "loss": 0.2336, + "step": 9368 + }, + { + "epoch": 0.5142700329308453, + "grad_norm": 2.2083816528320312, + "learning_rate": 3.890231377713004e-05, + "loss": 0.3338, + "step": 9370 + }, + { + "epoch": 0.5143798024149286, + "grad_norm": 1.4589747190475464, + "learning_rate": 3.889796173615255e-05, + "loss": 0.3206, + "step": 9372 + }, + { + "epoch": 0.5144895718990121, + "grad_norm": 1.4373646974563599, + "learning_rate": 3.889360908554225e-05, + "loss": 0.2289, + "step": 9374 + }, + { + "epoch": 0.5145993413830955, + "grad_norm": 1.7194019556045532, + "learning_rate": 3.888925582549006e-05, + "loss": 0.3745, + "step": 9376 + }, + { + "epoch": 0.514709110867179, + "grad_norm": 2.116813898086548, + "learning_rate": 3.888490195618693e-05, + "loss": 0.3861, + "step": 9378 + }, + { + "epoch": 0.5148188803512623, + "grad_norm": 1.5349152088165283, + "learning_rate": 3.888054747782386e-05, + "loss": 0.2421, + "step": 9380 + }, + { + "epoch": 0.5149286498353458, + "grad_norm": 4.490447998046875, + "learning_rate": 3.887619239059184e-05, + "loss": 0.2872, + "step": 9382 + }, + { + "epoch": 0.5150384193194292, + "grad_norm": 2.103581190109253, + "learning_rate": 3.887183669468191e-05, + "loss": 0.3085, + "step": 9384 + }, + { + "epoch": 0.5151481888035127, + "grad_norm": 2.068537473678589, + "learning_rate": 3.886748039028514e-05, + "loss": 0.3497, + "step": 9386 + }, + { + "epoch": 0.515257958287596, + "grad_norm": 2.0387136936187744, + "learning_rate": 3.886312347759261e-05, + "loss": 0.3485, + "step": 9388 + }, + { + "epoch": 0.5153677277716795, + "grad_norm": 2.3671822547912598, + "learning_rate": 3.8858765956795446e-05, + "loss": 0.2999, + "step": 9390 + }, + { + "epoch": 0.5154774972557629, + "grad_norm": 1.7366474866867065, + "learning_rate": 3.8854407828084784e-05, + "loss": 0.3378, + "step": 9392 + }, + { + "epoch": 0.5155872667398463, + "grad_norm": 1.6706892251968384, + "learning_rate": 3.8850049091651794e-05, + "loss": 0.2982, + "step": 9394 + }, + { + "epoch": 0.5156970362239297, + "grad_norm": 1.5102452039718628, + "learning_rate": 3.884568974768766e-05, + "loss": 0.303, + "step": 9396 + }, + { + "epoch": 0.5158068057080132, + "grad_norm": 1.3351467847824097, + "learning_rate": 3.884132979638363e-05, + "loss": 0.2713, + "step": 9398 + }, + { + "epoch": 0.5159165751920965, + "grad_norm": 1.7761025428771973, + "learning_rate": 3.8836969237930934e-05, + "loss": 0.3898, + "step": 9400 + }, + { + "epoch": 0.51602634467618, + "grad_norm": 1.4621498584747314, + "learning_rate": 3.883260807252084e-05, + "loss": 0.3696, + "step": 9402 + }, + { + "epoch": 0.5161361141602634, + "grad_norm": 1.4222677946090698, + "learning_rate": 3.882824630034467e-05, + "loss": 0.3963, + "step": 9404 + }, + { + "epoch": 0.5162458836443469, + "grad_norm": 2.5132029056549072, + "learning_rate": 3.8823883921593754e-05, + "loss": 0.2856, + "step": 9406 + }, + { + "epoch": 0.5163556531284303, + "grad_norm": 3.005880355834961, + "learning_rate": 3.8819520936459424e-05, + "loss": 0.3676, + "step": 9408 + }, + { + "epoch": 0.5164654226125137, + "grad_norm": 2.040095329284668, + "learning_rate": 3.881515734513308e-05, + "loss": 0.2664, + "step": 9410 + }, + { + "epoch": 0.5165751920965972, + "grad_norm": 1.6434710025787354, + "learning_rate": 3.881079314780612e-05, + "loss": 0.3379, + "step": 9412 + }, + { + "epoch": 0.5166849615806806, + "grad_norm": 1.4794059991836548, + "learning_rate": 3.880642834466999e-05, + "loss": 0.3129, + "step": 9414 + }, + { + "epoch": 0.516794731064764, + "grad_norm": 1.4865728616714478, + "learning_rate": 3.880206293591615e-05, + "loss": 0.1855, + "step": 9416 + }, + { + "epoch": 0.5169045005488474, + "grad_norm": 1.4702986478805542, + "learning_rate": 3.879769692173608e-05, + "loss": 0.2303, + "step": 9418 + }, + { + "epoch": 0.5170142700329309, + "grad_norm": 1.7336336374282837, + "learning_rate": 3.879333030232131e-05, + "loss": 0.345, + "step": 9420 + }, + { + "epoch": 0.5171240395170142, + "grad_norm": 1.0607565641403198, + "learning_rate": 3.878896307786336e-05, + "loss": 0.3185, + "step": 9422 + }, + { + "epoch": 0.5172338090010977, + "grad_norm": 1.7080707550048828, + "learning_rate": 3.878459524855381e-05, + "loss": 0.2575, + "step": 9424 + }, + { + "epoch": 0.5173435784851811, + "grad_norm": 1.5085957050323486, + "learning_rate": 3.878022681458426e-05, + "loss": 0.2419, + "step": 9426 + }, + { + "epoch": 0.5174533479692646, + "grad_norm": 1.4286775588989258, + "learning_rate": 3.8775857776146323e-05, + "loss": 0.358, + "step": 9428 + }, + { + "epoch": 0.5175631174533479, + "grad_norm": 1.9626606702804565, + "learning_rate": 3.8771488133431644e-05, + "loss": 0.2624, + "step": 9430 + }, + { + "epoch": 0.5176728869374314, + "grad_norm": 1.5606588125228882, + "learning_rate": 3.8767117886631904e-05, + "loss": 0.3882, + "step": 9432 + }, + { + "epoch": 0.5177826564215148, + "grad_norm": 1.4968167543411255, + "learning_rate": 3.8762747035938804e-05, + "loss": 0.2179, + "step": 9434 + }, + { + "epoch": 0.5178924259055983, + "grad_norm": 0.9903932213783264, + "learning_rate": 3.875837558154406e-05, + "loss": 0.3854, + "step": 9436 + }, + { + "epoch": 0.5180021953896816, + "grad_norm": 1.4042292833328247, + "learning_rate": 3.875400352363944e-05, + "loss": 0.3055, + "step": 9438 + }, + { + "epoch": 0.5181119648737651, + "grad_norm": 2.52697491645813, + "learning_rate": 3.874963086241671e-05, + "loss": 0.3351, + "step": 9440 + }, + { + "epoch": 0.5182217343578486, + "grad_norm": 3.504159450531006, + "learning_rate": 3.874525759806768e-05, + "loss": 0.2759, + "step": 9442 + }, + { + "epoch": 0.5183315038419319, + "grad_norm": 1.5490002632141113, + "learning_rate": 3.874088373078421e-05, + "loss": 0.3242, + "step": 9444 + }, + { + "epoch": 0.5184412733260154, + "grad_norm": 1.3389216661453247, + "learning_rate": 3.8736509260758106e-05, + "loss": 0.1889, + "step": 9446 + }, + { + "epoch": 0.5185510428100988, + "grad_norm": 1.864498496055603, + "learning_rate": 3.873213418818129e-05, + "loss": 0.4368, + "step": 9448 + }, + { + "epoch": 0.5186608122941823, + "grad_norm": 2.1523685455322266, + "learning_rate": 3.872775851324568e-05, + "loss": 0.3709, + "step": 9450 + }, + { + "epoch": 0.5187705817782656, + "grad_norm": 2.257406234741211, + "learning_rate": 3.872338223614319e-05, + "loss": 0.2787, + "step": 9452 + }, + { + "epoch": 0.5188803512623491, + "grad_norm": 1.7866798639297485, + "learning_rate": 3.8719005357065804e-05, + "loss": 0.3218, + "step": 9454 + }, + { + "epoch": 0.5189901207464325, + "grad_norm": 2.4134480953216553, + "learning_rate": 3.87146278762055e-05, + "loss": 0.5032, + "step": 9456 + }, + { + "epoch": 0.519099890230516, + "grad_norm": 1.5949935913085938, + "learning_rate": 3.8710249793754305e-05, + "loss": 0.3663, + "step": 9458 + }, + { + "epoch": 0.5192096597145993, + "grad_norm": 1.945719838142395, + "learning_rate": 3.870587110990426e-05, + "loss": 0.4217, + "step": 9460 + }, + { + "epoch": 0.5193194291986828, + "grad_norm": 1.2090002298355103, + "learning_rate": 3.870149182484744e-05, + "loss": 0.2529, + "step": 9462 + }, + { + "epoch": 0.5194291986827662, + "grad_norm": 1.7582370042800903, + "learning_rate": 3.869711193877593e-05, + "loss": 0.2387, + "step": 9464 + }, + { + "epoch": 0.5195389681668496, + "grad_norm": 1.6107162237167358, + "learning_rate": 3.869273145188187e-05, + "loss": 0.2559, + "step": 9466 + }, + { + "epoch": 0.519648737650933, + "grad_norm": 2.0066447257995605, + "learning_rate": 3.868835036435739e-05, + "loss": 0.2682, + "step": 9468 + }, + { + "epoch": 0.5197585071350165, + "grad_norm": 1.4329447746276855, + "learning_rate": 3.868396867639468e-05, + "loss": 0.3957, + "step": 9470 + }, + { + "epoch": 0.5198682766190998, + "grad_norm": 1.4916728734970093, + "learning_rate": 3.8679586388185947e-05, + "loss": 0.3515, + "step": 9472 + }, + { + "epoch": 0.5199780461031833, + "grad_norm": 1.4691187143325806, + "learning_rate": 3.86752034999234e-05, + "loss": 0.3037, + "step": 9474 + }, + { + "epoch": 0.5200878155872667, + "grad_norm": 1.0627838373184204, + "learning_rate": 3.8670820011799315e-05, + "loss": 0.2764, + "step": 9476 + }, + { + "epoch": 0.5201975850713502, + "grad_norm": 1.565097451210022, + "learning_rate": 3.866643592400596e-05, + "loss": 0.2252, + "step": 9478 + }, + { + "epoch": 0.5203073545554336, + "grad_norm": 1.1898478269577026, + "learning_rate": 3.866205123673564e-05, + "loss": 0.2657, + "step": 9480 + }, + { + "epoch": 0.520417124039517, + "grad_norm": 1.6062991619110107, + "learning_rate": 3.865766595018071e-05, + "loss": 0.2931, + "step": 9482 + }, + { + "epoch": 0.5205268935236005, + "grad_norm": 1.4574222564697266, + "learning_rate": 3.8653280064533506e-05, + "loss": 0.2916, + "step": 9484 + }, + { + "epoch": 0.5206366630076839, + "grad_norm": 3.1876509189605713, + "learning_rate": 3.864889357998642e-05, + "loss": 0.2904, + "step": 9486 + }, + { + "epoch": 0.5207464324917673, + "grad_norm": 2.2139127254486084, + "learning_rate": 3.864450649673188e-05, + "loss": 0.3531, + "step": 9488 + }, + { + "epoch": 0.5208562019758507, + "grad_norm": 1.5891518592834473, + "learning_rate": 3.86401188149623e-05, + "loss": 0.2286, + "step": 9490 + }, + { + "epoch": 0.5209659714599342, + "grad_norm": 3.0850155353546143, + "learning_rate": 3.863573053487017e-05, + "loss": 0.1943, + "step": 9492 + }, + { + "epoch": 0.5210757409440175, + "grad_norm": 3.0825390815734863, + "learning_rate": 3.863134165664797e-05, + "loss": 0.3205, + "step": 9494 + }, + { + "epoch": 0.521185510428101, + "grad_norm": 1.8134818077087402, + "learning_rate": 3.8626952180488216e-05, + "loss": 0.2253, + "step": 9496 + }, + { + "epoch": 0.5212952799121844, + "grad_norm": 1.3348368406295776, + "learning_rate": 3.862256210658346e-05, + "loss": 0.2835, + "step": 9498 + }, + { + "epoch": 0.5214050493962679, + "grad_norm": 1.8059262037277222, + "learning_rate": 3.861817143512626e-05, + "loss": 0.4054, + "step": 9500 + }, + { + "epoch": 0.5215148188803512, + "grad_norm": 1.1837095022201538, + "learning_rate": 3.861378016630922e-05, + "loss": 0.217, + "step": 9502 + }, + { + "epoch": 0.5216245883644347, + "grad_norm": 2.128169059753418, + "learning_rate": 3.860938830032496e-05, + "loss": 0.3451, + "step": 9504 + }, + { + "epoch": 0.5217343578485181, + "grad_norm": 1.7437101602554321, + "learning_rate": 3.860499583736613e-05, + "loss": 0.1969, + "step": 9506 + }, + { + "epoch": 0.5218441273326015, + "grad_norm": 1.5009043216705322, + "learning_rate": 3.8600602777625404e-05, + "loss": 0.2334, + "step": 9508 + }, + { + "epoch": 0.5219538968166849, + "grad_norm": 2.6701791286468506, + "learning_rate": 3.859620912129549e-05, + "loss": 0.3634, + "step": 9510 + }, + { + "epoch": 0.5220636663007684, + "grad_norm": 1.6009626388549805, + "learning_rate": 3.859181486856911e-05, + "loss": 0.3674, + "step": 9512 + }, + { + "epoch": 0.5221734357848518, + "grad_norm": 2.81184983253479, + "learning_rate": 3.858742001963902e-05, + "loss": 0.3533, + "step": 9514 + }, + { + "epoch": 0.5222832052689352, + "grad_norm": 2.009112596511841, + "learning_rate": 3.858302457469799e-05, + "loss": 0.4361, + "step": 9516 + }, + { + "epoch": 0.5223929747530187, + "grad_norm": 1.3124017715454102, + "learning_rate": 3.857862853393883e-05, + "loss": 0.3403, + "step": 9518 + }, + { + "epoch": 0.5225027442371021, + "grad_norm": 1.4637267589569092, + "learning_rate": 3.857423189755438e-05, + "loss": 0.3429, + "step": 9520 + }, + { + "epoch": 0.5226125137211856, + "grad_norm": 2.942612648010254, + "learning_rate": 3.856983466573749e-05, + "loss": 0.4137, + "step": 9522 + }, + { + "epoch": 0.5227222832052689, + "grad_norm": 2.969794750213623, + "learning_rate": 3.856543683868106e-05, + "loss": 0.3985, + "step": 9524 + }, + { + "epoch": 0.5228320526893524, + "grad_norm": 2.5517170429229736, + "learning_rate": 3.856103841657797e-05, + "loss": 0.2522, + "step": 9526 + }, + { + "epoch": 0.5229418221734358, + "grad_norm": 1.2847113609313965, + "learning_rate": 3.855663939962118e-05, + "loss": 0.2098, + "step": 9528 + }, + { + "epoch": 0.5230515916575192, + "grad_norm": 1.5705114603042603, + "learning_rate": 3.8552239788003655e-05, + "loss": 0.2205, + "step": 9530 + }, + { + "epoch": 0.5231613611416026, + "grad_norm": 1.8746699094772339, + "learning_rate": 3.854783958191836e-05, + "loss": 0.3346, + "step": 9532 + }, + { + "epoch": 0.5232711306256861, + "grad_norm": 1.86640465259552, + "learning_rate": 3.854343878155833e-05, + "loss": 0.3282, + "step": 9534 + }, + { + "epoch": 0.5233809001097695, + "grad_norm": 1.2077158689498901, + "learning_rate": 3.8539037387116594e-05, + "loss": 0.2803, + "step": 9536 + }, + { + "epoch": 0.5234906695938529, + "grad_norm": 1.4946242570877075, + "learning_rate": 3.853463539878623e-05, + "loss": 0.4148, + "step": 9538 + }, + { + "epoch": 0.5236004390779363, + "grad_norm": 1.7982895374298096, + "learning_rate": 3.853023281676033e-05, + "loss": 0.3156, + "step": 9540 + }, + { + "epoch": 0.5237102085620198, + "grad_norm": 1.2866342067718506, + "learning_rate": 3.8525829641232004e-05, + "loss": 0.2385, + "step": 9542 + }, + { + "epoch": 0.5238199780461031, + "grad_norm": 1.4506248235702515, + "learning_rate": 3.85214258723944e-05, + "loss": 0.2883, + "step": 9544 + }, + { + "epoch": 0.5239297475301866, + "grad_norm": 1.294179916381836, + "learning_rate": 3.851702151044069e-05, + "loss": 0.2468, + "step": 9546 + }, + { + "epoch": 0.52403951701427, + "grad_norm": 1.8619647026062012, + "learning_rate": 3.851261655556408e-05, + "loss": 0.3973, + "step": 9548 + }, + { + "epoch": 0.5241492864983535, + "grad_norm": 2.0959603786468506, + "learning_rate": 3.8508211007957775e-05, + "loss": 0.2804, + "step": 9550 + }, + { + "epoch": 0.5242590559824368, + "grad_norm": 2.6665751934051514, + "learning_rate": 3.850380486781503e-05, + "loss": 0.2948, + "step": 9552 + }, + { + "epoch": 0.5243688254665203, + "grad_norm": 1.6739884614944458, + "learning_rate": 3.8499398135329126e-05, + "loss": 0.3203, + "step": 9554 + }, + { + "epoch": 0.5244785949506038, + "grad_norm": 1.5947208404541016, + "learning_rate": 3.8494990810693366e-05, + "loss": 0.2918, + "step": 9556 + }, + { + "epoch": 0.5245883644346871, + "grad_norm": 1.994791030883789, + "learning_rate": 3.849058289410107e-05, + "loss": 0.3036, + "step": 9558 + }, + { + "epoch": 0.5246981339187706, + "grad_norm": 1.4526959657669067, + "learning_rate": 3.848617438574559e-05, + "loss": 0.2964, + "step": 9560 + }, + { + "epoch": 0.524807903402854, + "grad_norm": 2.0561883449554443, + "learning_rate": 3.8481765285820317e-05, + "loss": 0.3529, + "step": 9562 + }, + { + "epoch": 0.5249176728869375, + "grad_norm": 1.2142606973648071, + "learning_rate": 3.8477355594518625e-05, + "loss": 0.179, + "step": 9564 + }, + { + "epoch": 0.5250274423710208, + "grad_norm": 1.6463563442230225, + "learning_rate": 3.847294531203398e-05, + "loss": 0.2655, + "step": 9566 + }, + { + "epoch": 0.5251372118551043, + "grad_norm": 1.3318822383880615, + "learning_rate": 3.8468534438559825e-05, + "loss": 0.2489, + "step": 9568 + }, + { + "epoch": 0.5252469813391877, + "grad_norm": 1.9618703126907349, + "learning_rate": 3.846412297428964e-05, + "loss": 0.2666, + "step": 9570 + }, + { + "epoch": 0.5253567508232712, + "grad_norm": 1.321544885635376, + "learning_rate": 3.8459710919416935e-05, + "loss": 0.3566, + "step": 9572 + }, + { + "epoch": 0.5254665203073545, + "grad_norm": 2.0634405612945557, + "learning_rate": 3.8455298274135246e-05, + "loss": 0.3399, + "step": 9574 + }, + { + "epoch": 0.525576289791438, + "grad_norm": 1.2132834196090698, + "learning_rate": 3.8450885038638127e-05, + "loss": 0.3066, + "step": 9576 + }, + { + "epoch": 0.5256860592755214, + "grad_norm": 1.0886163711547852, + "learning_rate": 3.844647121311918e-05, + "loss": 0.2913, + "step": 9578 + }, + { + "epoch": 0.5257958287596048, + "grad_norm": 2.44585919380188, + "learning_rate": 3.8442056797772e-05, + "loss": 0.346, + "step": 9580 + }, + { + "epoch": 0.5259055982436882, + "grad_norm": 1.637629747390747, + "learning_rate": 3.843764179279022e-05, + "loss": 0.4113, + "step": 9582 + }, + { + "epoch": 0.5260153677277717, + "grad_norm": 1.4126919507980347, + "learning_rate": 3.8433226198367535e-05, + "loss": 0.2562, + "step": 9584 + }, + { + "epoch": 0.526125137211855, + "grad_norm": 1.9437215328216553, + "learning_rate": 3.8428810014697615e-05, + "loss": 0.2793, + "step": 9586 + }, + { + "epoch": 0.5262349066959385, + "grad_norm": 1.7406290769577026, + "learning_rate": 3.8424393241974164e-05, + "loss": 0.3748, + "step": 9588 + }, + { + "epoch": 0.526344676180022, + "grad_norm": 1.7903541326522827, + "learning_rate": 3.8419975880390934e-05, + "loss": 0.3614, + "step": 9590 + }, + { + "epoch": 0.5264544456641054, + "grad_norm": 1.6449593305587769, + "learning_rate": 3.84155579301417e-05, + "loss": 0.2931, + "step": 9592 + }, + { + "epoch": 0.5265642151481889, + "grad_norm": 0.9242619872093201, + "learning_rate": 3.841113939142024e-05, + "loss": 0.3036, + "step": 9594 + }, + { + "epoch": 0.5266739846322722, + "grad_norm": 1.6378570795059204, + "learning_rate": 3.840672026442038e-05, + "loss": 0.328, + "step": 9596 + }, + { + "epoch": 0.5267837541163557, + "grad_norm": 1.8056296110153198, + "learning_rate": 3.840230054933598e-05, + "loss": 0.363, + "step": 9598 + }, + { + "epoch": 0.5268935236004391, + "grad_norm": 1.1866860389709473, + "learning_rate": 3.839788024636088e-05, + "loss": 0.3715, + "step": 9600 + }, + { + "epoch": 0.5270032930845225, + "grad_norm": 2.0764124393463135, + "learning_rate": 3.8393459355689e-05, + "loss": 0.3473, + "step": 9602 + }, + { + "epoch": 0.5271130625686059, + "grad_norm": 2.285634756088257, + "learning_rate": 3.838903787751425e-05, + "loss": 0.3048, + "step": 9604 + }, + { + "epoch": 0.5272228320526894, + "grad_norm": 1.6419087648391724, + "learning_rate": 3.838461581203058e-05, + "loss": 0.2887, + "step": 9606 + }, + { + "epoch": 0.5273326015367727, + "grad_norm": 1.0097994804382324, + "learning_rate": 3.8380193159431966e-05, + "loss": 0.2504, + "step": 9608 + }, + { + "epoch": 0.5274423710208562, + "grad_norm": 2.703111410140991, + "learning_rate": 3.8375769919912416e-05, + "loss": 0.3628, + "step": 9610 + }, + { + "epoch": 0.5275521405049396, + "grad_norm": 2.3902666568756104, + "learning_rate": 3.837134609366593e-05, + "loss": 0.5205, + "step": 9612 + }, + { + "epoch": 0.5276619099890231, + "grad_norm": 1.1878756284713745, + "learning_rate": 3.836692168088658e-05, + "loss": 0.435, + "step": 9614 + }, + { + "epoch": 0.5277716794731064, + "grad_norm": 2.397479295730591, + "learning_rate": 3.836249668176844e-05, + "loss": 0.3065, + "step": 9616 + }, + { + "epoch": 0.5278814489571899, + "grad_norm": 1.3039381504058838, + "learning_rate": 3.83580710965056e-05, + "loss": 0.27, + "step": 9618 + }, + { + "epoch": 0.5279912184412733, + "grad_norm": 2.3025481700897217, + "learning_rate": 3.8353644925292204e-05, + "loss": 0.3475, + "step": 9620 + }, + { + "epoch": 0.5281009879253568, + "grad_norm": 2.082038402557373, + "learning_rate": 3.83492181683224e-05, + "loss": 0.39, + "step": 9622 + }, + { + "epoch": 0.5282107574094401, + "grad_norm": 3.422018051147461, + "learning_rate": 3.8344790825790356e-05, + "loss": 0.2372, + "step": 9624 + }, + { + "epoch": 0.5283205268935236, + "grad_norm": 2.2182328701019287, + "learning_rate": 3.834036289789029e-05, + "loss": 0.4933, + "step": 9626 + }, + { + "epoch": 0.5284302963776071, + "grad_norm": 1.017085313796997, + "learning_rate": 3.833593438481643e-05, + "loss": 0.1747, + "step": 9628 + }, + { + "epoch": 0.5285400658616904, + "grad_norm": 1.6076558828353882, + "learning_rate": 3.8331505286763036e-05, + "loss": 0.3519, + "step": 9630 + }, + { + "epoch": 0.5286498353457739, + "grad_norm": 2.607513189315796, + "learning_rate": 3.832707560392438e-05, + "loss": 0.2541, + "step": 9632 + }, + { + "epoch": 0.5287596048298573, + "grad_norm": 1.5764373540878296, + "learning_rate": 3.832264533649477e-05, + "loss": 0.4482, + "step": 9634 + }, + { + "epoch": 0.5288693743139408, + "grad_norm": 1.5660173892974854, + "learning_rate": 3.831821448466856e-05, + "loss": 0.3301, + "step": 9636 + }, + { + "epoch": 0.5289791437980241, + "grad_norm": 1.7191752195358276, + "learning_rate": 3.831378304864007e-05, + "loss": 0.4377, + "step": 9638 + }, + { + "epoch": 0.5290889132821076, + "grad_norm": 2.778986692428589, + "learning_rate": 3.830935102860373e-05, + "loss": 0.3645, + "step": 9640 + }, + { + "epoch": 0.529198682766191, + "grad_norm": 1.329482078552246, + "learning_rate": 3.830491842475391e-05, + "loss": 0.2912, + "step": 9642 + }, + { + "epoch": 0.5293084522502745, + "grad_norm": 1.272485375404358, + "learning_rate": 3.8300485237285075e-05, + "loss": 0.4265, + "step": 9644 + }, + { + "epoch": 0.5294182217343578, + "grad_norm": 1.3261581659317017, + "learning_rate": 3.829605146639167e-05, + "loss": 0.4156, + "step": 9646 + }, + { + "epoch": 0.5295279912184413, + "grad_norm": 1.4692845344543457, + "learning_rate": 3.829161711226819e-05, + "loss": 0.3442, + "step": 9648 + }, + { + "epoch": 0.5296377607025247, + "grad_norm": 1.5209238529205322, + "learning_rate": 3.8287182175109146e-05, + "loss": 0.345, + "step": 9650 + }, + { + "epoch": 0.5297475301866081, + "grad_norm": 2.4174768924713135, + "learning_rate": 3.828274665510907e-05, + "loss": 0.3036, + "step": 9652 + }, + { + "epoch": 0.5298572996706915, + "grad_norm": 1.227724552154541, + "learning_rate": 3.827831055246253e-05, + "loss": 0.2878, + "step": 9654 + }, + { + "epoch": 0.529967069154775, + "grad_norm": 1.4489715099334717, + "learning_rate": 3.8273873867364116e-05, + "loss": 0.2956, + "step": 9656 + }, + { + "epoch": 0.5300768386388583, + "grad_norm": 8.119613647460938, + "learning_rate": 3.826943660000844e-05, + "loss": 0.4346, + "step": 9658 + }, + { + "epoch": 0.5301866081229418, + "grad_norm": 1.287498116493225, + "learning_rate": 3.826499875059015e-05, + "loss": 0.4381, + "step": 9660 + }, + { + "epoch": 0.5302963776070252, + "grad_norm": 1.4999852180480957, + "learning_rate": 3.8260560319303904e-05, + "loss": 0.3535, + "step": 9662 + }, + { + "epoch": 0.5304061470911087, + "grad_norm": 1.1272190809249878, + "learning_rate": 3.825612130634439e-05, + "loss": 0.2583, + "step": 9664 + }, + { + "epoch": 0.5305159165751921, + "grad_norm": 1.5110429525375366, + "learning_rate": 3.825168171190634e-05, + "loss": 0.3033, + "step": 9666 + }, + { + "epoch": 0.5306256860592755, + "grad_norm": 1.9067405462265015, + "learning_rate": 3.824724153618449e-05, + "loss": 0.2939, + "step": 9668 + }, + { + "epoch": 0.530735455543359, + "grad_norm": 1.774583339691162, + "learning_rate": 3.8242800779373594e-05, + "loss": 0.2959, + "step": 9670 + }, + { + "epoch": 0.5308452250274424, + "grad_norm": 2.5460925102233887, + "learning_rate": 3.823835944166846e-05, + "loss": 0.2459, + "step": 9672 + }, + { + "epoch": 0.5309549945115258, + "grad_norm": 1.768389105796814, + "learning_rate": 3.823391752326389e-05, + "loss": 0.316, + "step": 9674 + }, + { + "epoch": 0.5310647639956092, + "grad_norm": 1.3120930194854736, + "learning_rate": 3.822947502435477e-05, + "loss": 0.257, + "step": 9676 + }, + { + "epoch": 0.5311745334796927, + "grad_norm": 1.06015145778656, + "learning_rate": 3.822503194513592e-05, + "loss": 0.3237, + "step": 9678 + }, + { + "epoch": 0.531284302963776, + "grad_norm": 1.2969051599502563, + "learning_rate": 3.8220588285802264e-05, + "loss": 0.3746, + "step": 9680 + }, + { + "epoch": 0.5313940724478595, + "grad_norm": 2.6119256019592285, + "learning_rate": 3.821614404654872e-05, + "loss": 0.3943, + "step": 9682 + }, + { + "epoch": 0.5315038419319429, + "grad_norm": 1.9397187232971191, + "learning_rate": 3.8211699227570224e-05, + "loss": 0.3201, + "step": 9684 + }, + { + "epoch": 0.5316136114160264, + "grad_norm": 1.6946998834609985, + "learning_rate": 3.820725382906175e-05, + "loss": 0.425, + "step": 9686 + }, + { + "epoch": 0.5317233809001097, + "grad_norm": 2.2350547313690186, + "learning_rate": 3.8202807851218305e-05, + "loss": 0.2632, + "step": 9688 + }, + { + "epoch": 0.5318331503841932, + "grad_norm": 1.5308009386062622, + "learning_rate": 3.8198361294234895e-05, + "loss": 0.238, + "step": 9690 + }, + { + "epoch": 0.5319429198682766, + "grad_norm": 1.514822006225586, + "learning_rate": 3.819391415830658e-05, + "loss": 0.3513, + "step": 9692 + }, + { + "epoch": 0.53205268935236, + "grad_norm": 1.9735424518585205, + "learning_rate": 3.818946644362844e-05, + "loss": 0.3278, + "step": 9694 + }, + { + "epoch": 0.5321624588364434, + "grad_norm": 1.4476224184036255, + "learning_rate": 3.8185018150395555e-05, + "loss": 0.3709, + "step": 9696 + }, + { + "epoch": 0.5322722283205269, + "grad_norm": 1.5252960920333862, + "learning_rate": 3.8180569278803056e-05, + "loss": 0.2644, + "step": 9698 + }, + { + "epoch": 0.5323819978046103, + "grad_norm": 1.2911908626556396, + "learning_rate": 3.81761198290461e-05, + "loss": 0.3737, + "step": 9700 + }, + { + "epoch": 0.5324917672886937, + "grad_norm": 1.715336561203003, + "learning_rate": 3.8171669801319854e-05, + "loss": 0.2116, + "step": 9702 + }, + { + "epoch": 0.5326015367727772, + "grad_norm": 1.3552749156951904, + "learning_rate": 3.8167219195819524e-05, + "loss": 0.3215, + "step": 9704 + }, + { + "epoch": 0.5327113062568606, + "grad_norm": 1.8904268741607666, + "learning_rate": 3.816276801274032e-05, + "loss": 0.3642, + "step": 9706 + }, + { + "epoch": 0.5328210757409441, + "grad_norm": 1.531692385673523, + "learning_rate": 3.815831625227752e-05, + "loss": 0.2726, + "step": 9708 + }, + { + "epoch": 0.5329308452250274, + "grad_norm": 1.1998167037963867, + "learning_rate": 3.815386391462637e-05, + "loss": 0.2367, + "step": 9710 + }, + { + "epoch": 0.5330406147091109, + "grad_norm": 1.5290862321853638, + "learning_rate": 3.814941099998221e-05, + "loss": 0.2849, + "step": 9712 + }, + { + "epoch": 0.5331503841931943, + "grad_norm": 1.112055778503418, + "learning_rate": 3.814495750854032e-05, + "loss": 0.2368, + "step": 9714 + }, + { + "epoch": 0.5332601536772777, + "grad_norm": 1.5965921878814697, + "learning_rate": 3.8140503440496076e-05, + "loss": 0.2983, + "step": 9716 + }, + { + "epoch": 0.5333699231613611, + "grad_norm": 0.9866284132003784, + "learning_rate": 3.813604879604486e-05, + "loss": 0.2364, + "step": 9718 + }, + { + "epoch": 0.5334796926454446, + "grad_norm": 2.8076114654541016, + "learning_rate": 3.813159357538206e-05, + "loss": 0.3434, + "step": 9720 + }, + { + "epoch": 0.533589462129528, + "grad_norm": 1.1559038162231445, + "learning_rate": 3.812713777870313e-05, + "loss": 0.1902, + "step": 9722 + }, + { + "epoch": 0.5336992316136114, + "grad_norm": 1.4742928743362427, + "learning_rate": 3.812268140620349e-05, + "loss": 0.2103, + "step": 9724 + }, + { + "epoch": 0.5338090010976948, + "grad_norm": 1.600690484046936, + "learning_rate": 3.811822445807863e-05, + "loss": 0.3184, + "step": 9726 + }, + { + "epoch": 0.5339187705817783, + "grad_norm": 1.8010145425796509, + "learning_rate": 3.8113766934524066e-05, + "loss": 0.3184, + "step": 9728 + }, + { + "epoch": 0.5340285400658616, + "grad_norm": 1.4885910749435425, + "learning_rate": 3.8109308835735316e-05, + "loss": 0.2383, + "step": 9730 + }, + { + "epoch": 0.5341383095499451, + "grad_norm": 2.3483502864837646, + "learning_rate": 3.8104850161907934e-05, + "loss": 0.422, + "step": 9732 + }, + { + "epoch": 0.5342480790340285, + "grad_norm": 1.2681035995483398, + "learning_rate": 3.810039091323751e-05, + "loss": 0.2497, + "step": 9734 + }, + { + "epoch": 0.534357848518112, + "grad_norm": 1.9266737699508667, + "learning_rate": 3.809593108991962e-05, + "loss": 0.4137, + "step": 9736 + }, + { + "epoch": 0.5344676180021954, + "grad_norm": 2.019930124282837, + "learning_rate": 3.8091470692149934e-05, + "loss": 0.3013, + "step": 9738 + }, + { + "epoch": 0.5345773874862788, + "grad_norm": 1.570014238357544, + "learning_rate": 3.808700972012408e-05, + "loss": 0.2612, + "step": 9740 + }, + { + "epoch": 0.5346871569703623, + "grad_norm": 2.186999559402466, + "learning_rate": 3.8082548174037743e-05, + "loss": 0.2644, + "step": 9742 + }, + { + "epoch": 0.5347969264544457, + "grad_norm": 2.0421388149261475, + "learning_rate": 3.807808605408664e-05, + "loss": 0.3184, + "step": 9744 + }, + { + "epoch": 0.5349066959385291, + "grad_norm": 1.2573679685592651, + "learning_rate": 3.807362336046648e-05, + "loss": 0.2029, + "step": 9746 + }, + { + "epoch": 0.5350164654226125, + "grad_norm": 1.5342141389846802, + "learning_rate": 3.806916009337303e-05, + "loss": 0.277, + "step": 9748 + }, + { + "epoch": 0.535126234906696, + "grad_norm": 2.096973180770874, + "learning_rate": 3.806469625300208e-05, + "loss": 0.3976, + "step": 9750 + }, + { + "epoch": 0.5352360043907793, + "grad_norm": 3.2223033905029297, + "learning_rate": 3.806023183954942e-05, + "loss": 0.2169, + "step": 9752 + }, + { + "epoch": 0.5353457738748628, + "grad_norm": 2.034022808074951, + "learning_rate": 3.805576685321089e-05, + "loss": 0.4503, + "step": 9754 + }, + { + "epoch": 0.5354555433589462, + "grad_norm": 1.7340128421783447, + "learning_rate": 3.805130129418235e-05, + "loss": 0.2943, + "step": 9756 + }, + { + "epoch": 0.5355653128430297, + "grad_norm": 1.2137641906738281, + "learning_rate": 3.8046835162659666e-05, + "loss": 0.2317, + "step": 9758 + }, + { + "epoch": 0.535675082327113, + "grad_norm": 1.3309990167617798, + "learning_rate": 3.804236845883876e-05, + "loss": 0.3108, + "step": 9760 + }, + { + "epoch": 0.5357848518111965, + "grad_norm": 1.8162202835083008, + "learning_rate": 3.803790118291555e-05, + "loss": 0.2984, + "step": 9762 + }, + { + "epoch": 0.5358946212952799, + "grad_norm": 1.872521162033081, + "learning_rate": 3.803343333508601e-05, + "loss": 0.2991, + "step": 9764 + }, + { + "epoch": 0.5360043907793633, + "grad_norm": 1.490234375, + "learning_rate": 3.802896491554611e-05, + "loss": 0.4203, + "step": 9766 + }, + { + "epoch": 0.5361141602634467, + "grad_norm": 1.3142585754394531, + "learning_rate": 3.8024495924491855e-05, + "loss": 0.323, + "step": 9768 + }, + { + "epoch": 0.5362239297475302, + "grad_norm": 1.7122663259506226, + "learning_rate": 3.802002636211929e-05, + "loss": 0.4692, + "step": 9770 + }, + { + "epoch": 0.5363336992316136, + "grad_norm": 1.4728320837020874, + "learning_rate": 3.801555622862445e-05, + "loss": 0.2548, + "step": 9772 + }, + { + "epoch": 0.536443468715697, + "grad_norm": 1.9289575815200806, + "learning_rate": 3.801108552420344e-05, + "loss": 0.2979, + "step": 9774 + }, + { + "epoch": 0.5365532381997805, + "grad_norm": 2.0152995586395264, + "learning_rate": 3.800661424905235e-05, + "loss": 0.3205, + "step": 9776 + }, + { + "epoch": 0.5366630076838639, + "grad_norm": 3.2270009517669678, + "learning_rate": 3.8002142403367316e-05, + "loss": 0.2991, + "step": 9778 + }, + { + "epoch": 0.5367727771679474, + "grad_norm": 1.6754637956619263, + "learning_rate": 3.799766998734451e-05, + "loss": 0.233, + "step": 9780 + }, + { + "epoch": 0.5368825466520307, + "grad_norm": 1.6012017726898193, + "learning_rate": 3.799319700118009e-05, + "loss": 0.2691, + "step": 9782 + }, + { + "epoch": 0.5369923161361142, + "grad_norm": 0.9644180536270142, + "learning_rate": 3.7988723445070285e-05, + "loss": 0.2676, + "step": 9784 + }, + { + "epoch": 0.5371020856201976, + "grad_norm": 1.5043126344680786, + "learning_rate": 3.7984249319211314e-05, + "loss": 0.4198, + "step": 9786 + }, + { + "epoch": 0.537211855104281, + "grad_norm": 1.3045185804367065, + "learning_rate": 3.7979774623799434e-05, + "loss": 0.2318, + "step": 9788 + }, + { + "epoch": 0.5373216245883644, + "grad_norm": 1.8028396368026733, + "learning_rate": 3.797529935903093e-05, + "loss": 0.3093, + "step": 9790 + }, + { + "epoch": 0.5374313940724479, + "grad_norm": 2.363476514816284, + "learning_rate": 3.797082352510211e-05, + "loss": 0.3904, + "step": 9792 + }, + { + "epoch": 0.5375411635565313, + "grad_norm": 4.021153926849365, + "learning_rate": 3.796634712220931e-05, + "loss": 0.3736, + "step": 9794 + }, + { + "epoch": 0.5376509330406147, + "grad_norm": 1.0294952392578125, + "learning_rate": 3.796187015054888e-05, + "loss": 0.2726, + "step": 9796 + }, + { + "epoch": 0.5377607025246981, + "grad_norm": 1.8748717308044434, + "learning_rate": 3.795739261031721e-05, + "loss": 0.3362, + "step": 9798 + }, + { + "epoch": 0.5378704720087816, + "grad_norm": 1.6188327074050903, + "learning_rate": 3.79529145017107e-05, + "loss": 0.2993, + "step": 9800 + }, + { + "epoch": 0.5379802414928649, + "grad_norm": 1.0592422485351562, + "learning_rate": 3.7948435824925773e-05, + "loss": 0.3075, + "step": 9802 + }, + { + "epoch": 0.5380900109769484, + "grad_norm": 1.2488629817962646, + "learning_rate": 3.794395658015891e-05, + "loss": 0.3417, + "step": 9804 + }, + { + "epoch": 0.5381997804610318, + "grad_norm": 1.4953713417053223, + "learning_rate": 3.793947676760657e-05, + "loss": 0.3209, + "step": 9806 + }, + { + "epoch": 0.5383095499451153, + "grad_norm": 1.0916837453842163, + "learning_rate": 3.7934996387465265e-05, + "loss": 0.1727, + "step": 9808 + }, + { + "epoch": 0.5384193194291986, + "grad_norm": 1.3669734001159668, + "learning_rate": 3.793051543993154e-05, + "loss": 0.3035, + "step": 9810 + }, + { + "epoch": 0.5385290889132821, + "grad_norm": 1.4699598550796509, + "learning_rate": 3.792603392520193e-05, + "loss": 0.3207, + "step": 9812 + }, + { + "epoch": 0.5386388583973656, + "grad_norm": 3.8555502891540527, + "learning_rate": 3.7921551843473036e-05, + "loss": 0.3411, + "step": 9814 + }, + { + "epoch": 0.538748627881449, + "grad_norm": 1.1755095720291138, + "learning_rate": 3.791706919494145e-05, + "loss": 0.2813, + "step": 9816 + }, + { + "epoch": 0.5388583973655324, + "grad_norm": 1.8144021034240723, + "learning_rate": 3.7912585979803816e-05, + "loss": 0.2976, + "step": 9818 + }, + { + "epoch": 0.5389681668496158, + "grad_norm": 1.9388439655303955, + "learning_rate": 3.790810219825677e-05, + "loss": 0.2864, + "step": 9820 + }, + { + "epoch": 0.5390779363336993, + "grad_norm": 2.307987689971924, + "learning_rate": 3.790361785049702e-05, + "loss": 0.2922, + "step": 9822 + }, + { + "epoch": 0.5391877058177826, + "grad_norm": 1.4693270921707153, + "learning_rate": 3.7899132936721246e-05, + "loss": 0.3412, + "step": 9824 + }, + { + "epoch": 0.5392974753018661, + "grad_norm": 1.7527016401290894, + "learning_rate": 3.789464745712619e-05, + "loss": 0.2806, + "step": 9826 + }, + { + "epoch": 0.5394072447859495, + "grad_norm": 2.0887880325317383, + "learning_rate": 3.7890161411908604e-05, + "loss": 0.2923, + "step": 9828 + }, + { + "epoch": 0.539517014270033, + "grad_norm": 1.2228971719741821, + "learning_rate": 3.7885674801265275e-05, + "loss": 0.4171, + "step": 9830 + }, + { + "epoch": 0.5396267837541163, + "grad_norm": 2.3821616172790527, + "learning_rate": 3.7881187625393e-05, + "loss": 0.4095, + "step": 9832 + }, + { + "epoch": 0.5397365532381998, + "grad_norm": 1.965805172920227, + "learning_rate": 3.787669988448861e-05, + "loss": 0.4278, + "step": 9834 + }, + { + "epoch": 0.5398463227222832, + "grad_norm": 1.7867968082427979, + "learning_rate": 3.7872211578748966e-05, + "loss": 0.3759, + "step": 9836 + }, + { + "epoch": 0.5399560922063666, + "grad_norm": 2.014178514480591, + "learning_rate": 3.786772270837093e-05, + "loss": 0.3753, + "step": 9838 + }, + { + "epoch": 0.54006586169045, + "grad_norm": 1.1007272005081177, + "learning_rate": 3.786323327355142e-05, + "loss": 0.2619, + "step": 9840 + }, + { + "epoch": 0.5401756311745335, + "grad_norm": 2.484637498855591, + "learning_rate": 3.785874327448737e-05, + "loss": 0.2499, + "step": 9842 + }, + { + "epoch": 0.5402854006586169, + "grad_norm": 1.4273048639297485, + "learning_rate": 3.785425271137573e-05, + "loss": 0.2328, + "step": 9844 + }, + { + "epoch": 0.5403951701427003, + "grad_norm": 2.231285572052002, + "learning_rate": 3.784976158441347e-05, + "loss": 0.339, + "step": 9846 + }, + { + "epoch": 0.5405049396267838, + "grad_norm": 3.0927047729492188, + "learning_rate": 3.784526989379759e-05, + "loss": 0.3797, + "step": 9848 + }, + { + "epoch": 0.5406147091108672, + "grad_norm": 1.266078233718872, + "learning_rate": 3.784077763972513e-05, + "loss": 0.3557, + "step": 9850 + }, + { + "epoch": 0.5407244785949507, + "grad_norm": 1.0617643594741821, + "learning_rate": 3.783628482239313e-05, + "loss": 0.4028, + "step": 9852 + }, + { + "epoch": 0.540834248079034, + "grad_norm": 1.8476786613464355, + "learning_rate": 3.783179144199868e-05, + "loss": 0.2048, + "step": 9854 + }, + { + "epoch": 0.5409440175631175, + "grad_norm": 2.3066375255584717, + "learning_rate": 3.7827297498738876e-05, + "loss": 0.2333, + "step": 9856 + }, + { + "epoch": 0.5410537870472009, + "grad_norm": 3.7674412727355957, + "learning_rate": 3.782280299281083e-05, + "loss": 0.3873, + "step": 9858 + }, + { + "epoch": 0.5411635565312843, + "grad_norm": 1.4727630615234375, + "learning_rate": 3.781830792441172e-05, + "loss": 0.4367, + "step": 9860 + }, + { + "epoch": 0.5412733260153677, + "grad_norm": 1.450958490371704, + "learning_rate": 3.781381229373871e-05, + "loss": 0.2922, + "step": 9862 + }, + { + "epoch": 0.5413830954994512, + "grad_norm": 1.5880197286605835, + "learning_rate": 3.780931610098899e-05, + "loss": 0.3793, + "step": 9864 + }, + { + "epoch": 0.5414928649835345, + "grad_norm": 1.876785397529602, + "learning_rate": 3.78048193463598e-05, + "loss": 0.2191, + "step": 9866 + }, + { + "epoch": 0.541602634467618, + "grad_norm": 0.9546993970870972, + "learning_rate": 3.7800322030048383e-05, + "loss": 0.2472, + "step": 9868 + }, + { + "epoch": 0.5417124039517014, + "grad_norm": 1.9795650243759155, + "learning_rate": 3.779582415225201e-05, + "loss": 0.2376, + "step": 9870 + }, + { + "epoch": 0.5418221734357849, + "grad_norm": 2.1554527282714844, + "learning_rate": 3.779132571316799e-05, + "loss": 0.2995, + "step": 9872 + }, + { + "epoch": 0.5419319429198682, + "grad_norm": 2.408088445663452, + "learning_rate": 3.778682671299364e-05, + "loss": 0.3627, + "step": 9874 + }, + { + "epoch": 0.5420417124039517, + "grad_norm": 2.2944233417510986, + "learning_rate": 3.77823271519263e-05, + "loss": 0.2855, + "step": 9876 + }, + { + "epoch": 0.5421514818880351, + "grad_norm": 1.3267650604248047, + "learning_rate": 3.777782703016337e-05, + "loss": 0.2992, + "step": 9878 + }, + { + "epoch": 0.5422612513721186, + "grad_norm": 1.6781110763549805, + "learning_rate": 3.777332634790221e-05, + "loss": 0.2844, + "step": 9880 + }, + { + "epoch": 0.5423710208562019, + "grad_norm": 1.4908878803253174, + "learning_rate": 3.776882510534027e-05, + "loss": 0.357, + "step": 9882 + }, + { + "epoch": 0.5424807903402854, + "grad_norm": 3.2372400760650635, + "learning_rate": 3.776432330267499e-05, + "loss": 0.2562, + "step": 9884 + }, + { + "epoch": 0.5425905598243689, + "grad_norm": 1.1765151023864746, + "learning_rate": 3.775982094010383e-05, + "loss": 0.2253, + "step": 9886 + }, + { + "epoch": 0.5427003293084522, + "grad_norm": 2.0396852493286133, + "learning_rate": 3.77553180178243e-05, + "loss": 0.3243, + "step": 9888 + }, + { + "epoch": 0.5428100987925357, + "grad_norm": 1.7303497791290283, + "learning_rate": 3.775081453603392e-05, + "loss": 0.3565, + "step": 9890 + }, + { + "epoch": 0.5429198682766191, + "grad_norm": 1.2797741889953613, + "learning_rate": 3.774631049493022e-05, + "loss": 0.2124, + "step": 9892 + }, + { + "epoch": 0.5430296377607026, + "grad_norm": 1.2703280448913574, + "learning_rate": 3.7741805894710784e-05, + "loss": 0.25, + "step": 9894 + }, + { + "epoch": 0.5431394072447859, + "grad_norm": 1.1490302085876465, + "learning_rate": 3.7737300735573205e-05, + "loss": 0.2523, + "step": 9896 + }, + { + "epoch": 0.5432491767288694, + "grad_norm": 1.429905891418457, + "learning_rate": 3.7732795017715096e-05, + "loss": 0.3654, + "step": 9898 + }, + { + "epoch": 0.5433589462129528, + "grad_norm": 2.6452252864837646, + "learning_rate": 3.7728288741334094e-05, + "loss": 0.4368, + "step": 9900 + }, + { + "epoch": 0.5434687156970363, + "grad_norm": 1.3856756687164307, + "learning_rate": 3.7723781906627876e-05, + "loss": 0.2726, + "step": 9902 + }, + { + "epoch": 0.5435784851811196, + "grad_norm": 1.3128238916397095, + "learning_rate": 3.771927451379414e-05, + "loss": 0.2683, + "step": 9904 + }, + { + "epoch": 0.5436882546652031, + "grad_norm": 3.7970468997955322, + "learning_rate": 3.7714766563030586e-05, + "loss": 0.2757, + "step": 9906 + }, + { + "epoch": 0.5437980241492865, + "grad_norm": 1.7263343334197998, + "learning_rate": 3.7710258054534965e-05, + "loss": 0.2883, + "step": 9908 + }, + { + "epoch": 0.5439077936333699, + "grad_norm": 1.8537112474441528, + "learning_rate": 3.770574898850504e-05, + "loss": 0.2806, + "step": 9910 + }, + { + "epoch": 0.5440175631174533, + "grad_norm": 1.265298843383789, + "learning_rate": 3.77012393651386e-05, + "loss": 0.3464, + "step": 9912 + }, + { + "epoch": 0.5441273326015368, + "grad_norm": 1.3059587478637695, + "learning_rate": 3.7696729184633464e-05, + "loss": 0.2366, + "step": 9914 + }, + { + "epoch": 0.5442371020856202, + "grad_norm": 1.6896560192108154, + "learning_rate": 3.769221844718746e-05, + "loss": 0.3058, + "step": 9916 + }, + { + "epoch": 0.5443468715697036, + "grad_norm": 1.5764820575714111, + "learning_rate": 3.7687707152998464e-05, + "loss": 0.3554, + "step": 9918 + }, + { + "epoch": 0.544456641053787, + "grad_norm": 3.813946008682251, + "learning_rate": 3.768319530226435e-05, + "loss": 0.4298, + "step": 9920 + }, + { + "epoch": 0.5445664105378705, + "grad_norm": 1.0986708402633667, + "learning_rate": 3.767868289518305e-05, + "loss": 0.3656, + "step": 9922 + }, + { + "epoch": 0.544676180021954, + "grad_norm": 4.232608795166016, + "learning_rate": 3.767416993195248e-05, + "loss": 0.2689, + "step": 9924 + }, + { + "epoch": 0.5447859495060373, + "grad_norm": 1.604078769683838, + "learning_rate": 3.76696564127706e-05, + "loss": 0.4019, + "step": 9926 + }, + { + "epoch": 0.5448957189901208, + "grad_norm": 1.6595298051834106, + "learning_rate": 3.766514233783541e-05, + "loss": 0.2672, + "step": 9928 + }, + { + "epoch": 0.5450054884742042, + "grad_norm": 2.2850897312164307, + "learning_rate": 3.766062770734492e-05, + "loss": 0.3842, + "step": 9930 + }, + { + "epoch": 0.5451152579582876, + "grad_norm": 1.658396601676941, + "learning_rate": 3.7656112521497145e-05, + "loss": 0.3283, + "step": 9932 + }, + { + "epoch": 0.545225027442371, + "grad_norm": 1.1175343990325928, + "learning_rate": 3.765159678049017e-05, + "loss": 0.2075, + "step": 9934 + }, + { + "epoch": 0.5453347969264545, + "grad_norm": 1.6681948900222778, + "learning_rate": 3.764708048452205e-05, + "loss": 0.3345, + "step": 9936 + }, + { + "epoch": 0.5454445664105378, + "grad_norm": 1.53634512424469, + "learning_rate": 3.764256363379091e-05, + "loss": 0.24, + "step": 9938 + }, + { + "epoch": 0.5455543358946213, + "grad_norm": 2.6318256855010986, + "learning_rate": 3.763804622849487e-05, + "loss": 0.3388, + "step": 9940 + }, + { + "epoch": 0.5456641053787047, + "grad_norm": 1.9342201948165894, + "learning_rate": 3.76335282688321e-05, + "loss": 0.3802, + "step": 9942 + }, + { + "epoch": 0.5457738748627882, + "grad_norm": 1.3076729774475098, + "learning_rate": 3.762900975500076e-05, + "loss": 0.3093, + "step": 9944 + }, + { + "epoch": 0.5458836443468715, + "grad_norm": 2.3090500831604004, + "learning_rate": 3.762449068719907e-05, + "loss": 0.4309, + "step": 9946 + }, + { + "epoch": 0.545993413830955, + "grad_norm": 1.8455116748809814, + "learning_rate": 3.7619971065625256e-05, + "loss": 0.3126, + "step": 9948 + }, + { + "epoch": 0.5461031833150384, + "grad_norm": 1.6781415939331055, + "learning_rate": 3.761545089047757e-05, + "loss": 0.3315, + "step": 9950 + }, + { + "epoch": 0.5462129527991219, + "grad_norm": 2.4552812576293945, + "learning_rate": 3.761093016195428e-05, + "loss": 0.3713, + "step": 9952 + }, + { + "epoch": 0.5463227222832052, + "grad_norm": 1.2872320413589478, + "learning_rate": 3.760640888025371e-05, + "loss": 0.3714, + "step": 9954 + }, + { + "epoch": 0.5464324917672887, + "grad_norm": 1.4316025972366333, + "learning_rate": 3.7601887045574155e-05, + "loss": 0.3423, + "step": 9956 + }, + { + "epoch": 0.5465422612513721, + "grad_norm": 1.7209724187850952, + "learning_rate": 3.759736465811399e-05, + "loss": 0.2954, + "step": 9958 + }, + { + "epoch": 0.5466520307354555, + "grad_norm": 1.1595317125320435, + "learning_rate": 3.759284171807157e-05, + "loss": 0.3317, + "step": 9960 + }, + { + "epoch": 0.546761800219539, + "grad_norm": 1.4945955276489258, + "learning_rate": 3.758831822564531e-05, + "loss": 0.3003, + "step": 9962 + }, + { + "epoch": 0.5468715697036224, + "grad_norm": 2.858394145965576, + "learning_rate": 3.758379418103363e-05, + "loss": 0.1911, + "step": 9964 + }, + { + "epoch": 0.5469813391877059, + "grad_norm": 1.0791171789169312, + "learning_rate": 3.757926958443496e-05, + "loss": 0.2602, + "step": 9966 + }, + { + "epoch": 0.5470911086717892, + "grad_norm": 1.240983009338379, + "learning_rate": 3.7574744436047796e-05, + "loss": 0.2936, + "step": 9968 + }, + { + "epoch": 0.5472008781558727, + "grad_norm": 1.5737571716308594, + "learning_rate": 3.757021873607062e-05, + "loss": 0.2245, + "step": 9970 + }, + { + "epoch": 0.5473106476399561, + "grad_norm": 1.4306422472000122, + "learning_rate": 3.756569248470194e-05, + "loss": 0.2329, + "step": 9972 + }, + { + "epoch": 0.5474204171240395, + "grad_norm": 1.5245237350463867, + "learning_rate": 3.756116568214032e-05, + "loss": 0.3752, + "step": 9974 + }, + { + "epoch": 0.5475301866081229, + "grad_norm": 1.5831350088119507, + "learning_rate": 3.755663832858432e-05, + "loss": 0.3497, + "step": 9976 + }, + { + "epoch": 0.5476399560922064, + "grad_norm": 1.9302034378051758, + "learning_rate": 3.7552110424232525e-05, + "loss": 0.2326, + "step": 9978 + }, + { + "epoch": 0.5477497255762898, + "grad_norm": 1.6669514179229736, + "learning_rate": 3.754758196928356e-05, + "loss": 0.3454, + "step": 9980 + }, + { + "epoch": 0.5478594950603732, + "grad_norm": 1.8781949281692505, + "learning_rate": 3.754305296393606e-05, + "loss": 0.2373, + "step": 9982 + }, + { + "epoch": 0.5479692645444566, + "grad_norm": 1.7781755924224854, + "learning_rate": 3.7538523408388705e-05, + "loss": 0.3182, + "step": 9984 + }, + { + "epoch": 0.5480790340285401, + "grad_norm": 2.9669032096862793, + "learning_rate": 3.7533993302840155e-05, + "loss": 0.4903, + "step": 9986 + }, + { + "epoch": 0.5481888035126234, + "grad_norm": 1.53484308719635, + "learning_rate": 3.752946264748915e-05, + "loss": 0.3511, + "step": 9988 + }, + { + "epoch": 0.5482985729967069, + "grad_norm": 0.9244143962860107, + "learning_rate": 3.7524931442534405e-05, + "loss": 0.2778, + "step": 9990 + }, + { + "epoch": 0.5484083424807903, + "grad_norm": 1.6132107973098755, + "learning_rate": 3.75203996881747e-05, + "loss": 0.278, + "step": 9992 + }, + { + "epoch": 0.5485181119648738, + "grad_norm": 1.6691792011260986, + "learning_rate": 3.75158673846088e-05, + "loss": 0.3179, + "step": 9994 + }, + { + "epoch": 0.5486278814489572, + "grad_norm": 2.8347384929656982, + "learning_rate": 3.751133453203554e-05, + "loss": 0.4266, + "step": 9996 + }, + { + "epoch": 0.5487376509330406, + "grad_norm": 1.5569430589675903, + "learning_rate": 3.750680113065372e-05, + "loss": 0.2434, + "step": 9998 + }, + { + "epoch": 0.5488474204171241, + "grad_norm": 1.9864038228988647, + "learning_rate": 3.750226718066223e-05, + "loss": 0.3014, + "step": 10000 + }, + { + "epoch": 0.5489571899012075, + "grad_norm": 1.9493814706802368, + "learning_rate": 3.749773268225993e-05, + "loss": 0.3231, + "step": 10002 + }, + { + "epoch": 0.5490669593852909, + "grad_norm": 2.308380603790283, + "learning_rate": 3.749319763564574e-05, + "loss": 0.3448, + "step": 10004 + }, + { + "epoch": 0.5491767288693743, + "grad_norm": 1.1357643604278564, + "learning_rate": 3.7488662041018575e-05, + "loss": 0.2681, + "step": 10006 + }, + { + "epoch": 0.5492864983534578, + "grad_norm": 1.9485307931900024, + "learning_rate": 3.748412589857739e-05, + "loss": 0.4263, + "step": 10008 + }, + { + "epoch": 0.5493962678375411, + "grad_norm": 1.4070276021957397, + "learning_rate": 3.7479589208521185e-05, + "loss": 0.3965, + "step": 10010 + }, + { + "epoch": 0.5495060373216246, + "grad_norm": 1.287886381149292, + "learning_rate": 3.747505197104893e-05, + "loss": 0.2592, + "step": 10012 + }, + { + "epoch": 0.549615806805708, + "grad_norm": 1.5886528491973877, + "learning_rate": 3.747051418635966e-05, + "loss": 0.2936, + "step": 10014 + }, + { + "epoch": 0.5497255762897915, + "grad_norm": 1.5307685136795044, + "learning_rate": 3.746597585465244e-05, + "loss": 0.2569, + "step": 10016 + }, + { + "epoch": 0.5498353457738748, + "grad_norm": 1.693480372428894, + "learning_rate": 3.7461436976126324e-05, + "loss": 0.378, + "step": 10018 + }, + { + "epoch": 0.5499451152579583, + "grad_norm": 1.3185675144195557, + "learning_rate": 3.745689755098042e-05, + "loss": 0.2397, + "step": 10020 + }, + { + "epoch": 0.5500548847420417, + "grad_norm": 1.1780710220336914, + "learning_rate": 3.7452357579413846e-05, + "loss": 0.1978, + "step": 10022 + }, + { + "epoch": 0.5501646542261251, + "grad_norm": 1.0254182815551758, + "learning_rate": 3.744781706162576e-05, + "loss": 0.3224, + "step": 10024 + }, + { + "epoch": 0.5502744237102085, + "grad_norm": 1.1904724836349487, + "learning_rate": 3.744327599781531e-05, + "loss": 0.2676, + "step": 10026 + }, + { + "epoch": 0.550384193194292, + "grad_norm": 1.5343797206878662, + "learning_rate": 3.74387343881817e-05, + "loss": 0.2269, + "step": 10028 + }, + { + "epoch": 0.5504939626783754, + "grad_norm": 2.1042678356170654, + "learning_rate": 3.7434192232924146e-05, + "loss": 0.2931, + "step": 10030 + }, + { + "epoch": 0.5506037321624588, + "grad_norm": 4.125793933868408, + "learning_rate": 3.742964953224189e-05, + "loss": 0.331, + "step": 10032 + }, + { + "epoch": 0.5507135016465423, + "grad_norm": 1.6200474500656128, + "learning_rate": 3.742510628633421e-05, + "loss": 0.2876, + "step": 10034 + }, + { + "epoch": 0.5508232711306257, + "grad_norm": 1.692918062210083, + "learning_rate": 3.742056249540036e-05, + "loss": 0.3114, + "step": 10036 + }, + { + "epoch": 0.5509330406147092, + "grad_norm": 2.490173816680908, + "learning_rate": 3.7416018159639695e-05, + "loss": 0.4089, + "step": 10038 + }, + { + "epoch": 0.5510428100987925, + "grad_norm": 1.5381406545639038, + "learning_rate": 3.741147327925152e-05, + "loss": 0.3152, + "step": 10040 + }, + { + "epoch": 0.551152579582876, + "grad_norm": 1.813635230064392, + "learning_rate": 3.740692785443521e-05, + "loss": 0.3313, + "step": 10042 + }, + { + "epoch": 0.5512623490669594, + "grad_norm": 2.303767681121826, + "learning_rate": 3.740238188539015e-05, + "loss": 0.4272, + "step": 10044 + }, + { + "epoch": 0.5513721185510428, + "grad_norm": 2.4977035522460938, + "learning_rate": 3.739783537231575e-05, + "loss": 0.415, + "step": 10046 + }, + { + "epoch": 0.5514818880351262, + "grad_norm": 1.1728566884994507, + "learning_rate": 3.7393288315411434e-05, + "loss": 0.2019, + "step": 10048 + }, + { + "epoch": 0.5515916575192097, + "grad_norm": 1.2750227451324463, + "learning_rate": 3.738874071487666e-05, + "loss": 0.1905, + "step": 10050 + }, + { + "epoch": 0.551701427003293, + "grad_norm": 2.2914183139801025, + "learning_rate": 3.738419257091091e-05, + "loss": 0.3098, + "step": 10052 + }, + { + "epoch": 0.5518111964873765, + "grad_norm": 1.9636144638061523, + "learning_rate": 3.73796438837137e-05, + "loss": 0.3134, + "step": 10054 + }, + { + "epoch": 0.5519209659714599, + "grad_norm": 1.899093508720398, + "learning_rate": 3.7375094653484534e-05, + "loss": 0.3329, + "step": 10056 + }, + { + "epoch": 0.5520307354555434, + "grad_norm": 2.4118826389312744, + "learning_rate": 3.7370544880422984e-05, + "loss": 0.2832, + "step": 10058 + }, + { + "epoch": 0.5521405049396267, + "grad_norm": 2.1186892986297607, + "learning_rate": 3.7365994564728614e-05, + "loss": 0.3356, + "step": 10060 + }, + { + "epoch": 0.5522502744237102, + "grad_norm": 1.299689769744873, + "learning_rate": 3.7361443706601026e-05, + "loss": 0.267, + "step": 10062 + }, + { + "epoch": 0.5523600439077936, + "grad_norm": 1.6367477178573608, + "learning_rate": 3.735689230623984e-05, + "loss": 0.336, + "step": 10064 + }, + { + "epoch": 0.5524698133918771, + "grad_norm": 3.0997917652130127, + "learning_rate": 3.7352340363844704e-05, + "loss": 0.3956, + "step": 10066 + }, + { + "epoch": 0.5525795828759604, + "grad_norm": 1.6784262657165527, + "learning_rate": 3.73477878796153e-05, + "loss": 0.2567, + "step": 10068 + }, + { + "epoch": 0.5526893523600439, + "grad_norm": 1.4507273435592651, + "learning_rate": 3.734323485375131e-05, + "loss": 0.3788, + "step": 10070 + }, + { + "epoch": 0.5527991218441274, + "grad_norm": 1.1814583539962769, + "learning_rate": 3.733868128645246e-05, + "loss": 0.2849, + "step": 10072 + }, + { + "epoch": 0.5529088913282107, + "grad_norm": 1.6554149389266968, + "learning_rate": 3.7334127177918484e-05, + "loss": 0.2816, + "step": 10074 + }, + { + "epoch": 0.5530186608122942, + "grad_norm": 0.720962405204773, + "learning_rate": 3.7329572528349146e-05, + "loss": 0.1774, + "step": 10076 + }, + { + "epoch": 0.5531284302963776, + "grad_norm": 1.247029185295105, + "learning_rate": 3.732501733794424e-05, + "loss": 0.3107, + "step": 10078 + }, + { + "epoch": 0.5532381997804611, + "grad_norm": 1.6097946166992188, + "learning_rate": 3.7320461606903575e-05, + "loss": 0.2371, + "step": 10080 + }, + { + "epoch": 0.5533479692645444, + "grad_norm": 1.1615594625473022, + "learning_rate": 3.7315905335427005e-05, + "loss": 0.2575, + "step": 10082 + }, + { + "epoch": 0.5534577387486279, + "grad_norm": 1.378731369972229, + "learning_rate": 3.731134852371436e-05, + "loss": 0.347, + "step": 10084 + }, + { + "epoch": 0.5535675082327113, + "grad_norm": 2.1828510761260986, + "learning_rate": 3.730679117196556e-05, + "loss": 0.1826, + "step": 10086 + }, + { + "epoch": 0.5536772777167948, + "grad_norm": 1.823170781135559, + "learning_rate": 3.730223328038048e-05, + "loss": 0.323, + "step": 10088 + }, + { + "epoch": 0.5537870472008781, + "grad_norm": 1.3627500534057617, + "learning_rate": 3.729767484915907e-05, + "loss": 0.3068, + "step": 10090 + }, + { + "epoch": 0.5538968166849616, + "grad_norm": 1.4135268926620483, + "learning_rate": 3.729311587850128e-05, + "loss": 0.2971, + "step": 10092 + }, + { + "epoch": 0.554006586169045, + "grad_norm": 1.395878791809082, + "learning_rate": 3.7288556368607096e-05, + "loss": 0.2005, + "step": 10094 + }, + { + "epoch": 0.5541163556531284, + "grad_norm": 1.8200788497924805, + "learning_rate": 3.728399631967651e-05, + "loss": 0.4721, + "step": 10096 + }, + { + "epoch": 0.5542261251372118, + "grad_norm": 2.3975937366485596, + "learning_rate": 3.7279435731909545e-05, + "loss": 0.328, + "step": 10098 + }, + { + "epoch": 0.5543358946212953, + "grad_norm": 1.8111355304718018, + "learning_rate": 3.727487460550626e-05, + "loss": 0.3348, + "step": 10100 + }, + { + "epoch": 0.5544456641053787, + "grad_norm": 3.2637712955474854, + "learning_rate": 3.7270312940666735e-05, + "loss": 0.3934, + "step": 10102 + }, + { + "epoch": 0.5545554335894621, + "grad_norm": 1.544808030128479, + "learning_rate": 3.726575073759105e-05, + "loss": 0.2547, + "step": 10104 + }, + { + "epoch": 0.5546652030735456, + "grad_norm": 2.057774066925049, + "learning_rate": 3.7261187996479333e-05, + "loss": 0.3433, + "step": 10106 + }, + { + "epoch": 0.554774972557629, + "grad_norm": 1.4933022260665894, + "learning_rate": 3.725662471753174e-05, + "loss": 0.3518, + "step": 10108 + }, + { + "epoch": 0.5548847420417125, + "grad_norm": 1.557885766029358, + "learning_rate": 3.725206090094841e-05, + "loss": 0.3979, + "step": 10110 + }, + { + "epoch": 0.5549945115257958, + "grad_norm": 1.92600679397583, + "learning_rate": 3.724749654692956e-05, + "loss": 0.389, + "step": 10112 + }, + { + "epoch": 0.5551042810098793, + "grad_norm": 1.3882415294647217, + "learning_rate": 3.7242931655675404e-05, + "loss": 0.2562, + "step": 10114 + }, + { + "epoch": 0.5552140504939627, + "grad_norm": 2.3379030227661133, + "learning_rate": 3.723836622738617e-05, + "loss": 0.2296, + "step": 10116 + }, + { + "epoch": 0.5553238199780461, + "grad_norm": 1.9395649433135986, + "learning_rate": 3.723380026226212e-05, + "loss": 0.2604, + "step": 10118 + }, + { + "epoch": 0.5554335894621295, + "grad_norm": 1.788201093673706, + "learning_rate": 3.7229233760503554e-05, + "loss": 0.2654, + "step": 10120 + }, + { + "epoch": 0.555543358946213, + "grad_norm": 1.6636691093444824, + "learning_rate": 3.722466672231076e-05, + "loss": 0.3579, + "step": 10122 + }, + { + "epoch": 0.5556531284302964, + "grad_norm": 2.1920595169067383, + "learning_rate": 3.722009914788408e-05, + "loss": 0.3139, + "step": 10124 + }, + { + "epoch": 0.5557628979143798, + "grad_norm": 1.625218391418457, + "learning_rate": 3.721553103742388e-05, + "loss": 0.3688, + "step": 10126 + }, + { + "epoch": 0.5558726673984632, + "grad_norm": 2.145080089569092, + "learning_rate": 3.7210962391130524e-05, + "loss": 0.3432, + "step": 10128 + }, + { + "epoch": 0.5559824368825467, + "grad_norm": 2.20579195022583, + "learning_rate": 3.720639320920442e-05, + "loss": 0.3433, + "step": 10130 + }, + { + "epoch": 0.55609220636663, + "grad_norm": 1.9358534812927246, + "learning_rate": 3.720182349184601e-05, + "loss": 0.3298, + "step": 10132 + }, + { + "epoch": 0.5562019758507135, + "grad_norm": 2.587120771408081, + "learning_rate": 3.719725323925573e-05, + "loss": 0.485, + "step": 10134 + }, + { + "epoch": 0.5563117453347969, + "grad_norm": 2.074402332305908, + "learning_rate": 3.719268245163404e-05, + "loss": 0.4276, + "step": 10136 + }, + { + "epoch": 0.5564215148188804, + "grad_norm": 1.8710741996765137, + "learning_rate": 3.718811112918147e-05, + "loss": 0.3339, + "step": 10138 + }, + { + "epoch": 0.5565312843029637, + "grad_norm": 1.7553151845932007, + "learning_rate": 3.7183539272098507e-05, + "loss": 0.2543, + "step": 10140 + }, + { + "epoch": 0.5566410537870472, + "grad_norm": 4.745977401733398, + "learning_rate": 3.717896688058572e-05, + "loss": 0.3828, + "step": 10142 + }, + { + "epoch": 0.5567508232711307, + "grad_norm": 1.9225990772247314, + "learning_rate": 3.7174393954843675e-05, + "loss": 0.2, + "step": 10144 + }, + { + "epoch": 0.556860592755214, + "grad_norm": 1.420976161956787, + "learning_rate": 3.7169820495072935e-05, + "loss": 0.2514, + "step": 10146 + }, + { + "epoch": 0.5569703622392975, + "grad_norm": 1.5325952768325806, + "learning_rate": 3.7165246501474154e-05, + "loss": 0.3133, + "step": 10148 + }, + { + "epoch": 0.5570801317233809, + "grad_norm": 1.4868677854537964, + "learning_rate": 3.716067197424795e-05, + "loss": 0.3083, + "step": 10150 + }, + { + "epoch": 0.5571899012074644, + "grad_norm": 1.2642275094985962, + "learning_rate": 3.7156096913594975e-05, + "loss": 0.1613, + "step": 10152 + }, + { + "epoch": 0.5572996706915477, + "grad_norm": 1.9308397769927979, + "learning_rate": 3.715152131971593e-05, + "loss": 0.2733, + "step": 10154 + }, + { + "epoch": 0.5574094401756312, + "grad_norm": 2.0733213424682617, + "learning_rate": 3.714694519281152e-05, + "loss": 0.2883, + "step": 10156 + }, + { + "epoch": 0.5575192096597146, + "grad_norm": 2.2088723182678223, + "learning_rate": 3.714236853308246e-05, + "loss": 0.2555, + "step": 10158 + }, + { + "epoch": 0.557628979143798, + "grad_norm": 1.8357654809951782, + "learning_rate": 3.713779134072953e-05, + "loss": 0.4062, + "step": 10160 + }, + { + "epoch": 0.5577387486278814, + "grad_norm": 1.4635850191116333, + "learning_rate": 3.71332136159535e-05, + "loss": 0.3433, + "step": 10162 + }, + { + "epoch": 0.5578485181119649, + "grad_norm": 1.3032190799713135, + "learning_rate": 3.712863535895515e-05, + "loss": 0.2858, + "step": 10164 + }, + { + "epoch": 0.5579582875960483, + "grad_norm": 1.6371628046035767, + "learning_rate": 3.712405656993534e-05, + "loss": 0.2372, + "step": 10166 + }, + { + "epoch": 0.5580680570801317, + "grad_norm": 2.5635268688201904, + "learning_rate": 3.711947724909489e-05, + "loss": 0.3864, + "step": 10168 + }, + { + "epoch": 0.5581778265642151, + "grad_norm": 0.9365130662918091, + "learning_rate": 3.711489739663468e-05, + "loss": 0.3284, + "step": 10170 + }, + { + "epoch": 0.5582875960482986, + "grad_norm": 1.183703899383545, + "learning_rate": 3.7110317012755614e-05, + "loss": 0.1568, + "step": 10172 + }, + { + "epoch": 0.558397365532382, + "grad_norm": 3.841623306274414, + "learning_rate": 3.710573609765861e-05, + "loss": 0.2777, + "step": 10174 + }, + { + "epoch": 0.5585071350164654, + "grad_norm": 1.9055920839309692, + "learning_rate": 3.7101154651544584e-05, + "loss": 0.3782, + "step": 10176 + }, + { + "epoch": 0.5586169045005488, + "grad_norm": 2.066896915435791, + "learning_rate": 3.709657267461453e-05, + "loss": 0.4012, + "step": 10178 + }, + { + "epoch": 0.5587266739846323, + "grad_norm": 1.092298984527588, + "learning_rate": 3.7091990167069425e-05, + "loss": 0.2443, + "step": 10180 + }, + { + "epoch": 0.5588364434687157, + "grad_norm": 1.3950092792510986, + "learning_rate": 3.708740712911028e-05, + "loss": 0.3005, + "step": 10182 + }, + { + "epoch": 0.5589462129527991, + "grad_norm": 1.2946093082427979, + "learning_rate": 3.708282356093812e-05, + "loss": 0.2904, + "step": 10184 + }, + { + "epoch": 0.5590559824368826, + "grad_norm": 2.166203498840332, + "learning_rate": 3.707823946275402e-05, + "loss": 0.2731, + "step": 10186 + }, + { + "epoch": 0.559165751920966, + "grad_norm": 1.2870736122131348, + "learning_rate": 3.707365483475906e-05, + "loss": 0.2967, + "step": 10188 + }, + { + "epoch": 0.5592755214050494, + "grad_norm": 1.7003107070922852, + "learning_rate": 3.7069069677154326e-05, + "loss": 0.2931, + "step": 10190 + }, + { + "epoch": 0.5593852908891328, + "grad_norm": 1.926591396331787, + "learning_rate": 3.706448399014097e-05, + "loss": 0.3145, + "step": 10192 + }, + { + "epoch": 0.5594950603732163, + "grad_norm": 2.2398898601531982, + "learning_rate": 3.705989777392012e-05, + "loss": 0.3608, + "step": 10194 + }, + { + "epoch": 0.5596048298572996, + "grad_norm": 1.8483997583389282, + "learning_rate": 3.705531102869297e-05, + "loss": 0.3366, + "step": 10196 + }, + { + "epoch": 0.5597145993413831, + "grad_norm": 1.6675236225128174, + "learning_rate": 3.70507237546607e-05, + "loss": 0.3401, + "step": 10198 + }, + { + "epoch": 0.5598243688254665, + "grad_norm": 1.2907098531723022, + "learning_rate": 3.704613595202454e-05, + "loss": 0.3264, + "step": 10200 + }, + { + "epoch": 0.55993413830955, + "grad_norm": 1.1518011093139648, + "learning_rate": 3.704154762098572e-05, + "loss": 0.2486, + "step": 10202 + }, + { + "epoch": 0.5600439077936333, + "grad_norm": 1.0731074810028076, + "learning_rate": 3.7036958761745535e-05, + "loss": 0.331, + "step": 10204 + }, + { + "epoch": 0.5601536772777168, + "grad_norm": 3.2339985370635986, + "learning_rate": 3.703236937450525e-05, + "loss": 0.4387, + "step": 10206 + }, + { + "epoch": 0.5602634467618002, + "grad_norm": 1.038610577583313, + "learning_rate": 3.7027779459466186e-05, + "loss": 0.2689, + "step": 10208 + }, + { + "epoch": 0.5603732162458837, + "grad_norm": 1.6525083780288696, + "learning_rate": 3.702318901682968e-05, + "loss": 0.3512, + "step": 10210 + }, + { + "epoch": 0.560482985729967, + "grad_norm": 1.1860029697418213, + "learning_rate": 3.70185980467971e-05, + "loss": 0.2762, + "step": 10212 + }, + { + "epoch": 0.5605927552140505, + "grad_norm": 1.336276888847351, + "learning_rate": 3.70140065495698e-05, + "loss": 0.3381, + "step": 10214 + }, + { + "epoch": 0.5607025246981339, + "grad_norm": 1.284471035003662, + "learning_rate": 3.700941452534922e-05, + "loss": 0.3462, + "step": 10216 + }, + { + "epoch": 0.5608122941822173, + "grad_norm": 1.9381273984909058, + "learning_rate": 3.700482197433677e-05, + "loss": 0.3936, + "step": 10218 + }, + { + "epoch": 0.5609220636663008, + "grad_norm": 1.642443299293518, + "learning_rate": 3.7000228896733895e-05, + "loss": 0.4836, + "step": 10220 + }, + { + "epoch": 0.5610318331503842, + "grad_norm": 1.1533230543136597, + "learning_rate": 3.6995635292742086e-05, + "loss": 0.2046, + "step": 10222 + }, + { + "epoch": 0.5611416026344677, + "grad_norm": 1.2293365001678467, + "learning_rate": 3.699104116256284e-05, + "loss": 0.2017, + "step": 10224 + }, + { + "epoch": 0.561251372118551, + "grad_norm": 2.026479721069336, + "learning_rate": 3.6986446506397666e-05, + "loss": 0.2956, + "step": 10226 + }, + { + "epoch": 0.5613611416026345, + "grad_norm": 1.6336941719055176, + "learning_rate": 3.698185132444812e-05, + "loss": 0.1983, + "step": 10228 + }, + { + "epoch": 0.5614709110867179, + "grad_norm": 1.3698687553405762, + "learning_rate": 3.6977255616915765e-05, + "loss": 0.3094, + "step": 10230 + }, + { + "epoch": 0.5615806805708013, + "grad_norm": 1.8268946409225464, + "learning_rate": 3.6972659384002184e-05, + "loss": 0.3038, + "step": 10232 + }, + { + "epoch": 0.5616904500548847, + "grad_norm": 1.9331929683685303, + "learning_rate": 3.6968062625909005e-05, + "loss": 0.2887, + "step": 10234 + }, + { + "epoch": 0.5618002195389682, + "grad_norm": 1.9507474899291992, + "learning_rate": 3.6963465342837856e-05, + "loss": 0.3727, + "step": 10236 + }, + { + "epoch": 0.5619099890230516, + "grad_norm": 1.5419909954071045, + "learning_rate": 3.695886753499039e-05, + "loss": 0.2991, + "step": 10238 + }, + { + "epoch": 0.562019758507135, + "grad_norm": 2.0670626163482666, + "learning_rate": 3.6954269202568303e-05, + "loss": 0.3757, + "step": 10240 + }, + { + "epoch": 0.5621295279912184, + "grad_norm": 1.7246425151824951, + "learning_rate": 3.69496703457733e-05, + "loss": 0.2538, + "step": 10242 + }, + { + "epoch": 0.5622392974753019, + "grad_norm": 1.1201452016830444, + "learning_rate": 3.694507096480709e-05, + "loss": 0.2262, + "step": 10244 + }, + { + "epoch": 0.5623490669593852, + "grad_norm": 1.0824987888336182, + "learning_rate": 3.694047105987144e-05, + "loss": 0.1637, + "step": 10246 + }, + { + "epoch": 0.5624588364434687, + "grad_norm": 1.4710360765457153, + "learning_rate": 3.693587063116812e-05, + "loss": 0.2826, + "step": 10248 + }, + { + "epoch": 0.5625686059275521, + "grad_norm": 2.1985089778900146, + "learning_rate": 3.693126967889894e-05, + "loss": 0.2872, + "step": 10250 + }, + { + "epoch": 0.5626783754116356, + "grad_norm": 1.5517609119415283, + "learning_rate": 3.69266682032657e-05, + "loss": 0.3087, + "step": 10252 + }, + { + "epoch": 0.562788144895719, + "grad_norm": 1.1423028707504272, + "learning_rate": 3.692206620447026e-05, + "loss": 0.3595, + "step": 10254 + }, + { + "epoch": 0.5628979143798024, + "grad_norm": 1.0427488088607788, + "learning_rate": 3.691746368271448e-05, + "loss": 0.2133, + "step": 10256 + }, + { + "epoch": 0.5630076838638859, + "grad_norm": 1.8708428144454956, + "learning_rate": 3.691286063820024e-05, + "loss": 0.2913, + "step": 10258 + }, + { + "epoch": 0.5631174533479693, + "grad_norm": 1.634352445602417, + "learning_rate": 3.690825707112947e-05, + "loss": 0.2752, + "step": 10260 + }, + { + "epoch": 0.5632272228320527, + "grad_norm": 1.385924220085144, + "learning_rate": 3.6903652981704083e-05, + "loss": 0.276, + "step": 10262 + }, + { + "epoch": 0.5633369923161361, + "grad_norm": 1.6190980672836304, + "learning_rate": 3.689904837012606e-05, + "loss": 0.2506, + "step": 10264 + }, + { + "epoch": 0.5634467618002196, + "grad_norm": 1.5377763509750366, + "learning_rate": 3.689444323659737e-05, + "loss": 0.2977, + "step": 10266 + }, + { + "epoch": 0.5635565312843029, + "grad_norm": 1.807294487953186, + "learning_rate": 3.688983758132002e-05, + "loss": 0.2212, + "step": 10268 + }, + { + "epoch": 0.5636663007683864, + "grad_norm": 1.1936908960342407, + "learning_rate": 3.688523140449603e-05, + "loss": 0.2326, + "step": 10270 + }, + { + "epoch": 0.5637760702524698, + "grad_norm": 1.346894383430481, + "learning_rate": 3.688062470632746e-05, + "loss": 0.3016, + "step": 10272 + }, + { + "epoch": 0.5638858397365533, + "grad_norm": 1.1039893627166748, + "learning_rate": 3.6876017487016376e-05, + "loss": 0.3328, + "step": 10274 + }, + { + "epoch": 0.5639956092206366, + "grad_norm": 2.6237475872039795, + "learning_rate": 3.6871409746764865e-05, + "loss": 0.3477, + "step": 10276 + }, + { + "epoch": 0.5641053787047201, + "grad_norm": 1.3953847885131836, + "learning_rate": 3.686680148577506e-05, + "loss": 0.1935, + "step": 10278 + }, + { + "epoch": 0.5642151481888035, + "grad_norm": 1.299443244934082, + "learning_rate": 3.68621927042491e-05, + "loss": 0.3157, + "step": 10280 + }, + { + "epoch": 0.564324917672887, + "grad_norm": 1.9501326084136963, + "learning_rate": 3.685758340238914e-05, + "loss": 0.269, + "step": 10282 + }, + { + "epoch": 0.5644346871569703, + "grad_norm": 1.056076169013977, + "learning_rate": 3.685297358039738e-05, + "loss": 0.2325, + "step": 10284 + }, + { + "epoch": 0.5645444566410538, + "grad_norm": 1.4314608573913574, + "learning_rate": 3.684836323847601e-05, + "loss": 0.3639, + "step": 10286 + }, + { + "epoch": 0.5646542261251372, + "grad_norm": 1.3363789319992065, + "learning_rate": 3.6843752376827275e-05, + "loss": 0.311, + "step": 10288 + }, + { + "epoch": 0.5647639956092206, + "grad_norm": 1.594504952430725, + "learning_rate": 3.683914099565344e-05, + "loss": 0.2217, + "step": 10290 + }, + { + "epoch": 0.5648737650933041, + "grad_norm": 1.3803119659423828, + "learning_rate": 3.683452909515675e-05, + "loss": 0.2531, + "step": 10292 + }, + { + "epoch": 0.5649835345773875, + "grad_norm": 1.6160778999328613, + "learning_rate": 3.682991667553954e-05, + "loss": 0.315, + "step": 10294 + }, + { + "epoch": 0.565093304061471, + "grad_norm": 1.5429577827453613, + "learning_rate": 3.682530373700412e-05, + "loss": 0.2511, + "step": 10296 + }, + { + "epoch": 0.5652030735455543, + "grad_norm": 1.5502166748046875, + "learning_rate": 3.682069027975284e-05, + "loss": 0.3817, + "step": 10298 + }, + { + "epoch": 0.5653128430296378, + "grad_norm": 1.5063519477844238, + "learning_rate": 3.681607630398806e-05, + "loss": 0.2119, + "step": 10300 + }, + { + "epoch": 0.5654226125137212, + "grad_norm": 3.123225212097168, + "learning_rate": 3.6811461809912174e-05, + "loss": 0.4439, + "step": 10302 + }, + { + "epoch": 0.5655323819978046, + "grad_norm": 1.9855310916900635, + "learning_rate": 3.68068467977276e-05, + "loss": 0.3726, + "step": 10304 + }, + { + "epoch": 0.565642151481888, + "grad_norm": 1.0097047090530396, + "learning_rate": 3.680223126763677e-05, + "loss": 0.277, + "step": 10306 + }, + { + "epoch": 0.5657519209659715, + "grad_norm": 2.1315970420837402, + "learning_rate": 3.679761521984216e-05, + "loss": 0.2612, + "step": 10308 + }, + { + "epoch": 0.5658616904500549, + "grad_norm": 1.3067435026168823, + "learning_rate": 3.679299865454623e-05, + "loss": 0.2075, + "step": 10310 + }, + { + "epoch": 0.5659714599341383, + "grad_norm": 1.2922331094741821, + "learning_rate": 3.67883815719515e-05, + "loss": 0.2802, + "step": 10312 + }, + { + "epoch": 0.5660812294182217, + "grad_norm": 1.6338638067245483, + "learning_rate": 3.678376397226049e-05, + "loss": 0.239, + "step": 10314 + }, + { + "epoch": 0.5661909989023052, + "grad_norm": 1.8944560289382935, + "learning_rate": 3.6779145855675765e-05, + "loss": 0.2948, + "step": 10316 + }, + { + "epoch": 0.5663007683863885, + "grad_norm": 3.078016757965088, + "learning_rate": 3.6774527222399877e-05, + "loss": 0.3167, + "step": 10318 + }, + { + "epoch": 0.566410537870472, + "grad_norm": 1.6954134702682495, + "learning_rate": 3.676990807263544e-05, + "loss": 0.2716, + "step": 10320 + }, + { + "epoch": 0.5665203073545554, + "grad_norm": 1.8805980682373047, + "learning_rate": 3.676528840658505e-05, + "loss": 0.2866, + "step": 10322 + }, + { + "epoch": 0.5666300768386389, + "grad_norm": 1.965874195098877, + "learning_rate": 3.6760668224451365e-05, + "loss": 0.1942, + "step": 10324 + }, + { + "epoch": 0.5667398463227222, + "grad_norm": 1.6443125009536743, + "learning_rate": 3.675604752643706e-05, + "loss": 0.3291, + "step": 10326 + }, + { + "epoch": 0.5668496158068057, + "grad_norm": 1.7106873989105225, + "learning_rate": 3.6751426312744804e-05, + "loss": 0.2337, + "step": 10328 + }, + { + "epoch": 0.5669593852908892, + "grad_norm": 1.0103145837783813, + "learning_rate": 3.6746804583577304e-05, + "loss": 0.3109, + "step": 10330 + }, + { + "epoch": 0.5670691547749726, + "grad_norm": 1.2862404584884644, + "learning_rate": 3.674218233913731e-05, + "loss": 0.3699, + "step": 10332 + }, + { + "epoch": 0.567178924259056, + "grad_norm": 1.174534559249878, + "learning_rate": 3.673755957962756e-05, + "loss": 0.263, + "step": 10334 + }, + { + "epoch": 0.5672886937431394, + "grad_norm": 1.8783988952636719, + "learning_rate": 3.673293630525083e-05, + "loss": 0.3989, + "step": 10336 + }, + { + "epoch": 0.5673984632272229, + "grad_norm": 1.7180575132369995, + "learning_rate": 3.672831251620992e-05, + "loss": 0.2832, + "step": 10338 + }, + { + "epoch": 0.5675082327113062, + "grad_norm": 2.1038639545440674, + "learning_rate": 3.672368821270767e-05, + "loss": 0.2279, + "step": 10340 + }, + { + "epoch": 0.5676180021953897, + "grad_norm": 1.9664634466171265, + "learning_rate": 3.6719063394946904e-05, + "loss": 0.3467, + "step": 10342 + }, + { + "epoch": 0.5677277716794731, + "grad_norm": 2.2725396156311035, + "learning_rate": 3.67144380631305e-05, + "loss": 0.2285, + "step": 10344 + }, + { + "epoch": 0.5678375411635566, + "grad_norm": 2.8727962970733643, + "learning_rate": 3.670981221746135e-05, + "loss": 0.3104, + "step": 10346 + }, + { + "epoch": 0.5679473106476399, + "grad_norm": 1.8909094333648682, + "learning_rate": 3.670518585814235e-05, + "loss": 0.2917, + "step": 10348 + }, + { + "epoch": 0.5680570801317234, + "grad_norm": 2.111898183822632, + "learning_rate": 3.670055898537646e-05, + "loss": 0.325, + "step": 10350 + }, + { + "epoch": 0.5681668496158068, + "grad_norm": 2.0597236156463623, + "learning_rate": 3.6695931599366606e-05, + "loss": 0.3148, + "step": 10352 + }, + { + "epoch": 0.5682766190998902, + "grad_norm": 1.9929205179214478, + "learning_rate": 3.6691303700315796e-05, + "loss": 0.2538, + "step": 10354 + }, + { + "epoch": 0.5683863885839736, + "grad_norm": 1.659684658050537, + "learning_rate": 3.668667528842702e-05, + "loss": 0.2614, + "step": 10356 + }, + { + "epoch": 0.5684961580680571, + "grad_norm": 1.0864266157150269, + "learning_rate": 3.66820463639033e-05, + "loss": 0.3281, + "step": 10358 + }, + { + "epoch": 0.5686059275521405, + "grad_norm": 1.0454638004302979, + "learning_rate": 3.6677416926947686e-05, + "loss": 0.2492, + "step": 10360 + }, + { + "epoch": 0.5687156970362239, + "grad_norm": 3.289034128189087, + "learning_rate": 3.667278697776326e-05, + "loss": 0.3501, + "step": 10362 + }, + { + "epoch": 0.5688254665203073, + "grad_norm": 2.5359256267547607, + "learning_rate": 3.66681565165531e-05, + "loss": 0.2789, + "step": 10364 + }, + { + "epoch": 0.5689352360043908, + "grad_norm": 4.291749000549316, + "learning_rate": 3.666352554352032e-05, + "loss": 0.4048, + "step": 10366 + }, + { + "epoch": 0.5690450054884743, + "grad_norm": 2.4387826919555664, + "learning_rate": 3.6658894058868064e-05, + "loss": 0.434, + "step": 10368 + }, + { + "epoch": 0.5691547749725576, + "grad_norm": 1.6336179971694946, + "learning_rate": 3.6654262062799485e-05, + "loss": 0.3169, + "step": 10370 + }, + { + "epoch": 0.5692645444566411, + "grad_norm": 1.4833683967590332, + "learning_rate": 3.664962955551778e-05, + "loss": 0.332, + "step": 10372 + }, + { + "epoch": 0.5693743139407245, + "grad_norm": 2.0973150730133057, + "learning_rate": 3.6644996537226135e-05, + "loss": 0.3333, + "step": 10374 + }, + { + "epoch": 0.5694840834248079, + "grad_norm": 1.9888361692428589, + "learning_rate": 3.6640363008127784e-05, + "loss": 0.2184, + "step": 10376 + }, + { + "epoch": 0.5695938529088913, + "grad_norm": 1.8965221643447876, + "learning_rate": 3.6635728968425985e-05, + "loss": 0.3322, + "step": 10378 + }, + { + "epoch": 0.5697036223929748, + "grad_norm": 2.701916217803955, + "learning_rate": 3.6631094418324e-05, + "loss": 0.3329, + "step": 10380 + }, + { + "epoch": 0.5698133918770582, + "grad_norm": 1.8369436264038086, + "learning_rate": 3.662645935802512e-05, + "loss": 0.3606, + "step": 10382 + }, + { + "epoch": 0.5699231613611416, + "grad_norm": 1.4277747869491577, + "learning_rate": 3.662182378773267e-05, + "loss": 0.3873, + "step": 10384 + }, + { + "epoch": 0.570032930845225, + "grad_norm": 1.204973578453064, + "learning_rate": 3.661718770764998e-05, + "loss": 0.1945, + "step": 10386 + }, + { + "epoch": 0.5701427003293085, + "grad_norm": 1.2337729930877686, + "learning_rate": 3.661255111798042e-05, + "loss": 0.211, + "step": 10388 + }, + { + "epoch": 0.5702524698133918, + "grad_norm": 1.2936077117919922, + "learning_rate": 3.6607914018927375e-05, + "loss": 0.2578, + "step": 10390 + }, + { + "epoch": 0.5703622392974753, + "grad_norm": 1.8418461084365845, + "learning_rate": 3.6603276410694235e-05, + "loss": 0.2624, + "step": 10392 + }, + { + "epoch": 0.5704720087815587, + "grad_norm": 1.4875887632369995, + "learning_rate": 3.659863829348446e-05, + "loss": 0.2714, + "step": 10394 + }, + { + "epoch": 0.5705817782656422, + "grad_norm": 1.8450158834457397, + "learning_rate": 3.6593999667501454e-05, + "loss": 0.3143, + "step": 10396 + }, + { + "epoch": 0.5706915477497255, + "grad_norm": 1.877677083015442, + "learning_rate": 3.658936053294872e-05, + "loss": 0.3256, + "step": 10398 + }, + { + "epoch": 0.570801317233809, + "grad_norm": 1.3323769569396973, + "learning_rate": 3.6584720890029766e-05, + "loss": 0.3141, + "step": 10400 + }, + { + "epoch": 0.5709110867178925, + "grad_norm": 1.6112066507339478, + "learning_rate": 3.6580080738948085e-05, + "loss": 0.3029, + "step": 10402 + }, + { + "epoch": 0.5710208562019758, + "grad_norm": 1.6535720825195312, + "learning_rate": 3.6575440079907216e-05, + "loss": 0.3508, + "step": 10404 + }, + { + "epoch": 0.5711306256860593, + "grad_norm": 1.5342271327972412, + "learning_rate": 3.6570798913110747e-05, + "loss": 0.2149, + "step": 10406 + }, + { + "epoch": 0.5712403951701427, + "grad_norm": 1.8431024551391602, + "learning_rate": 3.6566157238762235e-05, + "loss": 0.284, + "step": 10408 + }, + { + "epoch": 0.5713501646542262, + "grad_norm": 1.6032377481460571, + "learning_rate": 3.6561515057065294e-05, + "loss": 0.367, + "step": 10410 + }, + { + "epoch": 0.5714599341383095, + "grad_norm": 1.1936300992965698, + "learning_rate": 3.655687236822356e-05, + "loss": 0.1686, + "step": 10412 + }, + { + "epoch": 0.571569703622393, + "grad_norm": 2.431756019592285, + "learning_rate": 3.655222917244068e-05, + "loss": 0.449, + "step": 10414 + }, + { + "epoch": 0.5716794731064764, + "grad_norm": 1.7412084341049194, + "learning_rate": 3.6547585469920314e-05, + "loss": 0.2562, + "step": 10416 + }, + { + "epoch": 0.5717892425905599, + "grad_norm": 1.4174740314483643, + "learning_rate": 3.654294126086618e-05, + "loss": 0.4038, + "step": 10418 + }, + { + "epoch": 0.5718990120746432, + "grad_norm": 3.0959465503692627, + "learning_rate": 3.653829654548199e-05, + "loss": 0.2432, + "step": 10420 + }, + { + "epoch": 0.5720087815587267, + "grad_norm": 1.910365343093872, + "learning_rate": 3.653365132397147e-05, + "loss": 0.3642, + "step": 10422 + }, + { + "epoch": 0.5721185510428101, + "grad_norm": 1.4282461404800415, + "learning_rate": 3.6529005596538405e-05, + "loss": 0.2853, + "step": 10424 + }, + { + "epoch": 0.5722283205268935, + "grad_norm": 2.6762359142303467, + "learning_rate": 3.652435936338656e-05, + "loss": 0.3546, + "step": 10426 + }, + { + "epoch": 0.5723380900109769, + "grad_norm": 2.095367431640625, + "learning_rate": 3.6519712624719746e-05, + "loss": 0.2517, + "step": 10428 + }, + { + "epoch": 0.5724478594950604, + "grad_norm": 3.3398611545562744, + "learning_rate": 3.65150653807418e-05, + "loss": 0.3559, + "step": 10430 + }, + { + "epoch": 0.5725576289791438, + "grad_norm": 1.1411453485488892, + "learning_rate": 3.651041763165657e-05, + "loss": 0.3461, + "step": 10432 + }, + { + "epoch": 0.5726673984632272, + "grad_norm": 1.6416760683059692, + "learning_rate": 3.6505769377667916e-05, + "loss": 0.3795, + "step": 10434 + }, + { + "epoch": 0.5727771679473106, + "grad_norm": 2.094444751739502, + "learning_rate": 3.650112061897975e-05, + "loss": 0.3102, + "step": 10436 + }, + { + "epoch": 0.5728869374313941, + "grad_norm": 1.6099379062652588, + "learning_rate": 3.649647135579598e-05, + "loss": 0.2742, + "step": 10438 + }, + { + "epoch": 0.5729967069154775, + "grad_norm": 2.2429256439208984, + "learning_rate": 3.649182158832055e-05, + "loss": 0.4247, + "step": 10440 + }, + { + "epoch": 0.5731064763995609, + "grad_norm": 1.6577657461166382, + "learning_rate": 3.648717131675742e-05, + "loss": 0.3031, + "step": 10442 + }, + { + "epoch": 0.5732162458836444, + "grad_norm": 1.6362977027893066, + "learning_rate": 3.648252054131057e-05, + "loss": 0.2709, + "step": 10444 + }, + { + "epoch": 0.5733260153677278, + "grad_norm": 1.3398879766464233, + "learning_rate": 3.647786926218401e-05, + "loss": 0.3127, + "step": 10446 + }, + { + "epoch": 0.5734357848518112, + "grad_norm": 1.4923094511032104, + "learning_rate": 3.6473217479581776e-05, + "loss": 0.3517, + "step": 10448 + }, + { + "epoch": 0.5735455543358946, + "grad_norm": 2.147784948348999, + "learning_rate": 3.6468565193707906e-05, + "loss": 0.4042, + "step": 10450 + }, + { + "epoch": 0.5736553238199781, + "grad_norm": 1.3855595588684082, + "learning_rate": 3.646391240476647e-05, + "loss": 0.2942, + "step": 10452 + }, + { + "epoch": 0.5737650933040614, + "grad_norm": 1.3310973644256592, + "learning_rate": 3.6459259112961574e-05, + "loss": 0.3465, + "step": 10454 + }, + { + "epoch": 0.5738748627881449, + "grad_norm": 1.3449376821517944, + "learning_rate": 3.6454605318497326e-05, + "loss": 0.2596, + "step": 10456 + }, + { + "epoch": 0.5739846322722283, + "grad_norm": 2.5882742404937744, + "learning_rate": 3.6449951021577864e-05, + "loss": 0.282, + "step": 10458 + }, + { + "epoch": 0.5740944017563118, + "grad_norm": 1.5640093088150024, + "learning_rate": 3.644529622240734e-05, + "loss": 0.3798, + "step": 10460 + }, + { + "epoch": 0.5742041712403951, + "grad_norm": 1.305513858795166, + "learning_rate": 3.6440640921189964e-05, + "loss": 0.306, + "step": 10462 + }, + { + "epoch": 0.5743139407244786, + "grad_norm": 2.3778839111328125, + "learning_rate": 3.643598511812992e-05, + "loss": 0.2856, + "step": 10464 + }, + { + "epoch": 0.574423710208562, + "grad_norm": 1.6504653692245483, + "learning_rate": 3.6431328813431436e-05, + "loss": 0.3324, + "step": 10466 + }, + { + "epoch": 0.5745334796926455, + "grad_norm": 2.166513442993164, + "learning_rate": 3.642667200729876e-05, + "loss": 0.2775, + "step": 10468 + }, + { + "epoch": 0.5746432491767288, + "grad_norm": Infinity, + "learning_rate": 3.642434341625844e-05, + "loss": 0.4479, + "step": 10470 + }, + { + "epoch": 0.5747530186608123, + "grad_norm": 1.2439329624176025, + "learning_rate": 3.641968585835749e-05, + "loss": 0.3103, + "step": 10472 + }, + { + "epoch": 0.5748627881448957, + "grad_norm": 1.1988462209701538, + "learning_rate": 3.641502779953307e-05, + "loss": 0.2458, + "step": 10474 + }, + { + "epoch": 0.5749725576289791, + "grad_norm": 2.191133975982666, + "learning_rate": 3.641036923998951e-05, + "loss": 0.2881, + "step": 10476 + }, + { + "epoch": 0.5750823271130626, + "grad_norm": 2.1818687915802, + "learning_rate": 3.640571017993113e-05, + "loss": 0.2848, + "step": 10478 + }, + { + "epoch": 0.575192096597146, + "grad_norm": 1.102044701576233, + "learning_rate": 3.640105061956234e-05, + "loss": 0.2763, + "step": 10480 + }, + { + "epoch": 0.5753018660812295, + "grad_norm": 1.5043220520019531, + "learning_rate": 3.639639055908751e-05, + "loss": 0.4929, + "step": 10482 + }, + { + "epoch": 0.5754116355653128, + "grad_norm": 1.9230096340179443, + "learning_rate": 3.639172999871104e-05, + "loss": 0.3122, + "step": 10484 + }, + { + "epoch": 0.5755214050493963, + "grad_norm": 1.2897018194198608, + "learning_rate": 3.638706893863739e-05, + "loss": 0.1893, + "step": 10486 + }, + { + "epoch": 0.5756311745334797, + "grad_norm": 1.9267994165420532, + "learning_rate": 3.638240737907099e-05, + "loss": 0.3816, + "step": 10488 + }, + { + "epoch": 0.5757409440175631, + "grad_norm": 1.9685825109481812, + "learning_rate": 3.6377745320216346e-05, + "loss": 0.2819, + "step": 10490 + }, + { + "epoch": 0.5758507135016465, + "grad_norm": 1.1345950365066528, + "learning_rate": 3.6373082762277946e-05, + "loss": 0.1628, + "step": 10492 + }, + { + "epoch": 0.57596048298573, + "grad_norm": 3.005140781402588, + "learning_rate": 3.636841970546031e-05, + "loss": 0.2791, + "step": 10494 + }, + { + "epoch": 0.5760702524698134, + "grad_norm": 1.1271791458129883, + "learning_rate": 3.636375614996799e-05, + "loss": 0.3021, + "step": 10496 + }, + { + "epoch": 0.5761800219538968, + "grad_norm": 1.4451035261154175, + "learning_rate": 3.635909209600555e-05, + "loss": 0.2954, + "step": 10498 + }, + { + "epoch": 0.5762897914379802, + "grad_norm": 1.1614354848861694, + "learning_rate": 3.6354427543777574e-05, + "loss": 0.2301, + "step": 10500 + }, + { + "epoch": 0.5763995609220637, + "grad_norm": 2.3435943126678467, + "learning_rate": 3.634976249348867e-05, + "loss": 0.2721, + "step": 10502 + }, + { + "epoch": 0.576509330406147, + "grad_norm": 1.2828333377838135, + "learning_rate": 3.634509694534348e-05, + "loss": 0.2893, + "step": 10504 + }, + { + "epoch": 0.5766190998902305, + "grad_norm": 1.252988576889038, + "learning_rate": 3.6340430899546656e-05, + "loss": 0.3025, + "step": 10506 + }, + { + "epoch": 0.5767288693743139, + "grad_norm": 1.7489960193634033, + "learning_rate": 3.6335764356302864e-05, + "loss": 0.2852, + "step": 10508 + }, + { + "epoch": 0.5768386388583974, + "grad_norm": 0.9306700825691223, + "learning_rate": 3.633109731581682e-05, + "loss": 0.4469, + "step": 10510 + }, + { + "epoch": 0.5769484083424808, + "grad_norm": 1.3461819887161255, + "learning_rate": 3.6326429778293226e-05, + "loss": 0.2257, + "step": 10512 + }, + { + "epoch": 0.5770581778265642, + "grad_norm": 1.1171380281448364, + "learning_rate": 3.632176174393682e-05, + "loss": 0.325, + "step": 10514 + }, + { + "epoch": 0.5771679473106477, + "grad_norm": 1.5345162153244019, + "learning_rate": 3.631709321295238e-05, + "loss": 0.3499, + "step": 10516 + }, + { + "epoch": 0.577277716794731, + "grad_norm": 1.600005030632019, + "learning_rate": 3.631242418554469e-05, + "loss": 0.2958, + "step": 10518 + }, + { + "epoch": 0.5773874862788145, + "grad_norm": 1.6145706176757812, + "learning_rate": 3.630775466191854e-05, + "loss": 0.3715, + "step": 10520 + }, + { + "epoch": 0.5774972557628979, + "grad_norm": 1.22256338596344, + "learning_rate": 3.630308464227877e-05, + "loss": 0.279, + "step": 10522 + }, + { + "epoch": 0.5776070252469814, + "grad_norm": 1.4782462120056152, + "learning_rate": 3.629841412683023e-05, + "loss": 0.3236, + "step": 10524 + }, + { + "epoch": 0.5777167947310647, + "grad_norm": 3.271174669265747, + "learning_rate": 3.629374311577779e-05, + "loss": 0.4344, + "step": 10526 + }, + { + "epoch": 0.5778265642151482, + "grad_norm": 1.337047815322876, + "learning_rate": 3.6289071609326356e-05, + "loss": 0.2404, + "step": 10528 + }, + { + "epoch": 0.5779363336992316, + "grad_norm": 1.7367700338363647, + "learning_rate": 3.628439960768082e-05, + "loss": 0.3152, + "step": 10530 + }, + { + "epoch": 0.5780461031833151, + "grad_norm": 1.8375128507614136, + "learning_rate": 3.627972711104613e-05, + "loss": 0.3367, + "step": 10532 + }, + { + "epoch": 0.5781558726673984, + "grad_norm": 1.6252288818359375, + "learning_rate": 3.627505411962724e-05, + "loss": 0.3367, + "step": 10534 + }, + { + "epoch": 0.5782656421514819, + "grad_norm": 2.0654726028442383, + "learning_rate": 3.6270380633629145e-05, + "loss": 0.3349, + "step": 10536 + }, + { + "epoch": 0.5783754116355653, + "grad_norm": 4.0825042724609375, + "learning_rate": 3.626570665325684e-05, + "loss": 0.3554, + "step": 10538 + }, + { + "epoch": 0.5784851811196488, + "grad_norm": 1.3129194974899292, + "learning_rate": 3.626103217871533e-05, + "loss": 0.3064, + "step": 10540 + }, + { + "epoch": 0.5785949506037321, + "grad_norm": 1.4406661987304688, + "learning_rate": 3.625635721020969e-05, + "loss": 0.256, + "step": 10542 + }, + { + "epoch": 0.5787047200878156, + "grad_norm": 1.2892264127731323, + "learning_rate": 3.625168174794497e-05, + "loss": 0.2907, + "step": 10544 + }, + { + "epoch": 0.578814489571899, + "grad_norm": 1.5572338104248047, + "learning_rate": 3.624700579212626e-05, + "loss": 0.2698, + "step": 10546 + }, + { + "epoch": 0.5789242590559824, + "grad_norm": 1.3370567560195923, + "learning_rate": 3.6242329342958676e-05, + "loss": 0.3959, + "step": 10548 + }, + { + "epoch": 0.5790340285400659, + "grad_norm": 1.4365427494049072, + "learning_rate": 3.6237652400647345e-05, + "loss": 0.2053, + "step": 10550 + }, + { + "epoch": 0.5791437980241493, + "grad_norm": 1.3488268852233887, + "learning_rate": 3.623297496539741e-05, + "loss": 0.1953, + "step": 10552 + }, + { + "epoch": 0.5792535675082328, + "grad_norm": 1.241066336631775, + "learning_rate": 3.6228297037414074e-05, + "loss": 0.321, + "step": 10554 + }, + { + "epoch": 0.5793633369923161, + "grad_norm": 1.2951372861862183, + "learning_rate": 3.6223618616902524e-05, + "loss": 0.2827, + "step": 10556 + }, + { + "epoch": 0.5794731064763996, + "grad_norm": 0.9496516585350037, + "learning_rate": 3.6218939704067955e-05, + "loss": 0.2093, + "step": 10558 + }, + { + "epoch": 0.579582875960483, + "grad_norm": 2.132791757583618, + "learning_rate": 3.621426029911563e-05, + "loss": 0.4001, + "step": 10560 + }, + { + "epoch": 0.5796926454445664, + "grad_norm": 2.6051533222198486, + "learning_rate": 3.6209580402250815e-05, + "loss": 0.3881, + "step": 10562 + }, + { + "epoch": 0.5798024149286498, + "grad_norm": 1.7479677200317383, + "learning_rate": 3.6204900013678765e-05, + "loss": 0.3195, + "step": 10564 + }, + { + "epoch": 0.5799121844127333, + "grad_norm": 1.4920982122421265, + "learning_rate": 3.6200219133604816e-05, + "loss": 0.232, + "step": 10566 + }, + { + "epoch": 0.5800219538968167, + "grad_norm": 1.414447546005249, + "learning_rate": 3.619553776223429e-05, + "loss": 0.2672, + "step": 10568 + }, + { + "epoch": 0.5801317233809001, + "grad_norm": 1.1346818208694458, + "learning_rate": 3.619085589977251e-05, + "loss": 0.2692, + "step": 10570 + }, + { + "epoch": 0.5802414928649835, + "grad_norm": 3.4918463230133057, + "learning_rate": 3.618617354642487e-05, + "loss": 0.3571, + "step": 10572 + }, + { + "epoch": 0.580351262349067, + "grad_norm": 1.6808403730392456, + "learning_rate": 3.618149070239676e-05, + "loss": 0.3219, + "step": 10574 + }, + { + "epoch": 0.5804610318331503, + "grad_norm": 1.1852856874465942, + "learning_rate": 3.617680736789357e-05, + "loss": 0.341, + "step": 10576 + }, + { + "epoch": 0.5805708013172338, + "grad_norm": 2.1102702617645264, + "learning_rate": 3.617212354312076e-05, + "loss": 0.4381, + "step": 10578 + }, + { + "epoch": 0.5806805708013172, + "grad_norm": 1.2360416650772095, + "learning_rate": 3.616743922828377e-05, + "loss": 0.1849, + "step": 10580 + }, + { + "epoch": 0.5807903402854007, + "grad_norm": 1.6405123472213745, + "learning_rate": 3.6162754423588085e-05, + "loss": 0.3461, + "step": 10582 + }, + { + "epoch": 0.580900109769484, + "grad_norm": 1.7501472234725952, + "learning_rate": 3.615806912923921e-05, + "loss": 0.3559, + "step": 10584 + }, + { + "epoch": 0.5810098792535675, + "grad_norm": 1.797749400138855, + "learning_rate": 3.615338334544265e-05, + "loss": 0.2643, + "step": 10586 + }, + { + "epoch": 0.581119648737651, + "grad_norm": 1.8282439708709717, + "learning_rate": 3.614869707240395e-05, + "loss": 0.3826, + "step": 10588 + }, + { + "epoch": 0.5812294182217344, + "grad_norm": 2.3113410472869873, + "learning_rate": 3.614401031032867e-05, + "loss": 0.3097, + "step": 10590 + }, + { + "epoch": 0.5813391877058178, + "grad_norm": 1.792317509651184, + "learning_rate": 3.6139323059422415e-05, + "loss": 0.318, + "step": 10592 + }, + { + "epoch": 0.5814489571899012, + "grad_norm": 1.1813870668411255, + "learning_rate": 3.613463531989076e-05, + "loss": 0.2805, + "step": 10594 + }, + { + "epoch": 0.5815587266739847, + "grad_norm": 1.6313118934631348, + "learning_rate": 3.612994709193935e-05, + "loss": 0.2396, + "step": 10596 + }, + { + "epoch": 0.581668496158068, + "grad_norm": 1.9284731149673462, + "learning_rate": 3.612525837577384e-05, + "loss": 0.3161, + "step": 10598 + }, + { + "epoch": 0.5817782656421515, + "grad_norm": 3.059380054473877, + "learning_rate": 3.6120569171599886e-05, + "loss": 0.309, + "step": 10600 + }, + { + "epoch": 0.5818880351262349, + "grad_norm": 1.61190664768219, + "learning_rate": 3.611587947962319e-05, + "loss": 0.2356, + "step": 10602 + }, + { + "epoch": 0.5819978046103184, + "grad_norm": 1.426192045211792, + "learning_rate": 3.611118930004946e-05, + "loss": 0.3388, + "step": 10604 + }, + { + "epoch": 0.5821075740944017, + "grad_norm": 1.4110281467437744, + "learning_rate": 3.6106498633084424e-05, + "loss": 0.4727, + "step": 10606 + }, + { + "epoch": 0.5822173435784852, + "grad_norm": 1.4639925956726074, + "learning_rate": 3.610180747893385e-05, + "loss": 0.3508, + "step": 10608 + }, + { + "epoch": 0.5823271130625686, + "grad_norm": 0.8501638174057007, + "learning_rate": 3.6097115837803505e-05, + "loss": 0.2881, + "step": 10610 + }, + { + "epoch": 0.582436882546652, + "grad_norm": 3.5706684589385986, + "learning_rate": 3.609242370989919e-05, + "loss": 0.3265, + "step": 10612 + }, + { + "epoch": 0.5825466520307354, + "grad_norm": 2.0473482608795166, + "learning_rate": 3.6087731095426733e-05, + "loss": 0.406, + "step": 10614 + }, + { + "epoch": 0.5826564215148189, + "grad_norm": 1.1153079271316528, + "learning_rate": 3.608303799459196e-05, + "loss": 0.2089, + "step": 10616 + }, + { + "epoch": 0.5827661909989023, + "grad_norm": 1.2952258586883545, + "learning_rate": 3.607834440760074e-05, + "loss": 0.2792, + "step": 10618 + }, + { + "epoch": 0.5828759604829857, + "grad_norm": 1.818919062614441, + "learning_rate": 3.607365033465897e-05, + "loss": 0.3211, + "step": 10620 + }, + { + "epoch": 0.5829857299670691, + "grad_norm": 0.9692328572273254, + "learning_rate": 3.606895577597255e-05, + "loss": 0.2951, + "step": 10622 + }, + { + "epoch": 0.5830954994511526, + "grad_norm": 1.3673129081726074, + "learning_rate": 3.6064260731747376e-05, + "loss": 0.2657, + "step": 10624 + }, + { + "epoch": 0.583205268935236, + "grad_norm": 1.35821533203125, + "learning_rate": 3.6059565202189435e-05, + "loss": 0.3432, + "step": 10626 + }, + { + "epoch": 0.5833150384193194, + "grad_norm": 1.401965618133545, + "learning_rate": 3.605486918750468e-05, + "loss": 0.3108, + "step": 10628 + }, + { + "epoch": 0.5834248079034029, + "grad_norm": 1.5430024862289429, + "learning_rate": 3.60501726878991e-05, + "loss": 0.3389, + "step": 10630 + }, + { + "epoch": 0.5835345773874863, + "grad_norm": 1.7277885675430298, + "learning_rate": 3.6045475703578706e-05, + "loss": 0.2763, + "step": 10632 + }, + { + "epoch": 0.5836443468715697, + "grad_norm": 1.4160537719726562, + "learning_rate": 3.604077823474954e-05, + "loss": 0.2794, + "step": 10634 + }, + { + "epoch": 0.5837541163556531, + "grad_norm": 1.3371456861495972, + "learning_rate": 3.603608028161764e-05, + "loss": 0.2768, + "step": 10636 + }, + { + "epoch": 0.5838638858397366, + "grad_norm": 2.1538543701171875, + "learning_rate": 3.60313818443891e-05, + "loss": 0.5033, + "step": 10638 + }, + { + "epoch": 0.58397365532382, + "grad_norm": 2.2609307765960693, + "learning_rate": 3.6026682923269994e-05, + "loss": 0.2463, + "step": 10640 + }, + { + "epoch": 0.5840834248079034, + "grad_norm": 2.1381711959838867, + "learning_rate": 3.602198351846647e-05, + "loss": 0.2386, + "step": 10642 + }, + { + "epoch": 0.5841931942919868, + "grad_norm": 2.148484468460083, + "learning_rate": 3.601728363018464e-05, + "loss": 0.2811, + "step": 10644 + }, + { + "epoch": 0.5843029637760703, + "grad_norm": 1.8910096883773804, + "learning_rate": 3.601258325863067e-05, + "loss": 0.2482, + "step": 10646 + }, + { + "epoch": 0.5844127332601536, + "grad_norm": 1.3812121152877808, + "learning_rate": 3.600788240401076e-05, + "loss": 0.2334, + "step": 10648 + }, + { + "epoch": 0.5845225027442371, + "grad_norm": 2.889751672744751, + "learning_rate": 3.600318106653108e-05, + "loss": 0.1953, + "step": 10650 + }, + { + "epoch": 0.5846322722283205, + "grad_norm": 2.3433828353881836, + "learning_rate": 3.599847924639788e-05, + "loss": 0.2794, + "step": 10652 + }, + { + "epoch": 0.584742041712404, + "grad_norm": 1.8506253957748413, + "learning_rate": 3.59937769438174e-05, + "loss": 0.2344, + "step": 10654 + }, + { + "epoch": 0.5848518111964873, + "grad_norm": 1.2871284484863281, + "learning_rate": 3.59890741589959e-05, + "loss": 0.2245, + "step": 10656 + }, + { + "epoch": 0.5849615806805708, + "grad_norm": 3.6507325172424316, + "learning_rate": 3.5984370892139666e-05, + "loss": 0.3504, + "step": 10658 + }, + { + "epoch": 0.5850713501646543, + "grad_norm": 2.661036252975464, + "learning_rate": 3.597966714345502e-05, + "loss": 0.3536, + "step": 10660 + }, + { + "epoch": 0.5851811196487376, + "grad_norm": 2.6618714332580566, + "learning_rate": 3.597496291314827e-05, + "loss": 0.4341, + "step": 10662 + }, + { + "epoch": 0.5852908891328211, + "grad_norm": 2.6237142086029053, + "learning_rate": 3.5970258201425785e-05, + "loss": 0.2294, + "step": 10664 + }, + { + "epoch": 0.5854006586169045, + "grad_norm": 1.4231059551239014, + "learning_rate": 3.596555300849392e-05, + "loss": 0.3071, + "step": 10666 + }, + { + "epoch": 0.585510428100988, + "grad_norm": 2.0011394023895264, + "learning_rate": 3.5960847334559086e-05, + "loss": 0.2567, + "step": 10668 + }, + { + "epoch": 0.5856201975850713, + "grad_norm": 1.4701955318450928, + "learning_rate": 3.595614117982769e-05, + "loss": 0.2013, + "step": 10670 + }, + { + "epoch": 0.5857299670691548, + "grad_norm": 1.194176435470581, + "learning_rate": 3.595143454450617e-05, + "loss": 0.3991, + "step": 10672 + }, + { + "epoch": 0.5858397365532382, + "grad_norm": 1.542940378189087, + "learning_rate": 3.594672742880097e-05, + "loss": 0.3372, + "step": 10674 + }, + { + "epoch": 0.5859495060373217, + "grad_norm": 2.169760227203369, + "learning_rate": 3.594201983291858e-05, + "loss": 0.2717, + "step": 10676 + }, + { + "epoch": 0.586059275521405, + "grad_norm": 2.148538827896118, + "learning_rate": 3.5937311757065494e-05, + "loss": 0.3044, + "step": 10678 + }, + { + "epoch": 0.5861690450054885, + "grad_norm": 1.449044942855835, + "learning_rate": 3.593260320144823e-05, + "loss": 0.2516, + "step": 10680 + }, + { + "epoch": 0.5862788144895719, + "grad_norm": 2.3253912925720215, + "learning_rate": 3.592789416627332e-05, + "loss": 0.3284, + "step": 10682 + }, + { + "epoch": 0.5863885839736553, + "grad_norm": 1.7130199670791626, + "learning_rate": 3.592318465174736e-05, + "loss": 0.3171, + "step": 10684 + }, + { + "epoch": 0.5864983534577387, + "grad_norm": 0.8977075815200806, + "learning_rate": 3.591847465807687e-05, + "loss": 0.2275, + "step": 10686 + }, + { + "epoch": 0.5866081229418222, + "grad_norm": 2.2356274127960205, + "learning_rate": 3.591376418546852e-05, + "loss": 0.1723, + "step": 10688 + }, + { + "epoch": 0.5867178924259056, + "grad_norm": 1.293237566947937, + "learning_rate": 3.5909053234128895e-05, + "loss": 0.2988, + "step": 10690 + }, + { + "epoch": 0.586827661909989, + "grad_norm": 2.1600873470306396, + "learning_rate": 3.590434180426465e-05, + "loss": 0.3189, + "step": 10692 + }, + { + "epoch": 0.5869374313940724, + "grad_norm": 1.5562970638275146, + "learning_rate": 3.5899629896082454e-05, + "loss": 0.2912, + "step": 10694 + }, + { + "epoch": 0.5870472008781559, + "grad_norm": 1.3932527303695679, + "learning_rate": 3.589491750978899e-05, + "loss": 0.2909, + "step": 10696 + }, + { + "epoch": 0.5871569703622393, + "grad_norm": 1.7384482622146606, + "learning_rate": 3.5890204645590964e-05, + "loss": 0.2919, + "step": 10698 + }, + { + "epoch": 0.5872667398463227, + "grad_norm": 1.39371919631958, + "learning_rate": 3.588549130369512e-05, + "loss": 0.2344, + "step": 10700 + }, + { + "epoch": 0.5873765093304062, + "grad_norm": 2.244398832321167, + "learning_rate": 3.588077748430819e-05, + "loss": 0.2478, + "step": 10702 + }, + { + "epoch": 0.5874862788144896, + "grad_norm": 1.9751527309417725, + "learning_rate": 3.587606318763695e-05, + "loss": 0.2753, + "step": 10704 + }, + { + "epoch": 0.587596048298573, + "grad_norm": 1.2786295413970947, + "learning_rate": 3.5871348413888204e-05, + "loss": 0.2611, + "step": 10706 + }, + { + "epoch": 0.5877058177826564, + "grad_norm": 2.02703595161438, + "learning_rate": 3.586663316326876e-05, + "loss": 0.2819, + "step": 10708 + }, + { + "epoch": 0.5878155872667399, + "grad_norm": 2.3640987873077393, + "learning_rate": 3.5861917435985445e-05, + "loss": 0.3676, + "step": 10710 + }, + { + "epoch": 0.5879253567508232, + "grad_norm": 2.7612805366516113, + "learning_rate": 3.585720123224512e-05, + "loss": 0.3025, + "step": 10712 + }, + { + "epoch": 0.5880351262349067, + "grad_norm": 1.3672664165496826, + "learning_rate": 3.585248455225466e-05, + "loss": 0.3309, + "step": 10714 + }, + { + "epoch": 0.5881448957189901, + "grad_norm": 1.8845869302749634, + "learning_rate": 3.584776739622095e-05, + "loss": 0.3668, + "step": 10716 + }, + { + "epoch": 0.5882546652030736, + "grad_norm": 1.4013235569000244, + "learning_rate": 3.584304976435092e-05, + "loss": 0.3089, + "step": 10718 + }, + { + "epoch": 0.5883644346871569, + "grad_norm": 3.513866662979126, + "learning_rate": 3.5838331656851516e-05, + "loss": 0.3424, + "step": 10720 + }, + { + "epoch": 0.5884742041712404, + "grad_norm": 1.4490220546722412, + "learning_rate": 3.5833613073929684e-05, + "loss": 0.2118, + "step": 10722 + }, + { + "epoch": 0.5885839736553238, + "grad_norm": 2.404331922531128, + "learning_rate": 3.58288940157924e-05, + "loss": 0.3265, + "step": 10724 + }, + { + "epoch": 0.5886937431394073, + "grad_norm": 1.9479202032089233, + "learning_rate": 3.582417448264669e-05, + "loss": 0.3494, + "step": 10726 + }, + { + "epoch": 0.5888035126234906, + "grad_norm": 1.229591727256775, + "learning_rate": 3.581945447469954e-05, + "loss": 0.2943, + "step": 10728 + }, + { + "epoch": 0.5889132821075741, + "grad_norm": 1.787049651145935, + "learning_rate": 3.581473399215802e-05, + "loss": 0.344, + "step": 10730 + }, + { + "epoch": 0.5890230515916575, + "grad_norm": 1.6422349214553833, + "learning_rate": 3.581001303522919e-05, + "loss": 0.35, + "step": 10732 + }, + { + "epoch": 0.5891328210757409, + "grad_norm": 1.2761356830596924, + "learning_rate": 3.580529160412013e-05, + "loss": 0.3567, + "step": 10734 + }, + { + "epoch": 0.5892425905598244, + "grad_norm": 2.461888313293457, + "learning_rate": 3.5800569699037934e-05, + "loss": 0.3589, + "step": 10736 + }, + { + "epoch": 0.5893523600439078, + "grad_norm": 1.870080590248108, + "learning_rate": 3.5795847320189746e-05, + "loss": 0.264, + "step": 10738 + }, + { + "epoch": 0.5894621295279913, + "grad_norm": 1.2739890813827515, + "learning_rate": 3.57911244677827e-05, + "loss": 0.2654, + "step": 10740 + }, + { + "epoch": 0.5895718990120746, + "grad_norm": 1.5906860828399658, + "learning_rate": 3.5786401142023975e-05, + "loss": 0.2717, + "step": 10742 + }, + { + "epoch": 0.5896816684961581, + "grad_norm": 2.4611268043518066, + "learning_rate": 3.5781677343120755e-05, + "loss": 0.2653, + "step": 10744 + }, + { + "epoch": 0.5897914379802415, + "grad_norm": 1.6030514240264893, + "learning_rate": 3.577695307128024e-05, + "loss": 0.4305, + "step": 10746 + }, + { + "epoch": 0.589901207464325, + "grad_norm": 2.0266501903533936, + "learning_rate": 3.577222832670967e-05, + "loss": 0.3465, + "step": 10748 + }, + { + "epoch": 0.5900109769484083, + "grad_norm": 1.3415460586547852, + "learning_rate": 3.5767503109616296e-05, + "loss": 0.198, + "step": 10750 + }, + { + "epoch": 0.5901207464324918, + "grad_norm": 1.3156064748764038, + "learning_rate": 3.576277742020738e-05, + "loss": 0.3721, + "step": 10752 + }, + { + "epoch": 0.5902305159165752, + "grad_norm": 3.041720390319824, + "learning_rate": 3.575805125869022e-05, + "loss": 0.4404, + "step": 10754 + }, + { + "epoch": 0.5903402854006586, + "grad_norm": 2.190117597579956, + "learning_rate": 3.575332462527213e-05, + "loss": 0.2909, + "step": 10756 + }, + { + "epoch": 0.590450054884742, + "grad_norm": 1.3991286754608154, + "learning_rate": 3.574859752016045e-05, + "loss": 0.2513, + "step": 10758 + }, + { + "epoch": 0.5905598243688255, + "grad_norm": 1.7406070232391357, + "learning_rate": 3.574386994356251e-05, + "loss": 0.2595, + "step": 10760 + }, + { + "epoch": 0.5906695938529088, + "grad_norm": 1.5623794794082642, + "learning_rate": 3.573914189568571e-05, + "loss": 0.2362, + "step": 10762 + }, + { + "epoch": 0.5907793633369923, + "grad_norm": 1.0879113674163818, + "learning_rate": 3.573441337673743e-05, + "loss": 0.3417, + "step": 10764 + }, + { + "epoch": 0.5908891328210757, + "grad_norm": 1.785813808441162, + "learning_rate": 3.572968438692509e-05, + "loss": 0.346, + "step": 10766 + }, + { + "epoch": 0.5909989023051592, + "grad_norm": 1.3951983451843262, + "learning_rate": 3.572495492645614e-05, + "loss": 0.2647, + "step": 10768 + }, + { + "epoch": 0.5911086717892426, + "grad_norm": 1.8291288614273071, + "learning_rate": 3.572022499553802e-05, + "loss": 0.2881, + "step": 10770 + }, + { + "epoch": 0.591218441273326, + "grad_norm": 1.781607747077942, + "learning_rate": 3.5715494594378216e-05, + "loss": 0.3158, + "step": 10772 + }, + { + "epoch": 0.5913282107574095, + "grad_norm": 2.0765278339385986, + "learning_rate": 3.571076372318422e-05, + "loss": 0.2743, + "step": 10774 + }, + { + "epoch": 0.5914379802414929, + "grad_norm": 1.9309706687927246, + "learning_rate": 3.5706032382163554e-05, + "loss": 0.2704, + "step": 10776 + }, + { + "epoch": 0.5915477497255763, + "grad_norm": 2.1179704666137695, + "learning_rate": 3.5701300571523755e-05, + "loss": 0.33, + "step": 10778 + }, + { + "epoch": 0.5916575192096597, + "grad_norm": 1.3591874837875366, + "learning_rate": 3.56965682914724e-05, + "loss": 0.3025, + "step": 10780 + }, + { + "epoch": 0.5917672886937432, + "grad_norm": 1.5380465984344482, + "learning_rate": 3.5691835542217054e-05, + "loss": 0.3882, + "step": 10782 + }, + { + "epoch": 0.5918770581778265, + "grad_norm": 2.179777145385742, + "learning_rate": 3.568710232396531e-05, + "loss": 0.2898, + "step": 10784 + }, + { + "epoch": 0.59198682766191, + "grad_norm": 1.815198302268982, + "learning_rate": 3.568236863692482e-05, + "loss": 0.4146, + "step": 10786 + }, + { + "epoch": 0.5920965971459934, + "grad_norm": 1.1384223699569702, + "learning_rate": 3.5677634481303215e-05, + "loss": 0.2764, + "step": 10788 + }, + { + "epoch": 0.5922063666300769, + "grad_norm": 1.8051310777664185, + "learning_rate": 3.5672899857308134e-05, + "loss": 0.232, + "step": 10790 + }, + { + "epoch": 0.5923161361141602, + "grad_norm": 1.5768744945526123, + "learning_rate": 3.5668164765147284e-05, + "loss": 0.2624, + "step": 10792 + }, + { + "epoch": 0.5924259055982437, + "grad_norm": 1.1289161443710327, + "learning_rate": 3.566342920502837e-05, + "loss": 0.2117, + "step": 10794 + }, + { + "epoch": 0.5925356750823271, + "grad_norm": 1.9271942377090454, + "learning_rate": 3.565869317715911e-05, + "loss": 0.4362, + "step": 10796 + }, + { + "epoch": 0.5926454445664106, + "grad_norm": 3.002201795578003, + "learning_rate": 3.565395668174725e-05, + "loss": 0.3803, + "step": 10798 + }, + { + "epoch": 0.5927552140504939, + "grad_norm": 1.7495546340942383, + "learning_rate": 3.564921971900056e-05, + "loss": 0.2529, + "step": 10800 + }, + { + "epoch": 0.5928649835345774, + "grad_norm": 1.491882562637329, + "learning_rate": 3.564448228912682e-05, + "loss": 0.3395, + "step": 10802 + }, + { + "epoch": 0.5929747530186608, + "grad_norm": 2.6609246730804443, + "learning_rate": 3.563974439233384e-05, + "loss": 0.4524, + "step": 10804 + }, + { + "epoch": 0.5930845225027442, + "grad_norm": 1.8982800245285034, + "learning_rate": 3.563500602882945e-05, + "loss": 0.282, + "step": 10806 + }, + { + "epoch": 0.5931942919868277, + "grad_norm": 1.798258900642395, + "learning_rate": 3.5630267198821496e-05, + "loss": 0.3704, + "step": 10808 + }, + { + "epoch": 0.5933040614709111, + "grad_norm": 1.7037521600723267, + "learning_rate": 3.562552790251785e-05, + "loss": 0.2655, + "step": 10810 + }, + { + "epoch": 0.5934138309549946, + "grad_norm": 1.3552303314208984, + "learning_rate": 3.562078814012639e-05, + "loss": 0.2141, + "step": 10812 + }, + { + "epoch": 0.5935236004390779, + "grad_norm": 1.0977003574371338, + "learning_rate": 3.561604791185503e-05, + "loss": 0.182, + "step": 10814 + }, + { + "epoch": 0.5936333699231614, + "grad_norm": 1.6500356197357178, + "learning_rate": 3.5611307217911716e-05, + "loss": 0.2992, + "step": 10816 + }, + { + "epoch": 0.5937431394072448, + "grad_norm": 2.8304355144500732, + "learning_rate": 3.5606566058504375e-05, + "loss": 0.2268, + "step": 10818 + }, + { + "epoch": 0.5938529088913282, + "grad_norm": 1.5380091667175293, + "learning_rate": 3.5601824433840986e-05, + "loss": 0.2291, + "step": 10820 + }, + { + "epoch": 0.5939626783754116, + "grad_norm": 1.81062650680542, + "learning_rate": 3.559708234412954e-05, + "loss": 0.2871, + "step": 10822 + }, + { + "epoch": 0.5940724478594951, + "grad_norm": 1.2425330877304077, + "learning_rate": 3.559233978957805e-05, + "loss": 0.2461, + "step": 10824 + }, + { + "epoch": 0.5941822173435785, + "grad_norm": 1.535292148590088, + "learning_rate": 3.558759677039455e-05, + "loss": 0.284, + "step": 10826 + }, + { + "epoch": 0.5942919868276619, + "grad_norm": 1.323222279548645, + "learning_rate": 3.5582853286787086e-05, + "loss": 0.2246, + "step": 10828 + }, + { + "epoch": 0.5944017563117453, + "grad_norm": 2.2362797260284424, + "learning_rate": 3.5578109338963736e-05, + "loss": 0.3656, + "step": 10830 + }, + { + "epoch": 0.5945115257958288, + "grad_norm": 1.7045161724090576, + "learning_rate": 3.557336492713258e-05, + "loss": 0.3235, + "step": 10832 + }, + { + "epoch": 0.5946212952799121, + "grad_norm": 1.4874005317687988, + "learning_rate": 3.5568620051501756e-05, + "loss": 0.3554, + "step": 10834 + }, + { + "epoch": 0.5947310647639956, + "grad_norm": 2.055091381072998, + "learning_rate": 3.556387471227938e-05, + "loss": 0.3041, + "step": 10836 + }, + { + "epoch": 0.594840834248079, + "grad_norm": 3.6306021213531494, + "learning_rate": 3.5559128909673595e-05, + "loss": 0.2859, + "step": 10838 + }, + { + "epoch": 0.5949506037321625, + "grad_norm": 1.5992966890335083, + "learning_rate": 3.555438264389259e-05, + "loss": 0.3228, + "step": 10840 + }, + { + "epoch": 0.5950603732162458, + "grad_norm": 1.180989384651184, + "learning_rate": 3.554963591514457e-05, + "loss": 0.3038, + "step": 10842 + }, + { + "epoch": 0.5951701427003293, + "grad_norm": 1.3116540908813477, + "learning_rate": 3.5544888723637736e-05, + "loss": 0.1986, + "step": 10844 + }, + { + "epoch": 0.5952799121844128, + "grad_norm": 1.6001548767089844, + "learning_rate": 3.554014106958032e-05, + "loss": 0.313, + "step": 10846 + }, + { + "epoch": 0.5953896816684962, + "grad_norm": 1.2979800701141357, + "learning_rate": 3.553539295318059e-05, + "loss": 0.1718, + "step": 10848 + }, + { + "epoch": 0.5954994511525796, + "grad_norm": 1.6903339624404907, + "learning_rate": 3.5530644374646815e-05, + "loss": 0.2281, + "step": 10850 + }, + { + "epoch": 0.595609220636663, + "grad_norm": 1.8721674680709839, + "learning_rate": 3.552589533418728e-05, + "loss": 0.3137, + "step": 10852 + }, + { + "epoch": 0.5957189901207465, + "grad_norm": 1.2724125385284424, + "learning_rate": 3.5521145832010314e-05, + "loss": 0.2115, + "step": 10854 + }, + { + "epoch": 0.5958287596048298, + "grad_norm": 1.4310123920440674, + "learning_rate": 3.5516395868324256e-05, + "loss": 0.2785, + "step": 10856 + }, + { + "epoch": 0.5959385290889133, + "grad_norm": 1.6302125453948975, + "learning_rate": 3.551164544333745e-05, + "loss": 0.1261, + "step": 10858 + }, + { + "epoch": 0.5960482985729967, + "grad_norm": 1.0277296304702759, + "learning_rate": 3.5506894557258294e-05, + "loss": 0.2685, + "step": 10860 + }, + { + "epoch": 0.5961580680570802, + "grad_norm": 1.7706259489059448, + "learning_rate": 3.5502143210295165e-05, + "loss": 0.3346, + "step": 10862 + }, + { + "epoch": 0.5962678375411635, + "grad_norm": 1.0425622463226318, + "learning_rate": 3.549739140265648e-05, + "loss": 0.1445, + "step": 10864 + }, + { + "epoch": 0.596377607025247, + "grad_norm": 1.4231388568878174, + "learning_rate": 3.5492639134550695e-05, + "loss": 0.3478, + "step": 10866 + }, + { + "epoch": 0.5964873765093304, + "grad_norm": 1.3705350160598755, + "learning_rate": 3.5487886406186245e-05, + "loss": 0.2635, + "step": 10868 + }, + { + "epoch": 0.5965971459934138, + "grad_norm": 1.3260657787322998, + "learning_rate": 3.5483133217771625e-05, + "loss": 0.2086, + "step": 10870 + }, + { + "epoch": 0.5967069154774972, + "grad_norm": 2.2571890354156494, + "learning_rate": 3.547837956951533e-05, + "loss": 0.5403, + "step": 10872 + }, + { + "epoch": 0.5968166849615807, + "grad_norm": 1.5129846334457397, + "learning_rate": 3.547362546162588e-05, + "loss": 0.2424, + "step": 10874 + }, + { + "epoch": 0.5969264544456641, + "grad_norm": 1.6418815851211548, + "learning_rate": 3.54688708943118e-05, + "loss": 0.3773, + "step": 10876 + }, + { + "epoch": 0.5970362239297475, + "grad_norm": 1.6925742626190186, + "learning_rate": 3.546411586778167e-05, + "loss": 0.2503, + "step": 10878 + }, + { + "epoch": 0.5971459934138309, + "grad_norm": 1.2756041288375854, + "learning_rate": 3.545936038224405e-05, + "loss": 0.406, + "step": 10880 + }, + { + "epoch": 0.5972557628979144, + "grad_norm": 2.8987185955047607, + "learning_rate": 3.545460443790753e-05, + "loss": 0.2941, + "step": 10882 + }, + { + "epoch": 0.5973655323819979, + "grad_norm": 3.4773828983306885, + "learning_rate": 3.544984803498077e-05, + "loss": 0.4967, + "step": 10884 + }, + { + "epoch": 0.5974753018660812, + "grad_norm": 2.206066370010376, + "learning_rate": 3.544509117367238e-05, + "loss": 0.3319, + "step": 10886 + }, + { + "epoch": 0.5975850713501647, + "grad_norm": 2.0362699031829834, + "learning_rate": 3.5440333854191016e-05, + "loss": 0.3623, + "step": 10888 + }, + { + "epoch": 0.5976948408342481, + "grad_norm": 1.3062598705291748, + "learning_rate": 3.543557607674537e-05, + "loss": 0.2648, + "step": 10890 + }, + { + "epoch": 0.5978046103183315, + "grad_norm": 1.6798791885375977, + "learning_rate": 3.543081784154414e-05, + "loss": 0.3257, + "step": 10892 + }, + { + "epoch": 0.5979143798024149, + "grad_norm": 1.6806362867355347, + "learning_rate": 3.542605914879603e-05, + "loss": 0.18, + "step": 10894 + }, + { + "epoch": 0.5980241492864984, + "grad_norm": 1.053139567375183, + "learning_rate": 3.54212999987098e-05, + "loss": 0.2457, + "step": 10896 + }, + { + "epoch": 0.5981339187705818, + "grad_norm": 1.62759268283844, + "learning_rate": 3.54165403914942e-05, + "loss": 0.3256, + "step": 10898 + }, + { + "epoch": 0.5982436882546652, + "grad_norm": 1.7903300523757935, + "learning_rate": 3.541178032735801e-05, + "loss": 0.2759, + "step": 10900 + }, + { + "epoch": 0.5983534577387486, + "grad_norm": 1.584538221359253, + "learning_rate": 3.540701980651003e-05, + "loss": 0.3421, + "step": 10902 + }, + { + "epoch": 0.5984632272228321, + "grad_norm": 2.081554651260376, + "learning_rate": 3.5402258829159085e-05, + "loss": 0.3912, + "step": 10904 + }, + { + "epoch": 0.5985729967069154, + "grad_norm": 3.14155912399292, + "learning_rate": 3.5397497395514004e-05, + "loss": 0.3148, + "step": 10906 + }, + { + "epoch": 0.5986827661909989, + "grad_norm": 1.7258813381195068, + "learning_rate": 3.539273550578366e-05, + "loss": 0.28, + "step": 10908 + }, + { + "epoch": 0.5987925356750823, + "grad_norm": 1.315659999847412, + "learning_rate": 3.5387973160176926e-05, + "loss": 0.1642, + "step": 10910 + }, + { + "epoch": 0.5989023051591658, + "grad_norm": 1.5715672969818115, + "learning_rate": 3.53832103589027e-05, + "loss": 0.2566, + "step": 10912 + }, + { + "epoch": 0.5990120746432491, + "grad_norm": 2.060495138168335, + "learning_rate": 3.5378447102169895e-05, + "loss": 0.2262, + "step": 10914 + }, + { + "epoch": 0.5991218441273326, + "grad_norm": 2.0931735038757324, + "learning_rate": 3.537368339018747e-05, + "loss": 0.4024, + "step": 10916 + }, + { + "epoch": 0.5992316136114161, + "grad_norm": 1.5956059694290161, + "learning_rate": 3.5368919223164374e-05, + "loss": 0.3431, + "step": 10918 + }, + { + "epoch": 0.5993413830954994, + "grad_norm": 1.766589879989624, + "learning_rate": 3.536415460130959e-05, + "loss": 0.3928, + "step": 10920 + }, + { + "epoch": 0.5994511525795829, + "grad_norm": 1.8130223751068115, + "learning_rate": 3.535938952483211e-05, + "loss": 0.2167, + "step": 10922 + }, + { + "epoch": 0.5995609220636663, + "grad_norm": 2.3054120540618896, + "learning_rate": 3.535462399394096e-05, + "loss": 0.3149, + "step": 10924 + }, + { + "epoch": 0.5996706915477498, + "grad_norm": 1.6799339056015015, + "learning_rate": 3.534985800884517e-05, + "loss": 0.2527, + "step": 10926 + }, + { + "epoch": 0.5997804610318331, + "grad_norm": 1.5767425298690796, + "learning_rate": 3.5345091569753815e-05, + "loss": 0.3262, + "step": 10928 + }, + { + "epoch": 0.5998902305159166, + "grad_norm": 1.8141098022460938, + "learning_rate": 3.534032467687597e-05, + "loss": 0.3313, + "step": 10930 + }, + { + "epoch": 0.6, + "grad_norm": 2.4938981533050537, + "learning_rate": 3.5335557330420726e-05, + "loss": 0.2124, + "step": 10932 + }, + { + "epoch": 0.6001097694840835, + "grad_norm": 0.9914221167564392, + "learning_rate": 3.533078953059721e-05, + "loss": 0.2099, + "step": 10934 + }, + { + "epoch": 0.6002195389681668, + "grad_norm": 1.7480665445327759, + "learning_rate": 3.5326021277614565e-05, + "loss": 0.2924, + "step": 10936 + }, + { + "epoch": 0.6003293084522503, + "grad_norm": 1.915052056312561, + "learning_rate": 3.532125257168193e-05, + "loss": 0.3577, + "step": 10938 + }, + { + "epoch": 0.6004390779363337, + "grad_norm": 1.2378686666488647, + "learning_rate": 3.531648341300851e-05, + "loss": 0.2781, + "step": 10940 + }, + { + "epoch": 0.6005488474204171, + "grad_norm": 1.6678942441940308, + "learning_rate": 3.531171380180348e-05, + "loss": 0.3854, + "step": 10942 + }, + { + "epoch": 0.6006586169045005, + "grad_norm": 2.1953365802764893, + "learning_rate": 3.530694373827607e-05, + "loss": 0.3719, + "step": 10944 + }, + { + "epoch": 0.600768386388584, + "grad_norm": 1.6952431201934814, + "learning_rate": 3.5302173222635524e-05, + "loss": 0.2724, + "step": 10946 + }, + { + "epoch": 0.6008781558726674, + "grad_norm": 2.0790467262268066, + "learning_rate": 3.52974022550911e-05, + "loss": 0.3165, + "step": 10948 + }, + { + "epoch": 0.6009879253567508, + "grad_norm": 1.700341820716858, + "learning_rate": 3.529263083585206e-05, + "loss": 0.2797, + "step": 10950 + }, + { + "epoch": 0.6010976948408342, + "grad_norm": 1.1513196229934692, + "learning_rate": 3.528785896512772e-05, + "loss": 0.2342, + "step": 10952 + }, + { + "epoch": 0.6012074643249177, + "grad_norm": 3.5290985107421875, + "learning_rate": 3.528308664312739e-05, + "loss": 0.1915, + "step": 10954 + }, + { + "epoch": 0.6013172338090012, + "grad_norm": 2.3806557655334473, + "learning_rate": 3.5278313870060405e-05, + "loss": 0.3547, + "step": 10956 + }, + { + "epoch": 0.6014270032930845, + "grad_norm": 1.6095830202102661, + "learning_rate": 3.527354064613612e-05, + "loss": 0.2575, + "step": 10958 + }, + { + "epoch": 0.601536772777168, + "grad_norm": 3.573451519012451, + "learning_rate": 3.5268766971563925e-05, + "loss": 0.2707, + "step": 10960 + }, + { + "epoch": 0.6016465422612514, + "grad_norm": 1.9639290571212769, + "learning_rate": 3.52639928465532e-05, + "loss": 0.4588, + "step": 10962 + }, + { + "epoch": 0.6017563117453348, + "grad_norm": 1.5787744522094727, + "learning_rate": 3.525921827131338e-05, + "loss": 0.3077, + "step": 10964 + }, + { + "epoch": 0.6018660812294182, + "grad_norm": 1.0162615776062012, + "learning_rate": 3.5254443246053886e-05, + "loss": 0.1945, + "step": 10966 + }, + { + "epoch": 0.6019758507135017, + "grad_norm": 1.437718391418457, + "learning_rate": 3.524966777098417e-05, + "loss": 0.3508, + "step": 10968 + }, + { + "epoch": 0.602085620197585, + "grad_norm": 1.7232038974761963, + "learning_rate": 3.5244891846313736e-05, + "loss": 0.2029, + "step": 10970 + }, + { + "epoch": 0.6021953896816685, + "grad_norm": 2.1495022773742676, + "learning_rate": 3.5240115472252055e-05, + "loss": 0.2819, + "step": 10972 + }, + { + "epoch": 0.6023051591657519, + "grad_norm": 1.9281858205795288, + "learning_rate": 3.523533864900863e-05, + "loss": 0.3177, + "step": 10974 + }, + { + "epoch": 0.6024149286498354, + "grad_norm": 2.2989604473114014, + "learning_rate": 3.523056137679304e-05, + "loss": 0.2994, + "step": 10976 + }, + { + "epoch": 0.6025246981339187, + "grad_norm": 1.371534824371338, + "learning_rate": 3.5225783655814796e-05, + "loss": 0.2716, + "step": 10978 + }, + { + "epoch": 0.6026344676180022, + "grad_norm": 1.7072937488555908, + "learning_rate": 3.52210054862835e-05, + "loss": 0.3782, + "step": 10980 + }, + { + "epoch": 0.6027442371020856, + "grad_norm": 1.8284342288970947, + "learning_rate": 3.521622686840873e-05, + "loss": 0.3102, + "step": 10982 + }, + { + "epoch": 0.6028540065861691, + "grad_norm": 1.7780927419662476, + "learning_rate": 3.521144780240011e-05, + "loss": 0.2832, + "step": 10984 + }, + { + "epoch": 0.6029637760702524, + "grad_norm": 2.41943359375, + "learning_rate": 3.520666828846726e-05, + "loss": 0.3366, + "step": 10986 + }, + { + "epoch": 0.6030735455543359, + "grad_norm": 2.3437721729278564, + "learning_rate": 3.520188832681984e-05, + "loss": 0.404, + "step": 10988 + }, + { + "epoch": 0.6031833150384193, + "grad_norm": 1.4044790267944336, + "learning_rate": 3.519710791766754e-05, + "loss": 0.4405, + "step": 10990 + }, + { + "epoch": 0.6032930845225027, + "grad_norm": 1.530543327331543, + "learning_rate": 3.5192327061220024e-05, + "loss": 0.3686, + "step": 10992 + }, + { + "epoch": 0.6034028540065862, + "grad_norm": 0.8589890003204346, + "learning_rate": 3.5187545757687015e-05, + "loss": 0.2027, + "step": 10994 + }, + { + "epoch": 0.6035126234906696, + "grad_norm": 1.1626639366149902, + "learning_rate": 3.5182764007278255e-05, + "loss": 0.2094, + "step": 10996 + }, + { + "epoch": 0.6036223929747531, + "grad_norm": 1.3966883420944214, + "learning_rate": 3.517798181020348e-05, + "loss": 0.2787, + "step": 10998 + }, + { + "epoch": 0.6037321624588364, + "grad_norm": 0.9138810038566589, + "learning_rate": 3.517319916667247e-05, + "loss": 0.2225, + "step": 11000 + }, + { + "epoch": 0.6038419319429199, + "grad_norm": 2.456232786178589, + "learning_rate": 3.516841607689501e-05, + "loss": 0.2519, + "step": 11002 + }, + { + "epoch": 0.6039517014270033, + "grad_norm": 1.6176543235778809, + "learning_rate": 3.516363254108091e-05, + "loss": 0.3213, + "step": 11004 + }, + { + "epoch": 0.6040614709110868, + "grad_norm": 1.1560280323028564, + "learning_rate": 3.515884855944e-05, + "loss": 0.291, + "step": 11006 + }, + { + "epoch": 0.6041712403951701, + "grad_norm": 1.5522936582565308, + "learning_rate": 3.515406413218213e-05, + "loss": 0.3481, + "step": 11008 + }, + { + "epoch": 0.6042810098792536, + "grad_norm": 2.585526704788208, + "learning_rate": 3.514927925951717e-05, + "loss": 0.3433, + "step": 11010 + }, + { + "epoch": 0.604390779363337, + "grad_norm": 1.2401883602142334, + "learning_rate": 3.5144493941655e-05, + "loss": 0.3054, + "step": 11012 + }, + { + "epoch": 0.6045005488474204, + "grad_norm": 2.389439344406128, + "learning_rate": 3.513970817880554e-05, + "loss": 0.4118, + "step": 11014 + }, + { + "epoch": 0.6046103183315038, + "grad_norm": 2.1480209827423096, + "learning_rate": 3.513492197117871e-05, + "loss": 0.327, + "step": 11016 + }, + { + "epoch": 0.6047200878155873, + "grad_norm": 1.5174757242202759, + "learning_rate": 3.5130135318984456e-05, + "loss": 0.2604, + "step": 11018 + }, + { + "epoch": 0.6048298572996706, + "grad_norm": 1.300896406173706, + "learning_rate": 3.512534822243274e-05, + "loss": 0.2672, + "step": 11020 + }, + { + "epoch": 0.6049396267837541, + "grad_norm": 1.737919569015503, + "learning_rate": 3.512056068173356e-05, + "loss": 0.3877, + "step": 11022 + }, + { + "epoch": 0.6050493962678375, + "grad_norm": 2.671053647994995, + "learning_rate": 3.511577269709692e-05, + "loss": 0.3756, + "step": 11024 + }, + { + "epoch": 0.605159165751921, + "grad_norm": 1.5338321924209595, + "learning_rate": 3.511098426873283e-05, + "loss": 0.247, + "step": 11026 + }, + { + "epoch": 0.6052689352360044, + "grad_norm": 1.877814769744873, + "learning_rate": 3.510619539685134e-05, + "loss": 0.3142, + "step": 11028 + }, + { + "epoch": 0.6053787047200878, + "grad_norm": 1.5039621591567993, + "learning_rate": 3.510140608166251e-05, + "loss": 0.3236, + "step": 11030 + }, + { + "epoch": 0.6054884742041713, + "grad_norm": 3.7410733699798584, + "learning_rate": 3.5096616323376446e-05, + "loss": 0.3121, + "step": 11032 + }, + { + "epoch": 0.6055982436882547, + "grad_norm": 1.5612038373947144, + "learning_rate": 3.509182612220322e-05, + "loss": 0.2887, + "step": 11034 + }, + { + "epoch": 0.6057080131723381, + "grad_norm": 1.224359393119812, + "learning_rate": 3.508703547835297e-05, + "loss": 0.2544, + "step": 11036 + }, + { + "epoch": 0.6058177826564215, + "grad_norm": 3.69909405708313, + "learning_rate": 3.508224439203583e-05, + "loss": 0.2574, + "step": 11038 + }, + { + "epoch": 0.605927552140505, + "grad_norm": 1.5174697637557983, + "learning_rate": 3.507745286346198e-05, + "loss": 0.2361, + "step": 11040 + }, + { + "epoch": 0.6060373216245883, + "grad_norm": 1.3255610466003418, + "learning_rate": 3.507266089284157e-05, + "loss": 0.2852, + "step": 11042 + }, + { + "epoch": 0.6061470911086718, + "grad_norm": 4.479718208312988, + "learning_rate": 3.506786848038482e-05, + "loss": 0.2812, + "step": 11044 + }, + { + "epoch": 0.6062568605927552, + "grad_norm": 1.1320064067840576, + "learning_rate": 3.506307562630194e-05, + "loss": 0.2257, + "step": 11046 + }, + { + "epoch": 0.6063666300768387, + "grad_norm": 1.9836044311523438, + "learning_rate": 3.505828233080316e-05, + "loss": 0.3019, + "step": 11048 + }, + { + "epoch": 0.606476399560922, + "grad_norm": 2.8241193294525146, + "learning_rate": 3.505348859409876e-05, + "loss": 0.2917, + "step": 11050 + }, + { + "epoch": 0.6065861690450055, + "grad_norm": 2.1208901405334473, + "learning_rate": 3.504869441639901e-05, + "loss": 0.2274, + "step": 11052 + }, + { + "epoch": 0.6066959385290889, + "grad_norm": 1.3372254371643066, + "learning_rate": 3.5043899797914187e-05, + "loss": 0.2617, + "step": 11054 + }, + { + "epoch": 0.6068057080131724, + "grad_norm": 2.9754478931427, + "learning_rate": 3.5039104738854635e-05, + "loss": 0.4041, + "step": 11056 + }, + { + "epoch": 0.6069154774972557, + "grad_norm": 2.4192192554473877, + "learning_rate": 3.503430923943066e-05, + "loss": 0.2778, + "step": 11058 + }, + { + "epoch": 0.6070252469813392, + "grad_norm": 1.9849494695663452, + "learning_rate": 3.502951329985264e-05, + "loss": 0.2721, + "step": 11060 + }, + { + "epoch": 0.6071350164654226, + "grad_norm": 1.6995259523391724, + "learning_rate": 3.502471692033094e-05, + "loss": 0.3369, + "step": 11062 + }, + { + "epoch": 0.607244785949506, + "grad_norm": 1.4340404272079468, + "learning_rate": 3.501992010107594e-05, + "loss": 0.2756, + "step": 11064 + }, + { + "epoch": 0.6073545554335895, + "grad_norm": 1.4955871105194092, + "learning_rate": 3.501512284229807e-05, + "loss": 0.2489, + "step": 11066 + }, + { + "epoch": 0.6074643249176729, + "grad_norm": 1.706051230430603, + "learning_rate": 3.501032514420776e-05, + "loss": 0.3024, + "step": 11068 + }, + { + "epoch": 0.6075740944017564, + "grad_norm": 1.7405885457992554, + "learning_rate": 3.5005527007015455e-05, + "loss": 0.42, + "step": 11070 + }, + { + "epoch": 0.6076838638858397, + "grad_norm": 1.310171365737915, + "learning_rate": 3.500072843093162e-05, + "loss": 0.2227, + "step": 11072 + }, + { + "epoch": 0.6077936333699232, + "grad_norm": 1.5316340923309326, + "learning_rate": 3.4995929416166756e-05, + "loss": 0.244, + "step": 11074 + }, + { + "epoch": 0.6079034028540066, + "grad_norm": 1.6957083940505981, + "learning_rate": 3.4991129962931365e-05, + "loss": 0.2448, + "step": 11076 + }, + { + "epoch": 0.60801317233809, + "grad_norm": 1.5849864482879639, + "learning_rate": 3.498633007143596e-05, + "loss": 0.2525, + "step": 11078 + }, + { + "epoch": 0.6081229418221734, + "grad_norm": 1.6515547037124634, + "learning_rate": 3.4981529741891117e-05, + "loss": 0.3119, + "step": 11080 + }, + { + "epoch": 0.6082327113062569, + "grad_norm": 1.2858664989471436, + "learning_rate": 3.4976728974507384e-05, + "loss": 0.2938, + "step": 11082 + }, + { + "epoch": 0.6083424807903403, + "grad_norm": 1.6177762746810913, + "learning_rate": 3.4971927769495345e-05, + "loss": 0.3531, + "step": 11084 + }, + { + "epoch": 0.6084522502744237, + "grad_norm": 1.2895572185516357, + "learning_rate": 3.496712612706561e-05, + "loss": 0.2024, + "step": 11086 + }, + { + "epoch": 0.6085620197585071, + "grad_norm": 1.9505889415740967, + "learning_rate": 3.496232404742881e-05, + "loss": 0.3134, + "step": 11088 + }, + { + "epoch": 0.6086717892425906, + "grad_norm": 3.274311065673828, + "learning_rate": 3.495752153079557e-05, + "loss": 0.4672, + "step": 11090 + }, + { + "epoch": 0.6087815587266739, + "grad_norm": 1.2475345134735107, + "learning_rate": 3.495271857737657e-05, + "loss": 0.2449, + "step": 11092 + }, + { + "epoch": 0.6088913282107574, + "grad_norm": 1.8166966438293457, + "learning_rate": 3.494791518738247e-05, + "loss": 0.2512, + "step": 11094 + }, + { + "epoch": 0.6090010976948408, + "grad_norm": 1.7667336463928223, + "learning_rate": 3.494311136102399e-05, + "loss": 0.2468, + "step": 11096 + }, + { + "epoch": 0.6091108671789243, + "grad_norm": 1.8069961071014404, + "learning_rate": 3.4938307098511846e-05, + "loss": 0.4209, + "step": 11098 + }, + { + "epoch": 0.6092206366630076, + "grad_norm": 7.187271595001221, + "learning_rate": 3.4933502400056763e-05, + "loss": 0.1771, + "step": 11100 + }, + { + "epoch": 0.6093304061470911, + "grad_norm": 1.1383576393127441, + "learning_rate": 3.4928697265869515e-05, + "loss": 0.2024, + "step": 11102 + }, + { + "epoch": 0.6094401756311746, + "grad_norm": 2.0074775218963623, + "learning_rate": 3.492389169616087e-05, + "loss": 0.2927, + "step": 11104 + }, + { + "epoch": 0.609549945115258, + "grad_norm": 2.5830917358398438, + "learning_rate": 3.491908569114164e-05, + "loss": 0.3739, + "step": 11106 + }, + { + "epoch": 0.6096597145993414, + "grad_norm": 1.7583391666412354, + "learning_rate": 3.49142792510226e-05, + "loss": 0.2802, + "step": 11108 + }, + { + "epoch": 0.6097694840834248, + "grad_norm": 2.1043033599853516, + "learning_rate": 3.490947237601462e-05, + "loss": 0.4535, + "step": 11110 + }, + { + "epoch": 0.6098792535675083, + "grad_norm": 2.006622552871704, + "learning_rate": 3.4904665066328544e-05, + "loss": 0.1783, + "step": 11112 + }, + { + "epoch": 0.6099890230515916, + "grad_norm": 1.4877676963806152, + "learning_rate": 3.489985732217525e-05, + "loss": 0.357, + "step": 11114 + }, + { + "epoch": 0.6100987925356751, + "grad_norm": 1.2576507329940796, + "learning_rate": 3.489504914376561e-05, + "loss": 0.2843, + "step": 11116 + }, + { + "epoch": 0.6102085620197585, + "grad_norm": 2.293639898300171, + "learning_rate": 3.489024053131056e-05, + "loss": 0.2998, + "step": 11118 + }, + { + "epoch": 0.610318331503842, + "grad_norm": 1.3724260330200195, + "learning_rate": 3.488543148502101e-05, + "loss": 0.2529, + "step": 11120 + }, + { + "epoch": 0.6104281009879253, + "grad_norm": 1.792240023612976, + "learning_rate": 3.488062200510791e-05, + "loss": 0.2067, + "step": 11122 + }, + { + "epoch": 0.6105378704720088, + "grad_norm": 1.8356249332427979, + "learning_rate": 3.487581209178224e-05, + "loss": 0.2074, + "step": 11124 + }, + { + "epoch": 0.6106476399560922, + "grad_norm": 1.4529778957366943, + "learning_rate": 3.487100174525498e-05, + "loss": 0.2677, + "step": 11126 + }, + { + "epoch": 0.6107574094401756, + "grad_norm": 1.6959363222122192, + "learning_rate": 3.486619096573712e-05, + "loss": 0.3435, + "step": 11128 + }, + { + "epoch": 0.610867178924259, + "grad_norm": 1.5873736143112183, + "learning_rate": 3.486137975343971e-05, + "loss": 0.3712, + "step": 11130 + }, + { + "epoch": 0.6109769484083425, + "grad_norm": 1.2558101415634155, + "learning_rate": 3.485656810857378e-05, + "loss": 0.2009, + "step": 11132 + }, + { + "epoch": 0.6110867178924259, + "grad_norm": 2.7660140991210938, + "learning_rate": 3.4851756031350394e-05, + "loss": 0.2888, + "step": 11134 + }, + { + "epoch": 0.6111964873765093, + "grad_norm": 2.245230197906494, + "learning_rate": 3.484694352198063e-05, + "loss": 0.3363, + "step": 11136 + }, + { + "epoch": 0.6113062568605927, + "grad_norm": 2.6801199913024902, + "learning_rate": 3.484213058067559e-05, + "loss": 0.3693, + "step": 11138 + }, + { + "epoch": 0.6114160263446762, + "grad_norm": 1.7448500394821167, + "learning_rate": 3.483731720764639e-05, + "loss": 0.3229, + "step": 11140 + }, + { + "epoch": 0.6115257958287597, + "grad_norm": 2.5733234882354736, + "learning_rate": 3.483250340310418e-05, + "loss": 0.2129, + "step": 11142 + }, + { + "epoch": 0.611635565312843, + "grad_norm": 1.1610713005065918, + "learning_rate": 3.482768916726012e-05, + "loss": 0.2542, + "step": 11144 + }, + { + "epoch": 0.6117453347969265, + "grad_norm": 1.9487980604171753, + "learning_rate": 3.482287450032536e-05, + "loss": 0.2398, + "step": 11146 + }, + { + "epoch": 0.6118551042810099, + "grad_norm": 1.6835441589355469, + "learning_rate": 3.481805940251112e-05, + "loss": 0.3745, + "step": 11148 + }, + { + "epoch": 0.6119648737650933, + "grad_norm": 1.5696393251419067, + "learning_rate": 3.48132438740286e-05, + "loss": 0.2577, + "step": 11150 + }, + { + "epoch": 0.6120746432491767, + "grad_norm": 1.2347115278244019, + "learning_rate": 3.480842791508904e-05, + "loss": 0.2589, + "step": 11152 + }, + { + "epoch": 0.6121844127332602, + "grad_norm": 1.5203670263290405, + "learning_rate": 3.4803611525903685e-05, + "loss": 0.3085, + "step": 11154 + }, + { + "epoch": 0.6122941822173436, + "grad_norm": 2.265145778656006, + "learning_rate": 3.479879470668381e-05, + "loss": 0.3013, + "step": 11156 + }, + { + "epoch": 0.612403951701427, + "grad_norm": 2.112783432006836, + "learning_rate": 3.479397745764071e-05, + "loss": 0.3556, + "step": 11158 + }, + { + "epoch": 0.6125137211855104, + "grad_norm": 1.071569561958313, + "learning_rate": 3.4789159778985685e-05, + "loss": 0.19, + "step": 11160 + }, + { + "epoch": 0.6126234906695939, + "grad_norm": 1.5126649141311646, + "learning_rate": 3.4784341670930065e-05, + "loss": 0.4219, + "step": 11162 + }, + { + "epoch": 0.6127332601536772, + "grad_norm": 1.2325947284698486, + "learning_rate": 3.477952313368519e-05, + "loss": 0.2921, + "step": 11164 + }, + { + "epoch": 0.6128430296377607, + "grad_norm": 1.407361388206482, + "learning_rate": 3.4774704167462434e-05, + "loss": 0.2538, + "step": 11166 + }, + { + "epoch": 0.6129527991218441, + "grad_norm": 1.2478935718536377, + "learning_rate": 3.4769884772473174e-05, + "loss": 0.3857, + "step": 11168 + }, + { + "epoch": 0.6130625686059276, + "grad_norm": 1.6428751945495605, + "learning_rate": 3.4765064948928814e-05, + "loss": 0.3162, + "step": 11170 + }, + { + "epoch": 0.6131723380900109, + "grad_norm": 1.5804071426391602, + "learning_rate": 3.476024469704078e-05, + "loss": 0.275, + "step": 11172 + }, + { + "epoch": 0.6132821075740944, + "grad_norm": 1.4605246782302856, + "learning_rate": 3.47554240170205e-05, + "loss": 0.2489, + "step": 11174 + }, + { + "epoch": 0.6133918770581779, + "grad_norm": 2.238041877746582, + "learning_rate": 3.475060290907944e-05, + "loss": 0.2822, + "step": 11176 + }, + { + "epoch": 0.6135016465422612, + "grad_norm": 1.9234164953231812, + "learning_rate": 3.474578137342909e-05, + "loss": 0.282, + "step": 11178 + }, + { + "epoch": 0.6136114160263447, + "grad_norm": 1.367498755455017, + "learning_rate": 3.4740959410280926e-05, + "loss": 0.31, + "step": 11180 + }, + { + "epoch": 0.6137211855104281, + "grad_norm": 1.360995888710022, + "learning_rate": 3.4736137019846465e-05, + "loss": 0.26, + "step": 11182 + }, + { + "epoch": 0.6138309549945116, + "grad_norm": 1.3022894859313965, + "learning_rate": 3.473131420233725e-05, + "loss": 0.2353, + "step": 11184 + }, + { + "epoch": 0.6139407244785949, + "grad_norm": 1.1427127122879028, + "learning_rate": 3.4726490957964834e-05, + "loss": 0.2588, + "step": 11186 + }, + { + "epoch": 0.6140504939626784, + "grad_norm": 1.2912194728851318, + "learning_rate": 3.472166728694078e-05, + "loss": 0.2194, + "step": 11188 + }, + { + "epoch": 0.6141602634467618, + "grad_norm": 1.0980240106582642, + "learning_rate": 3.4716843189476687e-05, + "loss": 0.2766, + "step": 11190 + }, + { + "epoch": 0.6142700329308453, + "grad_norm": 1.7770130634307861, + "learning_rate": 3.471201866578416e-05, + "loss": 0.2639, + "step": 11192 + }, + { + "epoch": 0.6143798024149286, + "grad_norm": 2.2963790893554688, + "learning_rate": 3.4707193716074816e-05, + "loss": 0.2557, + "step": 11194 + }, + { + "epoch": 0.6144895718990121, + "grad_norm": 1.793613076210022, + "learning_rate": 3.470236834056032e-05, + "loss": 0.355, + "step": 11196 + }, + { + "epoch": 0.6145993413830955, + "grad_norm": 1.2777631282806396, + "learning_rate": 3.469754253945232e-05, + "loss": 0.3397, + "step": 11198 + }, + { + "epoch": 0.6147091108671789, + "grad_norm": 1.7891290187835693, + "learning_rate": 3.4692716312962504e-05, + "loss": 0.1913, + "step": 11200 + }, + { + "epoch": 0.6148188803512623, + "grad_norm": 1.6975538730621338, + "learning_rate": 3.4687889661302576e-05, + "loss": 0.2868, + "step": 11202 + }, + { + "epoch": 0.6149286498353458, + "grad_norm": 1.2284729480743408, + "learning_rate": 3.468306258468426e-05, + "loss": 0.3216, + "step": 11204 + }, + { + "epoch": 0.6150384193194292, + "grad_norm": 2.0081260204315186, + "learning_rate": 3.4678235083319296e-05, + "loss": 0.3773, + "step": 11206 + }, + { + "epoch": 0.6151481888035126, + "grad_norm": 1.3544796705245972, + "learning_rate": 3.467340715741943e-05, + "loss": 0.2969, + "step": 11208 + }, + { + "epoch": 0.615257958287596, + "grad_norm": 1.6562072038650513, + "learning_rate": 3.466857880719645e-05, + "loss": 0.3137, + "step": 11210 + }, + { + "epoch": 0.6153677277716795, + "grad_norm": 2.145965814590454, + "learning_rate": 3.466375003286214e-05, + "loss": 0.3258, + "step": 11212 + }, + { + "epoch": 0.615477497255763, + "grad_norm": 1.4171181917190552, + "learning_rate": 3.4658920834628335e-05, + "loss": 0.3115, + "step": 11214 + }, + { + "epoch": 0.6155872667398463, + "grad_norm": 1.2929754257202148, + "learning_rate": 3.465409121270684e-05, + "loss": 0.1921, + "step": 11216 + }, + { + "epoch": 0.6156970362239298, + "grad_norm": 1.6665719747543335, + "learning_rate": 3.4649261167309526e-05, + "loss": 0.2615, + "step": 11218 + }, + { + "epoch": 0.6158068057080132, + "grad_norm": 1.902171015739441, + "learning_rate": 3.464443069864826e-05, + "loss": 0.3914, + "step": 11220 + }, + { + "epoch": 0.6159165751920966, + "grad_norm": 1.7312859296798706, + "learning_rate": 3.463959980693492e-05, + "loss": 0.3587, + "step": 11222 + }, + { + "epoch": 0.61602634467618, + "grad_norm": 1.423956274986267, + "learning_rate": 3.463476849238142e-05, + "loss": 0.3357, + "step": 11224 + }, + { + "epoch": 0.6161361141602635, + "grad_norm": 1.2441866397857666, + "learning_rate": 3.462993675519968e-05, + "loss": 0.376, + "step": 11226 + }, + { + "epoch": 0.6162458836443468, + "grad_norm": 1.6463944911956787, + "learning_rate": 3.462510459560165e-05, + "loss": 0.2827, + "step": 11228 + }, + { + "epoch": 0.6163556531284303, + "grad_norm": 1.5615649223327637, + "learning_rate": 3.4620272013799286e-05, + "loss": 0.3535, + "step": 11230 + }, + { + "epoch": 0.6164654226125137, + "grad_norm": 2.3096518516540527, + "learning_rate": 3.461543901000458e-05, + "loss": 0.2609, + "step": 11232 + }, + { + "epoch": 0.6165751920965972, + "grad_norm": 2.771669626235962, + "learning_rate": 3.461060558442952e-05, + "loss": 0.2819, + "step": 11234 + }, + { + "epoch": 0.6166849615806805, + "grad_norm": 1.6168054342269897, + "learning_rate": 3.460577173728613e-05, + "loss": 0.3196, + "step": 11236 + }, + { + "epoch": 0.616794731064764, + "grad_norm": 1.502946138381958, + "learning_rate": 3.460093746878644e-05, + "loss": 0.2563, + "step": 11238 + }, + { + "epoch": 0.6169045005488474, + "grad_norm": 1.4421086311340332, + "learning_rate": 3.459610277914251e-05, + "loss": 0.2041, + "step": 11240 + }, + { + "epoch": 0.6170142700329309, + "grad_norm": 1.5602357387542725, + "learning_rate": 3.459126766856641e-05, + "loss": 0.263, + "step": 11242 + }, + { + "epoch": 0.6171240395170142, + "grad_norm": 1.901756763458252, + "learning_rate": 3.458643213727023e-05, + "loss": 0.2868, + "step": 11244 + }, + { + "epoch": 0.6172338090010977, + "grad_norm": 1.2873172760009766, + "learning_rate": 3.4581596185466094e-05, + "loss": 0.2353, + "step": 11246 + }, + { + "epoch": 0.6173435784851811, + "grad_norm": 1.245668888092041, + "learning_rate": 3.457675981336611e-05, + "loss": 0.1972, + "step": 11248 + }, + { + "epoch": 0.6174533479692645, + "grad_norm": 1.1076146364212036, + "learning_rate": 3.457192302118244e-05, + "loss": 0.2348, + "step": 11250 + }, + { + "epoch": 0.617563117453348, + "grad_norm": 1.629406452178955, + "learning_rate": 3.456708580912725e-05, + "loss": 0.2701, + "step": 11252 + }, + { + "epoch": 0.6176728869374314, + "grad_norm": 1.3071256875991821, + "learning_rate": 3.4562248177412715e-05, + "loss": 0.2627, + "step": 11254 + }, + { + "epoch": 0.6177826564215149, + "grad_norm": 2.070894956588745, + "learning_rate": 3.4557410126251036e-05, + "loss": 0.3129, + "step": 11256 + }, + { + "epoch": 0.6178924259055982, + "grad_norm": 2.2276992797851562, + "learning_rate": 3.455257165585444e-05, + "loss": 0.3674, + "step": 11258 + }, + { + "epoch": 0.6180021953896817, + "grad_norm": 0.8703026175498962, + "learning_rate": 3.454773276643516e-05, + "loss": 0.275, + "step": 11260 + }, + { + "epoch": 0.6181119648737651, + "grad_norm": 1.229760766029358, + "learning_rate": 3.454289345820546e-05, + "loss": 0.1991, + "step": 11262 + }, + { + "epoch": 0.6182217343578486, + "grad_norm": 1.418660283088684, + "learning_rate": 3.453805373137762e-05, + "loss": 0.2682, + "step": 11264 + }, + { + "epoch": 0.6183315038419319, + "grad_norm": 1.736142635345459, + "learning_rate": 3.453321358616393e-05, + "loss": 0.2415, + "step": 11266 + }, + { + "epoch": 0.6184412733260154, + "grad_norm": 1.6385972499847412, + "learning_rate": 3.452837302277668e-05, + "loss": 0.3079, + "step": 11268 + }, + { + "epoch": 0.6185510428100988, + "grad_norm": 5.270626068115234, + "learning_rate": 3.452353204142824e-05, + "loss": 0.279, + "step": 11270 + }, + { + "epoch": 0.6186608122941822, + "grad_norm": 1.6298435926437378, + "learning_rate": 3.451869064233094e-05, + "loss": 0.2508, + "step": 11272 + }, + { + "epoch": 0.6187705817782656, + "grad_norm": 1.5939037799835205, + "learning_rate": 3.451384882569714e-05, + "loss": 0.357, + "step": 11274 + }, + { + "epoch": 0.6188803512623491, + "grad_norm": 1.1952507495880127, + "learning_rate": 3.450900659173924e-05, + "loss": 0.238, + "step": 11276 + }, + { + "epoch": 0.6189901207464324, + "grad_norm": 1.186437964439392, + "learning_rate": 3.4504163940669634e-05, + "loss": 0.2942, + "step": 11278 + }, + { + "epoch": 0.6190998902305159, + "grad_norm": 2.9663431644439697, + "learning_rate": 3.4499320872700754e-05, + "loss": 0.2861, + "step": 11280 + }, + { + "epoch": 0.6192096597145993, + "grad_norm": 1.179264783859253, + "learning_rate": 3.4494477388045035e-05, + "loss": 0.2557, + "step": 11282 + }, + { + "epoch": 0.6193194291986828, + "grad_norm": 1.3796902894973755, + "learning_rate": 3.448963348691493e-05, + "loss": 0.2342, + "step": 11284 + }, + { + "epoch": 0.6194291986827661, + "grad_norm": 1.7045315504074097, + "learning_rate": 3.4484789169522927e-05, + "loss": 0.259, + "step": 11286 + }, + { + "epoch": 0.6195389681668496, + "grad_norm": 0.962422788143158, + "learning_rate": 3.447994443608151e-05, + "loss": 0.1339, + "step": 11288 + }, + { + "epoch": 0.6196487376509331, + "grad_norm": 1.0848796367645264, + "learning_rate": 3.44750992868032e-05, + "loss": 0.2345, + "step": 11290 + }, + { + "epoch": 0.6197585071350165, + "grad_norm": 1.2213809490203857, + "learning_rate": 3.447025372190054e-05, + "loss": 0.2085, + "step": 11292 + }, + { + "epoch": 0.6198682766190999, + "grad_norm": 2.201253890991211, + "learning_rate": 3.4465407741586056e-05, + "loss": 0.2797, + "step": 11294 + }, + { + "epoch": 0.6199780461031833, + "grad_norm": 1.8875738382339478, + "learning_rate": 3.446056134607234e-05, + "loss": 0.3012, + "step": 11296 + }, + { + "epoch": 0.6200878155872668, + "grad_norm": 3.2076642513275146, + "learning_rate": 3.445571453557196e-05, + "loss": 0.2159, + "step": 11298 + }, + { + "epoch": 0.6201975850713501, + "grad_norm": 1.349614143371582, + "learning_rate": 3.445086731029753e-05, + "loss": 0.2505, + "step": 11300 + }, + { + "epoch": 0.6203073545554336, + "grad_norm": 1.9307628870010376, + "learning_rate": 3.444601967046168e-05, + "loss": 0.431, + "step": 11302 + }, + { + "epoch": 0.620417124039517, + "grad_norm": 1.0675153732299805, + "learning_rate": 3.444117161627704e-05, + "loss": 0.1882, + "step": 11304 + }, + { + "epoch": 0.6205268935236005, + "grad_norm": 1.5391544103622437, + "learning_rate": 3.443632314795627e-05, + "loss": 0.3503, + "step": 11306 + }, + { + "epoch": 0.6206366630076838, + "grad_norm": 1.8463374376296997, + "learning_rate": 3.4431474265712046e-05, + "loss": 0.3497, + "step": 11308 + }, + { + "epoch": 0.6207464324917673, + "grad_norm": 1.0697907209396362, + "learning_rate": 3.4426624969757083e-05, + "loss": 0.32, + "step": 11310 + }, + { + "epoch": 0.6208562019758507, + "grad_norm": 2.673978805541992, + "learning_rate": 3.442177526030407e-05, + "loss": 0.4313, + "step": 11312 + }, + { + "epoch": 0.6209659714599342, + "grad_norm": 2.175520658493042, + "learning_rate": 3.4416925137565754e-05, + "loss": 0.2676, + "step": 11314 + }, + { + "epoch": 0.6210757409440175, + "grad_norm": 2.5551252365112305, + "learning_rate": 3.441207460175488e-05, + "loss": 0.4568, + "step": 11316 + }, + { + "epoch": 0.621185510428101, + "grad_norm": 1.9428786039352417, + "learning_rate": 3.440722365308421e-05, + "loss": 0.2833, + "step": 11318 + }, + { + "epoch": 0.6212952799121844, + "grad_norm": 1.0920673608779907, + "learning_rate": 3.440237229176654e-05, + "loss": 0.2994, + "step": 11320 + }, + { + "epoch": 0.6214050493962678, + "grad_norm": 1.7212493419647217, + "learning_rate": 3.439752051801467e-05, + "loss": 0.3379, + "step": 11322 + }, + { + "epoch": 0.6215148188803513, + "grad_norm": 0.9977014660835266, + "learning_rate": 3.439266833204143e-05, + "loss": 0.248, + "step": 11324 + }, + { + "epoch": 0.6216245883644347, + "grad_norm": 2.5080959796905518, + "learning_rate": 3.4387815734059654e-05, + "loss": 0.3765, + "step": 11326 + }, + { + "epoch": 0.6217343578485182, + "grad_norm": 1.7446403503417969, + "learning_rate": 3.4382962724282195e-05, + "loss": 0.3416, + "step": 11328 + }, + { + "epoch": 0.6218441273326015, + "grad_norm": 1.2038068771362305, + "learning_rate": 3.437810930292195e-05, + "loss": 0.1633, + "step": 11330 + }, + { + "epoch": 0.621953896816685, + "grad_norm": 1.5202490091323853, + "learning_rate": 3.437325547019179e-05, + "loss": 0.2521, + "step": 11332 + }, + { + "epoch": 0.6220636663007684, + "grad_norm": 2.3765742778778076, + "learning_rate": 3.436840122630464e-05, + "loss": 0.2859, + "step": 11334 + }, + { + "epoch": 0.6221734357848518, + "grad_norm": 1.4779284000396729, + "learning_rate": 3.436354657147343e-05, + "loss": 0.2461, + "step": 11336 + }, + { + "epoch": 0.6222832052689352, + "grad_norm": 1.3973312377929688, + "learning_rate": 3.4358691505911104e-05, + "loss": 0.3724, + "step": 11338 + }, + { + "epoch": 0.6223929747530187, + "grad_norm": 1.8128467798233032, + "learning_rate": 3.435383602983064e-05, + "loss": 0.2716, + "step": 11340 + }, + { + "epoch": 0.6225027442371021, + "grad_norm": 3.4604909420013428, + "learning_rate": 3.434898014344501e-05, + "loss": 0.404, + "step": 11342 + }, + { + "epoch": 0.6226125137211855, + "grad_norm": 1.4914791584014893, + "learning_rate": 3.434412384696723e-05, + "loss": 0.3234, + "step": 11344 + }, + { + "epoch": 0.6227222832052689, + "grad_norm": 2.0087087154388428, + "learning_rate": 3.433926714061032e-05, + "loss": 0.2209, + "step": 11346 + }, + { + "epoch": 0.6228320526893524, + "grad_norm": 2.621054172515869, + "learning_rate": 3.43344100245873e-05, + "loss": 0.3236, + "step": 11348 + }, + { + "epoch": 0.6229418221734357, + "grad_norm": 0.8437867164611816, + "learning_rate": 3.432955249911125e-05, + "loss": 0.2043, + "step": 11350 + }, + { + "epoch": 0.6230515916575192, + "grad_norm": 2.3408877849578857, + "learning_rate": 3.432469456439523e-05, + "loss": 0.2222, + "step": 11352 + }, + { + "epoch": 0.6231613611416026, + "grad_norm": 1.159508228302002, + "learning_rate": 3.4319836220652335e-05, + "loss": 0.208, + "step": 11354 + }, + { + "epoch": 0.6232711306256861, + "grad_norm": 3.3371918201446533, + "learning_rate": 3.4314977468095685e-05, + "loss": 0.2952, + "step": 11356 + }, + { + "epoch": 0.6233809001097694, + "grad_norm": 1.4851781129837036, + "learning_rate": 3.43101183069384e-05, + "loss": 0.374, + "step": 11358 + }, + { + "epoch": 0.6234906695938529, + "grad_norm": 4.035576820373535, + "learning_rate": 3.430525873739363e-05, + "loss": 0.3062, + "step": 11360 + }, + { + "epoch": 0.6236004390779364, + "grad_norm": 1.8993300199508667, + "learning_rate": 3.430039875967454e-05, + "loss": 0.3286, + "step": 11362 + }, + { + "epoch": 0.6237102085620198, + "grad_norm": 1.4824793338775635, + "learning_rate": 3.4295538373994314e-05, + "loss": 0.2548, + "step": 11364 + }, + { + "epoch": 0.6238199780461032, + "grad_norm": 1.7661354541778564, + "learning_rate": 3.429067758056613e-05, + "loss": 0.3387, + "step": 11366 + }, + { + "epoch": 0.6239297475301866, + "grad_norm": 2.952207088470459, + "learning_rate": 3.428581637960325e-05, + "loss": 0.3207, + "step": 11368 + }, + { + "epoch": 0.6240395170142701, + "grad_norm": 2.08144474029541, + "learning_rate": 3.428095477131888e-05, + "loss": 0.41, + "step": 11370 + }, + { + "epoch": 0.6241492864983534, + "grad_norm": 1.6289215087890625, + "learning_rate": 3.427609275592627e-05, + "loss": 0.2204, + "step": 11372 + }, + { + "epoch": 0.6242590559824369, + "grad_norm": 2.1817729473114014, + "learning_rate": 3.4271230333638716e-05, + "loss": 0.3722, + "step": 11374 + }, + { + "epoch": 0.6243688254665203, + "grad_norm": 3.7737417221069336, + "learning_rate": 3.426636750466949e-05, + "loss": 0.4152, + "step": 11376 + }, + { + "epoch": 0.6244785949506038, + "grad_norm": 1.4663549661636353, + "learning_rate": 3.4261504269231904e-05, + "loss": 0.2683, + "step": 11378 + }, + { + "epoch": 0.6245883644346871, + "grad_norm": 1.5758470296859741, + "learning_rate": 3.4256640627539276e-05, + "loss": 0.3725, + "step": 11380 + }, + { + "epoch": 0.6246981339187706, + "grad_norm": 1.3177751302719116, + "learning_rate": 3.425177657980496e-05, + "loss": 0.1833, + "step": 11382 + }, + { + "epoch": 0.624807903402854, + "grad_norm": 1.4622468948364258, + "learning_rate": 3.424691212624232e-05, + "loss": 0.3502, + "step": 11384 + }, + { + "epoch": 0.6249176728869374, + "grad_norm": 1.2588471174240112, + "learning_rate": 3.4242047267064715e-05, + "loss": 0.2606, + "step": 11386 + }, + { + "epoch": 0.6250274423710208, + "grad_norm": 1.378042459487915, + "learning_rate": 3.4237182002485556e-05, + "loss": 0.3426, + "step": 11388 + }, + { + "epoch": 0.6251372118551043, + "grad_norm": 1.570520281791687, + "learning_rate": 3.423231633271826e-05, + "loss": 0.3051, + "step": 11390 + }, + { + "epoch": 0.6252469813391877, + "grad_norm": 1.1454633474349976, + "learning_rate": 3.422745025797626e-05, + "loss": 0.2548, + "step": 11392 + }, + { + "epoch": 0.6253567508232711, + "grad_norm": 1.0235137939453125, + "learning_rate": 3.4222583778472996e-05, + "loss": 0.1986, + "step": 11394 + }, + { + "epoch": 0.6254665203073545, + "grad_norm": 1.542881965637207, + "learning_rate": 3.421771689442193e-05, + "loss": 0.319, + "step": 11396 + }, + { + "epoch": 0.625576289791438, + "grad_norm": 2.349393367767334, + "learning_rate": 3.421284960603657e-05, + "loss": 0.2923, + "step": 11398 + }, + { + "epoch": 0.6256860592755215, + "grad_norm": 2.201943874359131, + "learning_rate": 3.4207981913530404e-05, + "loss": 0.396, + "step": 11400 + }, + { + "epoch": 0.6257958287596048, + "grad_norm": 1.7032853364944458, + "learning_rate": 3.4203113817116957e-05, + "loss": 0.3252, + "step": 11402 + }, + { + "epoch": 0.6259055982436883, + "grad_norm": 1.3819714784622192, + "learning_rate": 3.4198245317009755e-05, + "loss": 0.3332, + "step": 11404 + }, + { + "epoch": 0.6260153677277717, + "grad_norm": 3.5808944702148438, + "learning_rate": 3.419337641342239e-05, + "loss": 0.2798, + "step": 11406 + }, + { + "epoch": 0.6261251372118551, + "grad_norm": 1.181855320930481, + "learning_rate": 3.4188507106568385e-05, + "loss": 0.2521, + "step": 11408 + }, + { + "epoch": 0.6262349066959385, + "grad_norm": 1.559135913848877, + "learning_rate": 3.418363739666137e-05, + "loss": 0.2837, + "step": 11410 + }, + { + "epoch": 0.626344676180022, + "grad_norm": 3.1315531730651855, + "learning_rate": 3.417876728391495e-05, + "loss": 0.3051, + "step": 11412 + }, + { + "epoch": 0.6264544456641054, + "grad_norm": 1.4146379232406616, + "learning_rate": 3.417389676854274e-05, + "loss": 0.2504, + "step": 11414 + }, + { + "epoch": 0.6265642151481888, + "grad_norm": 1.6544198989868164, + "learning_rate": 3.416902585075838e-05, + "loss": 0.335, + "step": 11416 + }, + { + "epoch": 0.6266739846322722, + "grad_norm": 1.370880126953125, + "learning_rate": 3.416415453077555e-05, + "loss": 0.4223, + "step": 11418 + }, + { + "epoch": 0.6267837541163557, + "grad_norm": 1.1104141473770142, + "learning_rate": 3.415928280880792e-05, + "loss": 0.3386, + "step": 11420 + }, + { + "epoch": 0.626893523600439, + "grad_norm": 2.3731582164764404, + "learning_rate": 3.4154410685069196e-05, + "loss": 0.2899, + "step": 11422 + }, + { + "epoch": 0.6270032930845225, + "grad_norm": 1.8379409313201904, + "learning_rate": 3.414953815977309e-05, + "loss": 0.2597, + "step": 11424 + }, + { + "epoch": 0.6271130625686059, + "grad_norm": 1.2714283466339111, + "learning_rate": 3.414466523313332e-05, + "loss": 0.2804, + "step": 11426 + }, + { + "epoch": 0.6272228320526894, + "grad_norm": 1.320778489112854, + "learning_rate": 3.4139791905363644e-05, + "loss": 0.1613, + "step": 11428 + }, + { + "epoch": 0.6273326015367727, + "grad_norm": 1.928603172302246, + "learning_rate": 3.4134918176677846e-05, + "loss": 0.2708, + "step": 11430 + }, + { + "epoch": 0.6274423710208562, + "grad_norm": 4.815691947937012, + "learning_rate": 3.41300440472897e-05, + "loss": 0.3205, + "step": 11432 + }, + { + "epoch": 0.6275521405049397, + "grad_norm": 1.058475136756897, + "learning_rate": 3.4125169517413e-05, + "loss": 0.2939, + "step": 11434 + }, + { + "epoch": 0.627661909989023, + "grad_norm": 2.478600263595581, + "learning_rate": 3.4120294587261586e-05, + "loss": 0.2881, + "step": 11436 + }, + { + "epoch": 0.6277716794731065, + "grad_norm": 2.072955369949341, + "learning_rate": 3.4115419257049286e-05, + "loss": 0.274, + "step": 11438 + }, + { + "epoch": 0.6278814489571899, + "grad_norm": 1.7688827514648438, + "learning_rate": 3.4110543526989946e-05, + "loss": 0.3643, + "step": 11440 + }, + { + "epoch": 0.6279912184412734, + "grad_norm": 1.2499971389770508, + "learning_rate": 3.410566739729746e-05, + "loss": 0.2889, + "step": 11442 + }, + { + "epoch": 0.6281009879253567, + "grad_norm": 2.6034672260284424, + "learning_rate": 3.410079086818571e-05, + "loss": 0.2905, + "step": 11444 + }, + { + "epoch": 0.6282107574094402, + "grad_norm": 1.3416608572006226, + "learning_rate": 3.409591393986859e-05, + "loss": 0.3396, + "step": 11446 + }, + { + "epoch": 0.6283205268935236, + "grad_norm": 1.4311747550964355, + "learning_rate": 3.4091036612560046e-05, + "loss": 0.2282, + "step": 11448 + }, + { + "epoch": 0.6284302963776071, + "grad_norm": 1.3604165315628052, + "learning_rate": 3.408615888647402e-05, + "loss": 0.2701, + "step": 11450 + }, + { + "epoch": 0.6285400658616904, + "grad_norm": 3.245046854019165, + "learning_rate": 3.408128076182446e-05, + "loss": 0.3227, + "step": 11452 + }, + { + "epoch": 0.6286498353457739, + "grad_norm": 1.1722625494003296, + "learning_rate": 3.407640223882536e-05, + "loss": 0.298, + "step": 11454 + }, + { + "epoch": 0.6287596048298573, + "grad_norm": 2.0603220462799072, + "learning_rate": 3.407152331769071e-05, + "loss": 0.346, + "step": 11456 + }, + { + "epoch": 0.6288693743139407, + "grad_norm": 0.9156492352485657, + "learning_rate": 3.4066643998634505e-05, + "loss": 0.1405, + "step": 11458 + }, + { + "epoch": 0.6289791437980241, + "grad_norm": 1.9179872274398804, + "learning_rate": 3.406176428187081e-05, + "loss": 0.2614, + "step": 11460 + }, + { + "epoch": 0.6290889132821076, + "grad_norm": 2.0664684772491455, + "learning_rate": 3.405688416761364e-05, + "loss": 0.2706, + "step": 11462 + }, + { + "epoch": 0.629198682766191, + "grad_norm": 2.5928072929382324, + "learning_rate": 3.4052003656077094e-05, + "loss": 0.3281, + "step": 11464 + }, + { + "epoch": 0.6293084522502744, + "grad_norm": 1.7114020586013794, + "learning_rate": 3.4047122747475224e-05, + "loss": 0.3378, + "step": 11466 + }, + { + "epoch": 0.6294182217343578, + "grad_norm": 2.9970285892486572, + "learning_rate": 3.4042241442022154e-05, + "loss": 0.3445, + "step": 11468 + }, + { + "epoch": 0.6295279912184413, + "grad_norm": 2.8112359046936035, + "learning_rate": 3.403735973993198e-05, + "loss": 0.2384, + "step": 11470 + }, + { + "epoch": 0.6296377607025248, + "grad_norm": 3.0685412883758545, + "learning_rate": 3.403247764141886e-05, + "loss": 0.4757, + "step": 11472 + }, + { + "epoch": 0.6297475301866081, + "grad_norm": 1.7445958852767944, + "learning_rate": 3.402759514669694e-05, + "loss": 0.2753, + "step": 11474 + }, + { + "epoch": 0.6298572996706916, + "grad_norm": 1.914095163345337, + "learning_rate": 3.402271225598038e-05, + "loss": 0.2593, + "step": 11476 + }, + { + "epoch": 0.629967069154775, + "grad_norm": 1.889801025390625, + "learning_rate": 3.401782896948338e-05, + "loss": 0.3439, + "step": 11478 + }, + { + "epoch": 0.6300768386388584, + "grad_norm": 1.3607722520828247, + "learning_rate": 3.4012945287420137e-05, + "loss": 0.293, + "step": 11480 + }, + { + "epoch": 0.6301866081229418, + "grad_norm": 1.664551019668579, + "learning_rate": 3.400806121000487e-05, + "loss": 0.1697, + "step": 11482 + }, + { + "epoch": 0.6302963776070253, + "grad_norm": 1.7801074981689453, + "learning_rate": 3.400317673745183e-05, + "loss": 0.3584, + "step": 11484 + }, + { + "epoch": 0.6304061470911086, + "grad_norm": 1.5313748121261597, + "learning_rate": 3.3998291869975266e-05, + "loss": 0.2649, + "step": 11486 + }, + { + "epoch": 0.6305159165751921, + "grad_norm": 2.405061960220337, + "learning_rate": 3.399340660778945e-05, + "loss": 0.2186, + "step": 11488 + }, + { + "epoch": 0.6306256860592755, + "grad_norm": 1.7773147821426392, + "learning_rate": 3.398852095110868e-05, + "loss": 0.3413, + "step": 11490 + }, + { + "epoch": 0.630735455543359, + "grad_norm": 2.085326671600342, + "learning_rate": 3.398363490014727e-05, + "loss": 0.4264, + "step": 11492 + }, + { + "epoch": 0.6308452250274423, + "grad_norm": 1.816078782081604, + "learning_rate": 3.3978748455119536e-05, + "loss": 0.2681, + "step": 11494 + }, + { + "epoch": 0.6309549945115258, + "grad_norm": 1.6118130683898926, + "learning_rate": 3.3973861616239824e-05, + "loss": 0.3928, + "step": 11496 + }, + { + "epoch": 0.6310647639956092, + "grad_norm": 1.4249306917190552, + "learning_rate": 3.3968974383722495e-05, + "loss": 0.3036, + "step": 11498 + }, + { + "epoch": 0.6311745334796927, + "grad_norm": 2.2704720497131348, + "learning_rate": 3.396408675778192e-05, + "loss": 0.4039, + "step": 11500 + }, + { + "epoch": 0.631284302963776, + "grad_norm": 2.41357421875, + "learning_rate": 3.39591987386325e-05, + "loss": 0.2296, + "step": 11502 + }, + { + "epoch": 0.6313940724478595, + "grad_norm": 1.2033264636993408, + "learning_rate": 3.395431032648866e-05, + "loss": 0.3116, + "step": 11504 + }, + { + "epoch": 0.6315038419319429, + "grad_norm": 0.9620398283004761, + "learning_rate": 3.394942152156482e-05, + "loss": 0.2364, + "step": 11506 + }, + { + "epoch": 0.6316136114160263, + "grad_norm": 1.844681978225708, + "learning_rate": 3.394453232407542e-05, + "loss": 0.5063, + "step": 11508 + }, + { + "epoch": 0.6317233809001098, + "grad_norm": 2.760812759399414, + "learning_rate": 3.3939642734234936e-05, + "loss": 0.3183, + "step": 11510 + }, + { + "epoch": 0.6318331503841932, + "grad_norm": 1.7804194688796997, + "learning_rate": 3.3934752752257836e-05, + "loss": 0.295, + "step": 11512 + }, + { + "epoch": 0.6319429198682767, + "grad_norm": 1.3317861557006836, + "learning_rate": 3.392986237835863e-05, + "loss": 0.3103, + "step": 11514 + }, + { + "epoch": 0.63205268935236, + "grad_norm": 2.861865997314453, + "learning_rate": 3.392497161275183e-05, + "loss": 0.2516, + "step": 11516 + }, + { + "epoch": 0.6321624588364435, + "grad_norm": 2.153665781021118, + "learning_rate": 3.392008045565197e-05, + "loss": 0.2425, + "step": 11518 + }, + { + "epoch": 0.6322722283205269, + "grad_norm": 0.9432848691940308, + "learning_rate": 3.391518890727359e-05, + "loss": 0.1931, + "step": 11520 + }, + { + "epoch": 0.6323819978046104, + "grad_norm": 1.7983802556991577, + "learning_rate": 3.3910296967831266e-05, + "loss": 0.2917, + "step": 11522 + }, + { + "epoch": 0.6324917672886937, + "grad_norm": 2.0655105113983154, + "learning_rate": 3.3905404637539595e-05, + "loss": 0.3082, + "step": 11524 + }, + { + "epoch": 0.6326015367727772, + "grad_norm": 1.9642789363861084, + "learning_rate": 3.3900511916613155e-05, + "loss": 0.3577, + "step": 11526 + }, + { + "epoch": 0.6327113062568606, + "grad_norm": 1.5671311616897583, + "learning_rate": 3.389561880526658e-05, + "loss": 0.328, + "step": 11528 + }, + { + "epoch": 0.632821075740944, + "grad_norm": 3.0287206172943115, + "learning_rate": 3.389072530371451e-05, + "loss": 0.2952, + "step": 11530 + }, + { + "epoch": 0.6329308452250274, + "grad_norm": 1.4050416946411133, + "learning_rate": 3.388583141217158e-05, + "loss": 0.2997, + "step": 11532 + }, + { + "epoch": 0.6330406147091109, + "grad_norm": 2.1438608169555664, + "learning_rate": 3.3880937130852466e-05, + "loss": 0.3992, + "step": 11534 + }, + { + "epoch": 0.6331503841931942, + "grad_norm": 1.3196790218353271, + "learning_rate": 3.387604245997187e-05, + "loss": 0.2611, + "step": 11536 + }, + { + "epoch": 0.6332601536772777, + "grad_norm": 1.258182406425476, + "learning_rate": 3.387114739974448e-05, + "loss": 0.2245, + "step": 11538 + }, + { + "epoch": 0.6333699231613611, + "grad_norm": 1.4680277109146118, + "learning_rate": 3.386625195038503e-05, + "loss": 0.4824, + "step": 11540 + }, + { + "epoch": 0.6334796926454446, + "grad_norm": 1.8762915134429932, + "learning_rate": 3.3861356112108247e-05, + "loss": 0.2506, + "step": 11542 + }, + { + "epoch": 0.6335894621295279, + "grad_norm": 2.1573307514190674, + "learning_rate": 3.385645988512889e-05, + "loss": 0.3094, + "step": 11544 + }, + { + "epoch": 0.6336992316136114, + "grad_norm": 1.6725794076919556, + "learning_rate": 3.3851563269661726e-05, + "loss": 0.2428, + "step": 11546 + }, + { + "epoch": 0.6338090010976949, + "grad_norm": 2.821166753768921, + "learning_rate": 3.384666626592156e-05, + "loss": 0.1973, + "step": 11548 + }, + { + "epoch": 0.6339187705817783, + "grad_norm": 2.260936975479126, + "learning_rate": 3.384176887412318e-05, + "loss": 0.3825, + "step": 11550 + }, + { + "epoch": 0.6340285400658617, + "grad_norm": 0.9325798153877258, + "learning_rate": 3.383687109448143e-05, + "loss": 0.2584, + "step": 11552 + }, + { + "epoch": 0.6341383095499451, + "grad_norm": 1.1809606552124023, + "learning_rate": 3.3831972927211135e-05, + "loss": 0.216, + "step": 11554 + }, + { + "epoch": 0.6342480790340286, + "grad_norm": 1.8289271593093872, + "learning_rate": 3.382707437252716e-05, + "loss": 0.3246, + "step": 11556 + }, + { + "epoch": 0.6343578485181119, + "grad_norm": 2.1692960262298584, + "learning_rate": 3.382217543064438e-05, + "loss": 0.1767, + "step": 11558 + }, + { + "epoch": 0.6344676180021954, + "grad_norm": 1.4941108226776123, + "learning_rate": 3.3817276101777676e-05, + "loss": 0.2361, + "step": 11560 + }, + { + "epoch": 0.6345773874862788, + "grad_norm": 1.3282887935638428, + "learning_rate": 3.381237638614196e-05, + "loss": 0.2406, + "step": 11562 + }, + { + "epoch": 0.6346871569703623, + "grad_norm": 0.8487446308135986, + "learning_rate": 3.3807476283952175e-05, + "loss": 0.1596, + "step": 11564 + }, + { + "epoch": 0.6347969264544456, + "grad_norm": 2.1466004848480225, + "learning_rate": 3.380257579542325e-05, + "loss": 0.3042, + "step": 11566 + }, + { + "epoch": 0.6349066959385291, + "grad_norm": 1.3938592672348022, + "learning_rate": 3.379767492077013e-05, + "loss": 0.2469, + "step": 11568 + }, + { + "epoch": 0.6350164654226125, + "grad_norm": 1.9814997911453247, + "learning_rate": 3.379277366020782e-05, + "loss": 0.3697, + "step": 11570 + }, + { + "epoch": 0.635126234906696, + "grad_norm": 1.6296392679214478, + "learning_rate": 3.37878720139513e-05, + "loss": 0.3392, + "step": 11572 + }, + { + "epoch": 0.6352360043907793, + "grad_norm": 1.9931331872940063, + "learning_rate": 3.378296998221557e-05, + "loss": 0.3448, + "step": 11574 + }, + { + "epoch": 0.6353457738748628, + "grad_norm": 1.847790241241455, + "learning_rate": 3.3778067565215684e-05, + "loss": 0.4498, + "step": 11576 + }, + { + "epoch": 0.6354555433589462, + "grad_norm": 0.9250761866569519, + "learning_rate": 3.3773164763166655e-05, + "loss": 0.1654, + "step": 11578 + }, + { + "epoch": 0.6355653128430296, + "grad_norm": 1.545846939086914, + "learning_rate": 3.376826157628356e-05, + "loss": 0.2801, + "step": 11580 + }, + { + "epoch": 0.6356750823271131, + "grad_norm": 1.2584208250045776, + "learning_rate": 3.3763358004781475e-05, + "loss": 0.1845, + "step": 11582 + }, + { + "epoch": 0.6357848518111965, + "grad_norm": 1.5817195177078247, + "learning_rate": 3.37584540488755e-05, + "loss": 0.3864, + "step": 11584 + }, + { + "epoch": 0.63589462129528, + "grad_norm": 1.6775060892105103, + "learning_rate": 3.375354970878073e-05, + "loss": 0.3891, + "step": 11586 + }, + { + "epoch": 0.6360043907793633, + "grad_norm": 1.355834722518921, + "learning_rate": 3.374864498471232e-05, + "loss": 0.2727, + "step": 11588 + }, + { + "epoch": 0.6361141602634468, + "grad_norm": 1.1836148500442505, + "learning_rate": 3.37437398768854e-05, + "loss": 0.2784, + "step": 11590 + }, + { + "epoch": 0.6362239297475302, + "grad_norm": 2.185481309890747, + "learning_rate": 3.373883438551512e-05, + "loss": 0.3274, + "step": 11592 + }, + { + "epoch": 0.6363336992316136, + "grad_norm": 1.4985105991363525, + "learning_rate": 3.373392851081668e-05, + "loss": 0.187, + "step": 11594 + }, + { + "epoch": 0.636443468715697, + "grad_norm": 1.5602763891220093, + "learning_rate": 3.372902225300526e-05, + "loss": 0.3265, + "step": 11596 + }, + { + "epoch": 0.6365532381997805, + "grad_norm": 1.2173686027526855, + "learning_rate": 3.372411561229609e-05, + "loss": 0.2348, + "step": 11598 + }, + { + "epoch": 0.6366630076838639, + "grad_norm": 1.439809799194336, + "learning_rate": 3.3719208588904375e-05, + "loss": 0.3487, + "step": 11600 + }, + { + "epoch": 0.6367727771679473, + "grad_norm": 1.1310036182403564, + "learning_rate": 3.3714301183045385e-05, + "loss": 0.1605, + "step": 11602 + }, + { + "epoch": 0.6368825466520307, + "grad_norm": 1.554064393043518, + "learning_rate": 3.3709393394934364e-05, + "loss": 0.266, + "step": 11604 + }, + { + "epoch": 0.6369923161361142, + "grad_norm": 1.7433404922485352, + "learning_rate": 3.370448522478661e-05, + "loss": 0.3363, + "step": 11606 + }, + { + "epoch": 0.6371020856201975, + "grad_norm": 2.1077425479888916, + "learning_rate": 3.36995766728174e-05, + "loss": 0.2451, + "step": 11608 + }, + { + "epoch": 0.637211855104281, + "grad_norm": 1.3658759593963623, + "learning_rate": 3.3694667739242066e-05, + "loss": 0.1414, + "step": 11610 + }, + { + "epoch": 0.6373216245883644, + "grad_norm": 1.5754755735397339, + "learning_rate": 3.3689758424275926e-05, + "loss": 0.304, + "step": 11612 + }, + { + "epoch": 0.6374313940724479, + "grad_norm": 1.4128620624542236, + "learning_rate": 3.3684848728134334e-05, + "loss": 0.3153, + "step": 11614 + }, + { + "epoch": 0.6375411635565312, + "grad_norm": 1.3730159997940063, + "learning_rate": 3.367993865103265e-05, + "loss": 0.1889, + "step": 11616 + }, + { + "epoch": 0.6376509330406147, + "grad_norm": 3.3051860332489014, + "learning_rate": 3.367502819318624e-05, + "loss": 0.2762, + "step": 11618 + }, + { + "epoch": 0.6377607025246982, + "grad_norm": 1.1816531419754028, + "learning_rate": 3.367011735481053e-05, + "loss": 0.2839, + "step": 11620 + }, + { + "epoch": 0.6378704720087816, + "grad_norm": 1.3923778533935547, + "learning_rate": 3.3665206136120906e-05, + "loss": 0.3641, + "step": 11622 + }, + { + "epoch": 0.637980241492865, + "grad_norm": 1.6709411144256592, + "learning_rate": 3.36602945373328e-05, + "loss": 0.3059, + "step": 11624 + }, + { + "epoch": 0.6380900109769484, + "grad_norm": 1.6397169828414917, + "learning_rate": 3.3655382558661685e-05, + "loss": 0.2986, + "step": 11626 + }, + { + "epoch": 0.6381997804610319, + "grad_norm": 1.7650777101516724, + "learning_rate": 3.365047020032301e-05, + "loss": 0.1969, + "step": 11628 + }, + { + "epoch": 0.6383095499451152, + "grad_norm": 1.83029305934906, + "learning_rate": 3.3645557462532245e-05, + "loss": 0.3909, + "step": 11630 + }, + { + "epoch": 0.6384193194291987, + "grad_norm": 2.1722888946533203, + "learning_rate": 3.364064434550489e-05, + "loss": 0.2579, + "step": 11632 + }, + { + "epoch": 0.6385290889132821, + "grad_norm": 1.1687514781951904, + "learning_rate": 3.363573084945648e-05, + "loss": 0.3595, + "step": 11634 + }, + { + "epoch": 0.6386388583973656, + "grad_norm": 1.428877592086792, + "learning_rate": 3.363081697460251e-05, + "loss": 0.3526, + "step": 11636 + }, + { + "epoch": 0.6387486278814489, + "grad_norm": 1.50995671749115, + "learning_rate": 3.362590272115855e-05, + "loss": 0.2477, + "step": 11638 + }, + { + "epoch": 0.6388583973655324, + "grad_norm": 1.1258245706558228, + "learning_rate": 3.3620988089340166e-05, + "loss": 0.2505, + "step": 11640 + }, + { + "epoch": 0.6389681668496158, + "grad_norm": 3.0748801231384277, + "learning_rate": 3.3616073079362926e-05, + "loss": 0.3316, + "step": 11642 + }, + { + "epoch": 0.6390779363336992, + "grad_norm": 1.2331076860427856, + "learning_rate": 3.361115769144243e-05, + "loss": 0.3151, + "step": 11644 + }, + { + "epoch": 0.6391877058177826, + "grad_norm": 1.675898790359497, + "learning_rate": 3.3606241925794295e-05, + "loss": 0.3623, + "step": 11646 + }, + { + "epoch": 0.6392974753018661, + "grad_norm": 1.3427362442016602, + "learning_rate": 3.360132578263414e-05, + "loss": 0.2646, + "step": 11648 + }, + { + "epoch": 0.6394072447859495, + "grad_norm": 1.3808821439743042, + "learning_rate": 3.359640926217763e-05, + "loss": 0.2426, + "step": 11650 + }, + { + "epoch": 0.6395170142700329, + "grad_norm": 1.525433897972107, + "learning_rate": 3.359149236464041e-05, + "loss": 0.367, + "step": 11652 + }, + { + "epoch": 0.6396267837541163, + "grad_norm": 1.1643424034118652, + "learning_rate": 3.358657509023815e-05, + "loss": 0.2436, + "step": 11654 + }, + { + "epoch": 0.6397365532381998, + "grad_norm": 2.16276478767395, + "learning_rate": 3.358165743918658e-05, + "loss": 0.2568, + "step": 11656 + }, + { + "epoch": 0.6398463227222833, + "grad_norm": 1.7097097635269165, + "learning_rate": 3.3576739411701394e-05, + "loss": 0.2262, + "step": 11658 + }, + { + "epoch": 0.6399560922063666, + "grad_norm": 3.3355181217193604, + "learning_rate": 3.357182100799831e-05, + "loss": 0.2226, + "step": 11660 + }, + { + "epoch": 0.6400658616904501, + "grad_norm": 1.1257539987564087, + "learning_rate": 3.356690222829309e-05, + "loss": 0.1786, + "step": 11662 + }, + { + "epoch": 0.6401756311745335, + "grad_norm": 1.8450404405593872, + "learning_rate": 3.356198307280149e-05, + "loss": 0.2738, + "step": 11664 + }, + { + "epoch": 0.6402854006586169, + "grad_norm": 1.201321005821228, + "learning_rate": 3.355706354173928e-05, + "loss": 0.2793, + "step": 11666 + }, + { + "epoch": 0.6403951701427003, + "grad_norm": 2.302471160888672, + "learning_rate": 3.355214363532227e-05, + "loss": 0.3607, + "step": 11668 + }, + { + "epoch": 0.6405049396267838, + "grad_norm": 1.7943074703216553, + "learning_rate": 3.354722335376626e-05, + "loss": 0.2012, + "step": 11670 + }, + { + "epoch": 0.6406147091108672, + "grad_norm": 1.832764983177185, + "learning_rate": 3.354230269728709e-05, + "loss": 0.3773, + "step": 11672 + }, + { + "epoch": 0.6407244785949506, + "grad_norm": 1.6969784498214722, + "learning_rate": 3.353738166610058e-05, + "loss": 0.233, + "step": 11674 + }, + { + "epoch": 0.640834248079034, + "grad_norm": 1.0176091194152832, + "learning_rate": 3.353246026042262e-05, + "loss": 0.3645, + "step": 11676 + }, + { + "epoch": 0.6409440175631175, + "grad_norm": 1.3330671787261963, + "learning_rate": 3.352753848046907e-05, + "loss": 0.2972, + "step": 11678 + }, + { + "epoch": 0.6410537870472008, + "grad_norm": 1.500980257987976, + "learning_rate": 3.352261632645582e-05, + "loss": 0.3226, + "step": 11680 + }, + { + "epoch": 0.6411635565312843, + "grad_norm": 1.0524522066116333, + "learning_rate": 3.35176937985988e-05, + "loss": 0.2848, + "step": 11682 + }, + { + "epoch": 0.6412733260153677, + "grad_norm": 1.7566285133361816, + "learning_rate": 3.351277089711391e-05, + "loss": 0.3231, + "step": 11684 + }, + { + "epoch": 0.6413830954994512, + "grad_norm": 1.4748502969741821, + "learning_rate": 3.35078476222171e-05, + "loss": 0.2392, + "step": 11686 + }, + { + "epoch": 0.6414928649835345, + "grad_norm": 2.0149917602539062, + "learning_rate": 3.350292397412435e-05, + "loss": 0.2282, + "step": 11688 + }, + { + "epoch": 0.641602634467618, + "grad_norm": 2.502678871154785, + "learning_rate": 3.349799995305162e-05, + "loss": 0.4036, + "step": 11690 + }, + { + "epoch": 0.6417124039517015, + "grad_norm": 1.5978715419769287, + "learning_rate": 3.349307555921489e-05, + "loss": 0.364, + "step": 11692 + }, + { + "epoch": 0.6418221734357848, + "grad_norm": 1.6894830465316772, + "learning_rate": 3.348815079283018e-05, + "loss": 0.2648, + "step": 11694 + }, + { + "epoch": 0.6419319429198683, + "grad_norm": 1.290704607963562, + "learning_rate": 3.348322565411352e-05, + "loss": 0.3382, + "step": 11696 + }, + { + "epoch": 0.6420417124039517, + "grad_norm": 2.2508020401000977, + "learning_rate": 3.347830014328094e-05, + "loss": 0.1931, + "step": 11698 + }, + { + "epoch": 0.6421514818880352, + "grad_norm": 1.689720869064331, + "learning_rate": 3.3473374260548506e-05, + "loss": 0.2864, + "step": 11700 + }, + { + "epoch": 0.6422612513721185, + "grad_norm": 1.482354998588562, + "learning_rate": 3.346844800613229e-05, + "loss": 0.3734, + "step": 11702 + }, + { + "epoch": 0.642371020856202, + "grad_norm": 1.3590646982192993, + "learning_rate": 3.346352138024837e-05, + "loss": 0.2877, + "step": 11704 + }, + { + "epoch": 0.6424807903402854, + "grad_norm": 1.3837109804153442, + "learning_rate": 3.345859438311287e-05, + "loss": 0.3134, + "step": 11706 + }, + { + "epoch": 0.6425905598243689, + "grad_norm": 1.5703577995300293, + "learning_rate": 3.3453667014941896e-05, + "loss": 0.3235, + "step": 11708 + }, + { + "epoch": 0.6427003293084522, + "grad_norm": 1.2619805335998535, + "learning_rate": 3.3448739275951595e-05, + "loss": 0.373, + "step": 11710 + }, + { + "epoch": 0.6428100987925357, + "grad_norm": 1.4733850955963135, + "learning_rate": 3.344381116635812e-05, + "loss": 0.324, + "step": 11712 + }, + { + "epoch": 0.6429198682766191, + "grad_norm": 1.5762628316879272, + "learning_rate": 3.343888268637765e-05, + "loss": 0.2733, + "step": 11714 + }, + { + "epoch": 0.6430296377607025, + "grad_norm": 1.8440239429473877, + "learning_rate": 3.343395383622635e-05, + "loss": 0.36, + "step": 11716 + }, + { + "epoch": 0.6431394072447859, + "grad_norm": 1.8386157751083374, + "learning_rate": 3.342902461612045e-05, + "loss": 0.4107, + "step": 11718 + }, + { + "epoch": 0.6432491767288694, + "grad_norm": 1.6854636669158936, + "learning_rate": 3.342409502627616e-05, + "loss": 0.2655, + "step": 11720 + }, + { + "epoch": 0.6433589462129528, + "grad_norm": 1.2582271099090576, + "learning_rate": 3.3419165066909705e-05, + "loss": 0.2066, + "step": 11722 + }, + { + "epoch": 0.6434687156970362, + "grad_norm": 1.092259407043457, + "learning_rate": 3.341423473823736e-05, + "loss": 0.234, + "step": 11724 + }, + { + "epoch": 0.6435784851811196, + "grad_norm": 1.9439724683761597, + "learning_rate": 3.340930404047537e-05, + "loss": 0.2902, + "step": 11726 + }, + { + "epoch": 0.6436882546652031, + "grad_norm": 1.198294758796692, + "learning_rate": 3.340437297384003e-05, + "loss": 0.3347, + "step": 11728 + }, + { + "epoch": 0.6437980241492866, + "grad_norm": 1.1714478731155396, + "learning_rate": 3.339944153854764e-05, + "loss": 0.2293, + "step": 11730 + }, + { + "epoch": 0.6439077936333699, + "grad_norm": 1.4239277839660645, + "learning_rate": 3.339450973481452e-05, + "loss": 0.3146, + "step": 11732 + }, + { + "epoch": 0.6440175631174534, + "grad_norm": 3.3550963401794434, + "learning_rate": 3.338957756285699e-05, + "loss": 0.2728, + "step": 11734 + }, + { + "epoch": 0.6441273326015368, + "grad_norm": 2.6609907150268555, + "learning_rate": 3.3384645022891427e-05, + "loss": 0.3578, + "step": 11736 + }, + { + "epoch": 0.6442371020856202, + "grad_norm": 1.6105161905288696, + "learning_rate": 3.337971211513417e-05, + "loss": 0.2473, + "step": 11738 + }, + { + "epoch": 0.6443468715697036, + "grad_norm": 1.945588231086731, + "learning_rate": 3.3374778839801614e-05, + "loss": 0.2607, + "step": 11740 + }, + { + "epoch": 0.6444566410537871, + "grad_norm": 0.9502175450325012, + "learning_rate": 3.336984519711015e-05, + "loss": 0.231, + "step": 11742 + }, + { + "epoch": 0.6445664105378704, + "grad_norm": 1.4391080141067505, + "learning_rate": 3.3364911187276186e-05, + "loss": 0.271, + "step": 11744 + }, + { + "epoch": 0.6446761800219539, + "grad_norm": 4.008232593536377, + "learning_rate": 3.3359976810516164e-05, + "loss": 0.2936, + "step": 11746 + }, + { + "epoch": 0.6447859495060373, + "grad_norm": 1.4646267890930176, + "learning_rate": 3.335504206704653e-05, + "loss": 0.2348, + "step": 11748 + }, + { + "epoch": 0.6448957189901208, + "grad_norm": 1.502054214477539, + "learning_rate": 3.3350106957083744e-05, + "loss": 0.3322, + "step": 11750 + }, + { + "epoch": 0.6450054884742041, + "grad_norm": 1.0134063959121704, + "learning_rate": 3.3345171480844275e-05, + "loss": 0.2861, + "step": 11752 + }, + { + "epoch": 0.6451152579582876, + "grad_norm": 2.1173930168151855, + "learning_rate": 3.334023563854463e-05, + "loss": 0.2316, + "step": 11754 + }, + { + "epoch": 0.645225027442371, + "grad_norm": 1.6073864698410034, + "learning_rate": 3.333529943040131e-05, + "loss": 0.3292, + "step": 11756 + }, + { + "epoch": 0.6453347969264545, + "grad_norm": 1.8040313720703125, + "learning_rate": 3.3330362856630845e-05, + "loss": 0.1949, + "step": 11758 + }, + { + "epoch": 0.6454445664105378, + "grad_norm": 1.6213263273239136, + "learning_rate": 3.332542591744978e-05, + "loss": 0.3142, + "step": 11760 + }, + { + "epoch": 0.6455543358946213, + "grad_norm": 1.904234528541565, + "learning_rate": 3.332048861307467e-05, + "loss": 0.3584, + "step": 11762 + }, + { + "epoch": 0.6456641053787047, + "grad_norm": 1.5057803392410278, + "learning_rate": 3.331555094372208e-05, + "loss": 0.3199, + "step": 11764 + }, + { + "epoch": 0.6457738748627881, + "grad_norm": 1.15034818649292, + "learning_rate": 3.331061290960863e-05, + "loss": 0.2618, + "step": 11766 + }, + { + "epoch": 0.6458836443468716, + "grad_norm": 1.2089356184005737, + "learning_rate": 3.33056745109509e-05, + "loss": 0.2239, + "step": 11768 + }, + { + "epoch": 0.645993413830955, + "grad_norm": 1.8509297370910645, + "learning_rate": 3.3300735747965505e-05, + "loss": 0.2073, + "step": 11770 + }, + { + "epoch": 0.6461031833150385, + "grad_norm": 1.3309451341629028, + "learning_rate": 3.3295796620869116e-05, + "loss": 0.3293, + "step": 11772 + }, + { + "epoch": 0.6462129527991218, + "grad_norm": 1.540007472038269, + "learning_rate": 3.329085712987836e-05, + "loss": 0.2541, + "step": 11774 + }, + { + "epoch": 0.6463227222832053, + "grad_norm": 0.9200062155723572, + "learning_rate": 3.328591727520992e-05, + "loss": 0.2553, + "step": 11776 + }, + { + "epoch": 0.6464324917672887, + "grad_norm": 2.941419839859009, + "learning_rate": 3.328097705708047e-05, + "loss": 0.4303, + "step": 11778 + }, + { + "epoch": 0.6465422612513722, + "grad_norm": 1.8141320943832397, + "learning_rate": 3.327603647570673e-05, + "loss": 0.3081, + "step": 11780 + }, + { + "epoch": 0.6466520307354555, + "grad_norm": 1.1469876766204834, + "learning_rate": 3.327109553130541e-05, + "loss": 0.319, + "step": 11782 + }, + { + "epoch": 0.646761800219539, + "grad_norm": 2.153017282485962, + "learning_rate": 3.3266154224093236e-05, + "loss": 0.4723, + "step": 11784 + }, + { + "epoch": 0.6468715697036224, + "grad_norm": 0.9964450597763062, + "learning_rate": 3.3261212554286975e-05, + "loss": 0.2338, + "step": 11786 + }, + { + "epoch": 0.6469813391877058, + "grad_norm": 3.0789403915405273, + "learning_rate": 3.325627052210337e-05, + "loss": 0.2988, + "step": 11788 + }, + { + "epoch": 0.6470911086717892, + "grad_norm": 1.201096534729004, + "learning_rate": 3.325132812775922e-05, + "loss": 0.276, + "step": 11790 + }, + { + "epoch": 0.6472008781558727, + "grad_norm": 1.5315577983856201, + "learning_rate": 3.324638537147132e-05, + "loss": 0.2767, + "step": 11792 + }, + { + "epoch": 0.647310647639956, + "grad_norm": 1.8808249235153198, + "learning_rate": 3.324144225345649e-05, + "loss": 0.4139, + "step": 11794 + }, + { + "epoch": 0.6474204171240395, + "grad_norm": 1.4033390283584595, + "learning_rate": 3.323649877393154e-05, + "loss": 0.3576, + "step": 11796 + }, + { + "epoch": 0.6475301866081229, + "grad_norm": 1.6677533388137817, + "learning_rate": 3.323155493311334e-05, + "loss": 0.2763, + "step": 11798 + }, + { + "epoch": 0.6476399560922064, + "grad_norm": 1.6054190397262573, + "learning_rate": 3.322661073121872e-05, + "loss": 0.1599, + "step": 11800 + }, + { + "epoch": 0.6477497255762897, + "grad_norm": 1.346048355102539, + "learning_rate": 3.322166616846458e-05, + "loss": 0.1987, + "step": 11802 + }, + { + "epoch": 0.6478594950603732, + "grad_norm": 1.338573694229126, + "learning_rate": 3.321672124506781e-05, + "loss": 0.1635, + "step": 11804 + }, + { + "epoch": 0.6479692645444567, + "grad_norm": 3.157203435897827, + "learning_rate": 3.321177596124532e-05, + "loss": 0.3225, + "step": 11806 + }, + { + "epoch": 0.6480790340285401, + "grad_norm": 1.4769556522369385, + "learning_rate": 3.3206830317214026e-05, + "loss": 0.2666, + "step": 11808 + }, + { + "epoch": 0.6481888035126235, + "grad_norm": 1.211470603942871, + "learning_rate": 3.320188431319088e-05, + "loss": 0.262, + "step": 11810 + }, + { + "epoch": 0.6482985729967069, + "grad_norm": 1.486333966255188, + "learning_rate": 3.319693794939283e-05, + "loss": 0.23, + "step": 11812 + }, + { + "epoch": 0.6484083424807904, + "grad_norm": 1.3994630575180054, + "learning_rate": 3.319199122603683e-05, + "loss": 0.254, + "step": 11814 + }, + { + "epoch": 0.6485181119648737, + "grad_norm": 1.5007655620574951, + "learning_rate": 3.318704414333991e-05, + "loss": 0.2892, + "step": 11816 + }, + { + "epoch": 0.6486278814489572, + "grad_norm": 1.686078667640686, + "learning_rate": 3.318209670151904e-05, + "loss": 0.2601, + "step": 11818 + }, + { + "epoch": 0.6487376509330406, + "grad_norm": 1.5851653814315796, + "learning_rate": 3.317714890079124e-05, + "loss": 0.2195, + "step": 11820 + }, + { + "epoch": 0.6488474204171241, + "grad_norm": 1.150773048400879, + "learning_rate": 3.3172200741373563e-05, + "loss": 0.1974, + "step": 11822 + }, + { + "epoch": 0.6489571899012074, + "grad_norm": 1.4204176664352417, + "learning_rate": 3.316725222348305e-05, + "loss": 0.2398, + "step": 11824 + }, + { + "epoch": 0.6490669593852909, + "grad_norm": 1.7618290185928345, + "learning_rate": 3.3162303347336764e-05, + "loss": 0.3081, + "step": 11826 + }, + { + "epoch": 0.6491767288693743, + "grad_norm": 1.2549690008163452, + "learning_rate": 3.3157354113151794e-05, + "loss": 0.2856, + "step": 11828 + }, + { + "epoch": 0.6492864983534578, + "grad_norm": 1.028354287147522, + "learning_rate": 3.315240452114523e-05, + "loss": 0.2155, + "step": 11830 + }, + { + "epoch": 0.6493962678375411, + "grad_norm": 1.4099552631378174, + "learning_rate": 3.3147454571534195e-05, + "loss": 0.2103, + "step": 11832 + }, + { + "epoch": 0.6495060373216246, + "grad_norm": 1.4947410821914673, + "learning_rate": 3.3142504264535804e-05, + "loss": 0.3276, + "step": 11834 + }, + { + "epoch": 0.649615806805708, + "grad_norm": 3.2990169525146484, + "learning_rate": 3.313755360036721e-05, + "loss": 0.2708, + "step": 11836 + }, + { + "epoch": 0.6497255762897914, + "grad_norm": 1.3023169040679932, + "learning_rate": 3.313260257924558e-05, + "loss": 0.2741, + "step": 11838 + }, + { + "epoch": 0.6498353457738749, + "grad_norm": 1.7752790451049805, + "learning_rate": 3.312765120138809e-05, + "loss": 0.22, + "step": 11840 + }, + { + "epoch": 0.6499451152579583, + "grad_norm": 1.7858359813690186, + "learning_rate": 3.312269946701191e-05, + "loss": 0.2811, + "step": 11842 + }, + { + "epoch": 0.6500548847420418, + "grad_norm": 2.475792169570923, + "learning_rate": 3.311774737633428e-05, + "loss": 0.3016, + "step": 11844 + }, + { + "epoch": 0.6501646542261251, + "grad_norm": 1.4403326511383057, + "learning_rate": 3.311279492957239e-05, + "loss": 0.2087, + "step": 11846 + }, + { + "epoch": 0.6502744237102086, + "grad_norm": 1.0355035066604614, + "learning_rate": 3.31078421269435e-05, + "loss": 0.1871, + "step": 11848 + }, + { + "epoch": 0.650384193194292, + "grad_norm": 1.3963491916656494, + "learning_rate": 3.310288896866486e-05, + "loss": 0.2114, + "step": 11850 + }, + { + "epoch": 0.6504939626783754, + "grad_norm": 2.795419216156006, + "learning_rate": 3.309793545495374e-05, + "loss": 0.3512, + "step": 11852 + }, + { + "epoch": 0.6506037321624588, + "grad_norm": 1.0417053699493408, + "learning_rate": 3.309298158602742e-05, + "loss": 0.2009, + "step": 11854 + }, + { + "epoch": 0.6507135016465423, + "grad_norm": 1.2556755542755127, + "learning_rate": 3.30880273621032e-05, + "loss": 0.256, + "step": 11856 + }, + { + "epoch": 0.6508232711306257, + "grad_norm": 2.07786226272583, + "learning_rate": 3.3083072783398416e-05, + "loss": 0.2862, + "step": 11858 + }, + { + "epoch": 0.6509330406147091, + "grad_norm": 1.0281652212142944, + "learning_rate": 3.307811785013038e-05, + "loss": 0.2038, + "step": 11860 + }, + { + "epoch": 0.6510428100987925, + "grad_norm": 1.5823301076889038, + "learning_rate": 3.307316256251644e-05, + "loss": 0.2494, + "step": 11862 + }, + { + "epoch": 0.651152579582876, + "grad_norm": 0.9518515467643738, + "learning_rate": 3.306820692077397e-05, + "loss": 0.2002, + "step": 11864 + }, + { + "epoch": 0.6512623490669593, + "grad_norm": 1.4231724739074707, + "learning_rate": 3.3063250925120334e-05, + "loss": 0.3227, + "step": 11866 + }, + { + "epoch": 0.6513721185510428, + "grad_norm": 1.564263105392456, + "learning_rate": 3.305829457577295e-05, + "loss": 0.2476, + "step": 11868 + }, + { + "epoch": 0.6514818880351262, + "grad_norm": 1.4120315313339233, + "learning_rate": 3.30533378729492e-05, + "loss": 0.2428, + "step": 11870 + }, + { + "epoch": 0.6515916575192097, + "grad_norm": 2.555974006652832, + "learning_rate": 3.304838081686653e-05, + "loss": 0.2968, + "step": 11872 + }, + { + "epoch": 0.651701427003293, + "grad_norm": 1.7470011711120605, + "learning_rate": 3.3043423407742375e-05, + "loss": 0.1945, + "step": 11874 + }, + { + "epoch": 0.6518111964873765, + "grad_norm": 1.3777915239334106, + "learning_rate": 3.3038465645794185e-05, + "loss": 0.2773, + "step": 11876 + }, + { + "epoch": 0.65192096597146, + "grad_norm": 1.676705002784729, + "learning_rate": 3.303350753123944e-05, + "loss": 0.4165, + "step": 11878 + }, + { + "epoch": 0.6520307354555434, + "grad_norm": 1.1259597539901733, + "learning_rate": 3.3028549064295626e-05, + "loss": 0.1875, + "step": 11880 + }, + { + "epoch": 0.6521405049396268, + "grad_norm": 2.145714044570923, + "learning_rate": 3.302359024518024e-05, + "loss": 0.3004, + "step": 11882 + }, + { + "epoch": 0.6522502744237102, + "grad_norm": 1.7361067533493042, + "learning_rate": 3.30186310741108e-05, + "loss": 0.2091, + "step": 11884 + }, + { + "epoch": 0.6523600439077937, + "grad_norm": 1.317044734954834, + "learning_rate": 3.301367155130485e-05, + "loss": 0.2912, + "step": 11886 + }, + { + "epoch": 0.652469813391877, + "grad_norm": 1.6874970197677612, + "learning_rate": 3.3008711676979934e-05, + "loss": 0.3062, + "step": 11888 + }, + { + "epoch": 0.6525795828759605, + "grad_norm": 1.003161907196045, + "learning_rate": 3.300375145135361e-05, + "loss": 0.2258, + "step": 11890 + }, + { + "epoch": 0.6526893523600439, + "grad_norm": 1.1214746236801147, + "learning_rate": 3.299879087464346e-05, + "loss": 0.4173, + "step": 11892 + }, + { + "epoch": 0.6527991218441274, + "grad_norm": 4.95458984375, + "learning_rate": 3.299382994706709e-05, + "loss": 0.316, + "step": 11894 + }, + { + "epoch": 0.6529088913282107, + "grad_norm": 2.159238815307617, + "learning_rate": 3.2988868668842095e-05, + "loss": 0.3857, + "step": 11896 + }, + { + "epoch": 0.6530186608122942, + "grad_norm": 1.2118914127349854, + "learning_rate": 3.298390704018611e-05, + "loss": 0.1724, + "step": 11898 + }, + { + "epoch": 0.6531284302963776, + "grad_norm": 1.1846765279769897, + "learning_rate": 3.2978945061316776e-05, + "loss": 0.2449, + "step": 11900 + }, + { + "epoch": 0.653238199780461, + "grad_norm": 2.0404717922210693, + "learning_rate": 3.2973982732451755e-05, + "loss": 0.2267, + "step": 11902 + }, + { + "epoch": 0.6533479692645444, + "grad_norm": 1.7932500839233398, + "learning_rate": 3.296902005380871e-05, + "loss": 0.1862, + "step": 11904 + }, + { + "epoch": 0.6534577387486279, + "grad_norm": 1.457886815071106, + "learning_rate": 3.296405702560532e-05, + "loss": 0.2442, + "step": 11906 + }, + { + "epoch": 0.6535675082327113, + "grad_norm": 1.1570322513580322, + "learning_rate": 3.295909364805931e-05, + "loss": 0.2202, + "step": 11908 + }, + { + "epoch": 0.6536772777167947, + "grad_norm": 1.662534236907959, + "learning_rate": 3.295412992138838e-05, + "loss": 0.2225, + "step": 11910 + }, + { + "epoch": 0.6537870472008781, + "grad_norm": 1.655820369720459, + "learning_rate": 3.294916584581027e-05, + "loss": 0.2594, + "step": 11912 + }, + { + "epoch": 0.6538968166849616, + "grad_norm": 1.6712532043457031, + "learning_rate": 3.294420142154274e-05, + "loss": 0.2875, + "step": 11914 + }, + { + "epoch": 0.6540065861690451, + "grad_norm": 1.7068195343017578, + "learning_rate": 3.293923664880354e-05, + "loss": 0.295, + "step": 11916 + }, + { + "epoch": 0.6541163556531284, + "grad_norm": 1.5354621410369873, + "learning_rate": 3.293427152781044e-05, + "loss": 0.2469, + "step": 11918 + }, + { + "epoch": 0.6542261251372119, + "grad_norm": 1.0800312757492065, + "learning_rate": 3.292930605878126e-05, + "loss": 0.224, + "step": 11920 + }, + { + "epoch": 0.6543358946212953, + "grad_norm": 1.0744558572769165, + "learning_rate": 3.29243402419338e-05, + "loss": 0.2794, + "step": 11922 + }, + { + "epoch": 0.6544456641053787, + "grad_norm": 2.262770175933838, + "learning_rate": 3.2919374077485874e-05, + "loss": 0.3545, + "step": 11924 + }, + { + "epoch": 0.6545554335894621, + "grad_norm": 1.3057842254638672, + "learning_rate": 3.291440756565533e-05, + "loss": 0.3514, + "step": 11926 + }, + { + "epoch": 0.6546652030735456, + "grad_norm": 1.2691551446914673, + "learning_rate": 3.290944070666002e-05, + "loss": 0.347, + "step": 11928 + }, + { + "epoch": 0.654774972557629, + "grad_norm": 1.0631778240203857, + "learning_rate": 3.2904473500717824e-05, + "loss": 0.2126, + "step": 11930 + }, + { + "epoch": 0.6548847420417124, + "grad_norm": 1.1675041913986206, + "learning_rate": 3.2899505948046626e-05, + "loss": 0.211, + "step": 11932 + }, + { + "epoch": 0.6549945115257958, + "grad_norm": 1.0253444910049438, + "learning_rate": 3.289453804886433e-05, + "loss": 0.2428, + "step": 11934 + }, + { + "epoch": 0.6551042810098793, + "grad_norm": 1.3142175674438477, + "learning_rate": 3.288956980338883e-05, + "loss": 0.2826, + "step": 11936 + }, + { + "epoch": 0.6552140504939626, + "grad_norm": 1.1419689655303955, + "learning_rate": 3.2884601211838085e-05, + "loss": 0.2215, + "step": 11938 + }, + { + "epoch": 0.6553238199780461, + "grad_norm": 0.982211172580719, + "learning_rate": 3.2879632274430025e-05, + "loss": 0.2518, + "step": 11940 + }, + { + "epoch": 0.6554335894621295, + "grad_norm": 1.0559133291244507, + "learning_rate": 3.287466299138262e-05, + "loss": 0.2304, + "step": 11942 + }, + { + "epoch": 0.655543358946213, + "grad_norm": 0.9702987670898438, + "learning_rate": 3.2869693362913844e-05, + "loss": 0.2828, + "step": 11944 + }, + { + "epoch": 0.6556531284302963, + "grad_norm": 1.3823935985565186, + "learning_rate": 3.28647233892417e-05, + "loss": 0.4323, + "step": 11946 + }, + { + "epoch": 0.6557628979143798, + "grad_norm": 2.1079421043395996, + "learning_rate": 3.2859753070584175e-05, + "loss": 0.2996, + "step": 11948 + }, + { + "epoch": 0.6558726673984633, + "grad_norm": 1.0331621170043945, + "learning_rate": 3.2854782407159305e-05, + "loss": 0.1994, + "step": 11950 + }, + { + "epoch": 0.6559824368825466, + "grad_norm": 1.318634271621704, + "learning_rate": 3.284981139918513e-05, + "loss": 0.2135, + "step": 11952 + }, + { + "epoch": 0.6560922063666301, + "grad_norm": 1.5525981187820435, + "learning_rate": 3.2844840046879686e-05, + "loss": 0.253, + "step": 11954 + }, + { + "epoch": 0.6562019758507135, + "grad_norm": 0.82659512758255, + "learning_rate": 3.2839868350461064e-05, + "loss": 0.3026, + "step": 11956 + }, + { + "epoch": 0.656311745334797, + "grad_norm": 0.8996397256851196, + "learning_rate": 3.2834896310147336e-05, + "loss": 0.2597, + "step": 11958 + }, + { + "epoch": 0.6564215148188803, + "grad_norm": 1.3046307563781738, + "learning_rate": 3.282992392615659e-05, + "loss": 0.2286, + "step": 11960 + }, + { + "epoch": 0.6565312843029638, + "grad_norm": 2.7890114784240723, + "learning_rate": 3.2824951198706954e-05, + "loss": 0.2606, + "step": 11962 + }, + { + "epoch": 0.6566410537870472, + "grad_norm": 1.3106755018234253, + "learning_rate": 3.281997812801656e-05, + "loss": 0.2381, + "step": 11964 + }, + { + "epoch": 0.6567508232711307, + "grad_norm": 2.6189115047454834, + "learning_rate": 3.281500471430353e-05, + "loss": 0.3349, + "step": 11966 + }, + { + "epoch": 0.656860592755214, + "grad_norm": 1.2418079376220703, + "learning_rate": 3.2810030957786044e-05, + "loss": 0.3153, + "step": 11968 + }, + { + "epoch": 0.6569703622392975, + "grad_norm": 2.008423089981079, + "learning_rate": 3.280505685868226e-05, + "loss": 0.3002, + "step": 11970 + }, + { + "epoch": 0.6570801317233809, + "grad_norm": 2.464531898498535, + "learning_rate": 3.2800082417210376e-05, + "loss": 0.3309, + "step": 11972 + }, + { + "epoch": 0.6571899012074643, + "grad_norm": 2.6307573318481445, + "learning_rate": 3.2795107633588586e-05, + "loss": 0.3132, + "step": 11974 + }, + { + "epoch": 0.6572996706915477, + "grad_norm": 2.032191276550293, + "learning_rate": 3.279013250803512e-05, + "loss": 0.231, + "step": 11976 + }, + { + "epoch": 0.6574094401756312, + "grad_norm": 1.68844473361969, + "learning_rate": 3.278515704076821e-05, + "loss": 0.2605, + "step": 11978 + }, + { + "epoch": 0.6575192096597146, + "grad_norm": 1.6436026096343994, + "learning_rate": 3.278018123200609e-05, + "loss": 0.3213, + "step": 11980 + }, + { + "epoch": 0.657628979143798, + "grad_norm": 1.7832777500152588, + "learning_rate": 3.277520508196705e-05, + "loss": 0.3517, + "step": 11982 + }, + { + "epoch": 0.6577387486278814, + "grad_norm": 1.1144706010818481, + "learning_rate": 3.277022859086934e-05, + "loss": 0.3155, + "step": 11984 + }, + { + "epoch": 0.6578485181119649, + "grad_norm": 1.5536766052246094, + "learning_rate": 3.276525175893126e-05, + "loss": 0.3106, + "step": 11986 + }, + { + "epoch": 0.6579582875960484, + "grad_norm": 1.4872207641601562, + "learning_rate": 3.276027458637113e-05, + "loss": 0.3067, + "step": 11988 + }, + { + "epoch": 0.6580680570801317, + "grad_norm": 1.396561861038208, + "learning_rate": 3.275529707340728e-05, + "loss": 0.3403, + "step": 11990 + }, + { + "epoch": 0.6581778265642152, + "grad_norm": 1.33284330368042, + "learning_rate": 3.275031922025801e-05, + "loss": 0.3226, + "step": 11992 + }, + { + "epoch": 0.6582875960482986, + "grad_norm": 5.779906272888184, + "learning_rate": 3.274534102714172e-05, + "loss": 0.408, + "step": 11994 + }, + { + "epoch": 0.658397365532382, + "grad_norm": 0.9427807927131653, + "learning_rate": 3.274036249427675e-05, + "loss": 0.1843, + "step": 11996 + }, + { + "epoch": 0.6585071350164654, + "grad_norm": 1.3038877248764038, + "learning_rate": 3.2735383621881485e-05, + "loss": 0.2559, + "step": 11998 + }, + { + "epoch": 0.6586169045005489, + "grad_norm": 1.355515718460083, + "learning_rate": 3.2730404410174334e-05, + "loss": 0.3594, + "step": 12000 + }, + { + "epoch": 0.6587266739846322, + "grad_norm": 1.567481279373169, + "learning_rate": 3.272542485937369e-05, + "loss": 0.4032, + "step": 12002 + }, + { + "epoch": 0.6588364434687157, + "grad_norm": 1.997698426246643, + "learning_rate": 3.2720444969697995e-05, + "loss": 0.4207, + "step": 12004 + }, + { + "epoch": 0.6589462129527991, + "grad_norm": 2.009182929992676, + "learning_rate": 3.27154647413657e-05, + "loss": 0.2534, + "step": 12006 + }, + { + "epoch": 0.6590559824368826, + "grad_norm": 1.4970088005065918, + "learning_rate": 3.271048417459524e-05, + "loss": 0.3322, + "step": 12008 + }, + { + "epoch": 0.6591657519209659, + "grad_norm": 1.1610870361328125, + "learning_rate": 3.270550326960511e-05, + "loss": 0.1715, + "step": 12010 + }, + { + "epoch": 0.6592755214050494, + "grad_norm": 1.861795425415039, + "learning_rate": 3.2700522026613785e-05, + "loss": 0.2516, + "step": 12012 + }, + { + "epoch": 0.6593852908891328, + "grad_norm": 0.9994825720787048, + "learning_rate": 3.2695540445839764e-05, + "loss": 0.2431, + "step": 12014 + }, + { + "epoch": 0.6594950603732163, + "grad_norm": 1.2535507678985596, + "learning_rate": 3.269055852750156e-05, + "loss": 0.3379, + "step": 12016 + }, + { + "epoch": 0.6596048298572996, + "grad_norm": 1.393520474433899, + "learning_rate": 3.2685576271817716e-05, + "loss": 0.3537, + "step": 12018 + }, + { + "epoch": 0.6597145993413831, + "grad_norm": 1.0464879274368286, + "learning_rate": 3.268059367900678e-05, + "loss": 0.3001, + "step": 12020 + }, + { + "epoch": 0.6598243688254665, + "grad_norm": 1.0609928369522095, + "learning_rate": 3.26756107492873e-05, + "loss": 0.2156, + "step": 12022 + }, + { + "epoch": 0.65993413830955, + "grad_norm": 1.2798807621002197, + "learning_rate": 3.267062748287786e-05, + "loss": 0.2094, + "step": 12024 + }, + { + "epoch": 0.6600439077936334, + "grad_norm": 1.552375316619873, + "learning_rate": 3.2665643879997056e-05, + "loss": 0.3033, + "step": 12026 + }, + { + "epoch": 0.6601536772777168, + "grad_norm": 1.4222116470336914, + "learning_rate": 3.2660659940863474e-05, + "loss": 0.2035, + "step": 12028 + }, + { + "epoch": 0.6602634467618003, + "grad_norm": 1.1940289735794067, + "learning_rate": 3.2655675665695754e-05, + "loss": 0.2011, + "step": 12030 + }, + { + "epoch": 0.6603732162458836, + "grad_norm": 1.6953003406524658, + "learning_rate": 3.2650691054712526e-05, + "loss": 0.3335, + "step": 12032 + }, + { + "epoch": 0.6604829857299671, + "grad_norm": 1.6401591300964355, + "learning_rate": 3.2645706108132424e-05, + "loss": 0.3615, + "step": 12034 + }, + { + "epoch": 0.6605927552140505, + "grad_norm": 2.34342098236084, + "learning_rate": 3.264072082617413e-05, + "loss": 0.2451, + "step": 12036 + }, + { + "epoch": 0.660702524698134, + "grad_norm": 2.0744311809539795, + "learning_rate": 3.263573520905633e-05, + "loss": 0.188, + "step": 12038 + }, + { + "epoch": 0.6608122941822173, + "grad_norm": 1.6672863960266113, + "learning_rate": 3.263074925699769e-05, + "loss": 0.3527, + "step": 12040 + }, + { + "epoch": 0.6609220636663008, + "grad_norm": 1.6401288509368896, + "learning_rate": 3.262576297021695e-05, + "loss": 0.3229, + "step": 12042 + }, + { + "epoch": 0.6610318331503842, + "grad_norm": 2.4241156578063965, + "learning_rate": 3.26207763489328e-05, + "loss": 0.3541, + "step": 12044 + }, + { + "epoch": 0.6611416026344676, + "grad_norm": 3.725498914718628, + "learning_rate": 3.2615789393363995e-05, + "loss": 0.2077, + "step": 12046 + }, + { + "epoch": 0.661251372118551, + "grad_norm": 1.2687137126922607, + "learning_rate": 3.261080210372929e-05, + "loss": 0.2762, + "step": 12048 + }, + { + "epoch": 0.6613611416026345, + "grad_norm": 1.912251353263855, + "learning_rate": 3.260581448024745e-05, + "loss": 0.1826, + "step": 12050 + }, + { + "epoch": 0.6614709110867178, + "grad_norm": 2.4224741458892822, + "learning_rate": 3.260082652313726e-05, + "loss": 0.3145, + "step": 12052 + }, + { + "epoch": 0.6615806805708013, + "grad_norm": 5.888850688934326, + "learning_rate": 3.25958382326175e-05, + "loss": 0.2384, + "step": 12054 + }, + { + "epoch": 0.6616904500548847, + "grad_norm": 1.150119423866272, + "learning_rate": 3.259084960890701e-05, + "loss": 0.2509, + "step": 12056 + }, + { + "epoch": 0.6618002195389682, + "grad_norm": 2.2046241760253906, + "learning_rate": 3.2585860652224585e-05, + "loss": 0.1635, + "step": 12058 + }, + { + "epoch": 0.6619099890230515, + "grad_norm": 1.6713124513626099, + "learning_rate": 3.258087136278908e-05, + "loss": 0.2699, + "step": 12060 + }, + { + "epoch": 0.662019758507135, + "grad_norm": 1.5831544399261475, + "learning_rate": 3.2575881740819355e-05, + "loss": 0.2894, + "step": 12062 + }, + { + "epoch": 0.6621295279912185, + "grad_norm": 1.386809229850769, + "learning_rate": 3.257089178653426e-05, + "loss": 0.4065, + "step": 12064 + }, + { + "epoch": 0.6622392974753019, + "grad_norm": 1.6598252058029175, + "learning_rate": 3.25659015001527e-05, + "loss": 0.3166, + "step": 12066 + }, + { + "epoch": 0.6623490669593853, + "grad_norm": 2.4867560863494873, + "learning_rate": 3.256091088189357e-05, + "loss": 0.2224, + "step": 12068 + }, + { + "epoch": 0.6624588364434687, + "grad_norm": 1.9412992000579834, + "learning_rate": 3.2555919931975766e-05, + "loss": 0.2166, + "step": 12070 + }, + { + "epoch": 0.6625686059275522, + "grad_norm": 1.8707467317581177, + "learning_rate": 3.255092865061823e-05, + "loss": 0.3166, + "step": 12072 + }, + { + "epoch": 0.6626783754116355, + "grad_norm": 1.4927692413330078, + "learning_rate": 3.25459370380399e-05, + "loss": 0.2487, + "step": 12074 + }, + { + "epoch": 0.662788144895719, + "grad_norm": 1.199230670928955, + "learning_rate": 3.254094509445974e-05, + "loss": 0.3032, + "step": 12076 + }, + { + "epoch": 0.6628979143798024, + "grad_norm": 1.844226598739624, + "learning_rate": 3.253595282009671e-05, + "loss": 0.313, + "step": 12078 + }, + { + "epoch": 0.6630076838638859, + "grad_norm": 6.666082859039307, + "learning_rate": 3.2530960215169795e-05, + "loss": 0.3574, + "step": 12080 + }, + { + "epoch": 0.6631174533479692, + "grad_norm": 1.8307101726531982, + "learning_rate": 3.2525967279898015e-05, + "loss": 0.3658, + "step": 12082 + }, + { + "epoch": 0.6632272228320527, + "grad_norm": 1.029599666595459, + "learning_rate": 3.252097401450036e-05, + "loss": 0.2654, + "step": 12084 + }, + { + "epoch": 0.6633369923161361, + "grad_norm": 1.1237937211990356, + "learning_rate": 3.251598041919587e-05, + "loss": 0.2667, + "step": 12086 + }, + { + "epoch": 0.6634467618002196, + "grad_norm": 1.1701273918151855, + "learning_rate": 3.25109864942036e-05, + "loss": 0.2115, + "step": 12088 + }, + { + "epoch": 0.6635565312843029, + "grad_norm": 1.5718259811401367, + "learning_rate": 3.250599223974258e-05, + "loss": 0.2684, + "step": 12090 + }, + { + "epoch": 0.6636663007683864, + "grad_norm": 1.2330659627914429, + "learning_rate": 3.250099765603191e-05, + "loss": 0.3441, + "step": 12092 + }, + { + "epoch": 0.6637760702524698, + "grad_norm": 1.9853899478912354, + "learning_rate": 3.249600274329066e-05, + "loss": 0.3505, + "step": 12094 + }, + { + "epoch": 0.6638858397365532, + "grad_norm": 1.0299257040023804, + "learning_rate": 3.249100750173794e-05, + "loss": 0.2409, + "step": 12096 + }, + { + "epoch": 0.6639956092206367, + "grad_norm": 1.244553565979004, + "learning_rate": 3.248601193159287e-05, + "loss": 0.2304, + "step": 12098 + }, + { + "epoch": 0.6641053787047201, + "grad_norm": 1.2400388717651367, + "learning_rate": 3.2481016033074556e-05, + "loss": 0.3143, + "step": 12100 + }, + { + "epoch": 0.6642151481888036, + "grad_norm": 1.3702125549316406, + "learning_rate": 3.247601980640217e-05, + "loss": 0.2654, + "step": 12102 + }, + { + "epoch": 0.6643249176728869, + "grad_norm": 2.337545394897461, + "learning_rate": 3.2471023251794866e-05, + "loss": 0.3618, + "step": 12104 + }, + { + "epoch": 0.6644346871569704, + "grad_norm": 1.557511806488037, + "learning_rate": 3.24660263694718e-05, + "loss": 0.3539, + "step": 12106 + }, + { + "epoch": 0.6645444566410538, + "grad_norm": 1.987335443496704, + "learning_rate": 3.246102915965217e-05, + "loss": 0.2147, + "step": 12108 + }, + { + "epoch": 0.6646542261251372, + "grad_norm": 1.5255850553512573, + "learning_rate": 3.2456031622555197e-05, + "loss": 0.4002, + "step": 12110 + }, + { + "epoch": 0.6647639956092206, + "grad_norm": 1.0744751691818237, + "learning_rate": 3.245103375840007e-05, + "loss": 0.2264, + "step": 12112 + }, + { + "epoch": 0.6648737650933041, + "grad_norm": 1.4024606943130493, + "learning_rate": 3.244603556740603e-05, + "loss": 0.2168, + "step": 12114 + }, + { + "epoch": 0.6649835345773875, + "grad_norm": 1.9742785692214966, + "learning_rate": 3.244103704979233e-05, + "loss": 0.1572, + "step": 12116 + }, + { + "epoch": 0.6650933040614709, + "grad_norm": 1.1614954471588135, + "learning_rate": 3.243603820577822e-05, + "loss": 0.2485, + "step": 12118 + }, + { + "epoch": 0.6652030735455543, + "grad_norm": 2.794738292694092, + "learning_rate": 3.243103903558297e-05, + "loss": 0.3135, + "step": 12120 + }, + { + "epoch": 0.6653128430296378, + "grad_norm": 1.0305708646774292, + "learning_rate": 3.2426039539425876e-05, + "loss": 0.2804, + "step": 12122 + }, + { + "epoch": 0.6654226125137211, + "grad_norm": 0.8131890892982483, + "learning_rate": 3.2421039717526245e-05, + "loss": 0.264, + "step": 12124 + }, + { + "epoch": 0.6655323819978046, + "grad_norm": 1.1870479583740234, + "learning_rate": 3.2416039570103375e-05, + "loss": 0.2618, + "step": 12126 + }, + { + "epoch": 0.665642151481888, + "grad_norm": 1.5457820892333984, + "learning_rate": 3.2411039097376614e-05, + "loss": 0.3914, + "step": 12128 + }, + { + "epoch": 0.6657519209659715, + "grad_norm": 1.1542904376983643, + "learning_rate": 3.240603829956531e-05, + "loss": 0.1839, + "step": 12130 + }, + { + "epoch": 0.6658616904500548, + "grad_norm": 2.8820314407348633, + "learning_rate": 3.240103717688881e-05, + "loss": 0.2694, + "step": 12132 + }, + { + "epoch": 0.6659714599341383, + "grad_norm": 1.8136587142944336, + "learning_rate": 3.23960357295665e-05, + "loss": 0.2641, + "step": 12134 + }, + { + "epoch": 0.6660812294182218, + "grad_norm": 1.389190435409546, + "learning_rate": 3.2391033957817754e-05, + "loss": 0.2388, + "step": 12136 + }, + { + "epoch": 0.6661909989023052, + "grad_norm": 3.726317882537842, + "learning_rate": 3.2386031861861976e-05, + "loss": 0.3667, + "step": 12138 + }, + { + "epoch": 0.6663007683863886, + "grad_norm": 1.5580041408538818, + "learning_rate": 3.2381029441918596e-05, + "loss": 0.2327, + "step": 12140 + }, + { + "epoch": 0.666410537870472, + "grad_norm": 1.5278750658035278, + "learning_rate": 3.237602669820704e-05, + "loss": 0.1968, + "step": 12142 + }, + { + "epoch": 0.6665203073545555, + "grad_norm": 1.3839216232299805, + "learning_rate": 3.237102363094674e-05, + "loss": 0.3603, + "step": 12144 + }, + { + "epoch": 0.6666300768386388, + "grad_norm": 1.4692914485931396, + "learning_rate": 3.236602024035716e-05, + "loss": 0.2252, + "step": 12146 + }, + { + "epoch": 0.6667398463227223, + "grad_norm": 1.6762038469314575, + "learning_rate": 3.236101652665779e-05, + "loss": 0.2572, + "step": 12148 + }, + { + "epoch": 0.6668496158068057, + "grad_norm": 2.0755069255828857, + "learning_rate": 3.23560124900681e-05, + "loss": 0.2453, + "step": 12150 + }, + { + "epoch": 0.6669593852908892, + "grad_norm": 0.889755368232727, + "learning_rate": 3.23510081308076e-05, + "loss": 0.2243, + "step": 12152 + }, + { + "epoch": 0.6670691547749725, + "grad_norm": 2.3446319103240967, + "learning_rate": 3.2346003449095805e-05, + "loss": 0.3125, + "step": 12154 + }, + { + "epoch": 0.667178924259056, + "grad_norm": 1.218403697013855, + "learning_rate": 3.234099844515224e-05, + "loss": 0.3225, + "step": 12156 + }, + { + "epoch": 0.6672886937431394, + "grad_norm": 1.8036084175109863, + "learning_rate": 3.233599311919644e-05, + "loss": 0.2325, + "step": 12158 + }, + { + "epoch": 0.6673984632272228, + "grad_norm": 1.4457966089248657, + "learning_rate": 3.2330987471447985e-05, + "loss": 0.2802, + "step": 12160 + }, + { + "epoch": 0.6675082327113062, + "grad_norm": 2.038309335708618, + "learning_rate": 3.2325981502126433e-05, + "loss": 0.3231, + "step": 12162 + }, + { + "epoch": 0.6676180021953897, + "grad_norm": 1.6679043769836426, + "learning_rate": 3.232097521145138e-05, + "loss": 0.343, + "step": 12164 + }, + { + "epoch": 0.6677277716794731, + "grad_norm": 1.7438416481018066, + "learning_rate": 3.231596859964242e-05, + "loss": 0.3395, + "step": 12166 + }, + { + "epoch": 0.6678375411635565, + "grad_norm": 1.2831321954727173, + "learning_rate": 3.2310961666919166e-05, + "loss": 0.2332, + "step": 12168 + }, + { + "epoch": 0.6679473106476399, + "grad_norm": 1.5001360177993774, + "learning_rate": 3.230595441350125e-05, + "loss": 0.3021, + "step": 12170 + }, + { + "epoch": 0.6680570801317234, + "grad_norm": 2.7892985343933105, + "learning_rate": 3.230094683960832e-05, + "loss": 0.2101, + "step": 12172 + }, + { + "epoch": 0.6681668496158069, + "grad_norm": 2.4207403659820557, + "learning_rate": 3.229593894546001e-05, + "loss": 0.2361, + "step": 12174 + }, + { + "epoch": 0.6682766190998902, + "grad_norm": 1.2264652252197266, + "learning_rate": 3.229093073127602e-05, + "loss": 0.2437, + "step": 12176 + }, + { + "epoch": 0.6683863885839737, + "grad_norm": 1.4514801502227783, + "learning_rate": 3.228592219727602e-05, + "loss": 0.3513, + "step": 12178 + }, + { + "epoch": 0.6684961580680571, + "grad_norm": 1.7612237930297852, + "learning_rate": 3.22809133436797e-05, + "loss": 0.4287, + "step": 12180 + }, + { + "epoch": 0.6686059275521405, + "grad_norm": 1.3772971630096436, + "learning_rate": 3.2275904170706797e-05, + "loss": 0.2818, + "step": 12182 + }, + { + "epoch": 0.6687156970362239, + "grad_norm": 1.5987961292266846, + "learning_rate": 3.227089467857703e-05, + "loss": 0.2206, + "step": 12184 + }, + { + "epoch": 0.6688254665203074, + "grad_norm": 1.6105568408966064, + "learning_rate": 3.226588486751012e-05, + "loss": 0.2453, + "step": 12186 + }, + { + "epoch": 0.6689352360043908, + "grad_norm": 1.2863134145736694, + "learning_rate": 3.226087473772584e-05, + "loss": 0.156, + "step": 12188 + }, + { + "epoch": 0.6690450054884742, + "grad_norm": 1.1565845012664795, + "learning_rate": 3.225586428944396e-05, + "loss": 0.2059, + "step": 12190 + }, + { + "epoch": 0.6691547749725576, + "grad_norm": 1.5686748027801514, + "learning_rate": 3.225085352288426e-05, + "loss": 0.2837, + "step": 12192 + }, + { + "epoch": 0.6692645444566411, + "grad_norm": 1.704770565032959, + "learning_rate": 3.2245842438266526e-05, + "loss": 0.2643, + "step": 12194 + }, + { + "epoch": 0.6693743139407244, + "grad_norm": 3.4745705127716064, + "learning_rate": 3.224083103581059e-05, + "loss": 0.2618, + "step": 12196 + }, + { + "epoch": 0.6694840834248079, + "grad_norm": 1.6659256219863892, + "learning_rate": 3.223581931573625e-05, + "loss": 0.2038, + "step": 12198 + }, + { + "epoch": 0.6695938529088913, + "grad_norm": 1.8124796152114868, + "learning_rate": 3.223080727826337e-05, + "loss": 0.3318, + "step": 12200 + }, + { + "epoch": 0.6697036223929748, + "grad_norm": 2.5248069763183594, + "learning_rate": 3.222579492361179e-05, + "loss": 0.3271, + "step": 12202 + }, + { + "epoch": 0.6698133918770581, + "grad_norm": 1.8934838771820068, + "learning_rate": 3.222078225200138e-05, + "loss": 0.2144, + "step": 12204 + }, + { + "epoch": 0.6699231613611416, + "grad_norm": 1.3969297409057617, + "learning_rate": 3.221576926365202e-05, + "loss": 0.2656, + "step": 12206 + }, + { + "epoch": 0.670032930845225, + "grad_norm": 1.80121910572052, + "learning_rate": 3.22107559587836e-05, + "loss": 0.293, + "step": 12208 + }, + { + "epoch": 0.6701427003293084, + "grad_norm": 1.8200098276138306, + "learning_rate": 3.220574233761603e-05, + "loss": 0.3924, + "step": 12210 + }, + { + "epoch": 0.6702524698133919, + "grad_norm": 1.813820242881775, + "learning_rate": 3.220072840036923e-05, + "loss": 0.3701, + "step": 12212 + }, + { + "epoch": 0.6703622392974753, + "grad_norm": 1.4606448411941528, + "learning_rate": 3.219571414726315e-05, + "loss": 0.2592, + "step": 12214 + }, + { + "epoch": 0.6704720087815588, + "grad_norm": 1.673535943031311, + "learning_rate": 3.219069957851772e-05, + "loss": 0.2725, + "step": 12216 + }, + { + "epoch": 0.6705817782656421, + "grad_norm": 1.140576720237732, + "learning_rate": 3.2185684694352916e-05, + "loss": 0.2427, + "step": 12218 + }, + { + "epoch": 0.6706915477497256, + "grad_norm": 1.3584790229797363, + "learning_rate": 3.218066949498871e-05, + "loss": 0.1768, + "step": 12220 + }, + { + "epoch": 0.670801317233809, + "grad_norm": 1.0269495248794556, + "learning_rate": 3.217565398064509e-05, + "loss": 0.1571, + "step": 12222 + }, + { + "epoch": 0.6709110867178925, + "grad_norm": 2.5971322059631348, + "learning_rate": 3.217063815154208e-05, + "loss": 0.2871, + "step": 12224 + }, + { + "epoch": 0.6710208562019758, + "grad_norm": 4.408097743988037, + "learning_rate": 3.2165622007899676e-05, + "loss": 0.2758, + "step": 12226 + }, + { + "epoch": 0.6711306256860593, + "grad_norm": 1.1442826986312866, + "learning_rate": 3.2160605549937915e-05, + "loss": 0.27, + "step": 12228 + }, + { + "epoch": 0.6712403951701427, + "grad_norm": 1.3526978492736816, + "learning_rate": 3.2155588777876856e-05, + "loss": 0.3465, + "step": 12230 + }, + { + "epoch": 0.6713501646542261, + "grad_norm": 1.169558048248291, + "learning_rate": 3.215057169193655e-05, + "loss": 0.2702, + "step": 12232 + }, + { + "epoch": 0.6714599341383095, + "grad_norm": 2.1468238830566406, + "learning_rate": 3.214555429233707e-05, + "loss": 0.2504, + "step": 12234 + }, + { + "epoch": 0.671569703622393, + "grad_norm": 1.3334753513336182, + "learning_rate": 3.214053657929851e-05, + "loss": 0.3042, + "step": 12236 + }, + { + "epoch": 0.6716794731064764, + "grad_norm": 1.4293367862701416, + "learning_rate": 3.2135518553040964e-05, + "loss": 0.2515, + "step": 12238 + }, + { + "epoch": 0.6717892425905598, + "grad_norm": 1.0835713148117065, + "learning_rate": 3.2130500213784557e-05, + "loss": 0.3131, + "step": 12240 + }, + { + "epoch": 0.6718990120746432, + "grad_norm": 2.5214972496032715, + "learning_rate": 3.21254815617494e-05, + "loss": 0.1509, + "step": 12242 + }, + { + "epoch": 0.6720087815587267, + "grad_norm": 2.7417445182800293, + "learning_rate": 3.212046259715566e-05, + "loss": 0.1852, + "step": 12244 + }, + { + "epoch": 0.6721185510428102, + "grad_norm": 1.2745304107666016, + "learning_rate": 3.211544332022348e-05, + "loss": 0.306, + "step": 12246 + }, + { + "epoch": 0.6722283205268935, + "grad_norm": 2.3561742305755615, + "learning_rate": 3.211042373117302e-05, + "loss": 0.2788, + "step": 12248 + }, + { + "epoch": 0.672338090010977, + "grad_norm": 1.5182366371154785, + "learning_rate": 3.210540383022449e-05, + "loss": 0.2603, + "step": 12250 + }, + { + "epoch": 0.6724478594950604, + "grad_norm": 1.107342004776001, + "learning_rate": 3.210038361759807e-05, + "loss": 0.2409, + "step": 12252 + }, + { + "epoch": 0.6725576289791438, + "grad_norm": 2.5495612621307373, + "learning_rate": 3.209536309351397e-05, + "loss": 0.3122, + "step": 12254 + }, + { + "epoch": 0.6726673984632272, + "grad_norm": 2.1378543376922607, + "learning_rate": 3.2090342258192415e-05, + "loss": 0.2483, + "step": 12256 + }, + { + "epoch": 0.6727771679473107, + "grad_norm": 1.2653981447219849, + "learning_rate": 3.208532111185365e-05, + "loss": 0.1346, + "step": 12258 + }, + { + "epoch": 0.672886937431394, + "grad_norm": 2.339092493057251, + "learning_rate": 3.208029965471793e-05, + "loss": 0.3141, + "step": 12260 + }, + { + "epoch": 0.6729967069154775, + "grad_norm": 1.6265015602111816, + "learning_rate": 3.207527788700551e-05, + "loss": 0.4555, + "step": 12262 + }, + { + "epoch": 0.6731064763995609, + "grad_norm": 2.909217596054077, + "learning_rate": 3.207025580893667e-05, + "loss": 0.415, + "step": 12264 + }, + { + "epoch": 0.6732162458836444, + "grad_norm": 1.7911242246627808, + "learning_rate": 3.206523342073172e-05, + "loss": 0.2516, + "step": 12266 + }, + { + "epoch": 0.6733260153677277, + "grad_norm": 2.206916332244873, + "learning_rate": 3.206021072261094e-05, + "loss": 0.2537, + "step": 12268 + }, + { + "epoch": 0.6734357848518112, + "grad_norm": 3.1253321170806885, + "learning_rate": 3.2055187714794674e-05, + "loss": 0.2626, + "step": 12270 + }, + { + "epoch": 0.6735455543358946, + "grad_norm": 1.6329541206359863, + "learning_rate": 3.205016439750323e-05, + "loss": 0.3269, + "step": 12272 + }, + { + "epoch": 0.6736553238199781, + "grad_norm": 1.6693825721740723, + "learning_rate": 3.204514077095699e-05, + "loss": 0.3008, + "step": 12274 + }, + { + "epoch": 0.6737650933040614, + "grad_norm": 1.3737614154815674, + "learning_rate": 3.2040116835376285e-05, + "loss": 0.2476, + "step": 12276 + }, + { + "epoch": 0.6738748627881449, + "grad_norm": 1.7734318971633911, + "learning_rate": 3.2035092590981514e-05, + "loss": 0.2592, + "step": 12278 + }, + { + "epoch": 0.6739846322722283, + "grad_norm": 1.243033528327942, + "learning_rate": 3.2030068037993035e-05, + "loss": 0.2365, + "step": 12280 + }, + { + "epoch": 0.6740944017563117, + "grad_norm": 0.9969084858894348, + "learning_rate": 3.202504317663128e-05, + "loss": 0.2042, + "step": 12282 + }, + { + "epoch": 0.6742041712403952, + "grad_norm": 1.1582841873168945, + "learning_rate": 3.2020018007116646e-05, + "loss": 0.225, + "step": 12284 + }, + { + "epoch": 0.6743139407244786, + "grad_norm": 1.0581419467926025, + "learning_rate": 3.2014992529669566e-05, + "loss": 0.2412, + "step": 12286 + }, + { + "epoch": 0.6744237102085621, + "grad_norm": 1.808688759803772, + "learning_rate": 3.200996674451047e-05, + "loss": 0.2623, + "step": 12288 + }, + { + "epoch": 0.6745334796926454, + "grad_norm": 2.2062196731567383, + "learning_rate": 3.2004940651859844e-05, + "loss": 0.3744, + "step": 12290 + }, + { + "epoch": 0.6746432491767289, + "grad_norm": 1.8185371160507202, + "learning_rate": 3.199991425193812e-05, + "loss": 0.2159, + "step": 12292 + }, + { + "epoch": 0.6747530186608123, + "grad_norm": 1.4943993091583252, + "learning_rate": 3.199488754496582e-05, + "loss": 0.2557, + "step": 12294 + }, + { + "epoch": 0.6748627881448958, + "grad_norm": 1.8594516515731812, + "learning_rate": 3.1989860531163405e-05, + "loss": 0.2064, + "step": 12296 + }, + { + "epoch": 0.6749725576289791, + "grad_norm": 2.047466993331909, + "learning_rate": 3.198483321075141e-05, + "loss": 0.3934, + "step": 12298 + }, + { + "epoch": 0.6750823271130626, + "grad_norm": 1.5927177667617798, + "learning_rate": 3.197980558395034e-05, + "loss": 0.2439, + "step": 12300 + }, + { + "epoch": 0.675192096597146, + "grad_norm": 1.1651890277862549, + "learning_rate": 3.1974777650980735e-05, + "loss": 0.2977, + "step": 12302 + }, + { + "epoch": 0.6753018660812294, + "grad_norm": 2.56992244720459, + "learning_rate": 3.1969749412063145e-05, + "loss": 0.3693, + "step": 12304 + }, + { + "epoch": 0.6754116355653128, + "grad_norm": 1.0747343301773071, + "learning_rate": 3.196472086741815e-05, + "loss": 0.3114, + "step": 12306 + }, + { + "epoch": 0.6755214050493963, + "grad_norm": 1.7801055908203125, + "learning_rate": 3.195969201726631e-05, + "loss": 0.1632, + "step": 12308 + }, + { + "epoch": 0.6756311745334797, + "grad_norm": 2.749688148498535, + "learning_rate": 3.1954662861828204e-05, + "loss": 0.2921, + "step": 12310 + }, + { + "epoch": 0.6757409440175631, + "grad_norm": 1.4329420328140259, + "learning_rate": 3.194963340132446e-05, + "loss": 0.3026, + "step": 12312 + }, + { + "epoch": 0.6758507135016465, + "grad_norm": 1.5482451915740967, + "learning_rate": 3.194460363597569e-05, + "loss": 0.3548, + "step": 12314 + }, + { + "epoch": 0.67596048298573, + "grad_norm": 1.3073841333389282, + "learning_rate": 3.193957356600251e-05, + "loss": 0.3672, + "step": 12316 + }, + { + "epoch": 0.6760702524698133, + "grad_norm": 1.1917685270309448, + "learning_rate": 3.193454319162557e-05, + "loss": 0.2103, + "step": 12318 + }, + { + "epoch": 0.6761800219538968, + "grad_norm": 1.3077259063720703, + "learning_rate": 3.192951251306553e-05, + "loss": 0.3062, + "step": 12320 + }, + { + "epoch": 0.6762897914379803, + "grad_norm": 1.2011336088180542, + "learning_rate": 3.192448153054306e-05, + "loss": 0.266, + "step": 12322 + }, + { + "epoch": 0.6763995609220637, + "grad_norm": 1.361814022064209, + "learning_rate": 3.191945024427885e-05, + "loss": 0.3503, + "step": 12324 + }, + { + "epoch": 0.6765093304061471, + "grad_norm": 1.145397424697876, + "learning_rate": 3.1914418654493586e-05, + "loss": 0.3665, + "step": 12326 + }, + { + "epoch": 0.6766190998902305, + "grad_norm": 1.2479585409164429, + "learning_rate": 3.190938676140797e-05, + "loss": 0.2995, + "step": 12328 + }, + { + "epoch": 0.676728869374314, + "grad_norm": 1.4802768230438232, + "learning_rate": 3.190435456524275e-05, + "loss": 0.1805, + "step": 12330 + }, + { + "epoch": 0.6768386388583973, + "grad_norm": 1.4607123136520386, + "learning_rate": 3.189932206621865e-05, + "loss": 0.2683, + "step": 12332 + }, + { + "epoch": 0.6769484083424808, + "grad_norm": 1.240898847579956, + "learning_rate": 3.1894289264556417e-05, + "loss": 0.3072, + "step": 12334 + }, + { + "epoch": 0.6770581778265642, + "grad_norm": 1.6153391599655151, + "learning_rate": 3.188925616047681e-05, + "loss": 0.3443, + "step": 12336 + }, + { + "epoch": 0.6771679473106477, + "grad_norm": 1.0849980115890503, + "learning_rate": 3.1884222754200625e-05, + "loss": 0.2016, + "step": 12338 + }, + { + "epoch": 0.677277716794731, + "grad_norm": 1.2328187227249146, + "learning_rate": 3.187918904594863e-05, + "loss": 0.2204, + "step": 12340 + }, + { + "epoch": 0.6773874862788145, + "grad_norm": 1.609490156173706, + "learning_rate": 3.187415503594166e-05, + "loss": 0.245, + "step": 12342 + }, + { + "epoch": 0.6774972557628979, + "grad_norm": 1.584892749786377, + "learning_rate": 3.186912072440049e-05, + "loss": 0.2385, + "step": 12344 + }, + { + "epoch": 0.6776070252469814, + "grad_norm": 1.4547443389892578, + "learning_rate": 3.186408611154597e-05, + "loss": 0.2814, + "step": 12346 + }, + { + "epoch": 0.6777167947310647, + "grad_norm": 1.0364515781402588, + "learning_rate": 3.185905119759895e-05, + "loss": 0.2116, + "step": 12348 + }, + { + "epoch": 0.6778265642151482, + "grad_norm": 1.8307671546936035, + "learning_rate": 3.1854015982780275e-05, + "loss": 0.2658, + "step": 12350 + }, + { + "epoch": 0.6779363336992316, + "grad_norm": 2.017564535140991, + "learning_rate": 3.184898046731082e-05, + "loss": 0.232, + "step": 12352 + }, + { + "epoch": 0.678046103183315, + "grad_norm": 1.9247591495513916, + "learning_rate": 3.1843944651411456e-05, + "loss": 0.2501, + "step": 12354 + }, + { + "epoch": 0.6781558726673985, + "grad_norm": 1.2855802774429321, + "learning_rate": 3.18389085353031e-05, + "loss": 0.2158, + "step": 12356 + }, + { + "epoch": 0.6782656421514819, + "grad_norm": 3.32788348197937, + "learning_rate": 3.183387211920663e-05, + "loss": 0.2726, + "step": 12358 + }, + { + "epoch": 0.6783754116355654, + "grad_norm": 1.3913823366165161, + "learning_rate": 3.182883540334301e-05, + "loss": 0.2989, + "step": 12360 + }, + { + "epoch": 0.6784851811196487, + "grad_norm": 1.8601527214050293, + "learning_rate": 3.1823798387933134e-05, + "loss": 0.4455, + "step": 12362 + }, + { + "epoch": 0.6785949506037322, + "grad_norm": 1.3059706687927246, + "learning_rate": 3.181876107319797e-05, + "loss": 0.2468, + "step": 12364 + }, + { + "epoch": 0.6787047200878156, + "grad_norm": 2.1254706382751465, + "learning_rate": 3.181372345935848e-05, + "loss": 0.2536, + "step": 12366 + }, + { + "epoch": 0.678814489571899, + "grad_norm": 1.3754957914352417, + "learning_rate": 3.180868554663564e-05, + "loss": 0.2545, + "step": 12368 + }, + { + "epoch": 0.6789242590559824, + "grad_norm": 1.866571068763733, + "learning_rate": 3.180364733525043e-05, + "loss": 0.264, + "step": 12370 + }, + { + "epoch": 0.6790340285400659, + "grad_norm": 2.4073081016540527, + "learning_rate": 3.179860882542385e-05, + "loss": 0.2877, + "step": 12372 + }, + { + "epoch": 0.6791437980241493, + "grad_norm": 1.4584429264068604, + "learning_rate": 3.179357001737692e-05, + "loss": 0.4039, + "step": 12374 + }, + { + "epoch": 0.6792535675082327, + "grad_norm": 1.38982093334198, + "learning_rate": 3.178853091133066e-05, + "loss": 0.3288, + "step": 12376 + }, + { + "epoch": 0.6793633369923161, + "grad_norm": 1.212857961654663, + "learning_rate": 3.178349150750612e-05, + "loss": 0.2634, + "step": 12378 + }, + { + "epoch": 0.6794731064763996, + "grad_norm": 3.071079730987549, + "learning_rate": 3.1778451806124346e-05, + "loss": 0.2583, + "step": 12380 + }, + { + "epoch": 0.679582875960483, + "grad_norm": 3.4262871742248535, + "learning_rate": 3.17734118074064e-05, + "loss": 0.2261, + "step": 12382 + }, + { + "epoch": 0.6796926454445664, + "grad_norm": 3.025269031524658, + "learning_rate": 3.176837151157337e-05, + "loss": 0.3993, + "step": 12384 + }, + { + "epoch": 0.6798024149286498, + "grad_norm": 2.43485951423645, + "learning_rate": 3.176333091884635e-05, + "loss": 0.22, + "step": 12386 + }, + { + "epoch": 0.6799121844127333, + "grad_norm": 1.13909912109375, + "learning_rate": 3.175829002944643e-05, + "loss": 0.3311, + "step": 12388 + }, + { + "epoch": 0.6800219538968166, + "grad_norm": 1.4662575721740723, + "learning_rate": 3.175324884359474e-05, + "loss": 0.1945, + "step": 12390 + }, + { + "epoch": 0.6801317233809001, + "grad_norm": 1.2966811656951904, + "learning_rate": 3.1748207361512416e-05, + "loss": 0.292, + "step": 12392 + }, + { + "epoch": 0.6802414928649836, + "grad_norm": 1.3668485879898071, + "learning_rate": 3.174316558342059e-05, + "loss": 0.2982, + "step": 12394 + }, + { + "epoch": 0.680351262349067, + "grad_norm": 1.9756765365600586, + "learning_rate": 3.173812350954041e-05, + "loss": 0.2123, + "step": 12396 + }, + { + "epoch": 0.6804610318331504, + "grad_norm": 1.2009015083312988, + "learning_rate": 3.173308114009308e-05, + "loss": 0.3568, + "step": 12398 + }, + { + "epoch": 0.6805708013172338, + "grad_norm": 2.333350658416748, + "learning_rate": 3.172803847529976e-05, + "loss": 0.2049, + "step": 12400 + }, + { + "epoch": 0.6806805708013173, + "grad_norm": 1.1577160358428955, + "learning_rate": 3.172299551538164e-05, + "loss": 0.3092, + "step": 12402 + }, + { + "epoch": 0.6807903402854006, + "grad_norm": 1.5357073545455933, + "learning_rate": 3.171795226055995e-05, + "loss": 0.2771, + "step": 12404 + }, + { + "epoch": 0.6809001097694841, + "grad_norm": 1.9535592794418335, + "learning_rate": 3.1712908711055897e-05, + "loss": 0.3026, + "step": 12406 + }, + { + "epoch": 0.6810098792535675, + "grad_norm": 1.7783825397491455, + "learning_rate": 3.170786486709071e-05, + "loss": 0.2904, + "step": 12408 + }, + { + "epoch": 0.681119648737651, + "grad_norm": 1.9082902669906616, + "learning_rate": 3.170282072888566e-05, + "loss": 0.307, + "step": 12410 + }, + { + "epoch": 0.6812294182217343, + "grad_norm": 0.8979579210281372, + "learning_rate": 3.169777629666199e-05, + "loss": 0.2568, + "step": 12412 + }, + { + "epoch": 0.6813391877058178, + "grad_norm": 1.789161205291748, + "learning_rate": 3.169273157064097e-05, + "loss": 0.2464, + "step": 12414 + }, + { + "epoch": 0.6814489571899012, + "grad_norm": 1.4536083936691284, + "learning_rate": 3.168768655104389e-05, + "loss": 0.2629, + "step": 12416 + }, + { + "epoch": 0.6815587266739846, + "grad_norm": 1.5284160375595093, + "learning_rate": 3.1682641238092064e-05, + "loss": 0.2993, + "step": 12418 + }, + { + "epoch": 0.681668496158068, + "grad_norm": 1.2529560327529907, + "learning_rate": 3.1677595632006786e-05, + "loss": 0.377, + "step": 12420 + }, + { + "epoch": 0.6817782656421515, + "grad_norm": 0.9937953352928162, + "learning_rate": 3.1672549733009396e-05, + "loss": 0.259, + "step": 12422 + }, + { + "epoch": 0.6818880351262349, + "grad_norm": 2.732011556625366, + "learning_rate": 3.1667503541321216e-05, + "loss": 0.1818, + "step": 12424 + }, + { + "epoch": 0.6819978046103183, + "grad_norm": 1.6104716062545776, + "learning_rate": 3.1662457057163604e-05, + "loss": 0.292, + "step": 12426 + }, + { + "epoch": 0.6821075740944017, + "grad_norm": 2.3173775672912598, + "learning_rate": 3.165741028075793e-05, + "loss": 0.2664, + "step": 12428 + }, + { + "epoch": 0.6822173435784852, + "grad_norm": 1.244768500328064, + "learning_rate": 3.165236321232557e-05, + "loss": 0.3292, + "step": 12430 + }, + { + "epoch": 0.6823271130625687, + "grad_norm": 1.3772408962249756, + "learning_rate": 3.164731585208789e-05, + "loss": 0.2946, + "step": 12432 + }, + { + "epoch": 0.682436882546652, + "grad_norm": 0.916666567325592, + "learning_rate": 3.1642268200266317e-05, + "loss": 0.1548, + "step": 12434 + }, + { + "epoch": 0.6825466520307355, + "grad_norm": 1.4068984985351562, + "learning_rate": 3.163722025708227e-05, + "loss": 0.2435, + "step": 12436 + }, + { + "epoch": 0.6826564215148189, + "grad_norm": 2.0931177139282227, + "learning_rate": 3.163217202275715e-05, + "loss": 0.2642, + "step": 12438 + }, + { + "epoch": 0.6827661909989023, + "grad_norm": 1.0896358489990234, + "learning_rate": 3.1627123497512415e-05, + "loss": 0.2925, + "step": 12440 + }, + { + "epoch": 0.6828759604829857, + "grad_norm": 1.647046446800232, + "learning_rate": 3.162207468156952e-05, + "loss": 0.3235, + "step": 12442 + }, + { + "epoch": 0.6829857299670692, + "grad_norm": 1.7311114072799683, + "learning_rate": 3.161702557514993e-05, + "loss": 0.2796, + "step": 12444 + }, + { + "epoch": 0.6830954994511526, + "grad_norm": 2.238919496536255, + "learning_rate": 3.161197617847511e-05, + "loss": 0.2676, + "step": 12446 + }, + { + "epoch": 0.683205268935236, + "grad_norm": 1.3105181455612183, + "learning_rate": 3.160692649176657e-05, + "loss": 0.2828, + "step": 12448 + }, + { + "epoch": 0.6833150384193194, + "grad_norm": 2.2279810905456543, + "learning_rate": 3.16018765152458e-05, + "loss": 0.2738, + "step": 12450 + }, + { + "epoch": 0.6834248079034029, + "grad_norm": 1.2560701370239258, + "learning_rate": 3.1596826249134324e-05, + "loss": 0.2419, + "step": 12452 + }, + { + "epoch": 0.6835345773874862, + "grad_norm": 1.5377308130264282, + "learning_rate": 3.1591775693653674e-05, + "loss": 0.2861, + "step": 12454 + }, + { + "epoch": 0.6836443468715697, + "grad_norm": 1.3435720205307007, + "learning_rate": 3.1586724849025385e-05, + "loss": 0.2606, + "step": 12456 + }, + { + "epoch": 0.6837541163556531, + "grad_norm": 3.2967941761016846, + "learning_rate": 3.1581673715471006e-05, + "loss": 0.3469, + "step": 12458 + }, + { + "epoch": 0.6838638858397366, + "grad_norm": 1.269344687461853, + "learning_rate": 3.157662229321212e-05, + "loss": 0.275, + "step": 12460 + }, + { + "epoch": 0.6839736553238199, + "grad_norm": 1.4938890933990479, + "learning_rate": 3.15715705824703e-05, + "loss": 0.1532, + "step": 12462 + }, + { + "epoch": 0.6840834248079034, + "grad_norm": 1.455647587776184, + "learning_rate": 3.156651858346714e-05, + "loss": 0.1392, + "step": 12464 + }, + { + "epoch": 0.6841931942919868, + "grad_norm": 1.8334153890609741, + "learning_rate": 3.156146629642425e-05, + "loss": 0.3941, + "step": 12466 + }, + { + "epoch": 0.6843029637760702, + "grad_norm": 2.1609086990356445, + "learning_rate": 3.1556413721563235e-05, + "loss": 0.3277, + "step": 12468 + }, + { + "epoch": 0.6844127332601537, + "grad_norm": 1.5209178924560547, + "learning_rate": 3.155136085910573e-05, + "loss": 0.1979, + "step": 12470 + }, + { + "epoch": 0.6845225027442371, + "grad_norm": 2.471162796020508, + "learning_rate": 3.154630770927339e-05, + "loss": 0.2635, + "step": 12472 + }, + { + "epoch": 0.6846322722283206, + "grad_norm": 1.4583524465560913, + "learning_rate": 3.1541254272287865e-05, + "loss": 0.2624, + "step": 12474 + }, + { + "epoch": 0.6847420417124039, + "grad_norm": 1.1052459478378296, + "learning_rate": 3.153620054837081e-05, + "loss": 0.207, + "step": 12476 + }, + { + "epoch": 0.6848518111964874, + "grad_norm": 2.1180741786956787, + "learning_rate": 3.153114653774393e-05, + "loss": 0.3007, + "step": 12478 + }, + { + "epoch": 0.6849615806805708, + "grad_norm": 2.1210274696350098, + "learning_rate": 3.1526092240628895e-05, + "loss": 0.335, + "step": 12480 + }, + { + "epoch": 0.6850713501646543, + "grad_norm": 1.4130135774612427, + "learning_rate": 3.152103765724743e-05, + "loss": 0.3048, + "step": 12482 + }, + { + "epoch": 0.6851811196487376, + "grad_norm": 2.7379462718963623, + "learning_rate": 3.151598278782124e-05, + "loss": 0.2377, + "step": 12484 + }, + { + "epoch": 0.6852908891328211, + "grad_norm": 2.3890936374664307, + "learning_rate": 3.151092763257206e-05, + "loss": 0.3159, + "step": 12486 + }, + { + "epoch": 0.6854006586169045, + "grad_norm": 1.9963504076004028, + "learning_rate": 3.150587219172164e-05, + "loss": 0.2767, + "step": 12488 + }, + { + "epoch": 0.685510428100988, + "grad_norm": 3.0516228675842285, + "learning_rate": 3.150081646549174e-05, + "loss": 0.2855, + "step": 12490 + }, + { + "epoch": 0.6856201975850713, + "grad_norm": 1.3551896810531616, + "learning_rate": 3.1495760454104116e-05, + "loss": 0.2792, + "step": 12492 + }, + { + "epoch": 0.6857299670691548, + "grad_norm": 1.3045599460601807, + "learning_rate": 3.149070415778056e-05, + "loss": 0.1828, + "step": 12494 + }, + { + "epoch": 0.6858397365532382, + "grad_norm": 1.7462568283081055, + "learning_rate": 3.148564757674286e-05, + "loss": 0.4179, + "step": 12496 + }, + { + "epoch": 0.6859495060373216, + "grad_norm": 2.4370036125183105, + "learning_rate": 3.148059071121282e-05, + "loss": 0.28, + "step": 12498 + }, + { + "epoch": 0.686059275521405, + "grad_norm": 1.4774799346923828, + "learning_rate": 3.1475533561412256e-05, + "loss": 0.1972, + "step": 12500 + }, + { + "epoch": 0.6861690450054885, + "grad_norm": 1.8003942966461182, + "learning_rate": 3.147047612756302e-05, + "loss": 0.3818, + "step": 12502 + }, + { + "epoch": 0.686278814489572, + "grad_norm": 1.3758741617202759, + "learning_rate": 3.1465418409886935e-05, + "loss": 0.2726, + "step": 12504 + }, + { + "epoch": 0.6863885839736553, + "grad_norm": 1.2259657382965088, + "learning_rate": 3.1460360408605866e-05, + "loss": 0.2656, + "step": 12506 + }, + { + "epoch": 0.6864983534577388, + "grad_norm": 2.800041675567627, + "learning_rate": 3.1455302123941685e-05, + "loss": 0.3137, + "step": 12508 + }, + { + "epoch": 0.6866081229418222, + "grad_norm": 1.9987443685531616, + "learning_rate": 3.1450243556116266e-05, + "loss": 0.2734, + "step": 12510 + }, + { + "epoch": 0.6867178924259056, + "grad_norm": 1.854569435119629, + "learning_rate": 3.14451847053515e-05, + "loss": 0.274, + "step": 12512 + }, + { + "epoch": 0.686827661909989, + "grad_norm": 1.4432698488235474, + "learning_rate": 3.1440125571869306e-05, + "loss": 0.241, + "step": 12514 + }, + { + "epoch": 0.6869374313940725, + "grad_norm": 1.3847519159317017, + "learning_rate": 3.1435066155891576e-05, + "loss": 0.1979, + "step": 12516 + }, + { + "epoch": 0.6870472008781559, + "grad_norm": 1.3411953449249268, + "learning_rate": 3.143000645764028e-05, + "loss": 0.2788, + "step": 12518 + }, + { + "epoch": 0.6871569703622393, + "grad_norm": 1.4738284349441528, + "learning_rate": 3.142494647733733e-05, + "loss": 0.249, + "step": 12520 + }, + { + "epoch": 0.6872667398463227, + "grad_norm": 1.1744401454925537, + "learning_rate": 3.1419886215204694e-05, + "loss": 0.261, + "step": 12522 + }, + { + "epoch": 0.6873765093304062, + "grad_norm": 2.460946798324585, + "learning_rate": 3.141482567146434e-05, + "loss": 0.2628, + "step": 12524 + }, + { + "epoch": 0.6874862788144895, + "grad_norm": 1.5847938060760498, + "learning_rate": 3.1409764846338245e-05, + "loss": 0.2294, + "step": 12526 + }, + { + "epoch": 0.687596048298573, + "grad_norm": 1.337211012840271, + "learning_rate": 3.1404703740048406e-05, + "loss": 0.1445, + "step": 12528 + }, + { + "epoch": 0.6877058177826564, + "grad_norm": 2.076197862625122, + "learning_rate": 3.139964235281682e-05, + "loss": 0.2015, + "step": 12530 + }, + { + "epoch": 0.6878155872667399, + "grad_norm": 2.6122846603393555, + "learning_rate": 3.139458068486551e-05, + "loss": 0.203, + "step": 12532 + }, + { + "epoch": 0.6879253567508232, + "grad_norm": 2.686518430709839, + "learning_rate": 3.1389518736416507e-05, + "loss": 0.3751, + "step": 12534 + }, + { + "epoch": 0.6880351262349067, + "grad_norm": 2.7052457332611084, + "learning_rate": 3.138445650769185e-05, + "loss": 0.2726, + "step": 12536 + }, + { + "epoch": 0.6881448957189901, + "grad_norm": 1.6107864379882812, + "learning_rate": 3.137939399891359e-05, + "loss": 0.2395, + "step": 12538 + }, + { + "epoch": 0.6882546652030735, + "grad_norm": 1.8589428663253784, + "learning_rate": 3.13743312103038e-05, + "loss": 0.2908, + "step": 12540 + }, + { + "epoch": 0.688364434687157, + "grad_norm": 1.2668483257293701, + "learning_rate": 3.1369268142084556e-05, + "loss": 0.263, + "step": 12542 + }, + { + "epoch": 0.6884742041712404, + "grad_norm": 2.216956377029419, + "learning_rate": 3.136420479447795e-05, + "loss": 0.4471, + "step": 12544 + }, + { + "epoch": 0.6885839736553239, + "grad_norm": 1.734403133392334, + "learning_rate": 3.135914116770609e-05, + "loss": 0.3178, + "step": 12546 + }, + { + "epoch": 0.6886937431394072, + "grad_norm": 1.7025413513183594, + "learning_rate": 3.1354077261991074e-05, + "loss": 0.4252, + "step": 12548 + }, + { + "epoch": 0.6888035126234907, + "grad_norm": 1.2725874185562134, + "learning_rate": 3.1349013077555045e-05, + "loss": 0.2023, + "step": 12550 + }, + { + "epoch": 0.6889132821075741, + "grad_norm": 1.5531208515167236, + "learning_rate": 3.1343948614620145e-05, + "loss": 0.3536, + "step": 12552 + }, + { + "epoch": 0.6890230515916576, + "grad_norm": 1.0145618915557861, + "learning_rate": 3.1338883873408516e-05, + "loss": 0.2163, + "step": 12554 + }, + { + "epoch": 0.6891328210757409, + "grad_norm": 1.5642026662826538, + "learning_rate": 3.133381885414233e-05, + "loss": 0.2337, + "step": 12556 + }, + { + "epoch": 0.6892425905598244, + "grad_norm": 1.3670145273208618, + "learning_rate": 3.132875355704376e-05, + "loss": 0.1711, + "step": 12558 + }, + { + "epoch": 0.6893523600439078, + "grad_norm": 1.5865962505340576, + "learning_rate": 3.132368798233499e-05, + "loss": 0.3547, + "step": 12560 + }, + { + "epoch": 0.6894621295279912, + "grad_norm": 2.1395459175109863, + "learning_rate": 3.1318622130238236e-05, + "loss": 0.3236, + "step": 12562 + }, + { + "epoch": 0.6895718990120746, + "grad_norm": 1.4760798215866089, + "learning_rate": 3.13135560009757e-05, + "loss": 0.2333, + "step": 12564 + }, + { + "epoch": 0.6896816684961581, + "grad_norm": 1.0063371658325195, + "learning_rate": 3.1308489594769605e-05, + "loss": 0.2333, + "step": 12566 + }, + { + "epoch": 0.6897914379802415, + "grad_norm": 1.873952031135559, + "learning_rate": 3.130342291184219e-05, + "loss": 0.2995, + "step": 12568 + }, + { + "epoch": 0.6899012074643249, + "grad_norm": 1.5126047134399414, + "learning_rate": 3.129835595241571e-05, + "loss": 0.2999, + "step": 12570 + }, + { + "epoch": 0.6900109769484083, + "grad_norm": 1.2175836563110352, + "learning_rate": 3.129328871671243e-05, + "loss": 0.3062, + "step": 12572 + }, + { + "epoch": 0.6901207464324918, + "grad_norm": 1.62493896484375, + "learning_rate": 3.128822120495462e-05, + "loss": 0.2558, + "step": 12574 + }, + { + "epoch": 0.6902305159165751, + "grad_norm": 1.0781134366989136, + "learning_rate": 3.1283153417364545e-05, + "loss": 0.225, + "step": 12576 + }, + { + "epoch": 0.6903402854006586, + "grad_norm": 2.1235570907592773, + "learning_rate": 3.127808535416454e-05, + "loss": 0.2086, + "step": 12578 + }, + { + "epoch": 0.6904500548847421, + "grad_norm": 1.0554890632629395, + "learning_rate": 3.1273017015576885e-05, + "loss": 0.2115, + "step": 12580 + }, + { + "epoch": 0.6905598243688255, + "grad_norm": 2.026413917541504, + "learning_rate": 3.126794840182392e-05, + "loss": 0.3065, + "step": 12582 + }, + { + "epoch": 0.6906695938529089, + "grad_norm": 1.9246972799301147, + "learning_rate": 3.1262879513127976e-05, + "loss": 0.3026, + "step": 12584 + }, + { + "epoch": 0.6907793633369923, + "grad_norm": 3.2583210468292236, + "learning_rate": 3.125781034971139e-05, + "loss": 0.1533, + "step": 12586 + }, + { + "epoch": 0.6908891328210758, + "grad_norm": 1.355456829071045, + "learning_rate": 3.1252740911796527e-05, + "loss": 0.3559, + "step": 12588 + }, + { + "epoch": 0.6909989023051591, + "grad_norm": 3.1071548461914062, + "learning_rate": 3.124767119960576e-05, + "loss": 0.3647, + "step": 12590 + }, + { + "epoch": 0.6911086717892426, + "grad_norm": 1.466719150543213, + "learning_rate": 3.124260121336146e-05, + "loss": 0.3084, + "step": 12592 + }, + { + "epoch": 0.691218441273326, + "grad_norm": 1.6612459421157837, + "learning_rate": 3.123753095328604e-05, + "loss": 0.5396, + "step": 12594 + }, + { + "epoch": 0.6913282107574095, + "grad_norm": 1.4256198406219482, + "learning_rate": 3.12324604196019e-05, + "loss": 0.2751, + "step": 12596 + }, + { + "epoch": 0.6914379802414928, + "grad_norm": 2.1753203868865967, + "learning_rate": 3.122738961253145e-05, + "loss": 0.2282, + "step": 12598 + }, + { + "epoch": 0.6915477497255763, + "grad_norm": 1.5644097328186035, + "learning_rate": 3.122231853229713e-05, + "loss": 0.2921, + "step": 12600 + }, + { + "epoch": 0.6916575192096597, + "grad_norm": 1.3453915119171143, + "learning_rate": 3.121724717912138e-05, + "loss": 0.2482, + "step": 12602 + }, + { + "epoch": 0.6917672886937432, + "grad_norm": 1.5724775791168213, + "learning_rate": 3.1212175553226644e-05, + "loss": 0.3281, + "step": 12604 + }, + { + "epoch": 0.6918770581778265, + "grad_norm": 2.151446580886841, + "learning_rate": 3.1207103654835394e-05, + "loss": 0.2663, + "step": 12606 + }, + { + "epoch": 0.69198682766191, + "grad_norm": 1.7903273105621338, + "learning_rate": 3.120203148417012e-05, + "loss": 0.34, + "step": 12608 + }, + { + "epoch": 0.6920965971459934, + "grad_norm": 1.5786566734313965, + "learning_rate": 3.11969590414533e-05, + "loss": 0.1904, + "step": 12610 + }, + { + "epoch": 0.6922063666300768, + "grad_norm": 1.8824272155761719, + "learning_rate": 3.119188632690744e-05, + "loss": 0.2802, + "step": 12612 + }, + { + "epoch": 0.6923161361141603, + "grad_norm": 1.6946966648101807, + "learning_rate": 3.118681334075506e-05, + "loss": 0.3829, + "step": 12614 + }, + { + "epoch": 0.6924259055982437, + "grad_norm": 1.650410532951355, + "learning_rate": 3.118174008321867e-05, + "loss": 0.4129, + "step": 12616 + }, + { + "epoch": 0.6925356750823272, + "grad_norm": 1.8274866342544556, + "learning_rate": 3.117666655452083e-05, + "loss": 0.4244, + "step": 12618 + }, + { + "epoch": 0.6926454445664105, + "grad_norm": 2.3804240226745605, + "learning_rate": 3.117159275488407e-05, + "loss": 0.334, + "step": 12620 + }, + { + "epoch": 0.692755214050494, + "grad_norm": 1.7847295999526978, + "learning_rate": 3.116651868453097e-05, + "loss": 0.267, + "step": 12622 + }, + { + "epoch": 0.6928649835345774, + "grad_norm": 1.7579412460327148, + "learning_rate": 3.1161444343684076e-05, + "loss": 0.1844, + "step": 12624 + }, + { + "epoch": 0.6929747530186608, + "grad_norm": 1.5055025815963745, + "learning_rate": 3.1156369732566006e-05, + "loss": 0.2333, + "step": 12626 + }, + { + "epoch": 0.6930845225027442, + "grad_norm": 2.0760533809661865, + "learning_rate": 3.115129485139933e-05, + "loss": 0.2752, + "step": 12628 + }, + { + "epoch": 0.6931942919868277, + "grad_norm": 1.3460825681686401, + "learning_rate": 3.1146219700406674e-05, + "loss": 0.3416, + "step": 12630 + }, + { + "epoch": 0.6933040614709111, + "grad_norm": 1.4119505882263184, + "learning_rate": 3.1141144279810666e-05, + "loss": 0.2222, + "step": 12632 + }, + { + "epoch": 0.6934138309549945, + "grad_norm": 1.321868658065796, + "learning_rate": 3.1136068589833914e-05, + "loss": 0.3758, + "step": 12634 + }, + { + "epoch": 0.6935236004390779, + "grad_norm": 1.9402568340301514, + "learning_rate": 3.1130992630699074e-05, + "loss": 0.2724, + "step": 12636 + }, + { + "epoch": 0.6936333699231614, + "grad_norm": 1.4250460863113403, + "learning_rate": 3.1125916402628814e-05, + "loss": 0.2824, + "step": 12638 + }, + { + "epoch": 0.6937431394072447, + "grad_norm": 1.2368134260177612, + "learning_rate": 3.1120839905845794e-05, + "loss": 0.1463, + "step": 12640 + }, + { + "epoch": 0.6938529088913282, + "grad_norm": 1.1766114234924316, + "learning_rate": 3.111576314057268e-05, + "loss": 0.2059, + "step": 12642 + }, + { + "epoch": 0.6939626783754116, + "grad_norm": 2.2569570541381836, + "learning_rate": 3.1110686107032196e-05, + "loss": 0.2437, + "step": 12644 + }, + { + "epoch": 0.6940724478594951, + "grad_norm": 2.202061414718628, + "learning_rate": 3.110560880544701e-05, + "loss": 0.3825, + "step": 12646 + }, + { + "epoch": 0.6941822173435784, + "grad_norm": 1.3569514751434326, + "learning_rate": 3.110053123603986e-05, + "loss": 0.2468, + "step": 12648 + }, + { + "epoch": 0.6942919868276619, + "grad_norm": 1.1898168325424194, + "learning_rate": 3.1095453399033466e-05, + "loss": 0.184, + "step": 12650 + }, + { + "epoch": 0.6944017563117454, + "grad_norm": 1.6514487266540527, + "learning_rate": 3.109037529465056e-05, + "loss": 0.3055, + "step": 12652 + }, + { + "epoch": 0.6945115257958288, + "grad_norm": 1.4080032110214233, + "learning_rate": 3.108529692311391e-05, + "loss": 0.3246, + "step": 12654 + }, + { + "epoch": 0.6946212952799122, + "grad_norm": 1.154447078704834, + "learning_rate": 3.1080218284646266e-05, + "loss": 0.3127, + "step": 12656 + }, + { + "epoch": 0.6947310647639956, + "grad_norm": 1.7198145389556885, + "learning_rate": 3.107513937947041e-05, + "loss": 0.2452, + "step": 12658 + }, + { + "epoch": 0.6948408342480791, + "grad_norm": 1.2223706245422363, + "learning_rate": 3.1070060207809115e-05, + "loss": 0.2484, + "step": 12660 + }, + { + "epoch": 0.6949506037321624, + "grad_norm": 1.929752230644226, + "learning_rate": 3.1064980769885187e-05, + "loss": 0.2145, + "step": 12662 + }, + { + "epoch": 0.6950603732162459, + "grad_norm": 1.7267688512802124, + "learning_rate": 3.105990106592144e-05, + "loss": 0.2955, + "step": 12664 + }, + { + "epoch": 0.6951701427003293, + "grad_norm": 2.3198354244232178, + "learning_rate": 3.1054821096140676e-05, + "loss": 0.2687, + "step": 12666 + }, + { + "epoch": 0.6952799121844128, + "grad_norm": 2.083498001098633, + "learning_rate": 3.104974086076575e-05, + "loss": 0.2763, + "step": 12668 + }, + { + "epoch": 0.6953896816684961, + "grad_norm": 1.6475166082382202, + "learning_rate": 3.10446603600195e-05, + "loss": 0.2411, + "step": 12670 + }, + { + "epoch": 0.6954994511525796, + "grad_norm": 1.278164267539978, + "learning_rate": 3.103957959412476e-05, + "loss": 0.2364, + "step": 12672 + }, + { + "epoch": 0.695609220636663, + "grad_norm": 1.3549001216888428, + "learning_rate": 3.103449856330443e-05, + "loss": 0.2861, + "step": 12674 + }, + { + "epoch": 0.6957189901207464, + "grad_norm": 1.454182505607605, + "learning_rate": 3.1029417267781367e-05, + "loss": 0.3319, + "step": 12676 + }, + { + "epoch": 0.6958287596048298, + "grad_norm": Infinity, + "learning_rate": 3.102687652082597e-05, + "loss": 0.2857, + "step": 12678 + }, + { + "epoch": 0.6959385290889133, + "grad_norm": 2.010495662689209, + "learning_rate": 3.1021794828666744e-05, + "loss": 0.3296, + "step": 12680 + }, + { + "epoch": 0.6960482985729967, + "grad_norm": 1.406460165977478, + "learning_rate": 3.1016712872362035e-05, + "loss": 0.2767, + "step": 12682 + }, + { + "epoch": 0.6961580680570801, + "grad_norm": 1.2970402240753174, + "learning_rate": 3.1011630652134774e-05, + "loss": 0.3157, + "step": 12684 + }, + { + "epoch": 0.6962678375411635, + "grad_norm": 1.4212733507156372, + "learning_rate": 3.100654816820788e-05, + "loss": 0.1622, + "step": 12686 + }, + { + "epoch": 0.696377607025247, + "grad_norm": 2.520721197128296, + "learning_rate": 3.100146542080432e-05, + "loss": 0.277, + "step": 12688 + }, + { + "epoch": 0.6964873765093305, + "grad_norm": 3.2635598182678223, + "learning_rate": 3.0996382410147005e-05, + "loss": 0.268, + "step": 12690 + }, + { + "epoch": 0.6965971459934138, + "grad_norm": 1.6665798425674438, + "learning_rate": 3.099129913645894e-05, + "loss": 0.1651, + "step": 12692 + }, + { + "epoch": 0.6967069154774973, + "grad_norm": 1.4362361431121826, + "learning_rate": 3.0986215599963085e-05, + "loss": 0.3045, + "step": 12694 + }, + { + "epoch": 0.6968166849615807, + "grad_norm": 1.6901012659072876, + "learning_rate": 3.098113180088243e-05, + "loss": 0.3248, + "step": 12696 + }, + { + "epoch": 0.6969264544456641, + "grad_norm": 1.4443011283874512, + "learning_rate": 3.0976047739439974e-05, + "loss": 0.1899, + "step": 12698 + }, + { + "epoch": 0.6970362239297475, + "grad_norm": 1.1996554136276245, + "learning_rate": 3.097096341585874e-05, + "loss": 0.2784, + "step": 12700 + }, + { + "epoch": 0.697145993413831, + "grad_norm": 1.6946967840194702, + "learning_rate": 3.096587883036174e-05, + "loss": 0.2006, + "step": 12702 + }, + { + "epoch": 0.6972557628979144, + "grad_norm": 1.4189645051956177, + "learning_rate": 3.096079398317201e-05, + "loss": 0.3113, + "step": 12704 + }, + { + "epoch": 0.6973655323819978, + "grad_norm": 1.672541856765747, + "learning_rate": 3.09557088745126e-05, + "loss": 0.2634, + "step": 12706 + }, + { + "epoch": 0.6974753018660812, + "grad_norm": 1.820178747177124, + "learning_rate": 3.095062350460656e-05, + "loss": 0.3127, + "step": 12708 + }, + { + "epoch": 0.6975850713501647, + "grad_norm": 1.27256178855896, + "learning_rate": 3.094553787367698e-05, + "loss": 0.2662, + "step": 12710 + }, + { + "epoch": 0.697694840834248, + "grad_norm": 1.179595947265625, + "learning_rate": 3.0940451981946924e-05, + "loss": 0.3972, + "step": 12712 + }, + { + "epoch": 0.6978046103183315, + "grad_norm": 4.082700252532959, + "learning_rate": 3.093536582963947e-05, + "loss": 0.3383, + "step": 12714 + }, + { + "epoch": 0.6979143798024149, + "grad_norm": 1.1015064716339111, + "learning_rate": 3.0930279416977745e-05, + "loss": 0.241, + "step": 12716 + }, + { + "epoch": 0.6980241492864984, + "grad_norm": 1.2099629640579224, + "learning_rate": 3.092519274418487e-05, + "loss": 0.2122, + "step": 12718 + }, + { + "epoch": 0.6981339187705817, + "grad_norm": 1.1138479709625244, + "learning_rate": 3.092010581148395e-05, + "loss": 0.2495, + "step": 12720 + }, + { + "epoch": 0.6982436882546652, + "grad_norm": 1.6641700267791748, + "learning_rate": 3.091501861909813e-05, + "loss": 0.2664, + "step": 12722 + }, + { + "epoch": 0.6983534577387486, + "grad_norm": 1.4468331336975098, + "learning_rate": 3.0909931167250564e-05, + "loss": 0.2528, + "step": 12724 + }, + { + "epoch": 0.698463227222832, + "grad_norm": 1.546808123588562, + "learning_rate": 3.090484345616441e-05, + "loss": 0.251, + "step": 12726 + }, + { + "epoch": 0.6985729967069155, + "grad_norm": 1.2900563478469849, + "learning_rate": 3.089975548606283e-05, + "loss": 0.2664, + "step": 12728 + }, + { + "epoch": 0.6986827661909989, + "grad_norm": 1.5685572624206543, + "learning_rate": 3.089466725716903e-05, + "loss": 0.2674, + "step": 12730 + }, + { + "epoch": 0.6987925356750824, + "grad_norm": 1.1609736680984497, + "learning_rate": 3.088957876970619e-05, + "loss": 0.1677, + "step": 12732 + }, + { + "epoch": 0.6989023051591657, + "grad_norm": 2.5789990425109863, + "learning_rate": 3.088449002389751e-05, + "loss": 0.2734, + "step": 12734 + }, + { + "epoch": 0.6990120746432492, + "grad_norm": 1.2660505771636963, + "learning_rate": 3.087940101996622e-05, + "loss": 0.2562, + "step": 12736 + }, + { + "epoch": 0.6991218441273326, + "grad_norm": 2.0633273124694824, + "learning_rate": 3.087431175813554e-05, + "loss": 0.1614, + "step": 12738 + }, + { + "epoch": 0.6992316136114161, + "grad_norm": 1.5814666748046875, + "learning_rate": 3.086922223862871e-05, + "loss": 0.2577, + "step": 12740 + }, + { + "epoch": 0.6993413830954994, + "grad_norm": 0.9559969902038574, + "learning_rate": 3.0864132461668995e-05, + "loss": 0.2596, + "step": 12742 + }, + { + "epoch": 0.6994511525795829, + "grad_norm": 1.233529806137085, + "learning_rate": 3.085904242747963e-05, + "loss": 0.3287, + "step": 12744 + }, + { + "epoch": 0.6995609220636663, + "grad_norm": 1.877331256866455, + "learning_rate": 3.0853952136283923e-05, + "loss": 0.2034, + "step": 12746 + }, + { + "epoch": 0.6996706915477497, + "grad_norm": 2.2377781867980957, + "learning_rate": 3.0848861588305136e-05, + "loss": 0.2381, + "step": 12748 + }, + { + "epoch": 0.6997804610318331, + "grad_norm": 1.6582218408584595, + "learning_rate": 3.084377078376658e-05, + "loss": 0.1498, + "step": 12750 + }, + { + "epoch": 0.6998902305159166, + "grad_norm": 1.5508089065551758, + "learning_rate": 3.0838679722891544e-05, + "loss": 0.2748, + "step": 12752 + }, + { + "epoch": 0.7, + "grad_norm": 1.1396368741989136, + "learning_rate": 3.083358840590336e-05, + "loss": 0.2307, + "step": 12754 + }, + { + "epoch": 0.7001097694840834, + "grad_norm": 1.1240837574005127, + "learning_rate": 3.082849683302536e-05, + "loss": 0.3677, + "step": 12756 + }, + { + "epoch": 0.7002195389681668, + "grad_norm": 1.552085518836975, + "learning_rate": 3.082340500448087e-05, + "loss": 0.1756, + "step": 12758 + }, + { + "epoch": 0.7003293084522503, + "grad_norm": 1.1840170621871948, + "learning_rate": 3.0818312920493264e-05, + "loss": 0.2211, + "step": 12760 + }, + { + "epoch": 0.7004390779363338, + "grad_norm": 1.9925864934921265, + "learning_rate": 3.0813220581285894e-05, + "loss": 0.2378, + "step": 12762 + }, + { + "epoch": 0.7005488474204171, + "grad_norm": 1.1594136953353882, + "learning_rate": 3.080812798708213e-05, + "loss": 0.2791, + "step": 12764 + }, + { + "epoch": 0.7006586169045006, + "grad_norm": 1.4172284603118896, + "learning_rate": 3.080303513810537e-05, + "loss": 0.2105, + "step": 12766 + }, + { + "epoch": 0.700768386388584, + "grad_norm": 1.4963282346725464, + "learning_rate": 3.0797942034579016e-05, + "loss": 0.2444, + "step": 12768 + }, + { + "epoch": 0.7008781558726674, + "grad_norm": 1.5249009132385254, + "learning_rate": 3.079284867672645e-05, + "loss": 0.2778, + "step": 12770 + }, + { + "epoch": 0.7009879253567508, + "grad_norm": 1.0424491167068481, + "learning_rate": 3.078775506477112e-05, + "loss": 0.2553, + "step": 12772 + }, + { + "epoch": 0.7010976948408343, + "grad_norm": 1.5764102935791016, + "learning_rate": 3.078266119893643e-05, + "loss": 0.275, + "step": 12774 + }, + { + "epoch": 0.7012074643249177, + "grad_norm": 1.4255105257034302, + "learning_rate": 3.077756707944585e-05, + "loss": 0.2806, + "step": 12776 + }, + { + "epoch": 0.7013172338090011, + "grad_norm": 1.6223254203796387, + "learning_rate": 3.0772472706522806e-05, + "loss": 0.191, + "step": 12778 + }, + { + "epoch": 0.7014270032930845, + "grad_norm": 1.4557433128356934, + "learning_rate": 3.076737808039079e-05, + "loss": 0.2468, + "step": 12780 + }, + { + "epoch": 0.701536772777168, + "grad_norm": 1.0703833103179932, + "learning_rate": 3.076228320127326e-05, + "loss": 0.2267, + "step": 12782 + }, + { + "epoch": 0.7016465422612513, + "grad_norm": 1.2211450338363647, + "learning_rate": 3.075718806939371e-05, + "loss": 0.2446, + "step": 12784 + }, + { + "epoch": 0.7017563117453348, + "grad_norm": 1.2205698490142822, + "learning_rate": 3.075209268497563e-05, + "loss": 0.2322, + "step": 12786 + }, + { + "epoch": 0.7018660812294182, + "grad_norm": 1.3290749788284302, + "learning_rate": 3.074699704824252e-05, + "loss": 0.1845, + "step": 12788 + }, + { + "epoch": 0.7019758507135017, + "grad_norm": 1.611080288887024, + "learning_rate": 3.0741901159417924e-05, + "loss": 0.2532, + "step": 12790 + }, + { + "epoch": 0.702085620197585, + "grad_norm": 2.5221357345581055, + "learning_rate": 3.073680501872536e-05, + "loss": 0.3089, + "step": 12792 + }, + { + "epoch": 0.7021953896816685, + "grad_norm": 1.4524791240692139, + "learning_rate": 3.0731708626388364e-05, + "loss": 0.3711, + "step": 12794 + }, + { + "epoch": 0.7023051591657519, + "grad_norm": 0.9464358687400818, + "learning_rate": 3.0726611982630496e-05, + "loss": 0.279, + "step": 12796 + }, + { + "epoch": 0.7024149286498353, + "grad_norm": 4.169971466064453, + "learning_rate": 3.072151508767533e-05, + "loss": 0.308, + "step": 12798 + }, + { + "epoch": 0.7025246981339188, + "grad_norm": 1.4730573892593384, + "learning_rate": 3.071641794174642e-05, + "loss": 0.2625, + "step": 12800 + }, + { + "epoch": 0.7026344676180022, + "grad_norm": 1.7361212968826294, + "learning_rate": 3.0711320545067355e-05, + "loss": 0.3344, + "step": 12802 + }, + { + "epoch": 0.7027442371020857, + "grad_norm": 2.572155714035034, + "learning_rate": 3.070622289786175e-05, + "loss": 0.3359, + "step": 12804 + }, + { + "epoch": 0.702854006586169, + "grad_norm": 1.2311128377914429, + "learning_rate": 3.070112500035319e-05, + "loss": 0.2409, + "step": 12806 + }, + { + "epoch": 0.7029637760702525, + "grad_norm": 1.9514557123184204, + "learning_rate": 3.069602685276532e-05, + "loss": 0.31, + "step": 12808 + }, + { + "epoch": 0.7030735455543359, + "grad_norm": 1.2081694602966309, + "learning_rate": 3.069092845532174e-05, + "loss": 0.174, + "step": 12810 + }, + { + "epoch": 0.7031833150384194, + "grad_norm": 1.5300637483596802, + "learning_rate": 3.068582980824611e-05, + "loss": 0.1876, + "step": 12812 + }, + { + "epoch": 0.7032930845225027, + "grad_norm": 1.6111246347427368, + "learning_rate": 3.0680730911762075e-05, + "loss": 0.2257, + "step": 12814 + }, + { + "epoch": 0.7034028540065862, + "grad_norm": 1.6662200689315796, + "learning_rate": 3.0675631766093304e-05, + "loss": 0.1805, + "step": 12816 + }, + { + "epoch": 0.7035126234906696, + "grad_norm": 1.3079479932785034, + "learning_rate": 3.0670532371463465e-05, + "loss": 0.3524, + "step": 12818 + }, + { + "epoch": 0.703622392974753, + "grad_norm": 1.2754676342010498, + "learning_rate": 3.0665432728096234e-05, + "loss": 0.3031, + "step": 12820 + }, + { + "epoch": 0.7037321624588364, + "grad_norm": 1.8365486860275269, + "learning_rate": 3.066033283621533e-05, + "loss": 0.2864, + "step": 12822 + }, + { + "epoch": 0.7038419319429199, + "grad_norm": 1.6501024961471558, + "learning_rate": 3.065523269604444e-05, + "loss": 0.1862, + "step": 12824 + }, + { + "epoch": 0.7039517014270033, + "grad_norm": 1.4184517860412598, + "learning_rate": 3.065013230780728e-05, + "loss": 0.252, + "step": 12826 + }, + { + "epoch": 0.7040614709110867, + "grad_norm": 1.4625134468078613, + "learning_rate": 3.06450316717276e-05, + "loss": 0.3048, + "step": 12828 + }, + { + "epoch": 0.7041712403951701, + "grad_norm": 1.1598753929138184, + "learning_rate": 3.063993078802911e-05, + "loss": 0.2666, + "step": 12830 + }, + { + "epoch": 0.7042810098792536, + "grad_norm": 1.120234489440918, + "learning_rate": 3.063482965693558e-05, + "loss": 0.1971, + "step": 12832 + }, + { + "epoch": 0.7043907793633369, + "grad_norm": 1.764536738395691, + "learning_rate": 3.0629728278670756e-05, + "loss": 0.3208, + "step": 12834 + }, + { + "epoch": 0.7045005488474204, + "grad_norm": 0.9745375514030457, + "learning_rate": 3.0624626653458423e-05, + "loss": 0.1494, + "step": 12836 + }, + { + "epoch": 0.7046103183315039, + "grad_norm": 2.0802578926086426, + "learning_rate": 3.0619524781522355e-05, + "loss": 0.4336, + "step": 12838 + }, + { + "epoch": 0.7047200878155873, + "grad_norm": 0.930344820022583, + "learning_rate": 3.061442266308635e-05, + "loss": 0.1577, + "step": 12840 + }, + { + "epoch": 0.7048298572996707, + "grad_norm": 1.5257436037063599, + "learning_rate": 3.060932029837421e-05, + "loss": 0.2892, + "step": 12842 + }, + { + "epoch": 0.7049396267837541, + "grad_norm": 5.295181751251221, + "learning_rate": 3.0604217687609746e-05, + "loss": 0.317, + "step": 12844 + }, + { + "epoch": 0.7050493962678376, + "grad_norm": 1.2744112014770508, + "learning_rate": 3.0599114831016796e-05, + "loss": 0.3483, + "step": 12846 + }, + { + "epoch": 0.705159165751921, + "grad_norm": 1.31413733959198, + "learning_rate": 3.059401172881918e-05, + "loss": 0.3001, + "step": 12848 + }, + { + "epoch": 0.7052689352360044, + "grad_norm": 1.7634655237197876, + "learning_rate": 3.058890838124075e-05, + "loss": 0.2499, + "step": 12850 + }, + { + "epoch": 0.7053787047200878, + "grad_norm": 1.003002405166626, + "learning_rate": 3.058380478850538e-05, + "loss": 0.2713, + "step": 12852 + }, + { + "epoch": 0.7054884742041713, + "grad_norm": 0.8621692061424255, + "learning_rate": 3.0578700950836915e-05, + "loss": 0.2148, + "step": 12854 + }, + { + "epoch": 0.7055982436882546, + "grad_norm": 1.505501389503479, + "learning_rate": 3.057359686845924e-05, + "loss": 0.2527, + "step": 12856 + }, + { + "epoch": 0.7057080131723381, + "grad_norm": 1.9474350214004517, + "learning_rate": 3.0568492541596257e-05, + "loss": 0.4812, + "step": 12858 + }, + { + "epoch": 0.7058177826564215, + "grad_norm": 1.3644121885299683, + "learning_rate": 3.0563387970471855e-05, + "loss": 0.3063, + "step": 12860 + }, + { + "epoch": 0.705927552140505, + "grad_norm": 1.8998527526855469, + "learning_rate": 3.055828315530995e-05, + "loss": 0.244, + "step": 12862 + }, + { + "epoch": 0.7060373216245883, + "grad_norm": 1.9928761720657349, + "learning_rate": 3.0553178096334464e-05, + "loss": 0.236, + "step": 12864 + }, + { + "epoch": 0.7061470911086718, + "grad_norm": 1.3057286739349365, + "learning_rate": 3.054807279376934e-05, + "loss": 0.2722, + "step": 12866 + }, + { + "epoch": 0.7062568605927552, + "grad_norm": 0.9807634353637695, + "learning_rate": 3.05429672478385e-05, + "loss": 0.3683, + "step": 12868 + }, + { + "epoch": 0.7063666300768386, + "grad_norm": 1.8502596616744995, + "learning_rate": 3.053786145876592e-05, + "loss": 0.2717, + "step": 12870 + }, + { + "epoch": 0.706476399560922, + "grad_norm": 2.0320193767547607, + "learning_rate": 3.053275542677554e-05, + "loss": 0.2249, + "step": 12872 + }, + { + "epoch": 0.7065861690450055, + "grad_norm": 1.2074192762374878, + "learning_rate": 3.052764915209136e-05, + "loss": 0.2294, + "step": 12874 + }, + { + "epoch": 0.706695938529089, + "grad_norm": 1.7110753059387207, + "learning_rate": 3.052254263493736e-05, + "loss": 0.2293, + "step": 12876 + }, + { + "epoch": 0.7068057080131723, + "grad_norm": 0.9456422924995422, + "learning_rate": 3.0517435875537536e-05, + "loss": 0.1562, + "step": 12878 + }, + { + "epoch": 0.7069154774972558, + "grad_norm": 1.0828908681869507, + "learning_rate": 3.051232887411588e-05, + "loss": 0.2643, + "step": 12880 + }, + { + "epoch": 0.7070252469813392, + "grad_norm": 1.4421306848526, + "learning_rate": 3.0507221630896433e-05, + "loss": 0.2288, + "step": 12882 + }, + { + "epoch": 0.7071350164654227, + "grad_norm": 1.0745724439620972, + "learning_rate": 3.050211414610321e-05, + "loss": 0.24, + "step": 12884 + }, + { + "epoch": 0.707244785949506, + "grad_norm": 1.392905592918396, + "learning_rate": 3.0497006419960256e-05, + "loss": 0.2536, + "step": 12886 + }, + { + "epoch": 0.7073545554335895, + "grad_norm": 1.0617128610610962, + "learning_rate": 3.0491898452691625e-05, + "loss": 0.1394, + "step": 12888 + }, + { + "epoch": 0.7074643249176729, + "grad_norm": 1.4537166357040405, + "learning_rate": 3.048679024452137e-05, + "loss": 0.3052, + "step": 12890 + }, + { + "epoch": 0.7075740944017563, + "grad_norm": 1.568987488746643, + "learning_rate": 3.0481681795673556e-05, + "loss": 0.2296, + "step": 12892 + }, + { + "epoch": 0.7076838638858397, + "grad_norm": 1.89082670211792, + "learning_rate": 3.0476573106372276e-05, + "loss": 0.2189, + "step": 12894 + }, + { + "epoch": 0.7077936333699232, + "grad_norm": 2.7154123783111572, + "learning_rate": 3.0471464176841624e-05, + "loss": 0.2732, + "step": 12896 + }, + { + "epoch": 0.7079034028540065, + "grad_norm": 3.4891421794891357, + "learning_rate": 3.0466355007305697e-05, + "loss": 0.3172, + "step": 12898 + }, + { + "epoch": 0.70801317233809, + "grad_norm": 1.7786725759506226, + "learning_rate": 3.0461245597988603e-05, + "loss": 0.2618, + "step": 12900 + }, + { + "epoch": 0.7081229418221734, + "grad_norm": 1.6289743185043335, + "learning_rate": 3.045613594911448e-05, + "loss": 0.1935, + "step": 12902 + }, + { + "epoch": 0.7082327113062569, + "grad_norm": 1.2846992015838623, + "learning_rate": 3.045102606090745e-05, + "loss": 0.2927, + "step": 12904 + }, + { + "epoch": 0.7083424807903402, + "grad_norm": 1.7767349481582642, + "learning_rate": 3.0445915933591658e-05, + "loss": 0.2878, + "step": 12906 + }, + { + "epoch": 0.7084522502744237, + "grad_norm": 1.0473568439483643, + "learning_rate": 3.0440805567391273e-05, + "loss": 0.2443, + "step": 12908 + }, + { + "epoch": 0.7085620197585072, + "grad_norm": 2.182405471801758, + "learning_rate": 3.0435694962530437e-05, + "loss": 0.2548, + "step": 12910 + }, + { + "epoch": 0.7086717892425906, + "grad_norm": 1.705395221710205, + "learning_rate": 3.0430584119233348e-05, + "loss": 0.2515, + "step": 12912 + }, + { + "epoch": 0.708781558726674, + "grad_norm": 1.4791501760482788, + "learning_rate": 3.0425473037724182e-05, + "loss": 0.2636, + "step": 12914 + }, + { + "epoch": 0.7088913282107574, + "grad_norm": 1.5687830448150635, + "learning_rate": 3.042036171822714e-05, + "loss": 0.2552, + "step": 12916 + }, + { + "epoch": 0.7090010976948409, + "grad_norm": 1.101912021636963, + "learning_rate": 3.041525016096643e-05, + "loss": 0.2135, + "step": 12918 + }, + { + "epoch": 0.7091108671789242, + "grad_norm": 1.4936485290527344, + "learning_rate": 3.0410138366166268e-05, + "loss": 0.2498, + "step": 12920 + }, + { + "epoch": 0.7092206366630077, + "grad_norm": 3.05521297454834, + "learning_rate": 3.0405026334050884e-05, + "loss": 0.2752, + "step": 12922 + }, + { + "epoch": 0.7093304061470911, + "grad_norm": 1.5999261140823364, + "learning_rate": 3.039991406484451e-05, + "loss": 0.3613, + "step": 12924 + }, + { + "epoch": 0.7094401756311746, + "grad_norm": 2.6435976028442383, + "learning_rate": 3.03948015587714e-05, + "loss": 0.3624, + "step": 12926 + }, + { + "epoch": 0.7095499451152579, + "grad_norm": 1.2746477127075195, + "learning_rate": 3.038968881605583e-05, + "loss": 0.3172, + "step": 12928 + }, + { + "epoch": 0.7096597145993414, + "grad_norm": 1.7283469438552856, + "learning_rate": 3.0384575836922036e-05, + "loss": 0.3193, + "step": 12930 + }, + { + "epoch": 0.7097694840834248, + "grad_norm": 1.0342299938201904, + "learning_rate": 3.0379462621594324e-05, + "loss": 0.2546, + "step": 12932 + }, + { + "epoch": 0.7098792535675083, + "grad_norm": 2.1790049076080322, + "learning_rate": 3.037434917029698e-05, + "loss": 0.3296, + "step": 12934 + }, + { + "epoch": 0.7099890230515916, + "grad_norm": 1.5248403549194336, + "learning_rate": 3.03692354832543e-05, + "loss": 0.1394, + "step": 12936 + }, + { + "epoch": 0.7100987925356751, + "grad_norm": 1.645107626914978, + "learning_rate": 3.0364121560690593e-05, + "loss": 0.2111, + "step": 12938 + }, + { + "epoch": 0.7102085620197585, + "grad_norm": 1.3377726078033447, + "learning_rate": 3.0359007402830192e-05, + "loss": 0.37, + "step": 12940 + }, + { + "epoch": 0.7103183315038419, + "grad_norm": 1.624897837638855, + "learning_rate": 3.0353893009897416e-05, + "loss": 0.312, + "step": 12942 + }, + { + "epoch": 0.7104281009879253, + "grad_norm": 1.5258777141571045, + "learning_rate": 3.0348778382116627e-05, + "loss": 0.157, + "step": 12944 + }, + { + "epoch": 0.7105378704720088, + "grad_norm": 0.9331977963447571, + "learning_rate": 3.0343663519712157e-05, + "loss": 0.3508, + "step": 12946 + }, + { + "epoch": 0.7106476399560923, + "grad_norm": 1.9064933061599731, + "learning_rate": 3.0338548422908374e-05, + "loss": 0.3356, + "step": 12948 + }, + { + "epoch": 0.7107574094401756, + "grad_norm": 1.1462035179138184, + "learning_rate": 3.033343309192966e-05, + "loss": 0.2141, + "step": 12950 + }, + { + "epoch": 0.7108671789242591, + "grad_norm": 1.6883127689361572, + "learning_rate": 3.0328317527000395e-05, + "loss": 0.2097, + "step": 12952 + }, + { + "epoch": 0.7109769484083425, + "grad_norm": 0.9069922566413879, + "learning_rate": 3.0323201728344957e-05, + "loss": 0.3175, + "step": 12954 + }, + { + "epoch": 0.711086717892426, + "grad_norm": 2.0884835720062256, + "learning_rate": 3.031808569618777e-05, + "loss": 0.1529, + "step": 12956 + }, + { + "epoch": 0.7111964873765093, + "grad_norm": 1.7763051986694336, + "learning_rate": 3.0312969430753246e-05, + "loss": 0.2718, + "step": 12958 + }, + { + "epoch": 0.7113062568605928, + "grad_norm": 1.22206711769104, + "learning_rate": 3.0307852932265797e-05, + "loss": 0.1746, + "step": 12960 + }, + { + "epoch": 0.7114160263446762, + "grad_norm": 1.4935961961746216, + "learning_rate": 3.030273620094987e-05, + "loss": 0.3346, + "step": 12962 + }, + { + "epoch": 0.7115257958287596, + "grad_norm": 1.8215354681015015, + "learning_rate": 3.029761923702991e-05, + "loss": 0.3071, + "step": 12964 + }, + { + "epoch": 0.711635565312843, + "grad_norm": 1.7762953042984009, + "learning_rate": 3.0292502040730362e-05, + "loss": 0.3164, + "step": 12966 + }, + { + "epoch": 0.7117453347969265, + "grad_norm": 1.5156251192092896, + "learning_rate": 3.0287384612275703e-05, + "loss": 0.2965, + "step": 12968 + }, + { + "epoch": 0.7118551042810098, + "grad_norm": 1.2678683996200562, + "learning_rate": 3.0282266951890398e-05, + "loss": 0.29, + "step": 12970 + }, + { + "epoch": 0.7119648737650933, + "grad_norm": 1.8282827138900757, + "learning_rate": 3.0277149059798943e-05, + "loss": 0.2789, + "step": 12972 + }, + { + "epoch": 0.7120746432491767, + "grad_norm": 1.5984677076339722, + "learning_rate": 3.027203093622582e-05, + "loss": 0.3444, + "step": 12974 + }, + { + "epoch": 0.7121844127332602, + "grad_norm": 1.810978889465332, + "learning_rate": 3.0266912581395557e-05, + "loss": 0.3352, + "step": 12976 + }, + { + "epoch": 0.7122941822173435, + "grad_norm": 1.9970934391021729, + "learning_rate": 3.026179399553264e-05, + "loss": 0.283, + "step": 12978 + }, + { + "epoch": 0.712403951701427, + "grad_norm": 1.7255427837371826, + "learning_rate": 3.0256675178861628e-05, + "loss": 0.352, + "step": 12980 + }, + { + "epoch": 0.7125137211855104, + "grad_norm": 2.8718714714050293, + "learning_rate": 3.0251556131607032e-05, + "loss": 0.215, + "step": 12982 + }, + { + "epoch": 0.7126234906695939, + "grad_norm": 1.0559076070785522, + "learning_rate": 3.024643685399341e-05, + "loss": 0.2558, + "step": 12984 + }, + { + "epoch": 0.7127332601536773, + "grad_norm": 1.5563383102416992, + "learning_rate": 3.024131734624531e-05, + "loss": 0.2666, + "step": 12986 + }, + { + "epoch": 0.7128430296377607, + "grad_norm": 3.1463782787323, + "learning_rate": 3.0236197608587313e-05, + "loss": 0.2538, + "step": 12988 + }, + { + "epoch": 0.7129527991218442, + "grad_norm": 1.4918495416641235, + "learning_rate": 3.023107764124399e-05, + "loss": 0.1996, + "step": 12990 + }, + { + "epoch": 0.7130625686059275, + "grad_norm": 1.5612764358520508, + "learning_rate": 3.0225957444439916e-05, + "loss": 0.2188, + "step": 12992 + }, + { + "epoch": 0.713172338090011, + "grad_norm": 1.8153573274612427, + "learning_rate": 3.0220837018399712e-05, + "loss": 0.3667, + "step": 12994 + }, + { + "epoch": 0.7132821075740944, + "grad_norm": 3.0500805377960205, + "learning_rate": 3.0215716363347956e-05, + "loss": 0.1741, + "step": 12996 + }, + { + "epoch": 0.7133918770581779, + "grad_norm": 1.3173948526382446, + "learning_rate": 3.0210595479509297e-05, + "loss": 0.1992, + "step": 12998 + }, + { + "epoch": 0.7135016465422612, + "grad_norm": 1.9731847047805786, + "learning_rate": 3.0205474367108328e-05, + "loss": 0.2864, + "step": 13000 + }, + { + "epoch": 0.7136114160263447, + "grad_norm": 1.4347317218780518, + "learning_rate": 3.0200353026369716e-05, + "loss": 0.2081, + "step": 13002 + }, + { + "epoch": 0.7137211855104281, + "grad_norm": 1.2140886783599854, + "learning_rate": 3.0195231457518087e-05, + "loss": 0.2436, + "step": 13004 + }, + { + "epoch": 0.7138309549945115, + "grad_norm": 1.9184850454330444, + "learning_rate": 3.019010966077811e-05, + "loss": 0.2481, + "step": 13006 + }, + { + "epoch": 0.7139407244785949, + "grad_norm": 3.7218973636627197, + "learning_rate": 3.018498763637445e-05, + "loss": 0.2126, + "step": 13008 + }, + { + "epoch": 0.7140504939626784, + "grad_norm": 2.5066118240356445, + "learning_rate": 3.017986538453178e-05, + "loss": 0.2967, + "step": 13010 + }, + { + "epoch": 0.7141602634467618, + "grad_norm": 1.3613677024841309, + "learning_rate": 3.0174742905474794e-05, + "loss": 0.1988, + "step": 13012 + }, + { + "epoch": 0.7142700329308452, + "grad_norm": 1.8075093030929565, + "learning_rate": 3.0169620199428183e-05, + "loss": 0.2204, + "step": 13014 + }, + { + "epoch": 0.7143798024149286, + "grad_norm": 1.9010839462280273, + "learning_rate": 3.0164497266616648e-05, + "loss": 0.3001, + "step": 13016 + }, + { + "epoch": 0.7144895718990121, + "grad_norm": 1.2225455045700073, + "learning_rate": 3.0159374107264925e-05, + "loss": 0.3038, + "step": 13018 + }, + { + "epoch": 0.7145993413830956, + "grad_norm": 1.0629589557647705, + "learning_rate": 3.0154250721597732e-05, + "loss": 0.2321, + "step": 13020 + }, + { + "epoch": 0.7147091108671789, + "grad_norm": 2.099564552307129, + "learning_rate": 3.0149127109839793e-05, + "loss": 0.3209, + "step": 13022 + }, + { + "epoch": 0.7148188803512624, + "grad_norm": 1.2059545516967773, + "learning_rate": 3.0144003272215877e-05, + "loss": 0.2708, + "step": 13024 + }, + { + "epoch": 0.7149286498353458, + "grad_norm": 3.127507448196411, + "learning_rate": 3.0138879208950722e-05, + "loss": 0.2605, + "step": 13026 + }, + { + "epoch": 0.7150384193194292, + "grad_norm": 1.6232233047485352, + "learning_rate": 3.0133754920269103e-05, + "loss": 0.2912, + "step": 13028 + }, + { + "epoch": 0.7151481888035126, + "grad_norm": 2.339841842651367, + "learning_rate": 3.012863040639579e-05, + "loss": 0.324, + "step": 13030 + }, + { + "epoch": 0.7152579582875961, + "grad_norm": 1.572165846824646, + "learning_rate": 3.0123505667555584e-05, + "loss": 0.2307, + "step": 13032 + }, + { + "epoch": 0.7153677277716795, + "grad_norm": 1.8816317319869995, + "learning_rate": 3.0118380703973265e-05, + "loss": 0.453, + "step": 13034 + }, + { + "epoch": 0.7154774972557629, + "grad_norm": 1.9111571311950684, + "learning_rate": 3.011325551587365e-05, + "loss": 0.3426, + "step": 13036 + }, + { + "epoch": 0.7155872667398463, + "grad_norm": 1.320780634880066, + "learning_rate": 3.0108130103481554e-05, + "loss": 0.2424, + "step": 13038 + }, + { + "epoch": 0.7156970362239298, + "grad_norm": 1.7201687097549438, + "learning_rate": 3.010300446702179e-05, + "loss": 0.2286, + "step": 13040 + }, + { + "epoch": 0.7158068057080131, + "grad_norm": 4.766092777252197, + "learning_rate": 3.009787860671921e-05, + "loss": 0.4304, + "step": 13042 + }, + { + "epoch": 0.7159165751920966, + "grad_norm": 1.8741662502288818, + "learning_rate": 3.0092752522798652e-05, + "loss": 0.2077, + "step": 13044 + }, + { + "epoch": 0.71602634467618, + "grad_norm": 1.1941092014312744, + "learning_rate": 3.008762621548496e-05, + "loss": 0.2403, + "step": 13046 + }, + { + "epoch": 0.7161361141602635, + "grad_norm": 1.4980072975158691, + "learning_rate": 3.0082499685003025e-05, + "loss": 0.3014, + "step": 13048 + }, + { + "epoch": 0.7162458836443468, + "grad_norm": 1.5377601385116577, + "learning_rate": 3.0077372931577702e-05, + "loss": 0.2257, + "step": 13050 + }, + { + "epoch": 0.7163556531284303, + "grad_norm": 1.1214725971221924, + "learning_rate": 3.0072245955433882e-05, + "loss": 0.2139, + "step": 13052 + }, + { + "epoch": 0.7164654226125137, + "grad_norm": 1.1347414255142212, + "learning_rate": 3.006711875679646e-05, + "loss": 0.3004, + "step": 13054 + }, + { + "epoch": 0.7165751920965971, + "grad_norm": 1.8534208536148071, + "learning_rate": 3.006199133589034e-05, + "loss": 0.3348, + "step": 13056 + }, + { + "epoch": 0.7166849615806806, + "grad_norm": 1.8157984018325806, + "learning_rate": 3.0056863692940428e-05, + "loss": 0.2619, + "step": 13058 + }, + { + "epoch": 0.716794731064764, + "grad_norm": 2.2647781372070312, + "learning_rate": 3.0051735828171653e-05, + "loss": 0.3213, + "step": 13060 + }, + { + "epoch": 0.7169045005488475, + "grad_norm": 1.990967035293579, + "learning_rate": 3.004660774180895e-05, + "loss": 0.2538, + "step": 13062 + }, + { + "epoch": 0.7170142700329308, + "grad_norm": 1.677652359008789, + "learning_rate": 3.004147943407727e-05, + "loss": 0.3352, + "step": 13064 + }, + { + "epoch": 0.7171240395170143, + "grad_norm": 1.6083234548568726, + "learning_rate": 3.003635090520155e-05, + "loss": 0.1944, + "step": 13066 + }, + { + "epoch": 0.7172338090010977, + "grad_norm": 1.3357958793640137, + "learning_rate": 3.0031222155406763e-05, + "loss": 0.2221, + "step": 13068 + }, + { + "epoch": 0.7173435784851812, + "grad_norm": 1.2842700481414795, + "learning_rate": 3.0026093184917868e-05, + "loss": 0.2473, + "step": 13070 + }, + { + "epoch": 0.7174533479692645, + "grad_norm": 1.3925222158432007, + "learning_rate": 3.0020963993959873e-05, + "loss": 0.319, + "step": 13072 + }, + { + "epoch": 0.717563117453348, + "grad_norm": 1.6654586791992188, + "learning_rate": 3.001583458275774e-05, + "loss": 0.3617, + "step": 13074 + }, + { + "epoch": 0.7176728869374314, + "grad_norm": 2.8416125774383545, + "learning_rate": 3.0010704951536482e-05, + "loss": 0.2962, + "step": 13076 + }, + { + "epoch": 0.7177826564215148, + "grad_norm": 1.314545750617981, + "learning_rate": 3.0005575100521118e-05, + "loss": 0.2219, + "step": 13078 + }, + { + "epoch": 0.7178924259055982, + "grad_norm": 1.3222860097885132, + "learning_rate": 3.0000445029936656e-05, + "loss": 0.1757, + "step": 13080 + }, + { + "epoch": 0.7180021953896817, + "grad_norm": 0.9324907660484314, + "learning_rate": 2.999531474000814e-05, + "loss": 0.2886, + "step": 13082 + }, + { + "epoch": 0.718111964873765, + "grad_norm": 1.6845322847366333, + "learning_rate": 2.9990184230960595e-05, + "loss": 0.3928, + "step": 13084 + }, + { + "epoch": 0.7182217343578485, + "grad_norm": 1.4214271306991577, + "learning_rate": 2.9985053503019078e-05, + "loss": 0.285, + "step": 13086 + }, + { + "epoch": 0.7183315038419319, + "grad_norm": 1.3999125957489014, + "learning_rate": 2.997992255640864e-05, + "loss": 0.2619, + "step": 13088 + }, + { + "epoch": 0.7184412733260154, + "grad_norm": 4.817384719848633, + "learning_rate": 2.9974791391354363e-05, + "loss": 0.2599, + "step": 13090 + }, + { + "epoch": 0.7185510428100987, + "grad_norm": 1.9019466638565063, + "learning_rate": 2.9969660008081314e-05, + "loss": 0.1998, + "step": 13092 + }, + { + "epoch": 0.7186608122941822, + "grad_norm": 1.7623761892318726, + "learning_rate": 2.9964528406814584e-05, + "loss": 0.2084, + "step": 13094 + }, + { + "epoch": 0.7187705817782657, + "grad_norm": 1.2744603157043457, + "learning_rate": 2.995939658777927e-05, + "loss": 0.2618, + "step": 13096 + }, + { + "epoch": 0.7188803512623491, + "grad_norm": 1.7047820091247559, + "learning_rate": 2.995426455120049e-05, + "loss": 0.2453, + "step": 13098 + }, + { + "epoch": 0.7189901207464325, + "grad_norm": 1.824493408203125, + "learning_rate": 2.994913229730334e-05, + "loss": 0.3863, + "step": 13100 + }, + { + "epoch": 0.7190998902305159, + "grad_norm": 1.9555842876434326, + "learning_rate": 2.9943999826312957e-05, + "loss": 0.206, + "step": 13102 + }, + { + "epoch": 0.7192096597145994, + "grad_norm": 2.133089542388916, + "learning_rate": 2.993886713845448e-05, + "loss": 0.3519, + "step": 13104 + }, + { + "epoch": 0.7193194291986827, + "grad_norm": 0.9411343932151794, + "learning_rate": 2.993373423395304e-05, + "loss": 0.1967, + "step": 13106 + }, + { + "epoch": 0.7194291986827662, + "grad_norm": 1.2291284799575806, + "learning_rate": 2.99286011130338e-05, + "loss": 0.2412, + "step": 13108 + }, + { + "epoch": 0.7195389681668496, + "grad_norm": 1.3585437536239624, + "learning_rate": 2.992346777592193e-05, + "loss": 0.2572, + "step": 13110 + }, + { + "epoch": 0.7196487376509331, + "grad_norm": 1.1836800575256348, + "learning_rate": 2.9918334222842602e-05, + "loss": 0.3264, + "step": 13112 + }, + { + "epoch": 0.7197585071350164, + "grad_norm": 1.6858282089233398, + "learning_rate": 2.9913200454020983e-05, + "loss": 0.2651, + "step": 13114 + }, + { + "epoch": 0.7198682766190999, + "grad_norm": 0.8949785232543945, + "learning_rate": 2.990806646968229e-05, + "loss": 0.1782, + "step": 13116 + }, + { + "epoch": 0.7199780461031833, + "grad_norm": 1.4118200540542603, + "learning_rate": 2.9902932270051704e-05, + "loss": 0.2975, + "step": 13118 + }, + { + "epoch": 0.7200878155872668, + "grad_norm": 1.05492103099823, + "learning_rate": 2.989779785535444e-05, + "loss": 0.2997, + "step": 13120 + }, + { + "epoch": 0.7201975850713501, + "grad_norm": 1.4894938468933105, + "learning_rate": 2.9892663225815726e-05, + "loss": 0.2634, + "step": 13122 + }, + { + "epoch": 0.7203073545554336, + "grad_norm": 1.7520837783813477, + "learning_rate": 2.988752838166079e-05, + "loss": 0.3648, + "step": 13124 + }, + { + "epoch": 0.720417124039517, + "grad_norm": 1.3772478103637695, + "learning_rate": 2.9882393323114864e-05, + "loss": 0.3748, + "step": 13126 + }, + { + "epoch": 0.7205268935236004, + "grad_norm": 1.636313796043396, + "learning_rate": 2.9877258050403212e-05, + "loss": 0.296, + "step": 13128 + }, + { + "epoch": 0.7206366630076838, + "grad_norm": 1.4125816822052002, + "learning_rate": 2.9872122563751077e-05, + "loss": 0.3441, + "step": 13130 + }, + { + "epoch": 0.7207464324917673, + "grad_norm": 1.5212799310684204, + "learning_rate": 2.9866986863383728e-05, + "loss": 0.3444, + "step": 13132 + }, + { + "epoch": 0.7208562019758508, + "grad_norm": 1.3221864700317383, + "learning_rate": 2.986185094952646e-05, + "loss": 0.1887, + "step": 13134 + }, + { + "epoch": 0.7209659714599341, + "grad_norm": 2.756314754486084, + "learning_rate": 2.985671482240453e-05, + "loss": 0.4467, + "step": 13136 + }, + { + "epoch": 0.7210757409440176, + "grad_norm": 1.4514187574386597, + "learning_rate": 2.9851578482243257e-05, + "loss": 0.3598, + "step": 13138 + }, + { + "epoch": 0.721185510428101, + "grad_norm": 2.332228660583496, + "learning_rate": 2.9846441929267942e-05, + "loss": 0.3218, + "step": 13140 + }, + { + "epoch": 0.7212952799121845, + "grad_norm": 1.7111480236053467, + "learning_rate": 2.9841305163703898e-05, + "loss": 0.3562, + "step": 13142 + }, + { + "epoch": 0.7214050493962678, + "grad_norm": 1.5984413623809814, + "learning_rate": 2.9836168185776436e-05, + "loss": 0.3749, + "step": 13144 + }, + { + "epoch": 0.7215148188803513, + "grad_norm": 1.0611674785614014, + "learning_rate": 2.983103099571091e-05, + "loss": 0.2113, + "step": 13146 + }, + { + "epoch": 0.7216245883644347, + "grad_norm": 1.2342439889907837, + "learning_rate": 2.982589359373265e-05, + "loss": 0.2122, + "step": 13148 + }, + { + "epoch": 0.7217343578485181, + "grad_norm": 1.041797161102295, + "learning_rate": 2.9820755980067005e-05, + "loss": 0.283, + "step": 13150 + }, + { + "epoch": 0.7218441273326015, + "grad_norm": 1.2786442041397095, + "learning_rate": 2.9815618154939347e-05, + "loss": 0.291, + "step": 13152 + }, + { + "epoch": 0.721953896816685, + "grad_norm": 2.4763669967651367, + "learning_rate": 2.9810480118575042e-05, + "loss": 0.1616, + "step": 13154 + }, + { + "epoch": 0.7220636663007683, + "grad_norm": 1.3042776584625244, + "learning_rate": 2.980534187119946e-05, + "loss": 0.2341, + "step": 13156 + }, + { + "epoch": 0.7221734357848518, + "grad_norm": 1.70366370677948, + "learning_rate": 2.9800203413038003e-05, + "loss": 0.1715, + "step": 13158 + }, + { + "epoch": 0.7222832052689352, + "grad_norm": 1.5533419847488403, + "learning_rate": 2.9795064744316064e-05, + "loss": 0.3116, + "step": 13160 + }, + { + "epoch": 0.7223929747530187, + "grad_norm": 1.828476905822754, + "learning_rate": 2.9789925865259046e-05, + "loss": 0.1864, + "step": 13162 + }, + { + "epoch": 0.722502744237102, + "grad_norm": 1.0810390710830688, + "learning_rate": 2.978478677609237e-05, + "loss": 0.2299, + "step": 13164 + }, + { + "epoch": 0.7226125137211855, + "grad_norm": 1.2832369804382324, + "learning_rate": 2.9779647477041462e-05, + "loss": 0.285, + "step": 13166 + }, + { + "epoch": 0.722722283205269, + "grad_norm": 1.866602897644043, + "learning_rate": 2.9774507968331762e-05, + "loss": 0.4521, + "step": 13168 + }, + { + "epoch": 0.7228320526893524, + "grad_norm": 1.562896490097046, + "learning_rate": 2.9769368250188697e-05, + "loss": 0.3611, + "step": 13170 + }, + { + "epoch": 0.7229418221734358, + "grad_norm": 1.348801851272583, + "learning_rate": 2.976422832283774e-05, + "loss": 0.215, + "step": 13172 + }, + { + "epoch": 0.7230515916575192, + "grad_norm": 1.7406468391418457, + "learning_rate": 2.9759088186504337e-05, + "loss": 0.2843, + "step": 13174 + }, + { + "epoch": 0.7231613611416027, + "grad_norm": 2.304133892059326, + "learning_rate": 2.975394784141397e-05, + "loss": 0.3265, + "step": 13176 + }, + { + "epoch": 0.723271130625686, + "grad_norm": 1.408852219581604, + "learning_rate": 2.974880728779212e-05, + "loss": 0.2293, + "step": 13178 + }, + { + "epoch": 0.7233809001097695, + "grad_norm": 2.2594478130340576, + "learning_rate": 2.9743666525864266e-05, + "loss": 0.338, + "step": 13180 + }, + { + "epoch": 0.7234906695938529, + "grad_norm": 2.282778263092041, + "learning_rate": 2.9738525555855924e-05, + "loss": 0.2868, + "step": 13182 + }, + { + "epoch": 0.7236004390779364, + "grad_norm": 1.6565648317337036, + "learning_rate": 2.973338437799259e-05, + "loss": 0.3212, + "step": 13184 + }, + { + "epoch": 0.7237102085620197, + "grad_norm": 2.857316017150879, + "learning_rate": 2.9728242992499783e-05, + "loss": 0.2611, + "step": 13186 + }, + { + "epoch": 0.7238199780461032, + "grad_norm": 2.3360137939453125, + "learning_rate": 2.9723101399603032e-05, + "loss": 0.2944, + "step": 13188 + }, + { + "epoch": 0.7239297475301866, + "grad_norm": 1.0035284757614136, + "learning_rate": 2.9717959599527874e-05, + "loss": 0.2216, + "step": 13190 + }, + { + "epoch": 0.72403951701427, + "grad_norm": 1.0479539632797241, + "learning_rate": 2.971281759249985e-05, + "loss": 0.1787, + "step": 13192 + }, + { + "epoch": 0.7241492864983534, + "grad_norm": 4.847830772399902, + "learning_rate": 2.9707675378744503e-05, + "loss": 0.2451, + "step": 13194 + }, + { + "epoch": 0.7242590559824369, + "grad_norm": 1.32900071144104, + "learning_rate": 2.9702532958487423e-05, + "loss": 0.2075, + "step": 13196 + }, + { + "epoch": 0.7243688254665203, + "grad_norm": 2.691025495529175, + "learning_rate": 2.9697390331954157e-05, + "loss": 0.2379, + "step": 13198 + }, + { + "epoch": 0.7244785949506037, + "grad_norm": 1.3143914937973022, + "learning_rate": 2.9692247499370295e-05, + "loss": 0.2737, + "step": 13200 + }, + { + "epoch": 0.7245883644346871, + "grad_norm": 2.347320556640625, + "learning_rate": 2.968710446096143e-05, + "loss": 0.3051, + "step": 13202 + }, + { + "epoch": 0.7246981339187706, + "grad_norm": 2.112602472305298, + "learning_rate": 2.968196121695316e-05, + "loss": 0.2707, + "step": 13204 + }, + { + "epoch": 0.7248079034028541, + "grad_norm": 2.3351340293884277, + "learning_rate": 2.9676817767571086e-05, + "loss": 0.2638, + "step": 13206 + }, + { + "epoch": 0.7249176728869374, + "grad_norm": 1.282254695892334, + "learning_rate": 2.967167411304083e-05, + "loss": 0.2646, + "step": 13208 + }, + { + "epoch": 0.7250274423710209, + "grad_norm": 1.4144775867462158, + "learning_rate": 2.966653025358802e-05, + "loss": 0.2112, + "step": 13210 + }, + { + "epoch": 0.7251372118551043, + "grad_norm": 2.5662879943847656, + "learning_rate": 2.9661386189438277e-05, + "loss": 0.2399, + "step": 13212 + }, + { + "epoch": 0.7252469813391877, + "grad_norm": 1.7138034105300903, + "learning_rate": 2.9656241920817274e-05, + "loss": 0.2647, + "step": 13214 + }, + { + "epoch": 0.7253567508232711, + "grad_norm": 3.084664821624756, + "learning_rate": 2.9651097447950636e-05, + "loss": 0.2983, + "step": 13216 + }, + { + "epoch": 0.7254665203073546, + "grad_norm": 1.1469385623931885, + "learning_rate": 2.9645952771064035e-05, + "loss": 0.2123, + "step": 13218 + }, + { + "epoch": 0.725576289791438, + "grad_norm": 1.7531312704086304, + "learning_rate": 2.964080789038315e-05, + "loss": 0.2524, + "step": 13220 + }, + { + "epoch": 0.7256860592755214, + "grad_norm": 1.6306047439575195, + "learning_rate": 2.9635662806133647e-05, + "loss": 0.2908, + "step": 13222 + }, + { + "epoch": 0.7257958287596048, + "grad_norm": 0.9365378022193909, + "learning_rate": 2.9630517518541217e-05, + "loss": 0.2491, + "step": 13224 + }, + { + "epoch": 0.7259055982436883, + "grad_norm": 1.0802167654037476, + "learning_rate": 2.9625372027831567e-05, + "loss": 0.2567, + "step": 13226 + }, + { + "epoch": 0.7260153677277716, + "grad_norm": 1.5666747093200684, + "learning_rate": 2.9620226334230388e-05, + "loss": 0.3042, + "step": 13228 + }, + { + "epoch": 0.7261251372118551, + "grad_norm": 2.928501605987549, + "learning_rate": 2.9615080437963406e-05, + "loss": 0.2834, + "step": 13230 + }, + { + "epoch": 0.7262349066959385, + "grad_norm": 1.4075103998184204, + "learning_rate": 2.9609934339256352e-05, + "loss": 0.3077, + "step": 13232 + }, + { + "epoch": 0.726344676180022, + "grad_norm": 2.037879467010498, + "learning_rate": 2.960478803833495e-05, + "loss": 0.2629, + "step": 13234 + }, + { + "epoch": 0.7264544456641053, + "grad_norm": 3.5809926986694336, + "learning_rate": 2.9599641535424938e-05, + "loss": 0.4354, + "step": 13236 + }, + { + "epoch": 0.7265642151481888, + "grad_norm": 1.3502552509307861, + "learning_rate": 2.9594494830752072e-05, + "loss": 0.2227, + "step": 13238 + }, + { + "epoch": 0.7266739846322722, + "grad_norm": 1.2470613718032837, + "learning_rate": 2.958934792454212e-05, + "loss": 0.1835, + "step": 13240 + }, + { + "epoch": 0.7267837541163557, + "grad_norm": 1.4336895942687988, + "learning_rate": 2.9584200817020825e-05, + "loss": 0.3269, + "step": 13242 + }, + { + "epoch": 0.7268935236004391, + "grad_norm": 1.2341161966323853, + "learning_rate": 2.9579053508413994e-05, + "loss": 0.2587, + "step": 13244 + }, + { + "epoch": 0.7270032930845225, + "grad_norm": 1.3275747299194336, + "learning_rate": 2.9573905998947404e-05, + "loss": 0.2528, + "step": 13246 + }, + { + "epoch": 0.727113062568606, + "grad_norm": 1.4221059083938599, + "learning_rate": 2.9568758288846836e-05, + "loss": 0.232, + "step": 13248 + }, + { + "epoch": 0.7272228320526893, + "grad_norm": 1.0358721017837524, + "learning_rate": 2.9563610378338115e-05, + "loss": 0.2191, + "step": 13250 + }, + { + "epoch": 0.7273326015367728, + "grad_norm": 1.8086241483688354, + "learning_rate": 2.955846226764704e-05, + "loss": 0.3591, + "step": 13252 + }, + { + "epoch": 0.7274423710208562, + "grad_norm": 1.6078989505767822, + "learning_rate": 2.9553313956999436e-05, + "loss": 0.2134, + "step": 13254 + }, + { + "epoch": 0.7275521405049397, + "grad_norm": 1.7756150960922241, + "learning_rate": 2.9548165446621134e-05, + "loss": 0.3549, + "step": 13256 + }, + { + "epoch": 0.727661909989023, + "grad_norm": 4.720341682434082, + "learning_rate": 2.9543016736737967e-05, + "loss": 0.299, + "step": 13258 + }, + { + "epoch": 0.7277716794731065, + "grad_norm": 1.7775179147720337, + "learning_rate": 2.9537867827575788e-05, + "loss": 0.2364, + "step": 13260 + }, + { + "epoch": 0.7278814489571899, + "grad_norm": 1.3269309997558594, + "learning_rate": 2.9532718719360452e-05, + "loss": 0.1849, + "step": 13262 + }, + { + "epoch": 0.7279912184412733, + "grad_norm": 1.5047588348388672, + "learning_rate": 2.952756941231783e-05, + "loss": 0.271, + "step": 13264 + }, + { + "epoch": 0.7281009879253567, + "grad_norm": 3.9354233741760254, + "learning_rate": 2.9522419906673786e-05, + "loss": 0.465, + "step": 13266 + }, + { + "epoch": 0.7282107574094402, + "grad_norm": 1.452080488204956, + "learning_rate": 2.9517270202654212e-05, + "loss": 0.3328, + "step": 13268 + }, + { + "epoch": 0.7283205268935236, + "grad_norm": 1.7350775003433228, + "learning_rate": 2.9512120300484995e-05, + "loss": 0.2615, + "step": 13270 + }, + { + "epoch": 0.728430296377607, + "grad_norm": 1.8208516836166382, + "learning_rate": 2.9506970200392032e-05, + "loss": 0.2935, + "step": 13272 + }, + { + "epoch": 0.7285400658616904, + "grad_norm": 1.1703819036483765, + "learning_rate": 2.9501819902601234e-05, + "loss": 0.155, + "step": 13274 + }, + { + "epoch": 0.7286498353457739, + "grad_norm": 1.1513643264770508, + "learning_rate": 2.9496669407338524e-05, + "loss": 0.2618, + "step": 13276 + }, + { + "epoch": 0.7287596048298574, + "grad_norm": 1.2414337396621704, + "learning_rate": 2.949151871482982e-05, + "loss": 0.2848, + "step": 13278 + }, + { + "epoch": 0.7288693743139407, + "grad_norm": 1.4283976554870605, + "learning_rate": 2.9486367825301052e-05, + "loss": 0.3341, + "step": 13280 + }, + { + "epoch": 0.7289791437980242, + "grad_norm": 1.4155491590499878, + "learning_rate": 2.948121673897818e-05, + "loss": 0.265, + "step": 13282 + }, + { + "epoch": 0.7290889132821076, + "grad_norm": 1.543744444847107, + "learning_rate": 2.947606545608714e-05, + "loss": 0.3216, + "step": 13284 + }, + { + "epoch": 0.729198682766191, + "grad_norm": 1.5095409154891968, + "learning_rate": 2.9470913976853908e-05, + "loss": 0.2843, + "step": 13286 + }, + { + "epoch": 0.7293084522502744, + "grad_norm": 1.1179548501968384, + "learning_rate": 2.946576230150444e-05, + "loss": 0.2371, + "step": 13288 + }, + { + "epoch": 0.7294182217343579, + "grad_norm": 1.1763041019439697, + "learning_rate": 2.946061043026472e-05, + "loss": 0.2081, + "step": 13290 + }, + { + "epoch": 0.7295279912184413, + "grad_norm": 1.8598957061767578, + "learning_rate": 2.9455458363360727e-05, + "loss": 0.3239, + "step": 13292 + }, + { + "epoch": 0.7296377607025247, + "grad_norm": 1.4782414436340332, + "learning_rate": 2.9450306101018465e-05, + "loss": 0.2018, + "step": 13294 + }, + { + "epoch": 0.7297475301866081, + "grad_norm": 1.5609328746795654, + "learning_rate": 2.9445153643463942e-05, + "loss": 0.3165, + "step": 13296 + }, + { + "epoch": 0.7298572996706916, + "grad_norm": 1.0247331857681274, + "learning_rate": 2.9440000990923143e-05, + "loss": 0.3257, + "step": 13298 + }, + { + "epoch": 0.7299670691547749, + "grad_norm": 0.9167296290397644, + "learning_rate": 2.9434848143622123e-05, + "loss": 0.1763, + "step": 13300 + }, + { + "epoch": 0.7300768386388584, + "grad_norm": 1.9947752952575684, + "learning_rate": 2.9429695101786892e-05, + "loss": 0.3392, + "step": 13302 + }, + { + "epoch": 0.7301866081229418, + "grad_norm": 1.5903087854385376, + "learning_rate": 2.942454186564349e-05, + "loss": 0.3194, + "step": 13304 + }, + { + "epoch": 0.7302963776070253, + "grad_norm": 2.08255934715271, + "learning_rate": 2.9419388435417966e-05, + "loss": 0.2064, + "step": 13306 + }, + { + "epoch": 0.7304061470911086, + "grad_norm": 2.4062271118164062, + "learning_rate": 2.9414234811336377e-05, + "loss": 0.361, + "step": 13308 + }, + { + "epoch": 0.7305159165751921, + "grad_norm": 1.1838139295578003, + "learning_rate": 2.9409080993624777e-05, + "loss": 0.3573, + "step": 13310 + }, + { + "epoch": 0.7306256860592755, + "grad_norm": 1.1998564004898071, + "learning_rate": 2.940392698250925e-05, + "loss": 0.2808, + "step": 13312 + }, + { + "epoch": 0.730735455543359, + "grad_norm": 1.3354201316833496, + "learning_rate": 2.9398772778215868e-05, + "loss": 0.2437, + "step": 13314 + }, + { + "epoch": 0.7308452250274424, + "grad_norm": 0.9576194882392883, + "learning_rate": 2.9393618380970715e-05, + "loss": 0.1628, + "step": 13316 + }, + { + "epoch": 0.7309549945115258, + "grad_norm": 1.6693518161773682, + "learning_rate": 2.93884637909999e-05, + "loss": 0.3849, + "step": 13318 + }, + { + "epoch": 0.7310647639956093, + "grad_norm": 1.0360640287399292, + "learning_rate": 2.9383309008529525e-05, + "loss": 0.149, + "step": 13320 + }, + { + "epoch": 0.7311745334796926, + "grad_norm": 1.957793116569519, + "learning_rate": 2.937815403378569e-05, + "loss": 0.4307, + "step": 13322 + }, + { + "epoch": 0.7312843029637761, + "grad_norm": 0.8506854176521301, + "learning_rate": 2.9372998866994545e-05, + "loss": 0.1856, + "step": 13324 + }, + { + "epoch": 0.7313940724478595, + "grad_norm": 0.977513313293457, + "learning_rate": 2.9367843508382203e-05, + "loss": 0.1883, + "step": 13326 + }, + { + "epoch": 0.731503841931943, + "grad_norm": 1.2611747980117798, + "learning_rate": 2.93626879581748e-05, + "loss": 0.1768, + "step": 13328 + }, + { + "epoch": 0.7316136114160263, + "grad_norm": 1.182350516319275, + "learning_rate": 2.93575322165985e-05, + "loss": 0.2272, + "step": 13330 + }, + { + "epoch": 0.7317233809001098, + "grad_norm": 1.249416470527649, + "learning_rate": 2.935237628387944e-05, + "loss": 0.2149, + "step": 13332 + }, + { + "epoch": 0.7318331503841932, + "grad_norm": 1.235653042793274, + "learning_rate": 2.9347220160243787e-05, + "loss": 0.2146, + "step": 13334 + }, + { + "epoch": 0.7319429198682766, + "grad_norm": 2.0780131816864014, + "learning_rate": 2.9342063845917734e-05, + "loss": 0.4304, + "step": 13336 + }, + { + "epoch": 0.73205268935236, + "grad_norm": 0.9938422441482544, + "learning_rate": 2.9336907341127445e-05, + "loss": 0.1637, + "step": 13338 + }, + { + "epoch": 0.7321624588364435, + "grad_norm": 1.518501877784729, + "learning_rate": 2.933175064609911e-05, + "loss": 0.2761, + "step": 13340 + }, + { + "epoch": 0.7322722283205269, + "grad_norm": 2.015212297439575, + "learning_rate": 2.9326593761058936e-05, + "loss": 0.2606, + "step": 13342 + }, + { + "epoch": 0.7323819978046103, + "grad_norm": 0.9385280013084412, + "learning_rate": 2.9321436686233123e-05, + "loss": 0.1711, + "step": 13344 + }, + { + "epoch": 0.7324917672886937, + "grad_norm": 1.7528173923492432, + "learning_rate": 2.9316279421847875e-05, + "loss": 0.2523, + "step": 13346 + }, + { + "epoch": 0.7326015367727772, + "grad_norm": 1.1985982656478882, + "learning_rate": 2.9311121968129435e-05, + "loss": 0.2273, + "step": 13348 + }, + { + "epoch": 0.7327113062568605, + "grad_norm": 1.960058569908142, + "learning_rate": 2.9305964325304025e-05, + "loss": 0.2918, + "step": 13350 + }, + { + "epoch": 0.732821075740944, + "grad_norm": 2.0911526679992676, + "learning_rate": 2.930080649359788e-05, + "loss": 0.3417, + "step": 13352 + }, + { + "epoch": 0.7329308452250275, + "grad_norm": 1.4998233318328857, + "learning_rate": 2.929564847323726e-05, + "loss": 0.2582, + "step": 13354 + }, + { + "epoch": 0.7330406147091109, + "grad_norm": 2.53131365776062, + "learning_rate": 2.9290490264448412e-05, + "loss": 0.3248, + "step": 13356 + }, + { + "epoch": 0.7331503841931943, + "grad_norm": 1.5131618976593018, + "learning_rate": 2.92853318674576e-05, + "loss": 0.3254, + "step": 13358 + }, + { + "epoch": 0.7332601536772777, + "grad_norm": 1.659690260887146, + "learning_rate": 2.9280173282491096e-05, + "loss": 0.3429, + "step": 13360 + }, + { + "epoch": 0.7333699231613612, + "grad_norm": 1.6151549816131592, + "learning_rate": 2.927501450977519e-05, + "loss": 0.3072, + "step": 13362 + }, + { + "epoch": 0.7334796926454445, + "grad_norm": 3.7984390258789062, + "learning_rate": 2.9269855549536158e-05, + "loss": 0.2735, + "step": 13364 + }, + { + "epoch": 0.733589462129528, + "grad_norm": 1.1259469985961914, + "learning_rate": 2.9264696402000302e-05, + "loss": 0.1931, + "step": 13366 + }, + { + "epoch": 0.7336992316136114, + "grad_norm": 1.548986792564392, + "learning_rate": 2.9259537067393937e-05, + "loss": 0.2043, + "step": 13368 + }, + { + "epoch": 0.7338090010976949, + "grad_norm": 0.9404273629188538, + "learning_rate": 2.925437754594337e-05, + "loss": 0.2116, + "step": 13370 + }, + { + "epoch": 0.7339187705817782, + "grad_norm": 1.243436574935913, + "learning_rate": 2.9249217837874916e-05, + "loss": 0.2633, + "step": 13372 + }, + { + "epoch": 0.7340285400658617, + "grad_norm": 1.2433137893676758, + "learning_rate": 2.9244057943414915e-05, + "loss": 0.1819, + "step": 13374 + }, + { + "epoch": 0.7341383095499451, + "grad_norm": 2.9493701457977295, + "learning_rate": 2.92388978627897e-05, + "loss": 0.3105, + "step": 13376 + }, + { + "epoch": 0.7342480790340286, + "grad_norm": 1.586891531944275, + "learning_rate": 2.9233737596225613e-05, + "loss": 0.3319, + "step": 13378 + }, + { + "epoch": 0.7343578485181119, + "grad_norm": 1.6211819648742676, + "learning_rate": 2.9228577143949025e-05, + "loss": 0.3156, + "step": 13380 + }, + { + "epoch": 0.7344676180021954, + "grad_norm": 1.8706892728805542, + "learning_rate": 2.9223416506186285e-05, + "loss": 0.3722, + "step": 13382 + }, + { + "epoch": 0.7345773874862788, + "grad_norm": 1.299283742904663, + "learning_rate": 2.9218255683163766e-05, + "loss": 0.2737, + "step": 13384 + }, + { + "epoch": 0.7346871569703622, + "grad_norm": 2.3149826526641846, + "learning_rate": 2.9213094675107848e-05, + "loss": 0.369, + "step": 13386 + }, + { + "epoch": 0.7347969264544456, + "grad_norm": 1.1286691427230835, + "learning_rate": 2.9207933482244926e-05, + "loss": 0.2748, + "step": 13388 + }, + { + "epoch": 0.7349066959385291, + "grad_norm": 1.9015018939971924, + "learning_rate": 2.920277210480138e-05, + "loss": 0.2336, + "step": 13390 + }, + { + "epoch": 0.7350164654226126, + "grad_norm": 1.634835958480835, + "learning_rate": 2.9197610543003624e-05, + "loss": 0.2085, + "step": 13392 + }, + { + "epoch": 0.7351262349066959, + "grad_norm": 1.252933144569397, + "learning_rate": 2.9192448797078058e-05, + "loss": 0.2517, + "step": 13394 + }, + { + "epoch": 0.7352360043907794, + "grad_norm": 1.6698194742202759, + "learning_rate": 2.918728686725111e-05, + "loss": 0.4055, + "step": 13396 + }, + { + "epoch": 0.7353457738748628, + "grad_norm": 1.650749921798706, + "learning_rate": 2.9182124753749218e-05, + "loss": 0.3539, + "step": 13398 + }, + { + "epoch": 0.7354555433589463, + "grad_norm": 1.0233674049377441, + "learning_rate": 2.9176962456798805e-05, + "loss": 0.195, + "step": 13400 + }, + { + "epoch": 0.7355653128430296, + "grad_norm": 1.3836841583251953, + "learning_rate": 2.9171799976626313e-05, + "loss": 0.3014, + "step": 13402 + }, + { + "epoch": 0.7356750823271131, + "grad_norm": 1.7993807792663574, + "learning_rate": 2.9166637313458207e-05, + "loss": 0.3158, + "step": 13404 + }, + { + "epoch": 0.7357848518111965, + "grad_norm": 2.2111399173736572, + "learning_rate": 2.9161474467520934e-05, + "loss": 0.2995, + "step": 13406 + }, + { + "epoch": 0.7358946212952799, + "grad_norm": 1.8319576978683472, + "learning_rate": 2.9156311439040952e-05, + "loss": 0.2266, + "step": 13408 + }, + { + "epoch": 0.7360043907793633, + "grad_norm": 1.2788578271865845, + "learning_rate": 2.915114822824476e-05, + "loss": 0.2888, + "step": 13410 + }, + { + "epoch": 0.7361141602634468, + "grad_norm": 1.3554619550704956, + "learning_rate": 2.914598483535883e-05, + "loss": 0.2371, + "step": 13412 + }, + { + "epoch": 0.7362239297475301, + "grad_norm": 1.420877456665039, + "learning_rate": 2.9140821260609657e-05, + "loss": 0.2405, + "step": 13414 + }, + { + "epoch": 0.7363336992316136, + "grad_norm": 1.9721347093582153, + "learning_rate": 2.913565750422374e-05, + "loss": 0.2451, + "step": 13416 + }, + { + "epoch": 0.736443468715697, + "grad_norm": 1.1491481065750122, + "learning_rate": 2.9130493566427587e-05, + "loss": 0.2491, + "step": 13418 + }, + { + "epoch": 0.7365532381997805, + "grad_norm": 1.4685750007629395, + "learning_rate": 2.912532944744771e-05, + "loss": 0.3075, + "step": 13420 + }, + { + "epoch": 0.7366630076838638, + "grad_norm": 1.4024156332015991, + "learning_rate": 2.9120165147510636e-05, + "loss": 0.2185, + "step": 13422 + }, + { + "epoch": 0.7367727771679473, + "grad_norm": 0.8393707871437073, + "learning_rate": 2.911500066684289e-05, + "loss": 0.2622, + "step": 13424 + }, + { + "epoch": 0.7368825466520308, + "grad_norm": 2.40278959274292, + "learning_rate": 2.910983600567102e-05, + "loss": 0.2476, + "step": 13426 + }, + { + "epoch": 0.7369923161361142, + "grad_norm": 1.6163884401321411, + "learning_rate": 2.9104671164221576e-05, + "loss": 0.2817, + "step": 13428 + }, + { + "epoch": 0.7371020856201976, + "grad_norm": 1.4299424886703491, + "learning_rate": 2.9099506142721106e-05, + "loss": 0.2525, + "step": 13430 + }, + { + "epoch": 0.737211855104281, + "grad_norm": 1.3753166198730469, + "learning_rate": 2.909434094139617e-05, + "loss": 0.3819, + "step": 13432 + }, + { + "epoch": 0.7373216245883645, + "grad_norm": 1.3552265167236328, + "learning_rate": 2.9089175560473347e-05, + "loss": 0.2414, + "step": 13434 + }, + { + "epoch": 0.7374313940724478, + "grad_norm": 1.4358974695205688, + "learning_rate": 2.9084010000179217e-05, + "loss": 0.3138, + "step": 13436 + }, + { + "epoch": 0.7375411635565313, + "grad_norm": 1.5109988451004028, + "learning_rate": 2.9078844260740357e-05, + "loss": 0.3017, + "step": 13438 + }, + { + "epoch": 0.7376509330406147, + "grad_norm": 1.4737440347671509, + "learning_rate": 2.907367834238337e-05, + "loss": 0.2155, + "step": 13440 + }, + { + "epoch": 0.7377607025246982, + "grad_norm": 1.565064549446106, + "learning_rate": 2.9068512245334857e-05, + "loss": 0.2813, + "step": 13442 + }, + { + "epoch": 0.7378704720087815, + "grad_norm": 2.5782322883605957, + "learning_rate": 2.906334596982142e-05, + "loss": 0.3404, + "step": 13444 + }, + { + "epoch": 0.737980241492865, + "grad_norm": 1.7094076871871948, + "learning_rate": 2.9058179516069695e-05, + "loss": 0.3132, + "step": 13446 + }, + { + "epoch": 0.7380900109769484, + "grad_norm": 1.7228964567184448, + "learning_rate": 2.9053012884306297e-05, + "loss": 0.2028, + "step": 13448 + }, + { + "epoch": 0.7381997804610319, + "grad_norm": 1.8667176961898804, + "learning_rate": 2.9047846074757856e-05, + "loss": 0.2411, + "step": 13450 + }, + { + "epoch": 0.7383095499451152, + "grad_norm": 1.420392394065857, + "learning_rate": 2.9042679087651022e-05, + "loss": 0.2393, + "step": 13452 + }, + { + "epoch": 0.7384193194291987, + "grad_norm": 1.7681117057800293, + "learning_rate": 2.9037511923212435e-05, + "loss": 0.2682, + "step": 13454 + }, + { + "epoch": 0.7385290889132821, + "grad_norm": 1.383720874786377, + "learning_rate": 2.9032344581668764e-05, + "loss": 0.3712, + "step": 13456 + }, + { + "epoch": 0.7386388583973655, + "grad_norm": 1.5282870531082153, + "learning_rate": 2.9027177063246662e-05, + "loss": 0.2951, + "step": 13458 + }, + { + "epoch": 0.7387486278814489, + "grad_norm": 1.1503112316131592, + "learning_rate": 2.9022009368172814e-05, + "loss": 0.2015, + "step": 13460 + }, + { + "epoch": 0.7388583973655324, + "grad_norm": 1.8551902770996094, + "learning_rate": 2.9016841496673885e-05, + "loss": 0.2356, + "step": 13462 + }, + { + "epoch": 0.7389681668496159, + "grad_norm": 1.2553138732910156, + "learning_rate": 2.9011673448976578e-05, + "loss": 0.2141, + "step": 13464 + }, + { + "epoch": 0.7390779363336992, + "grad_norm": 3.5840651988983154, + "learning_rate": 2.900650522530759e-05, + "loss": 0.3393, + "step": 13466 + }, + { + "epoch": 0.7391877058177827, + "grad_norm": 1.1594058275222778, + "learning_rate": 2.9001336825893605e-05, + "loss": 0.2471, + "step": 13468 + }, + { + "epoch": 0.7392974753018661, + "grad_norm": 1.5636906623840332, + "learning_rate": 2.899616825096135e-05, + "loss": 0.2322, + "step": 13470 + }, + { + "epoch": 0.7394072447859495, + "grad_norm": 1.363059163093567, + "learning_rate": 2.899099950073754e-05, + "loss": 0.2923, + "step": 13472 + }, + { + "epoch": 0.7395170142700329, + "grad_norm": 1.5554585456848145, + "learning_rate": 2.8985830575448914e-05, + "loss": 0.2745, + "step": 13474 + }, + { + "epoch": 0.7396267837541164, + "grad_norm": 1.9819345474243164, + "learning_rate": 2.8980661475322186e-05, + "loss": 0.302, + "step": 13476 + }, + { + "epoch": 0.7397365532381998, + "grad_norm": 1.7646950483322144, + "learning_rate": 2.897549220058411e-05, + "loss": 0.2951, + "step": 13478 + }, + { + "epoch": 0.7398463227222832, + "grad_norm": 1.442673921585083, + "learning_rate": 2.8970322751461427e-05, + "loss": 0.3502, + "step": 13480 + }, + { + "epoch": 0.7399560922063666, + "grad_norm": 1.3337419033050537, + "learning_rate": 2.896515312818091e-05, + "loss": 0.2001, + "step": 13482 + }, + { + "epoch": 0.7400658616904501, + "grad_norm": 1.4482864141464233, + "learning_rate": 2.895998333096931e-05, + "loss": 0.257, + "step": 13484 + }, + { + "epoch": 0.7401756311745334, + "grad_norm": 1.122001051902771, + "learning_rate": 2.8954813360053406e-05, + "loss": 0.2681, + "step": 13486 + }, + { + "epoch": 0.7402854006586169, + "grad_norm": 2.343215227127075, + "learning_rate": 2.894964321565997e-05, + "loss": 0.2372, + "step": 13488 + }, + { + "epoch": 0.7403951701427003, + "grad_norm": 2.5822625160217285, + "learning_rate": 2.8944472898015807e-05, + "loss": 0.2337, + "step": 13490 + }, + { + "epoch": 0.7405049396267838, + "grad_norm": 1.3434470891952515, + "learning_rate": 2.89393024073477e-05, + "loss": 0.2986, + "step": 13492 + }, + { + "epoch": 0.7406147091108671, + "grad_norm": 1.8818819522857666, + "learning_rate": 2.8934131743882447e-05, + "loss": 0.207, + "step": 13494 + }, + { + "epoch": 0.7407244785949506, + "grad_norm": 1.4726016521453857, + "learning_rate": 2.8928960907846875e-05, + "loss": 0.2366, + "step": 13496 + }, + { + "epoch": 0.740834248079034, + "grad_norm": 1.2298386096954346, + "learning_rate": 2.892378989946779e-05, + "loss": 0.2003, + "step": 13498 + }, + { + "epoch": 0.7409440175631175, + "grad_norm": 1.8254878520965576, + "learning_rate": 2.891861871897201e-05, + "loss": 0.2267, + "step": 13500 + }, + { + "epoch": 0.7410537870472009, + "grad_norm": 1.8772075176239014, + "learning_rate": 2.8913447366586388e-05, + "loss": 0.1921, + "step": 13502 + }, + { + "epoch": 0.7411635565312843, + "grad_norm": 5.545892238616943, + "learning_rate": 2.8908275842537764e-05, + "loss": 0.2362, + "step": 13504 + }, + { + "epoch": 0.7412733260153678, + "grad_norm": 2.772766351699829, + "learning_rate": 2.890310414705297e-05, + "loss": 0.2716, + "step": 13506 + }, + { + "epoch": 0.7413830954994511, + "grad_norm": 1.5250840187072754, + "learning_rate": 2.889793228035887e-05, + "loss": 0.3418, + "step": 13508 + }, + { + "epoch": 0.7414928649835346, + "grad_norm": 2.001363754272461, + "learning_rate": 2.8892760242682332e-05, + "loss": 0.1906, + "step": 13510 + }, + { + "epoch": 0.741602634467618, + "grad_norm": 1.3432776927947998, + "learning_rate": 2.888758803425022e-05, + "loss": 0.357, + "step": 13512 + }, + { + "epoch": 0.7417124039517015, + "grad_norm": 1.3552796840667725, + "learning_rate": 2.8882415655289412e-05, + "loss": 0.2852, + "step": 13514 + }, + { + "epoch": 0.7418221734357848, + "grad_norm": 3.9585959911346436, + "learning_rate": 2.8877243106026803e-05, + "loss": 0.2263, + "step": 13516 + }, + { + "epoch": 0.7419319429198683, + "grad_norm": 1.349196434020996, + "learning_rate": 2.8872070386689276e-05, + "loss": 0.3212, + "step": 13518 + }, + { + "epoch": 0.7420417124039517, + "grad_norm": 1.3760870695114136, + "learning_rate": 2.8866897497503743e-05, + "loss": 0.2603, + "step": 13520 + }, + { + "epoch": 0.7421514818880351, + "grad_norm": 0.8073729872703552, + "learning_rate": 2.8861724438697107e-05, + "loss": 0.2242, + "step": 13522 + }, + { + "epoch": 0.7422612513721185, + "grad_norm": 3.193507194519043, + "learning_rate": 2.8856551210496273e-05, + "loss": 0.4608, + "step": 13524 + }, + { + "epoch": 0.742371020856202, + "grad_norm": 1.3793116807937622, + "learning_rate": 2.8851377813128184e-05, + "loss": 0.2397, + "step": 13526 + }, + { + "epoch": 0.7424807903402854, + "grad_norm": 1.5683873891830444, + "learning_rate": 2.884620424681976e-05, + "loss": 0.318, + "step": 13528 + }, + { + "epoch": 0.7425905598243688, + "grad_norm": 1.6612632274627686, + "learning_rate": 2.8841030511797923e-05, + "loss": 0.357, + "step": 13530 + }, + { + "epoch": 0.7427003293084522, + "grad_norm": 3.1413424015045166, + "learning_rate": 2.8835856608289653e-05, + "loss": 0.2624, + "step": 13532 + }, + { + "epoch": 0.7428100987925357, + "grad_norm": 1.9937812089920044, + "learning_rate": 2.883068253652188e-05, + "loss": 0.4128, + "step": 13534 + }, + { + "epoch": 0.7429198682766192, + "grad_norm": 3.3748207092285156, + "learning_rate": 2.8825508296721566e-05, + "loss": 0.2735, + "step": 13536 + }, + { + "epoch": 0.7430296377607025, + "grad_norm": 2.0914924144744873, + "learning_rate": 2.8820333889115685e-05, + "loss": 0.2236, + "step": 13538 + }, + { + "epoch": 0.743139407244786, + "grad_norm": 2.8453099727630615, + "learning_rate": 2.8815159313931206e-05, + "loss": 0.2923, + "step": 13540 + }, + { + "epoch": 0.7432491767288694, + "grad_norm": 2.4359633922576904, + "learning_rate": 2.880998457139511e-05, + "loss": 0.3965, + "step": 13542 + }, + { + "epoch": 0.7433589462129528, + "grad_norm": 1.0273517370224, + "learning_rate": 2.8804809661734394e-05, + "loss": 0.2677, + "step": 13544 + }, + { + "epoch": 0.7434687156970362, + "grad_norm": 3.03994083404541, + "learning_rate": 2.8799634585176057e-05, + "loss": 0.3089, + "step": 13546 + }, + { + "epoch": 0.7435784851811197, + "grad_norm": 1.2092831134796143, + "learning_rate": 2.879445934194709e-05, + "loss": 0.3462, + "step": 13548 + }, + { + "epoch": 0.743688254665203, + "grad_norm": 1.4859931468963623, + "learning_rate": 2.8789283932274514e-05, + "loss": 0.1913, + "step": 13550 + }, + { + "epoch": 0.7437980241492865, + "grad_norm": 1.8669564723968506, + "learning_rate": 2.8784108356385348e-05, + "loss": 0.296, + "step": 13552 + }, + { + "epoch": 0.7439077936333699, + "grad_norm": 1.8724560737609863, + "learning_rate": 2.877893261450661e-05, + "loss": 0.1973, + "step": 13554 + }, + { + "epoch": 0.7440175631174534, + "grad_norm": 1.952019453048706, + "learning_rate": 2.877375670686534e-05, + "loss": 0.227, + "step": 13556 + }, + { + "epoch": 0.7441273326015367, + "grad_norm": 1.5659912824630737, + "learning_rate": 2.8768580633688587e-05, + "loss": 0.3047, + "step": 13558 + }, + { + "epoch": 0.7442371020856202, + "grad_norm": 1.4342149496078491, + "learning_rate": 2.876340439520338e-05, + "loss": 0.3248, + "step": 13560 + }, + { + "epoch": 0.7443468715697036, + "grad_norm": 1.2720918655395508, + "learning_rate": 2.8758227991636788e-05, + "loss": 0.1807, + "step": 13562 + }, + { + "epoch": 0.7444566410537871, + "grad_norm": 1.2372883558273315, + "learning_rate": 2.8753051423215872e-05, + "loss": 0.2595, + "step": 13564 + }, + { + "epoch": 0.7445664105378704, + "grad_norm": 1.1660687923431396, + "learning_rate": 2.87478746901677e-05, + "loss": 0.2032, + "step": 13566 + }, + { + "epoch": 0.7446761800219539, + "grad_norm": 1.3475430011749268, + "learning_rate": 2.874269779271934e-05, + "loss": 0.1757, + "step": 13568 + }, + { + "epoch": 0.7447859495060373, + "grad_norm": 1.3621498346328735, + "learning_rate": 2.8737520731097893e-05, + "loss": 0.2938, + "step": 13570 + }, + { + "epoch": 0.7448957189901207, + "grad_norm": 1.2730426788330078, + "learning_rate": 2.873234350553044e-05, + "loss": 0.2347, + "step": 13572 + }, + { + "epoch": 0.7450054884742042, + "grad_norm": 0.951526403427124, + "learning_rate": 2.8727166116244086e-05, + "loss": 0.3583, + "step": 13574 + }, + { + "epoch": 0.7451152579582876, + "grad_norm": 1.6143083572387695, + "learning_rate": 2.8721988563465925e-05, + "loss": 0.3094, + "step": 13576 + }, + { + "epoch": 0.7452250274423711, + "grad_norm": 1.536942481994629, + "learning_rate": 2.871681084742308e-05, + "loss": 0.3589, + "step": 13578 + }, + { + "epoch": 0.7453347969264544, + "grad_norm": 1.0985691547393799, + "learning_rate": 2.871163296834266e-05, + "loss": 0.2968, + "step": 13580 + }, + { + "epoch": 0.7454445664105379, + "grad_norm": 1.2453793287277222, + "learning_rate": 2.8706454926451815e-05, + "loss": 0.2847, + "step": 13582 + }, + { + "epoch": 0.7455543358946213, + "grad_norm": 1.3196046352386475, + "learning_rate": 2.870127672197766e-05, + "loss": 0.228, + "step": 13584 + }, + { + "epoch": 0.7456641053787048, + "grad_norm": 1.3886500597000122, + "learning_rate": 2.8696098355147334e-05, + "loss": 0.2355, + "step": 13586 + }, + { + "epoch": 0.7457738748627881, + "grad_norm": 1.6985175609588623, + "learning_rate": 2.8690919826188e-05, + "loss": 0.2771, + "step": 13588 + }, + { + "epoch": 0.7458836443468716, + "grad_norm": 1.0032944679260254, + "learning_rate": 2.86857411353268e-05, + "loss": 0.2385, + "step": 13590 + }, + { + "epoch": 0.745993413830955, + "grad_norm": 1.5250394344329834, + "learning_rate": 2.86805622827909e-05, + "loss": 0.1932, + "step": 13592 + }, + { + "epoch": 0.7461031833150384, + "grad_norm": 1.3185361623764038, + "learning_rate": 2.867538326880748e-05, + "loss": 0.3344, + "step": 13594 + }, + { + "epoch": 0.7462129527991218, + "grad_norm": 0.8300945162773132, + "learning_rate": 2.8670204093603713e-05, + "loss": 0.1675, + "step": 13596 + }, + { + "epoch": 0.7463227222832053, + "grad_norm": 1.7652302980422974, + "learning_rate": 2.8665024757406778e-05, + "loss": 0.335, + "step": 13598 + }, + { + "epoch": 0.7464324917672887, + "grad_norm": 1.1419672966003418, + "learning_rate": 2.8659845260443867e-05, + "loss": 0.2159, + "step": 13600 + }, + { + "epoch": 0.7465422612513721, + "grad_norm": 1.6137956380844116, + "learning_rate": 2.865466560294218e-05, + "loss": 0.256, + "step": 13602 + }, + { + "epoch": 0.7466520307354555, + "grad_norm": 1.691311001777649, + "learning_rate": 2.8649485785128916e-05, + "loss": 0.2625, + "step": 13604 + }, + { + "epoch": 0.746761800219539, + "grad_norm": 2.4934630393981934, + "learning_rate": 2.86443058072313e-05, + "loss": 0.192, + "step": 13606 + }, + { + "epoch": 0.7468715697036223, + "grad_norm": 1.160772681236267, + "learning_rate": 2.8639125669476542e-05, + "loss": 0.2161, + "step": 13608 + }, + { + "epoch": 0.7469813391877058, + "grad_norm": 1.475041151046753, + "learning_rate": 2.8633945372091864e-05, + "loss": 0.2305, + "step": 13610 + }, + { + "epoch": 0.7470911086717893, + "grad_norm": 0.8500182032585144, + "learning_rate": 2.8628764915304512e-05, + "loss": 0.1576, + "step": 13612 + }, + { + "epoch": 0.7472008781558727, + "grad_norm": 3.1169564723968506, + "learning_rate": 2.8623584299341722e-05, + "loss": 0.2399, + "step": 13614 + }, + { + "epoch": 0.7473106476399561, + "grad_norm": 2.265178918838501, + "learning_rate": 2.861840352443073e-05, + "loss": 0.2733, + "step": 13616 + }, + { + "epoch": 0.7474204171240395, + "grad_norm": 1.8159408569335938, + "learning_rate": 2.8613222590798812e-05, + "loss": 0.2401, + "step": 13618 + }, + { + "epoch": 0.747530186608123, + "grad_norm": 1.7471336126327515, + "learning_rate": 2.8608041498673206e-05, + "loss": 0.3177, + "step": 13620 + }, + { + "epoch": 0.7476399560922063, + "grad_norm": 1.1342295408248901, + "learning_rate": 2.8602860248281187e-05, + "loss": 0.2231, + "step": 13622 + }, + { + "epoch": 0.7477497255762898, + "grad_norm": 1.3547791242599487, + "learning_rate": 2.8597678839850046e-05, + "loss": 0.2205, + "step": 13624 + }, + { + "epoch": 0.7478594950603732, + "grad_norm": 1.489531397819519, + "learning_rate": 2.859249727360705e-05, + "loss": 0.3332, + "step": 13626 + }, + { + "epoch": 0.7479692645444567, + "grad_norm": 1.2251397371292114, + "learning_rate": 2.858731554977948e-05, + "loss": 0.2323, + "step": 13628 + }, + { + "epoch": 0.74807903402854, + "grad_norm": 1.9564870595932007, + "learning_rate": 2.8582133668594658e-05, + "loss": 0.1736, + "step": 13630 + }, + { + "epoch": 0.7481888035126235, + "grad_norm": 2.0167620182037354, + "learning_rate": 2.8576951630279862e-05, + "loss": 0.2226, + "step": 13632 + }, + { + "epoch": 0.7482985729967069, + "grad_norm": 2.569486141204834, + "learning_rate": 2.8571769435062412e-05, + "loss": 0.2905, + "step": 13634 + }, + { + "epoch": 0.7484083424807904, + "grad_norm": 1.1429859399795532, + "learning_rate": 2.8566587083169623e-05, + "loss": 0.2257, + "step": 13636 + }, + { + "epoch": 0.7485181119648737, + "grad_norm": 2.001027822494507, + "learning_rate": 2.856140457482882e-05, + "loss": 0.1941, + "step": 13638 + }, + { + "epoch": 0.7486278814489572, + "grad_norm": 1.6814528703689575, + "learning_rate": 2.8556221910267328e-05, + "loss": 0.4036, + "step": 13640 + }, + { + "epoch": 0.7487376509330406, + "grad_norm": 1.3952454328536987, + "learning_rate": 2.855103908971249e-05, + "loss": 0.3669, + "step": 13642 + }, + { + "epoch": 0.748847420417124, + "grad_norm": 1.2187998294830322, + "learning_rate": 2.854585611339165e-05, + "loss": 0.2363, + "step": 13644 + }, + { + "epoch": 0.7489571899012074, + "grad_norm": 1.581225872039795, + "learning_rate": 2.8540672981532156e-05, + "loss": 0.2349, + "step": 13646 + }, + { + "epoch": 0.7490669593852909, + "grad_norm": 1.9732078313827515, + "learning_rate": 2.8535489694361365e-05, + "loss": 0.3128, + "step": 13648 + }, + { + "epoch": 0.7491767288693744, + "grad_norm": 1.4308772087097168, + "learning_rate": 2.853030625210664e-05, + "loss": 0.2595, + "step": 13650 + }, + { + "epoch": 0.7492864983534577, + "grad_norm": 2.659809112548828, + "learning_rate": 2.8525122654995368e-05, + "loss": 0.308, + "step": 13652 + }, + { + "epoch": 0.7493962678375412, + "grad_norm": 1.4550950527191162, + "learning_rate": 2.85199389032549e-05, + "loss": 0.3479, + "step": 13654 + }, + { + "epoch": 0.7495060373216246, + "grad_norm": 1.627542495727539, + "learning_rate": 2.851475499711264e-05, + "loss": 0.3105, + "step": 13656 + }, + { + "epoch": 0.749615806805708, + "grad_norm": 1.560602068901062, + "learning_rate": 2.850957093679597e-05, + "loss": 0.2029, + "step": 13658 + }, + { + "epoch": 0.7497255762897914, + "grad_norm": 2.6842920780181885, + "learning_rate": 2.85043867225323e-05, + "loss": 0.3605, + "step": 13660 + }, + { + "epoch": 0.7498353457738749, + "grad_norm": 1.4494906663894653, + "learning_rate": 2.849920235454903e-05, + "loss": 0.2984, + "step": 13662 + }, + { + "epoch": 0.7499451152579583, + "grad_norm": 1.4356234073638916, + "learning_rate": 2.849401783307356e-05, + "loss": 0.2554, + "step": 13664 + }, + { + "epoch": 0.7500548847420417, + "grad_norm": 1.113114833831787, + "learning_rate": 2.8488833158333322e-05, + "loss": 0.1571, + "step": 13666 + }, + { + "epoch": 0.7501646542261251, + "grad_norm": 3.454479455947876, + "learning_rate": 2.848364833055574e-05, + "loss": 0.1549, + "step": 13668 + }, + { + "epoch": 0.7502744237102086, + "grad_norm": 1.2637674808502197, + "learning_rate": 2.8478463349968244e-05, + "loss": 0.2454, + "step": 13670 + }, + { + "epoch": 0.750384193194292, + "grad_norm": 1.3238074779510498, + "learning_rate": 2.8473278216798266e-05, + "loss": 0.2425, + "step": 13672 + }, + { + "epoch": 0.7504939626783754, + "grad_norm": 2.051948070526123, + "learning_rate": 2.8468092931273265e-05, + "loss": 0.2613, + "step": 13674 + }, + { + "epoch": 0.7506037321624588, + "grad_norm": 2.0922696590423584, + "learning_rate": 2.8462907493620682e-05, + "loss": 0.3427, + "step": 13676 + }, + { + "epoch": 0.7507135016465423, + "grad_norm": 1.2864865064620972, + "learning_rate": 2.845772190406798e-05, + "loss": 0.2593, + "step": 13678 + }, + { + "epoch": 0.7508232711306256, + "grad_norm": 0.8769574761390686, + "learning_rate": 2.845253616284262e-05, + "loss": 0.2501, + "step": 13680 + }, + { + "epoch": 0.7509330406147091, + "grad_norm": 1.104520559310913, + "learning_rate": 2.844735027017208e-05, + "loss": 0.2268, + "step": 13682 + }, + { + "epoch": 0.7510428100987926, + "grad_norm": 1.1324084997177124, + "learning_rate": 2.8442164226283834e-05, + "loss": 0.2698, + "step": 13684 + }, + { + "epoch": 0.751152579582876, + "grad_norm": 3.1409215927124023, + "learning_rate": 2.8436978031405375e-05, + "loss": 0.2476, + "step": 13686 + }, + { + "epoch": 0.7512623490669594, + "grad_norm": 1.3640247583389282, + "learning_rate": 2.8431791685764193e-05, + "loss": 0.2711, + "step": 13688 + }, + { + "epoch": 0.7513721185510428, + "grad_norm": 1.2140754461288452, + "learning_rate": 2.8426605189587773e-05, + "loss": 0.2569, + "step": 13690 + }, + { + "epoch": 0.7514818880351263, + "grad_norm": 1.3132051229476929, + "learning_rate": 2.842141854310364e-05, + "loss": 0.1986, + "step": 13692 + }, + { + "epoch": 0.7515916575192096, + "grad_norm": 1.5872409343719482, + "learning_rate": 2.8416231746539297e-05, + "loss": 0.1931, + "step": 13694 + }, + { + "epoch": 0.7517014270032931, + "grad_norm": 1.4863717555999756, + "learning_rate": 2.841104480012225e-05, + "loss": 0.2642, + "step": 13696 + }, + { + "epoch": 0.7518111964873765, + "grad_norm": 1.4602473974227905, + "learning_rate": 2.840585770408004e-05, + "loss": 0.2294, + "step": 13698 + }, + { + "epoch": 0.75192096597146, + "grad_norm": 2.0169546604156494, + "learning_rate": 2.8400670458640204e-05, + "loss": 0.2609, + "step": 13700 + }, + { + "epoch": 0.7520307354555433, + "grad_norm": 1.2310975790023804, + "learning_rate": 2.839548306403026e-05, + "loss": 0.2058, + "step": 13702 + }, + { + "epoch": 0.7521405049396268, + "grad_norm": 1.0209797620773315, + "learning_rate": 2.8390295520477766e-05, + "loss": 0.199, + "step": 13704 + }, + { + "epoch": 0.7522502744237102, + "grad_norm": 1.4549161195755005, + "learning_rate": 2.838510782821028e-05, + "loss": 0.3151, + "step": 13706 + }, + { + "epoch": 0.7523600439077937, + "grad_norm": 1.399675726890564, + "learning_rate": 2.8379919987455334e-05, + "loss": 0.2141, + "step": 13708 + }, + { + "epoch": 0.752469813391877, + "grad_norm": 1.6963855028152466, + "learning_rate": 2.8374731998440513e-05, + "loss": 0.2564, + "step": 13710 + }, + { + "epoch": 0.7525795828759605, + "grad_norm": 1.2550873756408691, + "learning_rate": 2.8369543861393382e-05, + "loss": 0.1791, + "step": 13712 + }, + { + "epoch": 0.7526893523600439, + "grad_norm": 2.4325921535491943, + "learning_rate": 2.8364355576541513e-05, + "loss": 0.2457, + "step": 13714 + }, + { + "epoch": 0.7527991218441273, + "grad_norm": 2.1343870162963867, + "learning_rate": 2.835916714411251e-05, + "loss": 0.17, + "step": 13716 + }, + { + "epoch": 0.7529088913282107, + "grad_norm": 1.450561761856079, + "learning_rate": 2.835397856433394e-05, + "loss": 0.1812, + "step": 13718 + }, + { + "epoch": 0.7530186608122942, + "grad_norm": 1.4371007680892944, + "learning_rate": 2.8348789837433403e-05, + "loss": 0.3021, + "step": 13720 + }, + { + "epoch": 0.7531284302963777, + "grad_norm": 1.6564422845840454, + "learning_rate": 2.8343600963638516e-05, + "loss": 0.2501, + "step": 13722 + }, + { + "epoch": 0.753238199780461, + "grad_norm": 1.0662295818328857, + "learning_rate": 2.833841194317688e-05, + "loss": 0.228, + "step": 13724 + }, + { + "epoch": 0.7533479692645445, + "grad_norm": 2.4987995624542236, + "learning_rate": 2.8333222776276096e-05, + "loss": 0.3361, + "step": 13726 + }, + { + "epoch": 0.7534577387486279, + "grad_norm": 1.813982367515564, + "learning_rate": 2.832803346316381e-05, + "loss": 0.4524, + "step": 13728 + }, + { + "epoch": 0.7535675082327113, + "grad_norm": 1.5391795635223389, + "learning_rate": 2.8322844004067645e-05, + "loss": 0.2706, + "step": 13730 + }, + { + "epoch": 0.7536772777167947, + "grad_norm": 0.9980608820915222, + "learning_rate": 2.831765439921522e-05, + "loss": 0.2475, + "step": 13732 + }, + { + "epoch": 0.7537870472008782, + "grad_norm": 1.5577433109283447, + "learning_rate": 2.8312464648834202e-05, + "loss": 0.2104, + "step": 13734 + }, + { + "epoch": 0.7538968166849616, + "grad_norm": 1.257013201713562, + "learning_rate": 2.8307274753152225e-05, + "loss": 0.2395, + "step": 13736 + }, + { + "epoch": 0.754006586169045, + "grad_norm": 1.1107155084609985, + "learning_rate": 2.8302084712396936e-05, + "loss": 0.1845, + "step": 13738 + }, + { + "epoch": 0.7541163556531284, + "grad_norm": 1.864242672920227, + "learning_rate": 2.8296894526796014e-05, + "loss": 0.2152, + "step": 13740 + }, + { + "epoch": 0.7542261251372119, + "grad_norm": 2.812713384628296, + "learning_rate": 2.8291704196577106e-05, + "loss": 0.3256, + "step": 13742 + }, + { + "epoch": 0.7543358946212952, + "grad_norm": 1.512780785560608, + "learning_rate": 2.8286513721967906e-05, + "loss": 0.2416, + "step": 13744 + }, + { + "epoch": 0.7544456641053787, + "grad_norm": 1.319948673248291, + "learning_rate": 2.8281323103196073e-05, + "loss": 0.3021, + "step": 13746 + }, + { + "epoch": 0.7545554335894621, + "grad_norm": 1.7045392990112305, + "learning_rate": 2.8276132340489307e-05, + "loss": 0.2871, + "step": 13748 + }, + { + "epoch": 0.7546652030735456, + "grad_norm": 1.5948688983917236, + "learning_rate": 2.8270941434075298e-05, + "loss": 0.2767, + "step": 13750 + }, + { + "epoch": 0.7547749725576289, + "grad_norm": 2.2257800102233887, + "learning_rate": 2.8265750384181744e-05, + "loss": 0.3909, + "step": 13752 + }, + { + "epoch": 0.7548847420417124, + "grad_norm": 1.5903539657592773, + "learning_rate": 2.826055919103635e-05, + "loss": 0.3283, + "step": 13754 + }, + { + "epoch": 0.7549945115257958, + "grad_norm": 1.2999675273895264, + "learning_rate": 2.825536785486682e-05, + "loss": 0.2798, + "step": 13756 + }, + { + "epoch": 0.7551042810098793, + "grad_norm": 1.1012606620788574, + "learning_rate": 2.8250176375900876e-05, + "loss": 0.161, + "step": 13758 + }, + { + "epoch": 0.7552140504939627, + "grad_norm": 1.1029157638549805, + "learning_rate": 2.824498475436625e-05, + "loss": 0.1997, + "step": 13760 + }, + { + "epoch": 0.7553238199780461, + "grad_norm": 2.053725004196167, + "learning_rate": 2.8239792990490667e-05, + "loss": 0.5223, + "step": 13762 + }, + { + "epoch": 0.7554335894621296, + "grad_norm": 1.139578104019165, + "learning_rate": 2.8234601084501855e-05, + "loss": 0.2023, + "step": 13764 + }, + { + "epoch": 0.7555433589462129, + "grad_norm": 1.47820246219635, + "learning_rate": 2.8229409036627564e-05, + "loss": 0.3737, + "step": 13766 + }, + { + "epoch": 0.7556531284302964, + "grad_norm": 1.0618510246276855, + "learning_rate": 2.822421684709554e-05, + "loss": 0.2203, + "step": 13768 + }, + { + "epoch": 0.7557628979143798, + "grad_norm": 1.2627668380737305, + "learning_rate": 2.8219024516133547e-05, + "loss": 0.1355, + "step": 13770 + }, + { + "epoch": 0.7558726673984633, + "grad_norm": 1.633997917175293, + "learning_rate": 2.8213832043969336e-05, + "loss": 0.1923, + "step": 13772 + }, + { + "epoch": 0.7559824368825466, + "grad_norm": 1.8129222393035889, + "learning_rate": 2.8208639430830674e-05, + "loss": 0.3605, + "step": 13774 + }, + { + "epoch": 0.7560922063666301, + "grad_norm": 1.2278252840042114, + "learning_rate": 2.8203446676945337e-05, + "loss": 0.2367, + "step": 13776 + }, + { + "epoch": 0.7562019758507135, + "grad_norm": 1.7080953121185303, + "learning_rate": 2.819825378254111e-05, + "loss": 0.2597, + "step": 13778 + }, + { + "epoch": 0.756311745334797, + "grad_norm": 1.258820652961731, + "learning_rate": 2.8193060747845772e-05, + "loss": 0.2114, + "step": 13780 + }, + { + "epoch": 0.7564215148188803, + "grad_norm": 1.3481636047363281, + "learning_rate": 2.8187867573087118e-05, + "loss": 0.2949, + "step": 13782 + }, + { + "epoch": 0.7565312843029638, + "grad_norm": 1.2188234329223633, + "learning_rate": 2.8182674258492947e-05, + "loss": 0.3485, + "step": 13784 + }, + { + "epoch": 0.7566410537870472, + "grad_norm": 1.465378999710083, + "learning_rate": 2.8177480804291056e-05, + "loss": 0.2696, + "step": 13786 + }, + { + "epoch": 0.7567508232711306, + "grad_norm": 3.4757180213928223, + "learning_rate": 2.8172287210709257e-05, + "loss": 0.2585, + "step": 13788 + }, + { + "epoch": 0.756860592755214, + "grad_norm": 0.7669689059257507, + "learning_rate": 2.816709347797538e-05, + "loss": 0.1806, + "step": 13790 + }, + { + "epoch": 0.7569703622392975, + "grad_norm": 1.3922959566116333, + "learning_rate": 2.8161899606317242e-05, + "loss": 0.3148, + "step": 13792 + }, + { + "epoch": 0.7570801317233808, + "grad_norm": 1.1899811029434204, + "learning_rate": 2.8156705595962656e-05, + "loss": 0.2022, + "step": 13794 + }, + { + "epoch": 0.7571899012074643, + "grad_norm": 1.2745540142059326, + "learning_rate": 2.8151511447139477e-05, + "loss": 0.277, + "step": 13796 + }, + { + "epoch": 0.7572996706915478, + "grad_norm": 1.0398584604263306, + "learning_rate": 2.8146317160075537e-05, + "loss": 0.2229, + "step": 13798 + }, + { + "epoch": 0.7574094401756312, + "grad_norm": 1.5048719644546509, + "learning_rate": 2.8141122734998675e-05, + "loss": 0.2124, + "step": 13800 + }, + { + "epoch": 0.7575192096597146, + "grad_norm": 2.133465528488159, + "learning_rate": 2.8135928172136756e-05, + "loss": 0.2462, + "step": 13802 + }, + { + "epoch": 0.757628979143798, + "grad_norm": 1.2665250301361084, + "learning_rate": 2.813073347171764e-05, + "loss": 0.2754, + "step": 13804 + }, + { + "epoch": 0.7577387486278815, + "grad_norm": 1.2298829555511475, + "learning_rate": 2.8125538633969183e-05, + "loss": 0.2596, + "step": 13806 + }, + { + "epoch": 0.7578485181119649, + "grad_norm": 1.5380281209945679, + "learning_rate": 2.8120343659119264e-05, + "loss": 0.3235, + "step": 13808 + }, + { + "epoch": 0.7579582875960483, + "grad_norm": 1.418700933456421, + "learning_rate": 2.811514854739576e-05, + "loss": 0.2985, + "step": 13810 + }, + { + "epoch": 0.7580680570801317, + "grad_norm": 1.0091934204101562, + "learning_rate": 2.8109953299026535e-05, + "loss": 0.1918, + "step": 13812 + }, + { + "epoch": 0.7581778265642152, + "grad_norm": 1.66532301902771, + "learning_rate": 2.810475791423951e-05, + "loss": 0.2682, + "step": 13814 + }, + { + "epoch": 0.7582875960482985, + "grad_norm": 1.3974206447601318, + "learning_rate": 2.809956239326256e-05, + "loss": 0.187, + "step": 13816 + }, + { + "epoch": 0.758397365532382, + "grad_norm": 1.5774837732315063, + "learning_rate": 2.809436673632358e-05, + "loss": 0.2677, + "step": 13818 + }, + { + "epoch": 0.7585071350164654, + "grad_norm": 1.0848777294158936, + "learning_rate": 2.8089170943650496e-05, + "loss": 0.2722, + "step": 13820 + }, + { + "epoch": 0.7586169045005489, + "grad_norm": 1.234145998954773, + "learning_rate": 2.8083975015471215e-05, + "loss": 0.2863, + "step": 13822 + }, + { + "epoch": 0.7587266739846322, + "grad_norm": 1.5192288160324097, + "learning_rate": 2.8078778952013644e-05, + "loss": 0.2452, + "step": 13824 + }, + { + "epoch": 0.7588364434687157, + "grad_norm": 0.9094827771186829, + "learning_rate": 2.8073582753505723e-05, + "loss": 0.2291, + "step": 13826 + }, + { + "epoch": 0.7589462129527991, + "grad_norm": 1.8445271253585815, + "learning_rate": 2.8068386420175375e-05, + "loss": 0.2915, + "step": 13828 + }, + { + "epoch": 0.7590559824368825, + "grad_norm": 1.0339707136154175, + "learning_rate": 2.8063189952250535e-05, + "loss": 0.2183, + "step": 13830 + }, + { + "epoch": 0.759165751920966, + "grad_norm": 1.5155869722366333, + "learning_rate": 2.805799334995915e-05, + "loss": 0.2589, + "step": 13832 + }, + { + "epoch": 0.7592755214050494, + "grad_norm": 1.1036169528961182, + "learning_rate": 2.8052796613529168e-05, + "loss": 0.1741, + "step": 13834 + }, + { + "epoch": 0.7593852908891329, + "grad_norm": 1.06960129737854, + "learning_rate": 2.804759974318854e-05, + "loss": 0.1819, + "step": 13836 + }, + { + "epoch": 0.7594950603732162, + "grad_norm": 1.3204401731491089, + "learning_rate": 2.8042402739165236e-05, + "loss": 0.1682, + "step": 13838 + }, + { + "epoch": 0.7596048298572997, + "grad_norm": 1.5309919118881226, + "learning_rate": 2.803720560168721e-05, + "loss": 0.2497, + "step": 13840 + }, + { + "epoch": 0.7597145993413831, + "grad_norm": 1.3700329065322876, + "learning_rate": 2.803200833098244e-05, + "loss": 0.1701, + "step": 13842 + }, + { + "epoch": 0.7598243688254666, + "grad_norm": 1.3098559379577637, + "learning_rate": 2.8026810927278902e-05, + "loss": 0.2202, + "step": 13844 + }, + { + "epoch": 0.7599341383095499, + "grad_norm": 1.0833489894866943, + "learning_rate": 2.802161339080458e-05, + "loss": 0.1857, + "step": 13846 + }, + { + "epoch": 0.7600439077936334, + "grad_norm": 1.604956865310669, + "learning_rate": 2.8016415721787463e-05, + "loss": 0.1933, + "step": 13848 + }, + { + "epoch": 0.7601536772777168, + "grad_norm": 1.3372704982757568, + "learning_rate": 2.8011217920455546e-05, + "loss": 0.2503, + "step": 13850 + }, + { + "epoch": 0.7602634467618002, + "grad_norm": 2.301969051361084, + "learning_rate": 2.8006019987036836e-05, + "loss": 0.3419, + "step": 13852 + }, + { + "epoch": 0.7603732162458836, + "grad_norm": 2.784813404083252, + "learning_rate": 2.8000821921759334e-05, + "loss": 0.3799, + "step": 13854 + }, + { + "epoch": 0.7604829857299671, + "grad_norm": 0.9630001187324524, + "learning_rate": 2.7995623724851043e-05, + "loss": 0.1797, + "step": 13856 + }, + { + "epoch": 0.7605927552140505, + "grad_norm": 1.7626488208770752, + "learning_rate": 2.799042539654001e-05, + "loss": 0.1974, + "step": 13858 + }, + { + "epoch": 0.7607025246981339, + "grad_norm": 1.3807203769683838, + "learning_rate": 2.7985226937054225e-05, + "loss": 0.2061, + "step": 13860 + }, + { + "epoch": 0.7608122941822173, + "grad_norm": 1.1561075448989868, + "learning_rate": 2.7980028346621738e-05, + "loss": 0.211, + "step": 13862 + }, + { + "epoch": 0.7609220636663008, + "grad_norm": 1.1180169582366943, + "learning_rate": 2.797482962547059e-05, + "loss": 0.3132, + "step": 13864 + }, + { + "epoch": 0.7610318331503841, + "grad_norm": 1.2672265768051147, + "learning_rate": 2.7969630773828802e-05, + "loss": 0.2334, + "step": 13866 + }, + { + "epoch": 0.7611416026344676, + "grad_norm": 1.5439350605010986, + "learning_rate": 2.796443179192444e-05, + "loss": 0.2707, + "step": 13868 + }, + { + "epoch": 0.7612513721185511, + "grad_norm": 1.654942274093628, + "learning_rate": 2.7959232679985547e-05, + "loss": 0.3354, + "step": 13870 + }, + { + "epoch": 0.7613611416026345, + "grad_norm": 1.2905880212783813, + "learning_rate": 2.7954033438240184e-05, + "loss": 0.2342, + "step": 13872 + }, + { + "epoch": 0.7614709110867179, + "grad_norm": 0.9653217792510986, + "learning_rate": 2.794883406691641e-05, + "loss": 0.2815, + "step": 13874 + }, + { + "epoch": 0.7615806805708013, + "grad_norm": 1.5936334133148193, + "learning_rate": 2.794363456624231e-05, + "loss": 0.2224, + "step": 13876 + }, + { + "epoch": 0.7616904500548848, + "grad_norm": 1.7566848993301392, + "learning_rate": 2.7938434936445945e-05, + "loss": 0.2376, + "step": 13878 + }, + { + "epoch": 0.7618002195389681, + "grad_norm": 1.4249454736709595, + "learning_rate": 2.7933235177755395e-05, + "loss": 0.2136, + "step": 13880 + }, + { + "epoch": 0.7619099890230516, + "grad_norm": 1.5386559963226318, + "learning_rate": 2.7928035290398764e-05, + "loss": 0.2532, + "step": 13882 + }, + { + "epoch": 0.762019758507135, + "grad_norm": 0.9482088088989258, + "learning_rate": 2.792283527460413e-05, + "loss": 0.2502, + "step": 13884 + }, + { + "epoch": 0.7621295279912185, + "grad_norm": 2.155744791030884, + "learning_rate": 2.791763513059959e-05, + "loss": 0.1678, + "step": 13886 + }, + { + "epoch": 0.7622392974753018, + "grad_norm": 1.587888240814209, + "learning_rate": 2.7912434858613257e-05, + "loss": 0.2338, + "step": 13888 + }, + { + "epoch": 0.7623490669593853, + "grad_norm": 1.6714838743209839, + "learning_rate": 2.7907234458873237e-05, + "loss": 0.222, + "step": 13890 + }, + { + "epoch": 0.7624588364434687, + "grad_norm": 1.7824105024337769, + "learning_rate": 2.7902033931607634e-05, + "loss": 0.3513, + "step": 13892 + }, + { + "epoch": 0.7625686059275522, + "grad_norm": 0.9928033947944641, + "learning_rate": 2.7896833277044586e-05, + "loss": 0.3048, + "step": 13894 + }, + { + "epoch": 0.7626783754116355, + "grad_norm": 1.7274373769760132, + "learning_rate": 2.7891632495412217e-05, + "loss": 0.2152, + "step": 13896 + }, + { + "epoch": 0.762788144895719, + "grad_norm": 1.4598579406738281, + "learning_rate": 2.7886431586938642e-05, + "loss": 0.3249, + "step": 13898 + }, + { + "epoch": 0.7628979143798024, + "grad_norm": 1.5805808305740356, + "learning_rate": 2.7881230551852023e-05, + "loss": 0.2508, + "step": 13900 + }, + { + "epoch": 0.7630076838638858, + "grad_norm": 0.8990519046783447, + "learning_rate": 2.787602939038049e-05, + "loss": 0.1734, + "step": 13902 + }, + { + "epoch": 0.7631174533479692, + "grad_norm": 1.5542653799057007, + "learning_rate": 2.787082810275218e-05, + "loss": 0.3692, + "step": 13904 + }, + { + "epoch": 0.7632272228320527, + "grad_norm": 1.2371244430541992, + "learning_rate": 2.7865626689195267e-05, + "loss": 0.2118, + "step": 13906 + }, + { + "epoch": 0.7633369923161362, + "grad_norm": 2.297492504119873, + "learning_rate": 2.7860425149937896e-05, + "loss": 0.2505, + "step": 13908 + }, + { + "epoch": 0.7634467618002195, + "grad_norm": 1.7872076034545898, + "learning_rate": 2.7855223485208238e-05, + "loss": 0.2349, + "step": 13910 + }, + { + "epoch": 0.763556531284303, + "grad_norm": 1.1701760292053223, + "learning_rate": 2.785002169523447e-05, + "loss": 0.3269, + "step": 13912 + }, + { + "epoch": 0.7636663007683864, + "grad_norm": 1.982738971710205, + "learning_rate": 2.7844819780244762e-05, + "loss": 0.2818, + "step": 13914 + }, + { + "epoch": 0.7637760702524699, + "grad_norm": 1.5788261890411377, + "learning_rate": 2.783961774046729e-05, + "loss": 0.259, + "step": 13916 + }, + { + "epoch": 0.7638858397365532, + "grad_norm": 1.8646929264068604, + "learning_rate": 2.7834415576130252e-05, + "loss": 0.342, + "step": 13918 + }, + { + "epoch": 0.7639956092206367, + "grad_norm": 1.0636022090911865, + "learning_rate": 2.782921328746183e-05, + "loss": 0.246, + "step": 13920 + }, + { + "epoch": 0.7641053787047201, + "grad_norm": 2.1117632389068604, + "learning_rate": 2.7824010874690227e-05, + "loss": 0.2559, + "step": 13922 + }, + { + "epoch": 0.7642151481888035, + "grad_norm": 1.2991586923599243, + "learning_rate": 2.7818808338043643e-05, + "loss": 0.1796, + "step": 13924 + }, + { + "epoch": 0.7643249176728869, + "grad_norm": 1.4903695583343506, + "learning_rate": 2.7813605677750297e-05, + "loss": 0.267, + "step": 13926 + }, + { + "epoch": 0.7644346871569704, + "grad_norm": 1.400131106376648, + "learning_rate": 2.780840289403839e-05, + "loss": 0.2231, + "step": 13928 + }, + { + "epoch": 0.7645444566410537, + "grad_norm": 2.297232151031494, + "learning_rate": 2.7803199987136153e-05, + "loss": 0.1659, + "step": 13930 + }, + { + "epoch": 0.7646542261251372, + "grad_norm": 1.4084457159042358, + "learning_rate": 2.7797996957271805e-05, + "loss": 0.2633, + "step": 13932 + }, + { + "epoch": 0.7647639956092206, + "grad_norm": 1.1241812705993652, + "learning_rate": 2.779279380467357e-05, + "loss": 0.1704, + "step": 13934 + }, + { + "epoch": 0.7648737650933041, + "grad_norm": 2.6642065048217773, + "learning_rate": 2.7787590529569695e-05, + "loss": 0.2891, + "step": 13936 + }, + { + "epoch": 0.7649835345773874, + "grad_norm": 1.0729328393936157, + "learning_rate": 2.778238713218842e-05, + "loss": 0.3367, + "step": 13938 + }, + { + "epoch": 0.7650933040614709, + "grad_norm": 1.414785385131836, + "learning_rate": 2.7777183612757985e-05, + "loss": 0.3556, + "step": 13940 + }, + { + "epoch": 0.7652030735455544, + "grad_norm": 1.3221039772033691, + "learning_rate": 2.7771979971506645e-05, + "loss": 0.2494, + "step": 13942 + }, + { + "epoch": 0.7653128430296378, + "grad_norm": 1.1593077182769775, + "learning_rate": 2.7766776208662664e-05, + "loss": 0.2187, + "step": 13944 + }, + { + "epoch": 0.7654226125137212, + "grad_norm": 1.3767720460891724, + "learning_rate": 2.776157232445429e-05, + "loss": 0.2009, + "step": 13946 + }, + { + "epoch": 0.7655323819978046, + "grad_norm": 1.4691975116729736, + "learning_rate": 2.7756368319109803e-05, + "loss": 0.3081, + "step": 13948 + }, + { + "epoch": 0.7656421514818881, + "grad_norm": 2.4390571117401123, + "learning_rate": 2.7751164192857477e-05, + "loss": 0.3333, + "step": 13950 + }, + { + "epoch": 0.7657519209659714, + "grad_norm": 1.4803508520126343, + "learning_rate": 2.774595994592558e-05, + "loss": 0.3104, + "step": 13952 + }, + { + "epoch": 0.7658616904500549, + "grad_norm": 0.9701854586601257, + "learning_rate": 2.77407555785424e-05, + "loss": 0.1682, + "step": 13954 + }, + { + "epoch": 0.7659714599341383, + "grad_norm": 1.0044758319854736, + "learning_rate": 2.7735551090936236e-05, + "loss": 0.1674, + "step": 13956 + }, + { + "epoch": 0.7660812294182218, + "grad_norm": 0.9774039387702942, + "learning_rate": 2.7730346483335374e-05, + "loss": 0.2751, + "step": 13958 + }, + { + "epoch": 0.7661909989023051, + "grad_norm": 1.4248192310333252, + "learning_rate": 2.7725141755968105e-05, + "loss": 0.2516, + "step": 13960 + }, + { + "epoch": 0.7663007683863886, + "grad_norm": 1.1478145122528076, + "learning_rate": 2.771993690906275e-05, + "loss": 0.1727, + "step": 13962 + }, + { + "epoch": 0.766410537870472, + "grad_norm": 0.7641187906265259, + "learning_rate": 2.771473194284761e-05, + "loss": 0.2309, + "step": 13964 + }, + { + "epoch": 0.7665203073545555, + "grad_norm": 1.656551718711853, + "learning_rate": 2.7709526857551e-05, + "loss": 0.3169, + "step": 13966 + }, + { + "epoch": 0.7666300768386388, + "grad_norm": 1.4718711376190186, + "learning_rate": 2.7704321653401245e-05, + "loss": 0.2582, + "step": 13968 + }, + { + "epoch": 0.7667398463227223, + "grad_norm": 1.341835379600525, + "learning_rate": 2.7699116330626667e-05, + "loss": 0.2208, + "step": 13970 + }, + { + "epoch": 0.7668496158068057, + "grad_norm": 1.6441620588302612, + "learning_rate": 2.76939108894556e-05, + "loss": 0.3748, + "step": 13972 + }, + { + "epoch": 0.7669593852908891, + "grad_norm": 1.8820406198501587, + "learning_rate": 2.7688705330116378e-05, + "loss": 0.2855, + "step": 13974 + }, + { + "epoch": 0.7670691547749725, + "grad_norm": 1.8496692180633545, + "learning_rate": 2.7683499652837346e-05, + "loss": 0.3522, + "step": 13976 + }, + { + "epoch": 0.767178924259056, + "grad_norm": 1.0520350933074951, + "learning_rate": 2.7678293857846844e-05, + "loss": 0.195, + "step": 13978 + }, + { + "epoch": 0.7672886937431395, + "grad_norm": 1.7147655487060547, + "learning_rate": 2.7673087945373234e-05, + "loss": 0.2679, + "step": 13980 + }, + { + "epoch": 0.7673984632272228, + "grad_norm": 1.820926308631897, + "learning_rate": 2.766788191564486e-05, + "loss": 0.2375, + "step": 13982 + }, + { + "epoch": 0.7675082327113063, + "grad_norm": 1.8987607955932617, + "learning_rate": 2.7662675768890086e-05, + "loss": 0.2759, + "step": 13984 + }, + { + "epoch": 0.7676180021953897, + "grad_norm": 1.7016897201538086, + "learning_rate": 2.765746950533729e-05, + "loss": 0.2897, + "step": 13986 + }, + { + "epoch": 0.7677277716794731, + "grad_norm": 1.1201575994491577, + "learning_rate": 2.7652263125214846e-05, + "loss": 0.2798, + "step": 13988 + }, + { + "epoch": 0.7678375411635565, + "grad_norm": 2.1096854209899902, + "learning_rate": 2.764705662875111e-05, + "loss": 0.2179, + "step": 13990 + }, + { + "epoch": 0.76794731064764, + "grad_norm": 1.9889063835144043, + "learning_rate": 2.7641850016174487e-05, + "loss": 0.2846, + "step": 13992 + }, + { + "epoch": 0.7680570801317234, + "grad_norm": 1.2450770139694214, + "learning_rate": 2.7636643287713355e-05, + "loss": 0.2284, + "step": 13994 + }, + { + "epoch": 0.7681668496158068, + "grad_norm": 1.2381197214126587, + "learning_rate": 2.7631436443596097e-05, + "loss": 0.1836, + "step": 13996 + }, + { + "epoch": 0.7682766190998902, + "grad_norm": 1.5377812385559082, + "learning_rate": 2.7626229484051126e-05, + "loss": 0.2165, + "step": 13998 + }, + { + "epoch": 0.7683863885839737, + "grad_norm": 1.1368091106414795, + "learning_rate": 2.762102240930684e-05, + "loss": 0.2369, + "step": 14000 + }, + { + "epoch": 0.768496158068057, + "grad_norm": 1.3726904392242432, + "learning_rate": 2.7615815219591647e-05, + "loss": 0.3001, + "step": 14002 + }, + { + "epoch": 0.7686059275521405, + "grad_norm": 2.322359561920166, + "learning_rate": 2.7610607915133958e-05, + "loss": 0.2653, + "step": 14004 + }, + { + "epoch": 0.7687156970362239, + "grad_norm": 1.4413422346115112, + "learning_rate": 2.76054004961622e-05, + "loss": 0.2999, + "step": 14006 + }, + { + "epoch": 0.7688254665203074, + "grad_norm": 1.1952804327011108, + "learning_rate": 2.7600192962904776e-05, + "loss": 0.2997, + "step": 14008 + }, + { + "epoch": 0.7689352360043907, + "grad_norm": 1.8722511529922485, + "learning_rate": 2.7594985315590132e-05, + "loss": 0.2834, + "step": 14010 + }, + { + "epoch": 0.7690450054884742, + "grad_norm": 2.235551357269287, + "learning_rate": 2.7589777554446698e-05, + "loss": 0.1928, + "step": 14012 + }, + { + "epoch": 0.7691547749725576, + "grad_norm": 1.6050227880477905, + "learning_rate": 2.75845696797029e-05, + "loss": 0.3715, + "step": 14014 + }, + { + "epoch": 0.769264544456641, + "grad_norm": 1.8302639722824097, + "learning_rate": 2.7579361691587198e-05, + "loss": 0.4762, + "step": 14016 + }, + { + "epoch": 0.7693743139407245, + "grad_norm": 1.7255797386169434, + "learning_rate": 2.7574153590328034e-05, + "loss": 0.3264, + "step": 14018 + }, + { + "epoch": 0.7694840834248079, + "grad_norm": 1.9859576225280762, + "learning_rate": 2.756894537615385e-05, + "loss": 0.2224, + "step": 14020 + }, + { + "epoch": 0.7695938529088914, + "grad_norm": 1.8131375312805176, + "learning_rate": 2.756373704929312e-05, + "loss": 0.274, + "step": 14022 + }, + { + "epoch": 0.7697036223929747, + "grad_norm": 1.332290530204773, + "learning_rate": 2.7558528609974298e-05, + "loss": 0.2686, + "step": 14024 + }, + { + "epoch": 0.7698133918770582, + "grad_norm": 1.5979746580123901, + "learning_rate": 2.7553320058425846e-05, + "loss": 0.1901, + "step": 14026 + }, + { + "epoch": 0.7699231613611416, + "grad_norm": 1.27544105052948, + "learning_rate": 2.754811139487625e-05, + "loss": 0.303, + "step": 14028 + }, + { + "epoch": 0.7700329308452251, + "grad_norm": 1.5172264575958252, + "learning_rate": 2.7542902619553985e-05, + "loss": 0.181, + "step": 14030 + }, + { + "epoch": 0.7701427003293084, + "grad_norm": 1.2433031797409058, + "learning_rate": 2.7537693732687524e-05, + "loss": 0.2539, + "step": 14032 + }, + { + "epoch": 0.7702524698133919, + "grad_norm": 1.6894394159317017, + "learning_rate": 2.7532484734505366e-05, + "loss": 0.2503, + "step": 14034 + }, + { + "epoch": 0.7703622392974753, + "grad_norm": 1.2226167917251587, + "learning_rate": 2.7527275625235993e-05, + "loss": 0.263, + "step": 14036 + }, + { + "epoch": 0.7704720087815587, + "grad_norm": 1.2704449892044067, + "learning_rate": 2.7522066405107906e-05, + "loss": 0.2362, + "step": 14038 + }, + { + "epoch": 0.7705817782656421, + "grad_norm": 1.7845553159713745, + "learning_rate": 2.751685707434961e-05, + "loss": 0.1798, + "step": 14040 + }, + { + "epoch": 0.7706915477497256, + "grad_norm": 1.3027697801589966, + "learning_rate": 2.7511647633189608e-05, + "loss": 0.2681, + "step": 14042 + }, + { + "epoch": 0.770801317233809, + "grad_norm": 1.3143397569656372, + "learning_rate": 2.7506438081856412e-05, + "loss": 0.2918, + "step": 14044 + }, + { + "epoch": 0.7709110867178924, + "grad_norm": 1.8448290824890137, + "learning_rate": 2.7501228420578533e-05, + "loss": 0.2203, + "step": 14046 + }, + { + "epoch": 0.7710208562019758, + "grad_norm": 1.2053790092468262, + "learning_rate": 2.749601864958451e-05, + "loss": 0.2313, + "step": 14048 + }, + { + "epoch": 0.7711306256860593, + "grad_norm": 1.0517218112945557, + "learning_rate": 2.7490808769102856e-05, + "loss": 0.2124, + "step": 14050 + }, + { + "epoch": 0.7712403951701426, + "grad_norm": 3.110602617263794, + "learning_rate": 2.7485598779362097e-05, + "loss": 0.2619, + "step": 14052 + }, + { + "epoch": 0.7713501646542261, + "grad_norm": 1.6064751148223877, + "learning_rate": 2.7480388680590786e-05, + "loss": 0.2739, + "step": 14054 + }, + { + "epoch": 0.7714599341383096, + "grad_norm": 1.7197264432907104, + "learning_rate": 2.7475178473017442e-05, + "loss": 0.3863, + "step": 14056 + }, + { + "epoch": 0.771569703622393, + "grad_norm": 2.095150947570801, + "learning_rate": 2.7469968156870622e-05, + "loss": 0.2166, + "step": 14058 + }, + { + "epoch": 0.7716794731064764, + "grad_norm": 3.125762462615967, + "learning_rate": 2.7464757732378883e-05, + "loss": 0.1919, + "step": 14060 + }, + { + "epoch": 0.7717892425905598, + "grad_norm": 1.2951266765594482, + "learning_rate": 2.7459547199770775e-05, + "loss": 0.2664, + "step": 14062 + }, + { + "epoch": 0.7718990120746433, + "grad_norm": 1.0789765119552612, + "learning_rate": 2.745433655927484e-05, + "loss": 0.1547, + "step": 14064 + }, + { + "epoch": 0.7720087815587267, + "grad_norm": 0.9550390243530273, + "learning_rate": 2.7449125811119668e-05, + "loss": 0.1714, + "step": 14066 + }, + { + "epoch": 0.7721185510428101, + "grad_norm": 2.0796022415161133, + "learning_rate": 2.7443914955533817e-05, + "loss": 0.2299, + "step": 14068 + }, + { + "epoch": 0.7722283205268935, + "grad_norm": 1.2341492176055908, + "learning_rate": 2.7438703992745857e-05, + "loss": 0.2763, + "step": 14070 + }, + { + "epoch": 0.772338090010977, + "grad_norm": 1.1409393548965454, + "learning_rate": 2.743349292298437e-05, + "loss": 0.2542, + "step": 14072 + }, + { + "epoch": 0.7724478594950603, + "grad_norm": 1.7480597496032715, + "learning_rate": 2.742828174647794e-05, + "loss": 0.2123, + "step": 14074 + }, + { + "epoch": 0.7725576289791438, + "grad_norm": 1.186822533607483, + "learning_rate": 2.7423070463455147e-05, + "loss": 0.2412, + "step": 14076 + }, + { + "epoch": 0.7726673984632272, + "grad_norm": 0.7063749432563782, + "learning_rate": 2.7417859074144604e-05, + "loss": 0.1751, + "step": 14078 + }, + { + "epoch": 0.7727771679473107, + "grad_norm": 1.060444951057434, + "learning_rate": 2.7412647578774886e-05, + "loss": 0.1999, + "step": 14080 + }, + { + "epoch": 0.772886937431394, + "grad_norm": 1.4792687892913818, + "learning_rate": 2.7407435977574602e-05, + "loss": 0.2185, + "step": 14082 + }, + { + "epoch": 0.7729967069154775, + "grad_norm": 1.5911002159118652, + "learning_rate": 2.740222427077237e-05, + "loss": 0.3737, + "step": 14084 + }, + { + "epoch": 0.7731064763995609, + "grad_norm": 1.6349247694015503, + "learning_rate": 2.7397012458596782e-05, + "loss": 0.2622, + "step": 14086 + }, + { + "epoch": 0.7732162458836443, + "grad_norm": 0.9789019227027893, + "learning_rate": 2.7391800541276463e-05, + "loss": 0.2299, + "step": 14088 + }, + { + "epoch": 0.7733260153677278, + "grad_norm": 1.474560022354126, + "learning_rate": 2.7386588519040028e-05, + "loss": 0.1992, + "step": 14090 + }, + { + "epoch": 0.7734357848518112, + "grad_norm": 0.8356228470802307, + "learning_rate": 2.7381376392116115e-05, + "loss": 0.2111, + "step": 14092 + }, + { + "epoch": 0.7735455543358947, + "grad_norm": 1.034188985824585, + "learning_rate": 2.7376164160733337e-05, + "loss": 0.2158, + "step": 14094 + }, + { + "epoch": 0.773655323819978, + "grad_norm": 1.3502706289291382, + "learning_rate": 2.7370951825120346e-05, + "loss": 0.1735, + "step": 14096 + }, + { + "epoch": 0.7737650933040615, + "grad_norm": 1.1367467641830444, + "learning_rate": 2.7365739385505766e-05, + "loss": 0.2277, + "step": 14098 + }, + { + "epoch": 0.7738748627881449, + "grad_norm": 1.6527841091156006, + "learning_rate": 2.736052684211824e-05, + "loss": 0.3091, + "step": 14100 + }, + { + "epoch": 0.7739846322722284, + "grad_norm": 1.1475690603256226, + "learning_rate": 2.7355314195186427e-05, + "loss": 0.1942, + "step": 14102 + }, + { + "epoch": 0.7740944017563117, + "grad_norm": 1.9265788793563843, + "learning_rate": 2.735010144493897e-05, + "loss": 0.248, + "step": 14104 + }, + { + "epoch": 0.7742041712403952, + "grad_norm": 1.5165386199951172, + "learning_rate": 2.7344888591604524e-05, + "loss": 0.305, + "step": 14106 + }, + { + "epoch": 0.7743139407244786, + "grad_norm": 1.0509628057479858, + "learning_rate": 2.733967563541176e-05, + "loss": 0.158, + "step": 14108 + }, + { + "epoch": 0.774423710208562, + "grad_norm": 1.1693922281265259, + "learning_rate": 2.7334462576589344e-05, + "loss": 0.1934, + "step": 14110 + }, + { + "epoch": 0.7745334796926454, + "grad_norm": 1.5803812742233276, + "learning_rate": 2.7329249415365933e-05, + "loss": 0.3115, + "step": 14112 + }, + { + "epoch": 0.7746432491767289, + "grad_norm": 1.2832075357437134, + "learning_rate": 2.7324036151970213e-05, + "loss": 0.1824, + "step": 14114 + }, + { + "epoch": 0.7747530186608123, + "grad_norm": 1.3250555992126465, + "learning_rate": 2.7318822786630865e-05, + "loss": 0.2866, + "step": 14116 + }, + { + "epoch": 0.7748627881448957, + "grad_norm": 1.011643648147583, + "learning_rate": 2.731360931957656e-05, + "loss": 0.1455, + "step": 14118 + }, + { + "epoch": 0.7749725576289791, + "grad_norm": 1.2583036422729492, + "learning_rate": 2.7308395751035993e-05, + "loss": 0.1757, + "step": 14120 + }, + { + "epoch": 0.7750823271130626, + "grad_norm": 2.067007303237915, + "learning_rate": 2.7303182081237867e-05, + "loss": 0.1891, + "step": 14122 + }, + { + "epoch": 0.7751920965971459, + "grad_norm": 1.3102712631225586, + "learning_rate": 2.729796831041086e-05, + "loss": 0.2758, + "step": 14124 + }, + { + "epoch": 0.7753018660812294, + "grad_norm": 1.9672460556030273, + "learning_rate": 2.7292754438783695e-05, + "loss": 0.3342, + "step": 14126 + }, + { + "epoch": 0.7754116355653129, + "grad_norm": 1.292393684387207, + "learning_rate": 2.7287540466585065e-05, + "loss": 0.3575, + "step": 14128 + }, + { + "epoch": 0.7755214050493963, + "grad_norm": 1.1637625694274902, + "learning_rate": 2.7282326394043677e-05, + "loss": 0.2388, + "step": 14130 + }, + { + "epoch": 0.7756311745334797, + "grad_norm": 1.719852328300476, + "learning_rate": 2.7277112221388252e-05, + "loss": 0.2087, + "step": 14132 + }, + { + "epoch": 0.7757409440175631, + "grad_norm": 2.5163066387176514, + "learning_rate": 2.7271897948847508e-05, + "loss": 0.2095, + "step": 14134 + }, + { + "epoch": 0.7758507135016466, + "grad_norm": 3.320500373840332, + "learning_rate": 2.726668357665017e-05, + "loss": 0.2515, + "step": 14136 + }, + { + "epoch": 0.77596048298573, + "grad_norm": 1.570421814918518, + "learning_rate": 2.7261469105024962e-05, + "loss": 0.281, + "step": 14138 + }, + { + "epoch": 0.7760702524698134, + "grad_norm": 1.2118357419967651, + "learning_rate": 2.7256254534200626e-05, + "loss": 0.3674, + "step": 14140 + }, + { + "epoch": 0.7761800219538968, + "grad_norm": 1.2100929021835327, + "learning_rate": 2.725103986440589e-05, + "loss": 0.2386, + "step": 14142 + }, + { + "epoch": 0.7762897914379803, + "grad_norm": 1.8042720556259155, + "learning_rate": 2.7245825095869494e-05, + "loss": 0.2334, + "step": 14144 + }, + { + "epoch": 0.7763995609220636, + "grad_norm": 1.9721908569335938, + "learning_rate": 2.724061022882019e-05, + "loss": 0.2728, + "step": 14146 + }, + { + "epoch": 0.7765093304061471, + "grad_norm": 1.0993268489837646, + "learning_rate": 2.723539526348671e-05, + "loss": 0.2987, + "step": 14148 + }, + { + "epoch": 0.7766190998902305, + "grad_norm": 1.5155904293060303, + "learning_rate": 2.7230180200097833e-05, + "loss": 0.2232, + "step": 14150 + }, + { + "epoch": 0.776728869374314, + "grad_norm": 2.2346343994140625, + "learning_rate": 2.722496503888231e-05, + "loss": 0.341, + "step": 14152 + }, + { + "epoch": 0.7768386388583973, + "grad_norm": 1.5106871128082275, + "learning_rate": 2.7219749780068898e-05, + "loss": 0.2497, + "step": 14154 + }, + { + "epoch": 0.7769484083424808, + "grad_norm": 4.591533660888672, + "learning_rate": 2.7214534423886357e-05, + "loss": 0.5033, + "step": 14156 + }, + { + "epoch": 0.7770581778265642, + "grad_norm": 2.0430831909179688, + "learning_rate": 2.7209318970563473e-05, + "loss": 0.2452, + "step": 14158 + }, + { + "epoch": 0.7771679473106476, + "grad_norm": 1.3598647117614746, + "learning_rate": 2.720410342032902e-05, + "loss": 0.2573, + "step": 14160 + }, + { + "epoch": 0.777277716794731, + "grad_norm": 1.018877625465393, + "learning_rate": 2.7198887773411763e-05, + "loss": 0.2226, + "step": 14162 + }, + { + "epoch": 0.7773874862788145, + "grad_norm": 1.5879998207092285, + "learning_rate": 2.7193672030040495e-05, + "loss": 0.2332, + "step": 14164 + }, + { + "epoch": 0.777497255762898, + "grad_norm": 1.0059113502502441, + "learning_rate": 2.718845619044401e-05, + "loss": 0.1808, + "step": 14166 + }, + { + "epoch": 0.7776070252469813, + "grad_norm": 1.2403621673583984, + "learning_rate": 2.7183240254851096e-05, + "loss": 0.2038, + "step": 14168 + }, + { + "epoch": 0.7777167947310648, + "grad_norm": 1.3229084014892578, + "learning_rate": 2.7178024223490543e-05, + "loss": 0.3094, + "step": 14170 + }, + { + "epoch": 0.7778265642151482, + "grad_norm": 1.4304112195968628, + "learning_rate": 2.7172808096591162e-05, + "loss": 0.2943, + "step": 14172 + }, + { + "epoch": 0.7779363336992317, + "grad_norm": 1.545556664466858, + "learning_rate": 2.716759187438175e-05, + "loss": 0.1874, + "step": 14174 + }, + { + "epoch": 0.778046103183315, + "grad_norm": 1.1552345752716064, + "learning_rate": 2.7162375557091124e-05, + "loss": 0.2784, + "step": 14176 + }, + { + "epoch": 0.7781558726673985, + "grad_norm": 2.3808810710906982, + "learning_rate": 2.7157159144948092e-05, + "loss": 0.3186, + "step": 14178 + }, + { + "epoch": 0.7782656421514819, + "grad_norm": 1.7524853944778442, + "learning_rate": 2.715194263818146e-05, + "loss": 0.3201, + "step": 14180 + }, + { + "epoch": 0.7783754116355653, + "grad_norm": 2.2702653408050537, + "learning_rate": 2.714672603702007e-05, + "loss": 0.28, + "step": 14182 + }, + { + "epoch": 0.7784851811196487, + "grad_norm": 1.0036247968673706, + "learning_rate": 2.7141509341692744e-05, + "loss": 0.2018, + "step": 14184 + }, + { + "epoch": 0.7785949506037322, + "grad_norm": 1.729594111442566, + "learning_rate": 2.71362925524283e-05, + "loss": 0.2281, + "step": 14186 + }, + { + "epoch": 0.7787047200878155, + "grad_norm": 1.442559003829956, + "learning_rate": 2.7131075669455587e-05, + "loss": 0.3183, + "step": 14188 + }, + { + "epoch": 0.778814489571899, + "grad_norm": 0.9761623740196228, + "learning_rate": 2.712585869300343e-05, + "loss": 0.2126, + "step": 14190 + }, + { + "epoch": 0.7789242590559824, + "grad_norm": 1.1800254583358765, + "learning_rate": 2.7120641623300675e-05, + "loss": 0.2758, + "step": 14192 + }, + { + "epoch": 0.7790340285400659, + "grad_norm": 2.262655019760132, + "learning_rate": 2.711542446057617e-05, + "loss": 0.327, + "step": 14194 + }, + { + "epoch": 0.7791437980241492, + "grad_norm": 1.3424994945526123, + "learning_rate": 2.7110207205058768e-05, + "loss": 0.2899, + "step": 14196 + }, + { + "epoch": 0.7792535675082327, + "grad_norm": 1.0982543230056763, + "learning_rate": 2.7104989856977313e-05, + "loss": 0.2272, + "step": 14198 + }, + { + "epoch": 0.7793633369923162, + "grad_norm": 1.6603583097457886, + "learning_rate": 2.709977241656068e-05, + "loss": 0.2593, + "step": 14200 + }, + { + "epoch": 0.7794731064763996, + "grad_norm": 1.1027826070785522, + "learning_rate": 2.709455488403772e-05, + "loss": 0.1919, + "step": 14202 + }, + { + "epoch": 0.779582875960483, + "grad_norm": 0.9968022704124451, + "learning_rate": 2.7089337259637306e-05, + "loss": 0.2168, + "step": 14204 + }, + { + "epoch": 0.7796926454445664, + "grad_norm": 2.4046926498413086, + "learning_rate": 2.70841195435883e-05, + "loss": 0.2697, + "step": 14206 + }, + { + "epoch": 0.7798024149286499, + "grad_norm": 1.6027213335037231, + "learning_rate": 2.7078901736119582e-05, + "loss": 0.2501, + "step": 14208 + }, + { + "epoch": 0.7799121844127332, + "grad_norm": 1.7125487327575684, + "learning_rate": 2.707368383746003e-05, + "loss": 0.3351, + "step": 14210 + }, + { + "epoch": 0.7800219538968167, + "grad_norm": 1.3545554876327515, + "learning_rate": 2.706846584783852e-05, + "loss": 0.2251, + "step": 14212 + }, + { + "epoch": 0.7801317233809001, + "grad_norm": 0.796027660369873, + "learning_rate": 2.706324776748395e-05, + "loss": 0.1632, + "step": 14214 + }, + { + "epoch": 0.7802414928649836, + "grad_norm": 1.6314940452575684, + "learning_rate": 2.7058029596625207e-05, + "loss": 0.3297, + "step": 14216 + }, + { + "epoch": 0.7803512623490669, + "grad_norm": 1.3010486364364624, + "learning_rate": 2.705281133549119e-05, + "loss": 0.2315, + "step": 14218 + }, + { + "epoch": 0.7804610318331504, + "grad_norm": 1.1517696380615234, + "learning_rate": 2.7047592984310787e-05, + "loss": 0.4673, + "step": 14220 + }, + { + "epoch": 0.7805708013172338, + "grad_norm": 1.282725214958191, + "learning_rate": 2.7042374543312905e-05, + "loss": 0.3194, + "step": 14222 + }, + { + "epoch": 0.7806805708013173, + "grad_norm": 1.9738327264785767, + "learning_rate": 2.703715601272645e-05, + "loss": 0.355, + "step": 14224 + }, + { + "epoch": 0.7807903402854006, + "grad_norm": 1.3003205060958862, + "learning_rate": 2.7031937392780334e-05, + "loss": 0.2312, + "step": 14226 + }, + { + "epoch": 0.7809001097694841, + "grad_norm": 1.9741495847702026, + "learning_rate": 2.7026718683703473e-05, + "loss": 0.2987, + "step": 14228 + }, + { + "epoch": 0.7810098792535675, + "grad_norm": 1.7143173217773438, + "learning_rate": 2.7021499885724778e-05, + "loss": 0.3578, + "step": 14230 + }, + { + "epoch": 0.7811196487376509, + "grad_norm": 1.919749140739441, + "learning_rate": 2.7016280999073186e-05, + "loss": 0.2954, + "step": 14232 + }, + { + "epoch": 0.7812294182217343, + "grad_norm": 1.6653814315795898, + "learning_rate": 2.70110620239776e-05, + "loss": 0.344, + "step": 14234 + }, + { + "epoch": 0.7813391877058178, + "grad_norm": 2.02841854095459, + "learning_rate": 2.700584296066697e-05, + "loss": 0.2471, + "step": 14236 + }, + { + "epoch": 0.7814489571899013, + "grad_norm": 1.6079516410827637, + "learning_rate": 2.7000623809370223e-05, + "loss": 0.2509, + "step": 14238 + }, + { + "epoch": 0.7815587266739846, + "grad_norm": 1.092958927154541, + "learning_rate": 2.699540457031629e-05, + "loss": 0.2183, + "step": 14240 + }, + { + "epoch": 0.7816684961580681, + "grad_norm": 1.2495930194854736, + "learning_rate": 2.6990185243734124e-05, + "loss": 0.1885, + "step": 14242 + }, + { + "epoch": 0.7817782656421515, + "grad_norm": 1.5595107078552246, + "learning_rate": 2.6984965829852667e-05, + "loss": 0.35, + "step": 14244 + }, + { + "epoch": 0.781888035126235, + "grad_norm": 1.5415140390396118, + "learning_rate": 2.697974632890086e-05, + "loss": 0.27, + "step": 14246 + }, + { + "epoch": 0.7819978046103183, + "grad_norm": 1.081984281539917, + "learning_rate": 2.6974526741107664e-05, + "loss": 0.1374, + "step": 14248 + }, + { + "epoch": 0.7821075740944018, + "grad_norm": 1.7360132932662964, + "learning_rate": 2.6969307066702037e-05, + "loss": 0.3185, + "step": 14250 + }, + { + "epoch": 0.7822173435784852, + "grad_norm": 1.454219937324524, + "learning_rate": 2.6964087305912928e-05, + "loss": 0.2198, + "step": 14252 + }, + { + "epoch": 0.7823271130625686, + "grad_norm": 1.34281587600708, + "learning_rate": 2.6958867458969316e-05, + "loss": 0.3346, + "step": 14254 + }, + { + "epoch": 0.782436882546652, + "grad_norm": 1.5844340324401855, + "learning_rate": 2.695364752610016e-05, + "loss": 0.3989, + "step": 14256 + }, + { + "epoch": 0.7825466520307355, + "grad_norm": 1.2297258377075195, + "learning_rate": 2.6948427507534436e-05, + "loss": 0.1455, + "step": 14258 + }, + { + "epoch": 0.7826564215148188, + "grad_norm": 1.2978510856628418, + "learning_rate": 2.6943207403501115e-05, + "loss": 0.2529, + "step": 14260 + }, + { + "epoch": 0.7827661909989023, + "grad_norm": 1.3813682794570923, + "learning_rate": 2.6937987214229187e-05, + "loss": 0.2757, + "step": 14262 + }, + { + "epoch": 0.7828759604829857, + "grad_norm": 1.5924286842346191, + "learning_rate": 2.6932766939947624e-05, + "loss": 0.2371, + "step": 14264 + }, + { + "epoch": 0.7829857299670692, + "grad_norm": 3.8757920265197754, + "learning_rate": 2.692754658088541e-05, + "loss": 0.3079, + "step": 14266 + }, + { + "epoch": 0.7830954994511525, + "grad_norm": 1.1818500757217407, + "learning_rate": 2.6922326137271555e-05, + "loss": 0.2731, + "step": 14268 + }, + { + "epoch": 0.783205268935236, + "grad_norm": 1.0606904029846191, + "learning_rate": 2.6917105609335025e-05, + "loss": 0.3061, + "step": 14270 + }, + { + "epoch": 0.7833150384193194, + "grad_norm": 0.7891285419464111, + "learning_rate": 2.6911884997304837e-05, + "loss": 0.1851, + "step": 14272 + }, + { + "epoch": 0.7834248079034029, + "grad_norm": 1.5676532983779907, + "learning_rate": 2.6906664301409996e-05, + "loss": 0.2214, + "step": 14274 + }, + { + "epoch": 0.7835345773874863, + "grad_norm": 0.959446370601654, + "learning_rate": 2.69014435218795e-05, + "loss": 0.2558, + "step": 14276 + }, + { + "epoch": 0.7836443468715697, + "grad_norm": 1.5297662019729614, + "learning_rate": 2.6896222658942348e-05, + "loss": 0.3068, + "step": 14278 + }, + { + "epoch": 0.7837541163556532, + "grad_norm": 0.7847957611083984, + "learning_rate": 2.689100171282758e-05, + "loss": 0.229, + "step": 14280 + }, + { + "epoch": 0.7838638858397365, + "grad_norm": 1.599617838859558, + "learning_rate": 2.6885780683764183e-05, + "loss": 0.1781, + "step": 14282 + }, + { + "epoch": 0.78397365532382, + "grad_norm": 4.23007869720459, + "learning_rate": 2.688055957198119e-05, + "loss": 0.2722, + "step": 14284 + }, + { + "epoch": 0.7840834248079034, + "grad_norm": 1.5694303512573242, + "learning_rate": 2.687533837770762e-05, + "loss": 0.2963, + "step": 14286 + }, + { + "epoch": 0.7841931942919869, + "grad_norm": 1.3215898275375366, + "learning_rate": 2.6870117101172514e-05, + "loss": 0.2123, + "step": 14288 + }, + { + "epoch": 0.7843029637760702, + "grad_norm": 0.9841314554214478, + "learning_rate": 2.6864895742604885e-05, + "loss": 0.1739, + "step": 14290 + }, + { + "epoch": 0.7844127332601537, + "grad_norm": 1.3842387199401855, + "learning_rate": 2.6859674302233785e-05, + "loss": 0.2287, + "step": 14292 + }, + { + "epoch": 0.7845225027442371, + "grad_norm": 1.134946584701538, + "learning_rate": 2.6854452780288236e-05, + "loss": 0.2476, + "step": 14294 + }, + { + "epoch": 0.7846322722283205, + "grad_norm": 2.017843246459961, + "learning_rate": 2.684923117699728e-05, + "loss": 0.2402, + "step": 14296 + }, + { + "epoch": 0.7847420417124039, + "grad_norm": 0.8831682801246643, + "learning_rate": 2.6844009492589978e-05, + "loss": 0.2055, + "step": 14298 + }, + { + "epoch": 0.7848518111964874, + "grad_norm": 1.3233669996261597, + "learning_rate": 2.6838787727295363e-05, + "loss": 0.2268, + "step": 14300 + }, + { + "epoch": 0.7849615806805708, + "grad_norm": 1.2895196676254272, + "learning_rate": 2.6833565881342492e-05, + "loss": 0.209, + "step": 14302 + }, + { + "epoch": 0.7850713501646542, + "grad_norm": 2.202061176300049, + "learning_rate": 2.6828343954960428e-05, + "loss": 0.1879, + "step": 14304 + }, + { + "epoch": 0.7851811196487376, + "grad_norm": 1.13482666015625, + "learning_rate": 2.682312194837822e-05, + "loss": 0.1965, + "step": 14306 + }, + { + "epoch": 0.7852908891328211, + "grad_norm": 1.9759387969970703, + "learning_rate": 2.6817899861824934e-05, + "loss": 0.3969, + "step": 14308 + }, + { + "epoch": 0.7854006586169044, + "grad_norm": 1.1931531429290771, + "learning_rate": 2.681267769552964e-05, + "loss": 0.2248, + "step": 14310 + }, + { + "epoch": 0.7855104281009879, + "grad_norm": 1.462199330329895, + "learning_rate": 2.6807455449721407e-05, + "loss": 0.3389, + "step": 14312 + }, + { + "epoch": 0.7856201975850714, + "grad_norm": 1.0751475095748901, + "learning_rate": 2.68022331246293e-05, + "loss": 0.2272, + "step": 14314 + }, + { + "epoch": 0.7857299670691548, + "grad_norm": 1.221261739730835, + "learning_rate": 2.67970107204824e-05, + "loss": 0.2188, + "step": 14316 + }, + { + "epoch": 0.7858397365532382, + "grad_norm": 1.7031151056289673, + "learning_rate": 2.67917882375098e-05, + "loss": 0.3219, + "step": 14318 + }, + { + "epoch": 0.7859495060373216, + "grad_norm": 1.1077803373336792, + "learning_rate": 2.6786565675940572e-05, + "loss": 0.3065, + "step": 14320 + }, + { + "epoch": 0.7860592755214051, + "grad_norm": 1.3509413003921509, + "learning_rate": 2.6781343036003798e-05, + "loss": 0.2843, + "step": 14322 + }, + { + "epoch": 0.7861690450054885, + "grad_norm": 2.5435454845428467, + "learning_rate": 2.6776120317928576e-05, + "loss": 0.4373, + "step": 14324 + }, + { + "epoch": 0.7862788144895719, + "grad_norm": 2.25249981880188, + "learning_rate": 2.6770897521944e-05, + "loss": 0.3409, + "step": 14326 + }, + { + "epoch": 0.7863885839736553, + "grad_norm": 2.098515510559082, + "learning_rate": 2.6765674648279172e-05, + "loss": 0.1809, + "step": 14328 + }, + { + "epoch": 0.7864983534577388, + "grad_norm": 1.2234762907028198, + "learning_rate": 2.6760451697163185e-05, + "loss": 0.2507, + "step": 14330 + }, + { + "epoch": 0.7866081229418221, + "grad_norm": 1.258909821510315, + "learning_rate": 2.6755228668825137e-05, + "loss": 0.3556, + "step": 14332 + }, + { + "epoch": 0.7867178924259056, + "grad_norm": 1.1934912204742432, + "learning_rate": 2.675000556349415e-05, + "loss": 0.1835, + "step": 14334 + }, + { + "epoch": 0.786827661909989, + "grad_norm": 1.1939204931259155, + "learning_rate": 2.6744782381399335e-05, + "loss": 0.3004, + "step": 14336 + }, + { + "epoch": 0.7869374313940725, + "grad_norm": 1.8615950345993042, + "learning_rate": 2.6739559122769797e-05, + "loss": 0.2828, + "step": 14338 + }, + { + "epoch": 0.7870472008781558, + "grad_norm": 1.5713365077972412, + "learning_rate": 2.6734335787834656e-05, + "loss": 0.271, + "step": 14340 + }, + { + "epoch": 0.7871569703622393, + "grad_norm": 1.0186949968338013, + "learning_rate": 2.672911237682304e-05, + "loss": 0.2139, + "step": 14342 + }, + { + "epoch": 0.7872667398463227, + "grad_norm": 1.9714330434799194, + "learning_rate": 2.672388888996406e-05, + "loss": 0.1484, + "step": 14344 + }, + { + "epoch": 0.7873765093304061, + "grad_norm": 2.371453046798706, + "learning_rate": 2.6718665327486854e-05, + "loss": 0.29, + "step": 14346 + }, + { + "epoch": 0.7874862788144896, + "grad_norm": 1.247697353363037, + "learning_rate": 2.6713441689620554e-05, + "loss": 0.2429, + "step": 14348 + }, + { + "epoch": 0.787596048298573, + "grad_norm": 1.4011693000793457, + "learning_rate": 2.6708217976594296e-05, + "loss": 0.2188, + "step": 14350 + }, + { + "epoch": 0.7877058177826565, + "grad_norm": 1.552718997001648, + "learning_rate": 2.6702994188637204e-05, + "loss": 0.2322, + "step": 14352 + }, + { + "epoch": 0.7878155872667398, + "grad_norm": 2.2776169776916504, + "learning_rate": 2.6697770325978433e-05, + "loss": 0.378, + "step": 14354 + }, + { + "epoch": 0.7879253567508233, + "grad_norm": 1.3179155588150024, + "learning_rate": 2.6692546388847122e-05, + "loss": 0.2908, + "step": 14356 + }, + { + "epoch": 0.7880351262349067, + "grad_norm": 1.3510146141052246, + "learning_rate": 2.6687322377472418e-05, + "loss": 0.2115, + "step": 14358 + }, + { + "epoch": 0.7881448957189902, + "grad_norm": 1.035820722579956, + "learning_rate": 2.6682098292083473e-05, + "loss": 0.2324, + "step": 14360 + }, + { + "epoch": 0.7882546652030735, + "grad_norm": 2.1819753646850586, + "learning_rate": 2.6676874132909435e-05, + "loss": 0.3437, + "step": 14362 + }, + { + "epoch": 0.788364434687157, + "grad_norm": 4.23912239074707, + "learning_rate": 2.6671649900179475e-05, + "loss": 0.2509, + "step": 14364 + }, + { + "epoch": 0.7884742041712404, + "grad_norm": 1.4399968385696411, + "learning_rate": 2.666642559412274e-05, + "loss": 0.3119, + "step": 14366 + }, + { + "epoch": 0.7885839736553238, + "grad_norm": 1.4332343339920044, + "learning_rate": 2.6661201214968408e-05, + "loss": 0.2167, + "step": 14368 + }, + { + "epoch": 0.7886937431394072, + "grad_norm": 0.869835376739502, + "learning_rate": 2.665597676294563e-05, + "loss": 0.1778, + "step": 14370 + }, + { + "epoch": 0.7888035126234907, + "grad_norm": 1.1311627626419067, + "learning_rate": 2.6650752238283583e-05, + "loss": 0.2114, + "step": 14372 + }, + { + "epoch": 0.788913282107574, + "grad_norm": 1.597032904624939, + "learning_rate": 2.6645527641211444e-05, + "loss": 0.1873, + "step": 14374 + }, + { + "epoch": 0.7890230515916575, + "grad_norm": 1.0034441947937012, + "learning_rate": 2.6640302971958376e-05, + "loss": 0.1868, + "step": 14376 + }, + { + "epoch": 0.7891328210757409, + "grad_norm": 1.2773240804672241, + "learning_rate": 2.663507823075358e-05, + "loss": 0.2743, + "step": 14378 + }, + { + "epoch": 0.7892425905598244, + "grad_norm": 2.069777250289917, + "learning_rate": 2.6629853417826224e-05, + "loss": 0.3339, + "step": 14380 + }, + { + "epoch": 0.7893523600439077, + "grad_norm": 1.7334522008895874, + "learning_rate": 2.6624628533405493e-05, + "loss": 0.1702, + "step": 14382 + }, + { + "epoch": 0.7894621295279912, + "grad_norm": 1.2458184957504272, + "learning_rate": 2.6619403577720586e-05, + "loss": 0.169, + "step": 14384 + }, + { + "epoch": 0.7895718990120747, + "grad_norm": 1.0590150356292725, + "learning_rate": 2.6614178551000692e-05, + "loss": 0.1788, + "step": 14386 + }, + { + "epoch": 0.7896816684961581, + "grad_norm": 1.630574107170105, + "learning_rate": 2.6608953453474992e-05, + "loss": 0.287, + "step": 14388 + }, + { + "epoch": 0.7897914379802415, + "grad_norm": 1.1187057495117188, + "learning_rate": 2.660372828537271e-05, + "loss": 0.2371, + "step": 14390 + }, + { + "epoch": 0.7899012074643249, + "grad_norm": 2.947481632232666, + "learning_rate": 2.6598503046923024e-05, + "loss": 0.261, + "step": 14392 + }, + { + "epoch": 0.7900109769484084, + "grad_norm": 1.2266697883605957, + "learning_rate": 2.659327773835515e-05, + "loss": 0.1751, + "step": 14394 + }, + { + "epoch": 0.7901207464324917, + "grad_norm": 1.386742353439331, + "learning_rate": 2.6588052359898298e-05, + "loss": 0.2685, + "step": 14396 + }, + { + "epoch": 0.7902305159165752, + "grad_norm": 1.3339734077453613, + "learning_rate": 2.6582826911781677e-05, + "loss": 0.1896, + "step": 14398 + }, + { + "epoch": 0.7903402854006586, + "grad_norm": 1.1222164630889893, + "learning_rate": 2.657760139423449e-05, + "loss": 0.164, + "step": 14400 + }, + { + "epoch": 0.7904500548847421, + "grad_norm": 1.1995681524276733, + "learning_rate": 2.6572375807485973e-05, + "loss": 0.1761, + "step": 14402 + }, + { + "epoch": 0.7905598243688254, + "grad_norm": 1.4365078210830688, + "learning_rate": 2.656715015176533e-05, + "loss": 0.3453, + "step": 14404 + }, + { + "epoch": 0.7906695938529089, + "grad_norm": 1.1377004384994507, + "learning_rate": 2.656192442730179e-05, + "loss": 0.3255, + "step": 14406 + }, + { + "epoch": 0.7907793633369923, + "grad_norm": 1.2909919023513794, + "learning_rate": 2.6556698634324574e-05, + "loss": 0.1936, + "step": 14408 + }, + { + "epoch": 0.7908891328210758, + "grad_norm": 1.252480387687683, + "learning_rate": 2.655147277306292e-05, + "loss": 0.2276, + "step": 14410 + }, + { + "epoch": 0.7909989023051591, + "grad_norm": 2.140299081802368, + "learning_rate": 2.654624684374605e-05, + "loss": 0.328, + "step": 14412 + }, + { + "epoch": 0.7911086717892426, + "grad_norm": 1.0770817995071411, + "learning_rate": 2.654102084660321e-05, + "loss": 0.1651, + "step": 14414 + }, + { + "epoch": 0.791218441273326, + "grad_norm": 1.6889382600784302, + "learning_rate": 2.6535794781863633e-05, + "loss": 0.2715, + "step": 14416 + }, + { + "epoch": 0.7913282107574094, + "grad_norm": 1.7878044843673706, + "learning_rate": 2.653056864975655e-05, + "loss": 0.249, + "step": 14418 + }, + { + "epoch": 0.7914379802414928, + "grad_norm": 1.1426568031311035, + "learning_rate": 2.652534245051122e-05, + "loss": 0.3533, + "step": 14420 + }, + { + "epoch": 0.7915477497255763, + "grad_norm": 1.3084949254989624, + "learning_rate": 2.652011618435688e-05, + "loss": 0.242, + "step": 14422 + }, + { + "epoch": 0.7916575192096598, + "grad_norm": 2.0760507583618164, + "learning_rate": 2.651488985152279e-05, + "loss": 0.2482, + "step": 14424 + }, + { + "epoch": 0.7917672886937431, + "grad_norm": 1.2204025983810425, + "learning_rate": 2.650966345223819e-05, + "loss": 0.2145, + "step": 14426 + }, + { + "epoch": 0.7918770581778266, + "grad_norm": 1.1479597091674805, + "learning_rate": 2.6504436986732338e-05, + "loss": 0.232, + "step": 14428 + }, + { + "epoch": 0.79198682766191, + "grad_norm": 1.6389473676681519, + "learning_rate": 2.6499210455234496e-05, + "loss": 0.1852, + "step": 14430 + }, + { + "epoch": 0.7920965971459935, + "grad_norm": 3.565429449081421, + "learning_rate": 2.649398385797393e-05, + "loss": 0.2976, + "step": 14432 + }, + { + "epoch": 0.7922063666300768, + "grad_norm": 1.06175696849823, + "learning_rate": 2.6488757195179903e-05, + "loss": 0.1997, + "step": 14434 + }, + { + "epoch": 0.7923161361141603, + "grad_norm": 1.125227928161621, + "learning_rate": 2.648353046708167e-05, + "loss": 0.1942, + "step": 14436 + }, + { + "epoch": 0.7924259055982437, + "grad_norm": 2.0055885314941406, + "learning_rate": 2.6478303673908505e-05, + "loss": 0.1753, + "step": 14438 + }, + { + "epoch": 0.7925356750823271, + "grad_norm": 1.2147818803787231, + "learning_rate": 2.6473076815889697e-05, + "loss": 0.154, + "step": 14440 + }, + { + "epoch": 0.7926454445664105, + "grad_norm": 1.4396162033081055, + "learning_rate": 2.6467849893254503e-05, + "loss": 0.2556, + "step": 14442 + }, + { + "epoch": 0.792755214050494, + "grad_norm": 1.8820666074752808, + "learning_rate": 2.646262290623221e-05, + "loss": 0.3125, + "step": 14444 + }, + { + "epoch": 0.7928649835345773, + "grad_norm": 1.292826533317566, + "learning_rate": 2.6457395855052098e-05, + "loss": 0.2041, + "step": 14446 + }, + { + "epoch": 0.7929747530186608, + "grad_norm": 1.5301316976547241, + "learning_rate": 2.645216873994345e-05, + "loss": 0.2651, + "step": 14448 + }, + { + "epoch": 0.7930845225027442, + "grad_norm": 1.8546826839447021, + "learning_rate": 2.644694156113555e-05, + "loss": 0.239, + "step": 14450 + }, + { + "epoch": 0.7931942919868277, + "grad_norm": 1.7569535970687866, + "learning_rate": 2.6441714318857692e-05, + "loss": 0.2543, + "step": 14452 + }, + { + "epoch": 0.793304061470911, + "grad_norm": 1.904520034790039, + "learning_rate": 2.6436487013339174e-05, + "loss": 0.1769, + "step": 14454 + }, + { + "epoch": 0.7934138309549945, + "grad_norm": 1.3171836137771606, + "learning_rate": 2.6431259644809276e-05, + "loss": 0.137, + "step": 14456 + }, + { + "epoch": 0.793523600439078, + "grad_norm": 2.3144876956939697, + "learning_rate": 2.642603221349731e-05, + "loss": 0.2274, + "step": 14458 + }, + { + "epoch": 0.7936333699231614, + "grad_norm": 1.0291965007781982, + "learning_rate": 2.6420804719632575e-05, + "loss": 0.1492, + "step": 14460 + }, + { + "epoch": 0.7937431394072448, + "grad_norm": 1.312058925628662, + "learning_rate": 2.641557716344436e-05, + "loss": 0.2533, + "step": 14462 + }, + { + "epoch": 0.7938529088913282, + "grad_norm": 3.6915011405944824, + "learning_rate": 2.6410349545161993e-05, + "loss": 0.2088, + "step": 14464 + }, + { + "epoch": 0.7939626783754117, + "grad_norm": 1.1735236644744873, + "learning_rate": 2.640512186501477e-05, + "loss": 0.1896, + "step": 14466 + }, + { + "epoch": 0.794072447859495, + "grad_norm": 1.5577110052108765, + "learning_rate": 2.6399894123232e-05, + "loss": 0.2964, + "step": 14468 + }, + { + "epoch": 0.7941822173435785, + "grad_norm": 1.2751851081848145, + "learning_rate": 2.639466632004301e-05, + "loss": 0.2747, + "step": 14470 + }, + { + "epoch": 0.7942919868276619, + "grad_norm": 1.1524196863174438, + "learning_rate": 2.6389438455677108e-05, + "loss": 0.21, + "step": 14472 + }, + { + "epoch": 0.7944017563117454, + "grad_norm": 1.7953330278396606, + "learning_rate": 2.6384210530363613e-05, + "loss": 0.2275, + "step": 14474 + }, + { + "epoch": 0.7945115257958287, + "grad_norm": 1.0005362033843994, + "learning_rate": 2.6378982544331853e-05, + "loss": 0.1783, + "step": 14476 + }, + { + "epoch": 0.7946212952799122, + "grad_norm": 1.3439583778381348, + "learning_rate": 2.637375449781115e-05, + "loss": 0.233, + "step": 14478 + }, + { + "epoch": 0.7947310647639956, + "grad_norm": 1.7409652471542358, + "learning_rate": 2.636852639103083e-05, + "loss": 0.1995, + "step": 14480 + }, + { + "epoch": 0.794840834248079, + "grad_norm": 1.3681279420852661, + "learning_rate": 2.636329822422022e-05, + "loss": 0.2508, + "step": 14482 + }, + { + "epoch": 0.7949506037321624, + "grad_norm": 1.51840078830719, + "learning_rate": 2.6358069997608666e-05, + "loss": 0.1761, + "step": 14484 + }, + { + "epoch": 0.7950603732162459, + "grad_norm": 1.2795215845108032, + "learning_rate": 2.635284171142549e-05, + "loss": 0.2898, + "step": 14486 + }, + { + "epoch": 0.7951701427003293, + "grad_norm": 0.793093740940094, + "learning_rate": 2.6347613365900046e-05, + "loss": 0.1346, + "step": 14488 + }, + { + "epoch": 0.7952799121844127, + "grad_norm": 1.6546238660812378, + "learning_rate": 2.6342384961261662e-05, + "loss": 0.3932, + "step": 14490 + }, + { + "epoch": 0.7953896816684961, + "grad_norm": 1.2267496585845947, + "learning_rate": 2.633715649773968e-05, + "loss": 0.2484, + "step": 14492 + }, + { + "epoch": 0.7954994511525796, + "grad_norm": 1.6984976530075073, + "learning_rate": 2.633192797556346e-05, + "loss": 0.2346, + "step": 14494 + }, + { + "epoch": 0.7956092206366631, + "grad_norm": 1.4063396453857422, + "learning_rate": 2.6326699394962333e-05, + "loss": 0.2829, + "step": 14496 + }, + { + "epoch": 0.7957189901207464, + "grad_norm": 1.4213874340057373, + "learning_rate": 2.632147075616566e-05, + "loss": 0.3731, + "step": 14498 + }, + { + "epoch": 0.7958287596048299, + "grad_norm": 1.3528510332107544, + "learning_rate": 2.631624205940279e-05, + "loss": 0.3706, + "step": 14500 + }, + { + "epoch": 0.7959385290889133, + "grad_norm": 1.315791368484497, + "learning_rate": 2.6311013304903087e-05, + "loss": 0.2661, + "step": 14502 + }, + { + "epoch": 0.7960482985729967, + "grad_norm": 2.3487892150878906, + "learning_rate": 2.6305784492895907e-05, + "loss": 0.3322, + "step": 14504 + }, + { + "epoch": 0.7961580680570801, + "grad_norm": 1.2255206108093262, + "learning_rate": 2.6300555623610613e-05, + "loss": 0.1854, + "step": 14506 + }, + { + "epoch": 0.7962678375411636, + "grad_norm": 0.9395112991333008, + "learning_rate": 2.6295326697276563e-05, + "loss": 0.1946, + "step": 14508 + }, + { + "epoch": 0.796377607025247, + "grad_norm": 1.7526100873947144, + "learning_rate": 2.629009771412312e-05, + "loss": 0.2544, + "step": 14510 + }, + { + "epoch": 0.7964873765093304, + "grad_norm": 0.8132378458976746, + "learning_rate": 2.628486867437966e-05, + "loss": 0.112, + "step": 14512 + }, + { + "epoch": 0.7965971459934138, + "grad_norm": 1.6556317806243896, + "learning_rate": 2.627963957827556e-05, + "loss": 0.1919, + "step": 14514 + }, + { + "epoch": 0.7967069154774973, + "grad_norm": 1.2176157236099243, + "learning_rate": 2.6274410426040186e-05, + "loss": 0.2235, + "step": 14516 + }, + { + "epoch": 0.7968166849615806, + "grad_norm": 5.997232913970947, + "learning_rate": 2.626918121790291e-05, + "loss": 0.1564, + "step": 14518 + }, + { + "epoch": 0.7969264544456641, + "grad_norm": 1.442447304725647, + "learning_rate": 2.6263951954093125e-05, + "loss": 0.2261, + "step": 14520 + }, + { + "epoch": 0.7970362239297475, + "grad_norm": 1.1229572296142578, + "learning_rate": 2.62587226348402e-05, + "loss": 0.1418, + "step": 14522 + }, + { + "epoch": 0.797145993413831, + "grad_norm": 1.4029234647750854, + "learning_rate": 2.625349326037352e-05, + "loss": 0.2284, + "step": 14524 + }, + { + "epoch": 0.7972557628979143, + "grad_norm": 11.00593376159668, + "learning_rate": 2.6248263830922475e-05, + "loss": 0.2502, + "step": 14526 + }, + { + "epoch": 0.7973655323819978, + "grad_norm": 4.780119895935059, + "learning_rate": 2.624303434671645e-05, + "loss": 0.295, + "step": 14528 + }, + { + "epoch": 0.7974753018660812, + "grad_norm": 1.5623314380645752, + "learning_rate": 2.6237804807984832e-05, + "loss": 0.317, + "step": 14530 + }, + { + "epoch": 0.7975850713501647, + "grad_norm": 1.0405187606811523, + "learning_rate": 2.6232575214957028e-05, + "loss": 0.1966, + "step": 14532 + }, + { + "epoch": 0.7976948408342481, + "grad_norm": 1.3336942195892334, + "learning_rate": 2.6227345567862426e-05, + "loss": 0.2766, + "step": 14534 + }, + { + "epoch": 0.7978046103183315, + "grad_norm": 1.6605353355407715, + "learning_rate": 2.622211586693042e-05, + "loss": 0.2582, + "step": 14536 + }, + { + "epoch": 0.797914379802415, + "grad_norm": 2.247328042984009, + "learning_rate": 2.6216886112390415e-05, + "loss": 0.2218, + "step": 14538 + }, + { + "epoch": 0.7980241492864983, + "grad_norm": 1.5627708435058594, + "learning_rate": 2.621165630447181e-05, + "loss": 0.2919, + "step": 14540 + }, + { + "epoch": 0.7981339187705818, + "grad_norm": 1.98099946975708, + "learning_rate": 2.620642644340401e-05, + "loss": 0.2437, + "step": 14542 + }, + { + "epoch": 0.7982436882546652, + "grad_norm": 1.460645079612732, + "learning_rate": 2.6201196529416427e-05, + "loss": 0.2046, + "step": 14544 + }, + { + "epoch": 0.7983534577387487, + "grad_norm": 1.2450127601623535, + "learning_rate": 2.6195966562738473e-05, + "loss": 0.2307, + "step": 14546 + }, + { + "epoch": 0.798463227222832, + "grad_norm": 1.6584116220474243, + "learning_rate": 2.6190736543599547e-05, + "loss": 0.2729, + "step": 14548 + }, + { + "epoch": 0.7985729967069155, + "grad_norm": 2.020597219467163, + "learning_rate": 2.6185506472229083e-05, + "loss": 0.2911, + "step": 14550 + }, + { + "epoch": 0.7986827661909989, + "grad_norm": 1.138325572013855, + "learning_rate": 2.618027634885648e-05, + "loss": 0.2645, + "step": 14552 + }, + { + "epoch": 0.7987925356750823, + "grad_norm": 2.3726937770843506, + "learning_rate": 2.617504617371116e-05, + "loss": 0.2483, + "step": 14554 + }, + { + "epoch": 0.7989023051591657, + "grad_norm": 0.9241223335266113, + "learning_rate": 2.6169815947022553e-05, + "loss": 0.2037, + "step": 14556 + }, + { + "epoch": 0.7990120746432492, + "grad_norm": 1.0423163175582886, + "learning_rate": 2.616458566902007e-05, + "loss": 0.2104, + "step": 14558 + }, + { + "epoch": 0.7991218441273326, + "grad_norm": 1.3554404973983765, + "learning_rate": 2.6159355339933145e-05, + "loss": 0.3279, + "step": 14560 + }, + { + "epoch": 0.799231613611416, + "grad_norm": 1.6359940767288208, + "learning_rate": 2.615412495999121e-05, + "loss": 0.3366, + "step": 14562 + }, + { + "epoch": 0.7993413830954994, + "grad_norm": 1.0836354494094849, + "learning_rate": 2.614889452942369e-05, + "loss": 0.2388, + "step": 14564 + }, + { + "epoch": 0.7994511525795829, + "grad_norm": 1.4665571451187134, + "learning_rate": 2.614366404846001e-05, + "loss": 0.3534, + "step": 14566 + }, + { + "epoch": 0.7995609220636662, + "grad_norm": 1.2330889701843262, + "learning_rate": 2.613843351732962e-05, + "loss": 0.1965, + "step": 14568 + }, + { + "epoch": 0.7996706915477497, + "grad_norm": 1.558181881904602, + "learning_rate": 2.613320293626194e-05, + "loss": 0.217, + "step": 14570 + }, + { + "epoch": 0.7997804610318332, + "grad_norm": 1.2633920907974243, + "learning_rate": 2.612797230548642e-05, + "loss": 0.1763, + "step": 14572 + }, + { + "epoch": 0.7998902305159166, + "grad_norm": 1.2709858417510986, + "learning_rate": 2.6122741625232493e-05, + "loss": 0.1665, + "step": 14574 + }, + { + "epoch": 0.8, + "grad_norm": 1.8901147842407227, + "learning_rate": 2.6117510895729613e-05, + "loss": 0.3309, + "step": 14576 + }, + { + "epoch": 0.8001097694840834, + "grad_norm": 1.0570974349975586, + "learning_rate": 2.611228011720722e-05, + "loss": 0.1846, + "step": 14578 + }, + { + "epoch": 0.8002195389681669, + "grad_norm": 1.0217822790145874, + "learning_rate": 2.6107049289894763e-05, + "loss": 0.2364, + "step": 14580 + }, + { + "epoch": 0.8003293084522503, + "grad_norm": 1.6246232986450195, + "learning_rate": 2.6101818414021696e-05, + "loss": 0.2202, + "step": 14582 + }, + { + "epoch": 0.8004390779363337, + "grad_norm": 1.4950141906738281, + "learning_rate": 2.6096587489817454e-05, + "loss": 0.2749, + "step": 14584 + }, + { + "epoch": 0.8005488474204171, + "grad_norm": 1.7862242460250854, + "learning_rate": 2.6091356517511505e-05, + "loss": 0.2227, + "step": 14586 + }, + { + "epoch": 0.8006586169045006, + "grad_norm": 1.2905062437057495, + "learning_rate": 2.6086125497333304e-05, + "loss": 0.2353, + "step": 14588 + }, + { + "epoch": 0.8007683863885839, + "grad_norm": 1.8404871225357056, + "learning_rate": 2.6080894429512305e-05, + "loss": 0.2143, + "step": 14590 + }, + { + "epoch": 0.8008781558726674, + "grad_norm": 1.6942222118377686, + "learning_rate": 2.6075663314277976e-05, + "loss": 0.2672, + "step": 14592 + }, + { + "epoch": 0.8009879253567508, + "grad_norm": 1.6343791484832764, + "learning_rate": 2.6070432151859775e-05, + "loss": 0.3497, + "step": 14594 + }, + { + "epoch": 0.8010976948408343, + "grad_norm": 4.850611686706543, + "learning_rate": 2.6065200942487156e-05, + "loss": 0.165, + "step": 14596 + }, + { + "epoch": 0.8012074643249176, + "grad_norm": 3.001486301422119, + "learning_rate": 2.6059969686389608e-05, + "loss": 0.3681, + "step": 14598 + }, + { + "epoch": 0.8013172338090011, + "grad_norm": 1.8606624603271484, + "learning_rate": 2.6054738383796585e-05, + "loss": 0.4745, + "step": 14600 + }, + { + "epoch": 0.8014270032930845, + "grad_norm": 1.0411356687545776, + "learning_rate": 2.604950703493755e-05, + "loss": 0.1878, + "step": 14602 + }, + { + "epoch": 0.801536772777168, + "grad_norm": 0.7942651510238647, + "learning_rate": 2.6044275640041987e-05, + "loss": 0.148, + "step": 14604 + }, + { + "epoch": 0.8016465422612514, + "grad_norm": 2.0132408142089844, + "learning_rate": 2.6039044199339375e-05, + "loss": 0.3122, + "step": 14606 + }, + { + "epoch": 0.8017563117453348, + "grad_norm": 1.0647995471954346, + "learning_rate": 2.603381271305918e-05, + "loss": 0.1575, + "step": 14608 + }, + { + "epoch": 0.8018660812294183, + "grad_norm": 1.2663960456848145, + "learning_rate": 2.6028581181430888e-05, + "loss": 0.1602, + "step": 14610 + }, + { + "epoch": 0.8019758507135016, + "grad_norm": 0.9483279585838318, + "learning_rate": 2.6023349604683976e-05, + "loss": 0.243, + "step": 14612 + }, + { + "epoch": 0.8020856201975851, + "grad_norm": 0.9749879837036133, + "learning_rate": 2.601811798304793e-05, + "loss": 0.2025, + "step": 14614 + }, + { + "epoch": 0.8021953896816685, + "grad_norm": 1.424850344657898, + "learning_rate": 2.6012886316752227e-05, + "loss": 0.2475, + "step": 14616 + }, + { + "epoch": 0.802305159165752, + "grad_norm": 2.008366823196411, + "learning_rate": 2.600765460602636e-05, + "loss": 0.3095, + "step": 14618 + }, + { + "epoch": 0.8024149286498353, + "grad_norm": 2.1275644302368164, + "learning_rate": 2.6002422851099824e-05, + "loss": 0.2477, + "step": 14620 + }, + { + "epoch": 0.8025246981339188, + "grad_norm": 1.9084255695343018, + "learning_rate": 2.5997191052202093e-05, + "loss": 0.4142, + "step": 14622 + }, + { + "epoch": 0.8026344676180022, + "grad_norm": 1.1992905139923096, + "learning_rate": 2.5991959209562665e-05, + "loss": 0.2995, + "step": 14624 + }, + { + "epoch": 0.8027442371020856, + "grad_norm": 1.2603604793548584, + "learning_rate": 2.5986727323411047e-05, + "loss": 0.1425, + "step": 14626 + }, + { + "epoch": 0.802854006586169, + "grad_norm": 1.3459628820419312, + "learning_rate": 2.598149539397672e-05, + "loss": 0.2459, + "step": 14628 + }, + { + "epoch": 0.8029637760702525, + "grad_norm": 3.0548622608184814, + "learning_rate": 2.5976263421489188e-05, + "loss": 0.2809, + "step": 14630 + }, + { + "epoch": 0.8030735455543359, + "grad_norm": 1.3368594646453857, + "learning_rate": 2.5971031406177947e-05, + "loss": 0.2251, + "step": 14632 + }, + { + "epoch": 0.8031833150384193, + "grad_norm": 1.7245397567749023, + "learning_rate": 2.5965799348272503e-05, + "loss": 0.2603, + "step": 14634 + }, + { + "epoch": 0.8032930845225027, + "grad_norm": 0.9010858535766602, + "learning_rate": 2.5960567248002366e-05, + "loss": 0.3252, + "step": 14636 + }, + { + "epoch": 0.8034028540065862, + "grad_norm": 2.514956474304199, + "learning_rate": 2.5955335105597034e-05, + "loss": 0.228, + "step": 14638 + }, + { + "epoch": 0.8035126234906695, + "grad_norm": 1.511772632598877, + "learning_rate": 2.5950102921286006e-05, + "loss": 0.2025, + "step": 14640 + }, + { + "epoch": 0.803622392974753, + "grad_norm": 2.121488094329834, + "learning_rate": 2.5944870695298813e-05, + "loss": 0.312, + "step": 14642 + }, + { + "epoch": 0.8037321624588365, + "grad_norm": 1.1400309801101685, + "learning_rate": 2.593963842786495e-05, + "loss": 0.1804, + "step": 14644 + }, + { + "epoch": 0.8038419319429199, + "grad_norm": 1.1585800647735596, + "learning_rate": 2.5934406119213928e-05, + "loss": 0.2534, + "step": 14646 + }, + { + "epoch": 0.8039517014270033, + "grad_norm": 1.5840171575546265, + "learning_rate": 2.5929173769575266e-05, + "loss": 0.1926, + "step": 14648 + }, + { + "epoch": 0.8040614709110867, + "grad_norm": 2.016948938369751, + "learning_rate": 2.5923941379178486e-05, + "loss": 0.1961, + "step": 14650 + }, + { + "epoch": 0.8041712403951702, + "grad_norm": 1.5108083486557007, + "learning_rate": 2.59187089482531e-05, + "loss": 0.3733, + "step": 14652 + }, + { + "epoch": 0.8042810098792536, + "grad_norm": 1.3438149690628052, + "learning_rate": 2.5913476477028637e-05, + "loss": 0.2624, + "step": 14654 + }, + { + "epoch": 0.804390779363337, + "grad_norm": 1.1976017951965332, + "learning_rate": 2.5908243965734608e-05, + "loss": 0.3751, + "step": 14656 + }, + { + "epoch": 0.8045005488474204, + "grad_norm": 1.4074859619140625, + "learning_rate": 2.5903011414600536e-05, + "loss": 0.2215, + "step": 14658 + }, + { + "epoch": 0.8046103183315039, + "grad_norm": 1.4120465517044067, + "learning_rate": 2.5897778823855955e-05, + "loss": 0.2308, + "step": 14660 + }, + { + "epoch": 0.8047200878155872, + "grad_norm": 1.324837327003479, + "learning_rate": 2.5892546193730393e-05, + "loss": 0.2589, + "step": 14662 + }, + { + "epoch": 0.8048298572996707, + "grad_norm": 1.7206170558929443, + "learning_rate": 2.588731352445336e-05, + "loss": 0.2065, + "step": 14664 + }, + { + "epoch": 0.8049396267837541, + "grad_norm": 1.5391457080841064, + "learning_rate": 2.5882080816254415e-05, + "loss": 0.2662, + "step": 14666 + }, + { + "epoch": 0.8050493962678376, + "grad_norm": 1.0082545280456543, + "learning_rate": 2.5876848069363075e-05, + "loss": 0.3004, + "step": 14668 + }, + { + "epoch": 0.8051591657519209, + "grad_norm": 1.4270292520523071, + "learning_rate": 2.5871615284008866e-05, + "loss": 0.2248, + "step": 14670 + }, + { + "epoch": 0.8052689352360044, + "grad_norm": 1.4049949645996094, + "learning_rate": 2.586638246042134e-05, + "loss": 0.2371, + "step": 14672 + }, + { + "epoch": 0.8053787047200878, + "grad_norm": 0.9808753132820129, + "learning_rate": 2.5861149598830026e-05, + "loss": 0.2625, + "step": 14674 + }, + { + "epoch": 0.8054884742041712, + "grad_norm": 1.622787594795227, + "learning_rate": 2.585591669946446e-05, + "loss": 0.2506, + "step": 14676 + }, + { + "epoch": 0.8055982436882546, + "grad_norm": 3.047595977783203, + "learning_rate": 2.5850683762554184e-05, + "loss": 0.3403, + "step": 14678 + }, + { + "epoch": 0.8057080131723381, + "grad_norm": 0.9642164707183838, + "learning_rate": 2.584545078832875e-05, + "loss": 0.1766, + "step": 14680 + }, + { + "epoch": 0.8058177826564216, + "grad_norm": 1.515470027923584, + "learning_rate": 2.584021777701769e-05, + "loss": 0.2534, + "step": 14682 + }, + { + "epoch": 0.8059275521405049, + "grad_norm": 1.5934255123138428, + "learning_rate": 2.583498472885056e-05, + "loss": 0.3024, + "step": 14684 + }, + { + "epoch": 0.8060373216245884, + "grad_norm": 1.479154348373413, + "learning_rate": 2.5829751644056898e-05, + "loss": 0.2619, + "step": 14686 + }, + { + "epoch": 0.8061470911086718, + "grad_norm": 1.6125844717025757, + "learning_rate": 2.5824518522866255e-05, + "loss": 0.1507, + "step": 14688 + }, + { + "epoch": 0.8062568605927553, + "grad_norm": 1.6245858669281006, + "learning_rate": 2.5819285365508182e-05, + "loss": 0.2841, + "step": 14690 + }, + { + "epoch": 0.8063666300768386, + "grad_norm": 1.702701449394226, + "learning_rate": 2.581405217221224e-05, + "loss": 0.2816, + "step": 14692 + }, + { + "epoch": 0.8064763995609221, + "grad_norm": 1.3628175258636475, + "learning_rate": 2.5808818943207963e-05, + "loss": 0.3025, + "step": 14694 + }, + { + "epoch": 0.8065861690450055, + "grad_norm": 1.7221925258636475, + "learning_rate": 2.5803585678724916e-05, + "loss": 0.3125, + "step": 14696 + }, + { + "epoch": 0.8066959385290889, + "grad_norm": 1.7127313613891602, + "learning_rate": 2.579835237899267e-05, + "loss": 0.3157, + "step": 14698 + }, + { + "epoch": 0.8068057080131723, + "grad_norm": 1.3433587551116943, + "learning_rate": 2.579311904424076e-05, + "loss": 0.2, + "step": 14700 + }, + { + "epoch": 0.8069154774972558, + "grad_norm": 0.970636785030365, + "learning_rate": 2.5787885674698758e-05, + "loss": 0.1763, + "step": 14702 + }, + { + "epoch": 0.8070252469813392, + "grad_norm": 1.0838247537612915, + "learning_rate": 2.5782652270596223e-05, + "loss": 0.2022, + "step": 14704 + }, + { + "epoch": 0.8071350164654226, + "grad_norm": 1.5815314054489136, + "learning_rate": 2.577741883216272e-05, + "loss": 0.1666, + "step": 14706 + }, + { + "epoch": 0.807244785949506, + "grad_norm": 1.5830594301223755, + "learning_rate": 2.577218535962781e-05, + "loss": 0.2687, + "step": 14708 + }, + { + "epoch": 0.8073545554335895, + "grad_norm": 1.1151652336120605, + "learning_rate": 2.5766951853221057e-05, + "loss": 0.1927, + "step": 14710 + }, + { + "epoch": 0.8074643249176728, + "grad_norm": 1.0150864124298096, + "learning_rate": 2.576171831317204e-05, + "loss": 0.2379, + "step": 14712 + }, + { + "epoch": 0.8075740944017563, + "grad_norm": 4.796515941619873, + "learning_rate": 2.5756484739710308e-05, + "loss": 0.2566, + "step": 14714 + }, + { + "epoch": 0.8076838638858397, + "grad_norm": 1.9460679292678833, + "learning_rate": 2.5751251133065445e-05, + "loss": 0.2348, + "step": 14716 + }, + { + "epoch": 0.8077936333699232, + "grad_norm": 1.2980623245239258, + "learning_rate": 2.5746017493467023e-05, + "loss": 0.387, + "step": 14718 + }, + { + "epoch": 0.8079034028540066, + "grad_norm": 2.6762847900390625, + "learning_rate": 2.5740783821144615e-05, + "loss": 0.1734, + "step": 14720 + }, + { + "epoch": 0.80801317233809, + "grad_norm": 1.0648083686828613, + "learning_rate": 2.573555011632779e-05, + "loss": 0.1503, + "step": 14722 + }, + { + "epoch": 0.8081229418221735, + "grad_norm": 1.1371244192123413, + "learning_rate": 2.573031637924612e-05, + "loss": 0.165, + "step": 14724 + }, + { + "epoch": 0.8082327113062568, + "grad_norm": 1.1917157173156738, + "learning_rate": 2.5725082610129192e-05, + "loss": 0.2665, + "step": 14726 + }, + { + "epoch": 0.8083424807903403, + "grad_norm": 1.2744066715240479, + "learning_rate": 2.5719848809206586e-05, + "loss": 0.2051, + "step": 14728 + }, + { + "epoch": 0.8084522502744237, + "grad_norm": 2.4522900581359863, + "learning_rate": 2.571461497670788e-05, + "loss": 0.1682, + "step": 14730 + }, + { + "epoch": 0.8085620197585072, + "grad_norm": 1.1189281940460205, + "learning_rate": 2.570938111286265e-05, + "loss": 0.1491, + "step": 14732 + }, + { + "epoch": 0.8086717892425905, + "grad_norm": 1.66757071018219, + "learning_rate": 2.570414721790048e-05, + "loss": 0.2433, + "step": 14734 + }, + { + "epoch": 0.808781558726674, + "grad_norm": 1.2643002271652222, + "learning_rate": 2.5698913292050964e-05, + "loss": 0.4011, + "step": 14736 + }, + { + "epoch": 0.8088913282107574, + "grad_norm": 1.2526555061340332, + "learning_rate": 2.5693679335543676e-05, + "loss": 0.1808, + "step": 14738 + }, + { + "epoch": 0.8090010976948409, + "grad_norm": 1.4549248218536377, + "learning_rate": 2.5688445348608203e-05, + "loss": 0.2587, + "step": 14740 + }, + { + "epoch": 0.8091108671789242, + "grad_norm": 1.5732227563858032, + "learning_rate": 2.5683211331474145e-05, + "loss": 0.2505, + "step": 14742 + }, + { + "epoch": 0.8092206366630077, + "grad_norm": 1.6957190036773682, + "learning_rate": 2.567797728437108e-05, + "loss": 0.2096, + "step": 14744 + }, + { + "epoch": 0.8093304061470911, + "grad_norm": 1.261574387550354, + "learning_rate": 2.5672743207528605e-05, + "loss": 0.1987, + "step": 14746 + }, + { + "epoch": 0.8094401756311745, + "grad_norm": 1.1389106512069702, + "learning_rate": 2.566750910117632e-05, + "loss": 0.2914, + "step": 14748 + }, + { + "epoch": 0.8095499451152579, + "grad_norm": 1.8566299676895142, + "learning_rate": 2.5662274965543792e-05, + "loss": 0.3328, + "step": 14750 + }, + { + "epoch": 0.8096597145993414, + "grad_norm": 1.9953289031982422, + "learning_rate": 2.5657040800860647e-05, + "loss": 0.2202, + "step": 14752 + }, + { + "epoch": 0.8097694840834249, + "grad_norm": 1.0657199621200562, + "learning_rate": 2.565180660735646e-05, + "loss": 0.2026, + "step": 14754 + }, + { + "epoch": 0.8098792535675082, + "grad_norm": 3.36728835105896, + "learning_rate": 2.5646572385260836e-05, + "loss": 0.2026, + "step": 14756 + }, + { + "epoch": 0.8099890230515917, + "grad_norm": 1.0169782638549805, + "learning_rate": 2.5641338134803378e-05, + "loss": 0.3802, + "step": 14758 + }, + { + "epoch": 0.8100987925356751, + "grad_norm": 1.6649538278579712, + "learning_rate": 2.5636103856213685e-05, + "loss": 0.25, + "step": 14760 + }, + { + "epoch": 0.8102085620197585, + "grad_norm": 1.0548125505447388, + "learning_rate": 2.563086954972134e-05, + "loss": 0.3359, + "step": 14762 + }, + { + "epoch": 0.8103183315038419, + "grad_norm": 1.5248534679412842, + "learning_rate": 2.5625635215555977e-05, + "loss": 0.1783, + "step": 14764 + }, + { + "epoch": 0.8104281009879254, + "grad_norm": 1.1290180683135986, + "learning_rate": 2.562040085394718e-05, + "loss": 0.1887, + "step": 14766 + }, + { + "epoch": 0.8105378704720088, + "grad_norm": 1.7580904960632324, + "learning_rate": 2.5615166465124547e-05, + "loss": 0.3308, + "step": 14768 + }, + { + "epoch": 0.8106476399560922, + "grad_norm": 1.4970958232879639, + "learning_rate": 2.560993204931769e-05, + "loss": 0.2811, + "step": 14770 + }, + { + "epoch": 0.8107574094401756, + "grad_norm": 1.121113896369934, + "learning_rate": 2.5604697606756233e-05, + "loss": 0.2143, + "step": 14772 + }, + { + "epoch": 0.8108671789242591, + "grad_norm": 1.4649410247802734, + "learning_rate": 2.5599463137669766e-05, + "loss": 0.2535, + "step": 14774 + }, + { + "epoch": 0.8109769484083424, + "grad_norm": 0.9430072903633118, + "learning_rate": 2.5594228642287906e-05, + "loss": 0.1374, + "step": 14776 + }, + { + "epoch": 0.8110867178924259, + "grad_norm": 1.3807591199874878, + "learning_rate": 2.558899412084026e-05, + "loss": 0.1931, + "step": 14778 + }, + { + "epoch": 0.8111964873765093, + "grad_norm": 1.9038740396499634, + "learning_rate": 2.5583759573556436e-05, + "loss": 0.366, + "step": 14780 + }, + { + "epoch": 0.8113062568605928, + "grad_norm": 1.5259850025177002, + "learning_rate": 2.5578525000666053e-05, + "loss": 0.1784, + "step": 14782 + }, + { + "epoch": 0.8114160263446761, + "grad_norm": 2.073573589324951, + "learning_rate": 2.5573290402398726e-05, + "loss": 0.3342, + "step": 14784 + }, + { + "epoch": 0.8115257958287596, + "grad_norm": 1.3266862630844116, + "learning_rate": 2.556805577898407e-05, + "loss": 0.2258, + "step": 14786 + }, + { + "epoch": 0.811635565312843, + "grad_norm": 1.005645990371704, + "learning_rate": 2.5562821130651694e-05, + "loss": 0.1534, + "step": 14788 + }, + { + "epoch": 0.8117453347969265, + "grad_norm": 1.159562587738037, + "learning_rate": 2.5557586457631226e-05, + "loss": 0.1961, + "step": 14790 + }, + { + "epoch": 0.8118551042810099, + "grad_norm": 1.580458641052246, + "learning_rate": 2.5552351760152275e-05, + "loss": 0.2326, + "step": 14792 + }, + { + "epoch": 0.8119648737650933, + "grad_norm": 1.1826999187469482, + "learning_rate": 2.554711703844447e-05, + "loss": 0.2129, + "step": 14794 + }, + { + "epoch": 0.8120746432491768, + "grad_norm": 1.3932989835739136, + "learning_rate": 2.554188229273743e-05, + "loss": 0.2054, + "step": 14796 + }, + { + "epoch": 0.8121844127332601, + "grad_norm": 1.7144371271133423, + "learning_rate": 2.553664752326076e-05, + "loss": 0.3418, + "step": 14798 + }, + { + "epoch": 0.8122941822173436, + "grad_norm": 1.3398711681365967, + "learning_rate": 2.55314127302441e-05, + "loss": 0.3198, + "step": 14800 + }, + { + "epoch": 0.812403951701427, + "grad_norm": 1.8135366439819336, + "learning_rate": 2.552617791391707e-05, + "loss": 0.3604, + "step": 14802 + }, + { + "epoch": 0.8125137211855105, + "grad_norm": 2.1558709144592285, + "learning_rate": 2.55209430745093e-05, + "loss": 0.191, + "step": 14804 + }, + { + "epoch": 0.8126234906695938, + "grad_norm": 1.5801037549972534, + "learning_rate": 2.55157082122504e-05, + "loss": 0.2049, + "step": 14806 + }, + { + "epoch": 0.8127332601536773, + "grad_norm": 1.4063687324523926, + "learning_rate": 2.5510473327370016e-05, + "loss": 0.2469, + "step": 14808 + }, + { + "epoch": 0.8128430296377607, + "grad_norm": 1.3454095125198364, + "learning_rate": 2.550523842009776e-05, + "loss": 0.1421, + "step": 14810 + }, + { + "epoch": 0.8129527991218441, + "grad_norm": 1.6825820207595825, + "learning_rate": 2.550000349066327e-05, + "loss": 0.2276, + "step": 14812 + }, + { + "epoch": 0.8130625686059275, + "grad_norm": 1.7588979005813599, + "learning_rate": 2.549476853929617e-05, + "loss": 0.2612, + "step": 14814 + }, + { + "epoch": 0.813172338090011, + "grad_norm": 1.3526875972747803, + "learning_rate": 2.5489533566226096e-05, + "loss": 0.2708, + "step": 14816 + }, + { + "epoch": 0.8132821075740944, + "grad_norm": 1.608485221862793, + "learning_rate": 2.5484298571682676e-05, + "loss": 0.184, + "step": 14818 + }, + { + "epoch": 0.8133918770581778, + "grad_norm": 3.7398102283477783, + "learning_rate": 2.5479063555895543e-05, + "loss": 0.2535, + "step": 14820 + }, + { + "epoch": 0.8135016465422612, + "grad_norm": 0.9718306064605713, + "learning_rate": 2.5473828519094333e-05, + "loss": 0.1715, + "step": 14822 + }, + { + "epoch": 0.8136114160263447, + "grad_norm": 1.0784040689468384, + "learning_rate": 2.5468593461508676e-05, + "loss": 0.1584, + "step": 14824 + }, + { + "epoch": 0.813721185510428, + "grad_norm": 1.1209944486618042, + "learning_rate": 2.5463358383368212e-05, + "loss": 0.1225, + "step": 14826 + }, + { + "epoch": 0.8138309549945115, + "grad_norm": 1.241037130355835, + "learning_rate": 2.5458123284902573e-05, + "loss": 0.2784, + "step": 14828 + }, + { + "epoch": 0.813940724478595, + "grad_norm": 0.9573106169700623, + "learning_rate": 2.5452888166341393e-05, + "loss": 0.2012, + "step": 14830 + }, + { + "epoch": 0.8140504939626784, + "grad_norm": 1.6942845582962036, + "learning_rate": 2.5447653027914325e-05, + "loss": 0.2798, + "step": 14832 + }, + { + "epoch": 0.8141602634467618, + "grad_norm": 0.7866865396499634, + "learning_rate": 2.5442417869850997e-05, + "loss": 0.1578, + "step": 14834 + }, + { + "epoch": 0.8142700329308452, + "grad_norm": 1.2699031829833984, + "learning_rate": 2.5437182692381047e-05, + "loss": 0.3036, + "step": 14836 + }, + { + "epoch": 0.8143798024149287, + "grad_norm": 1.9052870273590088, + "learning_rate": 2.5431947495734115e-05, + "loss": 0.3082, + "step": 14838 + }, + { + "epoch": 0.814489571899012, + "grad_norm": 2.029440402984619, + "learning_rate": 2.542671228013985e-05, + "loss": 0.2111, + "step": 14840 + }, + { + "epoch": 0.8145993413830955, + "grad_norm": 1.8600682020187378, + "learning_rate": 2.5421477045827892e-05, + "loss": 0.2391, + "step": 14842 + }, + { + "epoch": 0.8147091108671789, + "grad_norm": 1.1966552734375, + "learning_rate": 2.541624179302788e-05, + "loss": 0.3172, + "step": 14844 + }, + { + "epoch": 0.8148188803512624, + "grad_norm": 1.5496560335159302, + "learning_rate": 2.5411006521969455e-05, + "loss": 0.2619, + "step": 14846 + }, + { + "epoch": 0.8149286498353457, + "grad_norm": 1.5779485702514648, + "learning_rate": 2.540577123288227e-05, + "loss": 0.2716, + "step": 14848 + }, + { + "epoch": 0.8150384193194292, + "grad_norm": 1.9550883769989014, + "learning_rate": 2.5400535925995965e-05, + "loss": 0.2052, + "step": 14850 + }, + { + "epoch": 0.8151481888035126, + "grad_norm": 1.5260998010635376, + "learning_rate": 2.5395300601540194e-05, + "loss": 0.3284, + "step": 14852 + }, + { + "epoch": 0.8152579582875961, + "grad_norm": 1.5576753616333008, + "learning_rate": 2.5390065259744595e-05, + "loss": 0.3417, + "step": 14854 + }, + { + "epoch": 0.8153677277716794, + "grad_norm": 1.1171756982803345, + "learning_rate": 2.538482990083882e-05, + "loss": 0.2082, + "step": 14856 + }, + { + "epoch": 0.8154774972557629, + "grad_norm": 1.6786630153656006, + "learning_rate": 2.537959452505252e-05, + "loss": 0.2981, + "step": 14858 + }, + { + "epoch": 0.8155872667398463, + "grad_norm": 1.4908634424209595, + "learning_rate": 2.5374359132615327e-05, + "loss": 0.2254, + "step": 14860 + }, + { + "epoch": 0.8156970362239298, + "grad_norm": 1.8637287616729736, + "learning_rate": 2.536912372375692e-05, + "loss": 0.1859, + "step": 14862 + }, + { + "epoch": 0.8158068057080132, + "grad_norm": 2.048954963684082, + "learning_rate": 2.5363888298706928e-05, + "loss": 0.2333, + "step": 14864 + }, + { + "epoch": 0.8159165751920966, + "grad_norm": 1.3030260801315308, + "learning_rate": 2.535865285769501e-05, + "loss": 0.3069, + "step": 14866 + }, + { + "epoch": 0.8160263446761801, + "grad_norm": 2.1572811603546143, + "learning_rate": 2.5353417400950825e-05, + "loss": 0.2494, + "step": 14868 + }, + { + "epoch": 0.8161361141602634, + "grad_norm": 2.323774814605713, + "learning_rate": 2.5348181928704013e-05, + "loss": 0.2674, + "step": 14870 + }, + { + "epoch": 0.8162458836443469, + "grad_norm": 1.6756809949874878, + "learning_rate": 2.5342946441184222e-05, + "loss": 0.2545, + "step": 14872 + }, + { + "epoch": 0.8163556531284303, + "grad_norm": 1.1146985292434692, + "learning_rate": 2.533771093862113e-05, + "loss": 0.2854, + "step": 14874 + }, + { + "epoch": 0.8164654226125138, + "grad_norm": 1.6021291017532349, + "learning_rate": 2.5332475421244372e-05, + "loss": 0.301, + "step": 14876 + }, + { + "epoch": 0.8165751920965971, + "grad_norm": 1.4785367250442505, + "learning_rate": 2.5327239889283612e-05, + "loss": 0.3126, + "step": 14878 + }, + { + "epoch": 0.8166849615806806, + "grad_norm": 2.7690227031707764, + "learning_rate": 2.5322004342968502e-05, + "loss": 0.25, + "step": 14880 + }, + { + "epoch": 0.816794731064764, + "grad_norm": 2.3448734283447266, + "learning_rate": 2.531676878252871e-05, + "loss": 0.1779, + "step": 14882 + }, + { + "epoch": 0.8169045005488474, + "grad_norm": 2.8481812477111816, + "learning_rate": 2.5311533208193878e-05, + "loss": 0.2189, + "step": 14884 + }, + { + "epoch": 0.8170142700329308, + "grad_norm": 1.9728882312774658, + "learning_rate": 2.530629762019367e-05, + "loss": 0.3359, + "step": 14886 + }, + { + "epoch": 0.8171240395170143, + "grad_norm": 1.2620114088058472, + "learning_rate": 2.5301062018757748e-05, + "loss": 0.2411, + "step": 14888 + }, + { + "epoch": 0.8172338090010977, + "grad_norm": 1.7464929819107056, + "learning_rate": 2.5295826404115764e-05, + "loss": 0.2846, + "step": 14890 + }, + { + "epoch": 0.8173435784851811, + "grad_norm": 1.2351969480514526, + "learning_rate": 2.529059077649738e-05, + "loss": 0.2316, + "step": 14892 + }, + { + "epoch": 0.8174533479692645, + "grad_norm": 1.1866436004638672, + "learning_rate": 2.5285355136132266e-05, + "loss": 0.1843, + "step": 14894 + }, + { + "epoch": 0.817563117453348, + "grad_norm": 1.3704652786254883, + "learning_rate": 2.5280119483250065e-05, + "loss": 0.2345, + "step": 14896 + }, + { + "epoch": 0.8176728869374313, + "grad_norm": 1.094599723815918, + "learning_rate": 2.5274883818080458e-05, + "loss": 0.223, + "step": 14898 + }, + { + "epoch": 0.8177826564215148, + "grad_norm": 3.0930612087249756, + "learning_rate": 2.5269648140853096e-05, + "loss": 0.2714, + "step": 14900 + }, + { + "epoch": 0.8178924259055983, + "grad_norm": 2.70940899848938, + "learning_rate": 2.5264412451797638e-05, + "loss": 0.2422, + "step": 14902 + }, + { + "epoch": 0.8180021953896817, + "grad_norm": 1.3988211154937744, + "learning_rate": 2.525917675114376e-05, + "loss": 0.2045, + "step": 14904 + }, + { + "epoch": 0.8181119648737651, + "grad_norm": 1.5005227327346802, + "learning_rate": 2.525394103912111e-05, + "loss": 0.2132, + "step": 14906 + }, + { + "epoch": 0.8182217343578485, + "grad_norm": 1.6512421369552612, + "learning_rate": 2.5248705315959364e-05, + "loss": 0.3069, + "step": 14908 + }, + { + "epoch": 0.818331503841932, + "grad_norm": 1.2655137777328491, + "learning_rate": 2.5243469581888174e-05, + "loss": 0.176, + "step": 14910 + }, + { + "epoch": 0.8184412733260154, + "grad_norm": 1.1208505630493164, + "learning_rate": 2.5238233837137226e-05, + "loss": 0.1997, + "step": 14912 + }, + { + "epoch": 0.8185510428100988, + "grad_norm": 1.7786020040512085, + "learning_rate": 2.5232998081936165e-05, + "loss": 0.3569, + "step": 14914 + }, + { + "epoch": 0.8186608122941822, + "grad_norm": 3.4818472862243652, + "learning_rate": 2.5227762316514662e-05, + "loss": 0.5029, + "step": 14916 + }, + { + "epoch": 0.8187705817782657, + "grad_norm": 1.4387844800949097, + "learning_rate": 2.5222526541102393e-05, + "loss": 0.2315, + "step": 14918 + }, + { + "epoch": 0.818880351262349, + "grad_norm": 1.423258900642395, + "learning_rate": 2.5217290755929013e-05, + "loss": 0.2544, + "step": 14920 + }, + { + "epoch": 0.8189901207464325, + "grad_norm": 1.1463466882705688, + "learning_rate": 2.521205496122419e-05, + "loss": 0.1704, + "step": 14922 + }, + { + "epoch": 0.8190998902305159, + "grad_norm": 2.2697794437408447, + "learning_rate": 2.5206819157217605e-05, + "loss": 0.2874, + "step": 14924 + }, + { + "epoch": 0.8192096597145994, + "grad_norm": 0.862223207950592, + "learning_rate": 2.520158334413892e-05, + "loss": 0.1786, + "step": 14926 + }, + { + "epoch": 0.8193194291986827, + "grad_norm": 1.1490919589996338, + "learning_rate": 2.5196347522217784e-05, + "loss": 0.1863, + "step": 14928 + }, + { + "epoch": 0.8194291986827662, + "grad_norm": 0.8815138339996338, + "learning_rate": 2.5191111691683893e-05, + "loss": 0.2597, + "step": 14930 + }, + { + "epoch": 0.8195389681668496, + "grad_norm": 1.530532956123352, + "learning_rate": 2.5185875852766903e-05, + "loss": 0.2228, + "step": 14932 + }, + { + "epoch": 0.819648737650933, + "grad_norm": 1.7060569524765015, + "learning_rate": 2.5180640005696487e-05, + "loss": 0.219, + "step": 14934 + }, + { + "epoch": 0.8197585071350164, + "grad_norm": 1.0386884212493896, + "learning_rate": 2.517540415070231e-05, + "loss": 0.2037, + "step": 14936 + }, + { + "epoch": 0.8198682766190999, + "grad_norm": 1.5380476713180542, + "learning_rate": 2.5170168288014046e-05, + "loss": 0.2459, + "step": 14938 + }, + { + "epoch": 0.8199780461031834, + "grad_norm": 1.3557411432266235, + "learning_rate": 2.5164932417861364e-05, + "loss": 0.277, + "step": 14940 + }, + { + "epoch": 0.8200878155872667, + "grad_norm": 1.9013569355010986, + "learning_rate": 2.5159696540473948e-05, + "loss": 0.2645, + "step": 14942 + }, + { + "epoch": 0.8201975850713502, + "grad_norm": 0.926803469657898, + "learning_rate": 2.5154460656081453e-05, + "loss": 0.1401, + "step": 14944 + }, + { + "epoch": 0.8203073545554336, + "grad_norm": 1.2712531089782715, + "learning_rate": 2.514922476491355e-05, + "loss": 0.1559, + "step": 14946 + }, + { + "epoch": 0.820417124039517, + "grad_norm": 2.33311128616333, + "learning_rate": 2.514398886719992e-05, + "loss": 0.2781, + "step": 14948 + }, + { + "epoch": 0.8205268935236004, + "grad_norm": 0.8342782855033875, + "learning_rate": 2.513875296317023e-05, + "loss": 0.1934, + "step": 14950 + }, + { + "epoch": 0.8206366630076839, + "grad_norm": 1.4872796535491943, + "learning_rate": 2.513351705305415e-05, + "loss": 0.3147, + "step": 14952 + }, + { + "epoch": 0.8207464324917673, + "grad_norm": 1.092112421989441, + "learning_rate": 2.5128281137081362e-05, + "loss": 0.2299, + "step": 14954 + }, + { + "epoch": 0.8208562019758507, + "grad_norm": 1.8398507833480835, + "learning_rate": 2.5123045215481532e-05, + "loss": 0.2557, + "step": 14956 + }, + { + "epoch": 0.8209659714599341, + "grad_norm": 2.350829601287842, + "learning_rate": 2.5117809288484334e-05, + "loss": 0.387, + "step": 14958 + }, + { + "epoch": 0.8210757409440176, + "grad_norm": 1.3210265636444092, + "learning_rate": 2.5112573356319453e-05, + "loss": 0.2296, + "step": 14960 + }, + { + "epoch": 0.821185510428101, + "grad_norm": 1.444179654121399, + "learning_rate": 2.5107337419216542e-05, + "loss": 0.3739, + "step": 14962 + }, + { + "epoch": 0.8212952799121844, + "grad_norm": 0.9900021553039551, + "learning_rate": 2.5102101477405283e-05, + "loss": 0.1586, + "step": 14964 + }, + { + "epoch": 0.8214050493962678, + "grad_norm": 1.836014986038208, + "learning_rate": 2.5096865531115355e-05, + "loss": 0.2159, + "step": 14966 + }, + { + "epoch": 0.8215148188803513, + "grad_norm": 1.630706787109375, + "learning_rate": 2.509162958057643e-05, + "loss": 0.214, + "step": 14968 + }, + { + "epoch": 0.8216245883644346, + "grad_norm": 1.3392583131790161, + "learning_rate": 2.508639362601818e-05, + "loss": 0.2514, + "step": 14970 + }, + { + "epoch": 0.8217343578485181, + "grad_norm": 1.8046174049377441, + "learning_rate": 2.5081157667670284e-05, + "loss": 0.3056, + "step": 14972 + }, + { + "epoch": 0.8218441273326015, + "grad_norm": 1.375868797302246, + "learning_rate": 2.5075921705762416e-05, + "loss": 0.2183, + "step": 14974 + }, + { + "epoch": 0.821953896816685, + "grad_norm": 1.2676663398742676, + "learning_rate": 2.5070685740524246e-05, + "loss": 0.2697, + "step": 14976 + }, + { + "epoch": 0.8220636663007684, + "grad_norm": 1.2145730257034302, + "learning_rate": 2.5065449772185456e-05, + "loss": 0.1797, + "step": 14978 + }, + { + "epoch": 0.8221734357848518, + "grad_norm": 3.141425609588623, + "learning_rate": 2.5060213800975717e-05, + "loss": 0.2907, + "step": 14980 + }, + { + "epoch": 0.8222832052689353, + "grad_norm": 1.155463695526123, + "learning_rate": 2.50549778271247e-05, + "loss": 0.2073, + "step": 14982 + }, + { + "epoch": 0.8223929747530186, + "grad_norm": 0.9521095156669617, + "learning_rate": 2.5049741850862086e-05, + "loss": 0.1947, + "step": 14984 + }, + { + "epoch": 0.8225027442371021, + "grad_norm": 2.9468166828155518, + "learning_rate": 2.5044505872417557e-05, + "loss": 0.2031, + "step": 14986 + }, + { + "epoch": 0.8226125137211855, + "grad_norm": 1.780767560005188, + "learning_rate": 2.5039269892020772e-05, + "loss": 0.288, + "step": 14988 + }, + { + "epoch": 0.822722283205269, + "grad_norm": 1.8937879800796509, + "learning_rate": 2.5034033909901428e-05, + "loss": 0.2458, + "step": 14990 + }, + { + "epoch": 0.8228320526893523, + "grad_norm": 1.291651725769043, + "learning_rate": 2.502879792628918e-05, + "loss": 0.2928, + "step": 14992 + }, + { + "epoch": 0.8229418221734358, + "grad_norm": 1.4914029836654663, + "learning_rate": 2.502356194141372e-05, + "loss": 0.2031, + "step": 14994 + }, + { + "epoch": 0.8230515916575192, + "grad_norm": 1.6745688915252686, + "learning_rate": 2.501832595550471e-05, + "loss": 0.3946, + "step": 14996 + }, + { + "epoch": 0.8231613611416027, + "grad_norm": 1.611183524131775, + "learning_rate": 2.5013089968791842e-05, + "loss": 0.2907, + "step": 14998 + }, + { + "epoch": 0.823271130625686, + "grad_norm": 1.6582316160202026, + "learning_rate": 2.5007853981504787e-05, + "loss": 0.3534, + "step": 15000 + }, + { + "epoch": 0.8233809001097695, + "grad_norm": 1.9335989952087402, + "learning_rate": 2.5002617993873207e-05, + "loss": 0.3357, + "step": 15002 + }, + { + "epoch": 0.8234906695938529, + "grad_norm": 1.9923772811889648, + "learning_rate": 2.49973820061268e-05, + "loss": 0.2521, + "step": 15004 + }, + { + "epoch": 0.8236004390779363, + "grad_norm": 1.025448203086853, + "learning_rate": 2.499214601849522e-05, + "loss": 0.2796, + "step": 15006 + }, + { + "epoch": 0.8237102085620197, + "grad_norm": 1.5734554529190063, + "learning_rate": 2.498691003120816e-05, + "loss": 0.2489, + "step": 15008 + }, + { + "epoch": 0.8238199780461032, + "grad_norm": 1.223750352859497, + "learning_rate": 2.4981674044495292e-05, + "loss": 0.3172, + "step": 15010 + }, + { + "epoch": 0.8239297475301867, + "grad_norm": 1.4366183280944824, + "learning_rate": 2.4976438058586285e-05, + "loss": 0.255, + "step": 15012 + }, + { + "epoch": 0.82403951701427, + "grad_norm": 2.2506704330444336, + "learning_rate": 2.4971202073710824e-05, + "loss": 0.1988, + "step": 15014 + }, + { + "epoch": 0.8241492864983535, + "grad_norm": 1.4200676679611206, + "learning_rate": 2.4965966090098584e-05, + "loss": 0.2644, + "step": 15016 + }, + { + "epoch": 0.8242590559824369, + "grad_norm": 3.0938377380371094, + "learning_rate": 2.4960730107979233e-05, + "loss": 0.2869, + "step": 15018 + }, + { + "epoch": 0.8243688254665203, + "grad_norm": 1.1945300102233887, + "learning_rate": 2.4955494127582446e-05, + "loss": 0.1885, + "step": 15020 + }, + { + "epoch": 0.8244785949506037, + "grad_norm": 1.9366766214370728, + "learning_rate": 2.4950258149137913e-05, + "loss": 0.3013, + "step": 15022 + }, + { + "epoch": 0.8245883644346872, + "grad_norm": 1.3678791522979736, + "learning_rate": 2.4945022172875307e-05, + "loss": 0.274, + "step": 15024 + }, + { + "epoch": 0.8246981339187706, + "grad_norm": 1.565171718597412, + "learning_rate": 2.493978619902429e-05, + "loss": 0.297, + "step": 15026 + }, + { + "epoch": 0.824807903402854, + "grad_norm": 1.127042531967163, + "learning_rate": 2.4934550227814553e-05, + "loss": 0.1855, + "step": 15028 + }, + { + "epoch": 0.8249176728869374, + "grad_norm": 2.823357105255127, + "learning_rate": 2.492931425947576e-05, + "loss": 0.3586, + "step": 15030 + }, + { + "epoch": 0.8250274423710209, + "grad_norm": 1.8857371807098389, + "learning_rate": 2.4924078294237586e-05, + "loss": 0.1744, + "step": 15032 + }, + { + "epoch": 0.8251372118551042, + "grad_norm": 1.5095560550689697, + "learning_rate": 2.491884233232972e-05, + "loss": 0.2813, + "step": 15034 + }, + { + "epoch": 0.8252469813391877, + "grad_norm": 0.9053865075111389, + "learning_rate": 2.4913606373981825e-05, + "loss": 0.2693, + "step": 15036 + }, + { + "epoch": 0.8253567508232711, + "grad_norm": 1.7184044122695923, + "learning_rate": 2.4908370419423575e-05, + "loss": 0.2264, + "step": 15038 + }, + { + "epoch": 0.8254665203073546, + "grad_norm": 2.2919440269470215, + "learning_rate": 2.4903134468884654e-05, + "loss": 0.2718, + "step": 15040 + }, + { + "epoch": 0.8255762897914379, + "grad_norm": 1.3688989877700806, + "learning_rate": 2.489789852259473e-05, + "loss": 0.3332, + "step": 15042 + }, + { + "epoch": 0.8256860592755214, + "grad_norm": 1.6890841722488403, + "learning_rate": 2.4892662580783467e-05, + "loss": 0.3151, + "step": 15044 + }, + { + "epoch": 0.8257958287596048, + "grad_norm": 2.0017619132995605, + "learning_rate": 2.488742664368056e-05, + "loss": 0.2544, + "step": 15046 + }, + { + "epoch": 0.8259055982436883, + "grad_norm": 1.2764878273010254, + "learning_rate": 2.4882190711515668e-05, + "loss": 0.2449, + "step": 15048 + }, + { + "epoch": 0.8260153677277717, + "grad_norm": 1.2788937091827393, + "learning_rate": 2.487695478451847e-05, + "loss": 0.2505, + "step": 15050 + }, + { + "epoch": 0.8261251372118551, + "grad_norm": 1.4583972692489624, + "learning_rate": 2.4871718862918637e-05, + "loss": 0.2305, + "step": 15052 + }, + { + "epoch": 0.8262349066959386, + "grad_norm": 1.531015157699585, + "learning_rate": 2.486648294694585e-05, + "loss": 0.2546, + "step": 15054 + }, + { + "epoch": 0.8263446761800219, + "grad_norm": 1.2788950204849243, + "learning_rate": 2.4861247036829776e-05, + "loss": 0.3309, + "step": 15056 + }, + { + "epoch": 0.8264544456641054, + "grad_norm": 1.5048964023590088, + "learning_rate": 2.4856011132800086e-05, + "loss": 0.2574, + "step": 15058 + }, + { + "epoch": 0.8265642151481888, + "grad_norm": 1.1894680261611938, + "learning_rate": 2.4850775235086457e-05, + "loss": 0.184, + "step": 15060 + }, + { + "epoch": 0.8266739846322723, + "grad_norm": 1.9643841981887817, + "learning_rate": 2.4845539343918556e-05, + "loss": 0.2362, + "step": 15062 + }, + { + "epoch": 0.8267837541163556, + "grad_norm": 1.5529954433441162, + "learning_rate": 2.4840303459526058e-05, + "loss": 0.2542, + "step": 15064 + }, + { + "epoch": 0.8268935236004391, + "grad_norm": 1.2353345155715942, + "learning_rate": 2.4835067582138638e-05, + "loss": 0.23, + "step": 15066 + }, + { + "epoch": 0.8270032930845225, + "grad_norm": 1.722405195236206, + "learning_rate": 2.4829831711985957e-05, + "loss": 0.2977, + "step": 15068 + }, + { + "epoch": 0.827113062568606, + "grad_norm": 1.068513035774231, + "learning_rate": 2.48245958492977e-05, + "loss": 0.1712, + "step": 15070 + }, + { + "epoch": 0.8272228320526893, + "grad_norm": 1.2497738599777222, + "learning_rate": 2.4819359994303526e-05, + "loss": 0.1877, + "step": 15072 + }, + { + "epoch": 0.8273326015367728, + "grad_norm": 1.3195133209228516, + "learning_rate": 2.4814124147233106e-05, + "loss": 0.2575, + "step": 15074 + }, + { + "epoch": 0.8274423710208562, + "grad_norm": 3.7467212677001953, + "learning_rate": 2.4808888308316116e-05, + "loss": 0.1844, + "step": 15076 + }, + { + "epoch": 0.8275521405049396, + "grad_norm": 3.3843865394592285, + "learning_rate": 2.480365247778223e-05, + "loss": 0.3035, + "step": 15078 + }, + { + "epoch": 0.827661909989023, + "grad_norm": 1.11642587184906, + "learning_rate": 2.479841665586109e-05, + "loss": 0.1998, + "step": 15080 + }, + { + "epoch": 0.8277716794731065, + "grad_norm": 1.5215487480163574, + "learning_rate": 2.4793180842782394e-05, + "loss": 0.2506, + "step": 15082 + }, + { + "epoch": 0.8278814489571898, + "grad_norm": 0.9869683980941772, + "learning_rate": 2.478794503877581e-05, + "loss": 0.2263, + "step": 15084 + }, + { + "epoch": 0.8279912184412733, + "grad_norm": 1.1615418195724487, + "learning_rate": 2.4782709244070992e-05, + "loss": 0.1759, + "step": 15086 + }, + { + "epoch": 0.8281009879253568, + "grad_norm": 1.190032720565796, + "learning_rate": 2.477747345889761e-05, + "loss": 0.1711, + "step": 15088 + }, + { + "epoch": 0.8282107574094402, + "grad_norm": 1.8076624870300293, + "learning_rate": 2.477223768348534e-05, + "loss": 0.289, + "step": 15090 + }, + { + "epoch": 0.8283205268935236, + "grad_norm": 1.1694796085357666, + "learning_rate": 2.476700191806384e-05, + "loss": 0.261, + "step": 15092 + }, + { + "epoch": 0.828430296377607, + "grad_norm": 1.4685477018356323, + "learning_rate": 2.4761766162862783e-05, + "loss": 0.2795, + "step": 15094 + }, + { + "epoch": 0.8285400658616905, + "grad_norm": 1.3648601770401, + "learning_rate": 2.475653041811183e-05, + "loss": 0.226, + "step": 15096 + }, + { + "epoch": 0.8286498353457739, + "grad_norm": 1.0262340307235718, + "learning_rate": 2.4751294684040642e-05, + "loss": 0.3051, + "step": 15098 + }, + { + "epoch": 0.8287596048298573, + "grad_norm": 1.0816233158111572, + "learning_rate": 2.4746058960878897e-05, + "loss": 0.141, + "step": 15100 + }, + { + "epoch": 0.8288693743139407, + "grad_norm": 1.279900312423706, + "learning_rate": 2.4740823248856255e-05, + "loss": 0.3195, + "step": 15102 + }, + { + "epoch": 0.8289791437980242, + "grad_norm": 3.08981990814209, + "learning_rate": 2.4735587548202365e-05, + "loss": 0.188, + "step": 15104 + }, + { + "epoch": 0.8290889132821075, + "grad_norm": 2.3134257793426514, + "learning_rate": 2.4730351859146913e-05, + "loss": 0.2948, + "step": 15106 + }, + { + "epoch": 0.829198682766191, + "grad_norm": 2.710400342941284, + "learning_rate": 2.472511618191955e-05, + "loss": 0.167, + "step": 15108 + }, + { + "epoch": 0.8293084522502744, + "grad_norm": 2.0359151363372803, + "learning_rate": 2.4719880516749934e-05, + "loss": 0.3929, + "step": 15110 + }, + { + "epoch": 0.8294182217343579, + "grad_norm": 1.06497323513031, + "learning_rate": 2.4714644863867736e-05, + "loss": 0.1373, + "step": 15112 + }, + { + "epoch": 0.8295279912184412, + "grad_norm": 1.3318511247634888, + "learning_rate": 2.470940922350262e-05, + "loss": 0.2573, + "step": 15114 + }, + { + "epoch": 0.8296377607025247, + "grad_norm": 1.5141788721084595, + "learning_rate": 2.4704173595884242e-05, + "loss": 0.2629, + "step": 15116 + }, + { + "epoch": 0.8297475301866081, + "grad_norm": 1.0773741006851196, + "learning_rate": 2.4698937981242258e-05, + "loss": 0.216, + "step": 15118 + }, + { + "epoch": 0.8298572996706916, + "grad_norm": 1.5871708393096924, + "learning_rate": 2.4693702379806337e-05, + "loss": 0.2751, + "step": 15120 + }, + { + "epoch": 0.829967069154775, + "grad_norm": 1.2743643522262573, + "learning_rate": 2.468846679180613e-05, + "loss": 0.3059, + "step": 15122 + }, + { + "epoch": 0.8300768386388584, + "grad_norm": 1.3564274311065674, + "learning_rate": 2.4683231217471294e-05, + "loss": 0.3278, + "step": 15124 + }, + { + "epoch": 0.8301866081229419, + "grad_norm": 1.4643843173980713, + "learning_rate": 2.46779956570315e-05, + "loss": 0.2052, + "step": 15126 + }, + { + "epoch": 0.8302963776070252, + "grad_norm": 2.0868096351623535, + "learning_rate": 2.4672760110716394e-05, + "loss": 0.3044, + "step": 15128 + }, + { + "epoch": 0.8304061470911087, + "grad_norm": 1.9664194583892822, + "learning_rate": 2.4667524578755634e-05, + "loss": 0.2162, + "step": 15130 + }, + { + "epoch": 0.8305159165751921, + "grad_norm": 1.1623566150665283, + "learning_rate": 2.466228906137888e-05, + "loss": 0.2296, + "step": 15132 + }, + { + "epoch": 0.8306256860592756, + "grad_norm": 1.4369380474090576, + "learning_rate": 2.465705355881578e-05, + "loss": 0.1678, + "step": 15134 + }, + { + "epoch": 0.8307354555433589, + "grad_norm": 1.0246002674102783, + "learning_rate": 2.4651818071296002e-05, + "loss": 0.2108, + "step": 15136 + }, + { + "epoch": 0.8308452250274424, + "grad_norm": 1.1502354145050049, + "learning_rate": 2.4646582599049188e-05, + "loss": 0.1695, + "step": 15138 + }, + { + "epoch": 0.8309549945115258, + "grad_norm": 1.7446010112762451, + "learning_rate": 2.4641347142304987e-05, + "loss": 0.2599, + "step": 15140 + }, + { + "epoch": 0.8310647639956092, + "grad_norm": 1.8042654991149902, + "learning_rate": 2.4636111701293068e-05, + "loss": 0.1721, + "step": 15142 + }, + { + "epoch": 0.8311745334796926, + "grad_norm": 0.8916968703269958, + "learning_rate": 2.463087627624308e-05, + "loss": 0.2393, + "step": 15144 + }, + { + "epoch": 0.8312843029637761, + "grad_norm": 1.9168565273284912, + "learning_rate": 2.4625640867384672e-05, + "loss": 0.2065, + "step": 15146 + }, + { + "epoch": 0.8313940724478595, + "grad_norm": 1.1821719408035278, + "learning_rate": 2.4620405474947487e-05, + "loss": 0.1584, + "step": 15148 + }, + { + "epoch": 0.8315038419319429, + "grad_norm": 1.1442674398422241, + "learning_rate": 2.4615170099161184e-05, + "loss": 0.2091, + "step": 15150 + }, + { + "epoch": 0.8316136114160263, + "grad_norm": 1.7103629112243652, + "learning_rate": 2.460993474025541e-05, + "loss": 0.1614, + "step": 15152 + }, + { + "epoch": 0.8317233809001098, + "grad_norm": 1.3117787837982178, + "learning_rate": 2.460469939845981e-05, + "loss": 0.1884, + "step": 15154 + }, + { + "epoch": 0.8318331503841931, + "grad_norm": 2.535393714904785, + "learning_rate": 2.4599464074004037e-05, + "loss": 0.4319, + "step": 15156 + }, + { + "epoch": 0.8319429198682766, + "grad_norm": 3.1089560985565186, + "learning_rate": 2.459422876711774e-05, + "loss": 0.2351, + "step": 15158 + }, + { + "epoch": 0.8320526893523601, + "grad_norm": 1.3603479862213135, + "learning_rate": 2.458899347803055e-05, + "loss": 0.1921, + "step": 15160 + }, + { + "epoch": 0.8321624588364435, + "grad_norm": 0.6834058165550232, + "learning_rate": 2.4583758206972133e-05, + "loss": 0.1275, + "step": 15162 + }, + { + "epoch": 0.8322722283205269, + "grad_norm": 1.6683272123336792, + "learning_rate": 2.457852295417212e-05, + "loss": 0.1941, + "step": 15164 + }, + { + "epoch": 0.8323819978046103, + "grad_norm": 1.199482798576355, + "learning_rate": 2.4573287719860158e-05, + "loss": 0.2178, + "step": 15166 + }, + { + "epoch": 0.8324917672886938, + "grad_norm": 2.2892587184906006, + "learning_rate": 2.456805250426589e-05, + "loss": 0.1674, + "step": 15168 + }, + { + "epoch": 0.8326015367727772, + "grad_norm": 0.7686349153518677, + "learning_rate": 2.4562817307618956e-05, + "loss": 0.2128, + "step": 15170 + }, + { + "epoch": 0.8327113062568606, + "grad_norm": 1.3554095029830933, + "learning_rate": 2.455758213014901e-05, + "loss": 0.2877, + "step": 15172 + }, + { + "epoch": 0.832821075740944, + "grad_norm": 2.426851749420166, + "learning_rate": 2.4552346972085674e-05, + "loss": 0.3425, + "step": 15174 + }, + { + "epoch": 0.8329308452250275, + "grad_norm": 1.3087255954742432, + "learning_rate": 2.4547111833658603e-05, + "loss": 0.2917, + "step": 15176 + }, + { + "epoch": 0.8330406147091108, + "grad_norm": 2.0544750690460205, + "learning_rate": 2.4541876715097432e-05, + "loss": 0.2945, + "step": 15178 + }, + { + "epoch": 0.8331503841931943, + "grad_norm": 1.2696324586868286, + "learning_rate": 2.4536641616631793e-05, + "loss": 0.2488, + "step": 15180 + }, + { + "epoch": 0.8332601536772777, + "grad_norm": 1.4015274047851562, + "learning_rate": 2.453140653849133e-05, + "loss": 0.1778, + "step": 15182 + }, + { + "epoch": 0.8333699231613612, + "grad_norm": 1.4416313171386719, + "learning_rate": 2.4526171480905673e-05, + "loss": 0.1762, + "step": 15184 + }, + { + "epoch": 0.8334796926454445, + "grad_norm": 1.6430246829986572, + "learning_rate": 2.4520936444104463e-05, + "loss": 0.2984, + "step": 15186 + }, + { + "epoch": 0.833589462129528, + "grad_norm": 1.4504882097244263, + "learning_rate": 2.4515701428317334e-05, + "loss": 0.2023, + "step": 15188 + }, + { + "epoch": 0.8336992316136114, + "grad_norm": 1.2901055812835693, + "learning_rate": 2.451046643377391e-05, + "loss": 0.3257, + "step": 15190 + }, + { + "epoch": 0.8338090010976948, + "grad_norm": 1.9000359773635864, + "learning_rate": 2.4505231460703838e-05, + "loss": 0.289, + "step": 15192 + }, + { + "epoch": 0.8339187705817782, + "grad_norm": 1.2443944215774536, + "learning_rate": 2.4499996509336742e-05, + "loss": 0.207, + "step": 15194 + }, + { + "epoch": 0.8340285400658617, + "grad_norm": 1.8124083280563354, + "learning_rate": 2.4494761579902247e-05, + "loss": 0.2913, + "step": 15196 + }, + { + "epoch": 0.8341383095499452, + "grad_norm": 1.0495798587799072, + "learning_rate": 2.4489526672629996e-05, + "loss": 0.2055, + "step": 15198 + }, + { + "epoch": 0.8342480790340285, + "grad_norm": 1.1971532106399536, + "learning_rate": 2.4484291787749597e-05, + "loss": 0.1982, + "step": 15200 + }, + { + "epoch": 0.834357848518112, + "grad_norm": 0.8450045585632324, + "learning_rate": 2.4479056925490706e-05, + "loss": 0.133, + "step": 15202 + }, + { + "epoch": 0.8344676180021954, + "grad_norm": 1.2223623991012573, + "learning_rate": 2.4473822086082928e-05, + "loss": 0.2778, + "step": 15204 + }, + { + "epoch": 0.8345773874862789, + "grad_norm": 1.1549841165542603, + "learning_rate": 2.4468587269755903e-05, + "loss": 0.2388, + "step": 15206 + }, + { + "epoch": 0.8346871569703622, + "grad_norm": 1.4447020292282104, + "learning_rate": 2.4463352476739246e-05, + "loss": 0.4119, + "step": 15208 + }, + { + "epoch": 0.8347969264544457, + "grad_norm": 1.4505231380462646, + "learning_rate": 2.445811770726258e-05, + "loss": 0.2596, + "step": 15210 + }, + { + "epoch": 0.8349066959385291, + "grad_norm": 1.286739468574524, + "learning_rate": 2.4452882961555534e-05, + "loss": 0.2358, + "step": 15212 + }, + { + "epoch": 0.8350164654226125, + "grad_norm": 3.692923069000244, + "learning_rate": 2.4447648239847727e-05, + "loss": 0.2261, + "step": 15214 + }, + { + "epoch": 0.8351262349066959, + "grad_norm": 1.0417670011520386, + "learning_rate": 2.4442413542368776e-05, + "loss": 0.2407, + "step": 15216 + }, + { + "epoch": 0.8352360043907794, + "grad_norm": 1.4130823612213135, + "learning_rate": 2.443717886934831e-05, + "loss": 0.2472, + "step": 15218 + }, + { + "epoch": 0.8353457738748628, + "grad_norm": 1.308371901512146, + "learning_rate": 2.443194422101594e-05, + "loss": 0.3279, + "step": 15220 + }, + { + "epoch": 0.8354555433589462, + "grad_norm": 1.4782614707946777, + "learning_rate": 2.442670959760128e-05, + "loss": 0.2823, + "step": 15222 + }, + { + "epoch": 0.8355653128430296, + "grad_norm": 2.6408517360687256, + "learning_rate": 2.4421474999333956e-05, + "loss": 0.3316, + "step": 15224 + }, + { + "epoch": 0.8356750823271131, + "grad_norm": 1.4103926420211792, + "learning_rate": 2.441624042644357e-05, + "loss": 0.3393, + "step": 15226 + }, + { + "epoch": 0.8357848518111964, + "grad_norm": 0.82582688331604, + "learning_rate": 2.4411005879159753e-05, + "loss": 0.2814, + "step": 15228 + }, + { + "epoch": 0.8358946212952799, + "grad_norm": 1.3073222637176514, + "learning_rate": 2.4405771357712097e-05, + "loss": 0.2215, + "step": 15230 + }, + { + "epoch": 0.8360043907793633, + "grad_norm": 1.667373776435852, + "learning_rate": 2.4400536862330237e-05, + "loss": 0.2564, + "step": 15232 + }, + { + "epoch": 0.8361141602634468, + "grad_norm": 1.5598942041397095, + "learning_rate": 2.4395302393243766e-05, + "loss": 0.1393, + "step": 15234 + }, + { + "epoch": 0.8362239297475302, + "grad_norm": 1.3638485670089722, + "learning_rate": 2.4390067950682304e-05, + "loss": 0.2231, + "step": 15236 + }, + { + "epoch": 0.8363336992316136, + "grad_norm": 0.858519971370697, + "learning_rate": 2.438483353487546e-05, + "loss": 0.1441, + "step": 15238 + }, + { + "epoch": 0.8364434687156971, + "grad_norm": 1.1021965742111206, + "learning_rate": 2.4379599146052827e-05, + "loss": 0.2316, + "step": 15240 + }, + { + "epoch": 0.8365532381997804, + "grad_norm": 1.237978458404541, + "learning_rate": 2.437436478444403e-05, + "loss": 0.2297, + "step": 15242 + }, + { + "epoch": 0.8366630076838639, + "grad_norm": 1.4476312398910522, + "learning_rate": 2.436913045027866e-05, + "loss": 0.2248, + "step": 15244 + }, + { + "epoch": 0.8367727771679473, + "grad_norm": 1.2652509212493896, + "learning_rate": 2.436389614378632e-05, + "loss": 0.2719, + "step": 15246 + }, + { + "epoch": 0.8368825466520308, + "grad_norm": 2.0885250568389893, + "learning_rate": 2.4358661865196628e-05, + "loss": 0.2512, + "step": 15248 + }, + { + "epoch": 0.8369923161361141, + "grad_norm": 1.685298204421997, + "learning_rate": 2.4353427614739173e-05, + "loss": 0.1966, + "step": 15250 + }, + { + "epoch": 0.8371020856201976, + "grad_norm": 1.0391210317611694, + "learning_rate": 2.4348193392643545e-05, + "loss": 0.2009, + "step": 15252 + }, + { + "epoch": 0.837211855104281, + "grad_norm": 0.9065111875534058, + "learning_rate": 2.4342959199139365e-05, + "loss": 0.107, + "step": 15254 + }, + { + "epoch": 0.8373216245883645, + "grad_norm": 1.3332676887512207, + "learning_rate": 2.4337725034456217e-05, + "loss": 0.2988, + "step": 15256 + }, + { + "epoch": 0.8374313940724478, + "grad_norm": 1.9255599975585938, + "learning_rate": 2.4332490898823697e-05, + "loss": 0.2688, + "step": 15258 + }, + { + "epoch": 0.8375411635565313, + "grad_norm": 1.7283077239990234, + "learning_rate": 2.4327256792471404e-05, + "loss": 0.2, + "step": 15260 + }, + { + "epoch": 0.8376509330406147, + "grad_norm": 1.6005374193191528, + "learning_rate": 2.432202271562892e-05, + "loss": 0.263, + "step": 15262 + }, + { + "epoch": 0.8377607025246981, + "grad_norm": 1.1089978218078613, + "learning_rate": 2.431678866852586e-05, + "loss": 0.2273, + "step": 15264 + }, + { + "epoch": 0.8378704720087815, + "grad_norm": 1.048728346824646, + "learning_rate": 2.4311554651391796e-05, + "loss": 0.3078, + "step": 15266 + }, + { + "epoch": 0.837980241492865, + "grad_norm": 1.220550775527954, + "learning_rate": 2.430632066445633e-05, + "loss": 0.27, + "step": 15268 + }, + { + "epoch": 0.8380900109769485, + "grad_norm": 1.2950186729431152, + "learning_rate": 2.4301086707949038e-05, + "loss": 0.2411, + "step": 15270 + }, + { + "epoch": 0.8381997804610318, + "grad_norm": 0.859463095664978, + "learning_rate": 2.429585278209952e-05, + "loss": 0.1598, + "step": 15272 + }, + { + "epoch": 0.8383095499451153, + "grad_norm": 1.412030816078186, + "learning_rate": 2.4290618887137355e-05, + "loss": 0.2091, + "step": 15274 + }, + { + "epoch": 0.8384193194291987, + "grad_norm": 2.132246255874634, + "learning_rate": 2.4285385023292124e-05, + "loss": 0.1514, + "step": 15276 + }, + { + "epoch": 0.8385290889132822, + "grad_norm": 1.4014239311218262, + "learning_rate": 2.4280151190793417e-05, + "loss": 0.2688, + "step": 15278 + }, + { + "epoch": 0.8386388583973655, + "grad_norm": 2.097022294998169, + "learning_rate": 2.427491738987081e-05, + "loss": 0.2747, + "step": 15280 + }, + { + "epoch": 0.838748627881449, + "grad_norm": 1.133561611175537, + "learning_rate": 2.426968362075388e-05, + "loss": 0.1946, + "step": 15282 + }, + { + "epoch": 0.8388583973655324, + "grad_norm": 1.9828402996063232, + "learning_rate": 2.4264449883672223e-05, + "loss": 0.1962, + "step": 15284 + }, + { + "epoch": 0.8389681668496158, + "grad_norm": 0.8773364424705505, + "learning_rate": 2.4259216178855398e-05, + "loss": 0.2742, + "step": 15286 + }, + { + "epoch": 0.8390779363336992, + "grad_norm": 1.7180601358413696, + "learning_rate": 2.4253982506532983e-05, + "loss": 0.2458, + "step": 15288 + }, + { + "epoch": 0.8391877058177827, + "grad_norm": 1.7983791828155518, + "learning_rate": 2.424874886693456e-05, + "loss": 0.2843, + "step": 15290 + }, + { + "epoch": 0.839297475301866, + "grad_norm": 2.4760491847991943, + "learning_rate": 2.424351526028969e-05, + "loss": 0.3415, + "step": 15292 + }, + { + "epoch": 0.8394072447859495, + "grad_norm": 1.663426160812378, + "learning_rate": 2.4238281686827967e-05, + "loss": 0.2773, + "step": 15294 + }, + { + "epoch": 0.8395170142700329, + "grad_norm": 1.2545241117477417, + "learning_rate": 2.4233048146778942e-05, + "loss": 0.1921, + "step": 15296 + }, + { + "epoch": 0.8396267837541164, + "grad_norm": 1.0354020595550537, + "learning_rate": 2.4227814640372196e-05, + "loss": 0.1354, + "step": 15298 + }, + { + "epoch": 0.8397365532381997, + "grad_norm": 1.872342824935913, + "learning_rate": 2.4222581167837287e-05, + "loss": 0.2831, + "step": 15300 + }, + { + "epoch": 0.8398463227222832, + "grad_norm": 1.6432784795761108, + "learning_rate": 2.421734772940378e-05, + "loss": 0.2265, + "step": 15302 + }, + { + "epoch": 0.8399560922063666, + "grad_norm": 1.387900471687317, + "learning_rate": 2.4212114325301248e-05, + "loss": 0.2954, + "step": 15304 + }, + { + "epoch": 0.84006586169045, + "grad_norm": 1.4026623964309692, + "learning_rate": 2.4206880955759247e-05, + "loss": 0.2958, + "step": 15306 + }, + { + "epoch": 0.8401756311745335, + "grad_norm": 1.224428415298462, + "learning_rate": 2.4201647621007336e-05, + "loss": 0.2101, + "step": 15308 + }, + { + "epoch": 0.8402854006586169, + "grad_norm": 1.346890926361084, + "learning_rate": 2.4196414321275087e-05, + "loss": 0.2445, + "step": 15310 + }, + { + "epoch": 0.8403951701427004, + "grad_norm": 2.2049777507781982, + "learning_rate": 2.4191181056792042e-05, + "loss": 0.2149, + "step": 15312 + }, + { + "epoch": 0.8405049396267837, + "grad_norm": 2.1697463989257812, + "learning_rate": 2.4185947827787773e-05, + "loss": 0.2407, + "step": 15314 + }, + { + "epoch": 0.8406147091108672, + "grad_norm": 0.9852806925773621, + "learning_rate": 2.4180714634491824e-05, + "loss": 0.1413, + "step": 15316 + }, + { + "epoch": 0.8407244785949506, + "grad_norm": 2.5577380657196045, + "learning_rate": 2.417548147713375e-05, + "loss": 0.2053, + "step": 15318 + }, + { + "epoch": 0.8408342480790341, + "grad_norm": 1.313568353652954, + "learning_rate": 2.417024835594311e-05, + "loss": 0.1884, + "step": 15320 + }, + { + "epoch": 0.8409440175631174, + "grad_norm": 0.9576706886291504, + "learning_rate": 2.416501527114944e-05, + "loss": 0.3053, + "step": 15322 + }, + { + "epoch": 0.8410537870472009, + "grad_norm": 1.5449047088623047, + "learning_rate": 2.4159782222982307e-05, + "loss": 0.206, + "step": 15324 + }, + { + "epoch": 0.8411635565312843, + "grad_norm": 1.2260581254959106, + "learning_rate": 2.4154549211671248e-05, + "loss": 0.1961, + "step": 15326 + }, + { + "epoch": 0.8412733260153678, + "grad_norm": 1.0868960618972778, + "learning_rate": 2.4149316237445812e-05, + "loss": 0.2095, + "step": 15328 + }, + { + "epoch": 0.8413830954994511, + "grad_norm": 2.108654737472534, + "learning_rate": 2.4144083300535545e-05, + "loss": 0.2847, + "step": 15330 + }, + { + "epoch": 0.8414928649835346, + "grad_norm": 0.7527856230735779, + "learning_rate": 2.4138850401169976e-05, + "loss": 0.118, + "step": 15332 + }, + { + "epoch": 0.841602634467618, + "grad_norm": 1.1971827745437622, + "learning_rate": 2.4133617539578665e-05, + "loss": 0.1916, + "step": 15334 + }, + { + "epoch": 0.8417124039517014, + "grad_norm": 1.5183417797088623, + "learning_rate": 2.412838471599114e-05, + "loss": 0.287, + "step": 15336 + }, + { + "epoch": 0.8418221734357848, + "grad_norm": 0.8952679634094238, + "learning_rate": 2.412315193063693e-05, + "loss": 0.243, + "step": 15338 + }, + { + "epoch": 0.8419319429198683, + "grad_norm": 1.3359131813049316, + "learning_rate": 2.411791918374559e-05, + "loss": 0.3239, + "step": 15340 + }, + { + "epoch": 0.8420417124039516, + "grad_norm": 1.3464387655258179, + "learning_rate": 2.4112686475546643e-05, + "loss": 0.196, + "step": 15342 + }, + { + "epoch": 0.8421514818880351, + "grad_norm": 1.4672269821166992, + "learning_rate": 2.4107453806269616e-05, + "loss": 0.3703, + "step": 15344 + }, + { + "epoch": 0.8422612513721186, + "grad_norm": 1.4944474697113037, + "learning_rate": 2.410222117614405e-05, + "loss": 0.1545, + "step": 15346 + }, + { + "epoch": 0.842371020856202, + "grad_norm": 1.0951765775680542, + "learning_rate": 2.4096988585399473e-05, + "loss": 0.2604, + "step": 15348 + }, + { + "epoch": 0.8424807903402854, + "grad_norm": 1.1965080499649048, + "learning_rate": 2.40917560342654e-05, + "loss": 0.3732, + "step": 15350 + }, + { + "epoch": 0.8425905598243688, + "grad_norm": 1.5663015842437744, + "learning_rate": 2.4086523522971366e-05, + "loss": 0.2938, + "step": 15352 + }, + { + "epoch": 0.8427003293084523, + "grad_norm": 1.1678000688552856, + "learning_rate": 2.4081291051746898e-05, + "loss": 0.2554, + "step": 15354 + }, + { + "epoch": 0.8428100987925357, + "grad_norm": 0.9836000204086304, + "learning_rate": 2.4076058620821516e-05, + "loss": 0.168, + "step": 15356 + }, + { + "epoch": 0.8429198682766191, + "grad_norm": 1.3728970289230347, + "learning_rate": 2.4070826230424733e-05, + "loss": 0.2535, + "step": 15358 + }, + { + "epoch": 0.8430296377607025, + "grad_norm": 2.8426144123077393, + "learning_rate": 2.406559388078608e-05, + "loss": 0.2911, + "step": 15360 + }, + { + "epoch": 0.843139407244786, + "grad_norm": 0.9763059020042419, + "learning_rate": 2.4060361572135056e-05, + "loss": 0.1278, + "step": 15362 + }, + { + "epoch": 0.8432491767288693, + "grad_norm": 1.5118818283081055, + "learning_rate": 2.4055129304701192e-05, + "loss": 0.2471, + "step": 15364 + }, + { + "epoch": 0.8433589462129528, + "grad_norm": 1.3751498460769653, + "learning_rate": 2.4049897078714e-05, + "loss": 0.2704, + "step": 15366 + }, + { + "epoch": 0.8434687156970362, + "grad_norm": 1.3422919511795044, + "learning_rate": 2.4044664894402972e-05, + "loss": 0.2563, + "step": 15368 + }, + { + "epoch": 0.8435784851811197, + "grad_norm": 1.3499748706817627, + "learning_rate": 2.403943275199764e-05, + "loss": 0.2395, + "step": 15370 + }, + { + "epoch": 0.843688254665203, + "grad_norm": 0.9298483729362488, + "learning_rate": 2.40342006517275e-05, + "loss": 0.1926, + "step": 15372 + }, + { + "epoch": 0.8437980241492865, + "grad_norm": 2.663571357727051, + "learning_rate": 2.402896859382206e-05, + "loss": 0.3517, + "step": 15374 + }, + { + "epoch": 0.8439077936333699, + "grad_norm": 1.388674020767212, + "learning_rate": 2.402373657851082e-05, + "loss": 0.314, + "step": 15376 + }, + { + "epoch": 0.8440175631174534, + "grad_norm": 1.2948750257492065, + "learning_rate": 2.4018504606023293e-05, + "loss": 0.2018, + "step": 15378 + }, + { + "epoch": 0.8441273326015367, + "grad_norm": 1.4481074810028076, + "learning_rate": 2.4013272676588962e-05, + "loss": 0.22, + "step": 15380 + }, + { + "epoch": 0.8442371020856202, + "grad_norm": 1.3510023355484009, + "learning_rate": 2.400804079043733e-05, + "loss": 0.2713, + "step": 15382 + }, + { + "epoch": 0.8443468715697037, + "grad_norm": 1.8737534284591675, + "learning_rate": 2.400280894779791e-05, + "loss": 0.1974, + "step": 15384 + }, + { + "epoch": 0.844456641053787, + "grad_norm": 0.9958437085151672, + "learning_rate": 2.3997577148900185e-05, + "loss": 0.1958, + "step": 15386 + }, + { + "epoch": 0.8445664105378705, + "grad_norm": 1.099496841430664, + "learning_rate": 2.399234539397364e-05, + "loss": 0.2857, + "step": 15388 + }, + { + "epoch": 0.8446761800219539, + "grad_norm": 2.6952524185180664, + "learning_rate": 2.3987113683247775e-05, + "loss": 0.2894, + "step": 15390 + }, + { + "epoch": 0.8447859495060374, + "grad_norm": 1.4646263122558594, + "learning_rate": 2.398188201695208e-05, + "loss": 0.2609, + "step": 15392 + }, + { + "epoch": 0.8448957189901207, + "grad_norm": 1.3802258968353271, + "learning_rate": 2.3976650395316026e-05, + "loss": 0.2324, + "step": 15394 + }, + { + "epoch": 0.8450054884742042, + "grad_norm": 1.5277408361434937, + "learning_rate": 2.3971418818569115e-05, + "loss": 0.1974, + "step": 15396 + }, + { + "epoch": 0.8451152579582876, + "grad_norm": 1.269200086593628, + "learning_rate": 2.396618728694083e-05, + "loss": 0.3106, + "step": 15398 + }, + { + "epoch": 0.845225027442371, + "grad_norm": 3.1551129817962646, + "learning_rate": 2.396095580066063e-05, + "loss": 0.3621, + "step": 15400 + }, + { + "epoch": 0.8453347969264544, + "grad_norm": 1.2275049686431885, + "learning_rate": 2.395572435995802e-05, + "loss": 0.2023, + "step": 15402 + }, + { + "epoch": 0.8454445664105379, + "grad_norm": 1.4949026107788086, + "learning_rate": 2.3950492965062457e-05, + "loss": 0.201, + "step": 15404 + }, + { + "epoch": 0.8455543358946213, + "grad_norm": 2.013211488723755, + "learning_rate": 2.394526161620343e-05, + "loss": 0.2093, + "step": 15406 + }, + { + "epoch": 0.8456641053787047, + "grad_norm": 1.7314863204956055, + "learning_rate": 2.3940030313610405e-05, + "loss": 0.2287, + "step": 15408 + }, + { + "epoch": 0.8457738748627881, + "grad_norm": 1.219923734664917, + "learning_rate": 2.3934799057512846e-05, + "loss": 0.2379, + "step": 15410 + }, + { + "epoch": 0.8458836443468716, + "grad_norm": 1.7436789274215698, + "learning_rate": 2.3929567848140228e-05, + "loss": 0.2924, + "step": 15412 + }, + { + "epoch": 0.8459934138309549, + "grad_norm": 1.4598712921142578, + "learning_rate": 2.3924336685722027e-05, + "loss": 0.2512, + "step": 15414 + }, + { + "epoch": 0.8461031833150384, + "grad_norm": 1.3849643468856812, + "learning_rate": 2.3919105570487694e-05, + "loss": 0.2202, + "step": 15416 + }, + { + "epoch": 0.8462129527991219, + "grad_norm": 1.3689936399459839, + "learning_rate": 2.39138745026667e-05, + "loss": 0.1795, + "step": 15418 + }, + { + "epoch": 0.8463227222832053, + "grad_norm": 1.7829984426498413, + "learning_rate": 2.3908643482488498e-05, + "loss": 0.2339, + "step": 15420 + }, + { + "epoch": 0.8464324917672887, + "grad_norm": 1.4390692710876465, + "learning_rate": 2.3903412510182555e-05, + "loss": 0.2178, + "step": 15422 + }, + { + "epoch": 0.8465422612513721, + "grad_norm": 0.8234957456588745, + "learning_rate": 2.3898181585978313e-05, + "loss": 0.1479, + "step": 15424 + }, + { + "epoch": 0.8466520307354556, + "grad_norm": 0.8160361051559448, + "learning_rate": 2.3892950710105243e-05, + "loss": 0.2072, + "step": 15426 + }, + { + "epoch": 0.846761800219539, + "grad_norm": 0.9787569046020508, + "learning_rate": 2.3887719882792785e-05, + "loss": 0.2181, + "step": 15428 + }, + { + "epoch": 0.8468715697036224, + "grad_norm": 2.1267013549804688, + "learning_rate": 2.388248910427039e-05, + "loss": 0.3176, + "step": 15430 + }, + { + "epoch": 0.8469813391877058, + "grad_norm": 1.514979600906372, + "learning_rate": 2.3877258374767513e-05, + "loss": 0.2524, + "step": 15432 + }, + { + "epoch": 0.8470911086717893, + "grad_norm": 1.0817986726760864, + "learning_rate": 2.3872027694513594e-05, + "loss": 0.2259, + "step": 15434 + }, + { + "epoch": 0.8472008781558726, + "grad_norm": 1.3490924835205078, + "learning_rate": 2.386679706373807e-05, + "loss": 0.2288, + "step": 15436 + }, + { + "epoch": 0.8473106476399561, + "grad_norm": 1.9886599779129028, + "learning_rate": 2.3861566482670394e-05, + "loss": 0.2743, + "step": 15438 + }, + { + "epoch": 0.8474204171240395, + "grad_norm": 1.8840267658233643, + "learning_rate": 2.3856335951539997e-05, + "loss": 0.2607, + "step": 15440 + }, + { + "epoch": 0.847530186608123, + "grad_norm": 1.7816883325576782, + "learning_rate": 2.3851105470576316e-05, + "loss": 0.2528, + "step": 15442 + }, + { + "epoch": 0.8476399560922063, + "grad_norm": 3.8557565212249756, + "learning_rate": 2.3845875040008788e-05, + "loss": 0.323, + "step": 15444 + }, + { + "epoch": 0.8477497255762898, + "grad_norm": 1.7922457456588745, + "learning_rate": 2.3840644660066854e-05, + "loss": 0.2843, + "step": 15446 + }, + { + "epoch": 0.8478594950603732, + "grad_norm": 1.5081596374511719, + "learning_rate": 2.3835414330979927e-05, + "loss": 0.2295, + "step": 15448 + }, + { + "epoch": 0.8479692645444566, + "grad_norm": 1.2677853107452393, + "learning_rate": 2.3830184052977453e-05, + "loss": 0.2658, + "step": 15450 + }, + { + "epoch": 0.84807903402854, + "grad_norm": 2.432896375656128, + "learning_rate": 2.3824953826288844e-05, + "loss": 0.208, + "step": 15452 + }, + { + "epoch": 0.8481888035126235, + "grad_norm": 1.3423131704330444, + "learning_rate": 2.3819723651143525e-05, + "loss": 0.2874, + "step": 15454 + }, + { + "epoch": 0.848298572996707, + "grad_norm": 1.203570008277893, + "learning_rate": 2.3814493527770923e-05, + "loss": 0.2145, + "step": 15456 + }, + { + "epoch": 0.8484083424807903, + "grad_norm": 1.7309057712554932, + "learning_rate": 2.380926345640046e-05, + "loss": 0.2433, + "step": 15458 + }, + { + "epoch": 0.8485181119648738, + "grad_norm": 1.3700414896011353, + "learning_rate": 2.3804033437261533e-05, + "loss": 0.1952, + "step": 15460 + }, + { + "epoch": 0.8486278814489572, + "grad_norm": 0.8545668721199036, + "learning_rate": 2.379880347058358e-05, + "loss": 0.1389, + "step": 15462 + }, + { + "epoch": 0.8487376509330407, + "grad_norm": 1.886829137802124, + "learning_rate": 2.3793573556595997e-05, + "loss": 0.29, + "step": 15464 + }, + { + "epoch": 0.848847420417124, + "grad_norm": 1.5489815473556519, + "learning_rate": 2.3788343695528197e-05, + "loss": 0.2684, + "step": 15466 + }, + { + "epoch": 0.8489571899012075, + "grad_norm": 1.3163940906524658, + "learning_rate": 2.3783113887609595e-05, + "loss": 0.2744, + "step": 15468 + }, + { + "epoch": 0.8490669593852909, + "grad_norm": 0.8095985651016235, + "learning_rate": 2.3777884133069592e-05, + "loss": 0.2222, + "step": 15470 + }, + { + "epoch": 0.8491767288693743, + "grad_norm": 0.9671428203582764, + "learning_rate": 2.377265443213758e-05, + "loss": 0.2223, + "step": 15472 + }, + { + "epoch": 0.8492864983534577, + "grad_norm": 1.2022855281829834, + "learning_rate": 2.3767424785042968e-05, + "loss": 0.2962, + "step": 15474 + }, + { + "epoch": 0.8493962678375412, + "grad_norm": 0.9482195973396301, + "learning_rate": 2.3762195192015167e-05, + "loss": 0.197, + "step": 15476 + }, + { + "epoch": 0.8495060373216246, + "grad_norm": 1.3784757852554321, + "learning_rate": 2.3756965653283557e-05, + "loss": 0.223, + "step": 15478 + }, + { + "epoch": 0.849615806805708, + "grad_norm": 1.1887075901031494, + "learning_rate": 2.375173616907753e-05, + "loss": 0.2531, + "step": 15480 + }, + { + "epoch": 0.8497255762897914, + "grad_norm": 1.2878201007843018, + "learning_rate": 2.3746506739626483e-05, + "loss": 0.3086, + "step": 15482 + }, + { + "epoch": 0.8498353457738749, + "grad_norm": 3.015476942062378, + "learning_rate": 2.374127736515981e-05, + "loss": 0.2173, + "step": 15484 + }, + { + "epoch": 0.8499451152579582, + "grad_norm": 1.2811747789382935, + "learning_rate": 2.3736048045906877e-05, + "loss": 0.1903, + "step": 15486 + }, + { + "epoch": 0.8500548847420417, + "grad_norm": 1.1430977582931519, + "learning_rate": 2.3730818782097093e-05, + "loss": 0.2361, + "step": 15488 + }, + { + "epoch": 0.8501646542261251, + "grad_norm": 1.323629379272461, + "learning_rate": 2.3725589573959817e-05, + "loss": 0.2256, + "step": 15490 + }, + { + "epoch": 0.8502744237102086, + "grad_norm": 1.8610962629318237, + "learning_rate": 2.3720360421724445e-05, + "loss": 0.1947, + "step": 15492 + }, + { + "epoch": 0.850384193194292, + "grad_norm": 2.022047758102417, + "learning_rate": 2.3715131325620343e-05, + "loss": 0.2486, + "step": 15494 + }, + { + "epoch": 0.8504939626783754, + "grad_norm": 1.4783906936645508, + "learning_rate": 2.3709902285876886e-05, + "loss": 0.3031, + "step": 15496 + }, + { + "epoch": 0.8506037321624589, + "grad_norm": 2.0756475925445557, + "learning_rate": 2.370467330272345e-05, + "loss": 0.2683, + "step": 15498 + }, + { + "epoch": 0.8507135016465422, + "grad_norm": 1.4189441204071045, + "learning_rate": 2.3699444376389403e-05, + "loss": 0.3442, + "step": 15500 + }, + { + "epoch": 0.8508232711306257, + "grad_norm": 1.7486666440963745, + "learning_rate": 2.3694215507104096e-05, + "loss": 0.2437, + "step": 15502 + }, + { + "epoch": 0.8509330406147091, + "grad_norm": 2.853088855743408, + "learning_rate": 2.368898669509691e-05, + "loss": 0.1934, + "step": 15504 + }, + { + "epoch": 0.8510428100987926, + "grad_norm": 2.120251417160034, + "learning_rate": 2.368375794059721e-05, + "loss": 0.2255, + "step": 15506 + }, + { + "epoch": 0.8511525795828759, + "grad_norm": 1.4540951251983643, + "learning_rate": 2.3678529243834348e-05, + "loss": 0.1686, + "step": 15508 + }, + { + "epoch": 0.8512623490669594, + "grad_norm": 1.281296730041504, + "learning_rate": 2.367330060503767e-05, + "loss": 0.3145, + "step": 15510 + }, + { + "epoch": 0.8513721185510428, + "grad_norm": 1.436870813369751, + "learning_rate": 2.366807202443655e-05, + "loss": 0.2864, + "step": 15512 + }, + { + "epoch": 0.8514818880351263, + "grad_norm": 0.9362894296646118, + "learning_rate": 2.3662843502260327e-05, + "loss": 0.2522, + "step": 15514 + }, + { + "epoch": 0.8515916575192096, + "grad_norm": 1.3534621000289917, + "learning_rate": 2.3657615038738343e-05, + "loss": 0.1369, + "step": 15516 + }, + { + "epoch": 0.8517014270032931, + "grad_norm": 1.852168321609497, + "learning_rate": 2.365238663409996e-05, + "loss": 0.3069, + "step": 15518 + }, + { + "epoch": 0.8518111964873765, + "grad_norm": 1.1961737871170044, + "learning_rate": 2.3647158288574514e-05, + "loss": 0.1787, + "step": 15520 + }, + { + "epoch": 0.8519209659714599, + "grad_norm": 1.6282212734222412, + "learning_rate": 2.3641930002391337e-05, + "loss": 0.2321, + "step": 15522 + }, + { + "epoch": 0.8520307354555433, + "grad_norm": 1.0945161581039429, + "learning_rate": 2.3636701775779783e-05, + "loss": 0.3634, + "step": 15524 + }, + { + "epoch": 0.8521405049396268, + "grad_norm": 1.2958940267562866, + "learning_rate": 2.3631473608969186e-05, + "loss": 0.1958, + "step": 15526 + }, + { + "epoch": 0.8522502744237103, + "grad_norm": 3.2681872844696045, + "learning_rate": 2.3626245502188864e-05, + "loss": 0.2509, + "step": 15528 + }, + { + "epoch": 0.8523600439077936, + "grad_norm": 1.1438636779785156, + "learning_rate": 2.362101745566816e-05, + "loss": 0.2567, + "step": 15530 + }, + { + "epoch": 0.8524698133918771, + "grad_norm": 1.3920656442642212, + "learning_rate": 2.3615789469636396e-05, + "loss": 0.2693, + "step": 15532 + }, + { + "epoch": 0.8525795828759605, + "grad_norm": 1.227824091911316, + "learning_rate": 2.3610561544322898e-05, + "loss": 0.1776, + "step": 15534 + }, + { + "epoch": 0.852689352360044, + "grad_norm": 1.1408077478408813, + "learning_rate": 2.3605333679956992e-05, + "loss": 0.2292, + "step": 15536 + }, + { + "epoch": 0.8527991218441273, + "grad_norm": 1.079973816871643, + "learning_rate": 2.3600105876768e-05, + "loss": 0.2412, + "step": 15538 + }, + { + "epoch": 0.8529088913282108, + "grad_norm": 2.5422470569610596, + "learning_rate": 2.3594878134985234e-05, + "loss": 0.2444, + "step": 15540 + }, + { + "epoch": 0.8530186608122942, + "grad_norm": 1.1259700059890747, + "learning_rate": 2.3589650454838013e-05, + "loss": 0.2814, + "step": 15542 + }, + { + "epoch": 0.8531284302963776, + "grad_norm": 1.1394779682159424, + "learning_rate": 2.3584422836555642e-05, + "loss": 0.197, + "step": 15544 + }, + { + "epoch": 0.853238199780461, + "grad_norm": 1.2002965211868286, + "learning_rate": 2.3579195280367434e-05, + "loss": 0.1762, + "step": 15546 + }, + { + "epoch": 0.8533479692645445, + "grad_norm": 1.2590687274932861, + "learning_rate": 2.3573967786502697e-05, + "loss": 0.177, + "step": 15548 + }, + { + "epoch": 0.8534577387486278, + "grad_norm": 1.5159345865249634, + "learning_rate": 2.356874035519073e-05, + "loss": 0.2638, + "step": 15550 + }, + { + "epoch": 0.8535675082327113, + "grad_norm": 1.2456125020980835, + "learning_rate": 2.3563512986660835e-05, + "loss": 0.2141, + "step": 15552 + }, + { + "epoch": 0.8536772777167947, + "grad_norm": 1.4507604837417603, + "learning_rate": 2.3558285681142314e-05, + "loss": 0.2486, + "step": 15554 + }, + { + "epoch": 0.8537870472008782, + "grad_norm": 2.1016244888305664, + "learning_rate": 2.3553058438864458e-05, + "loss": 0.2932, + "step": 15556 + }, + { + "epoch": 0.8538968166849615, + "grad_norm": 1.7916548252105713, + "learning_rate": 2.3547831260056557e-05, + "loss": 0.2032, + "step": 15558 + }, + { + "epoch": 0.854006586169045, + "grad_norm": 3.20784854888916, + "learning_rate": 2.3542604144947915e-05, + "loss": 0.388, + "step": 15560 + }, + { + "epoch": 0.8541163556531284, + "grad_norm": 1.6434959173202515, + "learning_rate": 2.35373770937678e-05, + "loss": 0.2439, + "step": 15562 + }, + { + "epoch": 0.8542261251372119, + "grad_norm": 0.9635435342788696, + "learning_rate": 2.35321501067455e-05, + "loss": 0.1686, + "step": 15564 + }, + { + "epoch": 0.8543358946212953, + "grad_norm": 0.9943174719810486, + "learning_rate": 2.3526923184110302e-05, + "loss": 0.2715, + "step": 15566 + }, + { + "epoch": 0.8544456641053787, + "grad_norm": 2.2062416076660156, + "learning_rate": 2.352169632609149e-05, + "loss": 0.345, + "step": 15568 + }, + { + "epoch": 0.8545554335894622, + "grad_norm": 1.7669808864593506, + "learning_rate": 2.3516469532918337e-05, + "loss": 0.1715, + "step": 15570 + }, + { + "epoch": 0.8546652030735455, + "grad_norm": 0.9286289215087891, + "learning_rate": 2.3511242804820103e-05, + "loss": 0.2311, + "step": 15572 + }, + { + "epoch": 0.854774972557629, + "grad_norm": 2.986746072769165, + "learning_rate": 2.3506016142026072e-05, + "loss": 0.1907, + "step": 15574 + }, + { + "epoch": 0.8548847420417124, + "grad_norm": 0.7374218702316284, + "learning_rate": 2.350078954476551e-05, + "loss": 0.1734, + "step": 15576 + }, + { + "epoch": 0.8549945115257959, + "grad_norm": 1.6448988914489746, + "learning_rate": 2.3495563013267664e-05, + "loss": 0.3705, + "step": 15578 + }, + { + "epoch": 0.8551042810098792, + "grad_norm": 1.31244695186615, + "learning_rate": 2.349033654776182e-05, + "loss": 0.2654, + "step": 15580 + }, + { + "epoch": 0.8552140504939627, + "grad_norm": 1.7487826347351074, + "learning_rate": 2.348511014847722e-05, + "loss": 0.3736, + "step": 15582 + }, + { + "epoch": 0.8553238199780461, + "grad_norm": 2.624539375305176, + "learning_rate": 2.3479883815643125e-05, + "loss": 0.3786, + "step": 15584 + }, + { + "epoch": 0.8554335894621296, + "grad_norm": 1.291395902633667, + "learning_rate": 2.347465754948879e-05, + "loss": 0.19, + "step": 15586 + }, + { + "epoch": 0.8555433589462129, + "grad_norm": 1.2622536420822144, + "learning_rate": 2.3469431350243456e-05, + "loss": 0.2714, + "step": 15588 + }, + { + "epoch": 0.8556531284302964, + "grad_norm": 2.5281753540039062, + "learning_rate": 2.346420521813638e-05, + "loss": 0.2126, + "step": 15590 + }, + { + "epoch": 0.8557628979143798, + "grad_norm": 1.3386074304580688, + "learning_rate": 2.3458979153396806e-05, + "loss": 0.2383, + "step": 15592 + }, + { + "epoch": 0.8558726673984632, + "grad_norm": 1.1297292709350586, + "learning_rate": 2.345375315625395e-05, + "loss": 0.1183, + "step": 15594 + }, + { + "epoch": 0.8559824368825466, + "grad_norm": 1.4270070791244507, + "learning_rate": 2.3448527226937082e-05, + "loss": 0.2038, + "step": 15596 + }, + { + "epoch": 0.8560922063666301, + "grad_norm": 1.4936060905456543, + "learning_rate": 2.344330136567543e-05, + "loss": 0.2319, + "step": 15598 + }, + { + "epoch": 0.8562019758507134, + "grad_norm": 1.7004185914993286, + "learning_rate": 2.343807557269822e-05, + "loss": 0.198, + "step": 15600 + }, + { + "epoch": 0.8563117453347969, + "grad_norm": 1.5424587726593018, + "learning_rate": 2.3432849848234675e-05, + "loss": 0.3044, + "step": 15602 + }, + { + "epoch": 0.8564215148188804, + "grad_norm": 1.4876844882965088, + "learning_rate": 2.3427624192514033e-05, + "loss": 0.2496, + "step": 15604 + }, + { + "epoch": 0.8565312843029638, + "grad_norm": 0.8940309882164001, + "learning_rate": 2.3422398605765515e-05, + "loss": 0.2284, + "step": 15606 + }, + { + "epoch": 0.8566410537870472, + "grad_norm": 1.1787512302398682, + "learning_rate": 2.3417173088218332e-05, + "loss": 0.3504, + "step": 15608 + }, + { + "epoch": 0.8567508232711306, + "grad_norm": 0.7455372214317322, + "learning_rate": 2.3411947640101704e-05, + "loss": 0.1547, + "step": 15610 + }, + { + "epoch": 0.8568605927552141, + "grad_norm": 1.9718763828277588, + "learning_rate": 2.3406722261644856e-05, + "loss": 0.3199, + "step": 15612 + }, + { + "epoch": 0.8569703622392975, + "grad_norm": 1.2813295125961304, + "learning_rate": 2.340149695307698e-05, + "loss": 0.28, + "step": 15614 + }, + { + "epoch": 0.8570801317233809, + "grad_norm": 1.3617361783981323, + "learning_rate": 2.33962717146273e-05, + "loss": 0.2303, + "step": 15616 + }, + { + "epoch": 0.8571899012074643, + "grad_norm": 1.6489598751068115, + "learning_rate": 2.339104654652501e-05, + "loss": 0.1828, + "step": 15618 + }, + { + "epoch": 0.8572996706915478, + "grad_norm": 1.1808409690856934, + "learning_rate": 2.338582144899932e-05, + "loss": 0.1537, + "step": 15620 + }, + { + "epoch": 0.8574094401756311, + "grad_norm": 3.8536646366119385, + "learning_rate": 2.3380596422279423e-05, + "loss": 0.198, + "step": 15622 + }, + { + "epoch": 0.8575192096597146, + "grad_norm": 1.381210446357727, + "learning_rate": 2.3375371466594506e-05, + "loss": 0.2619, + "step": 15624 + }, + { + "epoch": 0.857628979143798, + "grad_norm": 0.7544345855712891, + "learning_rate": 2.3370146582173775e-05, + "loss": 0.1317, + "step": 15626 + }, + { + "epoch": 0.8577387486278815, + "grad_norm": 4.778482437133789, + "learning_rate": 2.3364921769246423e-05, + "loss": 0.2324, + "step": 15628 + }, + { + "epoch": 0.8578485181119648, + "grad_norm": 1.5823781490325928, + "learning_rate": 2.3359697028041623e-05, + "loss": 0.268, + "step": 15630 + }, + { + "epoch": 0.8579582875960483, + "grad_norm": 1.3487886190414429, + "learning_rate": 2.3354472358788562e-05, + "loss": 0.175, + "step": 15632 + }, + { + "epoch": 0.8580680570801317, + "grad_norm": 1.6850031614303589, + "learning_rate": 2.3349247761716423e-05, + "loss": 0.2039, + "step": 15634 + }, + { + "epoch": 0.8581778265642152, + "grad_norm": 1.1857964992523193, + "learning_rate": 2.334402323705438e-05, + "loss": 0.316, + "step": 15636 + }, + { + "epoch": 0.8582875960482985, + "grad_norm": 1.3199270963668823, + "learning_rate": 2.3338798785031598e-05, + "loss": 0.2072, + "step": 15638 + }, + { + "epoch": 0.858397365532382, + "grad_norm": 1.3163191080093384, + "learning_rate": 2.3333574405877262e-05, + "loss": 0.2861, + "step": 15640 + }, + { + "epoch": 0.8585071350164655, + "grad_norm": 1.1767241954803467, + "learning_rate": 2.3328350099820534e-05, + "loss": 0.2269, + "step": 15642 + }, + { + "epoch": 0.8586169045005488, + "grad_norm": 1.6533015966415405, + "learning_rate": 2.3323125867090568e-05, + "loss": 0.244, + "step": 15644 + }, + { + "epoch": 0.8587266739846323, + "grad_norm": 1.3172473907470703, + "learning_rate": 2.3317901707916536e-05, + "loss": 0.2132, + "step": 15646 + }, + { + "epoch": 0.8588364434687157, + "grad_norm": 2.1540167331695557, + "learning_rate": 2.3312677622527595e-05, + "loss": 0.2412, + "step": 15648 + }, + { + "epoch": 0.8589462129527992, + "grad_norm": 1.3059892654418945, + "learning_rate": 2.3307453611152887e-05, + "loss": 0.2444, + "step": 15650 + }, + { + "epoch": 0.8590559824368825, + "grad_norm": 1.4079713821411133, + "learning_rate": 2.3302229674021577e-05, + "loss": 0.2011, + "step": 15652 + }, + { + "epoch": 0.859165751920966, + "grad_norm": 3.0060462951660156, + "learning_rate": 2.32970058113628e-05, + "loss": 0.2943, + "step": 15654 + }, + { + "epoch": 0.8592755214050494, + "grad_norm": 1.0017340183258057, + "learning_rate": 2.3291782023405713e-05, + "loss": 0.2156, + "step": 15656 + }, + { + "epoch": 0.8593852908891328, + "grad_norm": 1.7235863208770752, + "learning_rate": 2.3286558310379445e-05, + "loss": 0.3568, + "step": 15658 + }, + { + "epoch": 0.8594950603732162, + "grad_norm": 1.6931781768798828, + "learning_rate": 2.328133467251315e-05, + "loss": 0.2627, + "step": 15660 + }, + { + "epoch": 0.8596048298572997, + "grad_norm": 0.9135633707046509, + "learning_rate": 2.3276111110035945e-05, + "loss": 0.2279, + "step": 15662 + }, + { + "epoch": 0.8597145993413831, + "grad_norm": 1.3861995935440063, + "learning_rate": 2.3270887623176968e-05, + "loss": 0.2608, + "step": 15664 + }, + { + "epoch": 0.8598243688254665, + "grad_norm": 0.8116251230239868, + "learning_rate": 2.326566421216535e-05, + "loss": 0.2511, + "step": 15666 + }, + { + "epoch": 0.8599341383095499, + "grad_norm": 1.5162779092788696, + "learning_rate": 2.3260440877230205e-05, + "loss": 0.2395, + "step": 15668 + }, + { + "epoch": 0.8600439077936334, + "grad_norm": 1.310283899307251, + "learning_rate": 2.325521761860067e-05, + "loss": 0.2076, + "step": 15670 + }, + { + "epoch": 0.8601536772777167, + "grad_norm": 0.9738346338272095, + "learning_rate": 2.3249994436505852e-05, + "loss": 0.175, + "step": 15672 + }, + { + "epoch": 0.8602634467618002, + "grad_norm": 1.5589641332626343, + "learning_rate": 2.3244771331174865e-05, + "loss": 0.2644, + "step": 15674 + }, + { + "epoch": 0.8603732162458837, + "grad_norm": 2.25769305229187, + "learning_rate": 2.3239548302836828e-05, + "loss": 0.4212, + "step": 15676 + }, + { + "epoch": 0.8604829857299671, + "grad_norm": 1.5961450338363647, + "learning_rate": 2.323432535172084e-05, + "loss": 0.3276, + "step": 15678 + }, + { + "epoch": 0.8605927552140505, + "grad_norm": 1.4621915817260742, + "learning_rate": 2.3229102478056007e-05, + "loss": 0.3065, + "step": 15680 + }, + { + "epoch": 0.8607025246981339, + "grad_norm": 2.4902901649475098, + "learning_rate": 2.322387968207143e-05, + "loss": 0.2713, + "step": 15682 + }, + { + "epoch": 0.8608122941822174, + "grad_norm": 1.6012884378433228, + "learning_rate": 2.3218656963996205e-05, + "loss": 0.2359, + "step": 15684 + }, + { + "epoch": 0.8609220636663008, + "grad_norm": 1.2792646884918213, + "learning_rate": 2.3213434324059437e-05, + "loss": 0.2426, + "step": 15686 + }, + { + "epoch": 0.8610318331503842, + "grad_norm": 1.433497428894043, + "learning_rate": 2.3208211762490202e-05, + "loss": 0.3236, + "step": 15688 + }, + { + "epoch": 0.8611416026344676, + "grad_norm": 5.439034461975098, + "learning_rate": 2.3202989279517597e-05, + "loss": 0.2724, + "step": 15690 + }, + { + "epoch": 0.8612513721185511, + "grad_norm": 2.4292328357696533, + "learning_rate": 2.3197766875370706e-05, + "loss": 0.1804, + "step": 15692 + }, + { + "epoch": 0.8613611416026344, + "grad_norm": 1.5057423114776611, + "learning_rate": 2.31925445502786e-05, + "loss": 0.2479, + "step": 15694 + }, + { + "epoch": 0.8614709110867179, + "grad_norm": 0.9913451075553894, + "learning_rate": 2.3187322304470365e-05, + "loss": 0.1337, + "step": 15696 + }, + { + "epoch": 0.8615806805708013, + "grad_norm": 1.5669522285461426, + "learning_rate": 2.318210013817507e-05, + "loss": 0.3152, + "step": 15698 + }, + { + "epoch": 0.8616904500548848, + "grad_norm": 1.0882351398468018, + "learning_rate": 2.3176878051621784e-05, + "loss": 0.1925, + "step": 15700 + }, + { + "epoch": 0.8618002195389681, + "grad_norm": 2.357545852661133, + "learning_rate": 2.317165604503958e-05, + "loss": 0.2798, + "step": 15702 + }, + { + "epoch": 0.8619099890230516, + "grad_norm": 1.2173506021499634, + "learning_rate": 2.3166434118657514e-05, + "loss": 0.2741, + "step": 15704 + }, + { + "epoch": 0.862019758507135, + "grad_norm": 1.0582255125045776, + "learning_rate": 2.3161212272704643e-05, + "loss": 0.1929, + "step": 15706 + }, + { + "epoch": 0.8621295279912184, + "grad_norm": 1.8222668170928955, + "learning_rate": 2.3155990507410035e-05, + "loss": 0.2198, + "step": 15708 + }, + { + "epoch": 0.8622392974753018, + "grad_norm": 1.4451775550842285, + "learning_rate": 2.3150768823002722e-05, + "loss": 0.2478, + "step": 15710 + }, + { + "epoch": 0.8623490669593853, + "grad_norm": 0.8849972486495972, + "learning_rate": 2.3145547219711774e-05, + "loss": 0.1918, + "step": 15712 + }, + { + "epoch": 0.8624588364434688, + "grad_norm": 1.7700616121292114, + "learning_rate": 2.3140325697766217e-05, + "loss": 0.3245, + "step": 15714 + }, + { + "epoch": 0.8625686059275521, + "grad_norm": 1.0784822702407837, + "learning_rate": 2.3135104257395114e-05, + "loss": 0.2117, + "step": 15716 + }, + { + "epoch": 0.8626783754116356, + "grad_norm": 1.2661157846450806, + "learning_rate": 2.3129882898827485e-05, + "loss": 0.2479, + "step": 15718 + }, + { + "epoch": 0.862788144895719, + "grad_norm": 1.1730012893676758, + "learning_rate": 2.3124661622292378e-05, + "loss": 0.194, + "step": 15720 + }, + { + "epoch": 0.8628979143798025, + "grad_norm": 1.3357287645339966, + "learning_rate": 2.3119440428018817e-05, + "loss": 0.2366, + "step": 15722 + }, + { + "epoch": 0.8630076838638858, + "grad_norm": 1.0102967023849487, + "learning_rate": 2.3114219316235822e-05, + "loss": 0.1763, + "step": 15724 + }, + { + "epoch": 0.8631174533479693, + "grad_norm": 1.6460860967636108, + "learning_rate": 2.310899828717243e-05, + "loss": 0.2946, + "step": 15726 + }, + { + "epoch": 0.8632272228320527, + "grad_norm": 2.1470255851745605, + "learning_rate": 2.3103777341057655e-05, + "loss": 0.2183, + "step": 15728 + }, + { + "epoch": 0.8633369923161361, + "grad_norm": 1.0309581756591797, + "learning_rate": 2.3098556478120506e-05, + "loss": 0.1686, + "step": 15730 + }, + { + "epoch": 0.8634467618002195, + "grad_norm": 1.4127497673034668, + "learning_rate": 2.309333569859001e-05, + "loss": 0.2748, + "step": 15732 + }, + { + "epoch": 0.863556531284303, + "grad_norm": 1.5147545337677002, + "learning_rate": 2.308811500269517e-05, + "loss": 0.2684, + "step": 15734 + }, + { + "epoch": 0.8636663007683864, + "grad_norm": 4.016904354095459, + "learning_rate": 2.308289439066498e-05, + "loss": 0.2157, + "step": 15736 + }, + { + "epoch": 0.8637760702524698, + "grad_norm": 1.173979640007019, + "learning_rate": 2.307767386272846e-05, + "loss": 0.1566, + "step": 15738 + }, + { + "epoch": 0.8638858397365532, + "grad_norm": 1.997139811515808, + "learning_rate": 2.3072453419114597e-05, + "loss": 0.3659, + "step": 15740 + }, + { + "epoch": 0.8639956092206367, + "grad_norm": 1.2112990617752075, + "learning_rate": 2.3067233060052388e-05, + "loss": 0.2363, + "step": 15742 + }, + { + "epoch": 0.86410537870472, + "grad_norm": 0.9415709376335144, + "learning_rate": 2.3062012785770815e-05, + "loss": 0.2095, + "step": 15744 + }, + { + "epoch": 0.8642151481888035, + "grad_norm": 1.5424546003341675, + "learning_rate": 2.3056792596498884e-05, + "loss": 0.2768, + "step": 15746 + }, + { + "epoch": 0.8643249176728869, + "grad_norm": 1.1282545328140259, + "learning_rate": 2.3051572492465567e-05, + "loss": 0.216, + "step": 15748 + }, + { + "epoch": 0.8644346871569704, + "grad_norm": 1.738124132156372, + "learning_rate": 2.3046352473899842e-05, + "loss": 0.3163, + "step": 15750 + }, + { + "epoch": 0.8645444566410538, + "grad_norm": 1.351974606513977, + "learning_rate": 2.3041132541030687e-05, + "loss": 0.2087, + "step": 15752 + }, + { + "epoch": 0.8646542261251372, + "grad_norm": 1.7745640277862549, + "learning_rate": 2.3035912694087078e-05, + "loss": 0.1996, + "step": 15754 + }, + { + "epoch": 0.8647639956092207, + "grad_norm": 1.0023311376571655, + "learning_rate": 2.3030692933297972e-05, + "loss": 0.1352, + "step": 15756 + }, + { + "epoch": 0.864873765093304, + "grad_norm": 2.4599387645721436, + "learning_rate": 2.3025473258892345e-05, + "loss": 0.3465, + "step": 15758 + }, + { + "epoch": 0.8649835345773875, + "grad_norm": 1.9569470882415771, + "learning_rate": 2.3020253671099144e-05, + "loss": 0.2717, + "step": 15760 + }, + { + "epoch": 0.8650933040614709, + "grad_norm": 2.285439968109131, + "learning_rate": 2.3015034170147342e-05, + "loss": 0.2818, + "step": 15762 + }, + { + "epoch": 0.8652030735455544, + "grad_norm": 2.1395130157470703, + "learning_rate": 2.3009814756265885e-05, + "loss": 0.2045, + "step": 15764 + }, + { + "epoch": 0.8653128430296377, + "grad_norm": 1.1560560464859009, + "learning_rate": 2.3004595429683713e-05, + "loss": 0.2856, + "step": 15766 + }, + { + "epoch": 0.8654226125137212, + "grad_norm": 1.1068446636199951, + "learning_rate": 2.2999376190629786e-05, + "loss": 0.1656, + "step": 15768 + }, + { + "epoch": 0.8655323819978046, + "grad_norm": 1.3749467134475708, + "learning_rate": 2.2994157039333042e-05, + "loss": 0.2589, + "step": 15770 + }, + { + "epoch": 0.8656421514818881, + "grad_norm": 1.6448158025741577, + "learning_rate": 2.2988937976022408e-05, + "loss": 0.1725, + "step": 15772 + }, + { + "epoch": 0.8657519209659714, + "grad_norm": 4.068378925323486, + "learning_rate": 2.298371900092683e-05, + "loss": 0.4342, + "step": 15774 + }, + { + "epoch": 0.8658616904500549, + "grad_norm": 1.531223177909851, + "learning_rate": 2.297850011427522e-05, + "loss": 0.2728, + "step": 15776 + }, + { + "epoch": 0.8659714599341383, + "grad_norm": 1.4066975116729736, + "learning_rate": 2.2973281316296533e-05, + "loss": 0.1903, + "step": 15778 + }, + { + "epoch": 0.8660812294182217, + "grad_norm": 1.1193552017211914, + "learning_rate": 2.2968062607219665e-05, + "loss": 0.2698, + "step": 15780 + }, + { + "epoch": 0.8661909989023051, + "grad_norm": 1.008250117301941, + "learning_rate": 2.2962843987273554e-05, + "loss": 0.2706, + "step": 15782 + }, + { + "epoch": 0.8663007683863886, + "grad_norm": 1.877902865409851, + "learning_rate": 2.29576254566871e-05, + "loss": 0.2625, + "step": 15784 + }, + { + "epoch": 0.8664105378704721, + "grad_norm": 3.3407957553863525, + "learning_rate": 2.295240701568922e-05, + "loss": 0.2782, + "step": 15786 + }, + { + "epoch": 0.8665203073545554, + "grad_norm": 1.623363971710205, + "learning_rate": 2.2947188664508816e-05, + "loss": 0.2043, + "step": 15788 + }, + { + "epoch": 0.8666300768386389, + "grad_norm": 1.1392896175384521, + "learning_rate": 2.2941970403374795e-05, + "loss": 0.1569, + "step": 15790 + }, + { + "epoch": 0.8667398463227223, + "grad_norm": 1.0190647840499878, + "learning_rate": 2.2936752232516052e-05, + "loss": 0.2036, + "step": 15792 + }, + { + "epoch": 0.8668496158068058, + "grad_norm": 0.9895774126052856, + "learning_rate": 2.2931534152161485e-05, + "loss": 0.2687, + "step": 15794 + }, + { + "epoch": 0.8669593852908891, + "grad_norm": 1.8532733917236328, + "learning_rate": 2.292631616253998e-05, + "loss": 0.3374, + "step": 15796 + }, + { + "epoch": 0.8670691547749726, + "grad_norm": 1.9011461734771729, + "learning_rate": 2.2921098263880427e-05, + "loss": 0.1832, + "step": 15798 + }, + { + "epoch": 0.867178924259056, + "grad_norm": 1.3070443868637085, + "learning_rate": 2.291588045641171e-05, + "loss": 0.2627, + "step": 15800 + }, + { + "epoch": 0.8672886937431394, + "grad_norm": 1.1617074012756348, + "learning_rate": 2.2910662740362704e-05, + "loss": 0.1553, + "step": 15802 + }, + { + "epoch": 0.8673984632272228, + "grad_norm": 1.3767951726913452, + "learning_rate": 2.2905445115962286e-05, + "loss": 0.2447, + "step": 15804 + }, + { + "epoch": 0.8675082327113063, + "grad_norm": 1.699184536933899, + "learning_rate": 2.2900227583439318e-05, + "loss": 0.2124, + "step": 15806 + }, + { + "epoch": 0.8676180021953896, + "grad_norm": 1.3092172145843506, + "learning_rate": 2.2895010143022686e-05, + "loss": 0.2193, + "step": 15808 + }, + { + "epoch": 0.8677277716794731, + "grad_norm": 1.2056190967559814, + "learning_rate": 2.2889792794941235e-05, + "loss": 0.2682, + "step": 15810 + }, + { + "epoch": 0.8678375411635565, + "grad_norm": 1.2097396850585938, + "learning_rate": 2.288457553942383e-05, + "loss": 0.2237, + "step": 15812 + }, + { + "epoch": 0.86794731064764, + "grad_norm": 1.37248957157135, + "learning_rate": 2.287935837669933e-05, + "loss": 0.2083, + "step": 15814 + }, + { + "epoch": 0.8680570801317233, + "grad_norm": 2.017137289047241, + "learning_rate": 2.2874141306996576e-05, + "loss": 0.4147, + "step": 15816 + }, + { + "epoch": 0.8681668496158068, + "grad_norm": 1.2170379161834717, + "learning_rate": 2.286892433054442e-05, + "loss": 0.2608, + "step": 15818 + }, + { + "epoch": 0.8682766190998902, + "grad_norm": 2.872098445892334, + "learning_rate": 2.2863707447571703e-05, + "loss": 0.3651, + "step": 15820 + }, + { + "epoch": 0.8683863885839737, + "grad_norm": 3.1876890659332275, + "learning_rate": 2.285849065830726e-05, + "loss": 0.2188, + "step": 15822 + }, + { + "epoch": 0.8684961580680571, + "grad_norm": 1.008268117904663, + "learning_rate": 2.2853273962979933e-05, + "loss": 0.1907, + "step": 15824 + }, + { + "epoch": 0.8686059275521405, + "grad_norm": 1.1284737586975098, + "learning_rate": 2.2848057361818544e-05, + "loss": 0.1967, + "step": 15826 + }, + { + "epoch": 0.868715697036224, + "grad_norm": 1.225062370300293, + "learning_rate": 2.284284085505192e-05, + "loss": 0.2403, + "step": 15828 + }, + { + "epoch": 0.8688254665203073, + "grad_norm": 0.9641575217247009, + "learning_rate": 2.2837624442908885e-05, + "loss": 0.2595, + "step": 15830 + }, + { + "epoch": 0.8689352360043908, + "grad_norm": 1.1216461658477783, + "learning_rate": 2.283240812561826e-05, + "loss": 0.3031, + "step": 15832 + }, + { + "epoch": 0.8690450054884742, + "grad_norm": 1.5201561450958252, + "learning_rate": 2.2827191903408844e-05, + "loss": 0.245, + "step": 15834 + }, + { + "epoch": 0.8691547749725577, + "grad_norm": 1.3659974336624146, + "learning_rate": 2.2821975776509456e-05, + "loss": 0.4412, + "step": 15836 + }, + { + "epoch": 0.869264544456641, + "grad_norm": 1.9384338855743408, + "learning_rate": 2.2816759745148906e-05, + "loss": 0.2996, + "step": 15838 + }, + { + "epoch": 0.8693743139407245, + "grad_norm": 1.129591941833496, + "learning_rate": 2.2811543809555992e-05, + "loss": 0.2972, + "step": 15840 + }, + { + "epoch": 0.8694840834248079, + "grad_norm": 2.3439888954162598, + "learning_rate": 2.2806327969959504e-05, + "loss": 0.2582, + "step": 15842 + }, + { + "epoch": 0.8695938529088914, + "grad_norm": 2.277115821838379, + "learning_rate": 2.2801112226588243e-05, + "loss": 0.3492, + "step": 15844 + }, + { + "epoch": 0.8697036223929747, + "grad_norm": 1.1105877161026, + "learning_rate": 2.2795896579670987e-05, + "loss": 0.2422, + "step": 15846 + }, + { + "epoch": 0.8698133918770582, + "grad_norm": 1.3607609272003174, + "learning_rate": 2.279068102943653e-05, + "loss": 0.234, + "step": 15848 + }, + { + "epoch": 0.8699231613611416, + "grad_norm": 1.2014343738555908, + "learning_rate": 2.2785465576113645e-05, + "loss": 0.3375, + "step": 15850 + }, + { + "epoch": 0.870032930845225, + "grad_norm": 1.0563291311264038, + "learning_rate": 2.278025021993111e-05, + "loss": 0.1955, + "step": 15852 + }, + { + "epoch": 0.8701427003293084, + "grad_norm": 1.4346802234649658, + "learning_rate": 2.27750349611177e-05, + "loss": 0.3052, + "step": 15854 + }, + { + "epoch": 0.8702524698133919, + "grad_norm": 0.8817837834358215, + "learning_rate": 2.276981979990217e-05, + "loss": 0.2496, + "step": 15856 + }, + { + "epoch": 0.8703622392974752, + "grad_norm": 1.4258031845092773, + "learning_rate": 2.276460473651329e-05, + "loss": 0.2507, + "step": 15858 + }, + { + "epoch": 0.8704720087815587, + "grad_norm": 2.4796786308288574, + "learning_rate": 2.2759389771179823e-05, + "loss": 0.2555, + "step": 15860 + }, + { + "epoch": 0.8705817782656422, + "grad_norm": 1.53849458694458, + "learning_rate": 2.275417490413052e-05, + "loss": 0.2878, + "step": 15862 + }, + { + "epoch": 0.8706915477497256, + "grad_norm": 1.6429444551467896, + "learning_rate": 2.2748960135594118e-05, + "loss": 0.3525, + "step": 15864 + }, + { + "epoch": 0.870801317233809, + "grad_norm": 1.65473210811615, + "learning_rate": 2.2743745465799373e-05, + "loss": 0.3458, + "step": 15866 + }, + { + "epoch": 0.8709110867178924, + "grad_norm": 1.2892853021621704, + "learning_rate": 2.2738530894975034e-05, + "loss": 0.1978, + "step": 15868 + }, + { + "epoch": 0.8710208562019759, + "grad_norm": 1.2944506406784058, + "learning_rate": 2.2733316423349834e-05, + "loss": 0.3005, + "step": 15870 + }, + { + "epoch": 0.8711306256860593, + "grad_norm": 1.0262293815612793, + "learning_rate": 2.272810205115249e-05, + "loss": 0.2211, + "step": 15872 + }, + { + "epoch": 0.8712403951701427, + "grad_norm": 1.450449824333191, + "learning_rate": 2.272288777861175e-05, + "loss": 0.2137, + "step": 15874 + }, + { + "epoch": 0.8713501646542261, + "grad_norm": 1.440582513809204, + "learning_rate": 2.271767360595633e-05, + "loss": 0.2191, + "step": 15876 + }, + { + "epoch": 0.8714599341383096, + "grad_norm": 1.8194485902786255, + "learning_rate": 2.2712459533414944e-05, + "loss": 0.2539, + "step": 15878 + }, + { + "epoch": 0.8715697036223929, + "grad_norm": 1.207568645477295, + "learning_rate": 2.270724556121631e-05, + "loss": 0.1906, + "step": 15880 + }, + { + "epoch": 0.8716794731064764, + "grad_norm": 0.8588234782218933, + "learning_rate": 2.270203168958914e-05, + "loss": 0.1544, + "step": 15882 + }, + { + "epoch": 0.8717892425905598, + "grad_norm": 0.9809253811836243, + "learning_rate": 2.269681791876214e-05, + "loss": 0.2033, + "step": 15884 + }, + { + "epoch": 0.8718990120746433, + "grad_norm": 1.6345406770706177, + "learning_rate": 2.269160424896401e-05, + "loss": 0.2299, + "step": 15886 + }, + { + "epoch": 0.8720087815587266, + "grad_norm": 1.4062451124191284, + "learning_rate": 2.2686390680423446e-05, + "loss": 0.3549, + "step": 15888 + }, + { + "epoch": 0.8721185510428101, + "grad_norm": 1.2264587879180908, + "learning_rate": 2.2681177213369147e-05, + "loss": 0.2563, + "step": 15890 + }, + { + "epoch": 0.8722283205268935, + "grad_norm": 2.669088363647461, + "learning_rate": 2.2675963848029796e-05, + "loss": 0.1668, + "step": 15892 + }, + { + "epoch": 0.872338090010977, + "grad_norm": 1.9627083539962769, + "learning_rate": 2.2670750584634073e-05, + "loss": 0.2763, + "step": 15894 + }, + { + "epoch": 0.8724478594950603, + "grad_norm": 0.6385243535041809, + "learning_rate": 2.266553742341066e-05, + "loss": 0.1455, + "step": 15896 + }, + { + "epoch": 0.8725576289791438, + "grad_norm": 1.9426860809326172, + "learning_rate": 2.2660324364588235e-05, + "loss": 0.2924, + "step": 15898 + }, + { + "epoch": 0.8726673984632273, + "grad_norm": 1.4521414041519165, + "learning_rate": 2.2655111408395478e-05, + "loss": 0.235, + "step": 15900 + }, + { + "epoch": 0.8727771679473106, + "grad_norm": 2.251701593399048, + "learning_rate": 2.2649898555061032e-05, + "loss": 0.1553, + "step": 15902 + }, + { + "epoch": 0.8728869374313941, + "grad_norm": 0.7855494022369385, + "learning_rate": 2.264468580481358e-05, + "loss": 0.2063, + "step": 15904 + }, + { + "epoch": 0.8729967069154775, + "grad_norm": 1.2349319458007812, + "learning_rate": 2.2639473157881766e-05, + "loss": 0.2663, + "step": 15906 + }, + { + "epoch": 0.873106476399561, + "grad_norm": 0.9260892271995544, + "learning_rate": 2.263426061449424e-05, + "loss": 0.2353, + "step": 15908 + }, + { + "epoch": 0.8732162458836443, + "grad_norm": 1.2231589555740356, + "learning_rate": 2.2629048174879663e-05, + "loss": 0.1547, + "step": 15910 + }, + { + "epoch": 0.8733260153677278, + "grad_norm": 1.5443403720855713, + "learning_rate": 2.262383583926667e-05, + "loss": 0.1901, + "step": 15912 + }, + { + "epoch": 0.8734357848518112, + "grad_norm": 1.3782674074172974, + "learning_rate": 2.261862360788389e-05, + "loss": 0.3955, + "step": 15914 + }, + { + "epoch": 0.8735455543358946, + "grad_norm": 1.924731731414795, + "learning_rate": 2.2613411480959978e-05, + "loss": 0.2217, + "step": 15916 + }, + { + "epoch": 0.873655323819978, + "grad_norm": 1.512198805809021, + "learning_rate": 2.260819945872355e-05, + "loss": 0.2879, + "step": 15918 + }, + { + "epoch": 0.8737650933040615, + "grad_norm": 1.9822551012039185, + "learning_rate": 2.2602987541403227e-05, + "loss": 0.2796, + "step": 15920 + }, + { + "epoch": 0.8738748627881449, + "grad_norm": 1.7596654891967773, + "learning_rate": 2.2597775729227643e-05, + "loss": 0.1673, + "step": 15922 + }, + { + "epoch": 0.8739846322722283, + "grad_norm": 1.0161045789718628, + "learning_rate": 2.25925640224254e-05, + "loss": 0.1666, + "step": 15924 + }, + { + "epoch": 0.8740944017563117, + "grad_norm": 1.4850938320159912, + "learning_rate": 2.2587352421225116e-05, + "loss": 0.2554, + "step": 15926 + }, + { + "epoch": 0.8742041712403952, + "grad_norm": 1.534252405166626, + "learning_rate": 2.2582140925855395e-05, + "loss": 0.1987, + "step": 15928 + }, + { + "epoch": 0.8743139407244785, + "grad_norm": 1.1791383028030396, + "learning_rate": 2.2576929536544848e-05, + "loss": 0.2637, + "step": 15930 + }, + { + "epoch": 0.874423710208562, + "grad_norm": 1.5534930229187012, + "learning_rate": 2.2571718253522066e-05, + "loss": 0.2471, + "step": 15932 + }, + { + "epoch": 0.8745334796926455, + "grad_norm": 1.6495883464813232, + "learning_rate": 2.2566507077015635e-05, + "loss": 0.3345, + "step": 15934 + }, + { + "epoch": 0.8746432491767289, + "grad_norm": 1.3783411979675293, + "learning_rate": 2.256129600725415e-05, + "loss": 0.3281, + "step": 15936 + }, + { + "epoch": 0.8747530186608123, + "grad_norm": 1.0276488065719604, + "learning_rate": 2.2556085044466185e-05, + "loss": 0.1964, + "step": 15938 + }, + { + "epoch": 0.8748627881448957, + "grad_norm": 1.3582922220230103, + "learning_rate": 2.2550874188880334e-05, + "loss": 0.1645, + "step": 15940 + }, + { + "epoch": 0.8749725576289792, + "grad_norm": 1.2384114265441895, + "learning_rate": 2.2545663440725163e-05, + "loss": 0.1982, + "step": 15942 + }, + { + "epoch": 0.8750823271130626, + "grad_norm": 1.6282051801681519, + "learning_rate": 2.2540452800229235e-05, + "loss": 0.3513, + "step": 15944 + }, + { + "epoch": 0.875192096597146, + "grad_norm": 1.361922264099121, + "learning_rate": 2.253524226762112e-05, + "loss": 0.2766, + "step": 15946 + }, + { + "epoch": 0.8753018660812294, + "grad_norm": 1.2334132194519043, + "learning_rate": 2.2530031843129383e-05, + "loss": 0.1601, + "step": 15948 + }, + { + "epoch": 0.8754116355653129, + "grad_norm": 1.1293466091156006, + "learning_rate": 2.2524821526982564e-05, + "loss": 0.1698, + "step": 15950 + }, + { + "epoch": 0.8755214050493962, + "grad_norm": 1.4483122825622559, + "learning_rate": 2.2519611319409227e-05, + "loss": 0.2326, + "step": 15952 + }, + { + "epoch": 0.8756311745334797, + "grad_norm": 1.4129458665847778, + "learning_rate": 2.2514401220637912e-05, + "loss": 0.2641, + "step": 15954 + }, + { + "epoch": 0.8757409440175631, + "grad_norm": 1.4708268642425537, + "learning_rate": 2.2509191230897146e-05, + "loss": 0.249, + "step": 15956 + }, + { + "epoch": 0.8758507135016466, + "grad_norm": 1.6428738832473755, + "learning_rate": 2.250398135041549e-05, + "loss": 0.2798, + "step": 15958 + }, + { + "epoch": 0.8759604829857299, + "grad_norm": 1.1579029560089111, + "learning_rate": 2.2498771579421466e-05, + "loss": 0.3189, + "step": 15960 + }, + { + "epoch": 0.8760702524698134, + "grad_norm": 1.3641130924224854, + "learning_rate": 2.2493561918143593e-05, + "loss": 0.3091, + "step": 15962 + }, + { + "epoch": 0.8761800219538968, + "grad_norm": 0.8882120251655579, + "learning_rate": 2.2488352366810398e-05, + "loss": 0.163, + "step": 15964 + }, + { + "epoch": 0.8762897914379802, + "grad_norm": 1.6232738494873047, + "learning_rate": 2.2483142925650398e-05, + "loss": 0.1528, + "step": 15966 + }, + { + "epoch": 0.8763995609220636, + "grad_norm": 1.4281619787216187, + "learning_rate": 2.24779335948921e-05, + "loss": 0.2356, + "step": 15968 + }, + { + "epoch": 0.8765093304061471, + "grad_norm": 1.271216630935669, + "learning_rate": 2.2472724374764013e-05, + "loss": 0.2397, + "step": 15970 + }, + { + "epoch": 0.8766190998902306, + "grad_norm": 1.6153011322021484, + "learning_rate": 2.2467515265494643e-05, + "loss": 0.222, + "step": 15972 + }, + { + "epoch": 0.8767288693743139, + "grad_norm": 1.4437155723571777, + "learning_rate": 2.2462306267312478e-05, + "loss": 0.2726, + "step": 15974 + }, + { + "epoch": 0.8768386388583974, + "grad_norm": 1.7470759153366089, + "learning_rate": 2.245709738044602e-05, + "loss": 0.2777, + "step": 15976 + }, + { + "epoch": 0.8769484083424808, + "grad_norm": 1.1802222728729248, + "learning_rate": 2.2451888605123754e-05, + "loss": 0.2381, + "step": 15978 + }, + { + "epoch": 0.8770581778265643, + "grad_norm": 1.2670531272888184, + "learning_rate": 2.2446679941574156e-05, + "loss": 0.2605, + "step": 15980 + }, + { + "epoch": 0.8771679473106476, + "grad_norm": 1.1032696962356567, + "learning_rate": 2.244147139002571e-05, + "loss": 0.1883, + "step": 15982 + }, + { + "epoch": 0.8772777167947311, + "grad_norm": 2.745121955871582, + "learning_rate": 2.2436262950706893e-05, + "loss": 0.2633, + "step": 15984 + }, + { + "epoch": 0.8773874862788145, + "grad_norm": 1.3942891359329224, + "learning_rate": 2.2431054623846153e-05, + "loss": 0.3328, + "step": 15986 + }, + { + "epoch": 0.8774972557628979, + "grad_norm": 1.067645788192749, + "learning_rate": 2.2425846409671968e-05, + "loss": 0.2459, + "step": 15988 + }, + { + "epoch": 0.8776070252469813, + "grad_norm": 0.957892656326294, + "learning_rate": 2.2420638308412805e-05, + "loss": 0.1545, + "step": 15990 + }, + { + "epoch": 0.8777167947310648, + "grad_norm": 1.2542171478271484, + "learning_rate": 2.24154303202971e-05, + "loss": 0.2479, + "step": 15992 + }, + { + "epoch": 0.8778265642151482, + "grad_norm": 1.9322190284729004, + "learning_rate": 2.2410222445553304e-05, + "loss": 0.2716, + "step": 15994 + }, + { + "epoch": 0.8779363336992316, + "grad_norm": 1.4107719659805298, + "learning_rate": 2.2405014684409873e-05, + "loss": 0.2752, + "step": 15996 + }, + { + "epoch": 0.878046103183315, + "grad_norm": 0.9026569724082947, + "learning_rate": 2.239980703709523e-05, + "loss": 0.1421, + "step": 15998 + }, + { + "epoch": 0.8781558726673985, + "grad_norm": 1.0649770498275757, + "learning_rate": 2.239459950383781e-05, + "loss": 0.3243, + "step": 16000 + }, + { + "epoch": 0.8782656421514818, + "grad_norm": 0.8765593767166138, + "learning_rate": 2.2389392084866045e-05, + "loss": 0.178, + "step": 16002 + }, + { + "epoch": 0.8783754116355653, + "grad_norm": 1.193585753440857, + "learning_rate": 2.238418478040836e-05, + "loss": 0.3024, + "step": 16004 + }, + { + "epoch": 0.8784851811196487, + "grad_norm": 1.4113948345184326, + "learning_rate": 2.2378977590693165e-05, + "loss": 0.2137, + "step": 16006 + }, + { + "epoch": 0.8785949506037322, + "grad_norm": 1.5442556142807007, + "learning_rate": 2.2373770515948883e-05, + "loss": 0.307, + "step": 16008 + }, + { + "epoch": 0.8787047200878156, + "grad_norm": 2.1614534854888916, + "learning_rate": 2.2368563556403915e-05, + "loss": 0.2376, + "step": 16010 + }, + { + "epoch": 0.878814489571899, + "grad_norm": 1.5016926527023315, + "learning_rate": 2.236335671228666e-05, + "loss": 0.2389, + "step": 16012 + }, + { + "epoch": 0.8789242590559825, + "grad_norm": 1.34162437915802, + "learning_rate": 2.235814998382553e-05, + "loss": 0.3158, + "step": 16014 + }, + { + "epoch": 0.8790340285400658, + "grad_norm": 1.2351607084274292, + "learning_rate": 2.2352943371248898e-05, + "loss": 0.1469, + "step": 16016 + }, + { + "epoch": 0.8791437980241493, + "grad_norm": 0.9549513459205627, + "learning_rate": 2.2347736874785163e-05, + "loss": 0.1999, + "step": 16018 + }, + { + "epoch": 0.8792535675082327, + "grad_norm": 1.4368454217910767, + "learning_rate": 2.2342530494662705e-05, + "loss": 0.2488, + "step": 16020 + }, + { + "epoch": 0.8793633369923162, + "grad_norm": 1.2177455425262451, + "learning_rate": 2.2337324231109913e-05, + "loss": 0.2353, + "step": 16022 + }, + { + "epoch": 0.8794731064763995, + "grad_norm": 1.3439842462539673, + "learning_rate": 2.2332118084355142e-05, + "loss": 0.2844, + "step": 16024 + }, + { + "epoch": 0.879582875960483, + "grad_norm": 0.949881911277771, + "learning_rate": 2.2326912054626772e-05, + "loss": 0.1648, + "step": 16026 + }, + { + "epoch": 0.8796926454445664, + "grad_norm": 1.594521164894104, + "learning_rate": 2.2321706142153162e-05, + "loss": 0.308, + "step": 16028 + }, + { + "epoch": 0.8798024149286499, + "grad_norm": 1.1723719835281372, + "learning_rate": 2.231650034716266e-05, + "loss": 0.1516, + "step": 16030 + }, + { + "epoch": 0.8799121844127332, + "grad_norm": 1.7179899215698242, + "learning_rate": 2.2311294669883628e-05, + "loss": 0.2238, + "step": 16032 + }, + { + "epoch": 0.8800219538968167, + "grad_norm": 1.4064841270446777, + "learning_rate": 2.230608911054441e-05, + "loss": 0.1852, + "step": 16034 + }, + { + "epoch": 0.8801317233809001, + "grad_norm": 1.4205011129379272, + "learning_rate": 2.230088366937334e-05, + "loss": 0.2319, + "step": 16036 + }, + { + "epoch": 0.8802414928649835, + "grad_norm": 3.3011176586151123, + "learning_rate": 2.2295678346598764e-05, + "loss": 0.1794, + "step": 16038 + }, + { + "epoch": 0.8803512623490669, + "grad_norm": 1.1546034812927246, + "learning_rate": 2.2290473142449013e-05, + "loss": 0.2739, + "step": 16040 + }, + { + "epoch": 0.8804610318331504, + "grad_norm": 1.3395713567733765, + "learning_rate": 2.22852680571524e-05, + "loss": 0.2608, + "step": 16042 + }, + { + "epoch": 0.8805708013172339, + "grad_norm": 1.006385087966919, + "learning_rate": 2.228006309093726e-05, + "loss": 0.3091, + "step": 16044 + }, + { + "epoch": 0.8806805708013172, + "grad_norm": 1.1187342405319214, + "learning_rate": 2.2274858244031907e-05, + "loss": 0.363, + "step": 16046 + }, + { + "epoch": 0.8807903402854007, + "grad_norm": 1.5100085735321045, + "learning_rate": 2.2269653516664635e-05, + "loss": 0.1817, + "step": 16048 + }, + { + "epoch": 0.8809001097694841, + "grad_norm": 1.1128705739974976, + "learning_rate": 2.2264448909063766e-05, + "loss": 0.231, + "step": 16050 + }, + { + "epoch": 0.8810098792535676, + "grad_norm": 1.3523529767990112, + "learning_rate": 2.2259244421457596e-05, + "loss": 0.2072, + "step": 16052 + }, + { + "epoch": 0.8811196487376509, + "grad_norm": 1.28023362159729, + "learning_rate": 2.2254040054074428e-05, + "loss": 0.2482, + "step": 16054 + }, + { + "epoch": 0.8812294182217344, + "grad_norm": 1.372289776802063, + "learning_rate": 2.2248835807142525e-05, + "loss": 0.3043, + "step": 16056 + }, + { + "epoch": 0.8813391877058178, + "grad_norm": 0.9637529253959656, + "learning_rate": 2.22436316808902e-05, + "loss": 0.2019, + "step": 16058 + }, + { + "epoch": 0.8814489571899012, + "grad_norm": 1.260578989982605, + "learning_rate": 2.2238427675545714e-05, + "loss": 0.3392, + "step": 16060 + }, + { + "epoch": 0.8815587266739846, + "grad_norm": 1.6898815631866455, + "learning_rate": 2.223322379133734e-05, + "loss": 0.2691, + "step": 16062 + }, + { + "epoch": 0.8816684961580681, + "grad_norm": 0.9954720139503479, + "learning_rate": 2.222802002849336e-05, + "loss": 0.1965, + "step": 16064 + }, + { + "epoch": 0.8817782656421514, + "grad_norm": 1.5623129606246948, + "learning_rate": 2.2222816387242017e-05, + "loss": 0.2431, + "step": 16066 + }, + { + "epoch": 0.8818880351262349, + "grad_norm": 1.2087764739990234, + "learning_rate": 2.221761286781159e-05, + "loss": 0.2813, + "step": 16068 + }, + { + "epoch": 0.8819978046103183, + "grad_norm": 1.070290446281433, + "learning_rate": 2.221240947043031e-05, + "loss": 0.2141, + "step": 16070 + }, + { + "epoch": 0.8821075740944018, + "grad_norm": 1.275215983390808, + "learning_rate": 2.2207206195326434e-05, + "loss": 0.3338, + "step": 16072 + }, + { + "epoch": 0.8822173435784851, + "grad_norm": 1.1887010335922241, + "learning_rate": 2.2202003042728208e-05, + "loss": 0.1904, + "step": 16074 + }, + { + "epoch": 0.8823271130625686, + "grad_norm": 1.2301709651947021, + "learning_rate": 2.219680001286386e-05, + "loss": 0.2803, + "step": 16076 + }, + { + "epoch": 0.882436882546652, + "grad_norm": 1.4387145042419434, + "learning_rate": 2.219159710596161e-05, + "loss": 0.2027, + "step": 16078 + }, + { + "epoch": 0.8825466520307355, + "grad_norm": 1.0634753704071045, + "learning_rate": 2.2186394322249706e-05, + "loss": 0.2266, + "step": 16080 + }, + { + "epoch": 0.8826564215148189, + "grad_norm": 1.5238484144210815, + "learning_rate": 2.218119166195636e-05, + "loss": 0.3529, + "step": 16082 + }, + { + "epoch": 0.8827661909989023, + "grad_norm": 1.6156485080718994, + "learning_rate": 2.217598912530978e-05, + "loss": 0.2275, + "step": 16084 + }, + { + "epoch": 0.8828759604829858, + "grad_norm": 1.360234260559082, + "learning_rate": 2.2170786712538176e-05, + "loss": 0.2082, + "step": 16086 + }, + { + "epoch": 0.8829857299670691, + "grad_norm": 1.1746559143066406, + "learning_rate": 2.2165584423869757e-05, + "loss": 0.2217, + "step": 16088 + }, + { + "epoch": 0.8830954994511526, + "grad_norm": 1.708628535270691, + "learning_rate": 2.2160382259532717e-05, + "loss": 0.2602, + "step": 16090 + }, + { + "epoch": 0.883205268935236, + "grad_norm": 3.3920981884002686, + "learning_rate": 2.2155180219755244e-05, + "loss": 0.2419, + "step": 16092 + }, + { + "epoch": 0.8833150384193195, + "grad_norm": 1.3145476579666138, + "learning_rate": 2.2149978304765534e-05, + "loss": 0.2577, + "step": 16094 + }, + { + "epoch": 0.8834248079034028, + "grad_norm": 1.6755802631378174, + "learning_rate": 2.2144776514791768e-05, + "loss": 0.2715, + "step": 16096 + }, + { + "epoch": 0.8835345773874863, + "grad_norm": 1.3457077741622925, + "learning_rate": 2.213957485006211e-05, + "loss": 0.239, + "step": 16098 + }, + { + "epoch": 0.8836443468715697, + "grad_norm": 1.4551045894622803, + "learning_rate": 2.2134373310804746e-05, + "loss": 0.2171, + "step": 16100 + }, + { + "epoch": 0.8837541163556532, + "grad_norm": 1.8433746099472046, + "learning_rate": 2.2129171897247824e-05, + "loss": 0.2876, + "step": 16102 + }, + { + "epoch": 0.8838638858397365, + "grad_norm": 1.141985535621643, + "learning_rate": 2.2123970609619524e-05, + "loss": 0.1872, + "step": 16104 + }, + { + "epoch": 0.88397365532382, + "grad_norm": 2.605499505996704, + "learning_rate": 2.211876944814799e-05, + "loss": 0.2691, + "step": 16106 + }, + { + "epoch": 0.8840834248079034, + "grad_norm": 1.317980408668518, + "learning_rate": 2.2113568413061357e-05, + "loss": 0.341, + "step": 16108 + }, + { + "epoch": 0.8841931942919868, + "grad_norm": 1.2163583040237427, + "learning_rate": 2.210836750458779e-05, + "loss": 0.3194, + "step": 16110 + }, + { + "epoch": 0.8843029637760702, + "grad_norm": 1.450264573097229, + "learning_rate": 2.2103166722955413e-05, + "loss": 0.2848, + "step": 16112 + }, + { + "epoch": 0.8844127332601537, + "grad_norm": 1.2668917179107666, + "learning_rate": 2.209796606839237e-05, + "loss": 0.2421, + "step": 16114 + }, + { + "epoch": 0.884522502744237, + "grad_norm": 1.1916694641113281, + "learning_rate": 2.209276554112677e-05, + "loss": 0.1409, + "step": 16116 + }, + { + "epoch": 0.8846322722283205, + "grad_norm": 2.1735260486602783, + "learning_rate": 2.208756514138675e-05, + "loss": 0.3303, + "step": 16118 + }, + { + "epoch": 0.884742041712404, + "grad_norm": 0.7639878988265991, + "learning_rate": 2.2082364869400416e-05, + "loss": 0.1526, + "step": 16120 + }, + { + "epoch": 0.8848518111964874, + "grad_norm": 1.4936403036117554, + "learning_rate": 2.2077164725395876e-05, + "loss": 0.1889, + "step": 16122 + }, + { + "epoch": 0.8849615806805708, + "grad_norm": 1.2033401727676392, + "learning_rate": 2.207196470960124e-05, + "loss": 0.2906, + "step": 16124 + }, + { + "epoch": 0.8850713501646542, + "grad_norm": 1.3287371397018433, + "learning_rate": 2.2066764822244608e-05, + "loss": 0.3747, + "step": 16126 + }, + { + "epoch": 0.8851811196487377, + "grad_norm": 0.6831086277961731, + "learning_rate": 2.2061565063554064e-05, + "loss": 0.154, + "step": 16128 + }, + { + "epoch": 0.8852908891328211, + "grad_norm": 2.1753053665161133, + "learning_rate": 2.2056365433757697e-05, + "loss": 0.2417, + "step": 16130 + }, + { + "epoch": 0.8854006586169045, + "grad_norm": 1.7575385570526123, + "learning_rate": 2.2051165933083594e-05, + "loss": 0.307, + "step": 16132 + }, + { + "epoch": 0.8855104281009879, + "grad_norm": 1.216795802116394, + "learning_rate": 2.2045966561759822e-05, + "loss": 0.1746, + "step": 16134 + }, + { + "epoch": 0.8856201975850714, + "grad_norm": 0.9527718424797058, + "learning_rate": 2.2040767320014462e-05, + "loss": 0.1385, + "step": 16136 + }, + { + "epoch": 0.8857299670691547, + "grad_norm": 1.3695011138916016, + "learning_rate": 2.203556820807556e-05, + "loss": 0.2421, + "step": 16138 + }, + { + "epoch": 0.8858397365532382, + "grad_norm": 1.1361663341522217, + "learning_rate": 2.20303692261712e-05, + "loss": 0.2074, + "step": 16140 + }, + { + "epoch": 0.8859495060373216, + "grad_norm": 1.2526196241378784, + "learning_rate": 2.2025170374529412e-05, + "loss": 0.2158, + "step": 16142 + }, + { + "epoch": 0.8860592755214051, + "grad_norm": 1.2806512117385864, + "learning_rate": 2.2019971653378264e-05, + "loss": 0.2542, + "step": 16144 + }, + { + "epoch": 0.8861690450054884, + "grad_norm": 1.1983662843704224, + "learning_rate": 2.2014773062945777e-05, + "loss": 0.3031, + "step": 16146 + }, + { + "epoch": 0.8862788144895719, + "grad_norm": 1.2343333959579468, + "learning_rate": 2.200957460346e-05, + "loss": 0.1967, + "step": 16148 + }, + { + "epoch": 0.8863885839736553, + "grad_norm": 2.295659303665161, + "learning_rate": 2.200437627514896e-05, + "loss": 0.295, + "step": 16150 + }, + { + "epoch": 0.8864983534577388, + "grad_norm": 1.2535390853881836, + "learning_rate": 2.1999178078240675e-05, + "loss": 0.1485, + "step": 16152 + }, + { + "epoch": 0.8866081229418221, + "grad_norm": 1.3815078735351562, + "learning_rate": 2.1993980012963174e-05, + "loss": 0.2682, + "step": 16154 + }, + { + "epoch": 0.8867178924259056, + "grad_norm": 1.8869773149490356, + "learning_rate": 2.198878207954446e-05, + "loss": 0.2609, + "step": 16156 + }, + { + "epoch": 0.8868276619099891, + "grad_norm": 1.5059261322021484, + "learning_rate": 2.1983584278212542e-05, + "loss": 0.2323, + "step": 16158 + }, + { + "epoch": 0.8869374313940724, + "grad_norm": 1.670240044593811, + "learning_rate": 2.1978386609195428e-05, + "loss": 0.1773, + "step": 16160 + }, + { + "epoch": 0.8870472008781559, + "grad_norm": 1.263875126838684, + "learning_rate": 2.197318907272111e-05, + "loss": 0.2308, + "step": 16162 + }, + { + "epoch": 0.8871569703622393, + "grad_norm": 1.3956036567687988, + "learning_rate": 2.1967991669017568e-05, + "loss": 0.2731, + "step": 16164 + }, + { + "epoch": 0.8872667398463228, + "grad_norm": 1.018885612487793, + "learning_rate": 2.19627943983128e-05, + "loss": 0.2195, + "step": 16166 + }, + { + "epoch": 0.8873765093304061, + "grad_norm": 1.3957252502441406, + "learning_rate": 2.1957597260834763e-05, + "loss": 0.3015, + "step": 16168 + }, + { + "epoch": 0.8874862788144896, + "grad_norm": 2.096017837524414, + "learning_rate": 2.195240025681146e-05, + "loss": 0.1955, + "step": 16170 + }, + { + "epoch": 0.887596048298573, + "grad_norm": 1.1274964809417725, + "learning_rate": 2.194720338647083e-05, + "loss": 0.2904, + "step": 16172 + }, + { + "epoch": 0.8877058177826564, + "grad_norm": 0.8580595850944519, + "learning_rate": 2.194200665004085e-05, + "loss": 0.2144, + "step": 16174 + }, + { + "epoch": 0.8878155872667398, + "grad_norm": 1.330966830253601, + "learning_rate": 2.193681004774947e-05, + "loss": 0.3053, + "step": 16176 + }, + { + "epoch": 0.8879253567508233, + "grad_norm": 0.9067702889442444, + "learning_rate": 2.1931613579824628e-05, + "loss": 0.171, + "step": 16178 + }, + { + "epoch": 0.8880351262349067, + "grad_norm": 0.7554737329483032, + "learning_rate": 2.1926417246494283e-05, + "loss": 0.2275, + "step": 16180 + }, + { + "epoch": 0.8881448957189901, + "grad_norm": 3.558304786682129, + "learning_rate": 2.192122104798636e-05, + "loss": 0.1913, + "step": 16182 + }, + { + "epoch": 0.8882546652030735, + "grad_norm": 1.028308391571045, + "learning_rate": 2.1916024984528794e-05, + "loss": 0.2226, + "step": 16184 + }, + { + "epoch": 0.888364434687157, + "grad_norm": 1.1348800659179688, + "learning_rate": 2.1910829056349507e-05, + "loss": 0.2563, + "step": 16186 + }, + { + "epoch": 0.8884742041712403, + "grad_norm": 1.3484272956848145, + "learning_rate": 2.1905633263676426e-05, + "loss": 0.1745, + "step": 16188 + }, + { + "epoch": 0.8885839736553238, + "grad_norm": 1.4197803735733032, + "learning_rate": 2.190043760673745e-05, + "loss": 0.2469, + "step": 16190 + }, + { + "epoch": 0.8886937431394073, + "grad_norm": 0.8585793972015381, + "learning_rate": 2.1895242085760502e-05, + "loss": 0.1689, + "step": 16192 + }, + { + "epoch": 0.8888035126234907, + "grad_norm": 1.8256804943084717, + "learning_rate": 2.1890046700973467e-05, + "loss": 0.2531, + "step": 16194 + }, + { + "epoch": 0.8889132821075741, + "grad_norm": 2.4185736179351807, + "learning_rate": 2.1884851452604256e-05, + "loss": 0.2095, + "step": 16196 + }, + { + "epoch": 0.8890230515916575, + "grad_norm": 2.3595268726348877, + "learning_rate": 2.1879656340880735e-05, + "loss": 0.1781, + "step": 16198 + }, + { + "epoch": 0.889132821075741, + "grad_norm": 1.656485915184021, + "learning_rate": 2.187446136603082e-05, + "loss": 0.2323, + "step": 16200 + }, + { + "epoch": 0.8892425905598244, + "grad_norm": 2.1694610118865967, + "learning_rate": 2.1869266528282358e-05, + "loss": 0.2233, + "step": 16202 + }, + { + "epoch": 0.8893523600439078, + "grad_norm": 1.0047177076339722, + "learning_rate": 2.186407182786324e-05, + "loss": 0.1568, + "step": 16204 + }, + { + "epoch": 0.8894621295279912, + "grad_norm": 3.4995808601379395, + "learning_rate": 2.1858877265001327e-05, + "loss": 0.2945, + "step": 16206 + }, + { + "epoch": 0.8895718990120747, + "grad_norm": 0.9872674345970154, + "learning_rate": 2.185368283992447e-05, + "loss": 0.1895, + "step": 16208 + }, + { + "epoch": 0.889681668496158, + "grad_norm": 1.4852627515792847, + "learning_rate": 2.1848488552860525e-05, + "loss": 0.2935, + "step": 16210 + }, + { + "epoch": 0.8897914379802415, + "grad_norm": 1.7579094171524048, + "learning_rate": 2.1843294404037347e-05, + "loss": 0.3194, + "step": 16212 + }, + { + "epoch": 0.8899012074643249, + "grad_norm": 1.3349577188491821, + "learning_rate": 2.1838100393682764e-05, + "loss": 0.2559, + "step": 16214 + }, + { + "epoch": 0.8900109769484084, + "grad_norm": 1.4975560903549194, + "learning_rate": 2.1832906522024622e-05, + "loss": 0.239, + "step": 16216 + }, + { + "epoch": 0.8901207464324917, + "grad_norm": 2.9044618606567383, + "learning_rate": 2.1827712789290746e-05, + "loss": 0.1629, + "step": 16218 + }, + { + "epoch": 0.8902305159165752, + "grad_norm": 1.0167657136917114, + "learning_rate": 2.182251919570895e-05, + "loss": 0.2187, + "step": 16220 + }, + { + "epoch": 0.8903402854006586, + "grad_norm": 1.0693936347961426, + "learning_rate": 2.1817325741507066e-05, + "loss": 0.2262, + "step": 16222 + }, + { + "epoch": 0.890450054884742, + "grad_norm": 1.0656591653823853, + "learning_rate": 2.1812132426912894e-05, + "loss": 0.1794, + "step": 16224 + }, + { + "epoch": 0.8905598243688254, + "grad_norm": 2.3490452766418457, + "learning_rate": 2.1806939252154234e-05, + "loss": 0.2247, + "step": 16226 + }, + { + "epoch": 0.8906695938529089, + "grad_norm": 2.2094414234161377, + "learning_rate": 2.180174621745889e-05, + "loss": 0.3308, + "step": 16228 + }, + { + "epoch": 0.8907793633369924, + "grad_norm": 1.2612711191177368, + "learning_rate": 2.1796553323054665e-05, + "loss": 0.3138, + "step": 16230 + }, + { + "epoch": 0.8908891328210757, + "grad_norm": 2.58717942237854, + "learning_rate": 2.1791360569169332e-05, + "loss": 0.2103, + "step": 16232 + }, + { + "epoch": 0.8909989023051592, + "grad_norm": 1.281988501548767, + "learning_rate": 2.1786167956030666e-05, + "loss": 0.1831, + "step": 16234 + }, + { + "epoch": 0.8911086717892426, + "grad_norm": 1.0110706090927124, + "learning_rate": 2.178097548386646e-05, + "loss": 0.1021, + "step": 16236 + }, + { + "epoch": 0.8912184412733261, + "grad_norm": 1.700217843055725, + "learning_rate": 2.1775783152904462e-05, + "loss": 0.3988, + "step": 16238 + }, + { + "epoch": 0.8913282107574094, + "grad_norm": 1.0768600702285767, + "learning_rate": 2.177059096337244e-05, + "loss": 0.1577, + "step": 16240 + }, + { + "epoch": 0.8914379802414929, + "grad_norm": 1.6136469841003418, + "learning_rate": 2.1765398915498154e-05, + "loss": 0.2995, + "step": 16242 + }, + { + "epoch": 0.8915477497255763, + "grad_norm": 1.3880220651626587, + "learning_rate": 2.1760207009509342e-05, + "loss": 0.2196, + "step": 16244 + }, + { + "epoch": 0.8916575192096597, + "grad_norm": 1.636248230934143, + "learning_rate": 2.1755015245633756e-05, + "loss": 0.2578, + "step": 16246 + }, + { + "epoch": 0.8917672886937431, + "grad_norm": 1.3548954725265503, + "learning_rate": 2.174982362409913e-05, + "loss": 0.2308, + "step": 16248 + }, + { + "epoch": 0.8918770581778266, + "grad_norm": 0.979415774345398, + "learning_rate": 2.1744632145133186e-05, + "loss": 0.1779, + "step": 16250 + }, + { + "epoch": 0.89198682766191, + "grad_norm": 4.052970886230469, + "learning_rate": 2.173944080896366e-05, + "loss": 0.2501, + "step": 16252 + }, + { + "epoch": 0.8920965971459934, + "grad_norm": 2.0957603454589844, + "learning_rate": 2.1734249615818265e-05, + "loss": 0.1583, + "step": 16254 + }, + { + "epoch": 0.8922063666300768, + "grad_norm": 1.161447525024414, + "learning_rate": 2.1729058565924708e-05, + "loss": 0.2322, + "step": 16256 + }, + { + "epoch": 0.8923161361141603, + "grad_norm": 1.2546287775039673, + "learning_rate": 2.17238676595107e-05, + "loss": 0.2796, + "step": 16258 + }, + { + "epoch": 0.8924259055982436, + "grad_norm": 1.3506019115447998, + "learning_rate": 2.1718676896803926e-05, + "loss": 0.2274, + "step": 16260 + }, + { + "epoch": 0.8925356750823271, + "grad_norm": 2.512610673904419, + "learning_rate": 2.17134862780321e-05, + "loss": 0.2437, + "step": 16262 + }, + { + "epoch": 0.8926454445664105, + "grad_norm": 1.612535834312439, + "learning_rate": 2.170829580342289e-05, + "loss": 0.2478, + "step": 16264 + }, + { + "epoch": 0.892755214050494, + "grad_norm": 2.005237102508545, + "learning_rate": 2.1703105473203988e-05, + "loss": 0.3015, + "step": 16266 + }, + { + "epoch": 0.8928649835345774, + "grad_norm": 2.1363391876220703, + "learning_rate": 2.1697915287603067e-05, + "loss": 0.363, + "step": 16268 + }, + { + "epoch": 0.8929747530186608, + "grad_norm": 1.460723876953125, + "learning_rate": 2.1692725246847778e-05, + "loss": 0.2781, + "step": 16270 + }, + { + "epoch": 0.8930845225027443, + "grad_norm": 0.8848304748535156, + "learning_rate": 2.16875353511658e-05, + "loss": 0.2053, + "step": 16272 + }, + { + "epoch": 0.8931942919868276, + "grad_norm": 1.9539027214050293, + "learning_rate": 2.1682345600784783e-05, + "loss": 0.1702, + "step": 16274 + }, + { + "epoch": 0.8933040614709111, + "grad_norm": 1.2086982727050781, + "learning_rate": 2.1677155995932364e-05, + "loss": 0.3563, + "step": 16276 + }, + { + "epoch": 0.8934138309549945, + "grad_norm": Infinity, + "learning_rate": 2.167456124815052e-05, + "loss": 0.2648, + "step": 16278 + }, + { + "epoch": 0.893523600439078, + "grad_norm": 1.4921679496765137, + "learning_rate": 2.166937186201784e-05, + "loss": 0.2632, + "step": 16280 + }, + { + "epoch": 0.8936333699231613, + "grad_norm": 1.3629590272903442, + "learning_rate": 2.1664182621982855e-05, + "loss": 0.2276, + "step": 16282 + }, + { + "epoch": 0.8937431394072448, + "grad_norm": 1.399404525756836, + "learning_rate": 2.1658993528273197e-05, + "loss": 0.2507, + "step": 16284 + }, + { + "epoch": 0.8938529088913282, + "grad_norm": 1.4247907400131226, + "learning_rate": 2.165380458111648e-05, + "loss": 0.2808, + "step": 16286 + }, + { + "epoch": 0.8939626783754117, + "grad_norm": 1.5617786645889282, + "learning_rate": 2.1648615780740316e-05, + "loss": 0.2697, + "step": 16288 + }, + { + "epoch": 0.894072447859495, + "grad_norm": 1.342708945274353, + "learning_rate": 2.1643427127372306e-05, + "loss": 0.2575, + "step": 16290 + }, + { + "epoch": 0.8941822173435785, + "grad_norm": 1.4816118478775024, + "learning_rate": 2.163823862124007e-05, + "loss": 0.2972, + "step": 16292 + }, + { + "epoch": 0.8942919868276619, + "grad_norm": 1.9656474590301514, + "learning_rate": 2.1633050262571185e-05, + "loss": 0.2055, + "step": 16294 + }, + { + "epoch": 0.8944017563117453, + "grad_norm": 1.3326741456985474, + "learning_rate": 2.162786205159324e-05, + "loss": 0.2442, + "step": 16296 + }, + { + "epoch": 0.8945115257958287, + "grad_norm": 1.5557751655578613, + "learning_rate": 2.162267398853382e-05, + "loss": 0.2431, + "step": 16298 + }, + { + "epoch": 0.8946212952799122, + "grad_norm": 2.0600876808166504, + "learning_rate": 2.1617486073620498e-05, + "loss": 0.3823, + "step": 16300 + }, + { + "epoch": 0.8947310647639956, + "grad_norm": 1.222469449043274, + "learning_rate": 2.1612298307080832e-05, + "loss": 0.2384, + "step": 16302 + }, + { + "epoch": 0.894840834248079, + "grad_norm": 1.516700267791748, + "learning_rate": 2.1607110689142393e-05, + "loss": 0.2858, + "step": 16304 + }, + { + "epoch": 0.8949506037321625, + "grad_norm": 1.854614496231079, + "learning_rate": 2.160192322003273e-05, + "loss": 0.316, + "step": 16306 + }, + { + "epoch": 0.8950603732162459, + "grad_norm": 0.9043842554092407, + "learning_rate": 2.1596735899979396e-05, + "loss": 0.2039, + "step": 16308 + }, + { + "epoch": 0.8951701427003294, + "grad_norm": 1.1568931341171265, + "learning_rate": 2.1591548729209935e-05, + "loss": 0.2469, + "step": 16310 + }, + { + "epoch": 0.8952799121844127, + "grad_norm": 1.385068655014038, + "learning_rate": 2.1586361707951866e-05, + "loss": 0.1742, + "step": 16312 + }, + { + "epoch": 0.8953896816684962, + "grad_norm": 0.9618444442749023, + "learning_rate": 2.1581174836432735e-05, + "loss": 0.1915, + "step": 16314 + }, + { + "epoch": 0.8954994511525796, + "grad_norm": 1.1804596185684204, + "learning_rate": 2.157598811488006e-05, + "loss": 0.1917, + "step": 16316 + }, + { + "epoch": 0.895609220636663, + "grad_norm": 2.0606489181518555, + "learning_rate": 2.157080154352134e-05, + "loss": 0.2746, + "step": 16318 + }, + { + "epoch": 0.8957189901207464, + "grad_norm": 1.5041053295135498, + "learning_rate": 2.1565615122584092e-05, + "loss": 0.289, + "step": 16320 + }, + { + "epoch": 0.8958287596048299, + "grad_norm": 1.893246054649353, + "learning_rate": 2.156042885229583e-05, + "loss": 0.2206, + "step": 16322 + }, + { + "epoch": 0.8959385290889132, + "grad_norm": 1.3448835611343384, + "learning_rate": 2.155524273288405e-05, + "loss": 0.3297, + "step": 16324 + }, + { + "epoch": 0.8960482985729967, + "grad_norm": 1.8326060771942139, + "learning_rate": 2.155005676457622e-05, + "loss": 0.3925, + "step": 16326 + }, + { + "epoch": 0.8961580680570801, + "grad_norm": 1.3500491380691528, + "learning_rate": 2.154487094759984e-05, + "loss": 0.1884, + "step": 16328 + }, + { + "epoch": 0.8962678375411636, + "grad_norm": 1.2903345823287964, + "learning_rate": 2.1539685282182378e-05, + "loss": 0.2107, + "step": 16330 + }, + { + "epoch": 0.8963776070252469, + "grad_norm": 2.4507272243499756, + "learning_rate": 2.15344997685513e-05, + "loss": 0.1464, + "step": 16332 + }, + { + "epoch": 0.8964873765093304, + "grad_norm": 1.061566710472107, + "learning_rate": 2.1529314406934078e-05, + "loss": 0.2548, + "step": 16334 + }, + { + "epoch": 0.8965971459934138, + "grad_norm": 1.2340137958526611, + "learning_rate": 2.152412919755816e-05, + "loss": 0.2673, + "step": 16336 + }, + { + "epoch": 0.8967069154774973, + "grad_norm": 0.8835122585296631, + "learning_rate": 2.151894414065099e-05, + "loss": 0.1576, + "step": 16338 + }, + { + "epoch": 0.8968166849615807, + "grad_norm": 1.7348090410232544, + "learning_rate": 2.1513759236440023e-05, + "loss": 0.1834, + "step": 16340 + }, + { + "epoch": 0.8969264544456641, + "grad_norm": 1.7060878276824951, + "learning_rate": 2.1508574485152684e-05, + "loss": 0.2248, + "step": 16342 + }, + { + "epoch": 0.8970362239297476, + "grad_norm": 1.5968587398529053, + "learning_rate": 2.1503389887016404e-05, + "loss": 0.2267, + "step": 16344 + }, + { + "epoch": 0.8971459934138309, + "grad_norm": 1.7026865482330322, + "learning_rate": 2.1498205442258606e-05, + "loss": 0.3363, + "step": 16346 + }, + { + "epoch": 0.8972557628979144, + "grad_norm": 1.0978626012802124, + "learning_rate": 2.1493021151106703e-05, + "loss": 0.3036, + "step": 16348 + }, + { + "epoch": 0.8973655323819978, + "grad_norm": 1.5950968265533447, + "learning_rate": 2.1487837013788106e-05, + "loss": 0.3248, + "step": 16350 + }, + { + "epoch": 0.8974753018660813, + "grad_norm": 2.019258499145508, + "learning_rate": 2.1482653030530217e-05, + "loss": 0.2909, + "step": 16352 + }, + { + "epoch": 0.8975850713501646, + "grad_norm": 1.4697299003601074, + "learning_rate": 2.1477469201560435e-05, + "loss": 0.1634, + "step": 16354 + }, + { + "epoch": 0.8976948408342481, + "grad_norm": 1.6050324440002441, + "learning_rate": 2.1472285527106137e-05, + "loss": 0.2337, + "step": 16356 + }, + { + "epoch": 0.8978046103183315, + "grad_norm": 1.4605714082717896, + "learning_rate": 2.1467102007394715e-05, + "loss": 0.2873, + "step": 16358 + }, + { + "epoch": 0.897914379802415, + "grad_norm": 1.034104824066162, + "learning_rate": 2.146191864265354e-05, + "loss": 0.1737, + "step": 16360 + }, + { + "epoch": 0.8980241492864983, + "grad_norm": 0.9566481113433838, + "learning_rate": 2.1456735433109975e-05, + "loss": 0.2622, + "step": 16362 + }, + { + "epoch": 0.8981339187705818, + "grad_norm": 1.4667277336120605, + "learning_rate": 2.1451552378991392e-05, + "loss": 0.2478, + "step": 16364 + }, + { + "epoch": 0.8982436882546652, + "grad_norm": 0.9594940543174744, + "learning_rate": 2.1446369480525135e-05, + "loss": 0.1402, + "step": 16366 + }, + { + "epoch": 0.8983534577387486, + "grad_norm": 1.1055129766464233, + "learning_rate": 2.1441186737938555e-05, + "loss": 0.1436, + "step": 16368 + }, + { + "epoch": 0.898463227222832, + "grad_norm": 1.0956940650939941, + "learning_rate": 2.1436004151458995e-05, + "loss": 0.2428, + "step": 16370 + }, + { + "epoch": 0.8985729967069155, + "grad_norm": 1.691962480545044, + "learning_rate": 2.1430821721313782e-05, + "loss": 0.2697, + "step": 16372 + }, + { + "epoch": 0.8986827661909988, + "grad_norm": 2.0254967212677, + "learning_rate": 2.1425639447730246e-05, + "loss": 0.2273, + "step": 16374 + }, + { + "epoch": 0.8987925356750823, + "grad_norm": 1.4232717752456665, + "learning_rate": 2.142045733093571e-05, + "loss": 0.2018, + "step": 16376 + }, + { + "epoch": 0.8989023051591658, + "grad_norm": 1.4827885627746582, + "learning_rate": 2.141527537115749e-05, + "loss": 0.2052, + "step": 16378 + }, + { + "epoch": 0.8990120746432492, + "grad_norm": 1.6669822931289673, + "learning_rate": 2.1410093568622878e-05, + "loss": 0.2505, + "step": 16380 + }, + { + "epoch": 0.8991218441273326, + "grad_norm": 1.255805253982544, + "learning_rate": 2.1404911923559175e-05, + "loss": 0.2461, + "step": 16382 + }, + { + "epoch": 0.899231613611416, + "grad_norm": 2.804495096206665, + "learning_rate": 2.1399730436193697e-05, + "loss": 0.2612, + "step": 16384 + }, + { + "epoch": 0.8993413830954995, + "grad_norm": 1.5393381118774414, + "learning_rate": 2.1394549106753708e-05, + "loss": 0.1794, + "step": 16386 + }, + { + "epoch": 0.8994511525795829, + "grad_norm": 1.6370868682861328, + "learning_rate": 2.138936793546649e-05, + "loss": 0.3418, + "step": 16388 + }, + { + "epoch": 0.8995609220636663, + "grad_norm": 2.3355979919433594, + "learning_rate": 2.138418692255932e-05, + "loss": 0.2672, + "step": 16390 + }, + { + "epoch": 0.8996706915477497, + "grad_norm": 1.2996034622192383, + "learning_rate": 2.137900606825946e-05, + "loss": 0.2037, + "step": 16392 + }, + { + "epoch": 0.8997804610318332, + "grad_norm": 1.4396097660064697, + "learning_rate": 2.137382537279416e-05, + "loss": 0.2759, + "step": 16394 + }, + { + "epoch": 0.8998902305159165, + "grad_norm": 1.6409335136413574, + "learning_rate": 2.1368644836390684e-05, + "loss": 0.2691, + "step": 16396 + }, + { + "epoch": 0.9, + "grad_norm": 1.1567237377166748, + "learning_rate": 2.1363464459276264e-05, + "loss": 0.1864, + "step": 16398 + }, + { + "epoch": 0.9001097694840834, + "grad_norm": 1.559913992881775, + "learning_rate": 2.1358284241678146e-05, + "loss": 0.2636, + "step": 16400 + }, + { + "epoch": 0.9002195389681669, + "grad_norm": 1.0568922758102417, + "learning_rate": 2.135310418382356e-05, + "loss": 0.2582, + "step": 16402 + }, + { + "epoch": 0.9003293084522502, + "grad_norm": 1.36655592918396, + "learning_rate": 2.1347924285939714e-05, + "loss": 0.3028, + "step": 16404 + }, + { + "epoch": 0.9004390779363337, + "grad_norm": 1.6511114835739136, + "learning_rate": 2.134274454825384e-05, + "loss": 0.2429, + "step": 16406 + }, + { + "epoch": 0.9005488474204171, + "grad_norm": 1.9579898118972778, + "learning_rate": 2.1337564970993145e-05, + "loss": 0.2627, + "step": 16408 + }, + { + "epoch": 0.9006586169045006, + "grad_norm": 1.2828510999679565, + "learning_rate": 2.1332385554384814e-05, + "loss": 0.2264, + "step": 16410 + }, + { + "epoch": 0.9007683863885839, + "grad_norm": 0.9602496027946472, + "learning_rate": 2.1327206298656056e-05, + "loss": 0.1432, + "step": 16412 + }, + { + "epoch": 0.9008781558726674, + "grad_norm": 1.522099494934082, + "learning_rate": 2.1322027204034066e-05, + "loss": 0.2411, + "step": 16414 + }, + { + "epoch": 0.9009879253567509, + "grad_norm": 1.5926100015640259, + "learning_rate": 2.1316848270746015e-05, + "loss": 0.2132, + "step": 16416 + }, + { + "epoch": 0.9010976948408342, + "grad_norm": 1.2683628797531128, + "learning_rate": 2.131166949901907e-05, + "loss": 0.3011, + "step": 16418 + }, + { + "epoch": 0.9012074643249177, + "grad_norm": 1.8296802043914795, + "learning_rate": 2.130649088908041e-05, + "loss": 0.3527, + "step": 16420 + }, + { + "epoch": 0.9013172338090011, + "grad_norm": 1.0488700866699219, + "learning_rate": 2.130131244115719e-05, + "loss": 0.1706, + "step": 16422 + }, + { + "epoch": 0.9014270032930846, + "grad_norm": 0.9709923267364502, + "learning_rate": 2.129613415547655e-05, + "loss": 0.2064, + "step": 16424 + }, + { + "epoch": 0.9015367727771679, + "grad_norm": 2.319329261779785, + "learning_rate": 2.1290956032265656e-05, + "loss": 0.2793, + "step": 16426 + }, + { + "epoch": 0.9016465422612514, + "grad_norm": 1.2168362140655518, + "learning_rate": 2.1285778071751634e-05, + "loss": 0.2793, + "step": 16428 + }, + { + "epoch": 0.9017563117453348, + "grad_norm": 1.1911345720291138, + "learning_rate": 2.1280600274161615e-05, + "loss": 0.3125, + "step": 16430 + }, + { + "epoch": 0.9018660812294182, + "grad_norm": 1.3598394393920898, + "learning_rate": 2.1275422639722724e-05, + "loss": 0.2045, + "step": 16432 + }, + { + "epoch": 0.9019758507135016, + "grad_norm": 1.3929671049118042, + "learning_rate": 2.127024516866208e-05, + "loss": 0.2331, + "step": 16434 + }, + { + "epoch": 0.9020856201975851, + "grad_norm": 1.3311879634857178, + "learning_rate": 2.1265067861206784e-05, + "loss": 0.2182, + "step": 16436 + }, + { + "epoch": 0.9021953896816685, + "grad_norm": 1.3078330755233765, + "learning_rate": 2.1259890717583947e-05, + "loss": 0.2882, + "step": 16438 + }, + { + "epoch": 0.9023051591657519, + "grad_norm": 1.4328373670578003, + "learning_rate": 2.1254713738020658e-05, + "loss": 0.283, + "step": 16440 + }, + { + "epoch": 0.9024149286498353, + "grad_norm": 2.5803933143615723, + "learning_rate": 2.1249536922744007e-05, + "loss": 0.2739, + "step": 16442 + }, + { + "epoch": 0.9025246981339188, + "grad_norm": 1.5883488655090332, + "learning_rate": 2.1244360271981073e-05, + "loss": 0.2374, + "step": 16444 + }, + { + "epoch": 0.9026344676180021, + "grad_norm": 1.6742271184921265, + "learning_rate": 2.123918378595894e-05, + "loss": 0.2303, + "step": 16446 + }, + { + "epoch": 0.9027442371020856, + "grad_norm": 1.6542080640792847, + "learning_rate": 2.1234007464904654e-05, + "loss": 0.2129, + "step": 16448 + }, + { + "epoch": 0.9028540065861691, + "grad_norm": 1.1565138101577759, + "learning_rate": 2.1228831309045294e-05, + "loss": 0.191, + "step": 16450 + }, + { + "epoch": 0.9029637760702525, + "grad_norm": 1.6856977939605713, + "learning_rate": 2.1223655318607904e-05, + "loss": 0.2795, + "step": 16452 + }, + { + "epoch": 0.9030735455543359, + "grad_norm": 1.202186107635498, + "learning_rate": 2.121847949381952e-05, + "loss": 0.256, + "step": 16454 + }, + { + "epoch": 0.9031833150384193, + "grad_norm": 1.9468507766723633, + "learning_rate": 2.121330383490719e-05, + "loss": 0.302, + "step": 16456 + }, + { + "epoch": 0.9032930845225028, + "grad_norm": 1.5318900346755981, + "learning_rate": 2.1208128342097943e-05, + "loss": 0.2734, + "step": 16458 + }, + { + "epoch": 0.9034028540065862, + "grad_norm": 1.7816414833068848, + "learning_rate": 2.1202953015618794e-05, + "loss": 0.2699, + "step": 16460 + }, + { + "epoch": 0.9035126234906696, + "grad_norm": 1.724407434463501, + "learning_rate": 2.1197777855696765e-05, + "loss": 0.2584, + "step": 16462 + }, + { + "epoch": 0.903622392974753, + "grad_norm": 0.9787769913673401, + "learning_rate": 2.1192602862558864e-05, + "loss": 0.1483, + "step": 16464 + }, + { + "epoch": 0.9037321624588365, + "grad_norm": 0.6955295205116272, + "learning_rate": 2.1187428036432083e-05, + "loss": 0.1497, + "step": 16466 + }, + { + "epoch": 0.9038419319429198, + "grad_norm": 1.3676164150238037, + "learning_rate": 2.1182253377543425e-05, + "loss": 0.1714, + "step": 16468 + }, + { + "epoch": 0.9039517014270033, + "grad_norm": 0.9319619536399841, + "learning_rate": 2.1177078886119876e-05, + "loss": 0.2289, + "step": 16470 + }, + { + "epoch": 0.9040614709110867, + "grad_norm": 1.1989060640335083, + "learning_rate": 2.11719045623884e-05, + "loss": 0.2351, + "step": 16472 + }, + { + "epoch": 0.9041712403951702, + "grad_norm": 1.1532635688781738, + "learning_rate": 2.116673040657598e-05, + "loss": 0.1676, + "step": 16474 + }, + { + "epoch": 0.9042810098792535, + "grad_norm": 1.0902400016784668, + "learning_rate": 2.116155641890959e-05, + "loss": 0.2139, + "step": 16476 + }, + { + "epoch": 0.904390779363337, + "grad_norm": 2.1506903171539307, + "learning_rate": 2.1156382599616172e-05, + "loss": 0.2336, + "step": 16478 + }, + { + "epoch": 0.9045005488474204, + "grad_norm": 1.2747236490249634, + "learning_rate": 2.1151208948922676e-05, + "loss": 0.2632, + "step": 16480 + }, + { + "epoch": 0.9046103183315038, + "grad_norm": 0.9968788623809814, + "learning_rate": 2.1146035467056054e-05, + "loss": 0.2676, + "step": 16482 + }, + { + "epoch": 0.9047200878155872, + "grad_norm": 1.1269495487213135, + "learning_rate": 2.114086215424322e-05, + "loss": 0.2384, + "step": 16484 + }, + { + "epoch": 0.9048298572996707, + "grad_norm": 1.4648348093032837, + "learning_rate": 2.1135689010711123e-05, + "loss": 0.2233, + "step": 16486 + }, + { + "epoch": 0.9049396267837542, + "grad_norm": 1.684369683265686, + "learning_rate": 2.1130516036686675e-05, + "loss": 0.2371, + "step": 16488 + }, + { + "epoch": 0.9050493962678375, + "grad_norm": 0.9039198160171509, + "learning_rate": 2.112534323239678e-05, + "loss": 0.1715, + "step": 16490 + }, + { + "epoch": 0.905159165751921, + "grad_norm": 0.8660351634025574, + "learning_rate": 2.112017059806835e-05, + "loss": 0.1669, + "step": 16492 + }, + { + "epoch": 0.9052689352360044, + "grad_norm": 1.122564435005188, + "learning_rate": 2.1114998133928287e-05, + "loss": 0.1837, + "step": 16494 + }, + { + "epoch": 0.9053787047200879, + "grad_norm": 1.4094104766845703, + "learning_rate": 2.1109825840203464e-05, + "loss": 0.2041, + "step": 16496 + }, + { + "epoch": 0.9054884742041712, + "grad_norm": 1.2416738271713257, + "learning_rate": 2.1104653717120783e-05, + "loss": 0.2093, + "step": 16498 + }, + { + "epoch": 0.9055982436882547, + "grad_norm": 1.1197723150253296, + "learning_rate": 2.1099481764907108e-05, + "loss": 0.2159, + "step": 16500 + }, + { + "epoch": 0.9057080131723381, + "grad_norm": 0.9747914671897888, + "learning_rate": 2.1094309983789297e-05, + "loss": 0.1616, + "step": 16502 + }, + { + "epoch": 0.9058177826564215, + "grad_norm": 0.9536772966384888, + "learning_rate": 2.1089138373994223e-05, + "loss": 0.137, + "step": 16504 + }, + { + "epoch": 0.9059275521405049, + "grad_norm": 1.8035048246383667, + "learning_rate": 2.1083966935748746e-05, + "loss": 0.2692, + "step": 16506 + }, + { + "epoch": 0.9060373216245884, + "grad_norm": 0.9926894903182983, + "learning_rate": 2.10787956692797e-05, + "loss": 0.1573, + "step": 16508 + }, + { + "epoch": 0.9061470911086718, + "grad_norm": 1.5204461812973022, + "learning_rate": 2.1073624574813912e-05, + "loss": 0.2591, + "step": 16510 + }, + { + "epoch": 0.9062568605927552, + "grad_norm": 1.3785970211029053, + "learning_rate": 2.106845365257823e-05, + "loss": 0.1494, + "step": 16512 + }, + { + "epoch": 0.9063666300768386, + "grad_norm": 1.1136932373046875, + "learning_rate": 2.1063282902799468e-05, + "loss": 0.1583, + "step": 16514 + }, + { + "epoch": 0.9064763995609221, + "grad_norm": 1.763782024383545, + "learning_rate": 2.1058112325704436e-05, + "loss": 0.3781, + "step": 16516 + }, + { + "epoch": 0.9065861690450054, + "grad_norm": 1.3113112449645996, + "learning_rate": 2.105294192151995e-05, + "loss": 0.2794, + "step": 16518 + }, + { + "epoch": 0.9066959385290889, + "grad_norm": 0.9726749062538147, + "learning_rate": 2.1047771690472804e-05, + "loss": 0.2052, + "step": 16520 + }, + { + "epoch": 0.9068057080131723, + "grad_norm": 1.5194700956344604, + "learning_rate": 2.1042601632789784e-05, + "loss": 0.1659, + "step": 16522 + }, + { + "epoch": 0.9069154774972558, + "grad_norm": 2.654261589050293, + "learning_rate": 2.1037431748697688e-05, + "loss": 0.2918, + "step": 16524 + }, + { + "epoch": 0.9070252469813392, + "grad_norm": 1.5440120697021484, + "learning_rate": 2.103226203842328e-05, + "loss": 0.2861, + "step": 16526 + }, + { + "epoch": 0.9071350164654226, + "grad_norm": 1.6625317335128784, + "learning_rate": 2.1027092502193334e-05, + "loss": 0.2496, + "step": 16528 + }, + { + "epoch": 0.9072447859495061, + "grad_norm": 1.0842968225479126, + "learning_rate": 2.1021923140234618e-05, + "loss": 0.1611, + "step": 16530 + }, + { + "epoch": 0.9073545554335894, + "grad_norm": 2.030789375305176, + "learning_rate": 2.1016753952773867e-05, + "loss": 0.3732, + "step": 16532 + }, + { + "epoch": 0.9074643249176729, + "grad_norm": 1.0631046295166016, + "learning_rate": 2.1011584940037836e-05, + "loss": 0.2262, + "step": 16534 + }, + { + "epoch": 0.9075740944017563, + "grad_norm": 1.1010420322418213, + "learning_rate": 2.100641610225328e-05, + "loss": 0.2705, + "step": 16536 + }, + { + "epoch": 0.9076838638858398, + "grad_norm": 1.3781756162643433, + "learning_rate": 2.100124743964691e-05, + "loss": 0.1735, + "step": 16538 + }, + { + "epoch": 0.9077936333699231, + "grad_norm": 1.0732053518295288, + "learning_rate": 2.0996078952445452e-05, + "loss": 0.1965, + "step": 16540 + }, + { + "epoch": 0.9079034028540066, + "grad_norm": 2.3067407608032227, + "learning_rate": 2.099091064087563e-05, + "loss": 0.2334, + "step": 16542 + }, + { + "epoch": 0.90801317233809, + "grad_norm": 1.3099604845046997, + "learning_rate": 2.0985742505164144e-05, + "loss": 0.2119, + "step": 16544 + }, + { + "epoch": 0.9081229418221735, + "grad_norm": 1.3372238874435425, + "learning_rate": 2.0980574545537688e-05, + "loss": 0.1874, + "step": 16546 + }, + { + "epoch": 0.9082327113062568, + "grad_norm": 1.147965669631958, + "learning_rate": 2.0975406762222966e-05, + "loss": 0.1882, + "step": 16548 + }, + { + "epoch": 0.9083424807903403, + "grad_norm": 2.2692244052886963, + "learning_rate": 2.0970239155446658e-05, + "loss": 0.2318, + "step": 16550 + }, + { + "epoch": 0.9084522502744237, + "grad_norm": 1.2016263008117676, + "learning_rate": 2.0965071725435436e-05, + "loss": 0.2473, + "step": 16552 + }, + { + "epoch": 0.9085620197585071, + "grad_norm": 1.960801362991333, + "learning_rate": 2.0959904472415977e-05, + "loss": 0.267, + "step": 16554 + }, + { + "epoch": 0.9086717892425905, + "grad_norm": 1.2851334810256958, + "learning_rate": 2.0954737396614937e-05, + "loss": 0.2062, + "step": 16556 + }, + { + "epoch": 0.908781558726674, + "grad_norm": 2.6874663829803467, + "learning_rate": 2.0949570498258967e-05, + "loss": 0.1908, + "step": 16558 + }, + { + "epoch": 0.9088913282107574, + "grad_norm": 1.4526644945144653, + "learning_rate": 2.0944403777574718e-05, + "loss": 0.1983, + "step": 16560 + }, + { + "epoch": 0.9090010976948408, + "grad_norm": 1.7212603092193604, + "learning_rate": 2.0939237234788818e-05, + "loss": 0.237, + "step": 16562 + }, + { + "epoch": 0.9091108671789243, + "grad_norm": 1.8992323875427246, + "learning_rate": 2.0934070870127912e-05, + "loss": 0.2464, + "step": 16564 + }, + { + "epoch": 0.9092206366630077, + "grad_norm": 1.4282922744750977, + "learning_rate": 2.092890468381861e-05, + "loss": 0.1997, + "step": 16566 + }, + { + "epoch": 0.9093304061470912, + "grad_norm": 1.1739258766174316, + "learning_rate": 2.0923738676087534e-05, + "loss": 0.2742, + "step": 16568 + }, + { + "epoch": 0.9094401756311745, + "grad_norm": 2.0786690711975098, + "learning_rate": 2.091857284716129e-05, + "loss": 0.248, + "step": 16570 + }, + { + "epoch": 0.909549945115258, + "grad_norm": 1.6786680221557617, + "learning_rate": 2.091340719726647e-05, + "loss": 0.2981, + "step": 16572 + }, + { + "epoch": 0.9096597145993414, + "grad_norm": 1.140270709991455, + "learning_rate": 2.090824172662967e-05, + "loss": 0.2156, + "step": 16574 + }, + { + "epoch": 0.9097694840834248, + "grad_norm": 1.5811653137207031, + "learning_rate": 2.0903076435477467e-05, + "loss": 0.2615, + "step": 16576 + }, + { + "epoch": 0.9098792535675082, + "grad_norm": 1.1696796417236328, + "learning_rate": 2.0897911324036444e-05, + "loss": 0.1848, + "step": 16578 + }, + { + "epoch": 0.9099890230515917, + "grad_norm": 1.2425668239593506, + "learning_rate": 2.089274639253317e-05, + "loss": 0.2358, + "step": 16580 + }, + { + "epoch": 0.910098792535675, + "grad_norm": 1.440729022026062, + "learning_rate": 2.088758164119419e-05, + "loss": 0.1899, + "step": 16582 + }, + { + "epoch": 0.9102085620197585, + "grad_norm": 1.7929091453552246, + "learning_rate": 2.088241707024607e-05, + "loss": 0.234, + "step": 16584 + }, + { + "epoch": 0.9103183315038419, + "grad_norm": 1.6694269180297852, + "learning_rate": 2.087725267991535e-05, + "loss": 0.2274, + "step": 16586 + }, + { + "epoch": 0.9104281009879254, + "grad_norm": 1.1091151237487793, + "learning_rate": 2.0872088470428553e-05, + "loss": 0.259, + "step": 16588 + }, + { + "epoch": 0.9105378704720087, + "grad_norm": 1.2038081884384155, + "learning_rate": 2.0866924442012225e-05, + "loss": 0.2282, + "step": 16590 + }, + { + "epoch": 0.9106476399560922, + "grad_norm": 1.914934754371643, + "learning_rate": 2.0861760594892867e-05, + "loss": 0.2242, + "step": 16592 + }, + { + "epoch": 0.9107574094401756, + "grad_norm": 2.372410297393799, + "learning_rate": 2.0856596929297006e-05, + "loss": 0.2752, + "step": 16594 + }, + { + "epoch": 0.9108671789242591, + "grad_norm": 1.707481026649475, + "learning_rate": 2.085143344545114e-05, + "loss": 0.2328, + "step": 16596 + }, + { + "epoch": 0.9109769484083425, + "grad_norm": 1.250362753868103, + "learning_rate": 2.0846270143581773e-05, + "loss": 0.2416, + "step": 16598 + }, + { + "epoch": 0.9110867178924259, + "grad_norm": 1.167681097984314, + "learning_rate": 2.084110702391538e-05, + "loss": 0.2442, + "step": 16600 + }, + { + "epoch": 0.9111964873765094, + "grad_norm": 1.1056715250015259, + "learning_rate": 2.0835944086678445e-05, + "loss": 0.3149, + "step": 16602 + }, + { + "epoch": 0.9113062568605927, + "grad_norm": 1.0259771347045898, + "learning_rate": 2.0830781332097446e-05, + "loss": 0.2465, + "step": 16604 + }, + { + "epoch": 0.9114160263446762, + "grad_norm": 1.0718138217926025, + "learning_rate": 2.082561876039884e-05, + "loss": 0.1444, + "step": 16606 + }, + { + "epoch": 0.9115257958287596, + "grad_norm": 1.2725378274917603, + "learning_rate": 2.0820456371809078e-05, + "loss": 0.2627, + "step": 16608 + }, + { + "epoch": 0.9116355653128431, + "grad_norm": 1.5480685234069824, + "learning_rate": 2.0815294166554623e-05, + "loss": 0.2434, + "step": 16610 + }, + { + "epoch": 0.9117453347969264, + "grad_norm": 1.1452341079711914, + "learning_rate": 2.08101321448619e-05, + "loss": 0.2108, + "step": 16612 + }, + { + "epoch": 0.9118551042810099, + "grad_norm": 1.0907412767410278, + "learning_rate": 2.0804970306957343e-05, + "loss": 0.1979, + "step": 16614 + }, + { + "epoch": 0.9119648737650933, + "grad_norm": 0.8242121934890747, + "learning_rate": 2.079980865306739e-05, + "loss": 0.1738, + "step": 16616 + }, + { + "epoch": 0.9120746432491768, + "grad_norm": 1.1184545755386353, + "learning_rate": 2.079464718341843e-05, + "loss": 0.145, + "step": 16618 + }, + { + "epoch": 0.9121844127332601, + "grad_norm": 1.5518345832824707, + "learning_rate": 2.0789485898236896e-05, + "loss": 0.3189, + "step": 16620 + }, + { + "epoch": 0.9122941822173436, + "grad_norm": 0.9611409902572632, + "learning_rate": 2.0784324797749168e-05, + "loss": 0.2037, + "step": 16622 + }, + { + "epoch": 0.912403951701427, + "grad_norm": 1.6799482107162476, + "learning_rate": 2.0779163882181655e-05, + "loss": 0.2553, + "step": 16624 + }, + { + "epoch": 0.9125137211855104, + "grad_norm": 2.0130395889282227, + "learning_rate": 2.0774003151760724e-05, + "loss": 0.282, + "step": 16626 + }, + { + "epoch": 0.9126234906695938, + "grad_norm": 0.7319064140319824, + "learning_rate": 2.076884260671276e-05, + "loss": 0.1378, + "step": 16628 + }, + { + "epoch": 0.9127332601536773, + "grad_norm": 1.3408445119857788, + "learning_rate": 2.0763682247264137e-05, + "loss": 0.2464, + "step": 16630 + }, + { + "epoch": 0.9128430296377607, + "grad_norm": 1.7875807285308838, + "learning_rate": 2.075852207364119e-05, + "loss": 0.3489, + "step": 16632 + }, + { + "epoch": 0.9129527991218441, + "grad_norm": 0.9317126870155334, + "learning_rate": 2.075336208607029e-05, + "loss": 0.1532, + "step": 16634 + }, + { + "epoch": 0.9130625686059276, + "grad_norm": 1.2825617790222168, + "learning_rate": 2.0748202284777777e-05, + "loss": 0.1709, + "step": 16636 + }, + { + "epoch": 0.913172338090011, + "grad_norm": 1.551279067993164, + "learning_rate": 2.074304266998997e-05, + "loss": 0.2942, + "step": 16638 + }, + { + "epoch": 0.9132821075740944, + "grad_norm": 1.7314680814743042, + "learning_rate": 2.0737883241933213e-05, + "loss": 0.25, + "step": 16640 + }, + { + "epoch": 0.9133918770581778, + "grad_norm": 1.4802024364471436, + "learning_rate": 2.073272400083382e-05, + "loss": 0.2662, + "step": 16642 + }, + { + "epoch": 0.9135016465422613, + "grad_norm": 1.1437244415283203, + "learning_rate": 2.0727564946918087e-05, + "loss": 0.1835, + "step": 16644 + }, + { + "epoch": 0.9136114160263447, + "grad_norm": 1.2685770988464355, + "learning_rate": 2.072240608041233e-05, + "loss": 0.2686, + "step": 16646 + }, + { + "epoch": 0.9137211855104281, + "grad_norm": 0.9742645025253296, + "learning_rate": 2.0717247401542844e-05, + "loss": 0.2472, + "step": 16648 + }, + { + "epoch": 0.9138309549945115, + "grad_norm": 1.511294960975647, + "learning_rate": 2.07120889105359e-05, + "loss": 0.2131, + "step": 16650 + }, + { + "epoch": 0.913940724478595, + "grad_norm": 1.7023483514785767, + "learning_rate": 2.070693060761779e-05, + "loss": 0.304, + "step": 16652 + }, + { + "epoch": 0.9140504939626783, + "grad_norm": 2.36088490486145, + "learning_rate": 2.070177249301476e-05, + "loss": 0.2032, + "step": 16654 + }, + { + "epoch": 0.9141602634467618, + "grad_norm": 0.9741427898406982, + "learning_rate": 2.06966145669531e-05, + "loss": 0.1765, + "step": 16656 + }, + { + "epoch": 0.9142700329308452, + "grad_norm": 0.8963158130645752, + "learning_rate": 2.069145682965904e-05, + "loss": 0.1657, + "step": 16658 + }, + { + "epoch": 0.9143798024149287, + "grad_norm": 1.7017871141433716, + "learning_rate": 2.0686299281358835e-05, + "loss": 0.3522, + "step": 16660 + }, + { + "epoch": 0.914489571899012, + "grad_norm": 1.2607688903808594, + "learning_rate": 2.0681141922278712e-05, + "loss": 0.1627, + "step": 16662 + }, + { + "epoch": 0.9145993413830955, + "grad_norm": 1.684759497642517, + "learning_rate": 2.067598475264491e-05, + "loss": 0.1739, + "step": 16664 + }, + { + "epoch": 0.9147091108671789, + "grad_norm": 1.1479038000106812, + "learning_rate": 2.067082777268364e-05, + "loss": 0.3083, + "step": 16666 + }, + { + "epoch": 0.9148188803512624, + "grad_norm": 1.0088361501693726, + "learning_rate": 2.0665670982621105e-05, + "loss": 0.2205, + "step": 16668 + }, + { + "epoch": 0.9149286498353457, + "grad_norm": 1.4621542692184448, + "learning_rate": 2.0660514382683524e-05, + "loss": 0.3547, + "step": 16670 + }, + { + "epoch": 0.9150384193194292, + "grad_norm": 1.8926312923431396, + "learning_rate": 2.065535797309708e-05, + "loss": 0.2133, + "step": 16672 + }, + { + "epoch": 0.9151481888035127, + "grad_norm": 1.5255582332611084, + "learning_rate": 2.065020175408795e-05, + "loss": 0.2596, + "step": 16674 + }, + { + "epoch": 0.915257958287596, + "grad_norm": 0.8471286296844482, + "learning_rate": 2.0645045725882332e-05, + "loss": 0.1635, + "step": 16676 + }, + { + "epoch": 0.9153677277716795, + "grad_norm": 1.3320236206054688, + "learning_rate": 2.063988988870638e-05, + "loss": 0.2828, + "step": 16678 + }, + { + "epoch": 0.9154774972557629, + "grad_norm": 1.5800930261611938, + "learning_rate": 2.0634734242786257e-05, + "loss": 0.1964, + "step": 16680 + }, + { + "epoch": 0.9155872667398464, + "grad_norm": 1.6938821077346802, + "learning_rate": 2.062957878834812e-05, + "loss": 0.2627, + "step": 16682 + }, + { + "epoch": 0.9156970362239297, + "grad_norm": 1.8965541124343872, + "learning_rate": 2.0624423525618098e-05, + "loss": 0.2712, + "step": 16684 + }, + { + "epoch": 0.9158068057080132, + "grad_norm": 1.968278408050537, + "learning_rate": 2.0619268454822347e-05, + "loss": 0.2869, + "step": 16686 + }, + { + "epoch": 0.9159165751920966, + "grad_norm": 1.5059622526168823, + "learning_rate": 2.0614113576186978e-05, + "loss": 0.222, + "step": 16688 + }, + { + "epoch": 0.91602634467618, + "grad_norm": 1.09673011302948, + "learning_rate": 2.0608958889938118e-05, + "loss": 0.3124, + "step": 16690 + }, + { + "epoch": 0.9161361141602634, + "grad_norm": 1.0393935441970825, + "learning_rate": 2.0603804396301876e-05, + "loss": 0.2356, + "step": 16692 + }, + { + "epoch": 0.9162458836443469, + "grad_norm": 1.2508772611618042, + "learning_rate": 2.0598650095504347e-05, + "loss": 0.2674, + "step": 16694 + }, + { + "epoch": 0.9163556531284303, + "grad_norm": 0.9402973651885986, + "learning_rate": 2.0593495987771634e-05, + "loss": 0.1486, + "step": 16696 + }, + { + "epoch": 0.9164654226125137, + "grad_norm": 1.1617894172668457, + "learning_rate": 2.058834207332981e-05, + "loss": 0.1603, + "step": 16698 + }, + { + "epoch": 0.9165751920965971, + "grad_norm": 1.0244641304016113, + "learning_rate": 2.0583188352404954e-05, + "loss": 0.1528, + "step": 16700 + }, + { + "epoch": 0.9166849615806806, + "grad_norm": 1.3450673818588257, + "learning_rate": 2.057803482522314e-05, + "loss": 0.1779, + "step": 16702 + }, + { + "epoch": 0.916794731064764, + "grad_norm": 1.5373034477233887, + "learning_rate": 2.057288149201042e-05, + "loss": 0.2583, + "step": 16704 + }, + { + "epoch": 0.9169045005488474, + "grad_norm": 1.1503334045410156, + "learning_rate": 2.0567728352992852e-05, + "loss": 0.2024, + "step": 16706 + }, + { + "epoch": 0.9170142700329309, + "grad_norm": 1.2154144048690796, + "learning_rate": 2.056257540839647e-05, + "loss": 0.1765, + "step": 16708 + }, + { + "epoch": 0.9171240395170143, + "grad_norm": 1.4413986206054688, + "learning_rate": 2.055742265844731e-05, + "loss": 0.3222, + "step": 16710 + }, + { + "epoch": 0.9172338090010977, + "grad_norm": 1.1815084218978882, + "learning_rate": 2.05522701033714e-05, + "loss": 0.2006, + "step": 16712 + }, + { + "epoch": 0.9173435784851811, + "grad_norm": 1.3815665245056152, + "learning_rate": 2.0547117743394744e-05, + "loss": 0.2256, + "step": 16714 + }, + { + "epoch": 0.9174533479692646, + "grad_norm": 2.5368168354034424, + "learning_rate": 2.054196557874337e-05, + "loss": 0.2493, + "step": 16716 + }, + { + "epoch": 0.917563117453348, + "grad_norm": 1.5386501550674438, + "learning_rate": 2.053681360964326e-05, + "loss": 0.2095, + "step": 16718 + }, + { + "epoch": 0.9176728869374314, + "grad_norm": 1.788823127746582, + "learning_rate": 2.0531661836320422e-05, + "loss": 0.3211, + "step": 16720 + }, + { + "epoch": 0.9177826564215148, + "grad_norm": 1.0979334115982056, + "learning_rate": 2.0526510259000827e-05, + "loss": 0.1914, + "step": 16722 + }, + { + "epoch": 0.9178924259055983, + "grad_norm": 1.333168864250183, + "learning_rate": 2.0521358877910444e-05, + "loss": 0.242, + "step": 16724 + }, + { + "epoch": 0.9180021953896816, + "grad_norm": 0.9603593349456787, + "learning_rate": 2.0516207693275245e-05, + "loss": 0.2205, + "step": 16726 + }, + { + "epoch": 0.9181119648737651, + "grad_norm": 1.4208967685699463, + "learning_rate": 2.0511056705321185e-05, + "loss": 0.2548, + "step": 16728 + }, + { + "epoch": 0.9182217343578485, + "grad_norm": 0.9997597932815552, + "learning_rate": 2.0505905914274207e-05, + "loss": 0.2255, + "step": 16730 + }, + { + "epoch": 0.918331503841932, + "grad_norm": 1.0205166339874268, + "learning_rate": 2.050075532036026e-05, + "loss": 0.2102, + "step": 16732 + }, + { + "epoch": 0.9184412733260153, + "grad_norm": 1.3585118055343628, + "learning_rate": 2.0495604923805268e-05, + "loss": 0.3295, + "step": 16734 + }, + { + "epoch": 0.9185510428100988, + "grad_norm": 1.787887454032898, + "learning_rate": 2.0490454724835147e-05, + "loss": 0.2707, + "step": 16736 + }, + { + "epoch": 0.9186608122941822, + "grad_norm": 1.6983773708343506, + "learning_rate": 2.048530472367582e-05, + "loss": 0.2878, + "step": 16738 + }, + { + "epoch": 0.9187705817782656, + "grad_norm": 1.732344627380371, + "learning_rate": 2.0480154920553186e-05, + "loss": 0.2135, + "step": 16740 + }, + { + "epoch": 0.918880351262349, + "grad_norm": 1.5326874256134033, + "learning_rate": 2.047500531569314e-05, + "loss": 0.2195, + "step": 16742 + }, + { + "epoch": 0.9189901207464325, + "grad_norm": 1.0762985944747925, + "learning_rate": 2.0469855909321564e-05, + "loss": 0.2802, + "step": 16744 + }, + { + "epoch": 0.919099890230516, + "grad_norm": 1.1848593950271606, + "learning_rate": 2.0464706701664354e-05, + "loss": 0.2514, + "step": 16746 + }, + { + "epoch": 0.9192096597145993, + "grad_norm": 1.7661689519882202, + "learning_rate": 2.0459557692947367e-05, + "loss": 0.2022, + "step": 16748 + }, + { + "epoch": 0.9193194291986828, + "grad_norm": 1.4833309650421143, + "learning_rate": 2.0454408883396465e-05, + "loss": 0.1744, + "step": 16750 + }, + { + "epoch": 0.9194291986827662, + "grad_norm": 1.5359270572662354, + "learning_rate": 2.04492602732375e-05, + "loss": 0.247, + "step": 16752 + }, + { + "epoch": 0.9195389681668497, + "grad_norm": 2.0493996143341064, + "learning_rate": 2.0444111862696314e-05, + "loss": 0.2112, + "step": 16754 + }, + { + "epoch": 0.919648737650933, + "grad_norm": 1.6024640798568726, + "learning_rate": 2.0438963651998747e-05, + "loss": 0.2253, + "step": 16756 + }, + { + "epoch": 0.9197585071350165, + "grad_norm": 1.319040060043335, + "learning_rate": 2.0433815641370622e-05, + "loss": 0.3249, + "step": 16758 + }, + { + "epoch": 0.9198682766190999, + "grad_norm": 1.4358748197555542, + "learning_rate": 2.042866783103775e-05, + "loss": 0.1899, + "step": 16760 + }, + { + "epoch": 0.9199780461031833, + "grad_norm": 1.766697883605957, + "learning_rate": 2.0423520221225947e-05, + "loss": 0.3222, + "step": 16762 + }, + { + "epoch": 0.9200878155872667, + "grad_norm": 1.8756924867630005, + "learning_rate": 2.0418372812161012e-05, + "loss": 0.238, + "step": 16764 + }, + { + "epoch": 0.9201975850713502, + "grad_norm": 1.5161045789718628, + "learning_rate": 2.0413225604068727e-05, + "loss": 0.2336, + "step": 16766 + }, + { + "epoch": 0.9203073545554336, + "grad_norm": 1.2678120136260986, + "learning_rate": 2.0408078597174886e-05, + "loss": 0.224, + "step": 16768 + }, + { + "epoch": 0.920417124039517, + "grad_norm": 1.7000011205673218, + "learning_rate": 2.0402931791705255e-05, + "loss": 0.2547, + "step": 16770 + }, + { + "epoch": 0.9205268935236004, + "grad_norm": 1.1301026344299316, + "learning_rate": 2.0397785187885598e-05, + "loss": 0.2435, + "step": 16772 + }, + { + "epoch": 0.9206366630076839, + "grad_norm": 1.0284593105316162, + "learning_rate": 2.0392638785941664e-05, + "loss": 0.2731, + "step": 16774 + }, + { + "epoch": 0.9207464324917672, + "grad_norm": 1.1337722539901733, + "learning_rate": 2.038749258609922e-05, + "loss": 0.2371, + "step": 16776 + }, + { + "epoch": 0.9208562019758507, + "grad_norm": 1.1911219358444214, + "learning_rate": 2.0382346588583987e-05, + "loss": 0.1762, + "step": 16778 + }, + { + "epoch": 0.9209659714599341, + "grad_norm": 0.7568132281303406, + "learning_rate": 2.037720079362169e-05, + "loss": 0.1458, + "step": 16780 + }, + { + "epoch": 0.9210757409440176, + "grad_norm": 1.4217337369918823, + "learning_rate": 2.0372055201438067e-05, + "loss": 0.2156, + "step": 16782 + }, + { + "epoch": 0.921185510428101, + "grad_norm": 1.721410870552063, + "learning_rate": 2.0366909812258817e-05, + "loss": 0.2513, + "step": 16784 + }, + { + "epoch": 0.9212952799121844, + "grad_norm": 0.8135682344436646, + "learning_rate": 2.0361764626309636e-05, + "loss": 0.1741, + "step": 16786 + }, + { + "epoch": 0.9214050493962679, + "grad_norm": 1.5994352102279663, + "learning_rate": 2.0356619643816234e-05, + "loss": 0.1761, + "step": 16788 + }, + { + "epoch": 0.9215148188803512, + "grad_norm": 1.3180783987045288, + "learning_rate": 2.035147486500428e-05, + "loss": 0.2891, + "step": 16790 + }, + { + "epoch": 0.9216245883644347, + "grad_norm": 1.229097604751587, + "learning_rate": 2.034633029009945e-05, + "loss": 0.2454, + "step": 16792 + }, + { + "epoch": 0.9217343578485181, + "grad_norm": 1.2067152261734009, + "learning_rate": 2.0341185919327423e-05, + "loss": 0.2277, + "step": 16794 + }, + { + "epoch": 0.9218441273326016, + "grad_norm": 0.9602715969085693, + "learning_rate": 2.0336041752913843e-05, + "loss": 0.1919, + "step": 16796 + }, + { + "epoch": 0.9219538968166849, + "grad_norm": 1.7780088186264038, + "learning_rate": 2.033089779108437e-05, + "loss": 0.3086, + "step": 16798 + }, + { + "epoch": 0.9220636663007684, + "grad_norm": 1.5597691535949707, + "learning_rate": 2.032575403406463e-05, + "loss": 0.1738, + "step": 16800 + }, + { + "epoch": 0.9221734357848518, + "grad_norm": 1.196648120880127, + "learning_rate": 2.0320610482080264e-05, + "loss": 0.1801, + "step": 16802 + }, + { + "epoch": 0.9222832052689353, + "grad_norm": 0.9214540123939514, + "learning_rate": 2.031546713535688e-05, + "loss": 0.2453, + "step": 16804 + }, + { + "epoch": 0.9223929747530186, + "grad_norm": 1.1280637979507446, + "learning_rate": 2.031032399412011e-05, + "loss": 0.225, + "step": 16806 + }, + { + "epoch": 0.9225027442371021, + "grad_norm": 2.1862387657165527, + "learning_rate": 2.0305181058595552e-05, + "loss": 0.3254, + "step": 16808 + }, + { + "epoch": 0.9226125137211855, + "grad_norm": 1.2496845722198486, + "learning_rate": 2.0300038329008792e-05, + "loss": 0.2682, + "step": 16810 + }, + { + "epoch": 0.922722283205269, + "grad_norm": 1.112260341644287, + "learning_rate": 2.029489580558542e-05, + "loss": 0.1533, + "step": 16812 + }, + { + "epoch": 0.9228320526893523, + "grad_norm": 1.540747046470642, + "learning_rate": 2.0289753488551018e-05, + "loss": 0.1935, + "step": 16814 + }, + { + "epoch": 0.9229418221734358, + "grad_norm": 1.9278901815414429, + "learning_rate": 2.0284611378131136e-05, + "loss": 0.2255, + "step": 16816 + }, + { + "epoch": 0.9230515916575192, + "grad_norm": 1.8017834424972534, + "learning_rate": 2.027946947455135e-05, + "loss": 0.2355, + "step": 16818 + }, + { + "epoch": 0.9231613611416026, + "grad_norm": 1.5208103656768799, + "learning_rate": 2.02743277780372e-05, + "loss": 0.3065, + "step": 16820 + }, + { + "epoch": 0.9232711306256861, + "grad_norm": 1.0788962841033936, + "learning_rate": 2.026918628881423e-05, + "loss": 0.2637, + "step": 16822 + }, + { + "epoch": 0.9233809001097695, + "grad_norm": 1.3828626871109009, + "learning_rate": 2.0264045007107973e-05, + "loss": 0.2562, + "step": 16824 + }, + { + "epoch": 0.923490669593853, + "grad_norm": 2.3175947666168213, + "learning_rate": 2.0258903933143948e-05, + "loss": 0.3952, + "step": 16826 + }, + { + "epoch": 0.9236004390779363, + "grad_norm": 1.2900148630142212, + "learning_rate": 2.0253763067147657e-05, + "loss": 0.1859, + "step": 16828 + }, + { + "epoch": 0.9237102085620198, + "grad_norm": 1.4011274576187134, + "learning_rate": 2.024862240934462e-05, + "loss": 0.3619, + "step": 16830 + }, + { + "epoch": 0.9238199780461032, + "grad_norm": 1.0779736042022705, + "learning_rate": 2.0243481959960327e-05, + "loss": 0.2306, + "step": 16832 + }, + { + "epoch": 0.9239297475301866, + "grad_norm": 1.1678640842437744, + "learning_rate": 2.0238341719220254e-05, + "loss": 0.2767, + "step": 16834 + }, + { + "epoch": 0.92403951701427, + "grad_norm": 2.08482027053833, + "learning_rate": 2.0233201687349887e-05, + "loss": 0.2113, + "step": 16836 + }, + { + "epoch": 0.9241492864983535, + "grad_norm": 1.262610673904419, + "learning_rate": 2.0228061864574695e-05, + "loss": 0.2627, + "step": 16838 + }, + { + "epoch": 0.9242590559824369, + "grad_norm": 1.1803638935089111, + "learning_rate": 2.022292225112013e-05, + "loss": 0.1547, + "step": 16840 + }, + { + "epoch": 0.9243688254665203, + "grad_norm": 1.5448228120803833, + "learning_rate": 2.0217782847211643e-05, + "loss": 0.1608, + "step": 16842 + }, + { + "epoch": 0.9244785949506037, + "grad_norm": 1.1131609678268433, + "learning_rate": 2.021264365307468e-05, + "loss": 0.2451, + "step": 16844 + }, + { + "epoch": 0.9245883644346872, + "grad_norm": 1.5058331489562988, + "learning_rate": 2.020750466893465e-05, + "loss": 0.2501, + "step": 16846 + }, + { + "epoch": 0.9246981339187705, + "grad_norm": 2.0063204765319824, + "learning_rate": 2.0202365895017e-05, + "loss": 0.4086, + "step": 16848 + }, + { + "epoch": 0.924807903402854, + "grad_norm": 1.0712536573410034, + "learning_rate": 2.0197227331547128e-05, + "loss": 0.2711, + "step": 16850 + }, + { + "epoch": 0.9249176728869374, + "grad_norm": 1.1869292259216309, + "learning_rate": 2.0192088978750433e-05, + "loss": 0.2058, + "step": 16852 + }, + { + "epoch": 0.9250274423710209, + "grad_norm": 1.0038262605667114, + "learning_rate": 2.018695083685232e-05, + "loss": 0.174, + "step": 16854 + }, + { + "epoch": 0.9251372118551043, + "grad_norm": 1.420252799987793, + "learning_rate": 2.0181812906078164e-05, + "loss": 0.2295, + "step": 16856 + }, + { + "epoch": 0.9252469813391877, + "grad_norm": 1.7632863521575928, + "learning_rate": 2.017667518665334e-05, + "loss": 0.2511, + "step": 16858 + }, + { + "epoch": 0.9253567508232712, + "grad_norm": 1.0855333805084229, + "learning_rate": 2.0171537678803225e-05, + "loss": 0.155, + "step": 16860 + }, + { + "epoch": 0.9254665203073545, + "grad_norm": 1.2097680568695068, + "learning_rate": 2.0166400382753163e-05, + "loss": 0.1939, + "step": 16862 + }, + { + "epoch": 0.925576289791438, + "grad_norm": 1.0945230722427368, + "learning_rate": 2.0161263298728495e-05, + "loss": 0.2552, + "step": 16864 + }, + { + "epoch": 0.9256860592755214, + "grad_norm": 1.1718597412109375, + "learning_rate": 2.0156126426954573e-05, + "loss": 0.2047, + "step": 16866 + }, + { + "epoch": 0.9257958287596049, + "grad_norm": 1.6186599731445312, + "learning_rate": 2.0150989767656728e-05, + "loss": 0.2727, + "step": 16868 + }, + { + "epoch": 0.9259055982436882, + "grad_norm": 1.442978024482727, + "learning_rate": 2.014585332106027e-05, + "loss": 0.282, + "step": 16870 + }, + { + "epoch": 0.9260153677277717, + "grad_norm": 1.695927619934082, + "learning_rate": 2.014071708739051e-05, + "loss": 0.3912, + "step": 16872 + }, + { + "epoch": 0.9261251372118551, + "grad_norm": 1.129112720489502, + "learning_rate": 2.013558106687275e-05, + "loss": 0.1425, + "step": 16874 + }, + { + "epoch": 0.9262349066959386, + "grad_norm": 1.663652777671814, + "learning_rate": 2.0130445259732285e-05, + "loss": 0.3249, + "step": 16876 + }, + { + "epoch": 0.9263446761800219, + "grad_norm": 1.6915943622589111, + "learning_rate": 2.012530966619438e-05, + "loss": 0.2607, + "step": 16878 + }, + { + "epoch": 0.9264544456641054, + "grad_norm": 1.430191993713379, + "learning_rate": 2.012017428648433e-05, + "loss": 0.227, + "step": 16880 + }, + { + "epoch": 0.9265642151481888, + "grad_norm": 0.8036177754402161, + "learning_rate": 2.011503912082738e-05, + "loss": 0.1582, + "step": 16882 + }, + { + "epoch": 0.9266739846322722, + "grad_norm": 1.982994556427002, + "learning_rate": 2.01099041694488e-05, + "loss": 0.2533, + "step": 16884 + }, + { + "epoch": 0.9267837541163556, + "grad_norm": 1.2534669637680054, + "learning_rate": 2.010476943257382e-05, + "loss": 0.3418, + "step": 16886 + }, + { + "epoch": 0.9268935236004391, + "grad_norm": 1.0576475858688354, + "learning_rate": 2.0099634910427678e-05, + "loss": 0.1508, + "step": 16888 + }, + { + "epoch": 0.9270032930845225, + "grad_norm": 1.5223323106765747, + "learning_rate": 2.0094500603235607e-05, + "loss": 0.2998, + "step": 16890 + }, + { + "epoch": 0.9271130625686059, + "grad_norm": 1.0674792528152466, + "learning_rate": 2.0089366511222813e-05, + "loss": 0.3017, + "step": 16892 + }, + { + "epoch": 0.9272228320526894, + "grad_norm": 1.1910678148269653, + "learning_rate": 2.0084232634614503e-05, + "loss": 0.2505, + "step": 16894 + }, + { + "epoch": 0.9273326015367728, + "grad_norm": 1.4830806255340576, + "learning_rate": 2.0079098973635872e-05, + "loss": 0.1983, + "step": 16896 + }, + { + "epoch": 0.9274423710208562, + "grad_norm": 1.0820451974868774, + "learning_rate": 2.0073965528512125e-05, + "loss": 0.1815, + "step": 16898 + }, + { + "epoch": 0.9275521405049396, + "grad_norm": 0.8610121011734009, + "learning_rate": 2.0068832299468428e-05, + "loss": 0.2113, + "step": 16900 + }, + { + "epoch": 0.9276619099890231, + "grad_norm": 1.9292055368423462, + "learning_rate": 2.006369928672995e-05, + "loss": 0.2837, + "step": 16902 + }, + { + "epoch": 0.9277716794731065, + "grad_norm": 0.9248585104942322, + "learning_rate": 2.0058566490521847e-05, + "loss": 0.166, + "step": 16904 + }, + { + "epoch": 0.9278814489571899, + "grad_norm": 1.503425121307373, + "learning_rate": 2.0053433911069275e-05, + "loss": 0.171, + "step": 16906 + }, + { + "epoch": 0.9279912184412733, + "grad_norm": 1.4325109720230103, + "learning_rate": 2.0048301548597363e-05, + "loss": 0.2256, + "step": 16908 + }, + { + "epoch": 0.9281009879253568, + "grad_norm": 1.9414827823638916, + "learning_rate": 2.0043169403331262e-05, + "loss": 0.3234, + "step": 16910 + }, + { + "epoch": 0.9282107574094401, + "grad_norm": 1.6849217414855957, + "learning_rate": 2.0038037475496075e-05, + "loss": 0.3548, + "step": 16912 + }, + { + "epoch": 0.9283205268935236, + "grad_norm": 1.9520471096038818, + "learning_rate": 2.0032905765316914e-05, + "loss": 0.3096, + "step": 16914 + }, + { + "epoch": 0.928430296377607, + "grad_norm": 1.2494022846221924, + "learning_rate": 2.0027774273018892e-05, + "loss": 0.174, + "step": 16916 + }, + { + "epoch": 0.9285400658616905, + "grad_norm": 1.2106690406799316, + "learning_rate": 2.0022642998827094e-05, + "loss": 0.4238, + "step": 16918 + }, + { + "epoch": 0.9286498353457738, + "grad_norm": 0.8940424919128418, + "learning_rate": 2.00175119429666e-05, + "loss": 0.1123, + "step": 16920 + }, + { + "epoch": 0.9287596048298573, + "grad_norm": 1.1529929637908936, + "learning_rate": 2.0012381105662495e-05, + "loss": 0.298, + "step": 16922 + }, + { + "epoch": 0.9288693743139407, + "grad_norm": 0.9356910586357117, + "learning_rate": 2.000725048713983e-05, + "loss": 0.1561, + "step": 16924 + }, + { + "epoch": 0.9289791437980242, + "grad_norm": 2.2212862968444824, + "learning_rate": 2.0002120087623662e-05, + "loss": 0.2849, + "step": 16926 + }, + { + "epoch": 0.9290889132821075, + "grad_norm": 1.1054129600524902, + "learning_rate": 1.999698990733904e-05, + "loss": 0.1496, + "step": 16928 + }, + { + "epoch": 0.929198682766191, + "grad_norm": 0.9688382744789124, + "learning_rate": 1.9991859946511e-05, + "loss": 0.1595, + "step": 16930 + }, + { + "epoch": 0.9293084522502745, + "grad_norm": 1.6453092098236084, + "learning_rate": 1.998673020536456e-05, + "loss": 0.2234, + "step": 16932 + }, + { + "epoch": 0.9294182217343578, + "grad_norm": 2.747161388397217, + "learning_rate": 1.9981600684124746e-05, + "loss": 0.3105, + "step": 16934 + }, + { + "epoch": 0.9295279912184413, + "grad_norm": 1.6476927995681763, + "learning_rate": 1.9976471383016557e-05, + "loss": 0.2564, + "step": 16936 + }, + { + "epoch": 0.9296377607025247, + "grad_norm": 1.5873790979385376, + "learning_rate": 1.9971342302264982e-05, + "loss": 0.259, + "step": 16938 + }, + { + "epoch": 0.9297475301866082, + "grad_norm": 1.251145601272583, + "learning_rate": 1.9966213442095028e-05, + "loss": 0.2933, + "step": 16940 + }, + { + "epoch": 0.9298572996706915, + "grad_norm": 1.4840630292892456, + "learning_rate": 1.9961084802731654e-05, + "loss": 0.2252, + "step": 16942 + }, + { + "epoch": 0.929967069154775, + "grad_norm": 1.6296130418777466, + "learning_rate": 1.9955956384399828e-05, + "loss": 0.2437, + "step": 16944 + }, + { + "epoch": 0.9300768386388584, + "grad_norm": 1.2449296712875366, + "learning_rate": 1.995082818732452e-05, + "loss": 0.323, + "step": 16946 + }, + { + "epoch": 0.9301866081229418, + "grad_norm": 0.9558162093162537, + "learning_rate": 1.994570021173067e-05, + "loss": 0.1393, + "step": 16948 + }, + { + "epoch": 0.9302963776070252, + "grad_norm": 1.3146400451660156, + "learning_rate": 1.994057245784321e-05, + "loss": 0.1978, + "step": 16950 + }, + { + "epoch": 0.9304061470911087, + "grad_norm": 1.355965256690979, + "learning_rate": 1.9935444925887082e-05, + "loss": 0.3357, + "step": 16952 + }, + { + "epoch": 0.9305159165751921, + "grad_norm": 1.1886720657348633, + "learning_rate": 1.9930317616087196e-05, + "loss": 0.2043, + "step": 16954 + }, + { + "epoch": 0.9306256860592755, + "grad_norm": 1.0889595746994019, + "learning_rate": 1.992519052866845e-05, + "loss": 0.1731, + "step": 16956 + }, + { + "epoch": 0.9307354555433589, + "grad_norm": 1.4760611057281494, + "learning_rate": 1.9920063663855772e-05, + "loss": 0.2293, + "step": 16958 + }, + { + "epoch": 0.9308452250274424, + "grad_norm": 1.1835918426513672, + "learning_rate": 1.9914937021874032e-05, + "loss": 0.2007, + "step": 16960 + }, + { + "epoch": 0.9309549945115257, + "grad_norm": 1.155435562133789, + "learning_rate": 1.9909810602948116e-05, + "loss": 0.1742, + "step": 16962 + }, + { + "epoch": 0.9310647639956092, + "grad_norm": 1.3289620876312256, + "learning_rate": 1.9904684407302883e-05, + "loss": 0.2003, + "step": 16964 + }, + { + "epoch": 0.9311745334796927, + "grad_norm": 1.7021671533584595, + "learning_rate": 1.9899558435163207e-05, + "loss": 0.2929, + "step": 16966 + }, + { + "epoch": 0.9312843029637761, + "grad_norm": 1.071193814277649, + "learning_rate": 1.989443268675393e-05, + "loss": 0.1374, + "step": 16968 + }, + { + "epoch": 0.9313940724478595, + "grad_norm": 1.4264458417892456, + "learning_rate": 1.9889307162299897e-05, + "loss": 0.1421, + "step": 16970 + }, + { + "epoch": 0.9315038419319429, + "grad_norm": 2.530095338821411, + "learning_rate": 1.988418186202594e-05, + "loss": 0.1674, + "step": 16972 + }, + { + "epoch": 0.9316136114160264, + "grad_norm": 1.4264417886734009, + "learning_rate": 1.9879056786156866e-05, + "loss": 0.3806, + "step": 16974 + }, + { + "epoch": 0.9317233809001098, + "grad_norm": 2.5471410751342773, + "learning_rate": 1.9873931934917506e-05, + "loss": 0.2061, + "step": 16976 + }, + { + "epoch": 0.9318331503841932, + "grad_norm": 1.5795798301696777, + "learning_rate": 1.986880730853265e-05, + "loss": 0.1635, + "step": 16978 + }, + { + "epoch": 0.9319429198682766, + "grad_norm": 1.8749070167541504, + "learning_rate": 1.9863682907227088e-05, + "loss": 0.2719, + "step": 16980 + }, + { + "epoch": 0.9320526893523601, + "grad_norm": 1.5614995956420898, + "learning_rate": 1.985855873122561e-05, + "loss": 0.2349, + "step": 16982 + }, + { + "epoch": 0.9321624588364434, + "grad_norm": 1.157348871231079, + "learning_rate": 1.9853434780752973e-05, + "loss": 0.3032, + "step": 16984 + }, + { + "epoch": 0.9322722283205269, + "grad_norm": 1.065163254737854, + "learning_rate": 1.9848311056033948e-05, + "loss": 0.1888, + "step": 16986 + }, + { + "epoch": 0.9323819978046103, + "grad_norm": 0.814180850982666, + "learning_rate": 1.9843187557293284e-05, + "loss": 0.1479, + "step": 16988 + }, + { + "epoch": 0.9324917672886938, + "grad_norm": 1.3915847539901733, + "learning_rate": 1.983806428475573e-05, + "loss": 0.185, + "step": 16990 + }, + { + "epoch": 0.9326015367727771, + "grad_norm": 2.2947731018066406, + "learning_rate": 1.983294123864602e-05, + "loss": 0.214, + "step": 16992 + }, + { + "epoch": 0.9327113062568606, + "grad_norm": 1.1636332273483276, + "learning_rate": 1.9827818419188856e-05, + "loss": 0.233, + "step": 16994 + }, + { + "epoch": 0.932821075740944, + "grad_norm": 1.29899263381958, + "learning_rate": 1.9822695826608972e-05, + "loss": 0.1961, + "step": 16996 + }, + { + "epoch": 0.9329308452250274, + "grad_norm": 1.5403436422348022, + "learning_rate": 1.981757346113106e-05, + "loss": 0.271, + "step": 16998 + }, + { + "epoch": 0.9330406147091108, + "grad_norm": 1.5002939701080322, + "learning_rate": 1.9812451322979805e-05, + "loss": 0.1304, + "step": 17000 + }, + { + "epoch": 0.9331503841931943, + "grad_norm": 0.950350284576416, + "learning_rate": 1.9807329412379903e-05, + "loss": 0.1298, + "step": 17002 + }, + { + "epoch": 0.9332601536772778, + "grad_norm": 1.1040829420089722, + "learning_rate": 1.980220772955602e-05, + "loss": 0.245, + "step": 17004 + }, + { + "epoch": 0.9333699231613611, + "grad_norm": 1.7414133548736572, + "learning_rate": 1.9797086274732818e-05, + "loss": 0.2414, + "step": 17006 + }, + { + "epoch": 0.9334796926454446, + "grad_norm": 1.3752931356430054, + "learning_rate": 1.979196504813495e-05, + "loss": 0.2515, + "step": 17008 + }, + { + "epoch": 0.933589462129528, + "grad_norm": 1.0742735862731934, + "learning_rate": 1.9786844049987052e-05, + "loss": 0.1516, + "step": 17010 + }, + { + "epoch": 0.9336992316136115, + "grad_norm": 1.5134912729263306, + "learning_rate": 1.9781723280513768e-05, + "loss": 0.2133, + "step": 17012 + }, + { + "epoch": 0.9338090010976948, + "grad_norm": 1.8940401077270508, + "learning_rate": 1.9776602739939714e-05, + "loss": 0.2088, + "step": 17014 + }, + { + "epoch": 0.9339187705817783, + "grad_norm": 1.2763936519622803, + "learning_rate": 1.977148242848949e-05, + "loss": 0.2204, + "step": 17016 + }, + { + "epoch": 0.9340285400658617, + "grad_norm": 1.126664400100708, + "learning_rate": 1.9766362346387717e-05, + "loss": 0.1811, + "step": 17018 + }, + { + "epoch": 0.9341383095499451, + "grad_norm": 1.0504447221755981, + "learning_rate": 1.9761242493858987e-05, + "loss": 0.2085, + "step": 17020 + }, + { + "epoch": 0.9342480790340285, + "grad_norm": 0.8206415176391602, + "learning_rate": 1.975612287112787e-05, + "loss": 0.1696, + "step": 17022 + }, + { + "epoch": 0.934357848518112, + "grad_norm": 1.1812289953231812, + "learning_rate": 1.975100347841894e-05, + "loss": 0.2957, + "step": 17024 + }, + { + "epoch": 0.9344676180021954, + "grad_norm": 1.2588151693344116, + "learning_rate": 1.9745884315956764e-05, + "loss": 0.2462, + "step": 17026 + }, + { + "epoch": 0.9345773874862788, + "grad_norm": 1.1472963094711304, + "learning_rate": 1.9740765383965893e-05, + "loss": 0.2239, + "step": 17028 + }, + { + "epoch": 0.9346871569703622, + "grad_norm": 1.2808573246002197, + "learning_rate": 1.973564668267086e-05, + "loss": 0.2617, + "step": 17030 + }, + { + "epoch": 0.9347969264544457, + "grad_norm": 4.816305637359619, + "learning_rate": 1.9730528212296208e-05, + "loss": 0.3106, + "step": 17032 + }, + { + "epoch": 0.934906695938529, + "grad_norm": 1.7051811218261719, + "learning_rate": 1.972540997306645e-05, + "loss": 0.2677, + "step": 17034 + }, + { + "epoch": 0.9350164654226125, + "grad_norm": 1.3653392791748047, + "learning_rate": 1.9720291965206095e-05, + "loss": 0.2365, + "step": 17036 + }, + { + "epoch": 0.9351262349066959, + "grad_norm": 1.4274005889892578, + "learning_rate": 1.9715174188939657e-05, + "loss": 0.3394, + "step": 17038 + }, + { + "epoch": 0.9352360043907794, + "grad_norm": 1.0161250829696655, + "learning_rate": 1.9710056644491614e-05, + "loss": 0.214, + "step": 17040 + }, + { + "epoch": 0.9353457738748628, + "grad_norm": 1.3416926860809326, + "learning_rate": 1.970493933208645e-05, + "loss": 0.2233, + "step": 17042 + }, + { + "epoch": 0.9354555433589462, + "grad_norm": 0.905860424041748, + "learning_rate": 1.969982225194864e-05, + "loss": 0.1914, + "step": 17044 + }, + { + "epoch": 0.9355653128430297, + "grad_norm": 1.694493055343628, + "learning_rate": 1.9694705404302632e-05, + "loss": 0.1537, + "step": 17046 + }, + { + "epoch": 0.935675082327113, + "grad_norm": 1.8420711755752563, + "learning_rate": 1.9689588789372896e-05, + "loss": 0.2289, + "step": 17048 + }, + { + "epoch": 0.9357848518111965, + "grad_norm": 2.7674193382263184, + "learning_rate": 1.968447240738385e-05, + "loss": 0.3801, + "step": 17050 + }, + { + "epoch": 0.9358946212952799, + "grad_norm": 1.0637673139572144, + "learning_rate": 1.9679356258559944e-05, + "loss": 0.2541, + "step": 17052 + }, + { + "epoch": 0.9360043907793634, + "grad_norm": 1.0853073596954346, + "learning_rate": 1.9674240343125588e-05, + "loss": 0.1844, + "step": 17054 + }, + { + "epoch": 0.9361141602634467, + "grad_norm": 0.9679655432701111, + "learning_rate": 1.9669124661305185e-05, + "loss": 0.2665, + "step": 17056 + }, + { + "epoch": 0.9362239297475302, + "grad_norm": 1.722578525543213, + "learning_rate": 1.9664009213323145e-05, + "loss": 0.3074, + "step": 17058 + }, + { + "epoch": 0.9363336992316136, + "grad_norm": 1.0833262205123901, + "learning_rate": 1.9658893999403847e-05, + "loss": 0.1764, + "step": 17060 + }, + { + "epoch": 0.9364434687156971, + "grad_norm": 1.125285267829895, + "learning_rate": 1.9653779019771678e-05, + "loss": 0.2526, + "step": 17062 + }, + { + "epoch": 0.9365532381997804, + "grad_norm": 1.216549038887024, + "learning_rate": 1.9648664274651e-05, + "loss": 0.2251, + "step": 17064 + }, + { + "epoch": 0.9366630076838639, + "grad_norm": 1.8253134489059448, + "learning_rate": 1.9643549764266173e-05, + "loss": 0.1279, + "step": 17066 + }, + { + "epoch": 0.9367727771679473, + "grad_norm": 1.3858145475387573, + "learning_rate": 1.9638435488841546e-05, + "loss": 0.2393, + "step": 17068 + }, + { + "epoch": 0.9368825466520307, + "grad_norm": 2.766947031021118, + "learning_rate": 1.9633321448601454e-05, + "loss": 0.2173, + "step": 17070 + }, + { + "epoch": 0.9369923161361141, + "grad_norm": 1.1416221857070923, + "learning_rate": 1.9628207643770223e-05, + "loss": 0.2077, + "step": 17072 + }, + { + "epoch": 0.9371020856201976, + "grad_norm": 1.2852377891540527, + "learning_rate": 1.9623094074572173e-05, + "loss": 0.2534, + "step": 17074 + }, + { + "epoch": 0.937211855104281, + "grad_norm": 1.110273838043213, + "learning_rate": 1.96179807412316e-05, + "loss": 0.2107, + "step": 17076 + }, + { + "epoch": 0.9373216245883644, + "grad_norm": 2.2271690368652344, + "learning_rate": 1.9612867643972817e-05, + "loss": 0.1201, + "step": 17078 + }, + { + "epoch": 0.9374313940724479, + "grad_norm": 1.3720966577529907, + "learning_rate": 1.9607754783020092e-05, + "loss": 0.2541, + "step": 17080 + }, + { + "epoch": 0.9375411635565313, + "grad_norm": 1.1362733840942383, + "learning_rate": 1.9602642158597716e-05, + "loss": 0.2898, + "step": 17082 + }, + { + "epoch": 0.9376509330406148, + "grad_norm": 2.0159988403320312, + "learning_rate": 1.959752977092995e-05, + "loss": 0.356, + "step": 17084 + }, + { + "epoch": 0.9377607025246981, + "grad_norm": 1.6999742984771729, + "learning_rate": 1.9592417620241037e-05, + "loss": 0.2582, + "step": 17086 + }, + { + "epoch": 0.9378704720087816, + "grad_norm": 1.6098930835723877, + "learning_rate": 1.9587305706755236e-05, + "loss": 0.26, + "step": 17088 + }, + { + "epoch": 0.937980241492865, + "grad_norm": 1.2648040056228638, + "learning_rate": 1.9582194030696776e-05, + "loss": 0.2047, + "step": 17090 + }, + { + "epoch": 0.9380900109769484, + "grad_norm": 1.2720937728881836, + "learning_rate": 1.957708259228987e-05, + "loss": 0.1803, + "step": 17092 + }, + { + "epoch": 0.9381997804610318, + "grad_norm": 3.3018958568573, + "learning_rate": 1.9571971391758744e-05, + "loss": 0.2156, + "step": 17094 + }, + { + "epoch": 0.9383095499451153, + "grad_norm": 1.0099987983703613, + "learning_rate": 1.9566860429327595e-05, + "loss": 0.2954, + "step": 17096 + }, + { + "epoch": 0.9384193194291987, + "grad_norm": 1.2466100454330444, + "learning_rate": 1.956174970522061e-05, + "loss": 0.2532, + "step": 17098 + }, + { + "epoch": 0.9385290889132821, + "grad_norm": 1.375240683555603, + "learning_rate": 1.955663921966198e-05, + "loss": 0.2098, + "step": 17100 + }, + { + "epoch": 0.9386388583973655, + "grad_norm": 2.2097091674804688, + "learning_rate": 1.9551528972875867e-05, + "loss": 0.1709, + "step": 17102 + }, + { + "epoch": 0.938748627881449, + "grad_norm": 1.233736276626587, + "learning_rate": 1.9546418965086442e-05, + "loss": 0.2744, + "step": 17104 + }, + { + "epoch": 0.9388583973655323, + "grad_norm": 1.200587272644043, + "learning_rate": 1.9541309196517837e-05, + "loss": 0.167, + "step": 17106 + }, + { + "epoch": 0.9389681668496158, + "grad_norm": 1.5764986276626587, + "learning_rate": 1.9536199667394215e-05, + "loss": 0.294, + "step": 17108 + }, + { + "epoch": 0.9390779363336992, + "grad_norm": 0.7813524603843689, + "learning_rate": 1.9531090377939687e-05, + "loss": 0.1832, + "step": 17110 + }, + { + "epoch": 0.9391877058177827, + "grad_norm": 1.5071594715118408, + "learning_rate": 1.9525981328378384e-05, + "loss": 0.2406, + "step": 17112 + }, + { + "epoch": 0.9392974753018661, + "grad_norm": 1.3838716745376587, + "learning_rate": 1.9520872518934408e-05, + "loss": 0.2037, + "step": 17114 + }, + { + "epoch": 0.9394072447859495, + "grad_norm": 1.0713698863983154, + "learning_rate": 1.951576394983185e-05, + "loss": 0.2373, + "step": 17116 + }, + { + "epoch": 0.939517014270033, + "grad_norm": 1.324499487876892, + "learning_rate": 1.951065562129481e-05, + "loss": 0.1945, + "step": 17118 + }, + { + "epoch": 0.9396267837541163, + "grad_norm": 0.7599047422409058, + "learning_rate": 1.9505547533547358e-05, + "loss": 0.1755, + "step": 17120 + }, + { + "epoch": 0.9397365532381998, + "grad_norm": 1.7316601276397705, + "learning_rate": 1.9500439686813556e-05, + "loss": 0.2263, + "step": 17122 + }, + { + "epoch": 0.9398463227222832, + "grad_norm": 2.092182159423828, + "learning_rate": 1.9495332081317464e-05, + "loss": 0.3035, + "step": 17124 + }, + { + "epoch": 0.9399560922063667, + "grad_norm": 1.7174829244613647, + "learning_rate": 1.949022471728313e-05, + "loss": 0.2225, + "step": 17126 + }, + { + "epoch": 0.94006586169045, + "grad_norm": 2.80183482170105, + "learning_rate": 1.9485117594934574e-05, + "loss": 0.1664, + "step": 17128 + }, + { + "epoch": 0.9401756311745335, + "grad_norm": 1.0736134052276611, + "learning_rate": 1.9480010714495836e-05, + "loss": 0.1941, + "step": 17130 + }, + { + "epoch": 0.9402854006586169, + "grad_norm": 1.013445258140564, + "learning_rate": 1.947490407619092e-05, + "loss": 0.2419, + "step": 17132 + }, + { + "epoch": 0.9403951701427004, + "grad_norm": 1.4652818441390991, + "learning_rate": 1.9469797680243828e-05, + "loss": 0.2548, + "step": 17134 + }, + { + "epoch": 0.9405049396267837, + "grad_norm": 1.8104197978973389, + "learning_rate": 1.9464691526878555e-05, + "loss": 0.1815, + "step": 17136 + }, + { + "epoch": 0.9406147091108672, + "grad_norm": 1.55990469455719, + "learning_rate": 1.9459585616319073e-05, + "loss": 0.253, + "step": 17138 + }, + { + "epoch": 0.9407244785949506, + "grad_norm": 1.4210947751998901, + "learning_rate": 1.945447994878937e-05, + "loss": 0.1423, + "step": 17140 + }, + { + "epoch": 0.940834248079034, + "grad_norm": 1.4346814155578613, + "learning_rate": 1.944937452451339e-05, + "loss": 0.3086, + "step": 17142 + }, + { + "epoch": 0.9409440175631174, + "grad_norm": 1.0958575010299683, + "learning_rate": 1.9444269343715092e-05, + "loss": 0.1928, + "step": 17144 + }, + { + "epoch": 0.9410537870472009, + "grad_norm": 0.9465155601501465, + "learning_rate": 1.9439164406618404e-05, + "loss": 0.2346, + "step": 17146 + }, + { + "epoch": 0.9411635565312843, + "grad_norm": 2.64056134223938, + "learning_rate": 1.9434059713447265e-05, + "loss": 0.3842, + "step": 17148 + }, + { + "epoch": 0.9412733260153677, + "grad_norm": 1.5103294849395752, + "learning_rate": 1.9428955264425587e-05, + "loss": 0.2456, + "step": 17150 + }, + { + "epoch": 0.9413830954994512, + "grad_norm": 1.4837877750396729, + "learning_rate": 1.942385105977727e-05, + "loss": 0.2094, + "step": 17152 + }, + { + "epoch": 0.9414928649835346, + "grad_norm": 1.218709111213684, + "learning_rate": 1.941874709972622e-05, + "loss": 0.2047, + "step": 17154 + }, + { + "epoch": 0.941602634467618, + "grad_norm": 0.9816610217094421, + "learning_rate": 1.9413643384496316e-05, + "loss": 0.2462, + "step": 17156 + }, + { + "epoch": 0.9417124039517014, + "grad_norm": 1.5560084581375122, + "learning_rate": 1.9408539914311428e-05, + "loss": 0.2645, + "step": 17158 + }, + { + "epoch": 0.9418221734357849, + "grad_norm": 1.7087875604629517, + "learning_rate": 1.9403436689395426e-05, + "loss": 0.1649, + "step": 17160 + }, + { + "epoch": 0.9419319429198683, + "grad_norm": 1.717740535736084, + "learning_rate": 1.9398333709972165e-05, + "loss": 0.1306, + "step": 17162 + }, + { + "epoch": 0.9420417124039517, + "grad_norm": 0.8666294813156128, + "learning_rate": 1.9393230976265473e-05, + "loss": 0.1603, + "step": 17164 + }, + { + "epoch": 0.9421514818880351, + "grad_norm": 1.9220175743103027, + "learning_rate": 1.93881284884992e-05, + "loss": 0.2405, + "step": 17166 + }, + { + "epoch": 0.9422612513721186, + "grad_norm": 1.0582466125488281, + "learning_rate": 1.9383026246897143e-05, + "loss": 0.2261, + "step": 17168 + }, + { + "epoch": 0.942371020856202, + "grad_norm": 2.5031886100769043, + "learning_rate": 1.9377924251683133e-05, + "loss": 0.2642, + "step": 17170 + }, + { + "epoch": 0.9424807903402854, + "grad_norm": 1.1725995540618896, + "learning_rate": 1.9372822503080957e-05, + "loss": 0.2589, + "step": 17172 + }, + { + "epoch": 0.9425905598243688, + "grad_norm": 1.0392593145370483, + "learning_rate": 1.9367721001314414e-05, + "loss": 0.1172, + "step": 17174 + }, + { + "epoch": 0.9427003293084523, + "grad_norm": 1.9020787477493286, + "learning_rate": 1.936261974660727e-05, + "loss": 0.2119, + "step": 17176 + }, + { + "epoch": 0.9428100987925356, + "grad_norm": 1.5061452388763428, + "learning_rate": 1.9357518739183286e-05, + "loss": 0.2583, + "step": 17178 + }, + { + "epoch": 0.9429198682766191, + "grad_norm": 1.2614915370941162, + "learning_rate": 1.9352417979266233e-05, + "loss": 0.2873, + "step": 17180 + }, + { + "epoch": 0.9430296377607025, + "grad_norm": 1.4230822324752808, + "learning_rate": 1.9347317467079846e-05, + "loss": 0.1917, + "step": 17182 + }, + { + "epoch": 0.943139407244786, + "grad_norm": 1.1388213634490967, + "learning_rate": 1.9342217202847856e-05, + "loss": 0.2804, + "step": 17184 + }, + { + "epoch": 0.9432491767288693, + "grad_norm": 1.1911298036575317, + "learning_rate": 1.9337117186793992e-05, + "loss": 0.1943, + "step": 17186 + }, + { + "epoch": 0.9433589462129528, + "grad_norm": 1.786365032196045, + "learning_rate": 1.9332017419141962e-05, + "loss": 0.2867, + "step": 17188 + }, + { + "epoch": 0.9434687156970363, + "grad_norm": 1.2984554767608643, + "learning_rate": 1.932691790011547e-05, + "loss": 0.2707, + "step": 17190 + }, + { + "epoch": 0.9435784851811196, + "grad_norm": 1.355178713798523, + "learning_rate": 1.9321818629938208e-05, + "loss": 0.3401, + "step": 17192 + }, + { + "epoch": 0.9436882546652031, + "grad_norm": 1.6673017740249634, + "learning_rate": 1.9316719608833845e-05, + "loss": 0.2541, + "step": 17194 + }, + { + "epoch": 0.9437980241492865, + "grad_norm": 4.209554672241211, + "learning_rate": 1.9311620837026057e-05, + "loss": 0.2179, + "step": 17196 + }, + { + "epoch": 0.94390779363337, + "grad_norm": 2.01912784576416, + "learning_rate": 1.930652231473849e-05, + "loss": 0.2395, + "step": 17198 + }, + { + "epoch": 0.9440175631174533, + "grad_norm": 1.1282880306243896, + "learning_rate": 1.930142404219481e-05, + "loss": 0.1605, + "step": 17200 + }, + { + "epoch": 0.9441273326015368, + "grad_norm": 1.1851083040237427, + "learning_rate": 1.9296326019618636e-05, + "loss": 0.2443, + "step": 17202 + }, + { + "epoch": 0.9442371020856202, + "grad_norm": 1.0877904891967773, + "learning_rate": 1.9291228247233605e-05, + "loss": 0.1215, + "step": 17204 + }, + { + "epoch": 0.9443468715697036, + "grad_norm": 3.3713066577911377, + "learning_rate": 1.9286130725263325e-05, + "loss": 0.1633, + "step": 17206 + }, + { + "epoch": 0.944456641053787, + "grad_norm": 1.4748497009277344, + "learning_rate": 1.9281033453931388e-05, + "loss": 0.1479, + "step": 17208 + }, + { + "epoch": 0.9445664105378705, + "grad_norm": 1.9275915622711182, + "learning_rate": 1.92759364334614e-05, + "loss": 0.3406, + "step": 17210 + }, + { + "epoch": 0.9446761800219539, + "grad_norm": 1.0916537046432495, + "learning_rate": 1.9270839664076936e-05, + "loss": 0.2101, + "step": 17212 + }, + { + "epoch": 0.9447859495060373, + "grad_norm": 1.1141072511672974, + "learning_rate": 1.926574314600156e-05, + "loss": 0.1615, + "step": 17214 + }, + { + "epoch": 0.9448957189901207, + "grad_norm": 0.9775232672691345, + "learning_rate": 1.926064687945884e-05, + "loss": 0.1884, + "step": 17216 + }, + { + "epoch": 0.9450054884742042, + "grad_norm": 0.9976372718811035, + "learning_rate": 1.9255550864672316e-05, + "loss": 0.1571, + "step": 17218 + }, + { + "epoch": 0.9451152579582875, + "grad_norm": 1.6165709495544434, + "learning_rate": 1.9250455101865526e-05, + "loss": 0.17, + "step": 17220 + }, + { + "epoch": 0.945225027442371, + "grad_norm": 1.610524296760559, + "learning_rate": 1.9245359591261996e-05, + "loss": 0.2082, + "step": 17222 + }, + { + "epoch": 0.9453347969264544, + "grad_norm": 1.0026419162750244, + "learning_rate": 1.9240264333085245e-05, + "loss": 0.2796, + "step": 17224 + }, + { + "epoch": 0.9454445664105379, + "grad_norm": 1.247725486755371, + "learning_rate": 1.923516932755876e-05, + "loss": 0.2148, + "step": 17226 + }, + { + "epoch": 0.9455543358946213, + "grad_norm": 1.2478574514389038, + "learning_rate": 1.9230074574906042e-05, + "loss": 0.2024, + "step": 17228 + }, + { + "epoch": 0.9456641053787047, + "grad_norm": 1.1987000703811646, + "learning_rate": 1.9224980075350586e-05, + "loss": 0.1898, + "step": 17230 + }, + { + "epoch": 0.9457738748627882, + "grad_norm": 1.9264758825302124, + "learning_rate": 1.9219885829115843e-05, + "loss": 0.2752, + "step": 17232 + }, + { + "epoch": 0.9458836443468716, + "grad_norm": 1.594122052192688, + "learning_rate": 1.921479183642528e-05, + "loss": 0.2559, + "step": 17234 + }, + { + "epoch": 0.945993413830955, + "grad_norm": 1.5596548318862915, + "learning_rate": 1.920969809750234e-05, + "loss": 0.2026, + "step": 17236 + }, + { + "epoch": 0.9461031833150384, + "grad_norm": 1.6589449644088745, + "learning_rate": 1.9204604612570463e-05, + "loss": 0.2276, + "step": 17238 + }, + { + "epoch": 0.9462129527991219, + "grad_norm": 1.031783103942871, + "learning_rate": 1.9199511381853076e-05, + "loss": 0.145, + "step": 17240 + }, + { + "epoch": 0.9463227222832052, + "grad_norm": 0.9310327172279358, + "learning_rate": 1.9194418405573588e-05, + "loss": 0.1173, + "step": 17242 + }, + { + "epoch": 0.9464324917672887, + "grad_norm": 1.6549266576766968, + "learning_rate": 1.91893256839554e-05, + "loss": 0.2395, + "step": 17244 + }, + { + "epoch": 0.9465422612513721, + "grad_norm": 0.883405327796936, + "learning_rate": 1.9184233217221916e-05, + "loss": 0.1619, + "step": 17246 + }, + { + "epoch": 0.9466520307354556, + "grad_norm": 2.304750680923462, + "learning_rate": 1.9179141005596505e-05, + "loss": 0.2127, + "step": 17248 + }, + { + "epoch": 0.9467618002195389, + "grad_norm": 1.7815155982971191, + "learning_rate": 1.917404904930254e-05, + "loss": 0.2278, + "step": 17250 + }, + { + "epoch": 0.9468715697036224, + "grad_norm": 1.914420485496521, + "learning_rate": 1.916895734856338e-05, + "loss": 0.2441, + "step": 17252 + }, + { + "epoch": 0.9469813391877058, + "grad_norm": 1.2464988231658936, + "learning_rate": 1.9163865903602374e-05, + "loss": 0.2682, + "step": 17254 + }, + { + "epoch": 0.9470911086717893, + "grad_norm": 2.1013150215148926, + "learning_rate": 1.9158774714642845e-05, + "loss": 0.2791, + "step": 17256 + }, + { + "epoch": 0.9472008781558726, + "grad_norm": 0.8386157751083374, + "learning_rate": 1.9153683781908127e-05, + "loss": 0.2094, + "step": 17258 + }, + { + "epoch": 0.9473106476399561, + "grad_norm": 0.9724357724189758, + "learning_rate": 1.914859310562154e-05, + "loss": 0.125, + "step": 17260 + }, + { + "epoch": 0.9474204171240396, + "grad_norm": 1.8956998586654663, + "learning_rate": 1.9143502686006383e-05, + "loss": 0.3238, + "step": 17262 + }, + { + "epoch": 0.9475301866081229, + "grad_norm": 1.276647686958313, + "learning_rate": 1.9138412523285936e-05, + "loss": 0.2275, + "step": 17264 + }, + { + "epoch": 0.9476399560922064, + "grad_norm": 1.6150932312011719, + "learning_rate": 1.91333226176835e-05, + "loss": 0.2044, + "step": 17266 + }, + { + "epoch": 0.9477497255762898, + "grad_norm": 1.0639195442199707, + "learning_rate": 1.9128232969422315e-05, + "loss": 0.127, + "step": 17268 + }, + { + "epoch": 0.9478594950603733, + "grad_norm": 1.508305549621582, + "learning_rate": 1.9123143578725657e-05, + "loss": 0.3243, + "step": 17270 + }, + { + "epoch": 0.9479692645444566, + "grad_norm": 1.017277479171753, + "learning_rate": 1.9118054445816767e-05, + "loss": 0.284, + "step": 17272 + }, + { + "epoch": 0.9480790340285401, + "grad_norm": 1.4561984539031982, + "learning_rate": 1.9112965570918884e-05, + "loss": 0.2538, + "step": 17274 + }, + { + "epoch": 0.9481888035126235, + "grad_norm": 0.9840224385261536, + "learning_rate": 1.9107876954255217e-05, + "loss": 0.2148, + "step": 17276 + }, + { + "epoch": 0.948298572996707, + "grad_norm": 1.284846544265747, + "learning_rate": 1.9102788596048994e-05, + "loss": 0.1841, + "step": 17278 + }, + { + "epoch": 0.9484083424807903, + "grad_norm": 1.8575471639633179, + "learning_rate": 1.9097700496523404e-05, + "loss": 0.3367, + "step": 17280 + }, + { + "epoch": 0.9485181119648738, + "grad_norm": 1.0519477128982544, + "learning_rate": 1.9092612655901647e-05, + "loss": 0.1373, + "step": 17282 + }, + { + "epoch": 0.9486278814489572, + "grad_norm": 2.620269298553467, + "learning_rate": 1.908752507440689e-05, + "loss": 0.2444, + "step": 17284 + }, + { + "epoch": 0.9487376509330406, + "grad_norm": 1.1093659400939941, + "learning_rate": 1.90824377522623e-05, + "loss": 0.2326, + "step": 17286 + }, + { + "epoch": 0.948847420417124, + "grad_norm": 1.7667092084884644, + "learning_rate": 1.9077350689691032e-05, + "loss": 0.3025, + "step": 17288 + }, + { + "epoch": 0.9489571899012075, + "grad_norm": 1.9422982931137085, + "learning_rate": 1.9072263886916238e-05, + "loss": 0.2289, + "step": 17290 + }, + { + "epoch": 0.9490669593852908, + "grad_norm": 1.6244785785675049, + "learning_rate": 1.906717734416105e-05, + "loss": 0.2446, + "step": 17292 + }, + { + "epoch": 0.9491767288693743, + "grad_norm": 0.9486223459243774, + "learning_rate": 1.9062091061648576e-05, + "loss": 0.1136, + "step": 17294 + }, + { + "epoch": 0.9492864983534577, + "grad_norm": 1.411002516746521, + "learning_rate": 1.905700503960194e-05, + "loss": 0.2529, + "step": 17296 + }, + { + "epoch": 0.9493962678375412, + "grad_norm": 1.1041275262832642, + "learning_rate": 1.9051919278244233e-05, + "loss": 0.1882, + "step": 17298 + }, + { + "epoch": 0.9495060373216246, + "grad_norm": 1.8699579238891602, + "learning_rate": 1.9046833777798533e-05, + "loss": 0.2516, + "step": 17300 + }, + { + "epoch": 0.949615806805708, + "grad_norm": 0.9532637596130371, + "learning_rate": 1.904174853848793e-05, + "loss": 0.1239, + "step": 17302 + }, + { + "epoch": 0.9497255762897915, + "grad_norm": 1.1420115232467651, + "learning_rate": 1.9036663560535483e-05, + "loss": 0.2221, + "step": 17304 + }, + { + "epoch": 0.9498353457738749, + "grad_norm": 1.3714030981063843, + "learning_rate": 1.9031578844164235e-05, + "loss": 0.1859, + "step": 17306 + }, + { + "epoch": 0.9499451152579583, + "grad_norm": 1.2020196914672852, + "learning_rate": 1.9026494389597238e-05, + "loss": 0.1776, + "step": 17308 + }, + { + "epoch": 0.9500548847420417, + "grad_norm": 1.4173822402954102, + "learning_rate": 1.902141019705752e-05, + "loss": 0.2226, + "step": 17310 + }, + { + "epoch": 0.9501646542261252, + "grad_norm": 1.8218508958816528, + "learning_rate": 1.9016326266768088e-05, + "loss": 0.3586, + "step": 17312 + }, + { + "epoch": 0.9502744237102085, + "grad_norm": 1.435328483581543, + "learning_rate": 1.9011242598951962e-05, + "loss": 0.1835, + "step": 17314 + }, + { + "epoch": 0.950384193194292, + "grad_norm": 2.449146032333374, + "learning_rate": 1.9006159193832125e-05, + "loss": 0.185, + "step": 17316 + }, + { + "epoch": 0.9504939626783754, + "grad_norm": 2.0589044094085693, + "learning_rate": 1.9001076051631562e-05, + "loss": 0.2032, + "step": 17318 + }, + { + "epoch": 0.9506037321624589, + "grad_norm": 1.34407377243042, + "learning_rate": 1.8995993172573253e-05, + "loss": 0.1817, + "step": 17320 + }, + { + "epoch": 0.9507135016465422, + "grad_norm": 1.0108416080474854, + "learning_rate": 1.899091055688015e-05, + "loss": 0.1352, + "step": 17322 + }, + { + "epoch": 0.9508232711306257, + "grad_norm": 0.7747324109077454, + "learning_rate": 1.8985828204775206e-05, + "loss": 0.0951, + "step": 17324 + }, + { + "epoch": 0.9509330406147091, + "grad_norm": 1.1673637628555298, + "learning_rate": 1.898074611648136e-05, + "loss": 0.1903, + "step": 17326 + }, + { + "epoch": 0.9510428100987925, + "grad_norm": 1.124523639678955, + "learning_rate": 1.8975664292221532e-05, + "loss": 0.2589, + "step": 17328 + }, + { + "epoch": 0.9511525795828759, + "grad_norm": 1.904670238494873, + "learning_rate": 1.8970582732218632e-05, + "loss": 0.2496, + "step": 17330 + }, + { + "epoch": 0.9512623490669594, + "grad_norm": 1.2522283792495728, + "learning_rate": 1.8965501436695577e-05, + "loss": 0.2641, + "step": 17332 + }, + { + "epoch": 0.9513721185510428, + "grad_norm": 1.247128963470459, + "learning_rate": 1.8960420405875244e-05, + "loss": 0.2427, + "step": 17334 + }, + { + "epoch": 0.9514818880351262, + "grad_norm": 1.3354111909866333, + "learning_rate": 1.8955339639980512e-05, + "loss": 0.1794, + "step": 17336 + }, + { + "epoch": 0.9515916575192097, + "grad_norm": 1.0663180351257324, + "learning_rate": 1.8950259139234254e-05, + "loss": 0.1476, + "step": 17338 + }, + { + "epoch": 0.9517014270032931, + "grad_norm": 1.555719017982483, + "learning_rate": 1.894517890385933e-05, + "loss": 0.2161, + "step": 17340 + }, + { + "epoch": 0.9518111964873766, + "grad_norm": 1.625081181526184, + "learning_rate": 1.894009893407857e-05, + "loss": 0.2242, + "step": 17342 + }, + { + "epoch": 0.9519209659714599, + "grad_norm": 1.5923782587051392, + "learning_rate": 1.893501923011482e-05, + "loss": 0.3061, + "step": 17344 + }, + { + "epoch": 0.9520307354555434, + "grad_norm": 1.4056648015975952, + "learning_rate": 1.8929939792190894e-05, + "loss": 0.1627, + "step": 17346 + }, + { + "epoch": 0.9521405049396268, + "grad_norm": 1.2500251531600952, + "learning_rate": 1.8924860620529594e-05, + "loss": 0.2005, + "step": 17348 + }, + { + "epoch": 0.9522502744237102, + "grad_norm": 1.6205540895462036, + "learning_rate": 1.891978171535373e-05, + "loss": 0.3167, + "step": 17350 + }, + { + "epoch": 0.9523600439077936, + "grad_norm": 1.3816449642181396, + "learning_rate": 1.891470307688609e-05, + "loss": 0.2508, + "step": 17352 + }, + { + "epoch": 0.9524698133918771, + "grad_norm": 2.019594669342041, + "learning_rate": 1.890962470534944e-05, + "loss": 0.2805, + "step": 17354 + }, + { + "epoch": 0.9525795828759605, + "grad_norm": 0.9980028867721558, + "learning_rate": 1.890454660096654e-05, + "loss": 0.0999, + "step": 17356 + }, + { + "epoch": 0.9526893523600439, + "grad_norm": 0.9996972680091858, + "learning_rate": 1.8899468763960147e-05, + "loss": 0.2478, + "step": 17358 + }, + { + "epoch": 0.9527991218441273, + "grad_norm": 1.1431442499160767, + "learning_rate": 1.8894391194552997e-05, + "loss": 0.1529, + "step": 17360 + }, + { + "epoch": 0.9529088913282108, + "grad_norm": 1.2101466655731201, + "learning_rate": 1.8889313892967813e-05, + "loss": 0.2024, + "step": 17362 + }, + { + "epoch": 0.9530186608122941, + "grad_norm": 1.5760639905929565, + "learning_rate": 1.888423685942732e-05, + "loss": 0.2522, + "step": 17364 + }, + { + "epoch": 0.9531284302963776, + "grad_norm": 1.1514918804168701, + "learning_rate": 1.8879160094154215e-05, + "loss": 0.2704, + "step": 17366 + }, + { + "epoch": 0.953238199780461, + "grad_norm": 1.1869851350784302, + "learning_rate": 1.887408359737119e-05, + "loss": 0.1994, + "step": 17368 + }, + { + "epoch": 0.9533479692645445, + "grad_norm": 1.241001844406128, + "learning_rate": 1.8869007369300928e-05, + "loss": 0.2202, + "step": 17370 + }, + { + "epoch": 0.9534577387486279, + "grad_norm": 2.265460252761841, + "learning_rate": 1.886393141016609e-05, + "loss": 0.3047, + "step": 17372 + }, + { + "epoch": 0.9535675082327113, + "grad_norm": 0.7858079075813293, + "learning_rate": 1.8858855720189347e-05, + "loss": 0.1667, + "step": 17374 + }, + { + "epoch": 0.9536772777167948, + "grad_norm": 3.677724599838257, + "learning_rate": 1.8853780299593332e-05, + "loss": 0.2915, + "step": 17376 + }, + { + "epoch": 0.9537870472008781, + "grad_norm": 1.8445624113082886, + "learning_rate": 1.884870514860067e-05, + "loss": 0.2353, + "step": 17378 + }, + { + "epoch": 0.9538968166849616, + "grad_norm": 1.0650135278701782, + "learning_rate": 1.8843630267434e-05, + "loss": 0.2082, + "step": 17380 + }, + { + "epoch": 0.954006586169045, + "grad_norm": 1.430629849433899, + "learning_rate": 1.8838555656315923e-05, + "loss": 0.2299, + "step": 17382 + }, + { + "epoch": 0.9541163556531285, + "grad_norm": 1.1499301195144653, + "learning_rate": 1.8833481315469042e-05, + "loss": 0.267, + "step": 17384 + }, + { + "epoch": 0.9542261251372118, + "grad_norm": 1.1335517168045044, + "learning_rate": 1.8828407245115932e-05, + "loss": 0.1746, + "step": 17386 + }, + { + "epoch": 0.9543358946212953, + "grad_norm": 1.0137583017349243, + "learning_rate": 1.8823333445479174e-05, + "loss": 0.2048, + "step": 17388 + }, + { + "epoch": 0.9544456641053787, + "grad_norm": 1.2746491432189941, + "learning_rate": 1.8818259916781333e-05, + "loss": 0.1801, + "step": 17390 + }, + { + "epoch": 0.9545554335894622, + "grad_norm": 1.312308430671692, + "learning_rate": 1.8813186659244943e-05, + "loss": 0.186, + "step": 17392 + }, + { + "epoch": 0.9546652030735455, + "grad_norm": 1.0738141536712646, + "learning_rate": 1.8808113673092566e-05, + "loss": 0.2748, + "step": 17394 + }, + { + "epoch": 0.954774972557629, + "grad_norm": 1.193565011024475, + "learning_rate": 1.8803040958546707e-05, + "loss": 0.328, + "step": 17396 + }, + { + "epoch": 0.9548847420417124, + "grad_norm": 1.188749074935913, + "learning_rate": 1.8797968515829886e-05, + "loss": 0.4083, + "step": 17398 + }, + { + "epoch": 0.9549945115257958, + "grad_norm": 0.9803271293640137, + "learning_rate": 1.879289634516461e-05, + "loss": 0.262, + "step": 17400 + }, + { + "epoch": 0.9551042810098792, + "grad_norm": 1.4233014583587646, + "learning_rate": 1.878782444677337e-05, + "loss": 0.1935, + "step": 17402 + }, + { + "epoch": 0.9552140504939627, + "grad_norm": 1.395903468132019, + "learning_rate": 1.8782752820878634e-05, + "loss": 0.1718, + "step": 17404 + }, + { + "epoch": 0.955323819978046, + "grad_norm": 1.1784757375717163, + "learning_rate": 1.8777681467702882e-05, + "loss": 0.2135, + "step": 17406 + }, + { + "epoch": 0.9554335894621295, + "grad_norm": 0.6775175929069519, + "learning_rate": 1.8772610387468555e-05, + "loss": 0.2199, + "step": 17408 + }, + { + "epoch": 0.955543358946213, + "grad_norm": 1.4925181865692139, + "learning_rate": 1.87675395803981e-05, + "loss": 0.1303, + "step": 17410 + }, + { + "epoch": 0.9556531284302964, + "grad_norm": 1.7387694120407104, + "learning_rate": 1.8762469046713956e-05, + "loss": 0.2048, + "step": 17412 + }, + { + "epoch": 0.9557628979143798, + "grad_norm": 1.0967522859573364, + "learning_rate": 1.8757398786638535e-05, + "loss": 0.1853, + "step": 17414 + }, + { + "epoch": 0.9558726673984632, + "grad_norm": 1.1850054264068604, + "learning_rate": 1.8752328800394242e-05, + "loss": 0.2501, + "step": 17416 + }, + { + "epoch": 0.9559824368825467, + "grad_norm": 1.2749874591827393, + "learning_rate": 1.874725908820348e-05, + "loss": 0.2972, + "step": 17418 + }, + { + "epoch": 0.9560922063666301, + "grad_norm": 1.7108350992202759, + "learning_rate": 1.8742189650288615e-05, + "loss": 0.1549, + "step": 17420 + }, + { + "epoch": 0.9562019758507135, + "grad_norm": 1.1438968181610107, + "learning_rate": 1.8737120486872033e-05, + "loss": 0.2472, + "step": 17422 + }, + { + "epoch": 0.9563117453347969, + "grad_norm": 1.4613804817199707, + "learning_rate": 1.8732051598176086e-05, + "loss": 0.2554, + "step": 17424 + }, + { + "epoch": 0.9564215148188804, + "grad_norm": 1.0859355926513672, + "learning_rate": 1.872698298442312e-05, + "loss": 0.2269, + "step": 17426 + }, + { + "epoch": 0.9565312843029637, + "grad_norm": 1.1776338815689087, + "learning_rate": 1.872191464583547e-05, + "loss": 0.2713, + "step": 17428 + }, + { + "epoch": 0.9566410537870472, + "grad_norm": 1.088175654411316, + "learning_rate": 1.8716846582635458e-05, + "loss": 0.1817, + "step": 17430 + }, + { + "epoch": 0.9567508232711306, + "grad_norm": 1.189194917678833, + "learning_rate": 1.8711778795045398e-05, + "loss": 0.2765, + "step": 17432 + }, + { + "epoch": 0.9568605927552141, + "grad_norm": 1.4642291069030762, + "learning_rate": 1.8706711283287576e-05, + "loss": 0.2847, + "step": 17434 + }, + { + "epoch": 0.9569703622392974, + "grad_norm": 2.013874053955078, + "learning_rate": 1.8701644047584293e-05, + "loss": 0.271, + "step": 17436 + }, + { + "epoch": 0.9570801317233809, + "grad_norm": 1.904244065284729, + "learning_rate": 1.8696577088157817e-05, + "loss": 0.3085, + "step": 17438 + }, + { + "epoch": 0.9571899012074643, + "grad_norm": 1.4267867803573608, + "learning_rate": 1.86915104052304e-05, + "loss": 0.3285, + "step": 17440 + }, + { + "epoch": 0.9572996706915478, + "grad_norm": 1.5892845392227173, + "learning_rate": 1.8686443999024304e-05, + "loss": 0.3766, + "step": 17442 + }, + { + "epoch": 0.9574094401756311, + "grad_norm": 0.7096275091171265, + "learning_rate": 1.868137786976177e-05, + "loss": 0.1965, + "step": 17444 + }, + { + "epoch": 0.9575192096597146, + "grad_norm": 1.2986788749694824, + "learning_rate": 1.867631201766501e-05, + "loss": 0.2835, + "step": 17446 + }, + { + "epoch": 0.9576289791437981, + "grad_norm": 1.136132836341858, + "learning_rate": 1.8671246442956243e-05, + "loss": 0.2151, + "step": 17448 + }, + { + "epoch": 0.9577387486278814, + "grad_norm": 1.1992275714874268, + "learning_rate": 1.8666181145857677e-05, + "loss": 0.1957, + "step": 17450 + }, + { + "epoch": 0.9578485181119649, + "grad_norm": 1.6734468936920166, + "learning_rate": 1.866111612659149e-05, + "loss": 0.1908, + "step": 17452 + }, + { + "epoch": 0.9579582875960483, + "grad_norm": 1.6091210842132568, + "learning_rate": 1.865605138537986e-05, + "loss": 0.1887, + "step": 17454 + }, + { + "epoch": 0.9580680570801318, + "grad_norm": 0.8055315613746643, + "learning_rate": 1.865098692244496e-05, + "loss": 0.1794, + "step": 17456 + }, + { + "epoch": 0.9581778265642151, + "grad_norm": 1.3532477617263794, + "learning_rate": 1.8645922738008932e-05, + "loss": 0.2469, + "step": 17458 + }, + { + "epoch": 0.9582875960482986, + "grad_norm": 1.2284431457519531, + "learning_rate": 1.864085883229392e-05, + "loss": 0.1553, + "step": 17460 + }, + { + "epoch": 0.958397365532382, + "grad_norm": 1.5813616514205933, + "learning_rate": 1.863579520552206e-05, + "loss": 0.2687, + "step": 17462 + }, + { + "epoch": 0.9585071350164655, + "grad_norm": 1.5270549058914185, + "learning_rate": 1.863073185791545e-05, + "loss": 0.3353, + "step": 17464 + }, + { + "epoch": 0.9586169045005488, + "grad_norm": 0.9838412404060364, + "learning_rate": 1.862566878969621e-05, + "loss": 0.2497, + "step": 17466 + }, + { + "epoch": 0.9587266739846323, + "grad_norm": 0.9252474308013916, + "learning_rate": 1.862060600108642e-05, + "loss": 0.1666, + "step": 17468 + }, + { + "epoch": 0.9588364434687157, + "grad_norm": 1.3584637641906738, + "learning_rate": 1.8615543492308155e-05, + "loss": 0.2356, + "step": 17470 + }, + { + "epoch": 0.9589462129527991, + "grad_norm": 1.3509730100631714, + "learning_rate": 1.8610481263583496e-05, + "loss": 0.1671, + "step": 17472 + }, + { + "epoch": 0.9590559824368825, + "grad_norm": 1.2232437133789062, + "learning_rate": 1.8605419315134493e-05, + "loss": 0.2163, + "step": 17474 + }, + { + "epoch": 0.959165751920966, + "grad_norm": 5.108393669128418, + "learning_rate": 1.8600357647183185e-05, + "loss": 0.2901, + "step": 17476 + }, + { + "epoch": 0.9592755214050493, + "grad_norm": 1.1242308616638184, + "learning_rate": 1.8595296259951596e-05, + "loss": 0.2899, + "step": 17478 + }, + { + "epoch": 0.9593852908891328, + "grad_norm": 1.5993460416793823, + "learning_rate": 1.8590235153661757e-05, + "loss": 0.3227, + "step": 17480 + }, + { + "epoch": 0.9594950603732162, + "grad_norm": 1.4563024044036865, + "learning_rate": 1.8585174328535666e-05, + "loss": 0.2473, + "step": 17482 + }, + { + "epoch": 0.9596048298572997, + "grad_norm": 1.4858438968658447, + "learning_rate": 1.8580113784795305e-05, + "loss": 0.1854, + "step": 17484 + }, + { + "epoch": 0.9597145993413831, + "grad_norm": 1.8211361169815063, + "learning_rate": 1.8575053522662675e-05, + "loss": 0.3216, + "step": 17486 + }, + { + "epoch": 0.9598243688254665, + "grad_norm": 1.2854571342468262, + "learning_rate": 1.856999354235973e-05, + "loss": 0.2426, + "step": 17488 + }, + { + "epoch": 0.95993413830955, + "grad_norm": 1.2986117601394653, + "learning_rate": 1.8564933844108423e-05, + "loss": 0.1623, + "step": 17490 + }, + { + "epoch": 0.9600439077936334, + "grad_norm": 0.8243726491928101, + "learning_rate": 1.8559874428130706e-05, + "loss": 0.2546, + "step": 17492 + }, + { + "epoch": 0.9601536772777168, + "grad_norm": 1.23626708984375, + "learning_rate": 1.8554815294648507e-05, + "loss": 0.1668, + "step": 17494 + }, + { + "epoch": 0.9602634467618002, + "grad_norm": 2.0105361938476562, + "learning_rate": 1.8549756443883746e-05, + "loss": 0.3147, + "step": 17496 + }, + { + "epoch": 0.9603732162458837, + "grad_norm": 1.794820785522461, + "learning_rate": 1.8544697876058324e-05, + "loss": 0.1551, + "step": 17498 + }, + { + "epoch": 0.960482985729967, + "grad_norm": 1.585732102394104, + "learning_rate": 1.8539639591394133e-05, + "loss": 0.2863, + "step": 17500 + }, + { + "epoch": 0.9605927552140505, + "grad_norm": 1.2097899913787842, + "learning_rate": 1.853458159011306e-05, + "loss": 0.2485, + "step": 17502 + }, + { + "epoch": 0.9607025246981339, + "grad_norm": 1.8274080753326416, + "learning_rate": 1.852952387243698e-05, + "loss": 0.2762, + "step": 17504 + }, + { + "epoch": 0.9608122941822174, + "grad_norm": 1.2690949440002441, + "learning_rate": 1.852446643858774e-05, + "loss": 0.3105, + "step": 17506 + }, + { + "epoch": 0.9609220636663007, + "grad_norm": 1.3586363792419434, + "learning_rate": 1.851940928878718e-05, + "loss": 0.3232, + "step": 17508 + }, + { + "epoch": 0.9610318331503842, + "grad_norm": 1.0912928581237793, + "learning_rate": 1.8514352423257148e-05, + "loss": 0.2949, + "step": 17510 + }, + { + "epoch": 0.9611416026344676, + "grad_norm": 1.3859294652938843, + "learning_rate": 1.8509295842219448e-05, + "loss": 0.1563, + "step": 17512 + }, + { + "epoch": 0.961251372118551, + "grad_norm": 1.1456587314605713, + "learning_rate": 1.8504239545895883e-05, + "loss": 0.1873, + "step": 17514 + }, + { + "epoch": 0.9613611416026344, + "grad_norm": 0.9772741794586182, + "learning_rate": 1.8499183534508263e-05, + "loss": 0.2183, + "step": 17516 + }, + { + "epoch": 0.9614709110867179, + "grad_norm": 0.8706521987915039, + "learning_rate": 1.849412780827836e-05, + "loss": 0.1509, + "step": 17518 + }, + { + "epoch": 0.9615806805708014, + "grad_norm": 1.404293179512024, + "learning_rate": 1.848907236742794e-05, + "loss": 0.2312, + "step": 17520 + }, + { + "epoch": 0.9616904500548847, + "grad_norm": 1.1661895513534546, + "learning_rate": 1.8484017212178766e-05, + "loss": 0.1596, + "step": 17522 + }, + { + "epoch": 0.9618002195389682, + "grad_norm": 1.0861437320709229, + "learning_rate": 1.8478962342752583e-05, + "loss": 0.1838, + "step": 17524 + }, + { + "epoch": 0.9619099890230516, + "grad_norm": 2.150294780731201, + "learning_rate": 1.847390775937111e-05, + "loss": 0.2873, + "step": 17526 + }, + { + "epoch": 0.9620197585071351, + "grad_norm": 1.5084234476089478, + "learning_rate": 1.8468853462256085e-05, + "loss": 0.2326, + "step": 17528 + }, + { + "epoch": 0.9621295279912184, + "grad_norm": 1.234790563583374, + "learning_rate": 1.8463799451629187e-05, + "loss": 0.3171, + "step": 17530 + }, + { + "epoch": 0.9622392974753019, + "grad_norm": 1.0781495571136475, + "learning_rate": 1.8458745727712144e-05, + "loss": 0.2061, + "step": 17532 + }, + { + "epoch": 0.9623490669593853, + "grad_norm": 2.158494710922241, + "learning_rate": 1.845369229072661e-05, + "loss": 0.3329, + "step": 17534 + }, + { + "epoch": 0.9624588364434687, + "grad_norm": 2.0008177757263184, + "learning_rate": 1.844863914089427e-05, + "loss": 0.2509, + "step": 17536 + }, + { + "epoch": 0.9625686059275521, + "grad_norm": 1.3417648077011108, + "learning_rate": 1.844358627843677e-05, + "loss": 0.234, + "step": 17538 + }, + { + "epoch": 0.9626783754116356, + "grad_norm": 1.1859021186828613, + "learning_rate": 1.8438533703575754e-05, + "loss": 0.1636, + "step": 17540 + }, + { + "epoch": 0.962788144895719, + "grad_norm": 1.4784265756607056, + "learning_rate": 1.843348141653286e-05, + "loss": 0.2759, + "step": 17542 + }, + { + "epoch": 0.9628979143798024, + "grad_norm": 1.308323860168457, + "learning_rate": 1.84284294175297e-05, + "loss": 0.199, + "step": 17544 + }, + { + "epoch": 0.9630076838638858, + "grad_norm": 1.772317886352539, + "learning_rate": 1.842337770678788e-05, + "loss": 0.2455, + "step": 17546 + }, + { + "epoch": 0.9631174533479693, + "grad_norm": 1.4109522104263306, + "learning_rate": 1.8418326284528996e-05, + "loss": 0.2333, + "step": 17548 + }, + { + "epoch": 0.9632272228320526, + "grad_norm": 0.7138198018074036, + "learning_rate": 1.8413275150974624e-05, + "loss": 0.1152, + "step": 17550 + }, + { + "epoch": 0.9633369923161361, + "grad_norm": 1.5268970727920532, + "learning_rate": 1.8408224306346335e-05, + "loss": 0.347, + "step": 17552 + }, + { + "epoch": 0.9634467618002195, + "grad_norm": 1.597503900527954, + "learning_rate": 1.8403173750865685e-05, + "loss": 0.2326, + "step": 17554 + }, + { + "epoch": 0.963556531284303, + "grad_norm": 1.4960085153579712, + "learning_rate": 1.8398123484754203e-05, + "loss": 0.1724, + "step": 17556 + }, + { + "epoch": 0.9636663007683864, + "grad_norm": 1.3780955076217651, + "learning_rate": 1.839307350823344e-05, + "loss": 0.2548, + "step": 17558 + }, + { + "epoch": 0.9637760702524698, + "grad_norm": 0.9480571746826172, + "learning_rate": 1.838802382152489e-05, + "loss": 0.153, + "step": 17560 + }, + { + "epoch": 0.9638858397365533, + "grad_norm": 1.4881503582000732, + "learning_rate": 1.838297442485008e-05, + "loss": 0.2178, + "step": 17562 + }, + { + "epoch": 0.9639956092206367, + "grad_norm": 3.5333168506622314, + "learning_rate": 1.8377925318430477e-05, + "loss": 0.2513, + "step": 17564 + }, + { + "epoch": 0.9641053787047201, + "grad_norm": 1.9734725952148438, + "learning_rate": 1.8372876502487584e-05, + "loss": 0.2223, + "step": 17566 + }, + { + "epoch": 0.9642151481888035, + "grad_norm": 1.70286226272583, + "learning_rate": 1.8367827977242858e-05, + "loss": 0.2746, + "step": 17568 + }, + { + "epoch": 0.964324917672887, + "grad_norm": 1.0704516172409058, + "learning_rate": 1.836277974291774e-05, + "loss": 0.1962, + "step": 17570 + }, + { + "epoch": 0.9644346871569703, + "grad_norm": 1.0729821920394897, + "learning_rate": 1.8357731799733686e-05, + "loss": 0.2435, + "step": 17572 + }, + { + "epoch": 0.9645444566410538, + "grad_norm": 1.2951548099517822, + "learning_rate": 1.835268414791212e-05, + "loss": 0.2237, + "step": 17574 + }, + { + "epoch": 0.9646542261251372, + "grad_norm": 1.6405892372131348, + "learning_rate": 1.8347636787674442e-05, + "loss": 0.2824, + "step": 17576 + }, + { + "epoch": 0.9647639956092207, + "grad_norm": 1.329848051071167, + "learning_rate": 1.834258971924208e-05, + "loss": 0.1917, + "step": 17578 + }, + { + "epoch": 0.964873765093304, + "grad_norm": 1.4438689947128296, + "learning_rate": 1.8337542942836406e-05, + "loss": 0.1926, + "step": 17580 + }, + { + "epoch": 0.9649835345773875, + "grad_norm": 1.641578197479248, + "learning_rate": 1.8332496458678793e-05, + "loss": 0.2603, + "step": 17582 + }, + { + "epoch": 0.9650933040614709, + "grad_norm": 1.8643460273742676, + "learning_rate": 1.8327450266990616e-05, + "loss": 0.2928, + "step": 17584 + }, + { + "epoch": 0.9652030735455543, + "grad_norm": 1.3872435092926025, + "learning_rate": 1.8322404367993217e-05, + "loss": 0.2155, + "step": 17586 + }, + { + "epoch": 0.9653128430296377, + "grad_norm": 1.0593327283859253, + "learning_rate": 1.8317358761907942e-05, + "loss": 0.2829, + "step": 17588 + }, + { + "epoch": 0.9654226125137212, + "grad_norm": 1.7452778816223145, + "learning_rate": 1.8312313448956105e-05, + "loss": 0.2898, + "step": 17590 + }, + { + "epoch": 0.9655323819978046, + "grad_norm": 1.650743842124939, + "learning_rate": 1.830726842935904e-05, + "loss": 0.1843, + "step": 17592 + }, + { + "epoch": 0.965642151481888, + "grad_norm": 2.4839580059051514, + "learning_rate": 1.8302223703338018e-05, + "loss": 0.3309, + "step": 17594 + }, + { + "epoch": 0.9657519209659715, + "grad_norm": 1.7740135192871094, + "learning_rate": 1.8297179271114346e-05, + "loss": 0.2264, + "step": 17596 + }, + { + "epoch": 0.9658616904500549, + "grad_norm": 1.0018091201782227, + "learning_rate": 1.8292135132909288e-05, + "loss": 0.2748, + "step": 17598 + }, + { + "epoch": 0.9659714599341384, + "grad_norm": 1.2738202810287476, + "learning_rate": 1.828709128894411e-05, + "loss": 0.2152, + "step": 17600 + }, + { + "epoch": 0.9660812294182217, + "grad_norm": 1.0314056873321533, + "learning_rate": 1.8282047739440055e-05, + "loss": 0.1701, + "step": 17602 + }, + { + "epoch": 0.9661909989023052, + "grad_norm": 1.1666560173034668, + "learning_rate": 1.827700448461836e-05, + "loss": 0.1986, + "step": 17604 + }, + { + "epoch": 0.9663007683863886, + "grad_norm": 1.2009403705596924, + "learning_rate": 1.827196152470024e-05, + "loss": 0.166, + "step": 17606 + }, + { + "epoch": 0.966410537870472, + "grad_norm": 0.9706559777259827, + "learning_rate": 1.8266918859906922e-05, + "loss": 0.2546, + "step": 17608 + }, + { + "epoch": 0.9665203073545554, + "grad_norm": 2.038780689239502, + "learning_rate": 1.826187649045959e-05, + "loss": 0.3406, + "step": 17610 + }, + { + "epoch": 0.9666300768386389, + "grad_norm": 1.5871349573135376, + "learning_rate": 1.825683441657942e-05, + "loss": 0.196, + "step": 17612 + }, + { + "epoch": 0.9667398463227223, + "grad_norm": 3.3776354789733887, + "learning_rate": 1.8251792638487596e-05, + "loss": 0.2339, + "step": 17614 + }, + { + "epoch": 0.9668496158068057, + "grad_norm": 1.1893393993377686, + "learning_rate": 1.824675115640527e-05, + "loss": 0.1774, + "step": 17616 + }, + { + "epoch": 0.9669593852908891, + "grad_norm": 1.5838840007781982, + "learning_rate": 1.8241709970553576e-05, + "loss": 0.2899, + "step": 17618 + }, + { + "epoch": 0.9670691547749726, + "grad_norm": 1.1189888715744019, + "learning_rate": 1.8236669081153657e-05, + "loss": 0.2058, + "step": 17620 + }, + { + "epoch": 0.9671789242590559, + "grad_norm": 1.38640296459198, + "learning_rate": 1.8231628488426634e-05, + "loss": 0.3587, + "step": 17622 + }, + { + "epoch": 0.9672886937431394, + "grad_norm": 0.8569034934043884, + "learning_rate": 1.8226588192593605e-05, + "loss": 0.246, + "step": 17624 + }, + { + "epoch": 0.9673984632272228, + "grad_norm": 1.72981858253479, + "learning_rate": 1.822154819387566e-05, + "loss": 0.3397, + "step": 17626 + }, + { + "epoch": 0.9675082327113063, + "grad_norm": 1.7360622882843018, + "learning_rate": 1.8216508492493886e-05, + "loss": 0.1803, + "step": 17628 + }, + { + "epoch": 0.9676180021953897, + "grad_norm": 1.6169626712799072, + "learning_rate": 1.821146908866935e-05, + "loss": 0.2298, + "step": 17630 + }, + { + "epoch": 0.9677277716794731, + "grad_norm": 1.4331440925598145, + "learning_rate": 1.8206429982623086e-05, + "loss": 0.1573, + "step": 17632 + }, + { + "epoch": 0.9678375411635566, + "grad_norm": 1.0830990076065063, + "learning_rate": 1.820139117457616e-05, + "loss": 0.2403, + "step": 17634 + }, + { + "epoch": 0.96794731064764, + "grad_norm": 1.14090895652771, + "learning_rate": 1.8196352664749576e-05, + "loss": 0.1325, + "step": 17636 + }, + { + "epoch": 0.9680570801317234, + "grad_norm": 1.3461928367614746, + "learning_rate": 1.8191314453364368e-05, + "loss": 0.3265, + "step": 17638 + }, + { + "epoch": 0.9681668496158068, + "grad_norm": 1.5526137351989746, + "learning_rate": 1.8186276540641527e-05, + "loss": 0.2494, + "step": 17640 + }, + { + "epoch": 0.9682766190998903, + "grad_norm": 1.011305332183838, + "learning_rate": 1.8181238926802033e-05, + "loss": 0.1967, + "step": 17642 + }, + { + "epoch": 0.9683863885839736, + "grad_norm": 1.5231257677078247, + "learning_rate": 1.817620161206687e-05, + "loss": 0.24, + "step": 17644 + }, + { + "epoch": 0.9684961580680571, + "grad_norm": 1.5867639780044556, + "learning_rate": 1.8171164596657006e-05, + "loss": 0.2881, + "step": 17646 + }, + { + "epoch": 0.9686059275521405, + "grad_norm": 0.9769860506057739, + "learning_rate": 1.8166127880793372e-05, + "loss": 0.1956, + "step": 17648 + }, + { + "epoch": 0.968715697036224, + "grad_norm": 0.9939494132995605, + "learning_rate": 1.8161091464696915e-05, + "loss": 0.2538, + "step": 17650 + }, + { + "epoch": 0.9688254665203073, + "grad_norm": 1.7668935060501099, + "learning_rate": 1.8156055348588546e-05, + "loss": 0.2502, + "step": 17652 + }, + { + "epoch": 0.9689352360043908, + "grad_norm": 1.0880820751190186, + "learning_rate": 1.815101953268919e-05, + "loss": 0.1728, + "step": 17654 + }, + { + "epoch": 0.9690450054884742, + "grad_norm": 1.0644919872283936, + "learning_rate": 1.814598401721973e-05, + "loss": 0.2632, + "step": 17656 + }, + { + "epoch": 0.9691547749725576, + "grad_norm": 1.3743617534637451, + "learning_rate": 1.8140948802401056e-05, + "loss": 0.1963, + "step": 17658 + }, + { + "epoch": 0.969264544456641, + "grad_norm": 1.0224454402923584, + "learning_rate": 1.8135913888454033e-05, + "loss": 0.1871, + "step": 17660 + }, + { + "epoch": 0.9693743139407245, + "grad_norm": 1.3657433986663818, + "learning_rate": 1.8130879275599515e-05, + "loss": 0.2793, + "step": 17662 + }, + { + "epoch": 0.9694840834248079, + "grad_norm": 1.1734967231750488, + "learning_rate": 1.8125844964058354e-05, + "loss": 0.2029, + "step": 17664 + }, + { + "epoch": 0.9695938529088913, + "grad_norm": 1.4423720836639404, + "learning_rate": 1.812081095405137e-05, + "loss": 0.1919, + "step": 17666 + }, + { + "epoch": 0.9697036223929748, + "grad_norm": 1.4008228778839111, + "learning_rate": 1.811577724579938e-05, + "loss": 0.184, + "step": 17668 + }, + { + "epoch": 0.9698133918770582, + "grad_norm": 1.4195749759674072, + "learning_rate": 1.811074383952319e-05, + "loss": 0.2357, + "step": 17670 + }, + { + "epoch": 0.9699231613611417, + "grad_norm": 1.5149625539779663, + "learning_rate": 1.8105710735443593e-05, + "loss": 0.3106, + "step": 17672 + }, + { + "epoch": 0.970032930845225, + "grad_norm": 1.4491889476776123, + "learning_rate": 1.8100677933781364e-05, + "loss": 0.1843, + "step": 17674 + }, + { + "epoch": 0.9701427003293085, + "grad_norm": 1.421656608581543, + "learning_rate": 1.809564543475726e-05, + "loss": 0.2235, + "step": 17676 + }, + { + "epoch": 0.9702524698133919, + "grad_norm": 1.0810296535491943, + "learning_rate": 1.8090613238592035e-05, + "loss": 0.2189, + "step": 17678 + }, + { + "epoch": 0.9703622392974753, + "grad_norm": 1.228881597518921, + "learning_rate": 1.808558134550643e-05, + "loss": 0.2804, + "step": 17680 + }, + { + "epoch": 0.9704720087815587, + "grad_norm": 1.5460718870162964, + "learning_rate": 1.8080549755721154e-05, + "loss": 0.2648, + "step": 17682 + }, + { + "epoch": 0.9705817782656422, + "grad_norm": 1.5727424621582031, + "learning_rate": 1.807551846945694e-05, + "loss": 0.223, + "step": 17684 + }, + { + "epoch": 0.9706915477497255, + "grad_norm": 1.6827421188354492, + "learning_rate": 1.807048748693447e-05, + "loss": 0.2705, + "step": 17686 + }, + { + "epoch": 0.970801317233809, + "grad_norm": 1.4611948728561401, + "learning_rate": 1.8065456808374433e-05, + "loss": 0.2664, + "step": 17688 + }, + { + "epoch": 0.9709110867178924, + "grad_norm": 1.2755060195922852, + "learning_rate": 1.8060426433997502e-05, + "loss": 0.2361, + "step": 17690 + }, + { + "epoch": 0.9710208562019759, + "grad_norm": 1.8088902235031128, + "learning_rate": 1.8055396364024317e-05, + "loss": 0.1443, + "step": 17692 + }, + { + "epoch": 0.9711306256860592, + "grad_norm": 1.3045400381088257, + "learning_rate": 1.8050366598675545e-05, + "loss": 0.2148, + "step": 17694 + }, + { + "epoch": 0.9712403951701427, + "grad_norm": 1.3831672668457031, + "learning_rate": 1.80453371381718e-05, + "loss": 0.2934, + "step": 17696 + }, + { + "epoch": 0.9713501646542261, + "grad_norm": 1.643011450767517, + "learning_rate": 1.80403079827337e-05, + "loss": 0.2795, + "step": 17698 + }, + { + "epoch": 0.9714599341383096, + "grad_norm": 1.201932430267334, + "learning_rate": 1.803527913258186e-05, + "loss": 0.2369, + "step": 17700 + }, + { + "epoch": 0.9715697036223929, + "grad_norm": 1.293087124824524, + "learning_rate": 1.803025058793686e-05, + "loss": 0.2397, + "step": 17702 + }, + { + "epoch": 0.9716794731064764, + "grad_norm": 0.9733791947364807, + "learning_rate": 1.802522234901927e-05, + "loss": 0.3053, + "step": 17704 + }, + { + "epoch": 0.9717892425905599, + "grad_norm": 0.9108561277389526, + "learning_rate": 1.8020194416049672e-05, + "loss": 0.1797, + "step": 17706 + }, + { + "epoch": 0.9718990120746432, + "grad_norm": 4.04627799987793, + "learning_rate": 1.8015166789248604e-05, + "loss": 0.167, + "step": 17708 + }, + { + "epoch": 0.9720087815587267, + "grad_norm": 1.4777600765228271, + "learning_rate": 1.80101394688366e-05, + "loss": 0.2247, + "step": 17710 + }, + { + "epoch": 0.9721185510428101, + "grad_norm": 2.0548758506774902, + "learning_rate": 1.800511245503418e-05, + "loss": 0.255, + "step": 17712 + }, + { + "epoch": 0.9722283205268936, + "grad_norm": 1.5706757307052612, + "learning_rate": 1.8000085748061873e-05, + "loss": 0.3768, + "step": 17714 + }, + { + "epoch": 0.9723380900109769, + "grad_norm": 1.1138938665390015, + "learning_rate": 1.7995059348140165e-05, + "loss": 0.1544, + "step": 17716 + }, + { + "epoch": 0.9724478594950604, + "grad_norm": 1.5197455883026123, + "learning_rate": 1.7990033255489524e-05, + "loss": 0.2058, + "step": 17718 + }, + { + "epoch": 0.9725576289791438, + "grad_norm": 1.270714282989502, + "learning_rate": 1.7985007470330444e-05, + "loss": 0.253, + "step": 17720 + }, + { + "epoch": 0.9726673984632273, + "grad_norm": 1.1195677518844604, + "learning_rate": 1.797998199288336e-05, + "loss": 0.1944, + "step": 17722 + }, + { + "epoch": 0.9727771679473106, + "grad_norm": 0.8848618268966675, + "learning_rate": 1.7974956823368727e-05, + "loss": 0.1485, + "step": 17724 + }, + { + "epoch": 0.9728869374313941, + "grad_norm": 1.0586528778076172, + "learning_rate": 1.7969931962006967e-05, + "loss": 0.2424, + "step": 17726 + }, + { + "epoch": 0.9729967069154775, + "grad_norm": 2.6864123344421387, + "learning_rate": 1.7964907409018496e-05, + "loss": 0.2685, + "step": 17728 + }, + { + "epoch": 0.9731064763995609, + "grad_norm": 1.2250174283981323, + "learning_rate": 1.7959883164623714e-05, + "loss": 0.203, + "step": 17730 + }, + { + "epoch": 0.9732162458836443, + "grad_norm": 1.6645318269729614, + "learning_rate": 1.7954859229043016e-05, + "loss": 0.2081, + "step": 17732 + }, + { + "epoch": 0.9733260153677278, + "grad_norm": 0.8219844102859497, + "learning_rate": 1.7949835602496766e-05, + "loss": 0.1424, + "step": 17734 + }, + { + "epoch": 0.9734357848518111, + "grad_norm": 1.1983381509780884, + "learning_rate": 1.7944812285205335e-05, + "loss": 0.2252, + "step": 17736 + }, + { + "epoch": 0.9735455543358946, + "grad_norm": 1.1947081089019775, + "learning_rate": 1.7939789277389067e-05, + "loss": 0.1862, + "step": 17738 + }, + { + "epoch": 0.973655323819978, + "grad_norm": 1.0352590084075928, + "learning_rate": 1.7934766579268292e-05, + "loss": 0.3348, + "step": 17740 + }, + { + "epoch": 0.9737650933040615, + "grad_norm": 3.1638131141662598, + "learning_rate": 1.7929744191063327e-05, + "loss": 0.24, + "step": 17742 + }, + { + "epoch": 0.973874862788145, + "grad_norm": 1.217915415763855, + "learning_rate": 1.7924722112994495e-05, + "loss": 0.2062, + "step": 17744 + }, + { + "epoch": 0.9739846322722283, + "grad_norm": 0.947403609752655, + "learning_rate": 1.7919700345282075e-05, + "loss": 0.1477, + "step": 17746 + }, + { + "epoch": 0.9740944017563118, + "grad_norm": 2.184014320373535, + "learning_rate": 1.7914678888146347e-05, + "loss": 0.2249, + "step": 17748 + }, + { + "epoch": 0.9742041712403952, + "grad_norm": 1.2914963960647583, + "learning_rate": 1.7909657741807587e-05, + "loss": 0.1366, + "step": 17750 + }, + { + "epoch": 0.9743139407244786, + "grad_norm": 1.3432210683822632, + "learning_rate": 1.7904636906486037e-05, + "loss": 0.2053, + "step": 17752 + }, + { + "epoch": 0.974423710208562, + "grad_norm": 1.1822361946105957, + "learning_rate": 1.7899616382401936e-05, + "loss": 0.2043, + "step": 17754 + }, + { + "epoch": 0.9745334796926455, + "grad_norm": 1.3526378870010376, + "learning_rate": 1.7894596169775512e-05, + "loss": 0.2221, + "step": 17756 + }, + { + "epoch": 0.9746432491767288, + "grad_norm": 1.4619284868240356, + "learning_rate": 1.788957626882698e-05, + "loss": 0.1751, + "step": 17758 + }, + { + "epoch": 0.9747530186608123, + "grad_norm": 1.59398353099823, + "learning_rate": 1.7884556679776525e-05, + "loss": 0.2355, + "step": 17760 + }, + { + "epoch": 0.9748627881448957, + "grad_norm": 1.1365967988967896, + "learning_rate": 1.7879537402844345e-05, + "loss": 0.1263, + "step": 17762 + }, + { + "epoch": 0.9749725576289792, + "grad_norm": 1.2762770652770996, + "learning_rate": 1.7874518438250597e-05, + "loss": 0.2331, + "step": 17764 + }, + { + "epoch": 0.9750823271130625, + "grad_norm": 1.297323226928711, + "learning_rate": 1.7869499786215456e-05, + "loss": 0.2101, + "step": 17766 + }, + { + "epoch": 0.975192096597146, + "grad_norm": 1.4711991548538208, + "learning_rate": 1.7864481446959045e-05, + "loss": 0.2739, + "step": 17768 + }, + { + "epoch": 0.9753018660812294, + "grad_norm": 1.0400675535202026, + "learning_rate": 1.7859463420701498e-05, + "loss": 0.1696, + "step": 17770 + }, + { + "epoch": 0.9754116355653129, + "grad_norm": 1.4289857149124146, + "learning_rate": 1.7854445707662928e-05, + "loss": 0.2738, + "step": 17772 + }, + { + "epoch": 0.9755214050493962, + "grad_norm": 0.9239574074745178, + "learning_rate": 1.7849428308063452e-05, + "loss": 0.187, + "step": 17774 + }, + { + "epoch": 0.9756311745334797, + "grad_norm": 1.4075974225997925, + "learning_rate": 1.7844411222123147e-05, + "loss": 0.1863, + "step": 17776 + }, + { + "epoch": 0.9757409440175632, + "grad_norm": 1.0988898277282715, + "learning_rate": 1.7839394450062087e-05, + "loss": 0.1993, + "step": 17778 + }, + { + "epoch": 0.9758507135016465, + "grad_norm": 1.4804881811141968, + "learning_rate": 1.7834377992100333e-05, + "loss": 0.2055, + "step": 17780 + }, + { + "epoch": 0.97596048298573, + "grad_norm": 1.417008638381958, + "learning_rate": 1.782936184845793e-05, + "loss": 0.2538, + "step": 17782 + }, + { + "epoch": 0.9760702524698134, + "grad_norm": 1.625551462173462, + "learning_rate": 1.782434601935491e-05, + "loss": 0.2071, + "step": 17784 + }, + { + "epoch": 0.9761800219538969, + "grad_norm": 1.2373831272125244, + "learning_rate": 1.7819330505011302e-05, + "loss": 0.2737, + "step": 17786 + }, + { + "epoch": 0.9762897914379802, + "grad_norm": 1.3066701889038086, + "learning_rate": 1.7814315305647093e-05, + "loss": 0.2494, + "step": 17788 + }, + { + "epoch": 0.9763995609220637, + "grad_norm": 1.2530391216278076, + "learning_rate": 1.780930042148229e-05, + "loss": 0.235, + "step": 17790 + }, + { + "epoch": 0.9765093304061471, + "grad_norm": 1.3338117599487305, + "learning_rate": 1.7804285852736864e-05, + "loss": 0.1457, + "step": 17792 + }, + { + "epoch": 0.9766190998902305, + "grad_norm": 1.3539295196533203, + "learning_rate": 1.779927159963078e-05, + "loss": 0.1815, + "step": 17794 + }, + { + "epoch": 0.9767288693743139, + "grad_norm": 1.4367974996566772, + "learning_rate": 1.779425766238398e-05, + "loss": 0.2421, + "step": 17796 + }, + { + "epoch": 0.9768386388583974, + "grad_norm": 1.6648601293563843, + "learning_rate": 1.7789244041216413e-05, + "loss": 0.1815, + "step": 17798 + }, + { + "epoch": 0.9769484083424808, + "grad_norm": 1.1081829071044922, + "learning_rate": 1.778423073634799e-05, + "loss": 0.2413, + "step": 17800 + }, + { + "epoch": 0.9770581778265642, + "grad_norm": 1.1572383642196655, + "learning_rate": 1.777921774799862e-05, + "loss": 0.276, + "step": 17802 + }, + { + "epoch": 0.9771679473106476, + "grad_norm": 1.1060580015182495, + "learning_rate": 1.7774205076388206e-05, + "loss": 0.1448, + "step": 17804 + }, + { + "epoch": 0.9772777167947311, + "grad_norm": 1.0557307004928589, + "learning_rate": 1.776919272173663e-05, + "loss": 0.1697, + "step": 17806 + }, + { + "epoch": 0.9773874862788144, + "grad_norm": 1.3416659832000732, + "learning_rate": 1.7764180684263752e-05, + "loss": 0.187, + "step": 17808 + }, + { + "epoch": 0.9774972557628979, + "grad_norm": 1.5040333271026611, + "learning_rate": 1.7759168964189415e-05, + "loss": 0.1848, + "step": 17810 + }, + { + "epoch": 0.9776070252469813, + "grad_norm": 1.1539427042007446, + "learning_rate": 1.7754157561733476e-05, + "loss": 0.1744, + "step": 17812 + }, + { + "epoch": 0.9777167947310648, + "grad_norm": 1.16114342212677, + "learning_rate": 1.7749146477115748e-05, + "loss": 0.2205, + "step": 17814 + }, + { + "epoch": 0.9778265642151482, + "grad_norm": 0.9588121175765991, + "learning_rate": 1.7744135710556045e-05, + "loss": 0.2765, + "step": 17816 + }, + { + "epoch": 0.9779363336992316, + "grad_norm": 0.9892104864120483, + "learning_rate": 1.7739125262274163e-05, + "loss": 0.1066, + "step": 17818 + }, + { + "epoch": 0.9780461031833151, + "grad_norm": 1.9321194887161255, + "learning_rate": 1.7734115132489886e-05, + "loss": 0.1891, + "step": 17820 + }, + { + "epoch": 0.9781558726673985, + "grad_norm": 1.5514240264892578, + "learning_rate": 1.7729105321422986e-05, + "loss": 0.194, + "step": 17822 + }, + { + "epoch": 0.9782656421514819, + "grad_norm": 1.5176479816436768, + "learning_rate": 1.772409582929321e-05, + "loss": 0.2169, + "step": 17824 + }, + { + "epoch": 0.9783754116355653, + "grad_norm": 1.7588233947753906, + "learning_rate": 1.77190866563203e-05, + "loss": 0.2512, + "step": 17826 + }, + { + "epoch": 0.9784851811196488, + "grad_norm": 1.636141300201416, + "learning_rate": 1.7714077802723994e-05, + "loss": 0.25, + "step": 17828 + }, + { + "epoch": 0.9785949506037321, + "grad_norm": 1.7294931411743164, + "learning_rate": 1.7709069268723992e-05, + "loss": 0.2891, + "step": 17830 + }, + { + "epoch": 0.9787047200878156, + "grad_norm": 0.8669235706329346, + "learning_rate": 1.7704061054539993e-05, + "loss": 0.2013, + "step": 17832 + }, + { + "epoch": 0.978814489571899, + "grad_norm": 1.677833080291748, + "learning_rate": 1.769905316039169e-05, + "loss": 0.2689, + "step": 17834 + }, + { + "epoch": 0.9789242590559825, + "grad_norm": 1.6744861602783203, + "learning_rate": 1.7694045586498752e-05, + "loss": 0.3208, + "step": 17836 + }, + { + "epoch": 0.9790340285400658, + "grad_norm": 1.1686694622039795, + "learning_rate": 1.7689038333080836e-05, + "loss": 0.1528, + "step": 17838 + }, + { + "epoch": 0.9791437980241493, + "grad_norm": 1.8506958484649658, + "learning_rate": 1.768403140035758e-05, + "loss": 0.185, + "step": 17840 + }, + { + "epoch": 0.9792535675082327, + "grad_norm": 1.433027744293213, + "learning_rate": 1.767902478854862e-05, + "loss": 0.1744, + "step": 17842 + }, + { + "epoch": 0.9793633369923161, + "grad_norm": 1.4126718044281006, + "learning_rate": 1.767401849787357e-05, + "loss": 0.3832, + "step": 17844 + }, + { + "epoch": 0.9794731064763995, + "grad_norm": 1.8567674160003662, + "learning_rate": 1.7669012528552014e-05, + "loss": 0.2863, + "step": 17846 + }, + { + "epoch": 0.979582875960483, + "grad_norm": 1.2527072429656982, + "learning_rate": 1.7664006880803563e-05, + "loss": 0.1941, + "step": 17848 + }, + { + "epoch": 0.9796926454445664, + "grad_norm": 1.2935810089111328, + "learning_rate": 1.765900155484777e-05, + "loss": 0.2756, + "step": 17850 + }, + { + "epoch": 0.9798024149286498, + "grad_norm": 2.8116588592529297, + "learning_rate": 1.7653996550904205e-05, + "loss": 0.2814, + "step": 17852 + }, + { + "epoch": 0.9799121844127333, + "grad_norm": 1.2224386930465698, + "learning_rate": 1.7648991869192405e-05, + "loss": 0.2079, + "step": 17854 + }, + { + "epoch": 0.9800219538968167, + "grad_norm": 4.292880058288574, + "learning_rate": 1.7643987509931903e-05, + "loss": 0.2756, + "step": 17856 + }, + { + "epoch": 0.9801317233809002, + "grad_norm": 1.246803641319275, + "learning_rate": 1.7638983473342213e-05, + "loss": 0.2103, + "step": 17858 + }, + { + "epoch": 0.9802414928649835, + "grad_norm": 1.1164551973342896, + "learning_rate": 1.7633979759642844e-05, + "loss": 0.2014, + "step": 17860 + }, + { + "epoch": 0.980351262349067, + "grad_norm": 0.9308801293373108, + "learning_rate": 1.7628976369053263e-05, + "loss": 0.1406, + "step": 17862 + }, + { + "epoch": 0.9804610318331504, + "grad_norm": 1.3170307874679565, + "learning_rate": 1.7623973301792966e-05, + "loss": 0.3254, + "step": 17864 + }, + { + "epoch": 0.9805708013172338, + "grad_norm": 1.8211073875427246, + "learning_rate": 1.7618970558081406e-05, + "loss": 0.2779, + "step": 17866 + }, + { + "epoch": 0.9806805708013172, + "grad_norm": 2.942034959793091, + "learning_rate": 1.7613968138138026e-05, + "loss": 0.1877, + "step": 17868 + }, + { + "epoch": 0.9807903402854007, + "grad_norm": 0.8291052579879761, + "learning_rate": 1.760896604218225e-05, + "loss": 0.1408, + "step": 17870 + }, + { + "epoch": 0.980900109769484, + "grad_norm": 3.4438252449035645, + "learning_rate": 1.760396427043351e-05, + "loss": 0.1949, + "step": 17872 + }, + { + "epoch": 0.9810098792535675, + "grad_norm": 1.6515021324157715, + "learning_rate": 1.7598962823111193e-05, + "loss": 0.3666, + "step": 17874 + }, + { + "epoch": 0.9811196487376509, + "grad_norm": 0.976202666759491, + "learning_rate": 1.759396170043469e-05, + "loss": 0.1634, + "step": 17876 + }, + { + "epoch": 0.9812294182217344, + "grad_norm": 2.580820322036743, + "learning_rate": 1.7588960902623385e-05, + "loss": 0.2148, + "step": 17878 + }, + { + "epoch": 0.9813391877058177, + "grad_norm": 1.142228364944458, + "learning_rate": 1.758396042989663e-05, + "loss": 0.2094, + "step": 17880 + }, + { + "epoch": 0.9814489571899012, + "grad_norm": 1.2749446630477905, + "learning_rate": 1.7578960282473768e-05, + "loss": 0.2596, + "step": 17882 + }, + { + "epoch": 0.9815587266739846, + "grad_norm": 1.8810328245162964, + "learning_rate": 1.7573960460574133e-05, + "loss": 0.2635, + "step": 17884 + }, + { + "epoch": 0.9816684961580681, + "grad_norm": 1.3126304149627686, + "learning_rate": 1.756896096441704e-05, + "loss": 0.1908, + "step": 17886 + }, + { + "epoch": 0.9817782656421514, + "grad_norm": 1.1645939350128174, + "learning_rate": 1.7563961794221795e-05, + "loss": 0.1511, + "step": 17888 + }, + { + "epoch": 0.9818880351262349, + "grad_norm": 1.1878583431243896, + "learning_rate": 1.7558962950207684e-05, + "loss": 0.1351, + "step": 17890 + }, + { + "epoch": 0.9819978046103184, + "grad_norm": 2.221527576446533, + "learning_rate": 1.7553964432593976e-05, + "loss": 0.2631, + "step": 17892 + }, + { + "epoch": 0.9821075740944017, + "grad_norm": 1.1972041130065918, + "learning_rate": 1.7548966241599934e-05, + "loss": 0.2471, + "step": 17894 + }, + { + "epoch": 0.9822173435784852, + "grad_norm": 1.68618905544281, + "learning_rate": 1.7543968377444806e-05, + "loss": 0.235, + "step": 17896 + }, + { + "epoch": 0.9823271130625686, + "grad_norm": 1.0457725524902344, + "learning_rate": 1.7538970840347825e-05, + "loss": 0.2624, + "step": 17898 + }, + { + "epoch": 0.9824368825466521, + "grad_norm": 0.8956717252731323, + "learning_rate": 1.75339736305282e-05, + "loss": 0.173, + "step": 17900 + }, + { + "epoch": 0.9825466520307354, + "grad_norm": 1.3681070804595947, + "learning_rate": 1.7528976748205146e-05, + "loss": 0.2731, + "step": 17902 + }, + { + "epoch": 0.9826564215148189, + "grad_norm": 1.1624630689620972, + "learning_rate": 1.7523980193597836e-05, + "loss": 0.2733, + "step": 17904 + }, + { + "epoch": 0.9827661909989023, + "grad_norm": 1.540663480758667, + "learning_rate": 1.7518983966925446e-05, + "loss": 0.2315, + "step": 17906 + }, + { + "epoch": 0.9828759604829858, + "grad_norm": 2.184438943862915, + "learning_rate": 1.7513988068407146e-05, + "loss": 0.3108, + "step": 17908 + }, + { + "epoch": 0.9829857299670691, + "grad_norm": 1.1555746793746948, + "learning_rate": 1.7508992498262065e-05, + "loss": 0.1898, + "step": 17910 + }, + { + "epoch": 0.9830954994511526, + "grad_norm": 1.3296395540237427, + "learning_rate": 1.7503997256709342e-05, + "loss": 0.2609, + "step": 17912 + }, + { + "epoch": 0.983205268935236, + "grad_norm": 0.7741798758506775, + "learning_rate": 1.7499002343968098e-05, + "loss": 0.2305, + "step": 17914 + }, + { + "epoch": 0.9833150384193194, + "grad_norm": 1.1683411598205566, + "learning_rate": 1.749400776025743e-05, + "loss": 0.1262, + "step": 17916 + }, + { + "epoch": 0.9834248079034028, + "grad_norm": 0.7913392186164856, + "learning_rate": 1.748901350579641e-05, + "loss": 0.2287, + "step": 17918 + }, + { + "epoch": 0.9835345773874863, + "grad_norm": 1.4132728576660156, + "learning_rate": 1.7484019580804135e-05, + "loss": 0.2698, + "step": 17920 + }, + { + "epoch": 0.9836443468715697, + "grad_norm": 1.7551661729812622, + "learning_rate": 1.747902598549965e-05, + "loss": 0.3014, + "step": 17922 + }, + { + "epoch": 0.9837541163556531, + "grad_norm": 2.139439582824707, + "learning_rate": 1.747403272010199e-05, + "loss": 0.2304, + "step": 17924 + }, + { + "epoch": 0.9838638858397366, + "grad_norm": 1.247286319732666, + "learning_rate": 1.74690397848302e-05, + "loss": 0.1587, + "step": 17926 + }, + { + "epoch": 0.98397365532382, + "grad_norm": 0.9531540274620056, + "learning_rate": 1.7464047179903296e-05, + "loss": 0.1887, + "step": 17928 + }, + { + "epoch": 0.9840834248079035, + "grad_norm": 1.0418155193328857, + "learning_rate": 1.745905490554027e-05, + "loss": 0.1909, + "step": 17930 + }, + { + "epoch": 0.9841931942919868, + "grad_norm": 1.0026994943618774, + "learning_rate": 1.74540629619601e-05, + "loss": 0.1366, + "step": 17932 + }, + { + "epoch": 0.9843029637760703, + "grad_norm": 1.1348761320114136, + "learning_rate": 1.7449071349381778e-05, + "loss": 0.1627, + "step": 17934 + }, + { + "epoch": 0.9844127332601537, + "grad_norm": 1.221258521080017, + "learning_rate": 1.7444080068024243e-05, + "loss": 0.1677, + "step": 17936 + }, + { + "epoch": 0.9845225027442371, + "grad_norm": 1.3050121068954468, + "learning_rate": 1.7439089118106443e-05, + "loss": 0.2591, + "step": 17938 + }, + { + "epoch": 0.9846322722283205, + "grad_norm": 1.1533441543579102, + "learning_rate": 1.7434098499847306e-05, + "loss": 0.2894, + "step": 17940 + }, + { + "epoch": 0.984742041712404, + "grad_norm": 1.3066407442092896, + "learning_rate": 1.7429108213465737e-05, + "loss": 0.2696, + "step": 17942 + }, + { + "epoch": 0.9848518111964873, + "grad_norm": 1.161505937576294, + "learning_rate": 1.7424118259180654e-05, + "loss": 0.1915, + "step": 17944 + }, + { + "epoch": 0.9849615806805708, + "grad_norm": 1.443350911140442, + "learning_rate": 1.7419128637210927e-05, + "loss": 0.2376, + "step": 17946 + }, + { + "epoch": 0.9850713501646542, + "grad_norm": 1.177079200744629, + "learning_rate": 1.741413934777542e-05, + "loss": 0.2299, + "step": 17948 + }, + { + "epoch": 0.9851811196487377, + "grad_norm": 1.303676962852478, + "learning_rate": 1.7409150391093e-05, + "loss": 0.1885, + "step": 17950 + }, + { + "epoch": 0.985290889132821, + "grad_norm": 1.0902029275894165, + "learning_rate": 1.7404161767382504e-05, + "loss": 0.222, + "step": 17952 + }, + { + "epoch": 0.9854006586169045, + "grad_norm": 1.4456719160079956, + "learning_rate": 1.739917347686274e-05, + "loss": 0.1971, + "step": 17954 + }, + { + "epoch": 0.9855104281009879, + "grad_norm": 3.718116044998169, + "learning_rate": 1.7394185519752545e-05, + "loss": 0.3067, + "step": 17956 + }, + { + "epoch": 0.9856201975850714, + "grad_norm": 1.8419708013534546, + "learning_rate": 1.738919789627071e-05, + "loss": 0.2433, + "step": 17958 + }, + { + "epoch": 0.9857299670691547, + "grad_norm": 1.475861668586731, + "learning_rate": 1.7384210606636007e-05, + "loss": 0.2032, + "step": 17960 + }, + { + "epoch": 0.9858397365532382, + "grad_norm": 1.3505430221557617, + "learning_rate": 1.7379223651067207e-05, + "loss": 0.17, + "step": 17962 + }, + { + "epoch": 0.9859495060373217, + "grad_norm": 1.3349432945251465, + "learning_rate": 1.7374237029783062e-05, + "loss": 0.2457, + "step": 17964 + }, + { + "epoch": 0.986059275521405, + "grad_norm": 1.3812081813812256, + "learning_rate": 1.7369250743002315e-05, + "loss": 0.1993, + "step": 17966 + }, + { + "epoch": 0.9861690450054885, + "grad_norm": 1.6870310306549072, + "learning_rate": 1.736426479094368e-05, + "loss": 0.2316, + "step": 17968 + }, + { + "epoch": 0.9862788144895719, + "grad_norm": 1.3542166948318481, + "learning_rate": 1.735927917382587e-05, + "loss": 0.222, + "step": 17970 + }, + { + "epoch": 0.9863885839736554, + "grad_norm": 2.0659019947052, + "learning_rate": 1.7354293891867582e-05, + "loss": 0.2094, + "step": 17972 + }, + { + "epoch": 0.9864983534577387, + "grad_norm": 1.2110552787780762, + "learning_rate": 1.7349308945287484e-05, + "loss": 0.2134, + "step": 17974 + }, + { + "epoch": 0.9866081229418222, + "grad_norm": 1.6463967561721802, + "learning_rate": 1.7344324334304255e-05, + "loss": 0.2954, + "step": 17976 + }, + { + "epoch": 0.9867178924259056, + "grad_norm": 1.6465606689453125, + "learning_rate": 1.733934005913653e-05, + "loss": 0.1944, + "step": 17978 + }, + { + "epoch": 0.986827661909989, + "grad_norm": 1.1350984573364258, + "learning_rate": 1.7334356120002957e-05, + "loss": 0.251, + "step": 17980 + }, + { + "epoch": 0.9869374313940724, + "grad_norm": 1.1191821098327637, + "learning_rate": 1.732937251712215e-05, + "loss": 0.2241, + "step": 17982 + }, + { + "epoch": 0.9870472008781559, + "grad_norm": 1.3339651823043823, + "learning_rate": 1.7324389250712702e-05, + "loss": 0.1857, + "step": 17984 + }, + { + "epoch": 0.9871569703622393, + "grad_norm": 1.6956675052642822, + "learning_rate": 1.7319406320993227e-05, + "loss": 0.3068, + "step": 17986 + }, + { + "epoch": 0.9872667398463227, + "grad_norm": 1.1552062034606934, + "learning_rate": 1.7314423728182283e-05, + "loss": 0.1246, + "step": 17988 + }, + { + "epoch": 0.9873765093304061, + "grad_norm": 1.1999531984329224, + "learning_rate": 1.7309441472498444e-05, + "loss": 0.282, + "step": 17990 + }, + { + "epoch": 0.9874862788144896, + "grad_norm": 1.259891390800476, + "learning_rate": 1.7304459554160245e-05, + "loss": 0.2014, + "step": 17992 + }, + { + "epoch": 0.987596048298573, + "grad_norm": 1.1158621311187744, + "learning_rate": 1.7299477973386224e-05, + "loss": 0.1878, + "step": 17994 + }, + { + "epoch": 0.9877058177826564, + "grad_norm": 1.06308114528656, + "learning_rate": 1.7294496730394895e-05, + "loss": 0.2407, + "step": 17996 + }, + { + "epoch": 0.9878155872667398, + "grad_norm": 4.465920925140381, + "learning_rate": 1.728951582540476e-05, + "loss": 0.3, + "step": 17998 + }, + { + "epoch": 0.9879253567508233, + "grad_norm": 1.2533795833587646, + "learning_rate": 1.7284535258634307e-05, + "loss": 0.298, + "step": 18000 + }, + { + "epoch": 0.9880351262349067, + "grad_norm": 0.9737697243690491, + "learning_rate": 1.7279555030302007e-05, + "loss": 0.1994, + "step": 18002 + }, + { + "epoch": 0.9881448957189901, + "grad_norm": 1.7299853563308716, + "learning_rate": 1.7274575140626318e-05, + "loss": 0.1603, + "step": 18004 + }, + { + "epoch": 0.9882546652030736, + "grad_norm": 2.8080060482025146, + "learning_rate": 1.726959558982568e-05, + "loss": 0.1926, + "step": 18006 + }, + { + "epoch": 0.988364434687157, + "grad_norm": 1.0681463479995728, + "learning_rate": 1.7264616378118528e-05, + "loss": 0.2462, + "step": 18008 + }, + { + "epoch": 0.9884742041712404, + "grad_norm": 1.2632598876953125, + "learning_rate": 1.7259637505723265e-05, + "loss": 0.2746, + "step": 18010 + }, + { + "epoch": 0.9885839736553238, + "grad_norm": 1.3722102642059326, + "learning_rate": 1.7254658972858293e-05, + "loss": 0.1655, + "step": 18012 + }, + { + "epoch": 0.9886937431394073, + "grad_norm": 0.9546123147010803, + "learning_rate": 1.7249680779741987e-05, + "loss": 0.1919, + "step": 18014 + }, + { + "epoch": 0.9888035126234906, + "grad_norm": 1.5798999071121216, + "learning_rate": 1.7244702926592733e-05, + "loss": 0.2723, + "step": 18016 + }, + { + "epoch": 0.9889132821075741, + "grad_norm": 1.7226207256317139, + "learning_rate": 1.7239725413628867e-05, + "loss": 0.2456, + "step": 18018 + }, + { + "epoch": 0.9890230515916575, + "grad_norm": 1.8886598348617554, + "learning_rate": 1.7234748241068742e-05, + "loss": 0.2528, + "step": 18020 + }, + { + "epoch": 0.989132821075741, + "grad_norm": 1.6672531366348267, + "learning_rate": 1.722977140913067e-05, + "loss": 0.1967, + "step": 18022 + }, + { + "epoch": 0.9892425905598243, + "grad_norm": 1.4903082847595215, + "learning_rate": 1.722479491803296e-05, + "loss": 0.2352, + "step": 18024 + }, + { + "epoch": 0.9893523600439078, + "grad_norm": 1.0355037450790405, + "learning_rate": 1.721981876799391e-05, + "loss": 0.1873, + "step": 18026 + }, + { + "epoch": 0.9894621295279912, + "grad_norm": 1.44488525390625, + "learning_rate": 1.7214842959231794e-05, + "loss": 0.2341, + "step": 18028 + }, + { + "epoch": 0.9895718990120747, + "grad_norm": 0.9816924333572388, + "learning_rate": 1.7209867491964883e-05, + "loss": 0.1594, + "step": 18030 + }, + { + "epoch": 0.989681668496158, + "grad_norm": 0.7917571663856506, + "learning_rate": 1.7204892366411416e-05, + "loss": 0.2574, + "step": 18032 + }, + { + "epoch": 0.9897914379802415, + "grad_norm": 1.0381715297698975, + "learning_rate": 1.7199917582789633e-05, + "loss": 0.1506, + "step": 18034 + }, + { + "epoch": 0.989901207464325, + "grad_norm": 1.2071247100830078, + "learning_rate": 1.719494314131775e-05, + "loss": 0.178, + "step": 18036 + }, + { + "epoch": 0.9900109769484083, + "grad_norm": 0.8195051550865173, + "learning_rate": 1.718996904221397e-05, + "loss": 0.2137, + "step": 18038 + }, + { + "epoch": 0.9901207464324918, + "grad_norm": 1.4073076248168945, + "learning_rate": 1.7184995285696477e-05, + "loss": 0.1555, + "step": 18040 + }, + { + "epoch": 0.9902305159165752, + "grad_norm": 1.6445338726043701, + "learning_rate": 1.7180021871983454e-05, + "loss": 0.2807, + "step": 18042 + }, + { + "epoch": 0.9903402854006587, + "grad_norm": 1.1246343851089478, + "learning_rate": 1.717504880129304e-05, + "loss": 0.1618, + "step": 18044 + }, + { + "epoch": 0.990450054884742, + "grad_norm": 1.4132155179977417, + "learning_rate": 1.7170076073843412e-05, + "loss": 0.2581, + "step": 18046 + }, + { + "epoch": 0.9905598243688255, + "grad_norm": 1.0363644361495972, + "learning_rate": 1.716510368985267e-05, + "loss": 0.2389, + "step": 18048 + }, + { + "epoch": 0.9906695938529089, + "grad_norm": 2.562138557434082, + "learning_rate": 1.716013164953894e-05, + "loss": 0.2931, + "step": 18050 + }, + { + "epoch": 0.9907793633369923, + "grad_norm": 1.0435714721679688, + "learning_rate": 1.7155159953120313e-05, + "loss": 0.2168, + "step": 18052 + }, + { + "epoch": 0.9908891328210757, + "grad_norm": 2.0387535095214844, + "learning_rate": 1.7150188600814877e-05, + "loss": 0.2555, + "step": 18054 + }, + { + "epoch": 0.9909989023051592, + "grad_norm": 1.238996982574463, + "learning_rate": 1.71452175928407e-05, + "loss": 0.2382, + "step": 18056 + }, + { + "epoch": 0.9911086717892426, + "grad_norm": 1.4534200429916382, + "learning_rate": 1.714024692941583e-05, + "loss": 0.1724, + "step": 18058 + }, + { + "epoch": 0.991218441273326, + "grad_norm": 1.0861629247665405, + "learning_rate": 1.7135276610758307e-05, + "loss": 0.2525, + "step": 18060 + }, + { + "epoch": 0.9913282107574094, + "grad_norm": 1.5620629787445068, + "learning_rate": 1.713030663708616e-05, + "loss": 0.1568, + "step": 18062 + }, + { + "epoch": 0.9914379802414929, + "grad_norm": 2.210616111755371, + "learning_rate": 1.7125337008617386e-05, + "loss": 0.1741, + "step": 18064 + }, + { + "epoch": 0.9915477497255762, + "grad_norm": 0.941028892993927, + "learning_rate": 1.712036772556998e-05, + "loss": 0.196, + "step": 18066 + }, + { + "epoch": 0.9916575192096597, + "grad_norm": 1.813791275024414, + "learning_rate": 1.7115398788161925e-05, + "loss": 0.2915, + "step": 18068 + }, + { + "epoch": 0.9917672886937431, + "grad_norm": 1.4047645330429077, + "learning_rate": 1.7110430196611174e-05, + "loss": 0.1953, + "step": 18070 + }, + { + "epoch": 0.9918770581778266, + "grad_norm": 1.8078348636627197, + "learning_rate": 1.7105461951135686e-05, + "loss": 0.2221, + "step": 18072 + }, + { + "epoch": 0.99198682766191, + "grad_norm": 1.6518609523773193, + "learning_rate": 1.7100494051953373e-05, + "loss": 0.3283, + "step": 18074 + }, + { + "epoch": 0.9920965971459934, + "grad_norm": 1.3315621614456177, + "learning_rate": 1.7095526499282172e-05, + "loss": 0.2296, + "step": 18076 + }, + { + "epoch": 0.9922063666300769, + "grad_norm": 1.235178828239441, + "learning_rate": 1.7090559293339974e-05, + "loss": 0.1944, + "step": 18078 + }, + { + "epoch": 0.9923161361141603, + "grad_norm": 2.1508450508117676, + "learning_rate": 1.708559243434467e-05, + "loss": 0.1244, + "step": 18080 + }, + { + "epoch": 0.9924259055982437, + "grad_norm": 1.1904489994049072, + "learning_rate": 1.7080625922514132e-05, + "loss": 0.214, + "step": 18082 + }, + { + "epoch": 0.9925356750823271, + "grad_norm": 1.28837251663208, + "learning_rate": 1.7075659758066208e-05, + "loss": 0.1725, + "step": 18084 + }, + { + "epoch": 0.9926454445664106, + "grad_norm": 1.2587296962738037, + "learning_rate": 1.7070693941218742e-05, + "loss": 0.1291, + "step": 18086 + }, + { + "epoch": 0.9927552140504939, + "grad_norm": 1.3787261247634888, + "learning_rate": 1.7065728472189563e-05, + "loss": 0.233, + "step": 18088 + }, + { + "epoch": 0.9928649835345774, + "grad_norm": 1.164246916770935, + "learning_rate": 1.706076335119647e-05, + "loss": 0.2192, + "step": 18090 + }, + { + "epoch": 0.9929747530186608, + "grad_norm": 2.694976568222046, + "learning_rate": 1.7055798578457266e-05, + "loss": 0.2195, + "step": 18092 + }, + { + "epoch": 0.9930845225027443, + "grad_norm": 0.8135653734207153, + "learning_rate": 1.7050834154189733e-05, + "loss": 0.1541, + "step": 18094 + }, + { + "epoch": 0.9931942919868276, + "grad_norm": 1.4641973972320557, + "learning_rate": 1.7045870078611627e-05, + "loss": 0.1709, + "step": 18096 + }, + { + "epoch": 0.9933040614709111, + "grad_norm": 1.3160045146942139, + "learning_rate": 1.70409063519407e-05, + "loss": 0.2999, + "step": 18098 + }, + { + "epoch": 0.9934138309549945, + "grad_norm": 1.4535367488861084, + "learning_rate": 1.703594297439469e-05, + "loss": 0.2332, + "step": 18100 + }, + { + "epoch": 0.993523600439078, + "grad_norm": 1.0151000022888184, + "learning_rate": 1.7030979946191306e-05, + "loss": 0.3024, + "step": 18102 + }, + { + "epoch": 0.9936333699231613, + "grad_norm": 1.630025863647461, + "learning_rate": 1.702601726754825e-05, + "loss": 0.2703, + "step": 18104 + }, + { + "epoch": 0.9937431394072448, + "grad_norm": 1.517034888267517, + "learning_rate": 1.7021054938683223e-05, + "loss": 0.1451, + "step": 18106 + }, + { + "epoch": 0.9938529088913282, + "grad_norm": 1.1384141445159912, + "learning_rate": 1.7016092959813893e-05, + "loss": 0.1821, + "step": 18108 + }, + { + "epoch": 0.9939626783754116, + "grad_norm": 1.4123791456222534, + "learning_rate": 1.7011131331157907e-05, + "loss": 0.2489, + "step": 18110 + }, + { + "epoch": 0.9940724478594951, + "grad_norm": 1.9003533124923706, + "learning_rate": 1.7006170052932916e-05, + "loss": 0.3532, + "step": 18112 + }, + { + "epoch": 0.9941822173435785, + "grad_norm": 1.5946546792984009, + "learning_rate": 1.7001209125356543e-05, + "loss": 0.3177, + "step": 18114 + }, + { + "epoch": 0.994291986827662, + "grad_norm": 0.8325018882751465, + "learning_rate": 1.6996248548646394e-05, + "loss": 0.1334, + "step": 18116 + }, + { + "epoch": 0.9944017563117453, + "grad_norm": 1.5858323574066162, + "learning_rate": 1.6991288323020075e-05, + "loss": 0.3507, + "step": 18118 + }, + { + "epoch": 0.9945115257958288, + "grad_norm": 1.1186262369155884, + "learning_rate": 1.6986328448695148e-05, + "loss": 0.168, + "step": 18120 + }, + { + "epoch": 0.9946212952799122, + "grad_norm": 1.0636301040649414, + "learning_rate": 1.6981368925889203e-05, + "loss": 0.2261, + "step": 18122 + }, + { + "epoch": 0.9947310647639956, + "grad_norm": 1.8942551612854004, + "learning_rate": 1.6976409754819767e-05, + "loss": 0.2298, + "step": 18124 + }, + { + "epoch": 0.994840834248079, + "grad_norm": 0.948589563369751, + "learning_rate": 1.697145093570438e-05, + "loss": 0.1207, + "step": 18126 + }, + { + "epoch": 0.9949506037321625, + "grad_norm": 1.0701930522918701, + "learning_rate": 1.6966492468760565e-05, + "loss": 0.1995, + "step": 18128 + }, + { + "epoch": 0.9950603732162459, + "grad_norm": 1.3021845817565918, + "learning_rate": 1.696153435420582e-05, + "loss": 0.1898, + "step": 18130 + }, + { + "epoch": 0.9951701427003293, + "grad_norm": 1.3297920227050781, + "learning_rate": 1.6956576592257635e-05, + "loss": 0.2902, + "step": 18132 + }, + { + "epoch": 0.9952799121844127, + "grad_norm": 0.9799164533615112, + "learning_rate": 1.6951619183133477e-05, + "loss": 0.1793, + "step": 18134 + }, + { + "epoch": 0.9953896816684962, + "grad_norm": 1.790246844291687, + "learning_rate": 1.69466621270508e-05, + "loss": 0.2987, + "step": 18136 + }, + { + "epoch": 0.9954994511525795, + "grad_norm": 1.4825443029403687, + "learning_rate": 1.6941705424227054e-05, + "loss": 0.2104, + "step": 18138 + }, + { + "epoch": 0.995609220636663, + "grad_norm": 1.1557230949401855, + "learning_rate": 1.693674907487966e-05, + "loss": 0.2996, + "step": 18140 + }, + { + "epoch": 0.9957189901207464, + "grad_norm": 1.540940761566162, + "learning_rate": 1.6931793079226034e-05, + "loss": 0.2741, + "step": 18142 + }, + { + "epoch": 0.9958287596048299, + "grad_norm": 1.2395601272583008, + "learning_rate": 1.6926837437483566e-05, + "loss": 0.1587, + "step": 18144 + }, + { + "epoch": 0.9959385290889132, + "grad_norm": 1.4432930946350098, + "learning_rate": 1.6921882149869628e-05, + "loss": 0.2514, + "step": 18146 + }, + { + "epoch": 0.9960482985729967, + "grad_norm": 1.2481552362442017, + "learning_rate": 1.6916927216601593e-05, + "loss": 0.2173, + "step": 18148 + }, + { + "epoch": 0.9961580680570802, + "grad_norm": 1.764037013053894, + "learning_rate": 1.69119726378968e-05, + "loss": 0.1855, + "step": 18150 + }, + { + "epoch": 0.9962678375411635, + "grad_norm": 0.8663426041603088, + "learning_rate": 1.6907018413972586e-05, + "loss": 0.1796, + "step": 18152 + }, + { + "epoch": 0.996377607025247, + "grad_norm": 0.7743386626243591, + "learning_rate": 1.690206454504627e-05, + "loss": 0.1014, + "step": 18154 + }, + { + "epoch": 0.9964873765093304, + "grad_norm": 1.2293121814727783, + "learning_rate": 1.6897111031335145e-05, + "loss": 0.194, + "step": 18156 + }, + { + "epoch": 0.9965971459934139, + "grad_norm": 1.3449833393096924, + "learning_rate": 1.6892157873056506e-05, + "loss": 0.2164, + "step": 18158 + }, + { + "epoch": 0.9967069154774972, + "grad_norm": 1.109276533126831, + "learning_rate": 1.688720507042762e-05, + "loss": 0.1722, + "step": 18160 + }, + { + "epoch": 0.9968166849615807, + "grad_norm": 1.9153090715408325, + "learning_rate": 1.6882252623665736e-05, + "loss": 0.2616, + "step": 18162 + }, + { + "epoch": 0.9969264544456641, + "grad_norm": 1.0958738327026367, + "learning_rate": 1.6877300532988094e-05, + "loss": 0.1817, + "step": 18164 + }, + { + "epoch": 0.9970362239297476, + "grad_norm": 1.7004623413085938, + "learning_rate": 1.6872348798611915e-05, + "loss": 0.3501, + "step": 18166 + }, + { + "epoch": 0.9971459934138309, + "grad_norm": 0.8730112910270691, + "learning_rate": 1.686739742075442e-05, + "loss": 0.2221, + "step": 18168 + }, + { + "epoch": 0.9972557628979144, + "grad_norm": 1.9020458459854126, + "learning_rate": 1.6862446399632783e-05, + "loss": 0.2243, + "step": 18170 + }, + { + "epoch": 0.9973655323819978, + "grad_norm": 2.5292818546295166, + "learning_rate": 1.6857495735464195e-05, + "loss": 0.2212, + "step": 18172 + }, + { + "epoch": 0.9974753018660812, + "grad_norm": 1.418446660041809, + "learning_rate": 1.6852545428465814e-05, + "loss": 0.2287, + "step": 18174 + }, + { + "epoch": 0.9975850713501646, + "grad_norm": 1.8597182035446167, + "learning_rate": 1.6847595478854773e-05, + "loss": 0.1443, + "step": 18176 + }, + { + "epoch": 0.9976948408342481, + "grad_norm": 1.9669151306152344, + "learning_rate": 1.6842645886848212e-05, + "loss": 0.2412, + "step": 18178 + }, + { + "epoch": 0.9978046103183315, + "grad_norm": 1.566890835762024, + "learning_rate": 1.6837696652663242e-05, + "loss": 0.3802, + "step": 18180 + }, + { + "epoch": 0.9979143798024149, + "grad_norm": 1.2821217775344849, + "learning_rate": 1.6832747776516954e-05, + "loss": 0.243, + "step": 18182 + }, + { + "epoch": 0.9980241492864984, + "grad_norm": 1.4208831787109375, + "learning_rate": 1.6827799258626442e-05, + "loss": 0.2001, + "step": 18184 + }, + { + "epoch": 0.9981339187705818, + "grad_norm": 0.9302811622619629, + "learning_rate": 1.6822851099208765e-05, + "loss": 0.1947, + "step": 18186 + }, + { + "epoch": 0.9982436882546653, + "grad_norm": 1.0003106594085693, + "learning_rate": 1.681790329848097e-05, + "loss": 0.2864, + "step": 18188 + }, + { + "epoch": 0.9983534577387486, + "grad_norm": 1.4807082414627075, + "learning_rate": 1.68129558566601e-05, + "loss": 0.2022, + "step": 18190 + }, + { + "epoch": 0.9984632272228321, + "grad_norm": 1.1269475221633911, + "learning_rate": 1.6808008773963173e-05, + "loss": 0.2733, + "step": 18192 + }, + { + "epoch": 0.9985729967069155, + "grad_norm": 1.0555461645126343, + "learning_rate": 1.6803062050607187e-05, + "loss": 0.1722, + "step": 18194 + }, + { + "epoch": 0.9986827661909989, + "grad_norm": 1.1835594177246094, + "learning_rate": 1.6798115686809125e-05, + "loss": 0.2585, + "step": 18196 + }, + { + "epoch": 0.9987925356750823, + "grad_norm": 1.346018671989441, + "learning_rate": 1.6793169682785977e-05, + "loss": 0.1625, + "step": 18198 + }, + { + "epoch": 0.9989023051591658, + "grad_norm": 1.7880443334579468, + "learning_rate": 1.6788224038754687e-05, + "loss": 0.1319, + "step": 18200 + }, + { + "epoch": 0.9990120746432491, + "grad_norm": 0.8333824872970581, + "learning_rate": 1.6783278754932187e-05, + "loss": 0.2167, + "step": 18202 + }, + { + "epoch": 0.9991218441273326, + "grad_norm": 1.068245768547058, + "learning_rate": 1.677833383153542e-05, + "loss": 0.1912, + "step": 18204 + }, + { + "epoch": 0.999231613611416, + "grad_norm": 1.7817180156707764, + "learning_rate": 1.6773389268781282e-05, + "loss": 0.2491, + "step": 18206 + }, + { + "epoch": 0.9993413830954995, + "grad_norm": 1.1029386520385742, + "learning_rate": 1.676844506688667e-05, + "loss": 0.2153, + "step": 18208 + }, + { + "epoch": 0.9994511525795828, + "grad_norm": 1.7083536386489868, + "learning_rate": 1.6763501226068465e-05, + "loss": 0.2018, + "step": 18210 + }, + { + "epoch": 0.9995609220636663, + "grad_norm": 1.0237996578216553, + "learning_rate": 1.6758557746543518e-05, + "loss": 0.1629, + "step": 18212 + }, + { + "epoch": 0.9996706915477497, + "grad_norm": 0.9468595385551453, + "learning_rate": 1.675361462852868e-05, + "loss": 0.2531, + "step": 18214 + }, + { + "epoch": 0.9997804610318332, + "grad_norm": 1.2268515825271606, + "learning_rate": 1.6748671872240785e-05, + "loss": 0.2274, + "step": 18216 + }, + { + "epoch": 0.9998902305159165, + "grad_norm": 2.5217905044555664, + "learning_rate": 1.6743729477896636e-05, + "loss": 0.2498, + "step": 18218 + }, + { + "epoch": 1.0, + "grad_norm": 1.252016305923462, + "learning_rate": 1.6738787445713037e-05, + "loss": 0.2407, + "step": 18220 + }, + { + "epoch": 1.0001097694840835, + "grad_norm": 1.4718986749649048, + "learning_rate": 1.6733845775906773e-05, + "loss": 0.2143, + "step": 18222 + }, + { + "epoch": 1.000219538968167, + "grad_norm": 1.0624723434448242, + "learning_rate": 1.67289044686946e-05, + "loss": 0.1716, + "step": 18224 + }, + { + "epoch": 1.0003293084522502, + "grad_norm": 1.0767370462417603, + "learning_rate": 1.672396352429327e-05, + "loss": 0.1742, + "step": 18226 + }, + { + "epoch": 1.0004390779363337, + "grad_norm": 1.9632922410964966, + "learning_rate": 1.6719022942919527e-05, + "loss": 0.282, + "step": 18228 + }, + { + "epoch": 1.0005488474204172, + "grad_norm": 1.245956301689148, + "learning_rate": 1.6714082724790088e-05, + "loss": 0.1892, + "step": 18230 + }, + { + "epoch": 1.0006586169045006, + "grad_norm": 1.401057481765747, + "learning_rate": 1.6709142870121643e-05, + "loss": 0.2535, + "step": 18232 + }, + { + "epoch": 1.000768386388584, + "grad_norm": 2.0193676948547363, + "learning_rate": 1.670420337913089e-05, + "loss": 0.2855, + "step": 18234 + }, + { + "epoch": 1.0008781558726674, + "grad_norm": 1.6174160242080688, + "learning_rate": 1.6699264252034497e-05, + "loss": 0.1687, + "step": 18236 + }, + { + "epoch": 1.0009879253567509, + "grad_norm": 1.4179115295410156, + "learning_rate": 1.6694325489049108e-05, + "loss": 0.2486, + "step": 18238 + }, + { + "epoch": 1.0010976948408343, + "grad_norm": 1.0531511306762695, + "learning_rate": 1.668938709039138e-05, + "loss": 0.1564, + "step": 18240 + }, + { + "epoch": 1.0012074643249176, + "grad_norm": 1.839751124382019, + "learning_rate": 1.668444905627792e-05, + "loss": 0.2747, + "step": 18242 + }, + { + "epoch": 1.001317233809001, + "grad_norm": 1.4691253900527954, + "learning_rate": 1.6679511386925337e-05, + "loss": 0.1963, + "step": 18244 + }, + { + "epoch": 1.0014270032930845, + "grad_norm": 1.3935692310333252, + "learning_rate": 1.667457408255023e-05, + "loss": 0.2945, + "step": 18246 + }, + { + "epoch": 1.001536772777168, + "grad_norm": 0.7604548335075378, + "learning_rate": 1.666963714336916e-05, + "loss": 0.1752, + "step": 18248 + }, + { + "epoch": 1.0016465422612513, + "grad_norm": 0.9037689566612244, + "learning_rate": 1.6664700569598696e-05, + "loss": 0.1833, + "step": 18250 + }, + { + "epoch": 1.0017563117453347, + "grad_norm": 1.4173023700714111, + "learning_rate": 1.6659764361455383e-05, + "loss": 0.1819, + "step": 18252 + }, + { + "epoch": 1.0018660812294182, + "grad_norm": 0.8922256827354431, + "learning_rate": 1.665482851915573e-05, + "loss": 0.1781, + "step": 18254 + }, + { + "epoch": 1.0019758507135017, + "grad_norm": 0.7722444534301758, + "learning_rate": 1.6649893042916258e-05, + "loss": 0.1287, + "step": 18256 + }, + { + "epoch": 1.0020856201975852, + "grad_norm": 0.9732455015182495, + "learning_rate": 1.664495793295347e-05, + "loss": 0.2034, + "step": 18258 + }, + { + "epoch": 1.0021953896816684, + "grad_norm": 1.3269808292388916, + "learning_rate": 1.6640023189483835e-05, + "loss": 0.2085, + "step": 18260 + }, + { + "epoch": 1.002305159165752, + "grad_norm": 2.069471597671509, + "learning_rate": 1.6635088812723813e-05, + "loss": 0.2904, + "step": 18262 + }, + { + "epoch": 1.0024149286498354, + "grad_norm": 1.5655286312103271, + "learning_rate": 1.663015480288986e-05, + "loss": 0.2236, + "step": 18264 + }, + { + "epoch": 1.0025246981339189, + "grad_norm": 2.0459063053131104, + "learning_rate": 1.6625221160198395e-05, + "loss": 0.1849, + "step": 18266 + }, + { + "epoch": 1.0026344676180021, + "grad_norm": 0.9910246729850769, + "learning_rate": 1.662028788486583e-05, + "loss": 0.2409, + "step": 18268 + }, + { + "epoch": 1.0027442371020856, + "grad_norm": 1.3225618600845337, + "learning_rate": 1.6615354977108576e-05, + "loss": 0.1488, + "step": 18270 + }, + { + "epoch": 1.002854006586169, + "grad_norm": 1.0027052164077759, + "learning_rate": 1.6610422437143007e-05, + "loss": 0.1526, + "step": 18272 + }, + { + "epoch": 1.0029637760702526, + "grad_norm": 1.2380560636520386, + "learning_rate": 1.6605490265185485e-05, + "loss": 0.1313, + "step": 18274 + }, + { + "epoch": 1.0030735455543358, + "grad_norm": 1.1332428455352783, + "learning_rate": 1.660055846145237e-05, + "loss": 0.1529, + "step": 18276 + }, + { + "epoch": 1.0031833150384193, + "grad_norm": 2.0100460052490234, + "learning_rate": 1.6595627026159984e-05, + "loss": 0.282, + "step": 18278 + }, + { + "epoch": 1.0032930845225028, + "grad_norm": 0.9265925884246826, + "learning_rate": 1.659069595952464e-05, + "loss": 0.1931, + "step": 18280 + }, + { + "epoch": 1.0034028540065862, + "grad_norm": 1.115951418876648, + "learning_rate": 1.6585765261762655e-05, + "loss": 0.288, + "step": 18282 + }, + { + "epoch": 1.0035126234906695, + "grad_norm": 0.9394807815551758, + "learning_rate": 1.65808349330903e-05, + "loss": 0.1999, + "step": 18284 + }, + { + "epoch": 1.003622392974753, + "grad_norm": 1.3001309633255005, + "learning_rate": 1.6575904973723844e-05, + "loss": 0.239, + "step": 18286 + }, + { + "epoch": 1.0037321624588365, + "grad_norm": 1.7059693336486816, + "learning_rate": 1.6570975383879546e-05, + "loss": 0.3065, + "step": 18288 + }, + { + "epoch": 1.00384193194292, + "grad_norm": 2.054152488708496, + "learning_rate": 1.6566046163773647e-05, + "loss": 0.3297, + "step": 18290 + }, + { + "epoch": 1.0039517014270032, + "grad_norm": 1.0846539735794067, + "learning_rate": 1.656111731362236e-05, + "loss": 0.2019, + "step": 18292 + }, + { + "epoch": 1.0040614709110867, + "grad_norm": 1.8302847146987915, + "learning_rate": 1.6556188833641878e-05, + "loss": 0.1861, + "step": 18294 + }, + { + "epoch": 1.0041712403951701, + "grad_norm": 1.3555501699447632, + "learning_rate": 1.6551260724048408e-05, + "loss": 0.2112, + "step": 18296 + }, + { + "epoch": 1.0042810098792536, + "grad_norm": 1.1597504615783691, + "learning_rate": 1.6546332985058106e-05, + "loss": 0.1204, + "step": 18298 + }, + { + "epoch": 1.004390779363337, + "grad_norm": 1.8751580715179443, + "learning_rate": 1.6541405616887137e-05, + "loss": 0.2302, + "step": 18300 + }, + { + "epoch": 1.0045005488474203, + "grad_norm": 1.5461583137512207, + "learning_rate": 1.6536478619751635e-05, + "loss": 0.2762, + "step": 18302 + }, + { + "epoch": 1.0046103183315038, + "grad_norm": 0.9421944618225098, + "learning_rate": 1.6531551993867717e-05, + "loss": 0.2329, + "step": 18304 + }, + { + "epoch": 1.0047200878155873, + "grad_norm": 4.904019832611084, + "learning_rate": 1.65266257394515e-05, + "loss": 0.2831, + "step": 18306 + }, + { + "epoch": 1.0048298572996708, + "grad_norm": 1.3492845296859741, + "learning_rate": 1.6521699856719062e-05, + "loss": 0.2599, + "step": 18308 + }, + { + "epoch": 1.004939626783754, + "grad_norm": 2.331536293029785, + "learning_rate": 1.6516774345886486e-05, + "loss": 0.1613, + "step": 18310 + }, + { + "epoch": 1.0050493962678375, + "grad_norm": 1.8708208799362183, + "learning_rate": 1.6511849207169826e-05, + "loss": 0.3078, + "step": 18312 + }, + { + "epoch": 1.005159165751921, + "grad_norm": 1.1788547039031982, + "learning_rate": 1.6506924440785122e-05, + "loss": 0.2236, + "step": 18314 + }, + { + "epoch": 1.0052689352360045, + "grad_norm": 1.5017404556274414, + "learning_rate": 1.650200004694839e-05, + "loss": 0.2256, + "step": 18316 + }, + { + "epoch": 1.0053787047200877, + "grad_norm": 1.1903430223464966, + "learning_rate": 1.6497076025875653e-05, + "loss": 0.1612, + "step": 18318 + }, + { + "epoch": 1.0054884742041712, + "grad_norm": 2.2049129009246826, + "learning_rate": 1.6492152377782898e-05, + "loss": 0.2872, + "step": 18320 + }, + { + "epoch": 1.0055982436882547, + "grad_norm": 0.9124676585197449, + "learning_rate": 1.6487229102886097e-05, + "loss": 0.1709, + "step": 18322 + }, + { + "epoch": 1.0057080131723382, + "grad_norm": 1.1732912063598633, + "learning_rate": 1.648230620140121e-05, + "loss": 0.1355, + "step": 18324 + }, + { + "epoch": 1.0058177826564214, + "grad_norm": 0.8762179613113403, + "learning_rate": 1.6477383673544183e-05, + "loss": 0.2232, + "step": 18326 + }, + { + "epoch": 1.0059275521405049, + "grad_norm": 1.63493013381958, + "learning_rate": 1.647246151953094e-05, + "loss": 0.186, + "step": 18328 + }, + { + "epoch": 1.0060373216245884, + "grad_norm": 1.5272315740585327, + "learning_rate": 1.6467539739577383e-05, + "loss": 0.1373, + "step": 18330 + }, + { + "epoch": 1.0061470911086718, + "grad_norm": 1.2189457416534424, + "learning_rate": 1.6462618333899422e-05, + "loss": 0.1713, + "step": 18332 + }, + { + "epoch": 1.0062568605927553, + "grad_norm": 2.8012783527374268, + "learning_rate": 1.6457697302712918e-05, + "loss": 0.1486, + "step": 18334 + }, + { + "epoch": 1.0063666300768386, + "grad_norm": 1.2693705558776855, + "learning_rate": 1.6452776646233742e-05, + "loss": 0.1831, + "step": 18336 + }, + { + "epoch": 1.006476399560922, + "grad_norm": 1.4393810033798218, + "learning_rate": 1.644785636467774e-05, + "loss": 0.302, + "step": 18338 + }, + { + "epoch": 1.0065861690450055, + "grad_norm": 1.1245803833007812, + "learning_rate": 1.644293645826072e-05, + "loss": 0.1838, + "step": 18340 + }, + { + "epoch": 1.006695938529089, + "grad_norm": 0.907069206237793, + "learning_rate": 1.643801692719852e-05, + "loss": 0.2512, + "step": 18342 + }, + { + "epoch": 1.0068057080131723, + "grad_norm": 0.8799880743026733, + "learning_rate": 1.643309777170692e-05, + "loss": 0.1101, + "step": 18344 + }, + { + "epoch": 1.0069154774972557, + "grad_norm": 1.3612310886383057, + "learning_rate": 1.642817899200169e-05, + "loss": 0.2109, + "step": 18346 + }, + { + "epoch": 1.0070252469813392, + "grad_norm": 1.153641939163208, + "learning_rate": 1.642326058829861e-05, + "loss": 0.2707, + "step": 18348 + }, + { + "epoch": 1.0071350164654227, + "grad_norm": 0.9895077347755432, + "learning_rate": 1.641834256081342e-05, + "loss": 0.2038, + "step": 18350 + }, + { + "epoch": 1.007244785949506, + "grad_norm": 1.272355556488037, + "learning_rate": 1.6413424909761846e-05, + "loss": 0.2405, + "step": 18352 + }, + { + "epoch": 1.0073545554335894, + "grad_norm": 1.368269443511963, + "learning_rate": 1.64085076353596e-05, + "loss": 0.259, + "step": 18354 + }, + { + "epoch": 1.007464324917673, + "grad_norm": 1.2046259641647339, + "learning_rate": 1.6403590737822376e-05, + "loss": 0.1209, + "step": 18356 + }, + { + "epoch": 1.0075740944017564, + "grad_norm": 1.2291796207427979, + "learning_rate": 1.639867421736586e-05, + "loss": 0.2416, + "step": 18358 + }, + { + "epoch": 1.0076838638858396, + "grad_norm": 1.2710120677947998, + "learning_rate": 1.6393758074205708e-05, + "loss": 0.1901, + "step": 18360 + }, + { + "epoch": 1.0077936333699231, + "grad_norm": 1.341607928276062, + "learning_rate": 1.638884230855757e-05, + "loss": 0.1571, + "step": 18362 + }, + { + "epoch": 1.0079034028540066, + "grad_norm": 1.2136967182159424, + "learning_rate": 1.6383926920637077e-05, + "loss": 0.1663, + "step": 18364 + }, + { + "epoch": 1.00801317233809, + "grad_norm": 1.2961527109146118, + "learning_rate": 1.6379011910659837e-05, + "loss": 0.1625, + "step": 18366 + }, + { + "epoch": 1.0081229418221733, + "grad_norm": 1.252769112586975, + "learning_rate": 1.637409727884145e-05, + "loss": 0.1878, + "step": 18368 + }, + { + "epoch": 1.0082327113062568, + "grad_norm": 1.0296287536621094, + "learning_rate": 1.6369183025397493e-05, + "loss": 0.1645, + "step": 18370 + }, + { + "epoch": 1.0083424807903403, + "grad_norm": 1.1726199388504028, + "learning_rate": 1.6364269150543532e-05, + "loss": 0.1686, + "step": 18372 + }, + { + "epoch": 1.0084522502744238, + "grad_norm": 1.0091748237609863, + "learning_rate": 1.6359355654495113e-05, + "loss": 0.3591, + "step": 18374 + }, + { + "epoch": 1.0085620197585072, + "grad_norm": 1.8586221933364868, + "learning_rate": 1.6354442537467757e-05, + "loss": 0.171, + "step": 18376 + }, + { + "epoch": 1.0086717892425905, + "grad_norm": 1.0719845294952393, + "learning_rate": 1.6349529799676995e-05, + "loss": 0.2189, + "step": 18378 + }, + { + "epoch": 1.008781558726674, + "grad_norm": 1.950170874595642, + "learning_rate": 1.634461744133831e-05, + "loss": 0.2244, + "step": 18380 + }, + { + "epoch": 1.0088913282107574, + "grad_norm": 0.8918461203575134, + "learning_rate": 1.6339705462667196e-05, + "loss": 0.1594, + "step": 18382 + }, + { + "epoch": 1.009001097694841, + "grad_norm": 1.9032856225967407, + "learning_rate": 1.63347938638791e-05, + "loss": 0.1839, + "step": 18384 + }, + { + "epoch": 1.0091108671789242, + "grad_norm": 1.4269062280654907, + "learning_rate": 1.632988264518948e-05, + "loss": 0.2158, + "step": 18386 + }, + { + "epoch": 1.0092206366630077, + "grad_norm": 1.4740171432495117, + "learning_rate": 1.6324971806813767e-05, + "loss": 0.3011, + "step": 18388 + }, + { + "epoch": 1.0093304061470911, + "grad_norm": 1.3986337184906006, + "learning_rate": 1.632006134896736e-05, + "loss": 0.169, + "step": 18390 + }, + { + "epoch": 1.0094401756311746, + "grad_norm": 0.962369978427887, + "learning_rate": 1.6315151271865672e-05, + "loss": 0.2657, + "step": 18392 + }, + { + "epoch": 1.0095499451152579, + "grad_norm": 1.418782114982605, + "learning_rate": 1.6310241575724077e-05, + "loss": 0.242, + "step": 18394 + }, + { + "epoch": 1.0096597145993413, + "grad_norm": 2.1662070751190186, + "learning_rate": 1.6305332260757936e-05, + "loss": 0.1738, + "step": 18396 + }, + { + "epoch": 1.0097694840834248, + "grad_norm": 1.7535909414291382, + "learning_rate": 1.63004233271826e-05, + "loss": 0.2451, + "step": 18398 + }, + { + "epoch": 1.0098792535675083, + "grad_norm": 4.425863265991211, + "learning_rate": 1.6295514775213398e-05, + "loss": 0.2892, + "step": 18400 + }, + { + "epoch": 1.0099890230515916, + "grad_norm": 1.2898037433624268, + "learning_rate": 1.629060660506564e-05, + "loss": 0.2438, + "step": 18402 + }, + { + "epoch": 1.010098792535675, + "grad_norm": 1.300481915473938, + "learning_rate": 1.6285698816954624e-05, + "loss": 0.1744, + "step": 18404 + }, + { + "epoch": 1.0102085620197585, + "grad_norm": 1.2245948314666748, + "learning_rate": 1.6280791411095637e-05, + "loss": 0.1616, + "step": 18406 + }, + { + "epoch": 1.010318331503842, + "grad_norm": 1.169378399848938, + "learning_rate": 1.6275884387703918e-05, + "loss": 0.2442, + "step": 18408 + }, + { + "epoch": 1.0104281009879255, + "grad_norm": 1.3709378242492676, + "learning_rate": 1.627097774699474e-05, + "loss": 0.1864, + "step": 18410 + }, + { + "epoch": 1.0105378704720087, + "grad_norm": 1.2084871530532837, + "learning_rate": 1.6266071489183327e-05, + "loss": 0.1858, + "step": 18412 + }, + { + "epoch": 1.0106476399560922, + "grad_norm": 1.940967321395874, + "learning_rate": 1.6261165614484887e-05, + "loss": 0.1204, + "step": 18414 + }, + { + "epoch": 1.0107574094401757, + "grad_norm": 1.0766258239746094, + "learning_rate": 1.625626012311461e-05, + "loss": 0.1974, + "step": 18416 + }, + { + "epoch": 1.0108671789242591, + "grad_norm": 1.949081540107727, + "learning_rate": 1.6251355015287683e-05, + "loss": 0.2951, + "step": 18418 + }, + { + "epoch": 1.0109769484083424, + "grad_norm": 1.475021243095398, + "learning_rate": 1.6246450291219266e-05, + "loss": 0.2036, + "step": 18420 + }, + { + "epoch": 1.0110867178924259, + "grad_norm": 0.7755205035209656, + "learning_rate": 1.6241545951124504e-05, + "loss": 0.1964, + "step": 18422 + }, + { + "epoch": 1.0111964873765094, + "grad_norm": 2.104400634765625, + "learning_rate": 1.623664199521853e-05, + "loss": 0.2728, + "step": 18424 + }, + { + "epoch": 1.0113062568605928, + "grad_norm": 1.8421905040740967, + "learning_rate": 1.623173842371644e-05, + "loss": 0.2133, + "step": 18426 + }, + { + "epoch": 1.011416026344676, + "grad_norm": 1.999017357826233, + "learning_rate": 1.6226835236833354e-05, + "loss": 0.2393, + "step": 18428 + }, + { + "epoch": 1.0115257958287596, + "grad_norm": 1.143355369567871, + "learning_rate": 1.622193243478433e-05, + "loss": 0.2333, + "step": 18430 + }, + { + "epoch": 1.011635565312843, + "grad_norm": 1.26078462600708, + "learning_rate": 1.621703001778443e-05, + "loss": 0.1752, + "step": 18432 + }, + { + "epoch": 1.0117453347969265, + "grad_norm": 1.6504231691360474, + "learning_rate": 1.621212798604871e-05, + "loss": 0.2729, + "step": 18434 + }, + { + "epoch": 1.0118551042810098, + "grad_norm": 1.5231410264968872, + "learning_rate": 1.620722633979219e-05, + "loss": 0.3441, + "step": 18436 + }, + { + "epoch": 1.0119648737650933, + "grad_norm": 0.9168989658355713, + "learning_rate": 1.620232507922987e-05, + "loss": 0.1291, + "step": 18438 + }, + { + "epoch": 1.0120746432491767, + "grad_norm": 0.8612396717071533, + "learning_rate": 1.6197424204576757e-05, + "loss": 0.1719, + "step": 18440 + }, + { + "epoch": 1.0121844127332602, + "grad_norm": 3.795095205307007, + "learning_rate": 1.6192523716047827e-05, + "loss": 0.1626, + "step": 18442 + }, + { + "epoch": 1.0122941822173437, + "grad_norm": 1.539628267288208, + "learning_rate": 1.6187623613858038e-05, + "loss": 0.2586, + "step": 18444 + }, + { + "epoch": 1.012403951701427, + "grad_norm": 1.0353896617889404, + "learning_rate": 1.618272389822233e-05, + "loss": 0.1381, + "step": 18446 + }, + { + "epoch": 1.0125137211855104, + "grad_norm": 1.1867194175720215, + "learning_rate": 1.617782456935563e-05, + "loss": 0.1894, + "step": 18448 + }, + { + "epoch": 1.012623490669594, + "grad_norm": 1.0186796188354492, + "learning_rate": 1.6172925627472846e-05, + "loss": 0.1518, + "step": 18450 + }, + { + "epoch": 1.0127332601536774, + "grad_norm": 1.1447904109954834, + "learning_rate": 1.6168027072788867e-05, + "loss": 0.2266, + "step": 18452 + }, + { + "epoch": 1.0128430296377606, + "grad_norm": 0.7944174408912659, + "learning_rate": 1.6163128905518578e-05, + "loss": 0.1393, + "step": 18454 + }, + { + "epoch": 1.012952799121844, + "grad_norm": 0.851800262928009, + "learning_rate": 1.6158231125876823e-05, + "loss": 0.2498, + "step": 18456 + }, + { + "epoch": 1.0130625686059276, + "grad_norm": 1.7672349214553833, + "learning_rate": 1.6153333734078448e-05, + "loss": 0.2393, + "step": 18458 + }, + { + "epoch": 1.013172338090011, + "grad_norm": 1.08183753490448, + "learning_rate": 1.614843673033828e-05, + "loss": 0.1886, + "step": 18460 + }, + { + "epoch": 1.0132821075740943, + "grad_norm": 0.8838032484054565, + "learning_rate": 1.6143540114871124e-05, + "loss": 0.1705, + "step": 18462 + }, + { + "epoch": 1.0133918770581778, + "grad_norm": 0.974000096321106, + "learning_rate": 1.6138643887891763e-05, + "loss": 0.1971, + "step": 18464 + }, + { + "epoch": 1.0135016465422613, + "grad_norm": 0.8932402729988098, + "learning_rate": 1.6133748049614984e-05, + "loss": 0.1888, + "step": 18466 + }, + { + "epoch": 1.0136114160263447, + "grad_norm": 1.4452314376831055, + "learning_rate": 1.612885260025552e-05, + "loss": 0.1916, + "step": 18468 + }, + { + "epoch": 1.013721185510428, + "grad_norm": 1.4793468713760376, + "learning_rate": 1.6123957540028135e-05, + "loss": 0.2234, + "step": 18470 + }, + { + "epoch": 1.0138309549945115, + "grad_norm": 1.474075198173523, + "learning_rate": 1.611906286914753e-05, + "loss": 0.3231, + "step": 18472 + }, + { + "epoch": 1.013940724478595, + "grad_norm": 0.9414989948272705, + "learning_rate": 1.6114168587828426e-05, + "loss": 0.1243, + "step": 18474 + }, + { + "epoch": 1.0140504939626784, + "grad_norm": 1.7743436098098755, + "learning_rate": 1.6109274696285495e-05, + "loss": 0.3189, + "step": 18476 + }, + { + "epoch": 1.0141602634467617, + "grad_norm": 1.0784236192703247, + "learning_rate": 1.610438119473342e-05, + "loss": 0.1448, + "step": 18478 + }, + { + "epoch": 1.0142700329308452, + "grad_norm": 1.205573558807373, + "learning_rate": 1.6099488083386847e-05, + "loss": 0.1448, + "step": 18480 + }, + { + "epoch": 1.0143798024149286, + "grad_norm": 1.839614987373352, + "learning_rate": 1.609459536246041e-05, + "loss": 0.2646, + "step": 18482 + }, + { + "epoch": 1.0144895718990121, + "grad_norm": 0.9304049015045166, + "learning_rate": 1.6089703032168733e-05, + "loss": 0.1348, + "step": 18484 + }, + { + "epoch": 1.0145993413830956, + "grad_norm": 1.2989929914474487, + "learning_rate": 1.608481109272642e-05, + "loss": 0.2675, + "step": 18486 + }, + { + "epoch": 1.0147091108671789, + "grad_norm": 0.841053307056427, + "learning_rate": 1.6079919544348045e-05, + "loss": 0.1574, + "step": 18488 + }, + { + "epoch": 1.0148188803512623, + "grad_norm": 1.9507521390914917, + "learning_rate": 1.607502838724818e-05, + "loss": 0.2662, + "step": 18490 + }, + { + "epoch": 1.0149286498353458, + "grad_norm": 2.2518651485443115, + "learning_rate": 1.607013762164138e-05, + "loss": 0.2636, + "step": 18492 + }, + { + "epoch": 1.0150384193194293, + "grad_norm": 1.0524330139160156, + "learning_rate": 1.606524724774217e-05, + "loss": 0.1452, + "step": 18494 + }, + { + "epoch": 1.0151481888035125, + "grad_norm": 1.6957476139068604, + "learning_rate": 1.6060357265765073e-05, + "loss": 0.2433, + "step": 18496 + }, + { + "epoch": 1.015257958287596, + "grad_norm": 1.3199232816696167, + "learning_rate": 1.605546767592458e-05, + "loss": 0.1318, + "step": 18498 + }, + { + "epoch": 1.0153677277716795, + "grad_norm": 0.8798060417175293, + "learning_rate": 1.605057847843518e-05, + "loss": 0.1101, + "step": 18500 + }, + { + "epoch": 1.015477497255763, + "grad_norm": 1.065272569656372, + "learning_rate": 1.6045689673511334e-05, + "loss": 0.2131, + "step": 18502 + }, + { + "epoch": 1.0155872667398462, + "grad_norm": 0.9880923628807068, + "learning_rate": 1.6040801261367493e-05, + "loss": 0.1309, + "step": 18504 + }, + { + "epoch": 1.0156970362239297, + "grad_norm": 1.367726445198059, + "learning_rate": 1.6035913242218083e-05, + "loss": 0.2023, + "step": 18506 + }, + { + "epoch": 1.0158068057080132, + "grad_norm": 1.1349841356277466, + "learning_rate": 1.603102561627751e-05, + "loss": 0.2568, + "step": 18508 + }, + { + "epoch": 1.0159165751920967, + "grad_norm": 1.040857195854187, + "learning_rate": 1.6026138383760186e-05, + "loss": 0.3268, + "step": 18510 + }, + { + "epoch": 1.01602634467618, + "grad_norm": 1.233585000038147, + "learning_rate": 1.6021251544880467e-05, + "loss": 0.1771, + "step": 18512 + }, + { + "epoch": 1.0161361141602634, + "grad_norm": 1.2608301639556885, + "learning_rate": 1.6016365099852735e-05, + "loss": 0.1934, + "step": 18514 + }, + { + "epoch": 1.0162458836443469, + "grad_norm": 1.1070901155471802, + "learning_rate": 1.6011479048891324e-05, + "loss": 0.1485, + "step": 18516 + }, + { + "epoch": 1.0163556531284303, + "grad_norm": 1.5043646097183228, + "learning_rate": 1.6006593392210554e-05, + "loss": 0.1756, + "step": 18518 + }, + { + "epoch": 1.0164654226125138, + "grad_norm": 1.1913498640060425, + "learning_rate": 1.6001708130024746e-05, + "loss": 0.1504, + "step": 18520 + }, + { + "epoch": 1.016575192096597, + "grad_norm": 1.5142258405685425, + "learning_rate": 1.5996823262548184e-05, + "loss": 0.1652, + "step": 18522 + }, + { + "epoch": 1.0166849615806806, + "grad_norm": 2.863731861114502, + "learning_rate": 1.5991938789995137e-05, + "loss": 0.1912, + "step": 18524 + }, + { + "epoch": 1.016794731064764, + "grad_norm": 1.73065984249115, + "learning_rate": 1.5987054712579876e-05, + "loss": 0.1905, + "step": 18526 + }, + { + "epoch": 1.0169045005488475, + "grad_norm": 2.5696773529052734, + "learning_rate": 1.5982171030516623e-05, + "loss": 0.1789, + "step": 18528 + }, + { + "epoch": 1.0170142700329308, + "grad_norm": 1.0677621364593506, + "learning_rate": 1.5977287744019624e-05, + "loss": 0.2022, + "step": 18530 + }, + { + "epoch": 1.0171240395170142, + "grad_norm": 1.1811033487319946, + "learning_rate": 1.5972404853303062e-05, + "loss": 0.2636, + "step": 18532 + }, + { + "epoch": 1.0172338090010977, + "grad_norm": 0.9193452000617981, + "learning_rate": 1.5967522358581138e-05, + "loss": 0.1219, + "step": 18534 + }, + { + "epoch": 1.0173435784851812, + "grad_norm": 1.4661900997161865, + "learning_rate": 1.5962640260068017e-05, + "loss": 0.1624, + "step": 18536 + }, + { + "epoch": 1.0174533479692645, + "grad_norm": 0.7517236471176147, + "learning_rate": 1.595775855797785e-05, + "loss": 0.1485, + "step": 18538 + }, + { + "epoch": 1.017563117453348, + "grad_norm": 0.9918652772903442, + "learning_rate": 1.595287725252478e-05, + "loss": 0.1472, + "step": 18540 + }, + { + "epoch": 1.0176728869374314, + "grad_norm": 0.925354540348053, + "learning_rate": 1.5947996343922915e-05, + "loss": 0.1686, + "step": 18542 + }, + { + "epoch": 1.0177826564215149, + "grad_norm": 1.0344197750091553, + "learning_rate": 1.594311583238636e-05, + "loss": 0.2654, + "step": 18544 + }, + { + "epoch": 1.0178924259055981, + "grad_norm": 1.7729610204696655, + "learning_rate": 1.59382357181292e-05, + "loss": 0.1395, + "step": 18546 + }, + { + "epoch": 1.0180021953896816, + "grad_norm": 0.8475968837738037, + "learning_rate": 1.59333560013655e-05, + "loss": 0.2078, + "step": 18548 + }, + { + "epoch": 1.018111964873765, + "grad_norm": 1.9371198415756226, + "learning_rate": 1.5928476682309304e-05, + "loss": 0.2088, + "step": 18550 + }, + { + "epoch": 1.0182217343578486, + "grad_norm": 1.6842665672302246, + "learning_rate": 1.592359776117465e-05, + "loss": 0.2564, + "step": 18552 + }, + { + "epoch": 1.018331503841932, + "grad_norm": 1.2397781610488892, + "learning_rate": 1.5918719238175544e-05, + "loss": 0.242, + "step": 18554 + }, + { + "epoch": 1.0184412733260153, + "grad_norm": 1.8111262321472168, + "learning_rate": 1.5913841113525992e-05, + "loss": 0.2892, + "step": 18556 + }, + { + "epoch": 1.0185510428100988, + "grad_norm": 0.8796392679214478, + "learning_rate": 1.590896338743995e-05, + "loss": 0.1276, + "step": 18558 + }, + { + "epoch": 1.0186608122941823, + "grad_norm": 1.4405962228775024, + "learning_rate": 1.5904086060131412e-05, + "loss": 0.1982, + "step": 18560 + }, + { + "epoch": 1.0187705817782657, + "grad_norm": 1.9109452962875366, + "learning_rate": 1.5899209131814298e-05, + "loss": 0.2489, + "step": 18562 + }, + { + "epoch": 1.018880351262349, + "grad_norm": 1.072724461555481, + "learning_rate": 1.5894332602702545e-05, + "loss": 0.2578, + "step": 18564 + }, + { + "epoch": 1.0189901207464325, + "grad_norm": 1.5873836278915405, + "learning_rate": 1.5889456473010056e-05, + "loss": 0.2366, + "step": 18566 + }, + { + "epoch": 1.019099890230516, + "grad_norm": 1.4948651790618896, + "learning_rate": 1.5884580742950723e-05, + "loss": 0.1769, + "step": 18568 + }, + { + "epoch": 1.0192096597145994, + "grad_norm": 1.1863948106765747, + "learning_rate": 1.587970541273842e-05, + "loss": 0.1873, + "step": 18570 + }, + { + "epoch": 1.0193194291986827, + "grad_norm": 1.1472939252853394, + "learning_rate": 1.5874830482587e-05, + "loss": 0.1369, + "step": 18572 + }, + { + "epoch": 1.0194291986827662, + "grad_norm": 0.9910283088684082, + "learning_rate": 1.5869955952710308e-05, + "loss": 0.2011, + "step": 18574 + }, + { + "epoch": 1.0195389681668496, + "grad_norm": 1.4798873662948608, + "learning_rate": 1.586508182332216e-05, + "loss": 0.188, + "step": 18576 + }, + { + "epoch": 1.0196487376509331, + "grad_norm": 1.1603822708129883, + "learning_rate": 1.5860208094636358e-05, + "loss": 0.2167, + "step": 18578 + }, + { + "epoch": 1.0197585071350164, + "grad_norm": 1.1925737857818604, + "learning_rate": 1.585533476686669e-05, + "loss": 0.1901, + "step": 18580 + }, + { + "epoch": 1.0198682766190998, + "grad_norm": 1.3877978324890137, + "learning_rate": 1.5850461840226926e-05, + "loss": 0.342, + "step": 18582 + }, + { + "epoch": 1.0199780461031833, + "grad_norm": 1.012703537940979, + "learning_rate": 1.5845589314930813e-05, + "loss": 0.1689, + "step": 18584 + }, + { + "epoch": 1.0200878155872668, + "grad_norm": 1.8154091835021973, + "learning_rate": 1.5840717191192083e-05, + "loss": 0.2661, + "step": 18586 + }, + { + "epoch": 1.02019758507135, + "grad_norm": 1.1291309595108032, + "learning_rate": 1.5835845469224447e-05, + "loss": 0.1907, + "step": 18588 + }, + { + "epoch": 1.0203073545554335, + "grad_norm": 3.301379919052124, + "learning_rate": 1.5830974149241622e-05, + "loss": 0.1859, + "step": 18590 + }, + { + "epoch": 1.020417124039517, + "grad_norm": 1.1739553213119507, + "learning_rate": 1.582610323145727e-05, + "loss": 0.1735, + "step": 18592 + }, + { + "epoch": 1.0205268935236005, + "grad_norm": 0.8561968207359314, + "learning_rate": 1.5821232716085056e-05, + "loss": 0.2735, + "step": 18594 + }, + { + "epoch": 1.020636663007684, + "grad_norm": 1.2548729181289673, + "learning_rate": 1.581636260333863e-05, + "loss": 0.2446, + "step": 18596 + }, + { + "epoch": 1.0207464324917672, + "grad_norm": 1.1556308269500732, + "learning_rate": 1.5811492893431617e-05, + "loss": 0.2234, + "step": 18598 + }, + { + "epoch": 1.0208562019758507, + "grad_norm": 1.3691905736923218, + "learning_rate": 1.5806623586577622e-05, + "loss": 0.2491, + "step": 18600 + }, + { + "epoch": 1.0209659714599342, + "grad_norm": 1.2697391510009766, + "learning_rate": 1.5801754682990247e-05, + "loss": 0.3316, + "step": 18602 + }, + { + "epoch": 1.0210757409440177, + "grad_norm": 0.783691942691803, + "learning_rate": 1.5796886182883053e-05, + "loss": 0.1127, + "step": 18604 + }, + { + "epoch": 1.021185510428101, + "grad_norm": 1.4168139696121216, + "learning_rate": 1.5792018086469606e-05, + "loss": 0.19, + "step": 18606 + }, + { + "epoch": 1.0212952799121844, + "grad_norm": 1.3858065605163574, + "learning_rate": 1.578715039396344e-05, + "loss": 0.1766, + "step": 18608 + }, + { + "epoch": 1.0214050493962679, + "grad_norm": 1.566335916519165, + "learning_rate": 1.5782283105578076e-05, + "loss": 0.1983, + "step": 18610 + }, + { + "epoch": 1.0215148188803513, + "grad_norm": 0.9685210585594177, + "learning_rate": 1.577741622152702e-05, + "loss": 0.1355, + "step": 18612 + }, + { + "epoch": 1.0216245883644346, + "grad_norm": 1.767595648765564, + "learning_rate": 1.5772549742023758e-05, + "loss": 0.2115, + "step": 18614 + }, + { + "epoch": 1.021734357848518, + "grad_norm": 1.163926362991333, + "learning_rate": 1.5767683667281746e-05, + "loss": 0.2643, + "step": 18616 + }, + { + "epoch": 1.0218441273326015, + "grad_norm": 1.1540781259536743, + "learning_rate": 1.5762817997514443e-05, + "loss": 0.205, + "step": 18618 + }, + { + "epoch": 1.021953896816685, + "grad_norm": 0.9055280685424805, + "learning_rate": 1.5757952732935288e-05, + "loss": 0.1531, + "step": 18620 + }, + { + "epoch": 1.0220636663007683, + "grad_norm": 0.9269468188285828, + "learning_rate": 1.575308787375769e-05, + "loss": 0.2043, + "step": 18622 + }, + { + "epoch": 1.0221734357848518, + "grad_norm": 1.3470544815063477, + "learning_rate": 1.574822342019504e-05, + "loss": 0.2303, + "step": 18624 + }, + { + "epoch": 1.0222832052689352, + "grad_norm": 1.1711581945419312, + "learning_rate": 1.5743359372460726e-05, + "loss": 0.1767, + "step": 18626 + }, + { + "epoch": 1.0223929747530187, + "grad_norm": 1.3699617385864258, + "learning_rate": 1.5738495730768105e-05, + "loss": 0.1502, + "step": 18628 + }, + { + "epoch": 1.0225027442371022, + "grad_norm": 0.8782106041908264, + "learning_rate": 1.5733632495330514e-05, + "loss": 0.2122, + "step": 18630 + }, + { + "epoch": 1.0226125137211854, + "grad_norm": 1.831921100616455, + "learning_rate": 1.5728769666361287e-05, + "loss": 0.1466, + "step": 18632 + }, + { + "epoch": 1.022722283205269, + "grad_norm": 1.5588334798812866, + "learning_rate": 1.572390724407373e-05, + "loss": 0.2591, + "step": 18634 + }, + { + "epoch": 1.0228320526893524, + "grad_norm": 1.174238681793213, + "learning_rate": 1.5719045228681125e-05, + "loss": 0.2059, + "step": 18636 + }, + { + "epoch": 1.0229418221734359, + "grad_norm": 1.3621797561645508, + "learning_rate": 1.5714183620396756e-05, + "loss": 0.2439, + "step": 18638 + }, + { + "epoch": 1.0230515916575191, + "grad_norm": 2.7258565425872803, + "learning_rate": 1.570932241943387e-05, + "loss": 0.3612, + "step": 18640 + }, + { + "epoch": 1.0231613611416026, + "grad_norm": 1.9127004146575928, + "learning_rate": 1.5704461626005702e-05, + "loss": 0.1926, + "step": 18642 + }, + { + "epoch": 1.023271130625686, + "grad_norm": 1.204809546470642, + "learning_rate": 1.5699601240325474e-05, + "loss": 0.1298, + "step": 18644 + }, + { + "epoch": 1.0233809001097696, + "grad_norm": 1.1291643381118774, + "learning_rate": 1.5694741262606376e-05, + "loss": 0.1533, + "step": 18646 + }, + { + "epoch": 1.0234906695938528, + "grad_norm": 1.3945257663726807, + "learning_rate": 1.5689881693061607e-05, + "loss": 0.3063, + "step": 18648 + }, + { + "epoch": 1.0236004390779363, + "grad_norm": 1.8086295127868652, + "learning_rate": 1.568502253190432e-05, + "loss": 0.2051, + "step": 18650 + }, + { + "epoch": 1.0237102085620198, + "grad_norm": 1.2397040128707886, + "learning_rate": 1.5680163779347667e-05, + "loss": 0.1661, + "step": 18652 + }, + { + "epoch": 1.0238199780461033, + "grad_norm": 1.0780800580978394, + "learning_rate": 1.5675305435604775e-05, + "loss": 0.1495, + "step": 18654 + }, + { + "epoch": 1.0239297475301865, + "grad_norm": 1.1416258811950684, + "learning_rate": 1.5670447500888756e-05, + "loss": 0.1804, + "step": 18656 + }, + { + "epoch": 1.02403951701427, + "grad_norm": 1.7912153005599976, + "learning_rate": 1.5665589975412708e-05, + "loss": 0.2685, + "step": 18658 + }, + { + "epoch": 1.0241492864983535, + "grad_norm": 1.53144371509552, + "learning_rate": 1.5660732859389686e-05, + "loss": 0.1715, + "step": 18660 + }, + { + "epoch": 1.024259055982437, + "grad_norm": 1.367138147354126, + "learning_rate": 1.5655876153032773e-05, + "loss": 0.2065, + "step": 18662 + }, + { + "epoch": 1.0243688254665204, + "grad_norm": 1.5434213876724243, + "learning_rate": 1.5651019856554995e-05, + "loss": 0.2555, + "step": 18664 + }, + { + "epoch": 1.0244785949506037, + "grad_norm": 1.9962388277053833, + "learning_rate": 1.5646163970169364e-05, + "loss": 0.2377, + "step": 18666 + }, + { + "epoch": 1.0245883644346871, + "grad_norm": 1.652980923652649, + "learning_rate": 1.56413084940889e-05, + "loss": 0.2494, + "step": 18668 + }, + { + "epoch": 1.0246981339187706, + "grad_norm": 1.2790255546569824, + "learning_rate": 1.5636453428526582e-05, + "loss": 0.2856, + "step": 18670 + }, + { + "epoch": 1.024807903402854, + "grad_norm": 0.9241877794265747, + "learning_rate": 1.563159877369537e-05, + "loss": 0.1485, + "step": 18672 + }, + { + "epoch": 1.0249176728869374, + "grad_norm": 1.140663981437683, + "learning_rate": 1.5626744529808223e-05, + "loss": 0.2174, + "step": 18674 + }, + { + "epoch": 1.0250274423710208, + "grad_norm": 0.8858147263526917, + "learning_rate": 1.562189069707807e-05, + "loss": 0.207, + "step": 18676 + }, + { + "epoch": 1.0251372118551043, + "grad_norm": 1.2950942516326904, + "learning_rate": 1.561703727571781e-05, + "loss": 0.1897, + "step": 18678 + }, + { + "epoch": 1.0252469813391878, + "grad_norm": 1.3870232105255127, + "learning_rate": 1.5612184265940348e-05, + "loss": 0.1692, + "step": 18680 + }, + { + "epoch": 1.025356750823271, + "grad_norm": 1.3812471628189087, + "learning_rate": 1.5607331667958575e-05, + "loss": 0.2164, + "step": 18682 + }, + { + "epoch": 1.0254665203073545, + "grad_norm": 1.4797942638397217, + "learning_rate": 1.5602479481985333e-05, + "loss": 0.1708, + "step": 18684 + }, + { + "epoch": 1.025576289791438, + "grad_norm": 1.062406301498413, + "learning_rate": 1.5597627708233465e-05, + "loss": 0.2103, + "step": 18686 + }, + { + "epoch": 1.0256860592755215, + "grad_norm": 1.311360478401184, + "learning_rate": 1.5592776346915796e-05, + "loss": 0.2157, + "step": 18688 + }, + { + "epoch": 1.0257958287596047, + "grad_norm": 1.2270662784576416, + "learning_rate": 1.558792539824513e-05, + "loss": 0.2118, + "step": 18690 + }, + { + "epoch": 1.0259055982436882, + "grad_norm": 1.3024550676345825, + "learning_rate": 1.5583074862434255e-05, + "loss": 0.2832, + "step": 18692 + }, + { + "epoch": 1.0260153677277717, + "grad_norm": 1.0074235200881958, + "learning_rate": 1.5578224739695938e-05, + "loss": 0.1654, + "step": 18694 + }, + { + "epoch": 1.0261251372118552, + "grad_norm": 1.0491702556610107, + "learning_rate": 1.5573375030242922e-05, + "loss": 0.2223, + "step": 18696 + }, + { + "epoch": 1.0262349066959384, + "grad_norm": 1.5629993677139282, + "learning_rate": 1.5568525734287953e-05, + "loss": 0.2303, + "step": 18698 + }, + { + "epoch": 1.026344676180022, + "grad_norm": 0.736964762210846, + "learning_rate": 1.556367685204374e-05, + "loss": 0.1873, + "step": 18700 + }, + { + "epoch": 1.0264544456641054, + "grad_norm": 0.713478147983551, + "learning_rate": 1.5558828383722968e-05, + "loss": 0.1542, + "step": 18702 + }, + { + "epoch": 1.0265642151481889, + "grad_norm": 1.0621334314346313, + "learning_rate": 1.5553980329538326e-05, + "loss": 0.1358, + "step": 18704 + }, + { + "epoch": 1.0266739846322723, + "grad_norm": 1.2193523645401, + "learning_rate": 1.5549132689702477e-05, + "loss": 0.1841, + "step": 18706 + }, + { + "epoch": 1.0267837541163556, + "grad_norm": 1.3588300943374634, + "learning_rate": 1.5544285464428045e-05, + "loss": 0.2424, + "step": 18708 + }, + { + "epoch": 1.026893523600439, + "grad_norm": 0.7830403447151184, + "learning_rate": 1.5539438653927663e-05, + "loss": 0.1822, + "step": 18710 + }, + { + "epoch": 1.0270032930845225, + "grad_norm": 0.8240053653717041, + "learning_rate": 1.5534592258413943e-05, + "loss": 0.1347, + "step": 18712 + }, + { + "epoch": 1.027113062568606, + "grad_norm": 0.6694943904876709, + "learning_rate": 1.5529746278099467e-05, + "loss": 0.1506, + "step": 18714 + }, + { + "epoch": 1.0272228320526893, + "grad_norm": 1.2791391611099243, + "learning_rate": 1.55249007131968e-05, + "loss": 0.1846, + "step": 18716 + }, + { + "epoch": 1.0273326015367727, + "grad_norm": 1.530696153640747, + "learning_rate": 1.5520055563918494e-05, + "loss": 0.1229, + "step": 18718 + }, + { + "epoch": 1.0274423710208562, + "grad_norm": 1.3760294914245605, + "learning_rate": 1.5515210830477083e-05, + "loss": 0.2199, + "step": 18720 + }, + { + "epoch": 1.0275521405049397, + "grad_norm": 1.1432616710662842, + "learning_rate": 1.5510366513085075e-05, + "loss": 0.2434, + "step": 18722 + }, + { + "epoch": 1.027661909989023, + "grad_norm": 1.487998366355896, + "learning_rate": 1.5505522611954975e-05, + "loss": 0.2128, + "step": 18724 + }, + { + "epoch": 1.0277716794731064, + "grad_norm": 0.9587336778640747, + "learning_rate": 1.5500679127299255e-05, + "loss": 0.17, + "step": 18726 + }, + { + "epoch": 1.02788144895719, + "grad_norm": 1.489210844039917, + "learning_rate": 1.549583605933037e-05, + "loss": 0.2823, + "step": 18728 + }, + { + "epoch": 1.0279912184412734, + "grad_norm": 0.9656094908714294, + "learning_rate": 1.549099340826077e-05, + "loss": 0.1183, + "step": 18730 + }, + { + "epoch": 1.0281009879253566, + "grad_norm": 1.1473331451416016, + "learning_rate": 1.548615117430286e-05, + "loss": 0.2189, + "step": 18732 + }, + { + "epoch": 1.0282107574094401, + "grad_norm": 1.3694690465927124, + "learning_rate": 1.5481309357669066e-05, + "loss": 0.201, + "step": 18734 + }, + { + "epoch": 1.0283205268935236, + "grad_norm": 1.492537260055542, + "learning_rate": 1.5476467958571767e-05, + "loss": 0.2243, + "step": 18736 + }, + { + "epoch": 1.028430296377607, + "grad_norm": 1.2400290966033936, + "learning_rate": 1.5471626977223318e-05, + "loss": 0.1259, + "step": 18738 + }, + { + "epoch": 1.0285400658616906, + "grad_norm": 1.1447256803512573, + "learning_rate": 1.5466786413836077e-05, + "loss": 0.2202, + "step": 18740 + }, + { + "epoch": 1.0286498353457738, + "grad_norm": 1.0469156503677368, + "learning_rate": 1.546194626862238e-05, + "loss": 0.2502, + "step": 18742 + }, + { + "epoch": 1.0287596048298573, + "grad_norm": 1.6376768350601196, + "learning_rate": 1.5457106541794543e-05, + "loss": 0.2984, + "step": 18744 + }, + { + "epoch": 1.0288693743139408, + "grad_norm": 1.2805010080337524, + "learning_rate": 1.5452267233564837e-05, + "loss": 0.2535, + "step": 18746 + }, + { + "epoch": 1.0289791437980242, + "grad_norm": 2.1179919242858887, + "learning_rate": 1.5447428344145563e-05, + "loss": 0.2606, + "step": 18748 + }, + { + "epoch": 1.0290889132821075, + "grad_norm": 0.9803415536880493, + "learning_rate": 1.5442589873748976e-05, + "loss": 0.15, + "step": 18750 + }, + { + "epoch": 1.029198682766191, + "grad_norm": 1.3369029760360718, + "learning_rate": 1.5437751822587294e-05, + "loss": 0.2239, + "step": 18752 + }, + { + "epoch": 1.0293084522502745, + "grad_norm": 1.0959150791168213, + "learning_rate": 1.5432914190872757e-05, + "loss": 0.2246, + "step": 18754 + }, + { + "epoch": 1.029418221734358, + "grad_norm": 1.0534878969192505, + "learning_rate": 1.5428076978817562e-05, + "loss": 0.1946, + "step": 18756 + }, + { + "epoch": 1.0295279912184412, + "grad_norm": 0.9711368680000305, + "learning_rate": 1.5423240186633892e-05, + "loss": 0.1259, + "step": 18758 + }, + { + "epoch": 1.0296377607025247, + "grad_norm": 1.2145874500274658, + "learning_rate": 1.5418403814533912e-05, + "loss": 0.2115, + "step": 18760 + }, + { + "epoch": 1.0297475301866081, + "grad_norm": 1.2992503643035889, + "learning_rate": 1.5413567862729776e-05, + "loss": 0.1645, + "step": 18762 + }, + { + "epoch": 1.0298572996706916, + "grad_norm": 2.043334484100342, + "learning_rate": 1.5408732331433595e-05, + "loss": 0.1794, + "step": 18764 + }, + { + "epoch": 1.0299670691547749, + "grad_norm": 3.8121700286865234, + "learning_rate": 1.54038972208575e-05, + "loss": 0.207, + "step": 18766 + }, + { + "epoch": 1.0300768386388583, + "grad_norm": 0.9261519312858582, + "learning_rate": 1.539906253121357e-05, + "loss": 0.1343, + "step": 18768 + }, + { + "epoch": 1.0301866081229418, + "grad_norm": 0.9389403462409973, + "learning_rate": 1.5394228262713874e-05, + "loss": 0.1806, + "step": 18770 + }, + { + "epoch": 1.0302963776070253, + "grad_norm": 1.4922852516174316, + "learning_rate": 1.538939441557048e-05, + "loss": 0.2409, + "step": 18772 + }, + { + "epoch": 1.0304061470911088, + "grad_norm": 1.358953833580017, + "learning_rate": 1.538456098999542e-05, + "loss": 0.1426, + "step": 18774 + }, + { + "epoch": 1.030515916575192, + "grad_norm": 2.0114705562591553, + "learning_rate": 1.5379727986200716e-05, + "loss": 0.1899, + "step": 18776 + }, + { + "epoch": 1.0306256860592755, + "grad_norm": 1.0471527576446533, + "learning_rate": 1.5374895404398354e-05, + "loss": 0.133, + "step": 18778 + }, + { + "epoch": 1.030735455543359, + "grad_norm": 1.5378061532974243, + "learning_rate": 1.5370063244800327e-05, + "loss": 0.217, + "step": 18780 + }, + { + "epoch": 1.0308452250274425, + "grad_norm": 1.0547120571136475, + "learning_rate": 1.536523150761859e-05, + "loss": 0.1798, + "step": 18782 + }, + { + "epoch": 1.0309549945115257, + "grad_norm": 1.8888003826141357, + "learning_rate": 1.536040019306509e-05, + "loss": 0.2796, + "step": 18784 + }, + { + "epoch": 1.0310647639956092, + "grad_norm": 1.4623876810073853, + "learning_rate": 1.5355569301351752e-05, + "loss": 0.3557, + "step": 18786 + }, + { + "epoch": 1.0311745334796927, + "grad_norm": 1.4439364671707153, + "learning_rate": 1.535073883269048e-05, + "loss": 0.2151, + "step": 18788 + }, + { + "epoch": 1.0312843029637762, + "grad_norm": 1.2782882452011108, + "learning_rate": 1.5345908787293164e-05, + "loss": 0.1276, + "step": 18790 + }, + { + "epoch": 1.0313940724478594, + "grad_norm": 1.7488996982574463, + "learning_rate": 1.534107916537168e-05, + "loss": 0.1971, + "step": 18792 + }, + { + "epoch": 1.0315038419319429, + "grad_norm": 1.023969054222107, + "learning_rate": 1.533624996713786e-05, + "loss": 0.2456, + "step": 18794 + }, + { + "epoch": 1.0316136114160264, + "grad_norm": 1.3413949012756348, + "learning_rate": 1.5331421192803565e-05, + "loss": 0.1908, + "step": 18796 + }, + { + "epoch": 1.0317233809001098, + "grad_norm": 1.050360918045044, + "learning_rate": 1.5326592842580584e-05, + "loss": 0.2062, + "step": 18798 + }, + { + "epoch": 1.031833150384193, + "grad_norm": 1.1930474042892456, + "learning_rate": 1.532176491668071e-05, + "loss": 0.2488, + "step": 18800 + }, + { + "epoch": 1.0319429198682766, + "grad_norm": 1.3036693334579468, + "learning_rate": 1.531693741531574e-05, + "loss": 0.2437, + "step": 18802 + }, + { + "epoch": 1.03205268935236, + "grad_norm": 1.8632209300994873, + "learning_rate": 1.5312110338697426e-05, + "loss": 0.2439, + "step": 18804 + }, + { + "epoch": 1.0321624588364435, + "grad_norm": 1.1635922193527222, + "learning_rate": 1.5307283687037502e-05, + "loss": 0.2224, + "step": 18806 + }, + { + "epoch": 1.0322722283205268, + "grad_norm": 1.6880377531051636, + "learning_rate": 1.5302457460547687e-05, + "loss": 0.307, + "step": 18808 + }, + { + "epoch": 1.0323819978046103, + "grad_norm": 1.230566143989563, + "learning_rate": 1.529763165943969e-05, + "loss": 0.2332, + "step": 18810 + }, + { + "epoch": 1.0324917672886937, + "grad_norm": 0.9243954420089722, + "learning_rate": 1.5292806283925193e-05, + "loss": 0.1927, + "step": 18812 + }, + { + "epoch": 1.0326015367727772, + "grad_norm": 1.1361523866653442, + "learning_rate": 1.528798133421585e-05, + "loss": 0.1875, + "step": 18814 + }, + { + "epoch": 1.0327113062568607, + "grad_norm": 1.0886445045471191, + "learning_rate": 1.528315681052332e-05, + "loss": 0.122, + "step": 18816 + }, + { + "epoch": 1.032821075740944, + "grad_norm": 1.210700273513794, + "learning_rate": 1.5278332713059227e-05, + "loss": 0.367, + "step": 18818 + }, + { + "epoch": 1.0329308452250274, + "grad_norm": 1.0418471097946167, + "learning_rate": 1.5273509042035172e-05, + "loss": 0.2415, + "step": 18820 + }, + { + "epoch": 1.033040614709111, + "grad_norm": 1.4404555559158325, + "learning_rate": 1.526868579766276e-05, + "loss": 0.1372, + "step": 18822 + }, + { + "epoch": 1.0331503841931944, + "grad_norm": 1.2961061000823975, + "learning_rate": 1.526386298015354e-05, + "loss": 0.2354, + "step": 18824 + }, + { + "epoch": 1.0332601536772776, + "grad_norm": 1.673186182975769, + "learning_rate": 1.5259040589719087e-05, + "loss": 0.2067, + "step": 18826 + }, + { + "epoch": 1.0333699231613611, + "grad_norm": 1.546862244606018, + "learning_rate": 1.5254218626570926e-05, + "loss": 0.2427, + "step": 18828 + }, + { + "epoch": 1.0334796926454446, + "grad_norm": 1.9435303211212158, + "learning_rate": 1.5249397090920559e-05, + "loss": 0.2176, + "step": 18830 + }, + { + "epoch": 1.033589462129528, + "grad_norm": 1.0301843881607056, + "learning_rate": 1.5244575982979497e-05, + "loss": 0.1571, + "step": 18832 + }, + { + "epoch": 1.0336992316136113, + "grad_norm": 1.0324190855026245, + "learning_rate": 1.5239755302959224e-05, + "loss": 0.2077, + "step": 18834 + }, + { + "epoch": 1.0338090010976948, + "grad_norm": 0.9637088775634766, + "learning_rate": 1.5234935051071192e-05, + "loss": 0.1872, + "step": 18836 + }, + { + "epoch": 1.0339187705817783, + "grad_norm": 0.7328130602836609, + "learning_rate": 1.5230115227526829e-05, + "loss": 0.1611, + "step": 18838 + }, + { + "epoch": 1.0340285400658618, + "grad_norm": 1.1201692819595337, + "learning_rate": 1.5225295832537574e-05, + "loss": 0.2088, + "step": 18840 + }, + { + "epoch": 1.034138309549945, + "grad_norm": 1.4190620183944702, + "learning_rate": 1.5220476866314814e-05, + "loss": 0.2408, + "step": 18842 + }, + { + "epoch": 1.0342480790340285, + "grad_norm": 0.7031670212745667, + "learning_rate": 1.521565832906994e-05, + "loss": 0.1299, + "step": 18844 + }, + { + "epoch": 1.034357848518112, + "grad_norm": 0.9527954459190369, + "learning_rate": 1.5210840221014323e-05, + "loss": 0.1581, + "step": 18846 + }, + { + "epoch": 1.0344676180021954, + "grad_norm": 1.381272792816162, + "learning_rate": 1.5206022542359297e-05, + "loss": 0.2243, + "step": 18848 + }, + { + "epoch": 1.034577387486279, + "grad_norm": 1.333363652229309, + "learning_rate": 1.520120529331619e-05, + "loss": 0.1971, + "step": 18850 + }, + { + "epoch": 1.0346871569703622, + "grad_norm": 1.4648888111114502, + "learning_rate": 1.5196388474096319e-05, + "loss": 0.2253, + "step": 18852 + }, + { + "epoch": 1.0347969264544457, + "grad_norm": 1.1095582246780396, + "learning_rate": 1.519157208491097e-05, + "loss": 0.2324, + "step": 18854 + }, + { + "epoch": 1.0349066959385291, + "grad_norm": 1.8494846820831299, + "learning_rate": 1.5186756125971407e-05, + "loss": 0.3185, + "step": 18856 + }, + { + "epoch": 1.0350164654226126, + "grad_norm": 1.098053216934204, + "learning_rate": 1.5181940597488891e-05, + "loss": 0.0985, + "step": 18858 + }, + { + "epoch": 1.0351262349066959, + "grad_norm": 0.9316503405570984, + "learning_rate": 1.5177125499674638e-05, + "loss": 0.1945, + "step": 18860 + }, + { + "epoch": 1.0352360043907793, + "grad_norm": 1.6199764013290405, + "learning_rate": 1.5172310832739889e-05, + "loss": 0.2577, + "step": 18862 + }, + { + "epoch": 1.0353457738748628, + "grad_norm": 0.971763551235199, + "learning_rate": 1.5167496596895814e-05, + "loss": 0.1695, + "step": 18864 + }, + { + "epoch": 1.0354555433589463, + "grad_norm": 1.08549165725708, + "learning_rate": 1.5162682792353608e-05, + "loss": 0.1544, + "step": 18866 + }, + { + "epoch": 1.0355653128430296, + "grad_norm": 1.8097424507141113, + "learning_rate": 1.515786941932441e-05, + "loss": 0.2827, + "step": 18868 + }, + { + "epoch": 1.035675082327113, + "grad_norm": 1.4483071565628052, + "learning_rate": 1.5153056478019378e-05, + "loss": 0.2106, + "step": 18870 + }, + { + "epoch": 1.0357848518111965, + "grad_norm": 1.0451719760894775, + "learning_rate": 1.5148243968649617e-05, + "loss": 0.1278, + "step": 18872 + }, + { + "epoch": 1.03589462129528, + "grad_norm": 0.783923327922821, + "learning_rate": 1.5143431891426225e-05, + "loss": 0.1673, + "step": 18874 + }, + { + "epoch": 1.0360043907793632, + "grad_norm": 2.398899555206299, + "learning_rate": 1.5138620246560296e-05, + "loss": 0.2135, + "step": 18876 + }, + { + "epoch": 1.0361141602634467, + "grad_norm": 1.7194819450378418, + "learning_rate": 1.5133809034262886e-05, + "loss": 0.1882, + "step": 18878 + }, + { + "epoch": 1.0362239297475302, + "grad_norm": 2.143386125564575, + "learning_rate": 1.512899825474503e-05, + "loss": 0.2407, + "step": 18880 + }, + { + "epoch": 1.0363336992316137, + "grad_norm": 0.8366804122924805, + "learning_rate": 1.5124187908217769e-05, + "loss": 0.2098, + "step": 18882 + }, + { + "epoch": 1.0364434687156971, + "grad_norm": 1.176629662513733, + "learning_rate": 1.5119377994892094e-05, + "loss": 0.1599, + "step": 18884 + }, + { + "epoch": 1.0365532381997804, + "grad_norm": 0.8802027702331543, + "learning_rate": 1.5114568514978996e-05, + "loss": 0.1689, + "step": 18886 + }, + { + "epoch": 1.0366630076838639, + "grad_norm": 1.160341501235962, + "learning_rate": 1.5109759468689449e-05, + "loss": 0.228, + "step": 18888 + }, + { + "epoch": 1.0367727771679474, + "grad_norm": 1.0060101747512817, + "learning_rate": 1.5104950856234395e-05, + "loss": 0.1674, + "step": 18890 + }, + { + "epoch": 1.0368825466520308, + "grad_norm": 1.1038589477539062, + "learning_rate": 1.5100142677824753e-05, + "loss": 0.1654, + "step": 18892 + }, + { + "epoch": 1.036992316136114, + "grad_norm": 2.2269351482391357, + "learning_rate": 1.509533493367145e-05, + "loss": 0.2556, + "step": 18894 + }, + { + "epoch": 1.0371020856201976, + "grad_norm": 1.697053074836731, + "learning_rate": 1.5090527623985379e-05, + "loss": 0.215, + "step": 18896 + }, + { + "epoch": 1.037211855104281, + "grad_norm": 1.1793931722640991, + "learning_rate": 1.5085720748977403e-05, + "loss": 0.3317, + "step": 18898 + }, + { + "epoch": 1.0373216245883645, + "grad_norm": 1.225571632385254, + "learning_rate": 1.5080914308858374e-05, + "loss": 0.1593, + "step": 18900 + }, + { + "epoch": 1.0374313940724478, + "grad_norm": 1.709820032119751, + "learning_rate": 1.5076108303839132e-05, + "loss": 0.2309, + "step": 18902 + }, + { + "epoch": 1.0375411635565313, + "grad_norm": 1.8456737995147705, + "learning_rate": 1.5071302734130489e-05, + "loss": 0.1927, + "step": 18904 + }, + { + "epoch": 1.0376509330406147, + "grad_norm": 1.7130674123764038, + "learning_rate": 1.5066497599943236e-05, + "loss": 0.3029, + "step": 18906 + }, + { + "epoch": 1.0377607025246982, + "grad_norm": 2.067728042602539, + "learning_rate": 1.5061692901488162e-05, + "loss": 0.3213, + "step": 18908 + }, + { + "epoch": 1.0378704720087815, + "grad_norm": 2.419772148132324, + "learning_rate": 1.5056888638976011e-05, + "loss": 0.27, + "step": 18910 + }, + { + "epoch": 1.037980241492865, + "grad_norm": 0.9748495817184448, + "learning_rate": 1.5052084812617533e-05, + "loss": 0.1546, + "step": 18912 + }, + { + "epoch": 1.0380900109769484, + "grad_norm": 1.287737488746643, + "learning_rate": 1.504728142262344e-05, + "loss": 0.2171, + "step": 18914 + }, + { + "epoch": 1.038199780461032, + "grad_norm": 1.2399070262908936, + "learning_rate": 1.5042478469204435e-05, + "loss": 0.2414, + "step": 18916 + }, + { + "epoch": 1.0383095499451152, + "grad_norm": 1.42728590965271, + "learning_rate": 1.5037675952571201e-05, + "loss": 0.1932, + "step": 18918 + }, + { + "epoch": 1.0384193194291986, + "grad_norm": 0.9406166672706604, + "learning_rate": 1.5032873872934394e-05, + "loss": 0.1946, + "step": 18920 + }, + { + "epoch": 1.038529088913282, + "grad_norm": 2.2355880737304688, + "learning_rate": 1.5028072230504656e-05, + "loss": 0.1582, + "step": 18922 + }, + { + "epoch": 1.0386388583973656, + "grad_norm": 1.5805574655532837, + "learning_rate": 1.5023271025492618e-05, + "loss": 0.2293, + "step": 18924 + }, + { + "epoch": 1.038748627881449, + "grad_norm": 1.3262178897857666, + "learning_rate": 1.5018470258108886e-05, + "loss": 0.233, + "step": 18926 + }, + { + "epoch": 1.0388583973655323, + "grad_norm": 0.898697555065155, + "learning_rate": 1.501366992856404e-05, + "loss": 0.3596, + "step": 18928 + }, + { + "epoch": 1.0389681668496158, + "grad_norm": 0.8873559832572937, + "learning_rate": 1.5008870037068644e-05, + "loss": 0.264, + "step": 18930 + }, + { + "epoch": 1.0390779363336993, + "grad_norm": 0.9363778233528137, + "learning_rate": 1.5004070583833251e-05, + "loss": 0.1418, + "step": 18932 + }, + { + "epoch": 1.0391877058177827, + "grad_norm": 1.3494391441345215, + "learning_rate": 1.4999271569068385e-05, + "loss": 0.1996, + "step": 18934 + }, + { + "epoch": 1.039297475301866, + "grad_norm": 1.182961344718933, + "learning_rate": 1.499447299298455e-05, + "loss": 0.2586, + "step": 18936 + }, + { + "epoch": 1.0394072447859495, + "grad_norm": 3.0318572521209717, + "learning_rate": 1.4989674855792246e-05, + "loss": 0.2054, + "step": 18938 + }, + { + "epoch": 1.039517014270033, + "grad_norm": 1.0718437433242798, + "learning_rate": 1.4984877157701932e-05, + "loss": 0.2089, + "step": 18940 + }, + { + "epoch": 1.0396267837541164, + "grad_norm": 0.8905099630355835, + "learning_rate": 1.498007989892406e-05, + "loss": 0.1972, + "step": 18942 + }, + { + "epoch": 1.0397365532381997, + "grad_norm": 1.0135549306869507, + "learning_rate": 1.4975283079669072e-05, + "loss": 0.1829, + "step": 18944 + }, + { + "epoch": 1.0398463227222832, + "grad_norm": 1.188084363937378, + "learning_rate": 1.4970486700147372e-05, + "loss": 0.1398, + "step": 18946 + }, + { + "epoch": 1.0399560922063666, + "grad_norm": 1.9438691139221191, + "learning_rate": 1.4965690760569346e-05, + "loss": 0.3485, + "step": 18948 + }, + { + "epoch": 1.0400658616904501, + "grad_norm": 0.925940215587616, + "learning_rate": 1.496089526114538e-05, + "loss": 0.1489, + "step": 18950 + }, + { + "epoch": 1.0401756311745334, + "grad_norm": 2.0381815433502197, + "learning_rate": 1.4956100202085809e-05, + "loss": 0.1482, + "step": 18952 + }, + { + "epoch": 1.0402854006586169, + "grad_norm": 0.94245845079422, + "learning_rate": 1.4951305583601e-05, + "loss": 0.2006, + "step": 18954 + }, + { + "epoch": 1.0403951701427003, + "grad_norm": 1.184180736541748, + "learning_rate": 1.4946511405901236e-05, + "loss": 0.1749, + "step": 18956 + }, + { + "epoch": 1.0405049396267838, + "grad_norm": 0.6850469708442688, + "learning_rate": 1.4941717669196836e-05, + "loss": 0.156, + "step": 18958 + }, + { + "epoch": 1.0406147091108673, + "grad_norm": 1.5784975290298462, + "learning_rate": 1.4936924373698066e-05, + "loss": 0.2, + "step": 18960 + }, + { + "epoch": 1.0407244785949505, + "grad_norm": 1.7555986642837524, + "learning_rate": 1.493213151961519e-05, + "loss": 0.2083, + "step": 18962 + }, + { + "epoch": 1.040834248079034, + "grad_norm": 1.3796709775924683, + "learning_rate": 1.4927339107158437e-05, + "loss": 0.1249, + "step": 18964 + }, + { + "epoch": 1.0409440175631175, + "grad_norm": 0.9882693886756897, + "learning_rate": 1.4922547136538028e-05, + "loss": 0.1455, + "step": 18966 + }, + { + "epoch": 1.041053787047201, + "grad_norm": 1.029517650604248, + "learning_rate": 1.4917755607964168e-05, + "loss": 0.2454, + "step": 18968 + }, + { + "epoch": 1.0411635565312842, + "grad_norm": 1.472739577293396, + "learning_rate": 1.4912964521647035e-05, + "loss": 0.2627, + "step": 18970 + }, + { + "epoch": 1.0412733260153677, + "grad_norm": 1.4566452503204346, + "learning_rate": 1.4908173877796783e-05, + "loss": 0.1607, + "step": 18972 + }, + { + "epoch": 1.0413830954994512, + "grad_norm": 1.139656901359558, + "learning_rate": 1.4903383676623564e-05, + "loss": 0.2918, + "step": 18974 + }, + { + "epoch": 1.0414928649835347, + "grad_norm": 1.3934569358825684, + "learning_rate": 1.4898593918337494e-05, + "loss": 0.2127, + "step": 18976 + }, + { + "epoch": 1.041602634467618, + "grad_norm": 1.1783039569854736, + "learning_rate": 1.489380460314867e-05, + "loss": 0.1668, + "step": 18978 + }, + { + "epoch": 1.0417124039517014, + "grad_norm": 1.432410717010498, + "learning_rate": 1.4889015731267186e-05, + "loss": 0.2065, + "step": 18980 + }, + { + "epoch": 1.0418221734357849, + "grad_norm": 1.473659634590149, + "learning_rate": 1.4884227302903086e-05, + "loss": 0.2278, + "step": 18982 + }, + { + "epoch": 1.0419319429198683, + "grad_norm": 1.1278175115585327, + "learning_rate": 1.4879439318266442e-05, + "loss": 0.2456, + "step": 18984 + }, + { + "epoch": 1.0420417124039516, + "grad_norm": 1.2909479141235352, + "learning_rate": 1.4874651777567256e-05, + "loss": 0.2312, + "step": 18986 + }, + { + "epoch": 1.042151481888035, + "grad_norm": 1.4834388494491577, + "learning_rate": 1.486986468101555e-05, + "loss": 0.2112, + "step": 18988 + }, + { + "epoch": 1.0422612513721186, + "grad_norm": 1.117112159729004, + "learning_rate": 1.4865078028821296e-05, + "loss": 0.2148, + "step": 18990 + }, + { + "epoch": 1.042371020856202, + "grad_norm": 2.784301519393921, + "learning_rate": 1.486029182119446e-05, + "loss": 0.2403, + "step": 18992 + }, + { + "epoch": 1.0424807903402855, + "grad_norm": 1.8821134567260742, + "learning_rate": 1.4855506058345003e-05, + "loss": 0.2037, + "step": 18994 + }, + { + "epoch": 1.0425905598243688, + "grad_norm": 1.3147950172424316, + "learning_rate": 1.485072074048284e-05, + "loss": 0.2031, + "step": 18996 + }, + { + "epoch": 1.0427003293084522, + "grad_norm": 1.3255906105041504, + "learning_rate": 1.4845935867817876e-05, + "loss": 0.2631, + "step": 18998 + }, + { + "epoch": 1.0428100987925357, + "grad_norm": 1.0306193828582764, + "learning_rate": 1.4841151440560009e-05, + "loss": 0.2356, + "step": 19000 + }, + { + "epoch": 1.0429198682766192, + "grad_norm": 1.2008973360061646, + "learning_rate": 1.4836367458919099e-05, + "loss": 0.2448, + "step": 19002 + }, + { + "epoch": 1.0430296377607025, + "grad_norm": 1.5188822746276855, + "learning_rate": 1.4831583923104999e-05, + "loss": 0.2196, + "step": 19004 + }, + { + "epoch": 1.043139407244786, + "grad_norm": 1.3426367044448853, + "learning_rate": 1.482680083332754e-05, + "loss": 0.2206, + "step": 19006 + }, + { + "epoch": 1.0432491767288694, + "grad_norm": 1.3197888135910034, + "learning_rate": 1.4822018189796525e-05, + "loss": 0.3007, + "step": 19008 + }, + { + "epoch": 1.0433589462129529, + "grad_norm": 1.1142935752868652, + "learning_rate": 1.481723599272175e-05, + "loss": 0.239, + "step": 19010 + }, + { + "epoch": 1.0434687156970361, + "grad_norm": 0.877812385559082, + "learning_rate": 1.4812454242312979e-05, + "loss": 0.2582, + "step": 19012 + }, + { + "epoch": 1.0435784851811196, + "grad_norm": 2.6787402629852295, + "learning_rate": 1.4807672938779975e-05, + "loss": 0.2125, + "step": 19014 + }, + { + "epoch": 1.043688254665203, + "grad_norm": 3.537217140197754, + "learning_rate": 1.4802892082332461e-05, + "loss": 0.2219, + "step": 19016 + }, + { + "epoch": 1.0437980241492866, + "grad_norm": 1.38948655128479, + "learning_rate": 1.4798111673180155e-05, + "loss": 0.2138, + "step": 19018 + }, + { + "epoch": 1.0439077936333698, + "grad_norm": 1.0992343425750732, + "learning_rate": 1.4793331711532744e-05, + "loss": 0.2218, + "step": 19020 + }, + { + "epoch": 1.0440175631174533, + "grad_norm": 1.1174460649490356, + "learning_rate": 1.4788552197599898e-05, + "loss": 0.1465, + "step": 19022 + }, + { + "epoch": 1.0441273326015368, + "grad_norm": 0.7797772288322449, + "learning_rate": 1.4783773131591278e-05, + "loss": 0.1624, + "step": 19024 + }, + { + "epoch": 1.0442371020856203, + "grad_norm": 0.7991822361946106, + "learning_rate": 1.477899451371651e-05, + "loss": 0.2519, + "step": 19026 + }, + { + "epoch": 1.0443468715697035, + "grad_norm": 1.493772268295288, + "learning_rate": 1.4774216344185205e-05, + "loss": 0.2774, + "step": 19028 + }, + { + "epoch": 1.044456641053787, + "grad_norm": 1.18708336353302, + "learning_rate": 1.4769438623206971e-05, + "loss": 0.2004, + "step": 19030 + }, + { + "epoch": 1.0445664105378705, + "grad_norm": 0.9667067527770996, + "learning_rate": 1.476466135099137e-05, + "loss": 0.1982, + "step": 19032 + }, + { + "epoch": 1.044676180021954, + "grad_norm": 0.787067174911499, + "learning_rate": 1.4759884527747957e-05, + "loss": 0.2844, + "step": 19034 + }, + { + "epoch": 1.0447859495060374, + "grad_norm": 1.3455170392990112, + "learning_rate": 1.4755108153686275e-05, + "loss": 0.2069, + "step": 19036 + }, + { + "epoch": 1.0448957189901207, + "grad_norm": 1.7966605424880981, + "learning_rate": 1.475033222901583e-05, + "loss": 0.2841, + "step": 19038 + }, + { + "epoch": 1.0450054884742042, + "grad_norm": 1.257768988609314, + "learning_rate": 1.4745556753946125e-05, + "loss": 0.2189, + "step": 19040 + }, + { + "epoch": 1.0451152579582876, + "grad_norm": 0.7846956849098206, + "learning_rate": 1.4740781728686623e-05, + "loss": 0.1396, + "step": 19042 + }, + { + "epoch": 1.0452250274423711, + "grad_norm": 1.2038228511810303, + "learning_rate": 1.4736007153446801e-05, + "loss": 0.2043, + "step": 19044 + }, + { + "epoch": 1.0453347969264544, + "grad_norm": 0.9496535658836365, + "learning_rate": 1.4731233028436076e-05, + "loss": 0.1587, + "step": 19046 + }, + { + "epoch": 1.0454445664105378, + "grad_norm": 1.3285539150238037, + "learning_rate": 1.472645935386388e-05, + "loss": 0.2426, + "step": 19048 + }, + { + "epoch": 1.0455543358946213, + "grad_norm": 1.4854966402053833, + "learning_rate": 1.47216861299396e-05, + "loss": 0.1403, + "step": 19050 + }, + { + "epoch": 1.0456641053787048, + "grad_norm": 0.9270810484886169, + "learning_rate": 1.4716913356872614e-05, + "loss": 0.1158, + "step": 19052 + }, + { + "epoch": 1.045773874862788, + "grad_norm": 1.3091216087341309, + "learning_rate": 1.4712141034872282e-05, + "loss": 0.1631, + "step": 19054 + }, + { + "epoch": 1.0458836443468715, + "grad_norm": 0.8864503502845764, + "learning_rate": 1.470736916414794e-05, + "loss": 0.1579, + "step": 19056 + }, + { + "epoch": 1.045993413830955, + "grad_norm": 1.4167275428771973, + "learning_rate": 1.4702597744908903e-05, + "loss": 0.1858, + "step": 19058 + }, + { + "epoch": 1.0461031833150385, + "grad_norm": 0.877802312374115, + "learning_rate": 1.4697826777364477e-05, + "loss": 0.1315, + "step": 19060 + }, + { + "epoch": 1.0462129527991217, + "grad_norm": 0.8515374660491943, + "learning_rate": 1.469305626172393e-05, + "loss": 0.1358, + "step": 19062 + }, + { + "epoch": 1.0463227222832052, + "grad_norm": 1.3456758260726929, + "learning_rate": 1.4688286198196524e-05, + "loss": 0.1582, + "step": 19064 + }, + { + "epoch": 1.0464324917672887, + "grad_norm": 0.9262775182723999, + "learning_rate": 1.4683516586991503e-05, + "loss": 0.1626, + "step": 19066 + }, + { + "epoch": 1.0465422612513722, + "grad_norm": 1.165107250213623, + "learning_rate": 1.4678747428318079e-05, + "loss": 0.1634, + "step": 19068 + }, + { + "epoch": 1.0466520307354554, + "grad_norm": 1.678643822669983, + "learning_rate": 1.4673978722385451e-05, + "loss": 0.1933, + "step": 19070 + }, + { + "epoch": 1.046761800219539, + "grad_norm": 1.1600720882415771, + "learning_rate": 1.4669210469402789e-05, + "loss": 0.2651, + "step": 19072 + }, + { + "epoch": 1.0468715697036224, + "grad_norm": 1.0079386234283447, + "learning_rate": 1.4664442669579275e-05, + "loss": 0.144, + "step": 19074 + }, + { + "epoch": 1.0469813391877059, + "grad_norm": 1.8706722259521484, + "learning_rate": 1.4659675323124036e-05, + "loss": 0.2659, + "step": 19076 + }, + { + "epoch": 1.0470911086717893, + "grad_norm": 1.2799242734909058, + "learning_rate": 1.4654908430246184e-05, + "loss": 0.237, + "step": 19078 + }, + { + "epoch": 1.0472008781558726, + "grad_norm": 1.474997878074646, + "learning_rate": 1.4650141991154832e-05, + "loss": 0.3198, + "step": 19080 + }, + { + "epoch": 1.047310647639956, + "grad_norm": 1.25120210647583, + "learning_rate": 1.4645376006059053e-05, + "loss": 0.245, + "step": 19082 + }, + { + "epoch": 1.0474204171240395, + "grad_norm": 1.3821965456008911, + "learning_rate": 1.4640610475167898e-05, + "loss": 0.1893, + "step": 19084 + }, + { + "epoch": 1.047530186608123, + "grad_norm": 1.478683590888977, + "learning_rate": 1.463584539869042e-05, + "loss": 0.2146, + "step": 19086 + }, + { + "epoch": 1.0476399560922063, + "grad_norm": 1.9940327405929565, + "learning_rate": 1.4631080776835629e-05, + "loss": 0.2254, + "step": 19088 + }, + { + "epoch": 1.0477497255762898, + "grad_norm": 1.3676807880401611, + "learning_rate": 1.4626316609812535e-05, + "loss": 0.2585, + "step": 19090 + }, + { + "epoch": 1.0478594950603732, + "grad_norm": 1.1762359142303467, + "learning_rate": 1.462155289783011e-05, + "loss": 0.2097, + "step": 19092 + }, + { + "epoch": 1.0479692645444567, + "grad_norm": 1.1674718856811523, + "learning_rate": 1.4616789641097308e-05, + "loss": 0.1853, + "step": 19094 + }, + { + "epoch": 1.04807903402854, + "grad_norm": 1.0660310983657837, + "learning_rate": 1.4612026839823084e-05, + "loss": 0.164, + "step": 19096 + }, + { + "epoch": 1.0481888035126234, + "grad_norm": 1.838370680809021, + "learning_rate": 1.460726449421635e-05, + "loss": 0.2895, + "step": 19098 + }, + { + "epoch": 1.048298572996707, + "grad_norm": 1.8410755395889282, + "learning_rate": 1.4602502604486001e-05, + "loss": 0.2519, + "step": 19100 + }, + { + "epoch": 1.0484083424807904, + "grad_norm": 1.2514127492904663, + "learning_rate": 1.4597741170840914e-05, + "loss": 0.2489, + "step": 19102 + }, + { + "epoch": 1.0485181119648739, + "grad_norm": 1.1266940832138062, + "learning_rate": 1.4592980193489975e-05, + "loss": 0.1676, + "step": 19104 + }, + { + "epoch": 1.0486278814489571, + "grad_norm": 0.9465166330337524, + "learning_rate": 1.458821967264199e-05, + "loss": 0.1883, + "step": 19106 + }, + { + "epoch": 1.0487376509330406, + "grad_norm": 1.0671190023422241, + "learning_rate": 1.4583459608505801e-05, + "loss": 0.1238, + "step": 19108 + }, + { + "epoch": 1.048847420417124, + "grad_norm": 1.3753286600112915, + "learning_rate": 1.4578700001290202e-05, + "loss": 0.3242, + "step": 19110 + }, + { + "epoch": 1.0489571899012076, + "grad_norm": 2.1017954349517822, + "learning_rate": 1.4573940851203974e-05, + "loss": 0.2523, + "step": 19112 + }, + { + "epoch": 1.0490669593852908, + "grad_norm": 1.331930160522461, + "learning_rate": 1.4569182158455875e-05, + "loss": 0.2498, + "step": 19114 + }, + { + "epoch": 1.0491767288693743, + "grad_norm": 3.1008729934692383, + "learning_rate": 1.456442392325463e-05, + "loss": 0.2337, + "step": 19116 + }, + { + "epoch": 1.0492864983534578, + "grad_norm": 1.84954833984375, + "learning_rate": 1.4559666145808986e-05, + "loss": 0.1294, + "step": 19118 + }, + { + "epoch": 1.0493962678375413, + "grad_norm": 0.9673701524734497, + "learning_rate": 1.4554908826327625e-05, + "loss": 0.1813, + "step": 19120 + }, + { + "epoch": 1.0495060373216245, + "grad_norm": 1.2805438041687012, + "learning_rate": 1.4550151965019235e-05, + "loss": 0.3037, + "step": 19122 + }, + { + "epoch": 1.049615806805708, + "grad_norm": 1.181738257408142, + "learning_rate": 1.4545395562092468e-05, + "loss": 0.1438, + "step": 19124 + }, + { + "epoch": 1.0497255762897915, + "grad_norm": 1.5386130809783936, + "learning_rate": 1.454063961775597e-05, + "loss": 0.2501, + "step": 19126 + }, + { + "epoch": 1.049835345773875, + "grad_norm": 0.9951169490814209, + "learning_rate": 1.4535884132218342e-05, + "loss": 0.1455, + "step": 19128 + }, + { + "epoch": 1.0499451152579582, + "grad_norm": 1.0830211639404297, + "learning_rate": 1.4531129105688207e-05, + "loss": 0.251, + "step": 19130 + }, + { + "epoch": 1.0500548847420417, + "grad_norm": 0.9989545345306396, + "learning_rate": 1.4526374538374132e-05, + "loss": 0.1627, + "step": 19132 + }, + { + "epoch": 1.0501646542261251, + "grad_norm": 1.2670544385910034, + "learning_rate": 1.452162043048467e-05, + "loss": 0.2153, + "step": 19134 + }, + { + "epoch": 1.0502744237102086, + "grad_norm": 1.061668038368225, + "learning_rate": 1.4516866782228378e-05, + "loss": 0.0967, + "step": 19136 + }, + { + "epoch": 1.0503841931942919, + "grad_norm": 1.2949475049972534, + "learning_rate": 1.4512113593813759e-05, + "loss": 0.2677, + "step": 19138 + }, + { + "epoch": 1.0504939626783754, + "grad_norm": 1.135201096534729, + "learning_rate": 1.4507360865449319e-05, + "loss": 0.1701, + "step": 19140 + }, + { + "epoch": 1.0506037321624588, + "grad_norm": 1.2591052055358887, + "learning_rate": 1.450260859734352e-05, + "loss": 0.2637, + "step": 19142 + }, + { + "epoch": 1.0507135016465423, + "grad_norm": 1.5001951456069946, + "learning_rate": 1.4497856789704844e-05, + "loss": 0.3108, + "step": 19144 + }, + { + "epoch": 1.0508232711306258, + "grad_norm": 1.3367855548858643, + "learning_rate": 1.4493105442741717e-05, + "loss": 0.2399, + "step": 19146 + }, + { + "epoch": 1.050933040614709, + "grad_norm": 1.3658428192138672, + "learning_rate": 1.4488354556662554e-05, + "loss": 0.2802, + "step": 19148 + }, + { + "epoch": 1.0510428100987925, + "grad_norm": 1.073975920677185, + "learning_rate": 1.4483604131675755e-05, + "loss": 0.1178, + "step": 19150 + }, + { + "epoch": 1.051152579582876, + "grad_norm": 2.278851270675659, + "learning_rate": 1.4478854167989687e-05, + "loss": 0.1867, + "step": 19152 + }, + { + "epoch": 1.0512623490669595, + "grad_norm": 1.2435451745986938, + "learning_rate": 1.4474104665812727e-05, + "loss": 0.188, + "step": 19154 + }, + { + "epoch": 1.0513721185510427, + "grad_norm": 1.829379677772522, + "learning_rate": 1.4469355625353198e-05, + "loss": 0.1592, + "step": 19156 + }, + { + "epoch": 1.0514818880351262, + "grad_norm": 0.8735647201538086, + "learning_rate": 1.446460704681942e-05, + "loss": 0.1171, + "step": 19158 + }, + { + "epoch": 1.0515916575192097, + "grad_norm": 1.295095682144165, + "learning_rate": 1.4459858930419689e-05, + "loss": 0.2031, + "step": 19160 + }, + { + "epoch": 1.0517014270032932, + "grad_norm": 0.9213384985923767, + "learning_rate": 1.4455111276362277e-05, + "loss": 0.1775, + "step": 19162 + }, + { + "epoch": 1.0518111964873764, + "grad_norm": 1.0587035417556763, + "learning_rate": 1.4450364084855433e-05, + "loss": 0.1222, + "step": 19164 + }, + { + "epoch": 1.05192096597146, + "grad_norm": 1.1550183296203613, + "learning_rate": 1.4445617356107399e-05, + "loss": 0.1681, + "step": 19166 + }, + { + "epoch": 1.0520307354555434, + "grad_norm": 1.2707618474960327, + "learning_rate": 1.4440871090326404e-05, + "loss": 0.2201, + "step": 19168 + }, + { + "epoch": 1.0521405049396269, + "grad_norm": 0.8941804766654968, + "learning_rate": 1.4436125287720632e-05, + "loss": 0.1119, + "step": 19170 + }, + { + "epoch": 1.05225027442371, + "grad_norm": 0.8552420735359192, + "learning_rate": 1.4431379948498253e-05, + "loss": 0.1892, + "step": 19172 + }, + { + "epoch": 1.0523600439077936, + "grad_norm": 2.235086441040039, + "learning_rate": 1.4426635072867423e-05, + "loss": 0.1592, + "step": 19174 + }, + { + "epoch": 1.052469813391877, + "grad_norm": 1.2868419885635376, + "learning_rate": 1.4421890661036275e-05, + "loss": 0.1578, + "step": 19176 + }, + { + "epoch": 1.0525795828759605, + "grad_norm": 2.324032783508301, + "learning_rate": 1.4417146713212914e-05, + "loss": 0.1934, + "step": 19178 + }, + { + "epoch": 1.0526893523600438, + "grad_norm": 2.36297869682312, + "learning_rate": 1.4412403229605454e-05, + "loss": 0.2173, + "step": 19180 + }, + { + "epoch": 1.0527991218441273, + "grad_norm": 1.2052133083343506, + "learning_rate": 1.4407660210421952e-05, + "loss": 0.3298, + "step": 19182 + }, + { + "epoch": 1.0529088913282107, + "grad_norm": 1.8150112628936768, + "learning_rate": 1.4402917655870466e-05, + "loss": 0.1488, + "step": 19184 + }, + { + "epoch": 1.0530186608122942, + "grad_norm": 1.6424853801727295, + "learning_rate": 1.4398175566159023e-05, + "loss": 0.2022, + "step": 19186 + }, + { + "epoch": 1.0531284302963777, + "grad_norm": 1.2974255084991455, + "learning_rate": 1.4393433941495637e-05, + "loss": 0.192, + "step": 19188 + }, + { + "epoch": 1.053238199780461, + "grad_norm": 1.8525598049163818, + "learning_rate": 1.4388692782088292e-05, + "loss": 0.3047, + "step": 19190 + }, + { + "epoch": 1.0533479692645444, + "grad_norm": 0.970150351524353, + "learning_rate": 1.438395208814497e-05, + "loss": 0.2518, + "step": 19192 + }, + { + "epoch": 1.053457738748628, + "grad_norm": 0.9730039238929749, + "learning_rate": 1.4379211859873609e-05, + "loss": 0.1401, + "step": 19194 + }, + { + "epoch": 1.0535675082327114, + "grad_norm": 1.0587856769561768, + "learning_rate": 1.4374472097482155e-05, + "loss": 0.1919, + "step": 19196 + }, + { + "epoch": 1.0536772777167946, + "grad_norm": 1.2250161170959473, + "learning_rate": 1.4369732801178507e-05, + "loss": 0.2552, + "step": 19198 + }, + { + "epoch": 1.0537870472008781, + "grad_norm": 0.8631046414375305, + "learning_rate": 1.4364993971170553e-05, + "loss": 0.1705, + "step": 19200 + }, + { + "epoch": 1.0538968166849616, + "grad_norm": 1.2357639074325562, + "learning_rate": 1.4360255607666157e-05, + "loss": 0.1996, + "step": 19202 + }, + { + "epoch": 1.054006586169045, + "grad_norm": 1.4767894744873047, + "learning_rate": 1.4355517710873184e-05, + "loss": 0.197, + "step": 19204 + }, + { + "epoch": 1.0541163556531283, + "grad_norm": 0.9709569215774536, + "learning_rate": 1.4350780280999445e-05, + "loss": 0.1441, + "step": 19206 + }, + { + "epoch": 1.0542261251372118, + "grad_norm": 1.1415051221847534, + "learning_rate": 1.4346043318252756e-05, + "loss": 0.1419, + "step": 19208 + }, + { + "epoch": 1.0543358946212953, + "grad_norm": 1.3918160200119019, + "learning_rate": 1.43413068228409e-05, + "loss": 0.1767, + "step": 19210 + }, + { + "epoch": 1.0544456641053788, + "grad_norm": 1.0704305171966553, + "learning_rate": 1.4336570794971643e-05, + "loss": 0.1607, + "step": 19212 + }, + { + "epoch": 1.0545554335894622, + "grad_norm": 1.0140928030014038, + "learning_rate": 1.4331835234852717e-05, + "loss": 0.1523, + "step": 19214 + }, + { + "epoch": 1.0546652030735455, + "grad_norm": 1.0581082105636597, + "learning_rate": 1.4327100142691874e-05, + "loss": 0.3053, + "step": 19216 + }, + { + "epoch": 1.054774972557629, + "grad_norm": 1.1278796195983887, + "learning_rate": 1.4322365518696801e-05, + "loss": 0.1365, + "step": 19218 + }, + { + "epoch": 1.0548847420417125, + "grad_norm": 1.154097080230713, + "learning_rate": 1.4317631363075184e-05, + "loss": 0.2042, + "step": 19220 + }, + { + "epoch": 1.054994511525796, + "grad_norm": 1.3009220361709595, + "learning_rate": 1.4312897676034693e-05, + "loss": 0.21, + "step": 19222 + }, + { + "epoch": 1.0551042810098792, + "grad_norm": 1.0670890808105469, + "learning_rate": 1.4308164457782952e-05, + "loss": 0.1226, + "step": 19224 + }, + { + "epoch": 1.0552140504939627, + "grad_norm": 2.2300539016723633, + "learning_rate": 1.4303431708527606e-05, + "loss": 0.2533, + "step": 19226 + }, + { + "epoch": 1.0553238199780461, + "grad_norm": 1.4422715902328491, + "learning_rate": 1.4298699428476236e-05, + "loss": 0.2056, + "step": 19228 + }, + { + "epoch": 1.0554335894621296, + "grad_norm": 1.0428342819213867, + "learning_rate": 1.4293967617836449e-05, + "loss": 0.1672, + "step": 19230 + }, + { + "epoch": 1.0555433589462129, + "grad_norm": 1.2422685623168945, + "learning_rate": 1.4289236276815787e-05, + "loss": 0.1509, + "step": 19232 + }, + { + "epoch": 1.0556531284302964, + "grad_norm": 0.9902531504631042, + "learning_rate": 1.4284505405621795e-05, + "loss": 0.217, + "step": 19234 + }, + { + "epoch": 1.0557628979143798, + "grad_norm": 1.6137140989303589, + "learning_rate": 1.427977500446199e-05, + "loss": 0.2603, + "step": 19236 + }, + { + "epoch": 1.0558726673984633, + "grad_norm": 1.6598222255706787, + "learning_rate": 1.4275045073543869e-05, + "loss": 0.2922, + "step": 19238 + }, + { + "epoch": 1.0559824368825466, + "grad_norm": 0.9882667064666748, + "learning_rate": 1.4270315613074906e-05, + "loss": 0.1311, + "step": 19240 + }, + { + "epoch": 1.05609220636663, + "grad_norm": 1.3874545097351074, + "learning_rate": 1.4265586623262573e-05, + "loss": 0.1827, + "step": 19242 + }, + { + "epoch": 1.0562019758507135, + "grad_norm": 1.3360936641693115, + "learning_rate": 1.4260858104314297e-05, + "loss": 0.1934, + "step": 19244 + }, + { + "epoch": 1.056311745334797, + "grad_norm": 1.9084980487823486, + "learning_rate": 1.4256130056437498e-05, + "loss": 0.2144, + "step": 19246 + }, + { + "epoch": 1.0564215148188802, + "grad_norm": 1.163153886795044, + "learning_rate": 1.4251402479839564e-05, + "loss": 0.1473, + "step": 19248 + }, + { + "epoch": 1.0565312843029637, + "grad_norm": 1.483109712600708, + "learning_rate": 1.4246675374727869e-05, + "loss": 0.2334, + "step": 19250 + }, + { + "epoch": 1.0566410537870472, + "grad_norm": 2.173365354537964, + "learning_rate": 1.4241948741309782e-05, + "loss": 0.2408, + "step": 19252 + }, + { + "epoch": 1.0567508232711307, + "grad_norm": 1.2463033199310303, + "learning_rate": 1.4237222579792618e-05, + "loss": 0.2283, + "step": 19254 + }, + { + "epoch": 1.0568605927552142, + "grad_norm": 1.256878137588501, + "learning_rate": 1.4232496890383706e-05, + "loss": 0.2376, + "step": 19256 + }, + { + "epoch": 1.0569703622392974, + "grad_norm": 2.3196892738342285, + "learning_rate": 1.422777167329033e-05, + "loss": 0.2998, + "step": 19258 + }, + { + "epoch": 1.057080131723381, + "grad_norm": 1.1355520486831665, + "learning_rate": 1.4223046928719763e-05, + "loss": 0.1498, + "step": 19260 + }, + { + "epoch": 1.0571899012074644, + "grad_norm": 1.0215672254562378, + "learning_rate": 1.4218322656879254e-05, + "loss": 0.2091, + "step": 19262 + }, + { + "epoch": 1.0572996706915478, + "grad_norm": 1.3283675909042358, + "learning_rate": 1.4213598857976024e-05, + "loss": 0.1613, + "step": 19264 + }, + { + "epoch": 1.057409440175631, + "grad_norm": 1.330159068107605, + "learning_rate": 1.4208875532217298e-05, + "loss": 0.1497, + "step": 19266 + }, + { + "epoch": 1.0575192096597146, + "grad_norm": 0.7331910729408264, + "learning_rate": 1.4204152679810258e-05, + "loss": 0.1251, + "step": 19268 + }, + { + "epoch": 1.057628979143798, + "grad_norm": 1.400684118270874, + "learning_rate": 1.4199430300962072e-05, + "loss": 0.1841, + "step": 19270 + }, + { + "epoch": 1.0577387486278815, + "grad_norm": 1.2808727025985718, + "learning_rate": 1.4194708395879886e-05, + "loss": 0.1439, + "step": 19272 + }, + { + "epoch": 1.0578485181119648, + "grad_norm": 1.2052011489868164, + "learning_rate": 1.4189986964770823e-05, + "loss": 0.2056, + "step": 19274 + }, + { + "epoch": 1.0579582875960483, + "grad_norm": 1.72543203830719, + "learning_rate": 1.418526600784198e-05, + "loss": 0.2299, + "step": 19276 + }, + { + "epoch": 1.0580680570801317, + "grad_norm": 1.5311464071273804, + "learning_rate": 1.4180545525300464e-05, + "loss": 0.2102, + "step": 19278 + }, + { + "epoch": 1.0581778265642152, + "grad_norm": 1.1422758102416992, + "learning_rate": 1.4175825517353325e-05, + "loss": 0.2381, + "step": 19280 + }, + { + "epoch": 1.0582875960482985, + "grad_norm": 0.8948400616645813, + "learning_rate": 1.4171105984207605e-05, + "loss": 0.1718, + "step": 19282 + }, + { + "epoch": 1.058397365532382, + "grad_norm": 2.306603193283081, + "learning_rate": 1.4166386926070322e-05, + "loss": 0.193, + "step": 19284 + }, + { + "epoch": 1.0585071350164654, + "grad_norm": 1.7821279764175415, + "learning_rate": 1.4161668343148491e-05, + "loss": 0.1849, + "step": 19286 + }, + { + "epoch": 1.058616904500549, + "grad_norm": 1.304782509803772, + "learning_rate": 1.4156950235649074e-05, + "loss": 0.1486, + "step": 19288 + }, + { + "epoch": 1.0587266739846322, + "grad_norm": 1.1280953884124756, + "learning_rate": 1.415223260377905e-05, + "loss": 0.2269, + "step": 19290 + }, + { + "epoch": 1.0588364434687156, + "grad_norm": 2.2877590656280518, + "learning_rate": 1.4147515447745349e-05, + "loss": 0.2659, + "step": 19292 + }, + { + "epoch": 1.0589462129527991, + "grad_norm": 0.9885607957839966, + "learning_rate": 1.4142798767754886e-05, + "loss": 0.2007, + "step": 19294 + }, + { + "epoch": 1.0590559824368826, + "grad_norm": 1.2333616018295288, + "learning_rate": 1.413808256401456e-05, + "loss": 0.2098, + "step": 19296 + }, + { + "epoch": 1.059165751920966, + "grad_norm": 0.7838774919509888, + "learning_rate": 1.4133366836731249e-05, + "loss": 0.1702, + "step": 19298 + }, + { + "epoch": 1.0592755214050493, + "grad_norm": 1.3203911781311035, + "learning_rate": 1.412865158611179e-05, + "loss": 0.2793, + "step": 19300 + }, + { + "epoch": 1.0593852908891328, + "grad_norm": 0.9413360357284546, + "learning_rate": 1.4123936812363047e-05, + "loss": 0.1767, + "step": 19302 + }, + { + "epoch": 1.0594950603732163, + "grad_norm": 1.3451980352401733, + "learning_rate": 1.4119222515691816e-05, + "loss": 0.261, + "step": 19304 + }, + { + "epoch": 1.0596048298572998, + "grad_norm": 0.7447597980499268, + "learning_rate": 1.411450869630489e-05, + "loss": 0.1528, + "step": 19306 + }, + { + "epoch": 1.059714599341383, + "grad_norm": 1.7736220359802246, + "learning_rate": 1.4109795354409044e-05, + "loss": 0.2757, + "step": 19308 + }, + { + "epoch": 1.0598243688254665, + "grad_norm": 1.2613219022750854, + "learning_rate": 1.4105082490211025e-05, + "loss": 0.2161, + "step": 19310 + }, + { + "epoch": 1.05993413830955, + "grad_norm": 1.7419408559799194, + "learning_rate": 1.4100370103917554e-05, + "loss": 0.2738, + "step": 19312 + }, + { + "epoch": 1.0600439077936334, + "grad_norm": 1.291327714920044, + "learning_rate": 1.4095658195735351e-05, + "loss": 0.1947, + "step": 19314 + }, + { + "epoch": 1.0601536772777167, + "grad_norm": 1.562036156654358, + "learning_rate": 1.4090946765871104e-05, + "loss": 0.2394, + "step": 19316 + }, + { + "epoch": 1.0602634467618002, + "grad_norm": 1.1710535287857056, + "learning_rate": 1.4086235814531485e-05, + "loss": 0.2403, + "step": 19318 + }, + { + "epoch": 1.0603732162458837, + "grad_norm": 1.509520411491394, + "learning_rate": 1.4081525341923127e-05, + "loss": 0.161, + "step": 19320 + }, + { + "epoch": 1.0604829857299671, + "grad_norm": 1.3559454679489136, + "learning_rate": 1.407681534825266e-05, + "loss": 0.2274, + "step": 19322 + }, + { + "epoch": 1.0605927552140504, + "grad_norm": 1.2026933431625366, + "learning_rate": 1.4072105833726684e-05, + "loss": 0.1878, + "step": 19324 + }, + { + "epoch": 1.0607025246981339, + "grad_norm": 1.3831037282943726, + "learning_rate": 1.4067396798551774e-05, + "loss": 0.2307, + "step": 19326 + }, + { + "epoch": 1.0608122941822173, + "grad_norm": 1.0064061880111694, + "learning_rate": 1.406268824293451e-05, + "loss": 0.1492, + "step": 19328 + }, + { + "epoch": 1.0609220636663008, + "grad_norm": 1.0999470949172974, + "learning_rate": 1.4057980167081425e-05, + "loss": 0.1897, + "step": 19330 + }, + { + "epoch": 1.0610318331503843, + "grad_norm": 0.9737218618392944, + "learning_rate": 1.4053272571199036e-05, + "loss": 0.2124, + "step": 19332 + }, + { + "epoch": 1.0611416026344676, + "grad_norm": 1.151775598526001, + "learning_rate": 1.404856545549384e-05, + "loss": 0.2748, + "step": 19334 + }, + { + "epoch": 1.061251372118551, + "grad_norm": 1.0704313516616821, + "learning_rate": 1.4043858820172309e-05, + "loss": 0.1657, + "step": 19336 + }, + { + "epoch": 1.0613611416026345, + "grad_norm": 1.5849050283432007, + "learning_rate": 1.4039152665440913e-05, + "loss": 0.1428, + "step": 19338 + }, + { + "epoch": 1.061470911086718, + "grad_norm": 0.9387586712837219, + "learning_rate": 1.4034446991506083e-05, + "loss": 0.1542, + "step": 19340 + }, + { + "epoch": 1.0615806805708012, + "grad_norm": 1.5235676765441895, + "learning_rate": 1.4029741798574227e-05, + "loss": 0.2621, + "step": 19342 + }, + { + "epoch": 1.0616904500548847, + "grad_norm": 1.1012656688690186, + "learning_rate": 1.4025037086851733e-05, + "loss": 0.2642, + "step": 19344 + }, + { + "epoch": 1.0618002195389682, + "grad_norm": 1.8842264413833618, + "learning_rate": 1.4020332856544991e-05, + "loss": 0.249, + "step": 19346 + }, + { + "epoch": 1.0619099890230517, + "grad_norm": 1.2186594009399414, + "learning_rate": 1.401562910786034e-05, + "loss": 0.2741, + "step": 19348 + }, + { + "epoch": 1.062019758507135, + "grad_norm": 1.5503230094909668, + "learning_rate": 1.4010925841004102e-05, + "loss": 0.2126, + "step": 19350 + }, + { + "epoch": 1.0621295279912184, + "grad_norm": 0.7343040704727173, + "learning_rate": 1.4006223056182604e-05, + "loss": 0.1162, + "step": 19352 + }, + { + "epoch": 1.0622392974753019, + "grad_norm": 2.3933520317077637, + "learning_rate": 1.4001520753602121e-05, + "loss": 0.2531, + "step": 19354 + }, + { + "epoch": 1.0623490669593854, + "grad_norm": 0.7455522418022156, + "learning_rate": 1.3996818933468925e-05, + "loss": 0.1478, + "step": 19356 + }, + { + "epoch": 1.0624588364434686, + "grad_norm": 1.0127270221710205, + "learning_rate": 1.3992117595989254e-05, + "loss": 0.2037, + "step": 19358 + }, + { + "epoch": 1.062568605927552, + "grad_norm": 1.1248788833618164, + "learning_rate": 1.3987416741369336e-05, + "loss": 0.145, + "step": 19360 + }, + { + "epoch": 1.0626783754116356, + "grad_norm": 0.7684763669967651, + "learning_rate": 1.3982716369815365e-05, + "loss": 0.1432, + "step": 19362 + }, + { + "epoch": 1.062788144895719, + "grad_norm": 1.4069972038269043, + "learning_rate": 1.397801648153354e-05, + "loss": 0.2212, + "step": 19364 + }, + { + "epoch": 1.0628979143798025, + "grad_norm": 1.1967504024505615, + "learning_rate": 1.3973317076730008e-05, + "loss": 0.1806, + "step": 19366 + }, + { + "epoch": 1.0630076838638858, + "grad_norm": 1.2227219343185425, + "learning_rate": 1.3968618155610913e-05, + "loss": 0.1449, + "step": 19368 + }, + { + "epoch": 1.0631174533479693, + "grad_norm": 2.38201904296875, + "learning_rate": 1.396391971838237e-05, + "loss": 0.2769, + "step": 19370 + }, + { + "epoch": 1.0632272228320527, + "grad_norm": 0.9323464035987854, + "learning_rate": 1.3959221765250469e-05, + "loss": 0.1188, + "step": 19372 + }, + { + "epoch": 1.0633369923161362, + "grad_norm": 1.8365076780319214, + "learning_rate": 1.3954524296421301e-05, + "loss": 0.2448, + "step": 19374 + }, + { + "epoch": 1.0634467618002195, + "grad_norm": 1.4162331819534302, + "learning_rate": 1.39498273121009e-05, + "loss": 0.2341, + "step": 19376 + }, + { + "epoch": 1.063556531284303, + "grad_norm": 1.1014487743377686, + "learning_rate": 1.3945130812495321e-05, + "loss": 0.2174, + "step": 19378 + }, + { + "epoch": 1.0636663007683864, + "grad_norm": 1.1297259330749512, + "learning_rate": 1.3940434797810568e-05, + "loss": 0.2391, + "step": 19380 + }, + { + "epoch": 1.06377607025247, + "grad_norm": 1.3821314573287964, + "learning_rate": 1.3935739268252627e-05, + "loss": 0.2542, + "step": 19382 + }, + { + "epoch": 1.0638858397365532, + "grad_norm": 2.004025936126709, + "learning_rate": 1.3931044224027468e-05, + "loss": 0.2068, + "step": 19384 + }, + { + "epoch": 1.0639956092206366, + "grad_norm": 1.2137986421585083, + "learning_rate": 1.3926349665341026e-05, + "loss": 0.2589, + "step": 19386 + }, + { + "epoch": 1.06410537870472, + "grad_norm": 1.0108615159988403, + "learning_rate": 1.3921655592399254e-05, + "loss": 0.1152, + "step": 19388 + }, + { + "epoch": 1.0642151481888036, + "grad_norm": 1.2523518800735474, + "learning_rate": 1.3916962005408043e-05, + "loss": 0.2068, + "step": 19390 + }, + { + "epoch": 1.0643249176728868, + "grad_norm": 1.4080110788345337, + "learning_rate": 1.3912268904573277e-05, + "loss": 0.1839, + "step": 19392 + }, + { + "epoch": 1.0644346871569703, + "grad_norm": 1.9056645631790161, + "learning_rate": 1.3907576290100819e-05, + "loss": 0.184, + "step": 19394 + }, + { + "epoch": 1.0645444566410538, + "grad_norm": 1.0243124961853027, + "learning_rate": 1.3902884162196508e-05, + "loss": 0.1566, + "step": 19396 + }, + { + "epoch": 1.0646542261251373, + "grad_norm": 1.9047040939331055, + "learning_rate": 1.3898192521066156e-05, + "loss": 0.1312, + "step": 19398 + }, + { + "epoch": 1.0647639956092205, + "grad_norm": 1.1260846853256226, + "learning_rate": 1.3893501366915582e-05, + "loss": 0.2264, + "step": 19400 + }, + { + "epoch": 1.064873765093304, + "grad_norm": 1.5000507831573486, + "learning_rate": 1.388881069995055e-05, + "loss": 0.2914, + "step": 19402 + }, + { + "epoch": 1.0649835345773875, + "grad_norm": 1.3722635507583618, + "learning_rate": 1.388412052037682e-05, + "loss": 0.3153, + "step": 19404 + }, + { + "epoch": 1.065093304061471, + "grad_norm": 1.0964299440383911, + "learning_rate": 1.3879430828400115e-05, + "loss": 0.2118, + "step": 19406 + }, + { + "epoch": 1.0652030735455544, + "grad_norm": 1.330406665802002, + "learning_rate": 1.3874741624226162e-05, + "loss": 0.1504, + "step": 19408 + }, + { + "epoch": 1.0653128430296377, + "grad_norm": 0.9638695120811462, + "learning_rate": 1.3870052908060651e-05, + "loss": 0.1336, + "step": 19410 + }, + { + "epoch": 1.0654226125137212, + "grad_norm": 1.0081802606582642, + "learning_rate": 1.386536468010924e-05, + "loss": 0.1834, + "step": 19412 + }, + { + "epoch": 1.0655323819978046, + "grad_norm": 1.1319299936294556, + "learning_rate": 1.3860676940577594e-05, + "loss": 0.2215, + "step": 19414 + }, + { + "epoch": 1.0656421514818881, + "grad_norm": 0.9982566237449646, + "learning_rate": 1.3855989689671328e-05, + "loss": 0.4014, + "step": 19416 + }, + { + "epoch": 1.0657519209659714, + "grad_norm": 1.465453028678894, + "learning_rate": 1.385130292759606e-05, + "loss": 0.4568, + "step": 19418 + }, + { + "epoch": 1.0658616904500549, + "grad_norm": Infinity, + "learning_rate": 1.3848959729934285e-05, + "loss": 0.25, + "step": 19420 + }, + { + "epoch": 1.0659714599341383, + "grad_norm": 1.094955325126648, + "learning_rate": 1.3844273701490962e-05, + "loss": 0.1233, + "step": 19422 + }, + { + "epoch": 1.0660812294182218, + "grad_norm": 1.3379276990890503, + "learning_rate": 1.3839588162392553e-05, + "loss": 0.1665, + "step": 19424 + }, + { + "epoch": 1.066190998902305, + "grad_norm": 1.6725881099700928, + "learning_rate": 1.3834903112844582e-05, + "loss": 0.2258, + "step": 19426 + }, + { + "epoch": 1.0663007683863885, + "grad_norm": 1.7742058038711548, + "learning_rate": 1.3830218553052566e-05, + "loss": 0.1606, + "step": 19428 + }, + { + "epoch": 1.066410537870472, + "grad_norm": 0.9813349843025208, + "learning_rate": 1.3825534483221974e-05, + "loss": 0.1657, + "step": 19430 + }, + { + "epoch": 1.0665203073545555, + "grad_norm": 1.2840653657913208, + "learning_rate": 1.38208509035583e-05, + "loss": 0.145, + "step": 19432 + }, + { + "epoch": 1.066630076838639, + "grad_norm": 1.4157034158706665, + "learning_rate": 1.3816167814266973e-05, + "loss": 0.2092, + "step": 19434 + }, + { + "epoch": 1.0667398463227222, + "grad_norm": 0.9943729043006897, + "learning_rate": 1.3811485215553412e-05, + "loss": 0.1573, + "step": 19436 + }, + { + "epoch": 1.0668496158068057, + "grad_norm": 1.29453444480896, + "learning_rate": 1.3806803107623033e-05, + "loss": 0.2246, + "step": 19438 + }, + { + "epoch": 1.0669593852908892, + "grad_norm": 1.1153262853622437, + "learning_rate": 1.3802121490681213e-05, + "loss": 0.2096, + "step": 19440 + }, + { + "epoch": 1.0670691547749727, + "grad_norm": 1.000856876373291, + "learning_rate": 1.3797440364933305e-05, + "loss": 0.2351, + "step": 19442 + }, + { + "epoch": 1.067178924259056, + "grad_norm": 1.5581309795379639, + "learning_rate": 1.3792759730584639e-05, + "loss": 0.2112, + "step": 19444 + }, + { + "epoch": 1.0672886937431394, + "grad_norm": 1.3750884532928467, + "learning_rate": 1.3788079587840549e-05, + "loss": 0.1363, + "step": 19446 + }, + { + "epoch": 1.0673984632272229, + "grad_norm": 1.6266345977783203, + "learning_rate": 1.378339993690632e-05, + "loss": 0.1277, + "step": 19448 + }, + { + "epoch": 1.0675082327113063, + "grad_norm": 1.146319031715393, + "learning_rate": 1.3778720777987225e-05, + "loss": 0.1515, + "step": 19450 + }, + { + "epoch": 1.0676180021953896, + "grad_norm": 1.7926561832427979, + "learning_rate": 1.377404211128851e-05, + "loss": 0.3731, + "step": 19452 + }, + { + "epoch": 1.067727771679473, + "grad_norm": 1.371256709098816, + "learning_rate": 1.37693639370154e-05, + "loss": 0.1979, + "step": 19454 + }, + { + "epoch": 1.0678375411635566, + "grad_norm": 1.5681463479995728, + "learning_rate": 1.3764686255373121e-05, + "loss": 0.23, + "step": 19456 + }, + { + "epoch": 1.06794731064764, + "grad_norm": 1.2735120058059692, + "learning_rate": 1.3760009066566854e-05, + "loss": 0.1972, + "step": 19458 + }, + { + "epoch": 1.0680570801317233, + "grad_norm": 1.237679123878479, + "learning_rate": 1.3755332370801752e-05, + "loss": 0.1947, + "step": 19460 + }, + { + "epoch": 1.0681668496158068, + "grad_norm": 1.0736550092697144, + "learning_rate": 1.3750656168282966e-05, + "loss": 0.1788, + "step": 19462 + }, + { + "epoch": 1.0682766190998902, + "grad_norm": 1.6649608612060547, + "learning_rate": 1.3745980459215616e-05, + "loss": 0.3099, + "step": 19464 + }, + { + "epoch": 1.0683863885839737, + "grad_norm": 1.0784010887145996, + "learning_rate": 1.3741305243804792e-05, + "loss": 0.2197, + "step": 19466 + }, + { + "epoch": 1.068496158068057, + "grad_norm": 1.0239957571029663, + "learning_rate": 1.3736630522255577e-05, + "loss": 0.1821, + "step": 19468 + }, + { + "epoch": 1.0686059275521405, + "grad_norm": 1.1438583135604858, + "learning_rate": 1.3731956294773046e-05, + "loss": 0.2287, + "step": 19470 + }, + { + "epoch": 1.068715697036224, + "grad_norm": 0.9289769530296326, + "learning_rate": 1.3727282561562216e-05, + "loss": 0.3428, + "step": 19472 + }, + { + "epoch": 1.0688254665203074, + "grad_norm": 1.1107361316680908, + "learning_rate": 1.3722609322828103e-05, + "loss": 0.1857, + "step": 19474 + }, + { + "epoch": 1.0689352360043909, + "grad_norm": 1.4024468660354614, + "learning_rate": 1.3717936578775698e-05, + "loss": 0.1959, + "step": 19476 + }, + { + "epoch": 1.0690450054884741, + "grad_norm": 1.3473384380340576, + "learning_rate": 1.371326432960997e-05, + "loss": 0.185, + "step": 19478 + }, + { + "epoch": 1.0691547749725576, + "grad_norm": 0.9451892375946045, + "learning_rate": 1.3708592575535858e-05, + "loss": 0.2447, + "step": 19480 + }, + { + "epoch": 1.069264544456641, + "grad_norm": 1.0158226490020752, + "learning_rate": 1.3703921316758306e-05, + "loss": 0.2346, + "step": 19482 + }, + { + "epoch": 1.0693743139407246, + "grad_norm": 1.4194377660751343, + "learning_rate": 1.3699250553482213e-05, + "loss": 0.2236, + "step": 19484 + }, + { + "epoch": 1.0694840834248078, + "grad_norm": 1.1251636743545532, + "learning_rate": 1.3694580285912454e-05, + "loss": 0.1641, + "step": 19486 + }, + { + "epoch": 1.0695938529088913, + "grad_norm": 0.7639486193656921, + "learning_rate": 1.3689910514253895e-05, + "loss": 0.1317, + "step": 19488 + }, + { + "epoch": 1.0697036223929748, + "grad_norm": 0.9923813343048096, + "learning_rate": 1.3685241238711367e-05, + "loss": 0.2154, + "step": 19490 + }, + { + "epoch": 1.0698133918770583, + "grad_norm": 1.3455045223236084, + "learning_rate": 1.36805724594897e-05, + "loss": 0.1768, + "step": 19492 + }, + { + "epoch": 1.0699231613611415, + "grad_norm": 1.0701491832733154, + "learning_rate": 1.3675904176793686e-05, + "loss": 0.1703, + "step": 19494 + }, + { + "epoch": 1.070032930845225, + "grad_norm": 0.9584517478942871, + "learning_rate": 1.3671236390828085e-05, + "loss": 0.1503, + "step": 19496 + }, + { + "epoch": 1.0701427003293085, + "grad_norm": 1.1325377225875854, + "learning_rate": 1.3666569101797672e-05, + "loss": 0.241, + "step": 19498 + }, + { + "epoch": 1.070252469813392, + "grad_norm": 0.9713684320449829, + "learning_rate": 1.3661902309907166e-05, + "loss": 0.1654, + "step": 19500 + }, + { + "epoch": 1.0703622392974752, + "grad_norm": 1.0948749780654907, + "learning_rate": 1.3657236015361275e-05, + "loss": 0.2737, + "step": 19502 + }, + { + "epoch": 1.0704720087815587, + "grad_norm": 1.4767853021621704, + "learning_rate": 1.3652570218364674e-05, + "loss": 0.1543, + "step": 19504 + }, + { + "epoch": 1.0705817782656422, + "grad_norm": 0.8934125304222107, + "learning_rate": 1.3647904919122054e-05, + "loss": 0.3153, + "step": 19506 + }, + { + "epoch": 1.0706915477497256, + "grad_norm": 1.2112977504730225, + "learning_rate": 1.364324011783804e-05, + "loss": 0.3518, + "step": 19508 + }, + { + "epoch": 1.070801317233809, + "grad_norm": 1.307637333869934, + "learning_rate": 1.3638575814717258e-05, + "loss": 0.1322, + "step": 19510 + }, + { + "epoch": 1.0709110867178924, + "grad_norm": 1.7449023723602295, + "learning_rate": 1.3633912009964305e-05, + "loss": 0.3564, + "step": 19512 + }, + { + "epoch": 1.0710208562019758, + "grad_norm": 0.8498575687408447, + "learning_rate": 1.3629248703783761e-05, + "loss": 0.1981, + "step": 19514 + }, + { + "epoch": 1.0711306256860593, + "grad_norm": 0.9316795468330383, + "learning_rate": 1.3624585896380171e-05, + "loss": 0.2087, + "step": 19516 + }, + { + "epoch": 1.0712403951701428, + "grad_norm": 0.8600060939788818, + "learning_rate": 1.3619923587958084e-05, + "loss": 0.2437, + "step": 19518 + }, + { + "epoch": 1.071350164654226, + "grad_norm": 0.9950534105300903, + "learning_rate": 1.3615261778722008e-05, + "loss": 0.2594, + "step": 19520 + }, + { + "epoch": 1.0714599341383095, + "grad_norm": 1.087813377380371, + "learning_rate": 1.3610600468876428e-05, + "loss": 0.179, + "step": 19522 + }, + { + "epoch": 1.071569703622393, + "grad_norm": 1.6759282350540161, + "learning_rate": 1.3605939658625816e-05, + "loss": 0.2488, + "step": 19524 + }, + { + "epoch": 1.0716794731064765, + "grad_norm": 1.1518945693969727, + "learning_rate": 1.3601279348174603e-05, + "loss": 0.1738, + "step": 19526 + }, + { + "epoch": 1.0717892425905597, + "grad_norm": 1.0714867115020752, + "learning_rate": 1.3596619537727237e-05, + "loss": 0.1155, + "step": 19528 + }, + { + "epoch": 1.0718990120746432, + "grad_norm": 1.3227697610855103, + "learning_rate": 1.3591960227488098e-05, + "loss": 0.2978, + "step": 19530 + }, + { + "epoch": 1.0720087815587267, + "grad_norm": 1.4010511636734009, + "learning_rate": 1.3587301417661591e-05, + "loss": 0.2269, + "step": 19532 + }, + { + "epoch": 1.0721185510428102, + "grad_norm": 1.2669825553894043, + "learning_rate": 1.358264310845206e-05, + "loss": 0.1884, + "step": 19534 + }, + { + "epoch": 1.0722283205268934, + "grad_norm": 1.4321006536483765, + "learning_rate": 1.3577985300063839e-05, + "loss": 0.1856, + "step": 19536 + }, + { + "epoch": 1.072338090010977, + "grad_norm": 0.9325752854347229, + "learning_rate": 1.3573327992701245e-05, + "loss": 0.1603, + "step": 19538 + }, + { + "epoch": 1.0724478594950604, + "grad_norm": 1.086905598640442, + "learning_rate": 1.3568671186568566e-05, + "loss": 0.2106, + "step": 19540 + }, + { + "epoch": 1.0725576289791439, + "grad_norm": 0.9786468744277954, + "learning_rate": 1.3564014881870082e-05, + "loss": 0.1882, + "step": 19542 + }, + { + "epoch": 1.0726673984632273, + "grad_norm": 1.1709012985229492, + "learning_rate": 1.3559359078810038e-05, + "loss": 0.1597, + "step": 19544 + }, + { + "epoch": 1.0727771679473106, + "grad_norm": 1.2894519567489624, + "learning_rate": 1.3554703777592658e-05, + "loss": 0.1736, + "step": 19546 + }, + { + "epoch": 1.072886937431394, + "grad_norm": 1.7513772249221802, + "learning_rate": 1.3550048978422147e-05, + "loss": 0.1638, + "step": 19548 + }, + { + "epoch": 1.0729967069154775, + "grad_norm": 1.2178709506988525, + "learning_rate": 1.3545394681502688e-05, + "loss": 0.1972, + "step": 19550 + }, + { + "epoch": 1.0731064763995608, + "grad_norm": 1.086253046989441, + "learning_rate": 1.3540740887038434e-05, + "loss": 0.1818, + "step": 19552 + }, + { + "epoch": 1.0732162458836443, + "grad_norm": 1.5183438062667847, + "learning_rate": 1.3536087595233537e-05, + "loss": 0.1963, + "step": 19554 + }, + { + "epoch": 1.0733260153677278, + "grad_norm": 1.39473295211792, + "learning_rate": 1.3531434806292104e-05, + "loss": 0.2109, + "step": 19556 + }, + { + "epoch": 1.0734357848518112, + "grad_norm": 0.9540141224861145, + "learning_rate": 1.3526782520418223e-05, + "loss": 0.1846, + "step": 19558 + }, + { + "epoch": 1.0735455543358947, + "grad_norm": 1.0387662649154663, + "learning_rate": 1.3522130737815988e-05, + "loss": 0.1765, + "step": 19560 + }, + { + "epoch": 1.073655323819978, + "grad_norm": 1.0741709470748901, + "learning_rate": 1.3517479458689434e-05, + "loss": 0.1822, + "step": 19562 + }, + { + "epoch": 1.0737650933040614, + "grad_norm": 1.0946274995803833, + "learning_rate": 1.351282868324259e-05, + "loss": 0.2057, + "step": 19564 + }, + { + "epoch": 1.073874862788145, + "grad_norm": 3.508413076400757, + "learning_rate": 1.3508178411679451e-05, + "loss": 0.3482, + "step": 19566 + }, + { + "epoch": 1.0739846322722284, + "grad_norm": 1.6344479322433472, + "learning_rate": 1.3503528644204022e-05, + "loss": 0.3218, + "step": 19568 + }, + { + "epoch": 1.0740944017563117, + "grad_norm": 1.1574954986572266, + "learning_rate": 1.3498879381020254e-05, + "loss": 0.1744, + "step": 19570 + }, + { + "epoch": 1.0742041712403951, + "grad_norm": 0.8032236099243164, + "learning_rate": 1.3494230622332088e-05, + "loss": 0.1635, + "step": 19572 + }, + { + "epoch": 1.0743139407244786, + "grad_norm": 1.0825680494308472, + "learning_rate": 1.3489582368343442e-05, + "loss": 0.2832, + "step": 19574 + }, + { + "epoch": 1.074423710208562, + "grad_norm": 1.1872737407684326, + "learning_rate": 1.34849346192582e-05, + "loss": 0.2206, + "step": 19576 + }, + { + "epoch": 1.0745334796926453, + "grad_norm": 1.060420036315918, + "learning_rate": 1.3480287375280254e-05, + "loss": 0.2266, + "step": 19578 + }, + { + "epoch": 1.0746432491767288, + "grad_norm": 1.1299470663070679, + "learning_rate": 1.3475640636613446e-05, + "loss": 0.203, + "step": 19580 + }, + { + "epoch": 1.0747530186608123, + "grad_norm": 0.9322475790977478, + "learning_rate": 1.34709944034616e-05, + "loss": 0.2486, + "step": 19582 + }, + { + "epoch": 1.0748627881448958, + "grad_norm": 1.8079935312271118, + "learning_rate": 1.3466348676028533e-05, + "loss": 0.1933, + "step": 19584 + }, + { + "epoch": 1.0749725576289793, + "grad_norm": 0.7339370250701904, + "learning_rate": 1.346170345451802e-05, + "loss": 0.2316, + "step": 19586 + }, + { + "epoch": 1.0750823271130625, + "grad_norm": 1.686833381652832, + "learning_rate": 1.3457058739133822e-05, + "loss": 0.292, + "step": 19588 + }, + { + "epoch": 1.075192096597146, + "grad_norm": 1.150670051574707, + "learning_rate": 1.345241453007968e-05, + "loss": 0.1971, + "step": 19590 + }, + { + "epoch": 1.0753018660812295, + "grad_norm": 1.454177975654602, + "learning_rate": 1.3447770827559326e-05, + "loss": 0.262, + "step": 19592 + }, + { + "epoch": 1.075411635565313, + "grad_norm": 1.2319095134735107, + "learning_rate": 1.3443127631776445e-05, + "loss": 0.2555, + "step": 19594 + }, + { + "epoch": 1.0755214050493962, + "grad_norm": 1.5336620807647705, + "learning_rate": 1.3438484942934708e-05, + "loss": 0.2739, + "step": 19596 + }, + { + "epoch": 1.0756311745334797, + "grad_norm": 1.114363431930542, + "learning_rate": 1.3433842761237774e-05, + "loss": 0.2664, + "step": 19598 + }, + { + "epoch": 1.0757409440175631, + "grad_norm": 1.1229833364486694, + "learning_rate": 1.3429201086889264e-05, + "loss": 0.2369, + "step": 19600 + }, + { + "epoch": 1.0758507135016466, + "grad_norm": 1.0666273832321167, + "learning_rate": 1.3424559920092778e-05, + "loss": 0.206, + "step": 19602 + }, + { + "epoch": 1.0759604829857299, + "grad_norm": 1.3614754676818848, + "learning_rate": 1.3419919261051916e-05, + "loss": 0.1703, + "step": 19604 + }, + { + "epoch": 1.0760702524698134, + "grad_norm": 3.1703131198883057, + "learning_rate": 1.3415279109970238e-05, + "loss": 0.2703, + "step": 19606 + }, + { + "epoch": 1.0761800219538968, + "grad_norm": 1.05585777759552, + "learning_rate": 1.3410639467051279e-05, + "loss": 0.229, + "step": 19608 + }, + { + "epoch": 1.0762897914379803, + "grad_norm": 0.9868279099464417, + "learning_rate": 1.3406000332498553e-05, + "loss": 0.2376, + "step": 19610 + }, + { + "epoch": 1.0763995609220636, + "grad_norm": 0.7863990068435669, + "learning_rate": 1.3401361706515553e-05, + "loss": 0.1103, + "step": 19612 + }, + { + "epoch": 1.076509330406147, + "grad_norm": 1.4509587287902832, + "learning_rate": 1.3396723589305764e-05, + "loss": 0.2804, + "step": 19614 + }, + { + "epoch": 1.0766190998902305, + "grad_norm": 1.0563300848007202, + "learning_rate": 1.3392085981072636e-05, + "loss": 0.2036, + "step": 19616 + }, + { + "epoch": 1.076728869374314, + "grad_norm": 1.6893541812896729, + "learning_rate": 1.338744888201958e-05, + "loss": 0.2046, + "step": 19618 + }, + { + "epoch": 1.0768386388583973, + "grad_norm": 1.6277669668197632, + "learning_rate": 1.3382812292350023e-05, + "loss": 0.2702, + "step": 19620 + }, + { + "epoch": 1.0769484083424807, + "grad_norm": 1.5635466575622559, + "learning_rate": 1.3378176212267337e-05, + "loss": 0.1801, + "step": 19622 + }, + { + "epoch": 1.0770581778265642, + "grad_norm": 1.689727783203125, + "learning_rate": 1.3373540641974888e-05, + "loss": 0.208, + "step": 19624 + }, + { + "epoch": 1.0771679473106477, + "grad_norm": 0.7923365235328674, + "learning_rate": 1.3368905581676002e-05, + "loss": 0.1241, + "step": 19626 + }, + { + "epoch": 1.0772777167947312, + "grad_norm": 1.1211943626403809, + "learning_rate": 1.3364271031574016e-05, + "loss": 0.1644, + "step": 19628 + }, + { + "epoch": 1.0773874862788144, + "grad_norm": 1.2776367664337158, + "learning_rate": 1.3359636991872215e-05, + "loss": 0.2045, + "step": 19630 + }, + { + "epoch": 1.077497255762898, + "grad_norm": 1.7937343120574951, + "learning_rate": 1.3355003462773869e-05, + "loss": 0.2758, + "step": 19632 + }, + { + "epoch": 1.0776070252469814, + "grad_norm": 1.7808244228363037, + "learning_rate": 1.335037044448223e-05, + "loss": 0.2335, + "step": 19634 + }, + { + "epoch": 1.0777167947310649, + "grad_norm": 1.0654205083847046, + "learning_rate": 1.334573793720052e-05, + "loss": 0.2138, + "step": 19636 + }, + { + "epoch": 1.077826564215148, + "grad_norm": 0.913996160030365, + "learning_rate": 1.3341105941131937e-05, + "loss": 0.1558, + "step": 19638 + }, + { + "epoch": 1.0779363336992316, + "grad_norm": 1.0348273515701294, + "learning_rate": 1.3336474456479686e-05, + "loss": 0.2554, + "step": 19640 + }, + { + "epoch": 1.078046103183315, + "grad_norm": 1.6529607772827148, + "learning_rate": 1.333184348344691e-05, + "loss": 0.203, + "step": 19642 + }, + { + "epoch": 1.0781558726673985, + "grad_norm": 1.4427824020385742, + "learning_rate": 1.332721302223675e-05, + "loss": 0.2202, + "step": 19644 + }, + { + "epoch": 1.0782656421514818, + "grad_norm": 1.1963672637939453, + "learning_rate": 1.332258307305232e-05, + "loss": 0.164, + "step": 19646 + }, + { + "epoch": 1.0783754116355653, + "grad_norm": 0.9973081946372986, + "learning_rate": 1.3317953636096703e-05, + "loss": 0.1059, + "step": 19648 + }, + { + "epoch": 1.0784851811196488, + "grad_norm": 1.176116704940796, + "learning_rate": 1.331332471157299e-05, + "loss": 0.2011, + "step": 19650 + }, + { + "epoch": 1.0785949506037322, + "grad_norm": 1.2987172603607178, + "learning_rate": 1.3308696299684203e-05, + "loss": 0.1745, + "step": 19652 + }, + { + "epoch": 1.0787047200878157, + "grad_norm": 1.7306772470474243, + "learning_rate": 1.3304068400633397e-05, + "loss": 0.2466, + "step": 19654 + }, + { + "epoch": 1.078814489571899, + "grad_norm": 0.8693793416023254, + "learning_rate": 1.3299441014623552e-05, + "loss": 0.2113, + "step": 19656 + }, + { + "epoch": 1.0789242590559824, + "grad_norm": 1.6636005640029907, + "learning_rate": 1.3294814141857653e-05, + "loss": 0.2722, + "step": 19658 + }, + { + "epoch": 1.079034028540066, + "grad_norm": 0.9849826097488403, + "learning_rate": 1.3290187782538662e-05, + "loss": 0.1678, + "step": 19660 + }, + { + "epoch": 1.0791437980241492, + "grad_norm": 1.4409866333007812, + "learning_rate": 1.32855619368695e-05, + "loss": 0.2362, + "step": 19662 + }, + { + "epoch": 1.0792535675082326, + "grad_norm": 1.3257774114608765, + "learning_rate": 1.3280936605053095e-05, + "loss": 0.1981, + "step": 19664 + }, + { + "epoch": 1.0793633369923161, + "grad_norm": 1.5856091976165771, + "learning_rate": 1.3276311787292337e-05, + "loss": 0.3133, + "step": 19666 + }, + { + "epoch": 1.0794731064763996, + "grad_norm": 1.4228545427322388, + "learning_rate": 1.3271687483790084e-05, + "loss": 0.1493, + "step": 19668 + }, + { + "epoch": 1.079582875960483, + "grad_norm": 1.156121850013733, + "learning_rate": 1.3267063694749182e-05, + "loss": 0.1725, + "step": 19670 + }, + { + "epoch": 1.0796926454445663, + "grad_norm": 1.6220061779022217, + "learning_rate": 1.3262440420372456e-05, + "loss": 0.231, + "step": 19672 + }, + { + "epoch": 1.0798024149286498, + "grad_norm": 0.9510533213615417, + "learning_rate": 1.3257817660862698e-05, + "loss": 0.148, + "step": 19674 + }, + { + "epoch": 1.0799121844127333, + "grad_norm": 0.8299481272697449, + "learning_rate": 1.3253195416422702e-05, + "loss": 0.1428, + "step": 19676 + }, + { + "epoch": 1.0800219538968168, + "grad_norm": 0.8871111273765564, + "learning_rate": 1.3248573687255198e-05, + "loss": 0.1394, + "step": 19678 + }, + { + "epoch": 1.0801317233809, + "grad_norm": 0.9999939203262329, + "learning_rate": 1.3243952473562942e-05, + "loss": 0.1769, + "step": 19680 + }, + { + "epoch": 1.0802414928649835, + "grad_norm": 1.7985714673995972, + "learning_rate": 1.3239331775548634e-05, + "loss": 0.2655, + "step": 19682 + }, + { + "epoch": 1.080351262349067, + "grad_norm": 1.423341155052185, + "learning_rate": 1.3234711593414956e-05, + "loss": 0.26, + "step": 19684 + }, + { + "epoch": 1.0804610318331505, + "grad_norm": 1.095317006111145, + "learning_rate": 1.3230091927364579e-05, + "loss": 0.144, + "step": 19686 + }, + { + "epoch": 1.0805708013172337, + "grad_norm": 1.1231002807617188, + "learning_rate": 1.322547277760013e-05, + "loss": 0.1546, + "step": 19688 + }, + { + "epoch": 1.0806805708013172, + "grad_norm": 0.8104836940765381, + "learning_rate": 1.3220854144324243e-05, + "loss": 0.1768, + "step": 19690 + }, + { + "epoch": 1.0807903402854007, + "grad_norm": 1.3847662210464478, + "learning_rate": 1.3216236027739512e-05, + "loss": 0.2702, + "step": 19692 + }, + { + "epoch": 1.0809001097694841, + "grad_norm": 1.6540353298187256, + "learning_rate": 1.3211618428048506e-05, + "loss": 0.241, + "step": 19694 + }, + { + "epoch": 1.0810098792535676, + "grad_norm": 0.8714056611061096, + "learning_rate": 1.3207001345453778e-05, + "loss": 0.2008, + "step": 19696 + }, + { + "epoch": 1.0811196487376509, + "grad_norm": 1.2986325025558472, + "learning_rate": 1.3202384780157853e-05, + "loss": 0.2407, + "step": 19698 + }, + { + "epoch": 1.0812294182217344, + "grad_norm": 1.5356194972991943, + "learning_rate": 1.3197768732363231e-05, + "loss": 0.185, + "step": 19700 + }, + { + "epoch": 1.0813391877058178, + "grad_norm": 1.0310289859771729, + "learning_rate": 1.3193153202272407e-05, + "loss": 0.1158, + "step": 19702 + }, + { + "epoch": 1.0814489571899013, + "grad_norm": 1.3220795392990112, + "learning_rate": 1.3188538190087835e-05, + "loss": 0.174, + "step": 19704 + }, + { + "epoch": 1.0815587266739846, + "grad_norm": 1.3199189901351929, + "learning_rate": 1.3183923696011952e-05, + "loss": 0.1711, + "step": 19706 + }, + { + "epoch": 1.081668496158068, + "grad_norm": 1.191901445388794, + "learning_rate": 1.3179309720247166e-05, + "loss": 0.2531, + "step": 19708 + }, + { + "epoch": 1.0817782656421515, + "grad_norm": 3.546302556991577, + "learning_rate": 1.3174696262995883e-05, + "loss": 0.2269, + "step": 19710 + }, + { + "epoch": 1.081888035126235, + "grad_norm": 1.2042882442474365, + "learning_rate": 1.3170083324460453e-05, + "loss": 0.1898, + "step": 19712 + }, + { + "epoch": 1.0819978046103182, + "grad_norm": 1.1109586954116821, + "learning_rate": 1.3165470904843243e-05, + "loss": 0.2464, + "step": 19714 + }, + { + "epoch": 1.0821075740944017, + "grad_norm": 1.8825935125350952, + "learning_rate": 1.3160859004346567e-05, + "loss": 0.2299, + "step": 19716 + }, + { + "epoch": 1.0822173435784852, + "grad_norm": 1.8920559883117676, + "learning_rate": 1.3156247623172727e-05, + "loss": 0.2261, + "step": 19718 + }, + { + "epoch": 1.0823271130625687, + "grad_norm": 1.2549142837524414, + "learning_rate": 1.3151636761523995e-05, + "loss": 0.2561, + "step": 19720 + }, + { + "epoch": 1.082436882546652, + "grad_norm": 1.1311320066452026, + "learning_rate": 1.3147026419602632e-05, + "loss": 0.1262, + "step": 19722 + }, + { + "epoch": 1.0825466520307354, + "grad_norm": 1.3227159976959229, + "learning_rate": 1.3142416597610857e-05, + "loss": 0.2559, + "step": 19724 + }, + { + "epoch": 1.082656421514819, + "grad_norm": 0.9270760416984558, + "learning_rate": 1.3137807295750904e-05, + "loss": 0.1444, + "step": 19726 + }, + { + "epoch": 1.0827661909989024, + "grad_norm": 1.193281650543213, + "learning_rate": 1.3133198514224942e-05, + "loss": 0.1086, + "step": 19728 + }, + { + "epoch": 1.0828759604829856, + "grad_norm": 1.2384177446365356, + "learning_rate": 1.312859025323514e-05, + "loss": 0.2615, + "step": 19730 + }, + { + "epoch": 1.082985729967069, + "grad_norm": 1.8437529802322388, + "learning_rate": 1.312398251298364e-05, + "loss": 0.2329, + "step": 19732 + }, + { + "epoch": 1.0830954994511526, + "grad_norm": 1.321045160293579, + "learning_rate": 1.3119375293672554e-05, + "loss": 0.1903, + "step": 19734 + }, + { + "epoch": 1.083205268935236, + "grad_norm": 1.0003232955932617, + "learning_rate": 1.3114768595503973e-05, + "loss": 0.2714, + "step": 19736 + }, + { + "epoch": 1.0833150384193195, + "grad_norm": 1.5891445875167847, + "learning_rate": 1.3110162418679978e-05, + "loss": 0.2377, + "step": 19738 + }, + { + "epoch": 1.0834248079034028, + "grad_norm": 1.7950429916381836, + "learning_rate": 1.3105556763402626e-05, + "loss": 0.2092, + "step": 19740 + }, + { + "epoch": 1.0835345773874863, + "grad_norm": 1.1791400909423828, + "learning_rate": 1.310095162987394e-05, + "loss": 0.1493, + "step": 19742 + }, + { + "epoch": 1.0836443468715697, + "grad_norm": 1.4268040657043457, + "learning_rate": 1.3096347018295917e-05, + "loss": 0.2419, + "step": 19744 + }, + { + "epoch": 1.0837541163556532, + "grad_norm": 1.25644850730896, + "learning_rate": 1.309174292887054e-05, + "loss": 0.1598, + "step": 19746 + }, + { + "epoch": 1.0838638858397365, + "grad_norm": 0.596783459186554, + "learning_rate": 1.3087139361799766e-05, + "loss": 0.1184, + "step": 19748 + }, + { + "epoch": 1.08397365532382, + "grad_norm": 1.126944661140442, + "learning_rate": 1.3082536317285527e-05, + "loss": 0.2495, + "step": 19750 + }, + { + "epoch": 1.0840834248079034, + "grad_norm": 1.29551362991333, + "learning_rate": 1.3077933795529743e-05, + "loss": 0.1984, + "step": 19752 + }, + { + "epoch": 1.084193194291987, + "grad_norm": 1.2033253908157349, + "learning_rate": 1.30733317967343e-05, + "loss": 0.2765, + "step": 19754 + }, + { + "epoch": 1.0843029637760702, + "grad_norm": 1.4907779693603516, + "learning_rate": 1.3068730321101069e-05, + "loss": 0.1985, + "step": 19756 + }, + { + "epoch": 1.0844127332601536, + "grad_norm": 1.4016194343566895, + "learning_rate": 1.3064129368831884e-05, + "loss": 0.1338, + "step": 19758 + }, + { + "epoch": 1.0845225027442371, + "grad_norm": 1.2509797811508179, + "learning_rate": 1.3059528940128562e-05, + "loss": 0.2845, + "step": 19760 + }, + { + "epoch": 1.0846322722283206, + "grad_norm": 2.686894655227661, + "learning_rate": 1.3054929035192915e-05, + "loss": 0.1715, + "step": 19762 + }, + { + "epoch": 1.0847420417124038, + "grad_norm": 0.8472687005996704, + "learning_rate": 1.3050329654226712e-05, + "loss": 0.1301, + "step": 19764 + }, + { + "epoch": 1.0848518111964873, + "grad_norm": 0.9267050623893738, + "learning_rate": 1.3045730797431704e-05, + "loss": 0.1964, + "step": 19766 + }, + { + "epoch": 1.0849615806805708, + "grad_norm": 1.3783488273620605, + "learning_rate": 1.3041132465009618e-05, + "loss": 0.1518, + "step": 19768 + }, + { + "epoch": 1.0850713501646543, + "grad_norm": 1.3825604915618896, + "learning_rate": 1.3036534657162148e-05, + "loss": 0.2043, + "step": 19770 + }, + { + "epoch": 1.0851811196487375, + "grad_norm": 1.1892300844192505, + "learning_rate": 1.3031937374091e-05, + "loss": 0.1939, + "step": 19772 + }, + { + "epoch": 1.085290889132821, + "grad_norm": 2.464061975479126, + "learning_rate": 1.302734061599781e-05, + "loss": 0.2222, + "step": 19774 + }, + { + "epoch": 1.0854006586169045, + "grad_norm": 1.3107951879501343, + "learning_rate": 1.3022744383084237e-05, + "loss": 0.0987, + "step": 19776 + }, + { + "epoch": 1.085510428100988, + "grad_norm": 1.1111475229263306, + "learning_rate": 1.3018148675551884e-05, + "loss": 0.2131, + "step": 19778 + }, + { + "epoch": 1.0856201975850714, + "grad_norm": 1.050864577293396, + "learning_rate": 1.3013553493602338e-05, + "loss": 0.2271, + "step": 19780 + }, + { + "epoch": 1.0857299670691547, + "grad_norm": 1.749837040901184, + "learning_rate": 1.3008958837437169e-05, + "loss": 0.2508, + "step": 19782 + }, + { + "epoch": 1.0858397365532382, + "grad_norm": 0.8106648325920105, + "learning_rate": 1.3004364707257922e-05, + "loss": 0.1996, + "step": 19784 + }, + { + "epoch": 1.0859495060373217, + "grad_norm": 0.7452597618103027, + "learning_rate": 1.2999771103266106e-05, + "loss": 0.1246, + "step": 19786 + }, + { + "epoch": 1.0860592755214051, + "grad_norm": 0.961455225944519, + "learning_rate": 1.299517802566324e-05, + "loss": 0.1428, + "step": 19788 + }, + { + "epoch": 1.0861690450054884, + "grad_norm": 1.4807227849960327, + "learning_rate": 1.299058547465079e-05, + "loss": 0.1802, + "step": 19790 + }, + { + "epoch": 1.0862788144895719, + "grad_norm": 1.1948919296264648, + "learning_rate": 1.2985993450430207e-05, + "loss": 0.1603, + "step": 19792 + }, + { + "epoch": 1.0863885839736553, + "grad_norm": 1.230059266090393, + "learning_rate": 1.2981401953202918e-05, + "loss": 0.2436, + "step": 19794 + }, + { + "epoch": 1.0864983534577388, + "grad_norm": 0.8618940711021423, + "learning_rate": 1.297681098317032e-05, + "loss": 0.1837, + "step": 19796 + }, + { + "epoch": 1.086608122941822, + "grad_norm": 1.2022396326065063, + "learning_rate": 1.297222054053382e-05, + "loss": 0.2438, + "step": 19798 + }, + { + "epoch": 1.0867178924259056, + "grad_norm": 1.2612377405166626, + "learning_rate": 1.2967630625494747e-05, + "loss": 0.3706, + "step": 19800 + }, + { + "epoch": 1.086827661909989, + "grad_norm": 0.8535941243171692, + "learning_rate": 1.2963041238254464e-05, + "loss": 0.0964, + "step": 19802 + }, + { + "epoch": 1.0869374313940725, + "grad_norm": 0.991797149181366, + "learning_rate": 1.2958452379014274e-05, + "loss": 0.1493, + "step": 19804 + }, + { + "epoch": 1.087047200878156, + "grad_norm": 1.1442878246307373, + "learning_rate": 1.2953864047975466e-05, + "loss": 0.164, + "step": 19806 + }, + { + "epoch": 1.0871569703622392, + "grad_norm": 1.6145087480545044, + "learning_rate": 1.294927624533931e-05, + "loss": 0.2593, + "step": 19808 + }, + { + "epoch": 1.0872667398463227, + "grad_norm": 1.4884439706802368, + "learning_rate": 1.2944688971307035e-05, + "loss": 0.2505, + "step": 19810 + }, + { + "epoch": 1.0873765093304062, + "grad_norm": 1.5648220777511597, + "learning_rate": 1.294010222607988e-05, + "loss": 0.2614, + "step": 19812 + }, + { + "epoch": 1.0874862788144897, + "grad_norm": 1.1711772680282593, + "learning_rate": 1.2935516009859035e-05, + "loss": 0.1656, + "step": 19814 + }, + { + "epoch": 1.087596048298573, + "grad_norm": 1.4114789962768555, + "learning_rate": 1.2930930322845675e-05, + "loss": 0.1501, + "step": 19816 + }, + { + "epoch": 1.0877058177826564, + "grad_norm": 5.404414176940918, + "learning_rate": 1.292634516524095e-05, + "loss": 0.2508, + "step": 19818 + }, + { + "epoch": 1.0878155872667399, + "grad_norm": 0.9473867416381836, + "learning_rate": 1.2921760537245986e-05, + "loss": 0.2242, + "step": 19820 + }, + { + "epoch": 1.0879253567508234, + "grad_norm": 1.5343573093414307, + "learning_rate": 1.291717643906188e-05, + "loss": 0.3101, + "step": 19822 + }, + { + "epoch": 1.0880351262349066, + "grad_norm": 1.7635623216629028, + "learning_rate": 1.291259287088973e-05, + "loss": 0.2492, + "step": 19824 + }, + { + "epoch": 1.08814489571899, + "grad_norm": 2.0073540210723877, + "learning_rate": 1.2908009832930584e-05, + "loss": 0.1989, + "step": 19826 + }, + { + "epoch": 1.0882546652030736, + "grad_norm": 2.0145184993743896, + "learning_rate": 1.290342732538548e-05, + "loss": 0.2965, + "step": 19828 + }, + { + "epoch": 1.088364434687157, + "grad_norm": 1.0110691785812378, + "learning_rate": 1.2898845348455418e-05, + "loss": 0.198, + "step": 19830 + }, + { + "epoch": 1.0884742041712403, + "grad_norm": 1.0678529739379883, + "learning_rate": 1.2894263902341402e-05, + "loss": 0.2384, + "step": 19832 + }, + { + "epoch": 1.0885839736553238, + "grad_norm": 1.0771018266677856, + "learning_rate": 1.288968298724439e-05, + "loss": 0.3148, + "step": 19834 + }, + { + "epoch": 1.0886937431394073, + "grad_norm": 1.7273615598678589, + "learning_rate": 1.2885102603365314e-05, + "loss": 0.2543, + "step": 19836 + }, + { + "epoch": 1.0888035126234907, + "grad_norm": 1.1987640857696533, + "learning_rate": 1.2880522750905111e-05, + "loss": 0.1898, + "step": 19838 + }, + { + "epoch": 1.088913282107574, + "grad_norm": 1.0608175992965698, + "learning_rate": 1.287594343006467e-05, + "loss": 0.2184, + "step": 19840 + }, + { + "epoch": 1.0890230515916575, + "grad_norm": 1.4786629676818848, + "learning_rate": 1.2871364641044853e-05, + "loss": 0.3394, + "step": 19842 + }, + { + "epoch": 1.089132821075741, + "grad_norm": 1.0687308311462402, + "learning_rate": 1.2866786384046514e-05, + "loss": 0.1718, + "step": 19844 + }, + { + "epoch": 1.0892425905598244, + "grad_norm": 0.9194145798683167, + "learning_rate": 1.286220865927047e-05, + "loss": 0.2688, + "step": 19846 + }, + { + "epoch": 1.089352360043908, + "grad_norm": 1.1085731983184814, + "learning_rate": 1.2857631466917539e-05, + "loss": 0.1672, + "step": 19848 + }, + { + "epoch": 1.0894621295279912, + "grad_norm": 1.5226414203643799, + "learning_rate": 1.2853054807188488e-05, + "loss": 0.2358, + "step": 19850 + }, + { + "epoch": 1.0895718990120746, + "grad_norm": 0.8942558765411377, + "learning_rate": 1.2848478680284077e-05, + "loss": 0.1318, + "step": 19852 + }, + { + "epoch": 1.089681668496158, + "grad_norm": 0.7999584078788757, + "learning_rate": 1.2843903086405035e-05, + "loss": 0.1874, + "step": 19854 + }, + { + "epoch": 1.0897914379802416, + "grad_norm": 1.6145124435424805, + "learning_rate": 1.2839328025752064e-05, + "loss": 0.145, + "step": 19856 + }, + { + "epoch": 1.0899012074643248, + "grad_norm": 1.0983433723449707, + "learning_rate": 1.2834753498525848e-05, + "loss": 0.1989, + "step": 19858 + }, + { + "epoch": 1.0900109769484083, + "grad_norm": 2.0054574012756348, + "learning_rate": 1.2830179504927054e-05, + "loss": 0.2053, + "step": 19860 + }, + { + "epoch": 1.0901207464324918, + "grad_norm": 1.1073635816574097, + "learning_rate": 1.282560604515633e-05, + "loss": 0.1623, + "step": 19862 + }, + { + "epoch": 1.0902305159165753, + "grad_norm": 1.6382001638412476, + "learning_rate": 1.2821033119414278e-05, + "loss": 0.2245, + "step": 19864 + }, + { + "epoch": 1.0903402854006585, + "grad_norm": 1.1573631763458252, + "learning_rate": 1.2816460727901494e-05, + "loss": 0.1595, + "step": 19866 + }, + { + "epoch": 1.090450054884742, + "grad_norm": 1.036177396774292, + "learning_rate": 1.2811888870818543e-05, + "loss": 0.1956, + "step": 19868 + }, + { + "epoch": 1.0905598243688255, + "grad_norm": 1.2905761003494263, + "learning_rate": 1.2807317548365966e-05, + "loss": 0.2043, + "step": 19870 + }, + { + "epoch": 1.090669593852909, + "grad_norm": 0.9829495549201965, + "learning_rate": 1.2802746760744277e-05, + "loss": 0.2021, + "step": 19872 + }, + { + "epoch": 1.0907793633369922, + "grad_norm": 1.152284860610962, + "learning_rate": 1.2798176508153998e-05, + "loss": 0.1685, + "step": 19874 + }, + { + "epoch": 1.0908891328210757, + "grad_norm": 0.9162967205047607, + "learning_rate": 1.279360679079558e-05, + "loss": 0.1754, + "step": 19876 + }, + { + "epoch": 1.0909989023051592, + "grad_norm": 1.2456164360046387, + "learning_rate": 1.2789037608869487e-05, + "loss": 0.1383, + "step": 19878 + }, + { + "epoch": 1.0911086717892426, + "grad_norm": 0.962680995464325, + "learning_rate": 1.2784468962576136e-05, + "loss": 0.1461, + "step": 19880 + }, + { + "epoch": 1.091218441273326, + "grad_norm": 1.1049858331680298, + "learning_rate": 1.2779900852115922e-05, + "loss": 0.2185, + "step": 19882 + }, + { + "epoch": 1.0913282107574094, + "grad_norm": 1.9798238277435303, + "learning_rate": 1.277533327768925e-05, + "loss": 0.2332, + "step": 19884 + }, + { + "epoch": 1.0914379802414929, + "grad_norm": 1.333767294883728, + "learning_rate": 1.277076623949646e-05, + "loss": 0.2298, + "step": 19886 + }, + { + "epoch": 1.0915477497255763, + "grad_norm": 1.0642324686050415, + "learning_rate": 1.2766199737737888e-05, + "loss": 0.1737, + "step": 19888 + }, + { + "epoch": 1.0916575192096598, + "grad_norm": 1.1387337446212769, + "learning_rate": 1.2761633772613832e-05, + "loss": 0.2295, + "step": 19890 + }, + { + "epoch": 1.091767288693743, + "grad_norm": 1.0065464973449707, + "learning_rate": 1.27570683443246e-05, + "loss": 0.182, + "step": 19892 + }, + { + "epoch": 1.0918770581778265, + "grad_norm": 1.015412449836731, + "learning_rate": 1.2752503453070441e-05, + "loss": 0.1793, + "step": 19894 + }, + { + "epoch": 1.09198682766191, + "grad_norm": 1.145885705947876, + "learning_rate": 1.2747939099051586e-05, + "loss": 0.2103, + "step": 19896 + }, + { + "epoch": 1.0920965971459935, + "grad_norm": 1.799263834953308, + "learning_rate": 1.2743375282468267e-05, + "loss": 0.2446, + "step": 19898 + }, + { + "epoch": 1.0922063666300768, + "grad_norm": 0.863738477230072, + "learning_rate": 1.2738812003520667e-05, + "loss": 0.2115, + "step": 19900 + }, + { + "epoch": 1.0923161361141602, + "grad_norm": 1.3939522504806519, + "learning_rate": 1.2734249262408957e-05, + "loss": 0.2346, + "step": 19902 + }, + { + "epoch": 1.0924259055982437, + "grad_norm": 1.6562221050262451, + "learning_rate": 1.2729687059333276e-05, + "loss": 0.2854, + "step": 19904 + }, + { + "epoch": 1.0925356750823272, + "grad_norm": 2.3474042415618896, + "learning_rate": 1.2725125394493748e-05, + "loss": 0.2259, + "step": 19906 + }, + { + "epoch": 1.0926454445664104, + "grad_norm": 1.1314748525619507, + "learning_rate": 1.2720564268090457e-05, + "loss": 0.2572, + "step": 19908 + }, + { + "epoch": 1.092755214050494, + "grad_norm": 1.3567768335342407, + "learning_rate": 1.2716003680323499e-05, + "loss": 0.3279, + "step": 19910 + }, + { + "epoch": 1.0928649835345774, + "grad_norm": 1.133739709854126, + "learning_rate": 1.271144363139291e-05, + "loss": 0.1994, + "step": 19912 + }, + { + "epoch": 1.0929747530186609, + "grad_norm": 1.0174669027328491, + "learning_rate": 1.2706884121498724e-05, + "loss": 0.1241, + "step": 19914 + }, + { + "epoch": 1.0930845225027443, + "grad_norm": 1.3841184377670288, + "learning_rate": 1.2702325150840937e-05, + "loss": 0.2628, + "step": 19916 + }, + { + "epoch": 1.0931942919868276, + "grad_norm": 0.8379765748977661, + "learning_rate": 1.269776671961953e-05, + "loss": 0.1783, + "step": 19918 + }, + { + "epoch": 1.093304061470911, + "grad_norm": 1.2465656995773315, + "learning_rate": 1.2693208828034448e-05, + "loss": 0.2759, + "step": 19920 + }, + { + "epoch": 1.0934138309549946, + "grad_norm": 1.4960542917251587, + "learning_rate": 1.2688651476285629e-05, + "loss": 0.1558, + "step": 19922 + }, + { + "epoch": 1.093523600439078, + "grad_norm": 0.981073260307312, + "learning_rate": 1.2684094664572999e-05, + "loss": 0.1639, + "step": 19924 + }, + { + "epoch": 1.0936333699231613, + "grad_norm": 0.8883768916130066, + "learning_rate": 1.267953839309642e-05, + "loss": 0.1651, + "step": 19926 + }, + { + "epoch": 1.0937431394072448, + "grad_norm": 1.7031110525131226, + "learning_rate": 1.2674982662055765e-05, + "loss": 0.2385, + "step": 19928 + }, + { + "epoch": 1.0938529088913282, + "grad_norm": 1.1151961088180542, + "learning_rate": 1.2670427471650864e-05, + "loss": 0.2075, + "step": 19930 + }, + { + "epoch": 1.0939626783754117, + "grad_norm": 1.1119743585586548, + "learning_rate": 1.266587282208152e-05, + "loss": 0.2187, + "step": 19932 + }, + { + "epoch": 1.094072447859495, + "grad_norm": 1.0857199430465698, + "learning_rate": 1.2661318713547542e-05, + "loss": 0.1939, + "step": 19934 + }, + { + "epoch": 1.0941822173435785, + "grad_norm": 0.8635862469673157, + "learning_rate": 1.2656765146248691e-05, + "loss": 0.1165, + "step": 19936 + }, + { + "epoch": 1.094291986827662, + "grad_norm": 1.292633056640625, + "learning_rate": 1.2652212120384705e-05, + "loss": 0.1788, + "step": 19938 + }, + { + "epoch": 1.0944017563117454, + "grad_norm": 0.7117019891738892, + "learning_rate": 1.2647659636155298e-05, + "loss": 0.1219, + "step": 19940 + }, + { + "epoch": 1.0945115257958287, + "grad_norm": 1.079418420791626, + "learning_rate": 1.264310769376017e-05, + "loss": 0.1995, + "step": 19942 + }, + { + "epoch": 1.0946212952799121, + "grad_norm": 0.8300760388374329, + "learning_rate": 1.2638556293398978e-05, + "loss": 0.1254, + "step": 19944 + }, + { + "epoch": 1.0947310647639956, + "grad_norm": 0.986225426197052, + "learning_rate": 1.2634005435271395e-05, + "loss": 0.2555, + "step": 19946 + }, + { + "epoch": 1.094840834248079, + "grad_norm": 1.445167899131775, + "learning_rate": 1.2629455119577027e-05, + "loss": 0.2282, + "step": 19948 + }, + { + "epoch": 1.0949506037321624, + "grad_norm": 1.3812267780303955, + "learning_rate": 1.2624905346515465e-05, + "loss": 0.169, + "step": 19950 + }, + { + "epoch": 1.0950603732162458, + "grad_norm": 1.0392003059387207, + "learning_rate": 1.2620356116286309e-05, + "loss": 0.2163, + "step": 19952 + }, + { + "epoch": 1.0951701427003293, + "grad_norm": 2.2857885360717773, + "learning_rate": 1.2615807429089091e-05, + "loss": 0.1885, + "step": 19954 + }, + { + "epoch": 1.0952799121844128, + "grad_norm": 1.6694427728652954, + "learning_rate": 1.2611259285123345e-05, + "loss": 0.2701, + "step": 19956 + }, + { + "epoch": 1.0953896816684963, + "grad_norm": 2.0698347091674805, + "learning_rate": 1.2606711684588568e-05, + "loss": 0.2037, + "step": 19958 + }, + { + "epoch": 1.0954994511525795, + "grad_norm": 1.0014405250549316, + "learning_rate": 1.2602164627684254e-05, + "loss": 0.185, + "step": 19960 + }, + { + "epoch": 1.095609220636663, + "grad_norm": 1.8683944940567017, + "learning_rate": 1.259761811460985e-05, + "loss": 0.2055, + "step": 19962 + }, + { + "epoch": 1.0957189901207465, + "grad_norm": 1.3317713737487793, + "learning_rate": 1.2593072145564794e-05, + "loss": 0.2351, + "step": 19964 + }, + { + "epoch": 1.09582875960483, + "grad_norm": 0.9737566709518433, + "learning_rate": 1.258852672074849e-05, + "loss": 0.1278, + "step": 19966 + }, + { + "epoch": 1.0959385290889132, + "grad_norm": 0.9460033178329468, + "learning_rate": 1.258398184036031e-05, + "loss": 0.228, + "step": 19968 + }, + { + "epoch": 1.0960482985729967, + "grad_norm": 1.3996909856796265, + "learning_rate": 1.2579437504599639e-05, + "loss": 0.3003, + "step": 19970 + }, + { + "epoch": 1.0961580680570802, + "grad_norm": 1.0612847805023193, + "learning_rate": 1.2574893713665804e-05, + "loss": 0.1569, + "step": 19972 + }, + { + "epoch": 1.0962678375411636, + "grad_norm": 0.8157576322555542, + "learning_rate": 1.2570350467758113e-05, + "loss": 0.1854, + "step": 19974 + }, + { + "epoch": 1.096377607025247, + "grad_norm": 1.512663722038269, + "learning_rate": 1.2565807767075863e-05, + "loss": 0.1632, + "step": 19976 + }, + { + "epoch": 1.0964873765093304, + "grad_norm": 1.1094661951065063, + "learning_rate": 1.2561265611818312e-05, + "loss": 0.1849, + "step": 19978 + }, + { + "epoch": 1.0965971459934138, + "grad_norm": 1.165393352508545, + "learning_rate": 1.2556724002184696e-05, + "loss": 0.186, + "step": 19980 + }, + { + "epoch": 1.0967069154774973, + "grad_norm": 1.2134164571762085, + "learning_rate": 1.255218293837424e-05, + "loss": 0.1872, + "step": 19982 + }, + { + "epoch": 1.0968166849615806, + "grad_norm": 0.984977126121521, + "learning_rate": 1.2547642420586148e-05, + "loss": 0.2013, + "step": 19984 + }, + { + "epoch": 1.096926454445664, + "grad_norm": 1.6193573474884033, + "learning_rate": 1.2543102449019578e-05, + "loss": 0.2675, + "step": 19986 + }, + { + "epoch": 1.0970362239297475, + "grad_norm": 1.4591070413589478, + "learning_rate": 1.2538563023873679e-05, + "loss": 0.2482, + "step": 19988 + }, + { + "epoch": 1.097145993413831, + "grad_norm": 1.2668598890304565, + "learning_rate": 1.253402414534757e-05, + "loss": 0.1963, + "step": 19990 + }, + { + "epoch": 1.0972557628979143, + "grad_norm": 1.1342787742614746, + "learning_rate": 1.2529485813640345e-05, + "loss": 0.163, + "step": 19992 + }, + { + "epoch": 1.0973655323819977, + "grad_norm": 1.107843041419983, + "learning_rate": 1.2524948028951079e-05, + "loss": 0.1417, + "step": 19994 + }, + { + "epoch": 1.0974753018660812, + "grad_norm": 0.968218207359314, + "learning_rate": 1.2520410791478826e-05, + "loss": 0.1826, + "step": 19996 + }, + { + "epoch": 1.0975850713501647, + "grad_norm": 1.2985742092132568, + "learning_rate": 1.251587410142261e-05, + "loss": 0.2172, + "step": 19998 + }, + { + "epoch": 1.0976948408342482, + "grad_norm": 0.9772369265556335, + "learning_rate": 1.2511337958981434e-05, + "loss": 0.1896, + "step": 20000 + }, + { + "epoch": 1.0978046103183314, + "grad_norm": 1.9751849174499512, + "learning_rate": 1.2506802364354269e-05, + "loss": 0.244, + "step": 20002 + }, + { + "epoch": 1.097914379802415, + "grad_norm": 1.645935297012329, + "learning_rate": 1.250226731774008e-05, + "loss": 0.2993, + "step": 20004 + }, + { + "epoch": 1.0980241492864984, + "grad_norm": 1.1935484409332275, + "learning_rate": 1.2497732819337773e-05, + "loss": 0.1991, + "step": 20006 + }, + { + "epoch": 1.0981339187705819, + "grad_norm": 1.592474341392517, + "learning_rate": 1.2493198869346279e-05, + "loss": 0.303, + "step": 20008 + }, + { + "epoch": 1.0982436882546651, + "grad_norm": 1.1270372867584229, + "learning_rate": 1.2488665467964472e-05, + "loss": 0.1307, + "step": 20010 + }, + { + "epoch": 1.0983534577387486, + "grad_norm": 1.163397192955017, + "learning_rate": 1.2484132615391197e-05, + "loss": 0.1445, + "step": 20012 + }, + { + "epoch": 1.098463227222832, + "grad_norm": 1.6082602739334106, + "learning_rate": 1.2479600311825306e-05, + "loss": 0.1678, + "step": 20014 + }, + { + "epoch": 1.0985729967069155, + "grad_norm": 1.4850897789001465, + "learning_rate": 1.24750685574656e-05, + "loss": 0.2287, + "step": 20016 + }, + { + "epoch": 1.0986827661909988, + "grad_norm": 1.1562166213989258, + "learning_rate": 1.2470537352510853e-05, + "loss": 0.1623, + "step": 20018 + }, + { + "epoch": 1.0987925356750823, + "grad_norm": 1.3364840745925903, + "learning_rate": 1.2466006697159843e-05, + "loss": 0.1436, + "step": 20020 + }, + { + "epoch": 1.0989023051591658, + "grad_norm": 1.358465552330017, + "learning_rate": 1.24614765916113e-05, + "loss": 0.199, + "step": 20022 + }, + { + "epoch": 1.0990120746432492, + "grad_norm": 0.8982688784599304, + "learning_rate": 1.245694703606394e-05, + "loss": 0.1455, + "step": 20024 + }, + { + "epoch": 1.0991218441273327, + "grad_norm": 1.297601342201233, + "learning_rate": 1.2452418030716448e-05, + "loss": 0.1715, + "step": 20026 + }, + { + "epoch": 1.099231613611416, + "grad_norm": 1.3876320123672485, + "learning_rate": 1.2447889575767485e-05, + "loss": 0.1885, + "step": 20028 + }, + { + "epoch": 1.0993413830954994, + "grad_norm": 0.9537897706031799, + "learning_rate": 1.2443361671415687e-05, + "loss": 0.1391, + "step": 20030 + }, + { + "epoch": 1.099451152579583, + "grad_norm": 1.123855710029602, + "learning_rate": 1.2438834317859688e-05, + "loss": 0.2371, + "step": 20032 + }, + { + "epoch": 1.0995609220636664, + "grad_norm": 0.8425115346908569, + "learning_rate": 1.2434307515298067e-05, + "loss": 0.1697, + "step": 20034 + }, + { + "epoch": 1.0996706915477497, + "grad_norm": 1.5841273069381714, + "learning_rate": 1.2429781263929397e-05, + "loss": 0.2494, + "step": 20036 + }, + { + "epoch": 1.0997804610318331, + "grad_norm": 1.6098506450653076, + "learning_rate": 1.2425255563952218e-05, + "loss": 0.2742, + "step": 20038 + }, + { + "epoch": 1.0998902305159166, + "grad_norm": 2.3287150859832764, + "learning_rate": 1.2420730415565048e-05, + "loss": 0.253, + "step": 20040 + }, + { + "epoch": 1.1, + "grad_norm": 1.1516737937927246, + "learning_rate": 1.2416205818966379e-05, + "loss": 0.1962, + "step": 20042 + }, + { + "epoch": 1.1001097694840833, + "grad_norm": 1.6863586902618408, + "learning_rate": 1.2411681774354686e-05, + "loss": 0.2652, + "step": 20044 + }, + { + "epoch": 1.1002195389681668, + "grad_norm": 1.0543344020843506, + "learning_rate": 1.2407158281928427e-05, + "loss": 0.2119, + "step": 20046 + }, + { + "epoch": 1.1003293084522503, + "grad_norm": 1.1108680963516235, + "learning_rate": 1.2402635341886016e-05, + "loss": 0.2171, + "step": 20048 + }, + { + "epoch": 1.1004390779363338, + "grad_norm": 1.0974589586257935, + "learning_rate": 1.2398112954425848e-05, + "loss": 0.1433, + "step": 20050 + }, + { + "epoch": 1.100548847420417, + "grad_norm": 0.8987396955490112, + "learning_rate": 1.2393591119746301e-05, + "loss": 0.0781, + "step": 20052 + }, + { + "epoch": 1.1006586169045005, + "grad_norm": 0.8174713253974915, + "learning_rate": 1.2389069838045724e-05, + "loss": 0.2831, + "step": 20054 + }, + { + "epoch": 1.100768386388584, + "grad_norm": 1.3935319185256958, + "learning_rate": 1.2384549109522431e-05, + "loss": 0.211, + "step": 20056 + }, + { + "epoch": 1.1008781558726675, + "grad_norm": 0.6377102136611938, + "learning_rate": 1.2380028934374746e-05, + "loss": 0.0989, + "step": 20058 + }, + { + "epoch": 1.1009879253567507, + "grad_norm": 1.4193209409713745, + "learning_rate": 1.2375509312800934e-05, + "loss": 0.1994, + "step": 20060 + }, + { + "epoch": 1.1010976948408342, + "grad_norm": 1.1001695394515991, + "learning_rate": 1.2370990244999247e-05, + "loss": 0.2537, + "step": 20062 + }, + { + "epoch": 1.1012074643249177, + "grad_norm": 1.3360944986343384, + "learning_rate": 1.2366471731167913e-05, + "loss": 0.1938, + "step": 20064 + }, + { + "epoch": 1.1013172338090012, + "grad_norm": 1.0466235876083374, + "learning_rate": 1.2361953771505133e-05, + "loss": 0.2156, + "step": 20066 + }, + { + "epoch": 1.1014270032930846, + "grad_norm": 1.1182764768600464, + "learning_rate": 1.2357436366209096e-05, + "loss": 0.1973, + "step": 20068 + }, + { + "epoch": 1.1015367727771679, + "grad_norm": 1.05155611038208, + "learning_rate": 1.2352919515477957e-05, + "loss": 0.1269, + "step": 20070 + }, + { + "epoch": 1.1016465422612514, + "grad_norm": 1.0418983697891235, + "learning_rate": 1.2348403219509832e-05, + "loss": 0.1585, + "step": 20072 + }, + { + "epoch": 1.1017563117453348, + "grad_norm": 0.8541826605796814, + "learning_rate": 1.2343887478502852e-05, + "loss": 0.1744, + "step": 20074 + }, + { + "epoch": 1.1018660812294183, + "grad_norm": 1.4311937093734741, + "learning_rate": 1.2339372292655085e-05, + "loss": 0.1145, + "step": 20076 + }, + { + "epoch": 1.1019758507135016, + "grad_norm": 1.1250578165054321, + "learning_rate": 1.2334857662164593e-05, + "loss": 0.2474, + "step": 20078 + }, + { + "epoch": 1.102085620197585, + "grad_norm": 1.44386625289917, + "learning_rate": 1.2330343587229396e-05, + "loss": 0.2182, + "step": 20080 + }, + { + "epoch": 1.1021953896816685, + "grad_norm": 1.2934802770614624, + "learning_rate": 1.2325830068047525e-05, + "loss": 0.1876, + "step": 20082 + }, + { + "epoch": 1.102305159165752, + "grad_norm": 1.1118887662887573, + "learning_rate": 1.2321317104816956e-05, + "loss": 0.3534, + "step": 20084 + }, + { + "epoch": 1.1024149286498353, + "grad_norm": 1.2263442277908325, + "learning_rate": 1.2316804697735651e-05, + "loss": 0.1758, + "step": 20086 + }, + { + "epoch": 1.1025246981339187, + "grad_norm": 1.217336893081665, + "learning_rate": 1.2312292847001545e-05, + "loss": 0.1382, + "step": 20088 + }, + { + "epoch": 1.1026344676180022, + "grad_norm": 0.791760265827179, + "learning_rate": 1.230778155281255e-05, + "loss": 0.1306, + "step": 20090 + }, + { + "epoch": 1.1027442371020857, + "grad_norm": 1.6291521787643433, + "learning_rate": 1.2303270815366542e-05, + "loss": 0.2091, + "step": 20092 + }, + { + "epoch": 1.102854006586169, + "grad_norm": 0.9917869567871094, + "learning_rate": 1.2298760634861403e-05, + "loss": 0.1655, + "step": 20094 + }, + { + "epoch": 1.1029637760702524, + "grad_norm": 1.0194329023361206, + "learning_rate": 1.2294251011494967e-05, + "loss": 0.183, + "step": 20096 + }, + { + "epoch": 1.103073545554336, + "grad_norm": 1.0604194402694702, + "learning_rate": 1.2289741945465044e-05, + "loss": 0.2392, + "step": 20098 + }, + { + "epoch": 1.1031833150384194, + "grad_norm": 0.9781529903411865, + "learning_rate": 1.2285233436969426e-05, + "loss": 0.1958, + "step": 20100 + }, + { + "epoch": 1.1032930845225026, + "grad_norm": 1.2935535907745361, + "learning_rate": 1.2280725486205866e-05, + "loss": 0.2623, + "step": 20102 + }, + { + "epoch": 1.103402854006586, + "grad_norm": 1.1576441526412964, + "learning_rate": 1.2276218093372127e-05, + "loss": 0.1877, + "step": 20104 + }, + { + "epoch": 1.1035126234906696, + "grad_norm": 1.2296594381332397, + "learning_rate": 1.2271711258665905e-05, + "loss": 0.1928, + "step": 20106 + }, + { + "epoch": 1.103622392974753, + "grad_norm": 1.038569450378418, + "learning_rate": 1.2267204982284908e-05, + "loss": 0.1659, + "step": 20108 + }, + { + "epoch": 1.1037321624588365, + "grad_norm": 1.0006318092346191, + "learning_rate": 1.2262699264426799e-05, + "loss": 0.1804, + "step": 20110 + }, + { + "epoch": 1.1038419319429198, + "grad_norm": 0.9546777009963989, + "learning_rate": 1.225819410528922e-05, + "loss": 0.1676, + "step": 20112 + }, + { + "epoch": 1.1039517014270033, + "grad_norm": 1.0661993026733398, + "learning_rate": 1.2253689505069784e-05, + "loss": 0.1821, + "step": 20114 + }, + { + "epoch": 1.1040614709110868, + "grad_norm": 1.3375442028045654, + "learning_rate": 1.2249185463966083e-05, + "loss": 0.2474, + "step": 20116 + }, + { + "epoch": 1.1041712403951702, + "grad_norm": 0.8398571610450745, + "learning_rate": 1.2244681982175702e-05, + "loss": 0.1109, + "step": 20118 + }, + { + "epoch": 1.1042810098792535, + "grad_norm": 1.5819697380065918, + "learning_rate": 1.2240179059896172e-05, + "loss": 0.289, + "step": 20120 + }, + { + "epoch": 1.104390779363337, + "grad_norm": 0.6844534873962402, + "learning_rate": 1.2235676697325022e-05, + "loss": 0.1248, + "step": 20122 + }, + { + "epoch": 1.1045005488474204, + "grad_norm": 0.9027161002159119, + "learning_rate": 1.223117489465974e-05, + "loss": 0.1357, + "step": 20124 + }, + { + "epoch": 1.104610318331504, + "grad_norm": 1.512207269668579, + "learning_rate": 1.22266736520978e-05, + "loss": 0.2409, + "step": 20126 + }, + { + "epoch": 1.1047200878155872, + "grad_norm": 1.0822381973266602, + "learning_rate": 1.222217296983664e-05, + "loss": 0.1895, + "step": 20128 + }, + { + "epoch": 1.1048298572996706, + "grad_norm": 1.37493896484375, + "learning_rate": 1.2217672848073702e-05, + "loss": 0.1541, + "step": 20130 + }, + { + "epoch": 1.1049396267837541, + "grad_norm": 1.8022806644439697, + "learning_rate": 1.221317328700636e-05, + "loss": 0.3066, + "step": 20132 + }, + { + "epoch": 1.1050493962678376, + "grad_norm": 1.459151268005371, + "learning_rate": 1.2208674286832011e-05, + "loss": 0.226, + "step": 20134 + }, + { + "epoch": 1.105159165751921, + "grad_norm": 1.4307539463043213, + "learning_rate": 1.220417584774799e-05, + "loss": 0.2441, + "step": 20136 + }, + { + "epoch": 1.1052689352360043, + "grad_norm": 1.368502140045166, + "learning_rate": 1.2199677969951622e-05, + "loss": 0.2374, + "step": 20138 + }, + { + "epoch": 1.1053787047200878, + "grad_norm": 1.1733381748199463, + "learning_rate": 1.2195180653640206e-05, + "loss": 0.1339, + "step": 20140 + }, + { + "epoch": 1.1054884742041713, + "grad_norm": 1.1573419570922852, + "learning_rate": 1.2190683899011007e-05, + "loss": 0.1769, + "step": 20142 + }, + { + "epoch": 1.1055982436882548, + "grad_norm": 0.9234422445297241, + "learning_rate": 1.2186187706261296e-05, + "loss": 0.1546, + "step": 20144 + }, + { + "epoch": 1.105708013172338, + "grad_norm": 1.1364033222198486, + "learning_rate": 1.2181692075588284e-05, + "loss": 0.2028, + "step": 20146 + }, + { + "epoch": 1.1058177826564215, + "grad_norm": 1.6637881994247437, + "learning_rate": 1.217719700718917e-05, + "loss": 0.2651, + "step": 20148 + }, + { + "epoch": 1.105927552140505, + "grad_norm": 0.8254796266555786, + "learning_rate": 1.2172702501261139e-05, + "loss": 0.1889, + "step": 20150 + }, + { + "epoch": 1.1060373216245885, + "grad_norm": 1.0215306282043457, + "learning_rate": 1.2168208558001323e-05, + "loss": 0.1588, + "step": 20152 + }, + { + "epoch": 1.1061470911086717, + "grad_norm": 1.5501667261123657, + "learning_rate": 1.216371517760687e-05, + "loss": 0.2072, + "step": 20154 + }, + { + "epoch": 1.1062568605927552, + "grad_norm": 1.153360366821289, + "learning_rate": 1.2159222360274877e-05, + "loss": 0.205, + "step": 20156 + }, + { + "epoch": 1.1063666300768387, + "grad_norm": 1.1685597896575928, + "learning_rate": 1.2154730106202417e-05, + "loss": 0.2473, + "step": 20158 + }, + { + "epoch": 1.1064763995609221, + "grad_norm": 1.1362642049789429, + "learning_rate": 1.2150238415586543e-05, + "loss": 0.1459, + "step": 20160 + }, + { + "epoch": 1.1065861690450054, + "grad_norm": 0.9146531224250793, + "learning_rate": 1.2145747288624273e-05, + "loss": 0.1477, + "step": 20162 + }, + { + "epoch": 1.1066959385290889, + "grad_norm": 1.4058501720428467, + "learning_rate": 1.2141256725512628e-05, + "loss": 0.241, + "step": 20164 + }, + { + "epoch": 1.1068057080131724, + "grad_norm": 1.0588102340698242, + "learning_rate": 1.2136766726448567e-05, + "loss": 0.1615, + "step": 20166 + }, + { + "epoch": 1.1069154774972558, + "grad_norm": 0.9262502193450928, + "learning_rate": 1.2132277291629066e-05, + "loss": 0.1198, + "step": 20168 + }, + { + "epoch": 1.107025246981339, + "grad_norm": 0.8365896940231323, + "learning_rate": 1.2127788421251038e-05, + "loss": 0.1898, + "step": 20170 + }, + { + "epoch": 1.1071350164654226, + "grad_norm": 1.0654067993164062, + "learning_rate": 1.2123300115511394e-05, + "loss": 0.1896, + "step": 20172 + }, + { + "epoch": 1.107244785949506, + "grad_norm": 1.2846375703811646, + "learning_rate": 1.2118812374607008e-05, + "loss": 0.2646, + "step": 20174 + }, + { + "epoch": 1.1073545554335895, + "grad_norm": 1.562584638595581, + "learning_rate": 1.2114325198734736e-05, + "loss": 0.22, + "step": 20176 + }, + { + "epoch": 1.107464324917673, + "grad_norm": 1.18381929397583, + "learning_rate": 1.2109838588091397e-05, + "loss": 0.2338, + "step": 20178 + }, + { + "epoch": 1.1075740944017562, + "grad_norm": 1.3349095582962036, + "learning_rate": 1.2105352542873815e-05, + "loss": 0.1973, + "step": 20180 + }, + { + "epoch": 1.1076838638858397, + "grad_norm": 1.1004523038864136, + "learning_rate": 1.2100867063278761e-05, + "loss": 0.2094, + "step": 20182 + }, + { + "epoch": 1.1077936333699232, + "grad_norm": 1.0214011669158936, + "learning_rate": 1.2096382149502995e-05, + "loss": 0.1603, + "step": 20184 + }, + { + "epoch": 1.1079034028540067, + "grad_norm": 0.9693605303764343, + "learning_rate": 1.2091897801743238e-05, + "loss": 0.2011, + "step": 20186 + }, + { + "epoch": 1.10801317233809, + "grad_norm": 1.9865070581436157, + "learning_rate": 1.2087414020196192e-05, + "loss": 0.2203, + "step": 20188 + }, + { + "epoch": 1.1081229418221734, + "grad_norm": 1.2857903242111206, + "learning_rate": 1.2082930805058554e-05, + "loss": 0.2109, + "step": 20190 + }, + { + "epoch": 1.108232711306257, + "grad_norm": 1.0944119691848755, + "learning_rate": 1.2078448156526965e-05, + "loss": 0.1299, + "step": 20192 + }, + { + "epoch": 1.1083424807903404, + "grad_norm": 1.0494056940078735, + "learning_rate": 1.2073966074798068e-05, + "loss": 0.2231, + "step": 20194 + }, + { + "epoch": 1.1084522502744236, + "grad_norm": 1.0095555782318115, + "learning_rate": 1.2069484560068467e-05, + "loss": 0.2926, + "step": 20196 + }, + { + "epoch": 1.108562019758507, + "grad_norm": 1.2363150119781494, + "learning_rate": 1.206500361253474e-05, + "loss": 0.1692, + "step": 20198 + }, + { + "epoch": 1.1086717892425906, + "grad_norm": 1.2607142925262451, + "learning_rate": 1.2060523232393439e-05, + "loss": 0.1971, + "step": 20200 + }, + { + "epoch": 1.108781558726674, + "grad_norm": 1.701751947402954, + "learning_rate": 1.2056043419841095e-05, + "loss": 0.1489, + "step": 20202 + }, + { + "epoch": 1.1088913282107573, + "grad_norm": 0.6540396809577942, + "learning_rate": 1.2051564175074226e-05, + "loss": 0.137, + "step": 20204 + }, + { + "epoch": 1.1090010976948408, + "grad_norm": 2.0647497177124023, + "learning_rate": 1.2047085498289309e-05, + "loss": 0.2234, + "step": 20206 + }, + { + "epoch": 1.1091108671789243, + "grad_norm": 1.1224433183670044, + "learning_rate": 1.2042607389682798e-05, + "loss": 0.2116, + "step": 20208 + }, + { + "epoch": 1.1092206366630077, + "grad_norm": 1.1375974416732788, + "learning_rate": 1.2038129849451125e-05, + "loss": 0.177, + "step": 20210 + }, + { + "epoch": 1.109330406147091, + "grad_norm": 0.6509904265403748, + "learning_rate": 1.2033652877790696e-05, + "loss": 0.1016, + "step": 20212 + }, + { + "epoch": 1.1094401756311745, + "grad_norm": 1.2866325378417969, + "learning_rate": 1.2029176474897888e-05, + "loss": 0.1492, + "step": 20214 + }, + { + "epoch": 1.109549945115258, + "grad_norm": 1.4956332445144653, + "learning_rate": 1.2024700640969073e-05, + "loss": 0.1868, + "step": 20216 + }, + { + "epoch": 1.1096597145993414, + "grad_norm": 1.2070605754852295, + "learning_rate": 1.2020225376200572e-05, + "loss": 0.1558, + "step": 20218 + }, + { + "epoch": 1.109769484083425, + "grad_norm": 1.26142418384552, + "learning_rate": 1.2015750680788699e-05, + "loss": 0.1758, + "step": 20220 + }, + { + "epoch": 1.1098792535675082, + "grad_norm": 1.0237101316452026, + "learning_rate": 1.2011276554929718e-05, + "loss": 0.1985, + "step": 20222 + }, + { + "epoch": 1.1099890230515916, + "grad_norm": 1.4184890985488892, + "learning_rate": 1.200680299881991e-05, + "loss": 0.3019, + "step": 20224 + }, + { + "epoch": 1.1100987925356751, + "grad_norm": 0.9320732355117798, + "learning_rate": 1.2002330012655496e-05, + "loss": 0.1223, + "step": 20226 + }, + { + "epoch": 1.1102085620197586, + "grad_norm": 1.579807996749878, + "learning_rate": 1.1997857596632678e-05, + "loss": 0.2209, + "step": 20228 + }, + { + "epoch": 1.1103183315038418, + "grad_norm": 1.0022811889648438, + "learning_rate": 1.199338575094765e-05, + "loss": 0.1917, + "step": 20230 + }, + { + "epoch": 1.1104281009879253, + "grad_norm": 1.2304340600967407, + "learning_rate": 1.1988914475796565e-05, + "loss": 0.1742, + "step": 20232 + }, + { + "epoch": 1.1105378704720088, + "grad_norm": 1.180005431175232, + "learning_rate": 1.1984443771375554e-05, + "loss": 0.1854, + "step": 20234 + }, + { + "epoch": 1.1106476399560923, + "grad_norm": 1.1845165491104126, + "learning_rate": 1.197997363788072e-05, + "loss": 0.1209, + "step": 20236 + }, + { + "epoch": 1.1107574094401755, + "grad_norm": 1.2238816022872925, + "learning_rate": 1.1975504075508142e-05, + "loss": 0.1507, + "step": 20238 + }, + { + "epoch": 1.110867178924259, + "grad_norm": 0.6489412188529968, + "learning_rate": 1.1971035084453891e-05, + "loss": 0.1419, + "step": 20240 + }, + { + "epoch": 1.1109769484083425, + "grad_norm": 1.0257219076156616, + "learning_rate": 1.1966566664913991e-05, + "loss": 0.2568, + "step": 20242 + }, + { + "epoch": 1.111086717892426, + "grad_norm": 0.8618423342704773, + "learning_rate": 1.1962098817084452e-05, + "loss": 0.1257, + "step": 20244 + }, + { + "epoch": 1.1111964873765094, + "grad_norm": 0.8481981158256531, + "learning_rate": 1.195763154116125e-05, + "loss": 0.1917, + "step": 20246 + }, + { + "epoch": 1.1113062568605927, + "grad_norm": 1.2082250118255615, + "learning_rate": 1.1953164837340345e-05, + "loss": 0.1842, + "step": 20248 + }, + { + "epoch": 1.1114160263446762, + "grad_norm": 1.3012933731079102, + "learning_rate": 1.1948698705817656e-05, + "loss": 0.1512, + "step": 20250 + }, + { + "epoch": 1.1115257958287597, + "grad_norm": 0.8126192092895508, + "learning_rate": 1.1944233146789106e-05, + "loss": 0.1198, + "step": 20252 + }, + { + "epoch": 1.1116355653128431, + "grad_norm": 1.4318249225616455, + "learning_rate": 1.193976816045058e-05, + "loss": 0.2559, + "step": 20254 + }, + { + "epoch": 1.1117453347969264, + "grad_norm": 1.9762382507324219, + "learning_rate": 1.1935303746997923e-05, + "loss": 0.2265, + "step": 20256 + }, + { + "epoch": 1.1118551042810099, + "grad_norm": 1.677997350692749, + "learning_rate": 1.193083990662697e-05, + "loss": 0.2436, + "step": 20258 + }, + { + "epoch": 1.1119648737650933, + "grad_norm": 0.9866496920585632, + "learning_rate": 1.1926376639533527e-05, + "loss": 0.2037, + "step": 20260 + }, + { + "epoch": 1.1120746432491768, + "grad_norm": 1.0684775114059448, + "learning_rate": 1.192191394591337e-05, + "loss": 0.3763, + "step": 20262 + }, + { + "epoch": 1.11218441273326, + "grad_norm": 1.278751015663147, + "learning_rate": 1.1917451825962254e-05, + "loss": 0.1094, + "step": 20264 + }, + { + "epoch": 1.1122941822173436, + "grad_norm": 0.8611131906509399, + "learning_rate": 1.1912990279875922e-05, + "loss": 0.1786, + "step": 20266 + }, + { + "epoch": 1.112403951701427, + "grad_norm": 1.3164753913879395, + "learning_rate": 1.190852930785007e-05, + "loss": 0.2825, + "step": 20268 + }, + { + "epoch": 1.1125137211855105, + "grad_norm": 1.0117664337158203, + "learning_rate": 1.1904068910080379e-05, + "loss": 0.155, + "step": 20270 + }, + { + "epoch": 1.1126234906695938, + "grad_norm": 1.0172535181045532, + "learning_rate": 1.1899609086762506e-05, + "loss": 0.1397, + "step": 20272 + }, + { + "epoch": 1.1127332601536772, + "grad_norm": 1.447677493095398, + "learning_rate": 1.1895149838092075e-05, + "loss": 0.1831, + "step": 20274 + }, + { + "epoch": 1.1128430296377607, + "grad_norm": 1.2583550214767456, + "learning_rate": 1.189069116426469e-05, + "loss": 0.2015, + "step": 20276 + }, + { + "epoch": 1.1129527991218442, + "grad_norm": 0.9166381359100342, + "learning_rate": 1.188623306547594e-05, + "loss": 0.2054, + "step": 20278 + }, + { + "epoch": 1.1130625686059274, + "grad_norm": 1.150007724761963, + "learning_rate": 1.1881775541921378e-05, + "loss": 0.1909, + "step": 20280 + }, + { + "epoch": 1.113172338090011, + "grad_norm": 1.3961925506591797, + "learning_rate": 1.1877318593796527e-05, + "loss": 0.2391, + "step": 20282 + }, + { + "epoch": 1.1132821075740944, + "grad_norm": 2.2682127952575684, + "learning_rate": 1.1872862221296882e-05, + "loss": 0.3012, + "step": 20284 + }, + { + "epoch": 1.1133918770581779, + "grad_norm": 1.4811533689498901, + "learning_rate": 1.1868406424617943e-05, + "loss": 0.1741, + "step": 20286 + }, + { + "epoch": 1.1135016465422614, + "grad_norm": 1.6378759145736694, + "learning_rate": 1.186395120395514e-05, + "loss": 0.2536, + "step": 20288 + }, + { + "epoch": 1.1136114160263446, + "grad_norm": 1.5814169645309448, + "learning_rate": 1.1859496559503925e-05, + "loss": 0.2756, + "step": 20290 + }, + { + "epoch": 1.113721185510428, + "grad_norm": 1.0609911680221558, + "learning_rate": 1.1855042491459686e-05, + "loss": 0.1334, + "step": 20292 + }, + { + "epoch": 1.1138309549945116, + "grad_norm": 1.5361515283584595, + "learning_rate": 1.1850589000017803e-05, + "loss": 0.1533, + "step": 20294 + }, + { + "epoch": 1.113940724478595, + "grad_norm": 1.2319328784942627, + "learning_rate": 1.184613608537363e-05, + "loss": 0.229, + "step": 20296 + }, + { + "epoch": 1.1140504939626783, + "grad_norm": 1.5656801462173462, + "learning_rate": 1.1841683747722487e-05, + "loss": 0.2262, + "step": 20298 + }, + { + "epoch": 1.1141602634467618, + "grad_norm": 1.3106718063354492, + "learning_rate": 1.1837231987259672e-05, + "loss": 0.181, + "step": 20300 + }, + { + "epoch": 1.1142700329308453, + "grad_norm": 0.9449533224105835, + "learning_rate": 1.183278080418048e-05, + "loss": 0.238, + "step": 20302 + }, + { + "epoch": 1.1143798024149287, + "grad_norm": 1.3136224746704102, + "learning_rate": 1.1828330198680148e-05, + "loss": 0.2038, + "step": 20304 + }, + { + "epoch": 1.114489571899012, + "grad_norm": 1.1272730827331543, + "learning_rate": 1.1823880170953904e-05, + "loss": 0.1781, + "step": 20306 + }, + { + "epoch": 1.1145993413830955, + "grad_norm": 1.0232402086257935, + "learning_rate": 1.181943072119695e-05, + "loss": 0.1775, + "step": 20308 + }, + { + "epoch": 1.114709110867179, + "grad_norm": 1.7081313133239746, + "learning_rate": 1.1814981849604459e-05, + "loss": 0.2368, + "step": 20310 + }, + { + "epoch": 1.1148188803512624, + "grad_norm": 0.8796844482421875, + "learning_rate": 1.1810533556371567e-05, + "loss": 0.1434, + "step": 20312 + }, + { + "epoch": 1.1149286498353457, + "grad_norm": 0.8962081670761108, + "learning_rate": 1.1806085841693412e-05, + "loss": 0.0982, + "step": 20314 + }, + { + "epoch": 1.1150384193194292, + "grad_norm": 1.6524821519851685, + "learning_rate": 1.1801638705765104e-05, + "loss": 0.2137, + "step": 20316 + }, + { + "epoch": 1.1151481888035126, + "grad_norm": 1.6642489433288574, + "learning_rate": 1.1797192148781702e-05, + "loss": 0.2476, + "step": 20318 + }, + { + "epoch": 1.115257958287596, + "grad_norm": 1.370705485343933, + "learning_rate": 1.1792746170938254e-05, + "loss": 0.2087, + "step": 20320 + }, + { + "epoch": 1.1153677277716794, + "grad_norm": 0.942890465259552, + "learning_rate": 1.1788300772429785e-05, + "loss": 0.1027, + "step": 20322 + }, + { + "epoch": 1.1154774972557628, + "grad_norm": 0.8374423980712891, + "learning_rate": 1.1783855953451281e-05, + "loss": 0.1155, + "step": 20324 + }, + { + "epoch": 1.1155872667398463, + "grad_norm": 1.033557653427124, + "learning_rate": 1.1779411714197733e-05, + "loss": 0.4122, + "step": 20326 + }, + { + "epoch": 1.1156970362239298, + "grad_norm": 1.5945148468017578, + "learning_rate": 1.177496805486408e-05, + "loss": 0.2232, + "step": 20328 + }, + { + "epoch": 1.1158068057080133, + "grad_norm": 1.2080583572387695, + "learning_rate": 1.1770524975645238e-05, + "loss": 0.1555, + "step": 20330 + }, + { + "epoch": 1.1159165751920965, + "grad_norm": 1.3974608182907104, + "learning_rate": 1.1766082476736104e-05, + "loss": 0.3302, + "step": 20332 + }, + { + "epoch": 1.11602634467618, + "grad_norm": 1.297082543373108, + "learning_rate": 1.1761640558331553e-05, + "loss": 0.1735, + "step": 20334 + }, + { + "epoch": 1.1161361141602635, + "grad_norm": 1.4083346128463745, + "learning_rate": 1.1757199220626411e-05, + "loss": 0.1594, + "step": 20336 + }, + { + "epoch": 1.116245883644347, + "grad_norm": 1.1155107021331787, + "learning_rate": 1.1752758463815518e-05, + "loss": 0.1186, + "step": 20338 + }, + { + "epoch": 1.1163556531284302, + "grad_norm": 0.9638316631317139, + "learning_rate": 1.1748318288093666e-05, + "loss": 0.2733, + "step": 20340 + }, + { + "epoch": 1.1164654226125137, + "grad_norm": 1.0366512537002563, + "learning_rate": 1.1743878693655614e-05, + "loss": 0.2358, + "step": 20342 + }, + { + "epoch": 1.1165751920965972, + "grad_norm": 1.4356052875518799, + "learning_rate": 1.1739439680696098e-05, + "loss": 0.1854, + "step": 20344 + }, + { + "epoch": 1.1166849615806806, + "grad_norm": 0.9820836782455444, + "learning_rate": 1.1735001249409856e-05, + "loss": 0.1644, + "step": 20346 + }, + { + "epoch": 1.116794731064764, + "grad_norm": 1.4181572198867798, + "learning_rate": 1.1730563399991563e-05, + "loss": 0.2381, + "step": 20348 + }, + { + "epoch": 1.1169045005488474, + "grad_norm": 1.2303565740585327, + "learning_rate": 1.1726126132635883e-05, + "loss": 0.1326, + "step": 20350 + }, + { + "epoch": 1.1170142700329309, + "grad_norm": 0.793407142162323, + "learning_rate": 1.172168944753747e-05, + "loss": 0.2287, + "step": 20352 + }, + { + "epoch": 1.1171240395170143, + "grad_norm": 1.3601775169372559, + "learning_rate": 1.1717253344890935e-05, + "loss": 0.2211, + "step": 20354 + }, + { + "epoch": 1.1172338090010978, + "grad_norm": 1.0351667404174805, + "learning_rate": 1.1712817824890863e-05, + "loss": 0.1358, + "step": 20356 + }, + { + "epoch": 1.117343578485181, + "grad_norm": 1.0398645401000977, + "learning_rate": 1.1708382887731817e-05, + "loss": 0.1469, + "step": 20358 + }, + { + "epoch": 1.1174533479692645, + "grad_norm": 1.1092658042907715, + "learning_rate": 1.1703948533608339e-05, + "loss": 0.2791, + "step": 20360 + }, + { + "epoch": 1.117563117453348, + "grad_norm": 1.0214275121688843, + "learning_rate": 1.1699514762714928e-05, + "loss": 0.146, + "step": 20362 + }, + { + "epoch": 1.1176728869374315, + "grad_norm": 0.8661616444587708, + "learning_rate": 1.1695081575246093e-05, + "loss": 0.1583, + "step": 20364 + }, + { + "epoch": 1.1177826564215148, + "grad_norm": 1.0515352487564087, + "learning_rate": 1.1690648971396282e-05, + "loss": 0.1532, + "step": 20366 + }, + { + "epoch": 1.1178924259055982, + "grad_norm": 0.8879004120826721, + "learning_rate": 1.1686216951359936e-05, + "loss": 0.2403, + "step": 20368 + }, + { + "epoch": 1.1180021953896817, + "grad_norm": 1.0443800687789917, + "learning_rate": 1.1681785515331458e-05, + "loss": 0.2159, + "step": 20370 + }, + { + "epoch": 1.1181119648737652, + "grad_norm": 1.2710801362991333, + "learning_rate": 1.167735466350523e-05, + "loss": 0.2037, + "step": 20372 + }, + { + "epoch": 1.1182217343578484, + "grad_norm": 1.3149181604385376, + "learning_rate": 1.1672924396075619e-05, + "loss": 0.2136, + "step": 20374 + }, + { + "epoch": 1.118331503841932, + "grad_norm": 1.4205691814422607, + "learning_rate": 1.1668494713236967e-05, + "loss": 0.2157, + "step": 20376 + }, + { + "epoch": 1.1184412733260154, + "grad_norm": 1.387047529220581, + "learning_rate": 1.166406561518357e-05, + "loss": 0.1749, + "step": 20378 + }, + { + "epoch": 1.1185510428100989, + "grad_norm": 1.132278323173523, + "learning_rate": 1.1659637102109714e-05, + "loss": 0.2467, + "step": 20380 + }, + { + "epoch": 1.1186608122941821, + "grad_norm": 0.8496060967445374, + "learning_rate": 1.165520917420965e-05, + "loss": 0.1363, + "step": 20382 + }, + { + "epoch": 1.1187705817782656, + "grad_norm": 0.9206421971321106, + "learning_rate": 1.1650781831677612e-05, + "loss": 0.2087, + "step": 20384 + }, + { + "epoch": 1.118880351262349, + "grad_norm": 2.245912551879883, + "learning_rate": 1.1646355074707799e-05, + "loss": 0.1566, + "step": 20386 + }, + { + "epoch": 1.1189901207464326, + "grad_norm": 1.598477840423584, + "learning_rate": 1.1641928903494401e-05, + "loss": 0.1377, + "step": 20388 + }, + { + "epoch": 1.1190998902305158, + "grad_norm": 1.8618345260620117, + "learning_rate": 1.1637503318231569e-05, + "loss": 0.1466, + "step": 20390 + }, + { + "epoch": 1.1192096597145993, + "grad_norm": 1.2001557350158691, + "learning_rate": 1.1633078319113427e-05, + "loss": 0.1643, + "step": 20392 + }, + { + "epoch": 1.1193194291986828, + "grad_norm": 0.9831852316856384, + "learning_rate": 1.162865390633408e-05, + "loss": 0.1451, + "step": 20394 + }, + { + "epoch": 1.1194291986827662, + "grad_norm": 1.5705335140228271, + "learning_rate": 1.1624230080087604e-05, + "loss": 0.1371, + "step": 20396 + }, + { + "epoch": 1.1195389681668497, + "grad_norm": 1.0820891857147217, + "learning_rate": 1.1619806840568038e-05, + "loss": 0.1405, + "step": 20398 + }, + { + "epoch": 1.119648737650933, + "grad_norm": 1.0165308713912964, + "learning_rate": 1.1615384187969425e-05, + "loss": 0.101, + "step": 20400 + }, + { + "epoch": 1.1197585071350165, + "grad_norm": 1.217165231704712, + "learning_rate": 1.161096212248576e-05, + "loss": 0.1825, + "step": 20402 + }, + { + "epoch": 1.1198682766191, + "grad_norm": 1.293798565864563, + "learning_rate": 1.1606540644311004e-05, + "loss": 0.148, + "step": 20404 + }, + { + "epoch": 1.1199780461031834, + "grad_norm": 1.3964638710021973, + "learning_rate": 1.1602119753639124e-05, + "loss": 0.3066, + "step": 20406 + }, + { + "epoch": 1.1200878155872667, + "grad_norm": 1.1621369123458862, + "learning_rate": 1.1597699450664028e-05, + "loss": 0.1178, + "step": 20408 + }, + { + "epoch": 1.1201975850713501, + "grad_norm": 1.340407371520996, + "learning_rate": 1.159327973557962e-05, + "loss": 0.1941, + "step": 20410 + }, + { + "epoch": 1.1203073545554336, + "grad_norm": 1.6720950603485107, + "learning_rate": 1.158886060857976e-05, + "loss": 0.2765, + "step": 20412 + }, + { + "epoch": 1.120417124039517, + "grad_norm": 0.97701096534729, + "learning_rate": 1.1584442069858306e-05, + "loss": 0.183, + "step": 20414 + }, + { + "epoch": 1.1205268935236004, + "grad_norm": 0.926565945148468, + "learning_rate": 1.158002411960907e-05, + "loss": 0.1508, + "step": 20416 + }, + { + "epoch": 1.1206366630076838, + "grad_norm": 1.405297040939331, + "learning_rate": 1.1575606758025845e-05, + "loss": 0.1804, + "step": 20418 + }, + { + "epoch": 1.1207464324917673, + "grad_norm": 0.9502239227294922, + "learning_rate": 1.1571189985302399e-05, + "loss": 0.1515, + "step": 20420 + }, + { + "epoch": 1.1208562019758508, + "grad_norm": 1.7512544393539429, + "learning_rate": 1.1566773801632463e-05, + "loss": 0.2136, + "step": 20422 + }, + { + "epoch": 1.120965971459934, + "grad_norm": 1.0808156728744507, + "learning_rate": 1.1562358207209773e-05, + "loss": 0.1443, + "step": 20424 + }, + { + "epoch": 1.1210757409440175, + "grad_norm": 1.2582836151123047, + "learning_rate": 1.1557943202228007e-05, + "loss": 0.244, + "step": 20426 + }, + { + "epoch": 1.121185510428101, + "grad_norm": 1.2458827495574951, + "learning_rate": 1.155352878688083e-05, + "loss": 0.1801, + "step": 20428 + }, + { + "epoch": 1.1212952799121845, + "grad_norm": 1.5244495868682861, + "learning_rate": 1.154911496136188e-05, + "loss": 0.17, + "step": 20430 + }, + { + "epoch": 1.1214050493962677, + "grad_norm": 1.1812516450881958, + "learning_rate": 1.1544701725864768e-05, + "loss": 0.2089, + "step": 20432 + }, + { + "epoch": 1.1215148188803512, + "grad_norm": 2.627223491668701, + "learning_rate": 1.154028908058307e-05, + "loss": 0.2252, + "step": 20434 + }, + { + "epoch": 1.1216245883644347, + "grad_norm": 2.3316080570220947, + "learning_rate": 1.1535877025710356e-05, + "loss": 0.2872, + "step": 20436 + }, + { + "epoch": 1.1217343578485182, + "grad_norm": 0.9228220582008362, + "learning_rate": 1.1531465561440174e-05, + "loss": 0.2064, + "step": 20438 + }, + { + "epoch": 1.1218441273326016, + "grad_norm": 1.0783309936523438, + "learning_rate": 1.152705468796602e-05, + "loss": 0.1489, + "step": 20440 + }, + { + "epoch": 1.121953896816685, + "grad_norm": 1.087272047996521, + "learning_rate": 1.1522644405481376e-05, + "loss": 0.1853, + "step": 20442 + }, + { + "epoch": 1.1220636663007684, + "grad_norm": 0.9655637145042419, + "learning_rate": 1.15182347141797e-05, + "loss": 0.189, + "step": 20444 + }, + { + "epoch": 1.1221734357848518, + "grad_norm": 1.1772269010543823, + "learning_rate": 1.1513825614254417e-05, + "loss": 0.2499, + "step": 20446 + }, + { + "epoch": 1.1222832052689353, + "grad_norm": 0.8806902170181274, + "learning_rate": 1.150941710589893e-05, + "loss": 0.1619, + "step": 20448 + }, + { + "epoch": 1.1223929747530186, + "grad_norm": 1.090941071510315, + "learning_rate": 1.1505009189306636e-05, + "loss": 0.2858, + "step": 20450 + }, + { + "epoch": 1.122502744237102, + "grad_norm": 1.8305068016052246, + "learning_rate": 1.1500601864670876e-05, + "loss": 0.2455, + "step": 20452 + }, + { + "epoch": 1.1226125137211855, + "grad_norm": 0.7549258470535278, + "learning_rate": 1.1496195132184975e-05, + "loss": 0.137, + "step": 20454 + }, + { + "epoch": 1.122722283205269, + "grad_norm": 1.1827492713928223, + "learning_rate": 1.1491788992042238e-05, + "loss": 0.1229, + "step": 20456 + }, + { + "epoch": 1.1228320526893523, + "grad_norm": 1.43787682056427, + "learning_rate": 1.1487383444435925e-05, + "loss": 0.1944, + "step": 20458 + }, + { + "epoch": 1.1229418221734357, + "grad_norm": 1.3617827892303467, + "learning_rate": 1.1482978489559312e-05, + "loss": 0.2825, + "step": 20460 + }, + { + "epoch": 1.1230515916575192, + "grad_norm": 1.263616681098938, + "learning_rate": 1.1478574127605608e-05, + "loss": 0.1872, + "step": 20462 + }, + { + "epoch": 1.1231613611416027, + "grad_norm": 1.1445645093917847, + "learning_rate": 1.1474170358767997e-05, + "loss": 0.1686, + "step": 20464 + }, + { + "epoch": 1.1232711306256862, + "grad_norm": 1.3641633987426758, + "learning_rate": 1.1469767183239672e-05, + "loss": 0.1792, + "step": 20466 + }, + { + "epoch": 1.1233809001097694, + "grad_norm": 1.217342495918274, + "learning_rate": 1.1465364601213771e-05, + "loss": 0.169, + "step": 20468 + }, + { + "epoch": 1.123490669593853, + "grad_norm": 1.8992935419082642, + "learning_rate": 1.1460962612883408e-05, + "loss": 0.3053, + "step": 20470 + }, + { + "epoch": 1.1236004390779364, + "grad_norm": 1.1005836725234985, + "learning_rate": 1.1456561218441672e-05, + "loss": 0.1705, + "step": 20472 + }, + { + "epoch": 1.1237102085620196, + "grad_norm": 1.0965704917907715, + "learning_rate": 1.1452160418081642e-05, + "loss": 0.1718, + "step": 20474 + }, + { + "epoch": 1.1238199780461031, + "grad_norm": 1.3359322547912598, + "learning_rate": 1.1447760211996356e-05, + "loss": 0.1592, + "step": 20476 + }, + { + "epoch": 1.1239297475301866, + "grad_norm": 1.6696127653121948, + "learning_rate": 1.1443360600378825e-05, + "loss": 0.1681, + "step": 20478 + }, + { + "epoch": 1.12403951701427, + "grad_norm": 1.393614411354065, + "learning_rate": 1.1438961583422037e-05, + "loss": 0.2953, + "step": 20480 + }, + { + "epoch": 1.1241492864983536, + "grad_norm": 1.1971244812011719, + "learning_rate": 1.1434563161318954e-05, + "loss": 0.1916, + "step": 20482 + }, + { + "epoch": 1.1242590559824368, + "grad_norm": 1.9602396488189697, + "learning_rate": 1.1430165334262508e-05, + "loss": 0.2877, + "step": 20484 + }, + { + "epoch": 1.1243688254665203, + "grad_norm": 1.3959590196609497, + "learning_rate": 1.1425768102445622e-05, + "loss": 0.1485, + "step": 20486 + }, + { + "epoch": 1.1244785949506038, + "grad_norm": 1.6177960634231567, + "learning_rate": 1.1421371466061173e-05, + "loss": 0.203, + "step": 20488 + }, + { + "epoch": 1.1245883644346872, + "grad_norm": 2.495368003845215, + "learning_rate": 1.141697542530202e-05, + "loss": 0.2863, + "step": 20490 + }, + { + "epoch": 1.1246981339187705, + "grad_norm": 1.0187206268310547, + "learning_rate": 1.1412579980360994e-05, + "loss": 0.1425, + "step": 20492 + }, + { + "epoch": 1.124807903402854, + "grad_norm": 1.1708422899246216, + "learning_rate": 1.1408185131430893e-05, + "loss": 0.1952, + "step": 20494 + }, + { + "epoch": 1.1249176728869374, + "grad_norm": 1.337215781211853, + "learning_rate": 1.1403790878704512e-05, + "loss": 0.1432, + "step": 20496 + }, + { + "epoch": 1.125027442371021, + "grad_norm": 2.01297926902771, + "learning_rate": 1.1399397222374588e-05, + "loss": 0.2445, + "step": 20498 + }, + { + "epoch": 1.1251372118551042, + "grad_norm": 1.593064546585083, + "learning_rate": 1.1395004162633868e-05, + "loss": 0.1404, + "step": 20500 + }, + { + "epoch": 1.1252469813391877, + "grad_norm": 0.8032316565513611, + "learning_rate": 1.1390611699675045e-05, + "loss": 0.1818, + "step": 20502 + }, + { + "epoch": 1.1253567508232711, + "grad_norm": 2.546949863433838, + "learning_rate": 1.1386219833690787e-05, + "loss": 0.2309, + "step": 20504 + }, + { + "epoch": 1.1254665203073546, + "grad_norm": 1.4809348583221436, + "learning_rate": 1.138182856487375e-05, + "loss": 0.2825, + "step": 20506 + }, + { + "epoch": 1.125576289791438, + "grad_norm": 1.3905038833618164, + "learning_rate": 1.1377437893416546e-05, + "loss": 0.1944, + "step": 20508 + }, + { + "epoch": 1.1256860592755213, + "grad_norm": 1.2779372930526733, + "learning_rate": 1.1373047819511784e-05, + "loss": 0.2181, + "step": 20510 + }, + { + "epoch": 1.1257958287596048, + "grad_norm": 1.2830069065093994, + "learning_rate": 1.1368658343352034e-05, + "loss": 0.1942, + "step": 20512 + }, + { + "epoch": 1.1259055982436883, + "grad_norm": 2.0345253944396973, + "learning_rate": 1.1364269465129834e-05, + "loss": 0.2958, + "step": 20514 + }, + { + "epoch": 1.1260153677277718, + "grad_norm": 1.358505368232727, + "learning_rate": 1.1359881185037704e-05, + "loss": 0.27, + "step": 20516 + }, + { + "epoch": 1.126125137211855, + "grad_norm": 1.3212568759918213, + "learning_rate": 1.1355493503268137e-05, + "loss": 0.2823, + "step": 20518 + }, + { + "epoch": 1.1262349066959385, + "grad_norm": 1.2702527046203613, + "learning_rate": 1.1351106420013583e-05, + "loss": 0.2095, + "step": 20520 + }, + { + "epoch": 1.126344676180022, + "grad_norm": 1.2248932123184204, + "learning_rate": 1.1346719935466504e-05, + "loss": 0.1915, + "step": 20522 + }, + { + "epoch": 1.1264544456641055, + "grad_norm": 1.248215913772583, + "learning_rate": 1.1342334049819303e-05, + "loss": 0.2188, + "step": 20524 + }, + { + "epoch": 1.1265642151481887, + "grad_norm": 0.9959232211112976, + "learning_rate": 1.1337948763264358e-05, + "loss": 0.0981, + "step": 20526 + }, + { + "epoch": 1.1266739846322722, + "grad_norm": 1.1962835788726807, + "learning_rate": 1.1333564075994047e-05, + "loss": 0.1981, + "step": 20528 + }, + { + "epoch": 1.1267837541163557, + "grad_norm": 1.0123059749603271, + "learning_rate": 1.1329179988200694e-05, + "loss": 0.1499, + "step": 20530 + }, + { + "epoch": 1.1268935236004392, + "grad_norm": 0.8081173896789551, + "learning_rate": 1.1324796500076606e-05, + "loss": 0.193, + "step": 20532 + }, + { + "epoch": 1.1270032930845224, + "grad_norm": 1.142820119857788, + "learning_rate": 1.1320413611814058e-05, + "loss": 0.1716, + "step": 20534 + }, + { + "epoch": 1.1271130625686059, + "grad_norm": 1.106373906135559, + "learning_rate": 1.1316031323605322e-05, + "loss": 0.1648, + "step": 20536 + }, + { + "epoch": 1.1272228320526894, + "grad_norm": 1.031523585319519, + "learning_rate": 1.1311649635642615e-05, + "loss": 0.1501, + "step": 20538 + }, + { + "epoch": 1.1273326015367728, + "grad_norm": 1.3165538311004639, + "learning_rate": 1.130726854811814e-05, + "loss": 0.1639, + "step": 20540 + }, + { + "epoch": 1.127442371020856, + "grad_norm": 0.940875768661499, + "learning_rate": 1.1302888061224079e-05, + "loss": 0.0984, + "step": 20542 + }, + { + "epoch": 1.1275521405049396, + "grad_norm": 1.2434524297714233, + "learning_rate": 1.1298508175152563e-05, + "loss": 0.2267, + "step": 20544 + }, + { + "epoch": 1.127661909989023, + "grad_norm": 1.4738667011260986, + "learning_rate": 1.1294128890095742e-05, + "loss": 0.1933, + "step": 20546 + }, + { + "epoch": 1.1277716794731065, + "grad_norm": 0.8289895057678223, + "learning_rate": 1.12897502062457e-05, + "loss": 0.1869, + "step": 20548 + }, + { + "epoch": 1.12788144895719, + "grad_norm": 1.031869888305664, + "learning_rate": 1.1285372123794508e-05, + "loss": 0.1935, + "step": 20550 + }, + { + "epoch": 1.1279912184412733, + "grad_norm": 0.9890245795249939, + "learning_rate": 1.1280994642934209e-05, + "loss": 0.193, + "step": 20552 + }, + { + "epoch": 1.1281009879253567, + "grad_norm": 1.4963263273239136, + "learning_rate": 1.127661776385682e-05, + "loss": 0.3201, + "step": 20554 + }, + { + "epoch": 1.1282107574094402, + "grad_norm": 0.8668873310089111, + "learning_rate": 1.1272241486754326e-05, + "loss": 0.1114, + "step": 20556 + }, + { + "epoch": 1.1283205268935237, + "grad_norm": 1.306551218032837, + "learning_rate": 1.1267865811818701e-05, + "loss": 0.2918, + "step": 20558 + }, + { + "epoch": 1.128430296377607, + "grad_norm": 1.8867781162261963, + "learning_rate": 1.1263490739241895e-05, + "loss": 0.2422, + "step": 20560 + }, + { + "epoch": 1.1285400658616904, + "grad_norm": 1.1489906311035156, + "learning_rate": 1.1259116269215803e-05, + "loss": 0.1992, + "step": 20562 + }, + { + "epoch": 1.128649835345774, + "grad_norm": 1.4268951416015625, + "learning_rate": 1.1254742401932316e-05, + "loss": 0.2343, + "step": 20564 + }, + { + "epoch": 1.1287596048298574, + "grad_norm": 1.1686761379241943, + "learning_rate": 1.1250369137583297e-05, + "loss": 0.1661, + "step": 20566 + }, + { + "epoch": 1.1288693743139406, + "grad_norm": 0.8458613157272339, + "learning_rate": 1.124599647636057e-05, + "loss": 0.1991, + "step": 20568 + }, + { + "epoch": 1.128979143798024, + "grad_norm": 1.1684703826904297, + "learning_rate": 1.124162441845594e-05, + "loss": 0.2161, + "step": 20570 + }, + { + "epoch": 1.1290889132821076, + "grad_norm": 1.2372864484786987, + "learning_rate": 1.1237252964061202e-05, + "loss": 0.2435, + "step": 20572 + }, + { + "epoch": 1.129198682766191, + "grad_norm": 1.29702889919281, + "learning_rate": 1.1232882113368099e-05, + "loss": 0.1613, + "step": 20574 + }, + { + "epoch": 1.1293084522502745, + "grad_norm": 1.8695619106292725, + "learning_rate": 1.122851186656836e-05, + "loss": 0.2611, + "step": 20576 + }, + { + "epoch": 1.1294182217343578, + "grad_norm": 1.3408855199813843, + "learning_rate": 1.1224142223853687e-05, + "loss": 0.1936, + "step": 20578 + }, + { + "epoch": 1.1295279912184413, + "grad_norm": 2.2449417114257812, + "learning_rate": 1.121977318541575e-05, + "loss": 0.2904, + "step": 20580 + }, + { + "epoch": 1.1296377607025248, + "grad_norm": 1.1113502979278564, + "learning_rate": 1.1215404751446191e-05, + "loss": 0.2227, + "step": 20582 + }, + { + "epoch": 1.129747530186608, + "grad_norm": 1.422361969947815, + "learning_rate": 1.1211036922136645e-05, + "loss": 0.2565, + "step": 20584 + }, + { + "epoch": 1.1298572996706915, + "grad_norm": 1.26450514793396, + "learning_rate": 1.1206669697678693e-05, + "loss": 0.1862, + "step": 20586 + }, + { + "epoch": 1.129967069154775, + "grad_norm": 1.7673280239105225, + "learning_rate": 1.1202303078263917e-05, + "loss": 0.1581, + "step": 20588 + }, + { + "epoch": 1.1300768386388584, + "grad_norm": 0.9157180786132812, + "learning_rate": 1.1197937064083853e-05, + "loss": 0.2367, + "step": 20590 + }, + { + "epoch": 1.130186608122942, + "grad_norm": 0.9918776154518127, + "learning_rate": 1.1193571655330013e-05, + "loss": 0.1682, + "step": 20592 + }, + { + "epoch": 1.1302963776070252, + "grad_norm": 0.9405786395072937, + "learning_rate": 1.1189206852193876e-05, + "loss": 0.1994, + "step": 20594 + }, + { + "epoch": 1.1304061470911086, + "grad_norm": 1.215294599533081, + "learning_rate": 1.1184842654866923e-05, + "loss": 0.194, + "step": 20596 + }, + { + "epoch": 1.1305159165751921, + "grad_norm": 1.613975167274475, + "learning_rate": 1.118047906354058e-05, + "loss": 0.2919, + "step": 20598 + }, + { + "epoch": 1.1306256860592756, + "grad_norm": 1.1822519302368164, + "learning_rate": 1.1176116078406257e-05, + "loss": 0.1319, + "step": 20600 + }, + { + "epoch": 1.1307354555433589, + "grad_norm": 1.274829387664795, + "learning_rate": 1.1171753699655332e-05, + "loss": 0.1626, + "step": 20602 + }, + { + "epoch": 1.1308452250274423, + "grad_norm": 1.433377742767334, + "learning_rate": 1.1167391927479166e-05, + "loss": 0.2186, + "step": 20604 + }, + { + "epoch": 1.1309549945115258, + "grad_norm": 1.2756083011627197, + "learning_rate": 1.1163030762069073e-05, + "loss": 0.158, + "step": 20606 + }, + { + "epoch": 1.1310647639956093, + "grad_norm": 0.8883808851242065, + "learning_rate": 1.1158670203616375e-05, + "loss": 0.109, + "step": 20608 + }, + { + "epoch": 1.1311745334796925, + "grad_norm": 0.9103630781173706, + "learning_rate": 1.1154310252312341e-05, + "loss": 0.1257, + "step": 20610 + }, + { + "epoch": 1.131284302963776, + "grad_norm": 1.5087316036224365, + "learning_rate": 1.1149950908348219e-05, + "loss": 0.2049, + "step": 20612 + }, + { + "epoch": 1.1313940724478595, + "grad_norm": 0.6167513728141785, + "learning_rate": 1.1145592171915229e-05, + "loss": 0.1532, + "step": 20614 + }, + { + "epoch": 1.131503841931943, + "grad_norm": 1.458627462387085, + "learning_rate": 1.1141234043204558e-05, + "loss": 0.2413, + "step": 20616 + }, + { + "epoch": 1.1316136114160265, + "grad_norm": 1.4298077821731567, + "learning_rate": 1.1136876522407393e-05, + "loss": 0.234, + "step": 20618 + }, + { + "epoch": 1.1317233809001097, + "grad_norm": 1.0645557641983032, + "learning_rate": 1.1132519609714862e-05, + "loss": 0.2271, + "step": 20620 + }, + { + "epoch": 1.1318331503841932, + "grad_norm": 0.8869678974151611, + "learning_rate": 1.1128163305318093e-05, + "loss": 0.1148, + "step": 20622 + }, + { + "epoch": 1.1319429198682767, + "grad_norm": 0.6904522180557251, + "learning_rate": 1.1123807609408168e-05, + "loss": 0.125, + "step": 20624 + }, + { + "epoch": 1.13205268935236, + "grad_norm": 1.1302564144134521, + "learning_rate": 1.1119452522176152e-05, + "loss": 0.139, + "step": 20626 + }, + { + "epoch": 1.1321624588364434, + "grad_norm": 1.1782281398773193, + "learning_rate": 1.1115098043813077e-05, + "loss": 0.1383, + "step": 20628 + }, + { + "epoch": 1.1322722283205269, + "grad_norm": 1.2307155132293701, + "learning_rate": 1.1110744174509952e-05, + "loss": 0.2755, + "step": 20630 + }, + { + "epoch": 1.1323819978046104, + "grad_norm": 1.2319254875183105, + "learning_rate": 1.1106390914457751e-05, + "loss": 0.1976, + "step": 20632 + }, + { + "epoch": 1.1324917672886938, + "grad_norm": 1.296129822731018, + "learning_rate": 1.1102038263847448e-05, + "loss": 0.1693, + "step": 20634 + }, + { + "epoch": 1.132601536772777, + "grad_norm": 1.8642853498458862, + "learning_rate": 1.1097686222869963e-05, + "loss": 0.1869, + "step": 20636 + }, + { + "epoch": 1.1327113062568606, + "grad_norm": 1.302427053451538, + "learning_rate": 1.1093334791716196e-05, + "loss": 0.2788, + "step": 20638 + }, + { + "epoch": 1.132821075740944, + "grad_norm": 1.3179064989089966, + "learning_rate": 1.1088983970577021e-05, + "loss": 0.1526, + "step": 20640 + }, + { + "epoch": 1.1329308452250275, + "grad_norm": 1.0896573066711426, + "learning_rate": 1.1084633759643282e-05, + "loss": 0.288, + "step": 20642 + }, + { + "epoch": 1.1330406147091108, + "grad_norm": 1.201923131942749, + "learning_rate": 1.1080284159105814e-05, + "loss": 0.2203, + "step": 20644 + }, + { + "epoch": 1.1331503841931942, + "grad_norm": 1.2730897665023804, + "learning_rate": 1.1075935169155396e-05, + "loss": 0.1568, + "step": 20646 + }, + { + "epoch": 1.1332601536772777, + "grad_norm": 1.2883063554763794, + "learning_rate": 1.1071586789982816e-05, + "loss": 0.1734, + "step": 20648 + }, + { + "epoch": 1.1333699231613612, + "grad_norm": 1.9642716646194458, + "learning_rate": 1.1067239021778802e-05, + "loss": 0.3408, + "step": 20650 + }, + { + "epoch": 1.1334796926454445, + "grad_norm": 1.168243646621704, + "learning_rate": 1.1062891864734071e-05, + "loss": 0.1824, + "step": 20652 + }, + { + "epoch": 1.133589462129528, + "grad_norm": 0.8669171929359436, + "learning_rate": 1.1058545319039309e-05, + "loss": 0.1757, + "step": 20654 + }, + { + "epoch": 1.1336992316136114, + "grad_norm": 1.346994161605835, + "learning_rate": 1.1054199384885172e-05, + "loss": 0.1872, + "step": 20656 + }, + { + "epoch": 1.133809001097695, + "grad_norm": 1.3143030405044556, + "learning_rate": 1.1049854062462308e-05, + "loss": 0.2481, + "step": 20658 + }, + { + "epoch": 1.1339187705817784, + "grad_norm": 1.50228750705719, + "learning_rate": 1.1045509351961315e-05, + "loss": 0.2385, + "step": 20660 + }, + { + "epoch": 1.1340285400658616, + "grad_norm": 1.7138018608093262, + "learning_rate": 1.1041165253572775e-05, + "loss": 0.3013, + "step": 20662 + }, + { + "epoch": 1.134138309549945, + "grad_norm": 1.106447458267212, + "learning_rate": 1.1036821767487241e-05, + "loss": 0.1382, + "step": 20664 + }, + { + "epoch": 1.1342480790340286, + "grad_norm": 1.0522385835647583, + "learning_rate": 1.103247889389524e-05, + "loss": 0.1912, + "step": 20666 + }, + { + "epoch": 1.134357848518112, + "grad_norm": 0.5686368942260742, + "learning_rate": 1.1028136632987263e-05, + "loss": 0.2187, + "step": 20668 + }, + { + "epoch": 1.1344676180021953, + "grad_norm": 1.4079610109329224, + "learning_rate": 1.1023794984953797e-05, + "loss": 0.2002, + "step": 20670 + }, + { + "epoch": 1.1345773874862788, + "grad_norm": 1.1663368940353394, + "learning_rate": 1.1019453949985284e-05, + "loss": 0.3115, + "step": 20672 + }, + { + "epoch": 1.1346871569703623, + "grad_norm": 1.3655023574829102, + "learning_rate": 1.1015113528272141e-05, + "loss": 0.2102, + "step": 20674 + }, + { + "epoch": 1.1347969264544457, + "grad_norm": 1.1394163370132446, + "learning_rate": 1.1010773720004755e-05, + "loss": 0.2179, + "step": 20676 + }, + { + "epoch": 1.134906695938529, + "grad_norm": 0.8235660791397095, + "learning_rate": 1.1006434525373502e-05, + "loss": 0.1308, + "step": 20678 + }, + { + "epoch": 1.1350164654226125, + "grad_norm": 1.761820912361145, + "learning_rate": 1.1002095944568707e-05, + "loss": 0.1271, + "step": 20680 + }, + { + "epoch": 1.135126234906696, + "grad_norm": 1.5716599225997925, + "learning_rate": 1.0997757977780698e-05, + "loss": 0.1806, + "step": 20682 + }, + { + "epoch": 1.1352360043907794, + "grad_norm": 1.0628300905227661, + "learning_rate": 1.0993420625199752e-05, + "loss": 0.1232, + "step": 20684 + }, + { + "epoch": 1.135345773874863, + "grad_norm": 1.0535756349563599, + "learning_rate": 1.0989083887016128e-05, + "loss": 0.1684, + "step": 20686 + }, + { + "epoch": 1.1354555433589462, + "grad_norm": 1.5567638874053955, + "learning_rate": 1.098474776342005e-05, + "loss": 0.2824, + "step": 20688 + }, + { + "epoch": 1.1355653128430296, + "grad_norm": 1.509735345840454, + "learning_rate": 1.098041225460173e-05, + "loss": 0.1734, + "step": 20690 + }, + { + "epoch": 1.1356750823271131, + "grad_norm": 1.31514310836792, + "learning_rate": 1.0976077360751327e-05, + "loss": 0.2033, + "step": 20692 + }, + { + "epoch": 1.1357848518111964, + "grad_norm": 1.2417750358581543, + "learning_rate": 1.0971743082059017e-05, + "loss": 0.1863, + "step": 20694 + }, + { + "epoch": 1.1358946212952798, + "grad_norm": 2.4549288749694824, + "learning_rate": 1.0967409418714907e-05, + "loss": 0.2238, + "step": 20696 + }, + { + "epoch": 1.1360043907793633, + "grad_norm": 0.9866318106651306, + "learning_rate": 1.09630763709091e-05, + "loss": 0.1769, + "step": 20698 + }, + { + "epoch": 1.1361141602634468, + "grad_norm": 1.8074673414230347, + "learning_rate": 1.0958743938831659e-05, + "loss": 0.168, + "step": 20700 + }, + { + "epoch": 1.1362239297475303, + "grad_norm": 1.1450707912445068, + "learning_rate": 1.0954412122672628e-05, + "loss": 0.1616, + "step": 20702 + }, + { + "epoch": 1.1363336992316135, + "grad_norm": 0.6870021820068359, + "learning_rate": 1.095008092262201e-05, + "loss": 0.1421, + "step": 20704 + }, + { + "epoch": 1.136443468715697, + "grad_norm": 1.8002903461456299, + "learning_rate": 1.0945750338869806e-05, + "loss": 0.2722, + "step": 20706 + }, + { + "epoch": 1.1365532381997805, + "grad_norm": 1.4333046674728394, + "learning_rate": 1.0941420371605981e-05, + "loss": 0.2438, + "step": 20708 + }, + { + "epoch": 1.136663007683864, + "grad_norm": 1.3777531385421753, + "learning_rate": 1.0937091021020465e-05, + "loss": 0.3299, + "step": 20710 + }, + { + "epoch": 1.1367727771679472, + "grad_norm": 1.0682048797607422, + "learning_rate": 1.0932762287303161e-05, + "loss": 0.1656, + "step": 20712 + }, + { + "epoch": 1.1368825466520307, + "grad_norm": 1.224921464920044, + "learning_rate": 1.0928434170643947e-05, + "loss": 0.1536, + "step": 20714 + }, + { + "epoch": 1.1369923161361142, + "grad_norm": 1.2738922834396362, + "learning_rate": 1.0924106671232681e-05, + "loss": 0.1894, + "step": 20716 + }, + { + "epoch": 1.1371020856201977, + "grad_norm": 2.302333116531372, + "learning_rate": 1.0919779789259172e-05, + "loss": 0.3203, + "step": 20718 + }, + { + "epoch": 1.137211855104281, + "grad_norm": 1.0346168279647827, + "learning_rate": 1.0915453524913243e-05, + "loss": 0.1869, + "step": 20720 + }, + { + "epoch": 1.1373216245883644, + "grad_norm": 1.0371301174163818, + "learning_rate": 1.0911127878384652e-05, + "loss": 0.1672, + "step": 20722 + }, + { + "epoch": 1.1374313940724479, + "grad_norm": 1.508211612701416, + "learning_rate": 1.0906802849863146e-05, + "loss": 0.1294, + "step": 20724 + }, + { + "epoch": 1.1375411635565313, + "grad_norm": 0.9785498380661011, + "learning_rate": 1.0902478439538438e-05, + "loss": 0.1216, + "step": 20726 + }, + { + "epoch": 1.1376509330406148, + "grad_norm": 1.2594428062438965, + "learning_rate": 1.0898154647600215e-05, + "loss": 0.2439, + "step": 20728 + }, + { + "epoch": 1.137760702524698, + "grad_norm": 1.3092972040176392, + "learning_rate": 1.089383147423815e-05, + "loss": 0.2902, + "step": 20730 + }, + { + "epoch": 1.1378704720087816, + "grad_norm": 0.9952117800712585, + "learning_rate": 1.0889508919641877e-05, + "loss": 0.212, + "step": 20732 + }, + { + "epoch": 1.137980241492865, + "grad_norm": 1.7681180238723755, + "learning_rate": 1.0885186984001e-05, + "loss": 0.268, + "step": 20734 + }, + { + "epoch": 1.1380900109769483, + "grad_norm": 1.0143285989761353, + "learning_rate": 1.0880865667505092e-05, + "loss": 0.1576, + "step": 20736 + }, + { + "epoch": 1.1381997804610318, + "grad_norm": 1.3182802200317383, + "learning_rate": 1.0876544970343728e-05, + "loss": 0.2074, + "step": 20738 + }, + { + "epoch": 1.1383095499451152, + "grad_norm": 1.277477741241455, + "learning_rate": 1.087222489270642e-05, + "loss": 0.162, + "step": 20740 + }, + { + "epoch": 1.1384193194291987, + "grad_norm": 1.250053882598877, + "learning_rate": 1.0867905434782662e-05, + "loss": 0.2838, + "step": 20742 + }, + { + "epoch": 1.1385290889132822, + "grad_norm": 3.20788836479187, + "learning_rate": 1.0863586596761949e-05, + "loss": 0.1992, + "step": 20744 + }, + { + "epoch": 1.1386388583973654, + "grad_norm": 1.596729040145874, + "learning_rate": 1.0859268378833712e-05, + "loss": 0.2488, + "step": 20746 + }, + { + "epoch": 1.138748627881449, + "grad_norm": 1.097132682800293, + "learning_rate": 1.085495078118737e-05, + "loss": 0.2354, + "step": 20748 + }, + { + "epoch": 1.1388583973655324, + "grad_norm": 2.009678363800049, + "learning_rate": 1.0850633804012314e-05, + "loss": 0.3161, + "step": 20750 + }, + { + "epoch": 1.1389681668496159, + "grad_norm": 1.372841238975525, + "learning_rate": 1.0846317447497909e-05, + "loss": 0.2287, + "step": 20752 + }, + { + "epoch": 1.1390779363336991, + "grad_norm": 2.0691652297973633, + "learning_rate": 1.0842001711833483e-05, + "loss": 0.2131, + "step": 20754 + }, + { + "epoch": 1.1391877058177826, + "grad_norm": 1.067245364189148, + "learning_rate": 1.0837686597208363e-05, + "loss": 0.2964, + "step": 20756 + }, + { + "epoch": 1.139297475301866, + "grad_norm": 0.8411799073219299, + "learning_rate": 1.083337210381182e-05, + "loss": 0.147, + "step": 20758 + }, + { + "epoch": 1.1394072447859496, + "grad_norm": 1.6860307455062866, + "learning_rate": 1.0829058231833112e-05, + "loss": 0.2816, + "step": 20760 + }, + { + "epoch": 1.1395170142700328, + "grad_norm": 0.8637361526489258, + "learning_rate": 1.0824744981461462e-05, + "loss": 0.1341, + "step": 20762 + }, + { + "epoch": 1.1396267837541163, + "grad_norm": 0.9964738488197327, + "learning_rate": 1.0820432352886067e-05, + "loss": 0.1474, + "step": 20764 + }, + { + "epoch": 1.1397365532381998, + "grad_norm": 1.2713978290557861, + "learning_rate": 1.0816120346296116e-05, + "loss": 0.2046, + "step": 20766 + }, + { + "epoch": 1.1398463227222833, + "grad_norm": 1.581196665763855, + "learning_rate": 1.0811808961880734e-05, + "loss": 0.2148, + "step": 20768 + }, + { + "epoch": 1.1399560922063667, + "grad_norm": 1.492699384689331, + "learning_rate": 1.080749819982906e-05, + "loss": 0.2081, + "step": 20770 + }, + { + "epoch": 1.14006586169045, + "grad_norm": 1.307786464691162, + "learning_rate": 1.0803188060330176e-05, + "loss": 0.2194, + "step": 20772 + }, + { + "epoch": 1.1401756311745335, + "grad_norm": 1.2309929132461548, + "learning_rate": 1.0798878543573149e-05, + "loss": 0.2252, + "step": 20774 + }, + { + "epoch": 1.140285400658617, + "grad_norm": 1.101128101348877, + "learning_rate": 1.0794569649747007e-05, + "loss": 0.1242, + "step": 20776 + }, + { + "epoch": 1.1403951701427004, + "grad_norm": 1.3149985074996948, + "learning_rate": 1.0790261379040758e-05, + "loss": 0.1965, + "step": 20778 + }, + { + "epoch": 1.1405049396267837, + "grad_norm": 1.3738597631454468, + "learning_rate": 1.07859537316434e-05, + "loss": 0.2703, + "step": 20780 + }, + { + "epoch": 1.1406147091108672, + "grad_norm": 0.8828713297843933, + "learning_rate": 1.0781646707743877e-05, + "loss": 0.2482, + "step": 20782 + }, + { + "epoch": 1.1407244785949506, + "grad_norm": 1.8674358129501343, + "learning_rate": 1.077734030753112e-05, + "loss": 0.2183, + "step": 20784 + }, + { + "epoch": 1.140834248079034, + "grad_norm": 0.8459065556526184, + "learning_rate": 1.0773034531194024e-05, + "loss": 0.1646, + "step": 20786 + }, + { + "epoch": 1.1409440175631174, + "grad_norm": 1.3591711521148682, + "learning_rate": 1.0768729378921463e-05, + "loss": 0.2579, + "step": 20788 + }, + { + "epoch": 1.1410537870472008, + "grad_norm": 0.8430252075195312, + "learning_rate": 1.0764424850902272e-05, + "loss": 0.1577, + "step": 20790 + }, + { + "epoch": 1.1411635565312843, + "grad_norm": 1.8448668718338013, + "learning_rate": 1.0760120947325292e-05, + "loss": 0.2007, + "step": 20792 + }, + { + "epoch": 1.1412733260153678, + "grad_norm": 1.1233892440795898, + "learning_rate": 1.0755817668379299e-05, + "loss": 0.214, + "step": 20794 + }, + { + "epoch": 1.1413830954994513, + "grad_norm": 0.8896909356117249, + "learning_rate": 1.0751515014253055e-05, + "loss": 0.1128, + "step": 20796 + }, + { + "epoch": 1.1414928649835345, + "grad_norm": 1.3119513988494873, + "learning_rate": 1.0747212985135293e-05, + "loss": 0.1597, + "step": 20798 + }, + { + "epoch": 1.141602634467618, + "grad_norm": 0.7286577224731445, + "learning_rate": 1.0742911581214735e-05, + "loss": 0.1215, + "step": 20800 + }, + { + "epoch": 1.1417124039517015, + "grad_norm": 1.657090425491333, + "learning_rate": 1.0738610802680052e-05, + "loss": 0.2278, + "step": 20802 + }, + { + "epoch": 1.1418221734357847, + "grad_norm": 1.4405653476715088, + "learning_rate": 1.0734310649719889e-05, + "loss": 0.2582, + "step": 20804 + }, + { + "epoch": 1.1419319429198682, + "grad_norm": 1.7116875648498535, + "learning_rate": 1.073001112252289e-05, + "loss": 0.3356, + "step": 20806 + }, + { + "epoch": 1.1420417124039517, + "grad_norm": 1.2157663106918335, + "learning_rate": 1.0725712221277643e-05, + "loss": 0.1319, + "step": 20808 + }, + { + "epoch": 1.1421514818880352, + "grad_norm": 0.9870991110801697, + "learning_rate": 1.0721413946172724e-05, + "loss": 0.2297, + "step": 20810 + }, + { + "epoch": 1.1422612513721186, + "grad_norm": 0.9907146096229553, + "learning_rate": 1.0717116297396671e-05, + "loss": 0.1621, + "step": 20812 + }, + { + "epoch": 1.142371020856202, + "grad_norm": 1.0230578184127808, + "learning_rate": 1.071281927513799e-05, + "loss": 0.2141, + "step": 20814 + }, + { + "epoch": 1.1424807903402854, + "grad_norm": 1.3605146408081055, + "learning_rate": 1.0708522879585192e-05, + "loss": 0.1968, + "step": 20816 + }, + { + "epoch": 1.1425905598243689, + "grad_norm": 1.2206364870071411, + "learning_rate": 1.0704227110926729e-05, + "loss": 0.223, + "step": 20818 + }, + { + "epoch": 1.1427003293084523, + "grad_norm": 1.1414451599121094, + "learning_rate": 1.0699931969351033e-05, + "loss": 0.2305, + "step": 20820 + }, + { + "epoch": 1.1428100987925356, + "grad_norm": 1.1216500997543335, + "learning_rate": 1.0695637455046505e-05, + "loss": 0.2188, + "step": 20822 + }, + { + "epoch": 1.142919868276619, + "grad_norm": 0.864373505115509, + "learning_rate": 1.069134356820153e-05, + "loss": 0.1871, + "step": 20824 + }, + { + "epoch": 1.1430296377607025, + "grad_norm": 1.2583887577056885, + "learning_rate": 1.0687050309004451e-05, + "loss": 0.3271, + "step": 20826 + }, + { + "epoch": 1.143139407244786, + "grad_norm": 2.0958974361419678, + "learning_rate": 1.0682757677643596e-05, + "loss": 0.2304, + "step": 20828 + }, + { + "epoch": 1.1432491767288693, + "grad_norm": 1.109207034111023, + "learning_rate": 1.0678465674307273e-05, + "loss": 0.1567, + "step": 20830 + }, + { + "epoch": 1.1433589462129528, + "grad_norm": 1.6562321186065674, + "learning_rate": 1.0674174299183737e-05, + "loss": 0.2499, + "step": 20832 + }, + { + "epoch": 1.1434687156970362, + "grad_norm": 1.2481398582458496, + "learning_rate": 1.0669883552461232e-05, + "loss": 0.1487, + "step": 20834 + }, + { + "epoch": 1.1435784851811197, + "grad_norm": 0.7747460007667542, + "learning_rate": 1.0665593434327973e-05, + "loss": 0.173, + "step": 20836 + }, + { + "epoch": 1.1436882546652032, + "grad_norm": 1.018692135810852, + "learning_rate": 1.066130394497214e-05, + "loss": 0.167, + "step": 20838 + }, + { + "epoch": 1.1437980241492864, + "grad_norm": 0.8455358743667603, + "learning_rate": 1.0657015084581887e-05, + "loss": 0.2074, + "step": 20840 + }, + { + "epoch": 1.14390779363337, + "grad_norm": 1.2254642248153687, + "learning_rate": 1.0652726853345362e-05, + "loss": 0.1753, + "step": 20842 + }, + { + "epoch": 1.1440175631174534, + "grad_norm": 0.89448481798172, + "learning_rate": 1.0648439251450656e-05, + "loss": 0.1324, + "step": 20844 + }, + { + "epoch": 1.1441273326015367, + "grad_norm": 0.9550970196723938, + "learning_rate": 1.0644152279085848e-05, + "loss": 0.1802, + "step": 20846 + }, + { + "epoch": 1.1442371020856201, + "grad_norm": 1.012632131576538, + "learning_rate": 1.0639865936438983e-05, + "loss": 0.2309, + "step": 20848 + }, + { + "epoch": 1.1443468715697036, + "grad_norm": 1.235690712928772, + "learning_rate": 1.0635580223698071e-05, + "loss": 0.234, + "step": 20850 + }, + { + "epoch": 1.144456641053787, + "grad_norm": 1.1652936935424805, + "learning_rate": 1.0631295141051125e-05, + "loss": 0.2055, + "step": 20852 + }, + { + "epoch": 1.1445664105378706, + "grad_norm": 1.155393362045288, + "learning_rate": 1.0627010688686102e-05, + "loss": 0.147, + "step": 20854 + }, + { + "epoch": 1.1446761800219538, + "grad_norm": 1.5336222648620605, + "learning_rate": 1.0622726866790936e-05, + "loss": 0.3047, + "step": 20856 + }, + { + "epoch": 1.1447859495060373, + "grad_norm": 0.977475106716156, + "learning_rate": 1.0618443675553527e-05, + "loss": 0.1551, + "step": 20858 + }, + { + "epoch": 1.1448957189901208, + "grad_norm": 1.0113334655761719, + "learning_rate": 1.0614161115161777e-05, + "loss": 0.2212, + "step": 20860 + }, + { + "epoch": 1.1450054884742042, + "grad_norm": 0.941250205039978, + "learning_rate": 1.0609879185803529e-05, + "loss": 0.153, + "step": 20862 + }, + { + "epoch": 1.1451152579582875, + "grad_norm": 1.0039470195770264, + "learning_rate": 1.0605597887666604e-05, + "loss": 0.1205, + "step": 20864 + }, + { + "epoch": 1.145225027442371, + "grad_norm": 1.0703281164169312, + "learning_rate": 1.0601317220938815e-05, + "loss": 0.1368, + "step": 20866 + }, + { + "epoch": 1.1453347969264545, + "grad_norm": 1.0989388227462769, + "learning_rate": 1.0597037185807925e-05, + "loss": 0.1451, + "step": 20868 + }, + { + "epoch": 1.145444566410538, + "grad_norm": 2.189478635787964, + "learning_rate": 1.059275778246168e-05, + "loss": 0.154, + "step": 20870 + }, + { + "epoch": 1.1455543358946212, + "grad_norm": 1.056952714920044, + "learning_rate": 1.0588479011087793e-05, + "loss": 0.1821, + "step": 20872 + }, + { + "epoch": 1.1456641053787047, + "grad_norm": 1.005193829536438, + "learning_rate": 1.0584200871873953e-05, + "loss": 0.1243, + "step": 20874 + }, + { + "epoch": 1.1457738748627881, + "grad_norm": 1.5032416582107544, + "learning_rate": 1.057992336500781e-05, + "loss": 0.3181, + "step": 20876 + }, + { + "epoch": 1.1458836443468716, + "grad_norm": 1.3436188697814941, + "learning_rate": 1.0575646490677013e-05, + "loss": 0.3271, + "step": 20878 + }, + { + "epoch": 1.145993413830955, + "grad_norm": 1.4099427461624146, + "learning_rate": 1.0571370249069162e-05, + "loss": 0.1779, + "step": 20880 + }, + { + "epoch": 1.1461031833150384, + "grad_norm": 0.9446223974227905, + "learning_rate": 1.0567094640371832e-05, + "loss": 0.1654, + "step": 20882 + }, + { + "epoch": 1.1462129527991218, + "grad_norm": 1.0494047403335571, + "learning_rate": 1.0562819664772572e-05, + "loss": 0.1447, + "step": 20884 + }, + { + "epoch": 1.1463227222832053, + "grad_norm": 1.2322684526443481, + "learning_rate": 1.0558545322458901e-05, + "loss": 0.1603, + "step": 20886 + }, + { + "epoch": 1.1464324917672888, + "grad_norm": 1.342512607574463, + "learning_rate": 1.0554271613618308e-05, + "loss": 0.1398, + "step": 20888 + }, + { + "epoch": 1.146542261251372, + "grad_norm": 1.2115861177444458, + "learning_rate": 1.0549998538438265e-05, + "loss": 0.2364, + "step": 20890 + }, + { + "epoch": 1.1466520307354555, + "grad_norm": 1.6398557424545288, + "learning_rate": 1.0545726097106223e-05, + "loss": 0.2521, + "step": 20892 + }, + { + "epoch": 1.146761800219539, + "grad_norm": 0.8168389797210693, + "learning_rate": 1.0541454289809577e-05, + "loss": 0.127, + "step": 20894 + }, + { + "epoch": 1.1468715697036225, + "grad_norm": 1.0401983261108398, + "learning_rate": 1.0537183116735714e-05, + "loss": 0.1598, + "step": 20896 + }, + { + "epoch": 1.1469813391877057, + "grad_norm": 1.095367670059204, + "learning_rate": 1.0532912578071985e-05, + "loss": 0.1615, + "step": 20898 + }, + { + "epoch": 1.1470911086717892, + "grad_norm": 2.16683030128479, + "learning_rate": 1.0528642674005711e-05, + "loss": 0.2527, + "step": 20900 + }, + { + "epoch": 1.1472008781558727, + "grad_norm": 1.0592278242111206, + "learning_rate": 1.0524373404724209e-05, + "loss": 0.1877, + "step": 20902 + }, + { + "epoch": 1.1473106476399562, + "grad_norm": 1.190300464630127, + "learning_rate": 1.052010477041474e-05, + "loss": 0.3122, + "step": 20904 + }, + { + "epoch": 1.1474204171240396, + "grad_norm": 1.0280286073684692, + "learning_rate": 1.0515836771264544e-05, + "loss": 0.1953, + "step": 20906 + }, + { + "epoch": 1.147530186608123, + "grad_norm": 1.4837149381637573, + "learning_rate": 1.0511569407460845e-05, + "loss": 0.2798, + "step": 20908 + }, + { + "epoch": 1.1476399560922064, + "grad_norm": 1.2442110776901245, + "learning_rate": 1.0507302679190823e-05, + "loss": 0.1899, + "step": 20910 + }, + { + "epoch": 1.1477497255762898, + "grad_norm": 1.7038031816482544, + "learning_rate": 1.0503036586641629e-05, + "loss": 0.2038, + "step": 20912 + }, + { + "epoch": 1.147859495060373, + "grad_norm": 1.734635591506958, + "learning_rate": 1.0498771130000415e-05, + "loss": 0.2678, + "step": 20914 + }, + { + "epoch": 1.1479692645444566, + "grad_norm": 1.2565789222717285, + "learning_rate": 1.0494506309454277e-05, + "loss": 0.2541, + "step": 20916 + }, + { + "epoch": 1.14807903402854, + "grad_norm": 1.4588556289672852, + "learning_rate": 1.049024212519028e-05, + "loss": 0.2476, + "step": 20918 + }, + { + "epoch": 1.1481888035126235, + "grad_norm": 1.1355867385864258, + "learning_rate": 1.0485978577395491e-05, + "loss": 0.1547, + "step": 20920 + }, + { + "epoch": 1.148298572996707, + "grad_norm": 0.7039055824279785, + "learning_rate": 1.048171566625692e-05, + "loss": 0.2164, + "step": 20922 + }, + { + "epoch": 1.1484083424807903, + "grad_norm": 1.7450146675109863, + "learning_rate": 1.0477453391961558e-05, + "loss": 0.2045, + "step": 20924 + }, + { + "epoch": 1.1485181119648737, + "grad_norm": 1.0683432817459106, + "learning_rate": 1.0473191754696365e-05, + "loss": 0.1361, + "step": 20926 + }, + { + "epoch": 1.1486278814489572, + "grad_norm": 1.1075547933578491, + "learning_rate": 1.046893075464829e-05, + "loss": 0.1408, + "step": 20928 + }, + { + "epoch": 1.1487376509330407, + "grad_norm": 1.1947437524795532, + "learning_rate": 1.0464670392004235e-05, + "loss": 0.2876, + "step": 20930 + }, + { + "epoch": 1.148847420417124, + "grad_norm": 1.3100879192352295, + "learning_rate": 1.0460410666951081e-05, + "loss": 0.1796, + "step": 20932 + }, + { + "epoch": 1.1489571899012074, + "grad_norm": 1.0441514253616333, + "learning_rate": 1.0456151579675682e-05, + "loss": 0.1819, + "step": 20934 + }, + { + "epoch": 1.149066959385291, + "grad_norm": 1.583268404006958, + "learning_rate": 1.0451893130364857e-05, + "loss": 0.1802, + "step": 20936 + }, + { + "epoch": 1.1491767288693744, + "grad_norm": 0.9053165912628174, + "learning_rate": 1.0447635319205398e-05, + "loss": 0.146, + "step": 20938 + }, + { + "epoch": 1.1492864983534576, + "grad_norm": 1.1929094791412354, + "learning_rate": 1.0443378146384089e-05, + "loss": 0.2074, + "step": 20940 + }, + { + "epoch": 1.1493962678375411, + "grad_norm": 1.2699484825134277, + "learning_rate": 1.0439121612087663e-05, + "loss": 0.1446, + "step": 20942 + }, + { + "epoch": 1.1495060373216246, + "grad_norm": 0.9343648552894592, + "learning_rate": 1.0434865716502834e-05, + "loss": 0.1809, + "step": 20944 + }, + { + "epoch": 1.149615806805708, + "grad_norm": 0.8560636639595032, + "learning_rate": 1.0430610459816281e-05, + "loss": 0.1329, + "step": 20946 + }, + { + "epoch": 1.1497255762897916, + "grad_norm": 0.9560309052467346, + "learning_rate": 1.0426355842214657e-05, + "loss": 0.1563, + "step": 20948 + }, + { + "epoch": 1.1498353457738748, + "grad_norm": 0.8181703686714172, + "learning_rate": 1.0422101863884597e-05, + "loss": 0.1316, + "step": 20950 + }, + { + "epoch": 1.1499451152579583, + "grad_norm": 1.4339724779129028, + "learning_rate": 1.0417848525012713e-05, + "loss": 0.2782, + "step": 20952 + }, + { + "epoch": 1.1500548847420418, + "grad_norm": 0.9411487579345703, + "learning_rate": 1.0413595825785564e-05, + "loss": 0.1278, + "step": 20954 + }, + { + "epoch": 1.150164654226125, + "grad_norm": 1.0622127056121826, + "learning_rate": 1.0409343766389695e-05, + "loss": 0.1405, + "step": 20956 + }, + { + "epoch": 1.1502744237102085, + "grad_norm": 1.043217658996582, + "learning_rate": 1.0405092347011627e-05, + "loss": 0.1557, + "step": 20958 + }, + { + "epoch": 1.150384193194292, + "grad_norm": 0.8912575244903564, + "learning_rate": 1.0400841567837843e-05, + "loss": 0.1459, + "step": 20960 + }, + { + "epoch": 1.1504939626783754, + "grad_norm": 1.0400770902633667, + "learning_rate": 1.0396591429054795e-05, + "loss": 0.1894, + "step": 20962 + }, + { + "epoch": 1.150603732162459, + "grad_norm": 1.2555108070373535, + "learning_rate": 1.0392341930848934e-05, + "loss": 0.2644, + "step": 20964 + }, + { + "epoch": 1.1507135016465422, + "grad_norm": 0.9404834508895874, + "learning_rate": 1.0388093073406654e-05, + "loss": 0.266, + "step": 20966 + }, + { + "epoch": 1.1508232711306257, + "grad_norm": 1.3026574850082397, + "learning_rate": 1.0383844856914333e-05, + "loss": 0.2345, + "step": 20968 + }, + { + "epoch": 1.1509330406147091, + "grad_norm": 0.952978253364563, + "learning_rate": 1.0379597281558315e-05, + "loss": 0.1619, + "step": 20970 + }, + { + "epoch": 1.1510428100987926, + "grad_norm": 1.1505070924758911, + "learning_rate": 1.0375350347524923e-05, + "loss": 0.1917, + "step": 20972 + }, + { + "epoch": 1.1511525795828759, + "grad_norm": 1.2141464948654175, + "learning_rate": 1.0371104055000438e-05, + "loss": 0.2452, + "step": 20974 + }, + { + "epoch": 1.1512623490669593, + "grad_norm": 0.8510380983352661, + "learning_rate": 1.036685840417114e-05, + "loss": 0.2907, + "step": 20976 + }, + { + "epoch": 1.1513721185510428, + "grad_norm": 1.7366042137145996, + "learning_rate": 1.0362613395223247e-05, + "loss": 0.1772, + "step": 20978 + }, + { + "epoch": 1.1514818880351263, + "grad_norm": 1.1389737129211426, + "learning_rate": 1.0358369028342985e-05, + "loss": 0.2299, + "step": 20980 + }, + { + "epoch": 1.1515916575192096, + "grad_norm": 0.7725861668586731, + "learning_rate": 1.035412530371652e-05, + "loss": 0.1632, + "step": 20982 + }, + { + "epoch": 1.151701427003293, + "grad_norm": 1.006589412689209, + "learning_rate": 1.0349882221530008e-05, + "loss": 0.1349, + "step": 20984 + }, + { + "epoch": 1.1518111964873765, + "grad_norm": 1.477589726448059, + "learning_rate": 1.034563978196956e-05, + "loss": 0.2067, + "step": 20986 + }, + { + "epoch": 1.15192096597146, + "grad_norm": 0.9683922529220581, + "learning_rate": 1.0341397985221284e-05, + "loss": 0.1792, + "step": 20988 + }, + { + "epoch": 1.1520307354555435, + "grad_norm": 0.9475816488265991, + "learning_rate": 1.0337156831471246e-05, + "loss": 0.1609, + "step": 20990 + }, + { + "epoch": 1.1521405049396267, + "grad_norm": 1.0494275093078613, + "learning_rate": 1.0332916320905479e-05, + "loss": 0.1712, + "step": 20992 + }, + { + "epoch": 1.1522502744237102, + "grad_norm": 1.460396647453308, + "learning_rate": 1.0328676453709987e-05, + "loss": 0.1731, + "step": 20994 + }, + { + "epoch": 1.1523600439077937, + "grad_norm": 0.8528972268104553, + "learning_rate": 1.032443723007076e-05, + "loss": 0.1674, + "step": 20996 + }, + { + "epoch": 1.1524698133918772, + "grad_norm": 0.9760070443153381, + "learning_rate": 1.032019865017374e-05, + "loss": 0.1743, + "step": 20998 + }, + { + "epoch": 1.1525795828759604, + "grad_norm": 1.0819107294082642, + "learning_rate": 1.031596071420487e-05, + "loss": 0.1718, + "step": 21000 + }, + { + "epoch": 1.1526893523600439, + "grad_norm": 1.5305120944976807, + "learning_rate": 1.0311723422350034e-05, + "loss": 0.1656, + "step": 21002 + }, + { + "epoch": 1.1527991218441274, + "grad_norm": 2.106497049331665, + "learning_rate": 1.0307486774795102e-05, + "loss": 0.1972, + "step": 21004 + }, + { + "epoch": 1.1529088913282108, + "grad_norm": 0.9286597967147827, + "learning_rate": 1.0303250771725918e-05, + "loss": 0.3119, + "step": 21006 + }, + { + "epoch": 1.153018660812294, + "grad_norm": 1.0182043313980103, + "learning_rate": 1.0299015413328289e-05, + "loss": 0.1558, + "step": 21008 + }, + { + "epoch": 1.1531284302963776, + "grad_norm": 1.02573561668396, + "learning_rate": 1.0294780699787993e-05, + "loss": 0.1999, + "step": 21010 + }, + { + "epoch": 1.153238199780461, + "grad_norm": 1.2851755619049072, + "learning_rate": 1.0290546631290793e-05, + "loss": 0.1872, + "step": 21012 + }, + { + "epoch": 1.1533479692645445, + "grad_norm": 0.8376994729042053, + "learning_rate": 1.0286313208022424e-05, + "loss": 0.1167, + "step": 21014 + }, + { + "epoch": 1.153457738748628, + "grad_norm": 0.8143194317817688, + "learning_rate": 1.0282080430168579e-05, + "loss": 0.1677, + "step": 21016 + }, + { + "epoch": 1.1535675082327113, + "grad_norm": 1.335087776184082, + "learning_rate": 1.0277848297914925e-05, + "loss": 0.1802, + "step": 21018 + }, + { + "epoch": 1.1536772777167947, + "grad_norm": 1.225350260734558, + "learning_rate": 1.0273616811447104e-05, + "loss": 0.1788, + "step": 21020 + }, + { + "epoch": 1.1537870472008782, + "grad_norm": 2.133575916290283, + "learning_rate": 1.026938597095073e-05, + "loss": 0.207, + "step": 21022 + }, + { + "epoch": 1.1538968166849615, + "grad_norm": 1.1042191982269287, + "learning_rate": 1.0265155776611382e-05, + "loss": 0.2191, + "step": 21024 + }, + { + "epoch": 1.154006586169045, + "grad_norm": 1.4274792671203613, + "learning_rate": 1.0260926228614634e-05, + "loss": 0.1365, + "step": 21026 + }, + { + "epoch": 1.1541163556531284, + "grad_norm": 1.2037663459777832, + "learning_rate": 1.0256697327146003e-05, + "loss": 0.1642, + "step": 21028 + }, + { + "epoch": 1.154226125137212, + "grad_norm": 1.0545319318771362, + "learning_rate": 1.0252469072390994e-05, + "loss": 0.2328, + "step": 21030 + }, + { + "epoch": 1.1543358946212954, + "grad_norm": 1.5496934652328491, + "learning_rate": 1.0248241464535078e-05, + "loss": 0.2037, + "step": 21032 + }, + { + "epoch": 1.1544456641053786, + "grad_norm": 0.8943925499916077, + "learning_rate": 1.0244014503763686e-05, + "loss": 0.1808, + "step": 21034 + }, + { + "epoch": 1.154555433589462, + "grad_norm": 1.312510371208191, + "learning_rate": 1.0239788190262254e-05, + "loss": 0.2521, + "step": 21036 + }, + { + "epoch": 1.1546652030735456, + "grad_norm": 0.8868182897567749, + "learning_rate": 1.0235562524216158e-05, + "loss": 0.255, + "step": 21038 + }, + { + "epoch": 1.154774972557629, + "grad_norm": 1.1684657335281372, + "learning_rate": 1.023133750581075e-05, + "loss": 0.2202, + "step": 21040 + }, + { + "epoch": 1.1548847420417123, + "grad_norm": 1.5561034679412842, + "learning_rate": 1.0227113135231378e-05, + "loss": 0.1877, + "step": 21042 + }, + { + "epoch": 1.1549945115257958, + "grad_norm": 0.8730041980743408, + "learning_rate": 1.022288941266333e-05, + "loss": 0.088, + "step": 21044 + }, + { + "epoch": 1.1551042810098793, + "grad_norm": 1.349073052406311, + "learning_rate": 1.0218666338291886e-05, + "loss": 0.2025, + "step": 21046 + }, + { + "epoch": 1.1552140504939628, + "grad_norm": 0.956926703453064, + "learning_rate": 1.0214443912302276e-05, + "loss": 0.1577, + "step": 21048 + }, + { + "epoch": 1.155323819978046, + "grad_norm": 1.0979161262512207, + "learning_rate": 1.021022213487974e-05, + "loss": 0.1958, + "step": 21050 + }, + { + "epoch": 1.1554335894621295, + "grad_norm": 1.151856780052185, + "learning_rate": 1.020600100620945e-05, + "loss": 0.1716, + "step": 21052 + }, + { + "epoch": 1.155543358946213, + "grad_norm": 1.4987285137176514, + "learning_rate": 1.0201780526476574e-05, + "loss": 0.2026, + "step": 21054 + }, + { + "epoch": 1.1556531284302964, + "grad_norm": 0.8972800374031067, + "learning_rate": 1.0197560695866234e-05, + "loss": 0.1822, + "step": 21056 + }, + { + "epoch": 1.15576289791438, + "grad_norm": 1.0768448114395142, + "learning_rate": 1.0193341514563537e-05, + "loss": 0.1537, + "step": 21058 + }, + { + "epoch": 1.1558726673984632, + "grad_norm": 1.2916849851608276, + "learning_rate": 1.0189122982753549e-05, + "loss": 0.1935, + "step": 21060 + }, + { + "epoch": 1.1559824368825466, + "grad_norm": 4.2274932861328125, + "learning_rate": 1.018490510062133e-05, + "loss": 0.2515, + "step": 21062 + }, + { + "epoch": 1.1560922063666301, + "grad_norm": 0.8538386821746826, + "learning_rate": 1.0180687868351892e-05, + "loss": 0.1885, + "step": 21064 + }, + { + "epoch": 1.1562019758507134, + "grad_norm": 1.5701006650924683, + "learning_rate": 1.017647128613022e-05, + "loss": 0.1976, + "step": 21066 + }, + { + "epoch": 1.1563117453347969, + "grad_norm": 1.1168707609176636, + "learning_rate": 1.0172255354141278e-05, + "loss": 0.1917, + "step": 21068 + }, + { + "epoch": 1.1564215148188803, + "grad_norm": 0.8821299076080322, + "learning_rate": 1.0168040072569984e-05, + "loss": 0.2089, + "step": 21070 + }, + { + "epoch": 1.1565312843029638, + "grad_norm": 0.833143949508667, + "learning_rate": 1.016382544160126e-05, + "loss": 0.189, + "step": 21072 + }, + { + "epoch": 1.1566410537870473, + "grad_norm": 1.2299168109893799, + "learning_rate": 1.0159611461419965e-05, + "loss": 0.184, + "step": 21074 + }, + { + "epoch": 1.1567508232711305, + "grad_norm": 1.0169944763183594, + "learning_rate": 1.0155398132210961e-05, + "loss": 0.1944, + "step": 21076 + }, + { + "epoch": 1.156860592755214, + "grad_norm": 1.9305638074874878, + "learning_rate": 1.0151185454159057e-05, + "loss": 0.1958, + "step": 21078 + }, + { + "epoch": 1.1569703622392975, + "grad_norm": 1.738555669784546, + "learning_rate": 1.0146973427449038e-05, + "loss": 0.1777, + "step": 21080 + }, + { + "epoch": 1.157080131723381, + "grad_norm": 1.1027271747589111, + "learning_rate": 1.0142762052265673e-05, + "loss": 0.1704, + "step": 21082 + }, + { + "epoch": 1.1571899012074642, + "grad_norm": 1.2747174501419067, + "learning_rate": 1.0138551328793674e-05, + "loss": 0.1686, + "step": 21084 + }, + { + "epoch": 1.1572996706915477, + "grad_norm": 1.1873278617858887, + "learning_rate": 1.013434125721777e-05, + "loss": 0.169, + "step": 21086 + }, + { + "epoch": 1.1574094401756312, + "grad_norm": 1.9497864246368408, + "learning_rate": 1.0130131837722623e-05, + "loss": 0.2198, + "step": 21088 + }, + { + "epoch": 1.1575192096597147, + "grad_norm": 1.7915929555892944, + "learning_rate": 1.0125923070492879e-05, + "loss": 0.3011, + "step": 21090 + }, + { + "epoch": 1.157628979143798, + "grad_norm": 0.8637761473655701, + "learning_rate": 1.0121714955713154e-05, + "loss": 0.2179, + "step": 21092 + }, + { + "epoch": 1.1577387486278814, + "grad_norm": 1.2152717113494873, + "learning_rate": 1.011750749356804e-05, + "loss": 0.3524, + "step": 21094 + }, + { + "epoch": 1.1578485181119649, + "grad_norm": 0.8262462019920349, + "learning_rate": 1.0113300684242083e-05, + "loss": 0.2147, + "step": 21096 + }, + { + "epoch": 1.1579582875960484, + "grad_norm": 0.9867494702339172, + "learning_rate": 1.0109094527919838e-05, + "loss": 0.139, + "step": 21098 + }, + { + "epoch": 1.1580680570801318, + "grad_norm": 1.0762089490890503, + "learning_rate": 1.0104889024785785e-05, + "loss": 0.1907, + "step": 21100 + }, + { + "epoch": 1.158177826564215, + "grad_norm": 0.7762182354927063, + "learning_rate": 1.010068417502442e-05, + "loss": 0.1608, + "step": 21102 + }, + { + "epoch": 1.1582875960482986, + "grad_norm": 1.194292426109314, + "learning_rate": 1.0096479978820176e-05, + "loss": 0.1947, + "step": 21104 + }, + { + "epoch": 1.158397365532382, + "grad_norm": 0.828415036201477, + "learning_rate": 1.0092276436357474e-05, + "loss": 0.2458, + "step": 21106 + }, + { + "epoch": 1.1585071350164655, + "grad_norm": 1.066938877105713, + "learning_rate": 1.0088073547820692e-05, + "loss": 0.185, + "step": 21108 + }, + { + "epoch": 1.1586169045005488, + "grad_norm": 1.515623927116394, + "learning_rate": 1.0083871313394192e-05, + "loss": 0.1894, + "step": 21110 + }, + { + "epoch": 1.1587266739846322, + "grad_norm": 1.1289575099945068, + "learning_rate": 1.0079669733262317e-05, + "loss": 0.2308, + "step": 21112 + }, + { + "epoch": 1.1588364434687157, + "grad_norm": 1.167896032333374, + "learning_rate": 1.0075468807609362e-05, + "loss": 0.1856, + "step": 21114 + }, + { + "epoch": 1.1589462129527992, + "grad_norm": 1.5374715328216553, + "learning_rate": 1.0071268536619596e-05, + "loss": 0.1913, + "step": 21116 + }, + { + "epoch": 1.1590559824368825, + "grad_norm": 0.8393316268920898, + "learning_rate": 1.0067068920477271e-05, + "loss": 0.2059, + "step": 21118 + }, + { + "epoch": 1.159165751920966, + "grad_norm": 0.9970742464065552, + "learning_rate": 1.0062869959366585e-05, + "loss": 0.1415, + "step": 21120 + }, + { + "epoch": 1.1592755214050494, + "grad_norm": 1.6907802820205688, + "learning_rate": 1.005867165347175e-05, + "loss": 0.2304, + "step": 21122 + }, + { + "epoch": 1.159385290889133, + "grad_norm": 1.1227631568908691, + "learning_rate": 1.005447400297691e-05, + "loss": 0.3034, + "step": 21124 + }, + { + "epoch": 1.1594950603732164, + "grad_norm": 1.3542029857635498, + "learning_rate": 1.00502770080662e-05, + "loss": 0.166, + "step": 21126 + }, + { + "epoch": 1.1596048298572996, + "grad_norm": 1.1594866514205933, + "learning_rate": 1.0046080668923717e-05, + "loss": 0.2345, + "step": 21128 + }, + { + "epoch": 1.159714599341383, + "grad_norm": 1.1289665699005127, + "learning_rate": 1.0041884985733524e-05, + "loss": 0.1611, + "step": 21130 + }, + { + "epoch": 1.1598243688254666, + "grad_norm": 1.1324849128723145, + "learning_rate": 1.0037689958679686e-05, + "loss": 0.1359, + "step": 21132 + }, + { + "epoch": 1.1599341383095498, + "grad_norm": 1.4390227794647217, + "learning_rate": 1.0033495587946193e-05, + "loss": 0.1886, + "step": 21134 + }, + { + "epoch": 1.1600439077936333, + "grad_norm": 0.7969996929168701, + "learning_rate": 1.0029301873717053e-05, + "loss": 0.1621, + "step": 21136 + }, + { + "epoch": 1.1601536772777168, + "grad_norm": 1.0027321577072144, + "learning_rate": 1.0025108816176215e-05, + "loss": 0.1394, + "step": 21138 + }, + { + "epoch": 1.1602634467618003, + "grad_norm": 2.184389114379883, + "learning_rate": 1.0020916415507606e-05, + "loss": 0.1971, + "step": 21140 + }, + { + "epoch": 1.1603732162458837, + "grad_norm": 0.9935865998268127, + "learning_rate": 1.0016724671895125e-05, + "loss": 0.2054, + "step": 21142 + }, + { + "epoch": 1.160482985729967, + "grad_norm": 2.832158088684082, + "learning_rate": 1.0012533585522641e-05, + "loss": 0.169, + "step": 21144 + }, + { + "epoch": 1.1605927552140505, + "grad_norm": 0.9286994338035583, + "learning_rate": 1.0008343156573987e-05, + "loss": 0.2365, + "step": 21146 + }, + { + "epoch": 1.160702524698134, + "grad_norm": 1.045240044593811, + "learning_rate": 1.0004153385232995e-05, + "loss": 0.1547, + "step": 21148 + }, + { + "epoch": 1.1608122941822174, + "grad_norm": 1.1082581281661987, + "learning_rate": 9.99996427168344e-06, + "loss": 0.2014, + "step": 21150 + }, + { + "epoch": 1.1609220636663007, + "grad_norm": 1.233285903930664, + "learning_rate": 9.995775816109077e-06, + "loss": 0.1368, + "step": 21152 + }, + { + "epoch": 1.1610318331503842, + "grad_norm": 1.1187928915023804, + "learning_rate": 9.991588018693629e-06, + "loss": 0.1316, + "step": 21154 + }, + { + "epoch": 1.1611416026344676, + "grad_norm": 1.1886756420135498, + "learning_rate": 9.98740087962079e-06, + "loss": 0.2526, + "step": 21156 + }, + { + "epoch": 1.1612513721185511, + "grad_norm": 1.3616149425506592, + "learning_rate": 9.983214399074241e-06, + "loss": 0.211, + "step": 21158 + }, + { + "epoch": 1.1613611416026344, + "grad_norm": 0.8733143210411072, + "learning_rate": 9.979028577237609e-06, + "loss": 0.1399, + "step": 21160 + }, + { + "epoch": 1.1614709110867178, + "grad_norm": 0.6423743963241577, + "learning_rate": 9.974843414294518e-06, + "loss": 0.1086, + "step": 21162 + }, + { + "epoch": 1.1615806805708013, + "grad_norm": 0.8201112151145935, + "learning_rate": 9.970658910428543e-06, + "loss": 0.1716, + "step": 21164 + }, + { + "epoch": 1.1616904500548848, + "grad_norm": 1.0875104665756226, + "learning_rate": 9.966475065823239e-06, + "loss": 0.1834, + "step": 21166 + }, + { + "epoch": 1.1618002195389683, + "grad_norm": 1.0755608081817627, + "learning_rate": 9.962291880662125e-06, + "loss": 0.1512, + "step": 21168 + }, + { + "epoch": 1.1619099890230515, + "grad_norm": 1.0285321474075317, + "learning_rate": 9.958109355128689e-06, + "loss": 0.2131, + "step": 21170 + }, + { + "epoch": 1.162019758507135, + "grad_norm": 0.8759807348251343, + "learning_rate": 9.953927489406416e-06, + "loss": 0.2434, + "step": 21172 + }, + { + "epoch": 1.1621295279912185, + "grad_norm": 1.0527528524398804, + "learning_rate": 9.949746283678735e-06, + "loss": 0.1515, + "step": 21174 + }, + { + "epoch": 1.1622392974753017, + "grad_norm": 0.8554089069366455, + "learning_rate": 9.945565738129053e-06, + "loss": 0.1375, + "step": 21176 + }, + { + "epoch": 1.1623490669593852, + "grad_norm": 1.677246332168579, + "learning_rate": 9.94138585294075e-06, + "loss": 0.2253, + "step": 21178 + }, + { + "epoch": 1.1624588364434687, + "grad_norm": 0.8119385242462158, + "learning_rate": 9.937206628297172e-06, + "loss": 0.1421, + "step": 21180 + }, + { + "epoch": 1.1625686059275522, + "grad_norm": 0.8988960385322571, + "learning_rate": 9.933028064381639e-06, + "loss": 0.1426, + "step": 21182 + }, + { + "epoch": 1.1626783754116357, + "grad_norm": 1.1796317100524902, + "learning_rate": 9.928850161377453e-06, + "loss": 0.2039, + "step": 21184 + }, + { + "epoch": 1.162788144895719, + "grad_norm": 1.4209816455841064, + "learning_rate": 9.924672919467876e-06, + "loss": 0.2471, + "step": 21186 + }, + { + "epoch": 1.1628979143798024, + "grad_norm": 1.1983164548873901, + "learning_rate": 9.920496338836135e-06, + "loss": 0.123, + "step": 21188 + }, + { + "epoch": 1.1630076838638859, + "grad_norm": 1.190880537033081, + "learning_rate": 9.916320419665435e-06, + "loss": 0.1337, + "step": 21190 + }, + { + "epoch": 1.1631174533479693, + "grad_norm": 1.2729190587997437, + "learning_rate": 9.912145162138961e-06, + "loss": 0.2221, + "step": 21192 + }, + { + "epoch": 1.1632272228320526, + "grad_norm": 1.498284935951233, + "learning_rate": 9.907970566439858e-06, + "loss": 0.1496, + "step": 21194 + }, + { + "epoch": 1.163336992316136, + "grad_norm": 1.6097558736801147, + "learning_rate": 9.903796632751234e-06, + "loss": 0.1933, + "step": 21196 + }, + { + "epoch": 1.1634467618002196, + "grad_norm": 0.9661285877227783, + "learning_rate": 9.899623361256193e-06, + "loss": 0.1439, + "step": 21198 + }, + { + "epoch": 1.163556531284303, + "grad_norm": 1.2000874280929565, + "learning_rate": 9.895450752137788e-06, + "loss": 0.2063, + "step": 21200 + }, + { + "epoch": 1.1636663007683863, + "grad_norm": 0.9265146851539612, + "learning_rate": 9.891278805579054e-06, + "loss": 0.1465, + "step": 21202 + }, + { + "epoch": 1.1637760702524698, + "grad_norm": 1.2032766342163086, + "learning_rate": 9.887107521762986e-06, + "loss": 0.157, + "step": 21204 + }, + { + "epoch": 1.1638858397365532, + "grad_norm": 1.3773179054260254, + "learning_rate": 9.882936900872552e-06, + "loss": 0.2852, + "step": 21206 + }, + { + "epoch": 1.1639956092206367, + "grad_norm": 1.00459885597229, + "learning_rate": 9.878766943090717e-06, + "loss": 0.1324, + "step": 21208 + }, + { + "epoch": 1.1641053787047202, + "grad_norm": 2.045078754425049, + "learning_rate": 9.874597648600379e-06, + "loss": 0.3079, + "step": 21210 + }, + { + "epoch": 1.1642151481888035, + "grad_norm": 0.851008951663971, + "learning_rate": 9.870429017584428e-06, + "loss": 0.1637, + "step": 21212 + }, + { + "epoch": 1.164324917672887, + "grad_norm": 0.9076129198074341, + "learning_rate": 9.866261050225726e-06, + "loss": 0.1473, + "step": 21214 + }, + { + "epoch": 1.1644346871569704, + "grad_norm": 1.648476481437683, + "learning_rate": 9.862093746707091e-06, + "loss": 0.2006, + "step": 21216 + }, + { + "epoch": 1.1645444566410539, + "grad_norm": 1.1484379768371582, + "learning_rate": 9.857927107211315e-06, + "loss": 0.1504, + "step": 21218 + }, + { + "epoch": 1.1646542261251371, + "grad_norm": 1.2779799699783325, + "learning_rate": 9.853761131921179e-06, + "loss": 0.1832, + "step": 21220 + }, + { + "epoch": 1.1647639956092206, + "grad_norm": 1.1051486730575562, + "learning_rate": 9.849595821019433e-06, + "loss": 0.2037, + "step": 21222 + }, + { + "epoch": 1.164873765093304, + "grad_norm": 1.0813146829605103, + "learning_rate": 9.845431174688774e-06, + "loss": 0.1857, + "step": 21224 + }, + { + "epoch": 1.1649835345773876, + "grad_norm": 1.3055331707000732, + "learning_rate": 9.841267193111889e-06, + "loss": 0.1624, + "step": 21226 + }, + { + "epoch": 1.1650933040614708, + "grad_norm": 0.9967363476753235, + "learning_rate": 9.837103876471426e-06, + "loss": 0.173, + "step": 21228 + }, + { + "epoch": 1.1652030735455543, + "grad_norm": 1.3563997745513916, + "learning_rate": 9.832941224950012e-06, + "loss": 0.1957, + "step": 21230 + }, + { + "epoch": 1.1653128430296378, + "grad_norm": 1.1557062864303589, + "learning_rate": 9.828779238730231e-06, + "loss": 0.2708, + "step": 21232 + }, + { + "epoch": 1.1654226125137213, + "grad_norm": 1.1165409088134766, + "learning_rate": 9.82461791799467e-06, + "loss": 0.1733, + "step": 21234 + }, + { + "epoch": 1.1655323819978047, + "grad_norm": 0.8563121557235718, + "learning_rate": 9.820457262925848e-06, + "loss": 0.2907, + "step": 21236 + }, + { + "epoch": 1.165642151481888, + "grad_norm": 0.9450798034667969, + "learning_rate": 9.816297273706276e-06, + "loss": 0.1986, + "step": 21238 + }, + { + "epoch": 1.1657519209659715, + "grad_norm": 1.1704185009002686, + "learning_rate": 9.812137950518432e-06, + "loss": 0.2239, + "step": 21240 + }, + { + "epoch": 1.165861690450055, + "grad_norm": 1.3397033214569092, + "learning_rate": 9.807979293544764e-06, + "loss": 0.153, + "step": 21242 + }, + { + "epoch": 1.1659714599341382, + "grad_norm": 0.558979332447052, + "learning_rate": 9.803821302967683e-06, + "loss": 0.1497, + "step": 21244 + }, + { + "epoch": 1.1660812294182217, + "grad_norm": 1.100637435913086, + "learning_rate": 9.799663978969595e-06, + "loss": 0.1625, + "step": 21246 + }, + { + "epoch": 1.1661909989023052, + "grad_norm": 1.0251705646514893, + "learning_rate": 9.795507321732853e-06, + "loss": 0.1245, + "step": 21248 + }, + { + "epoch": 1.1663007683863886, + "grad_norm": 1.162927269935608, + "learning_rate": 9.79135133143978e-06, + "loss": 0.1891, + "step": 21250 + }, + { + "epoch": 1.166410537870472, + "grad_norm": 1.3027604818344116, + "learning_rate": 9.787196008272692e-06, + "loss": 0.3198, + "step": 21252 + }, + { + "epoch": 1.1665203073545554, + "grad_norm": 2.1209588050842285, + "learning_rate": 9.783041352413858e-06, + "loss": 0.2728, + "step": 21254 + }, + { + "epoch": 1.1666300768386388, + "grad_norm": 3.123236656188965, + "learning_rate": 9.778887364045511e-06, + "loss": 0.1919, + "step": 21256 + }, + { + "epoch": 1.1667398463227223, + "grad_norm": 1.3489675521850586, + "learning_rate": 9.774734043349882e-06, + "loss": 0.2368, + "step": 21258 + }, + { + "epoch": 1.1668496158068058, + "grad_norm": 1.4641751050949097, + "learning_rate": 9.770581390509149e-06, + "loss": 0.1956, + "step": 21260 + }, + { + "epoch": 1.166959385290889, + "grad_norm": 0.9558554291725159, + "learning_rate": 9.766429405705462e-06, + "loss": 0.117, + "step": 21262 + }, + { + "epoch": 1.1670691547749725, + "grad_norm": 1.5004278421401978, + "learning_rate": 9.762278089120957e-06, + "loss": 0.2177, + "step": 21264 + }, + { + "epoch": 1.167178924259056, + "grad_norm": 1.5816795825958252, + "learning_rate": 9.758127440937725e-06, + "loss": 0.228, + "step": 21266 + }, + { + "epoch": 1.1672886937431395, + "grad_norm": 1.0014595985412598, + "learning_rate": 9.753977461337827e-06, + "loss": 0.1627, + "step": 21268 + }, + { + "epoch": 1.1673984632272227, + "grad_norm": 0.9646679759025574, + "learning_rate": 9.749828150503316e-06, + "loss": 0.1395, + "step": 21270 + }, + { + "epoch": 1.1675082327113062, + "grad_norm": 0.6269030570983887, + "learning_rate": 9.745679508616195e-06, + "loss": 0.1534, + "step": 21272 + }, + { + "epoch": 1.1676180021953897, + "grad_norm": 1.0014429092407227, + "learning_rate": 9.741531535858445e-06, + "loss": 0.1556, + "step": 21274 + }, + { + "epoch": 1.1677277716794732, + "grad_norm": 0.9793763160705566, + "learning_rate": 9.737384232412014e-06, + "loss": 0.2355, + "step": 21276 + }, + { + "epoch": 1.1678375411635566, + "grad_norm": 1.2158418893814087, + "learning_rate": 9.733237598458821e-06, + "loss": 0.2778, + "step": 21278 + }, + { + "epoch": 1.16794731064764, + "grad_norm": 1.6239186525344849, + "learning_rate": 9.729091634180756e-06, + "loss": 0.1637, + "step": 21280 + }, + { + "epoch": 1.1680570801317234, + "grad_norm": 1.3159606456756592, + "learning_rate": 9.724946339759686e-06, + "loss": 0.1965, + "step": 21282 + }, + { + "epoch": 1.1681668496158069, + "grad_norm": 1.3461272716522217, + "learning_rate": 9.72080171537745e-06, + "loss": 0.2308, + "step": 21284 + }, + { + "epoch": 1.1682766190998901, + "grad_norm": 1.3431997299194336, + "learning_rate": 9.716657761215847e-06, + "loss": 0.1941, + "step": 21286 + }, + { + "epoch": 1.1683863885839736, + "grad_norm": 1.523362159729004, + "learning_rate": 9.71251447745665e-06, + "loss": 0.1766, + "step": 21288 + }, + { + "epoch": 1.168496158068057, + "grad_norm": 0.9604460000991821, + "learning_rate": 9.708371864281602e-06, + "loss": 0.1909, + "step": 21290 + }, + { + "epoch": 1.1686059275521405, + "grad_norm": 1.405334711074829, + "learning_rate": 9.704229921872422e-06, + "loss": 0.1924, + "step": 21292 + }, + { + "epoch": 1.168715697036224, + "grad_norm": 2.3471450805664062, + "learning_rate": 9.700088650410782e-06, + "loss": 0.213, + "step": 21294 + }, + { + "epoch": 1.1688254665203073, + "grad_norm": 1.1895784139633179, + "learning_rate": 9.69594805007836e-06, + "loss": 0.1812, + "step": 21296 + }, + { + "epoch": 1.1689352360043908, + "grad_norm": 0.9532161355018616, + "learning_rate": 9.691808121056773e-06, + "loss": 0.158, + "step": 21298 + }, + { + "epoch": 1.1690450054884742, + "grad_norm": 1.29609215259552, + "learning_rate": 9.68766886352762e-06, + "loss": 0.2187, + "step": 21300 + }, + { + "epoch": 1.1691547749725577, + "grad_norm": 0.9457452297210693, + "learning_rate": 9.683530277672467e-06, + "loss": 0.2433, + "step": 21302 + }, + { + "epoch": 1.169264544456641, + "grad_norm": 0.8692788481712341, + "learning_rate": 9.679392363672846e-06, + "loss": 0.1971, + "step": 21304 + }, + { + "epoch": 1.1693743139407244, + "grad_norm": 1.5145761966705322, + "learning_rate": 9.67525512171028e-06, + "loss": 0.1927, + "step": 21306 + }, + { + "epoch": 1.169484083424808, + "grad_norm": 1.6219699382781982, + "learning_rate": 9.671118551966246e-06, + "loss": 0.251, + "step": 21308 + }, + { + "epoch": 1.1695938529088914, + "grad_norm": 1.4318794012069702, + "learning_rate": 9.66698265462219e-06, + "loss": 0.2422, + "step": 21310 + }, + { + "epoch": 1.1697036223929747, + "grad_norm": 1.135509729385376, + "learning_rate": 9.662847429859522e-06, + "loss": 0.2178, + "step": 21312 + }, + { + "epoch": 1.1698133918770581, + "grad_norm": 1.4529569149017334, + "learning_rate": 9.658712877859657e-06, + "loss": 0.1877, + "step": 21314 + }, + { + "epoch": 1.1699231613611416, + "grad_norm": 1.9784784317016602, + "learning_rate": 9.654578998803945e-06, + "loss": 0.1947, + "step": 21316 + }, + { + "epoch": 1.170032930845225, + "grad_norm": 1.023208737373352, + "learning_rate": 9.650445792873708e-06, + "loss": 0.1583, + "step": 21318 + }, + { + "epoch": 1.1701427003293086, + "grad_norm": 1.1337851285934448, + "learning_rate": 9.646313260250267e-06, + "loss": 0.2476, + "step": 21320 + }, + { + "epoch": 1.1702524698133918, + "grad_norm": 1.5048260688781738, + "learning_rate": 9.642181401114885e-06, + "loss": 0.2956, + "step": 21322 + }, + { + "epoch": 1.1703622392974753, + "grad_norm": 1.3913428783416748, + "learning_rate": 9.638050215648812e-06, + "loss": 0.2895, + "step": 21324 + }, + { + "epoch": 1.1704720087815588, + "grad_norm": 1.2444170713424683, + "learning_rate": 9.633919704033253e-06, + "loss": 0.22, + "step": 21326 + }, + { + "epoch": 1.1705817782656422, + "grad_norm": 1.193678379058838, + "learning_rate": 9.6297898664494e-06, + "loss": 0.2289, + "step": 21328 + }, + { + "epoch": 1.1706915477497255, + "grad_norm": 1.2840933799743652, + "learning_rate": 9.625660703078392e-06, + "loss": 0.2078, + "step": 21330 + }, + { + "epoch": 1.170801317233809, + "grad_norm": 0.8445857763290405, + "learning_rate": 9.62153221410138e-06, + "loss": 0.2678, + "step": 21332 + }, + { + "epoch": 1.1709110867178925, + "grad_norm": 1.7448797225952148, + "learning_rate": 9.617404399699445e-06, + "loss": 0.3244, + "step": 21334 + }, + { + "epoch": 1.171020856201976, + "grad_norm": 0.9430142641067505, + "learning_rate": 9.613277260053657e-06, + "loss": 0.1831, + "step": 21336 + }, + { + "epoch": 1.1711306256860592, + "grad_norm": 0.9613323211669922, + "learning_rate": 9.609150795345051e-06, + "loss": 0.1453, + "step": 21338 + }, + { + "epoch": 1.1712403951701427, + "grad_norm": 1.1344267129898071, + "learning_rate": 9.605025005754622e-06, + "loss": 0.306, + "step": 21340 + }, + { + "epoch": 1.1713501646542261, + "grad_norm": 0.566567063331604, + "learning_rate": 9.600899891463363e-06, + "loss": 0.0742, + "step": 21342 + }, + { + "epoch": 1.1714599341383096, + "grad_norm": 1.0546268224716187, + "learning_rate": 9.596775452652224e-06, + "loss": 0.1608, + "step": 21344 + }, + { + "epoch": 1.171569703622393, + "grad_norm": 0.9420010447502136, + "learning_rate": 9.592651689502119e-06, + "loss": 0.1265, + "step": 21346 + }, + { + "epoch": 1.1716794731064764, + "grad_norm": 1.0913989543914795, + "learning_rate": 9.588528602193933e-06, + "loss": 0.2096, + "step": 21348 + }, + { + "epoch": 1.1717892425905598, + "grad_norm": 1.1330984830856323, + "learning_rate": 9.584406190908527e-06, + "loss": 0.2006, + "step": 21350 + }, + { + "epoch": 1.1718990120746433, + "grad_norm": 2.0603766441345215, + "learning_rate": 9.58028445582673e-06, + "loss": 0.2511, + "step": 21352 + }, + { + "epoch": 1.1720087815587266, + "grad_norm": 1.387519359588623, + "learning_rate": 9.576163397129331e-06, + "loss": 0.1529, + "step": 21354 + }, + { + "epoch": 1.17211855104281, + "grad_norm": 1.1614941358566284, + "learning_rate": 9.57204301499712e-06, + "loss": 0.2564, + "step": 21356 + }, + { + "epoch": 1.1722283205268935, + "grad_norm": 1.1600825786590576, + "learning_rate": 9.56792330961083e-06, + "loss": 0.1597, + "step": 21358 + }, + { + "epoch": 1.172338090010977, + "grad_norm": 1.095523715019226, + "learning_rate": 9.563804281151164e-06, + "loss": 0.184, + "step": 21360 + }, + { + "epoch": 1.1724478594950605, + "grad_norm": 0.9314603209495544, + "learning_rate": 9.559685929798812e-06, + "loss": 0.1932, + "step": 21362 + }, + { + "epoch": 1.1725576289791437, + "grad_norm": 0.7628123760223389, + "learning_rate": 9.555568255734418e-06, + "loss": 0.1156, + "step": 21364 + }, + { + "epoch": 1.1726673984632272, + "grad_norm": 1.5480881929397583, + "learning_rate": 9.551451259138596e-06, + "loss": 0.2115, + "step": 21366 + }, + { + "epoch": 1.1727771679473107, + "grad_norm": 1.1045112609863281, + "learning_rate": 9.547334940191957e-06, + "loss": 0.1831, + "step": 21368 + }, + { + "epoch": 1.1728869374313942, + "grad_norm": 0.8921491503715515, + "learning_rate": 9.543219299075057e-06, + "loss": 0.1954, + "step": 21370 + }, + { + "epoch": 1.1729967069154774, + "grad_norm": 1.7802964448928833, + "learning_rate": 9.539104335968413e-06, + "loss": 0.1921, + "step": 21372 + }, + { + "epoch": 1.173106476399561, + "grad_norm": 1.7009422779083252, + "learning_rate": 9.534990051052548e-06, + "loss": 0.1404, + "step": 21374 + }, + { + "epoch": 1.1732162458836444, + "grad_norm": 1.376611590385437, + "learning_rate": 9.530876444507927e-06, + "loss": 0.2454, + "step": 21376 + }, + { + "epoch": 1.1733260153677278, + "grad_norm": 1.8066216707229614, + "learning_rate": 9.526763516514991e-06, + "loss": 0.2751, + "step": 21378 + }, + { + "epoch": 1.173435784851811, + "grad_norm": 1.1851615905761719, + "learning_rate": 9.522651267254149e-06, + "loss": 0.2154, + "step": 21380 + }, + { + "epoch": 1.1735455543358946, + "grad_norm": 1.042817234992981, + "learning_rate": 9.518539696905795e-06, + "loss": 0.1164, + "step": 21382 + }, + { + "epoch": 1.173655323819978, + "grad_norm": 1.4051694869995117, + "learning_rate": 9.514428805650277e-06, + "loss": 0.3031, + "step": 21384 + }, + { + "epoch": 1.1737650933040615, + "grad_norm": 1.1280945539474487, + "learning_rate": 9.510318593667922e-06, + "loss": 0.1726, + "step": 21386 + }, + { + "epoch": 1.173874862788145, + "grad_norm": 1.193759799003601, + "learning_rate": 9.506209061139021e-06, + "loss": 0.2037, + "step": 21388 + }, + { + "epoch": 1.1739846322722283, + "grad_norm": 1.0347174406051636, + "learning_rate": 9.502100208243828e-06, + "loss": 0.1523, + "step": 21390 + }, + { + "epoch": 1.1740944017563117, + "grad_norm": 2.827500820159912, + "learning_rate": 9.497992035162599e-06, + "loss": 0.2356, + "step": 21392 + }, + { + "epoch": 1.1742041712403952, + "grad_norm": 0.8255742192268372, + "learning_rate": 9.493884542075526e-06, + "loss": 0.1615, + "step": 21394 + }, + { + "epoch": 1.1743139407244785, + "grad_norm": 0.8738126754760742, + "learning_rate": 9.489777729162786e-06, + "loss": 0.2005, + "step": 21396 + }, + { + "epoch": 1.174423710208562, + "grad_norm": 0.830945611000061, + "learning_rate": 9.485671596604523e-06, + "loss": 0.2276, + "step": 21398 + }, + { + "epoch": 1.1745334796926454, + "grad_norm": 1.9145796298980713, + "learning_rate": 9.481566144580853e-06, + "loss": 0.2199, + "step": 21400 + }, + { + "epoch": 1.174643249176729, + "grad_norm": 1.5562230348587036, + "learning_rate": 9.477461373271852e-06, + "loss": 0.1655, + "step": 21402 + }, + { + "epoch": 1.1747530186608124, + "grad_norm": 1.944451928138733, + "learning_rate": 9.473357282857583e-06, + "loss": 0.2242, + "step": 21404 + }, + { + "epoch": 1.1748627881448956, + "grad_norm": 1.0384032726287842, + "learning_rate": 9.469253873518083e-06, + "loss": 0.157, + "step": 21406 + }, + { + "epoch": 1.1749725576289791, + "grad_norm": 0.7837209105491638, + "learning_rate": 9.465151145433338e-06, + "loss": 0.1537, + "step": 21408 + }, + { + "epoch": 1.1750823271130626, + "grad_norm": 1.3692678213119507, + "learning_rate": 9.461049098783312e-06, + "loss": 0.2957, + "step": 21410 + }, + { + "epoch": 1.175192096597146, + "grad_norm": 1.0975884199142456, + "learning_rate": 9.456947733747943e-06, + "loss": 0.1321, + "step": 21412 + }, + { + "epoch": 1.1753018660812293, + "grad_norm": 0.8476656079292297, + "learning_rate": 9.452847050507135e-06, + "loss": 0.2217, + "step": 21414 + }, + { + "epoch": 1.1754116355653128, + "grad_norm": 0.9661343097686768, + "learning_rate": 9.448747049240758e-06, + "loss": 0.1257, + "step": 21416 + }, + { + "epoch": 1.1755214050493963, + "grad_norm": 1.0596935749053955, + "learning_rate": 9.444647730128673e-06, + "loss": 0.2164, + "step": 21418 + }, + { + "epoch": 1.1756311745334798, + "grad_norm": 0.9556555151939392, + "learning_rate": 9.44054909335069e-06, + "loss": 0.195, + "step": 21420 + }, + { + "epoch": 1.175740944017563, + "grad_norm": 1.180624008178711, + "learning_rate": 9.436451139086594e-06, + "loss": 0.1525, + "step": 21422 + }, + { + "epoch": 1.1758507135016465, + "grad_norm": 1.0899131298065186, + "learning_rate": 9.432353867516138e-06, + "loss": 0.1532, + "step": 21424 + }, + { + "epoch": 1.17596048298573, + "grad_norm": 1.1077017784118652, + "learning_rate": 9.428257278819046e-06, + "loss": 0.2351, + "step": 21426 + }, + { + "epoch": 1.1760702524698134, + "grad_norm": 1.09752357006073, + "learning_rate": 9.42416137317503e-06, + "loss": 0.1813, + "step": 21428 + }, + { + "epoch": 1.176180021953897, + "grad_norm": 1.4507485628128052, + "learning_rate": 9.420066150763748e-06, + "loss": 0.2544, + "step": 21430 + }, + { + "epoch": 1.1762897914379802, + "grad_norm": 1.8093972206115723, + "learning_rate": 9.415971611764823e-06, + "loss": 0.2091, + "step": 21432 + }, + { + "epoch": 1.1763995609220637, + "grad_norm": 0.6487160325050354, + "learning_rate": 9.411877756357885e-06, + "loss": 0.1021, + "step": 21434 + }, + { + "epoch": 1.1765093304061471, + "grad_norm": 1.3714451789855957, + "learning_rate": 9.407784584722498e-06, + "loss": 0.1992, + "step": 21436 + }, + { + "epoch": 1.1766190998902304, + "grad_norm": 1.0872676372528076, + "learning_rate": 9.403692097038213e-06, + "loss": 0.2449, + "step": 21438 + }, + { + "epoch": 1.1767288693743139, + "grad_norm": 0.9194514751434326, + "learning_rate": 9.399600293484533e-06, + "loss": 0.1479, + "step": 21440 + }, + { + "epoch": 1.1768386388583973, + "grad_norm": 1.6107252836227417, + "learning_rate": 9.395509174240965e-06, + "loss": 0.1207, + "step": 21442 + }, + { + "epoch": 1.1769484083424808, + "grad_norm": 3.0252506732940674, + "learning_rate": 9.391418739486955e-06, + "loss": 0.1792, + "step": 21444 + }, + { + "epoch": 1.1770581778265643, + "grad_norm": 1.5188179016113281, + "learning_rate": 9.38732898940193e-06, + "loss": 0.1771, + "step": 21446 + }, + { + "epoch": 1.1771679473106476, + "grad_norm": 0.9126371741294861, + "learning_rate": 9.383239924165287e-06, + "loss": 0.1303, + "step": 21448 + }, + { + "epoch": 1.177277716794731, + "grad_norm": 1.3147281408309937, + "learning_rate": 9.379151543956396e-06, + "loss": 0.1622, + "step": 21450 + }, + { + "epoch": 1.1773874862788145, + "grad_norm": 1.1197402477264404, + "learning_rate": 9.375063848954577e-06, + "loss": 0.2412, + "step": 21452 + }, + { + "epoch": 1.177497255762898, + "grad_norm": 1.0376958847045898, + "learning_rate": 9.37097683933916e-06, + "loss": 0.0938, + "step": 21454 + }, + { + "epoch": 1.1776070252469812, + "grad_norm": 0.6772419810295105, + "learning_rate": 9.366890515289411e-06, + "loss": 0.1148, + "step": 21456 + }, + { + "epoch": 1.1777167947310647, + "grad_norm": 1.1183693408966064, + "learning_rate": 9.362804876984573e-06, + "loss": 0.1203, + "step": 21458 + }, + { + "epoch": 1.1778265642151482, + "grad_norm": 1.4069547653198242, + "learning_rate": 9.35871992460387e-06, + "loss": 0.2189, + "step": 21460 + }, + { + "epoch": 1.1779363336992317, + "grad_norm": 1.0007508993148804, + "learning_rate": 9.35463565832647e-06, + "loss": 0.1925, + "step": 21462 + }, + { + "epoch": 1.178046103183315, + "grad_norm": 2.086418390274048, + "learning_rate": 9.350552078331554e-06, + "loss": 0.4328, + "step": 21464 + }, + { + "epoch": 1.1781558726673984, + "grad_norm": 1.0365427732467651, + "learning_rate": 9.346469184798223e-06, + "loss": 0.1834, + "step": 21466 + }, + { + "epoch": 1.1782656421514819, + "grad_norm": 1.1443570852279663, + "learning_rate": 9.342386977905598e-06, + "loss": 0.2052, + "step": 21468 + }, + { + "epoch": 1.1783754116355654, + "grad_norm": 1.014005422592163, + "learning_rate": 9.33830545783273e-06, + "loss": 0.1827, + "step": 21470 + }, + { + "epoch": 1.1784851811196488, + "grad_norm": 1.3885174989700317, + "learning_rate": 9.334224624758658e-06, + "loss": 0.2179, + "step": 21472 + }, + { + "epoch": 1.178594950603732, + "grad_norm": 1.0634925365447998, + "learning_rate": 9.330144478862385e-06, + "loss": 0.1672, + "step": 21474 + }, + { + "epoch": 1.1787047200878156, + "grad_norm": 1.225936770439148, + "learning_rate": 9.32606502032288e-06, + "loss": 0.2373, + "step": 21476 + }, + { + "epoch": 1.178814489571899, + "grad_norm": 1.6319795846939087, + "learning_rate": 9.321986249319103e-06, + "loss": 0.2677, + "step": 21478 + }, + { + "epoch": 1.1789242590559825, + "grad_norm": 1.867339849472046, + "learning_rate": 9.317908166029962e-06, + "loss": 0.203, + "step": 21480 + }, + { + "epoch": 1.1790340285400658, + "grad_norm": 1.1944533586502075, + "learning_rate": 9.31383077063434e-06, + "loss": 0.235, + "step": 21482 + }, + { + "epoch": 1.1791437980241493, + "grad_norm": 1.283094048500061, + "learning_rate": 9.309754063311094e-06, + "loss": 0.185, + "step": 21484 + }, + { + "epoch": 1.1792535675082327, + "grad_norm": 1.0020968914031982, + "learning_rate": 9.305678044239046e-06, + "loss": 0.1719, + "step": 21486 + }, + { + "epoch": 1.1793633369923162, + "grad_norm": 1.4882285594940186, + "learning_rate": 9.301602713596982e-06, + "loss": 0.1666, + "step": 21488 + }, + { + "epoch": 1.1794731064763995, + "grad_norm": 1.127349615097046, + "learning_rate": 9.297528071563685e-06, + "loss": 0.234, + "step": 21490 + }, + { + "epoch": 1.179582875960483, + "grad_norm": 1.8157660961151123, + "learning_rate": 9.293454118317871e-06, + "loss": 0.2391, + "step": 21492 + }, + { + "epoch": 1.1796926454445664, + "grad_norm": 1.1891361474990845, + "learning_rate": 9.28938085403826e-06, + "loss": 0.1653, + "step": 21494 + }, + { + "epoch": 1.17980241492865, + "grad_norm": 1.036377191543579, + "learning_rate": 9.285308278903519e-06, + "loss": 0.1841, + "step": 21496 + }, + { + "epoch": 1.1799121844127334, + "grad_norm": 1.6073458194732666, + "learning_rate": 9.281236393092285e-06, + "loss": 0.2137, + "step": 21498 + }, + { + "epoch": 1.1800219538968166, + "grad_norm": 1.4774565696716309, + "learning_rate": 9.277165196783178e-06, + "loss": 0.1893, + "step": 21500 + }, + { + "epoch": 1.1801317233809, + "grad_norm": 1.2512239217758179, + "learning_rate": 9.273094690154766e-06, + "loss": 0.2481, + "step": 21502 + }, + { + "epoch": 1.1802414928649836, + "grad_norm": 1.344368815422058, + "learning_rate": 9.269024873385624e-06, + "loss": 0.1601, + "step": 21504 + }, + { + "epoch": 1.1803512623490668, + "grad_norm": 0.8647432923316956, + "learning_rate": 9.264955746654263e-06, + "loss": 0.1871, + "step": 21506 + }, + { + "epoch": 1.1804610318331503, + "grad_norm": 1.6566996574401855, + "learning_rate": 9.260887310139175e-06, + "loss": 0.1639, + "step": 21508 + }, + { + "epoch": 1.1805708013172338, + "grad_norm": 1.14015531539917, + "learning_rate": 9.25681956401882e-06, + "loss": 0.1669, + "step": 21510 + }, + { + "epoch": 1.1806805708013173, + "grad_norm": 1.7307237386703491, + "learning_rate": 9.252752508471626e-06, + "loss": 0.248, + "step": 21512 + }, + { + "epoch": 1.1807903402854008, + "grad_norm": 1.122761845588684, + "learning_rate": 9.248686143676005e-06, + "loss": 0.1498, + "step": 21514 + }, + { + "epoch": 1.180900109769484, + "grad_norm": 1.1739401817321777, + "learning_rate": 9.244620469810322e-06, + "loss": 0.1701, + "step": 21516 + }, + { + "epoch": 1.1810098792535675, + "grad_norm": 1.2037895917892456, + "learning_rate": 9.240555487052918e-06, + "loss": 0.3583, + "step": 21518 + }, + { + "epoch": 1.181119648737651, + "grad_norm": 1.2348880767822266, + "learning_rate": 9.2364911955821e-06, + "loss": 0.2324, + "step": 21520 + }, + { + "epoch": 1.1812294182217344, + "grad_norm": 1.112457513809204, + "learning_rate": 9.232427595576154e-06, + "loss": 0.184, + "step": 21522 + }, + { + "epoch": 1.1813391877058177, + "grad_norm": 0.9853076338768005, + "learning_rate": 9.228364687213315e-06, + "loss": 0.1663, + "step": 21524 + }, + { + "epoch": 1.1814489571899012, + "grad_norm": 1.1044832468032837, + "learning_rate": 9.22430247067181e-06, + "loss": 0.2281, + "step": 21526 + }, + { + "epoch": 1.1815587266739846, + "grad_norm": 1.086527705192566, + "learning_rate": 9.220240946129844e-06, + "loss": 0.1617, + "step": 21528 + }, + { + "epoch": 1.1816684961580681, + "grad_norm": 1.7916007041931152, + "learning_rate": 9.216180113765558e-06, + "loss": 0.214, + "step": 21530 + }, + { + "epoch": 1.1817782656421514, + "grad_norm": 1.2656031847000122, + "learning_rate": 9.212119973757082e-06, + "loss": 0.3029, + "step": 21532 + }, + { + "epoch": 1.1818880351262349, + "grad_norm": 1.2766848802566528, + "learning_rate": 9.208060526282519e-06, + "loss": 0.1575, + "step": 21534 + }, + { + "epoch": 1.1819978046103183, + "grad_norm": 1.2859292030334473, + "learning_rate": 9.204001771519933e-06, + "loss": 0.1729, + "step": 21536 + }, + { + "epoch": 1.1821075740944018, + "grad_norm": 1.0352764129638672, + "learning_rate": 9.199943709647348e-06, + "loss": 0.2807, + "step": 21538 + }, + { + "epoch": 1.1822173435784853, + "grad_norm": 1.5958775281906128, + "learning_rate": 9.195886340842796e-06, + "loss": 0.1849, + "step": 21540 + }, + { + "epoch": 1.1823271130625685, + "grad_norm": 1.1627233028411865, + "learning_rate": 9.19182966528424e-06, + "loss": 0.196, + "step": 21542 + }, + { + "epoch": 1.182436882546652, + "grad_norm": 1.5686070919036865, + "learning_rate": 9.187773683149626e-06, + "loss": 0.1866, + "step": 21544 + }, + { + "epoch": 1.1825466520307355, + "grad_norm": 0.9873930811882019, + "learning_rate": 9.183718394616867e-06, + "loss": 0.2246, + "step": 21546 + }, + { + "epoch": 1.1826564215148188, + "grad_norm": 1.1254262924194336, + "learning_rate": 9.179663799863849e-06, + "loss": 0.1826, + "step": 21548 + }, + { + "epoch": 1.1827661909989022, + "grad_norm": 0.8140988349914551, + "learning_rate": 9.175609899068421e-06, + "loss": 0.2361, + "step": 21550 + }, + { + "epoch": 1.1828759604829857, + "grad_norm": 1.146963357925415, + "learning_rate": 9.171556692408423e-06, + "loss": 0.2237, + "step": 21552 + }, + { + "epoch": 1.1829857299670692, + "grad_norm": 0.9245880246162415, + "learning_rate": 9.167504180061628e-06, + "loss": 0.1843, + "step": 21554 + }, + { + "epoch": 1.1830954994511527, + "grad_norm": 1.4639554023742676, + "learning_rate": 9.163452362205822e-06, + "loss": 0.2146, + "step": 21556 + }, + { + "epoch": 1.183205268935236, + "grad_norm": 1.3560802936553955, + "learning_rate": 9.159401239018722e-06, + "loss": 0.2029, + "step": 21558 + }, + { + "epoch": 1.1833150384193194, + "grad_norm": 1.834682822227478, + "learning_rate": 9.155350810678038e-06, + "loss": 0.2063, + "step": 21560 + }, + { + "epoch": 1.1834248079034029, + "grad_norm": 0.7744579911231995, + "learning_rate": 9.151301077361426e-06, + "loss": 0.1147, + "step": 21562 + }, + { + "epoch": 1.1835345773874864, + "grad_norm": 1.177337646484375, + "learning_rate": 9.147252039246548e-06, + "loss": 0.2038, + "step": 21564 + }, + { + "epoch": 1.1836443468715696, + "grad_norm": 1.3049402236938477, + "learning_rate": 9.143203696511004e-06, + "loss": 0.1668, + "step": 21566 + }, + { + "epoch": 1.183754116355653, + "grad_norm": 1.0798721313476562, + "learning_rate": 9.139156049332378e-06, + "loss": 0.1685, + "step": 21568 + }, + { + "epoch": 1.1838638858397366, + "grad_norm": 2.5247881412506104, + "learning_rate": 9.135109097888218e-06, + "loss": 0.2161, + "step": 21570 + }, + { + "epoch": 1.18397365532382, + "grad_norm": 0.9162827730178833, + "learning_rate": 9.131062842356042e-06, + "loss": 0.2345, + "step": 21572 + }, + { + "epoch": 1.1840834248079033, + "grad_norm": 1.2087146043777466, + "learning_rate": 9.127017282913328e-06, + "loss": 0.2081, + "step": 21574 + }, + { + "epoch": 1.1841931942919868, + "grad_norm": 1.327628254890442, + "learning_rate": 9.122972419737555e-06, + "loss": 0.2345, + "step": 21576 + }, + { + "epoch": 1.1843029637760702, + "grad_norm": 1.5481700897216797, + "learning_rate": 9.11892825300614e-06, + "loss": 0.3273, + "step": 21578 + }, + { + "epoch": 1.1844127332601537, + "grad_norm": 1.6267237663269043, + "learning_rate": 9.114884782896483e-06, + "loss": 0.236, + "step": 21580 + }, + { + "epoch": 1.1845225027442372, + "grad_norm": 1.3420705795288086, + "learning_rate": 9.110842009585946e-06, + "loss": 0.2045, + "step": 21582 + }, + { + "epoch": 1.1846322722283205, + "grad_norm": 1.2902114391326904, + "learning_rate": 9.106799933251858e-06, + "loss": 0.1816, + "step": 21584 + }, + { + "epoch": 1.184742041712404, + "grad_norm": 0.995169460773468, + "learning_rate": 9.102758554071544e-06, + "loss": 0.1676, + "step": 21586 + }, + { + "epoch": 1.1848518111964874, + "grad_norm": 1.0803041458129883, + "learning_rate": 9.09871787222226e-06, + "loss": 0.1657, + "step": 21588 + }, + { + "epoch": 1.184961580680571, + "grad_norm": 1.3970612287521362, + "learning_rate": 9.094677887881265e-06, + "loss": 0.1804, + "step": 21590 + }, + { + "epoch": 1.1850713501646541, + "grad_norm": 1.6296296119689941, + "learning_rate": 9.090638601225764e-06, + "loss": 0.1437, + "step": 21592 + }, + { + "epoch": 1.1851811196487376, + "grad_norm": 1.354107141494751, + "learning_rate": 9.086600012432945e-06, + "loss": 0.2113, + "step": 21594 + }, + { + "epoch": 1.185290889132821, + "grad_norm": 1.0022574663162231, + "learning_rate": 9.082562121679955e-06, + "loss": 0.1941, + "step": 21596 + }, + { + "epoch": 1.1854006586169046, + "grad_norm": 1.789002776145935, + "learning_rate": 9.07852492914392e-06, + "loss": 0.2607, + "step": 21598 + }, + { + "epoch": 1.1855104281009878, + "grad_norm": 1.4323490858078003, + "learning_rate": 9.074488435001918e-06, + "loss": 0.2897, + "step": 21600 + }, + { + "epoch": 1.1856201975850713, + "grad_norm": 1.0708199739456177, + "learning_rate": 9.07045263943103e-06, + "loss": 0.2209, + "step": 21602 + }, + { + "epoch": 1.1857299670691548, + "grad_norm": 1.1931827068328857, + "learning_rate": 9.066417542608274e-06, + "loss": 0.1794, + "step": 21604 + }, + { + "epoch": 1.1858397365532383, + "grad_norm": 1.0696771144866943, + "learning_rate": 9.062383144710654e-06, + "loss": 0.1802, + "step": 21606 + }, + { + "epoch": 1.1859495060373217, + "grad_norm": 1.216888189315796, + "learning_rate": 9.058349445915135e-06, + "loss": 0.1961, + "step": 21608 + }, + { + "epoch": 1.186059275521405, + "grad_norm": 1.3277332782745361, + "learning_rate": 9.054316446398648e-06, + "loss": 0.203, + "step": 21610 + }, + { + "epoch": 1.1861690450054885, + "grad_norm": 1.3453872203826904, + "learning_rate": 9.050284146338115e-06, + "loss": 0.1558, + "step": 21612 + }, + { + "epoch": 1.186278814489572, + "grad_norm": 1.4631375074386597, + "learning_rate": 9.046252545910394e-06, + "loss": 0.218, + "step": 21614 + }, + { + "epoch": 1.1863885839736552, + "grad_norm": 1.2763919830322266, + "learning_rate": 9.042221645292354e-06, + "loss": 0.2047, + "step": 21616 + }, + { + "epoch": 1.1864983534577387, + "grad_norm": 1.2363234758377075, + "learning_rate": 9.038191444660796e-06, + "loss": 0.2041, + "step": 21618 + }, + { + "epoch": 1.1866081229418222, + "grad_norm": 1.9084590673446655, + "learning_rate": 9.034161944192508e-06, + "loss": 0.3131, + "step": 21620 + }, + { + "epoch": 1.1867178924259056, + "grad_norm": 1.0926107168197632, + "learning_rate": 9.03013314406424e-06, + "loss": 0.194, + "step": 21622 + }, + { + "epoch": 1.1868276619099891, + "grad_norm": 1.304701328277588, + "learning_rate": 9.02610504445271e-06, + "loss": 0.1726, + "step": 21624 + }, + { + "epoch": 1.1869374313940724, + "grad_norm": 0.9192869663238525, + "learning_rate": 9.022077645534627e-06, + "loss": 0.1425, + "step": 21626 + }, + { + "epoch": 1.1870472008781559, + "grad_norm": 2.327256679534912, + "learning_rate": 9.018050947486643e-06, + "loss": 0.2363, + "step": 21628 + }, + { + "epoch": 1.1871569703622393, + "grad_norm": 0.9090166687965393, + "learning_rate": 9.014024950485383e-06, + "loss": 0.1747, + "step": 21630 + }, + { + "epoch": 1.1872667398463228, + "grad_norm": 0.8343676328659058, + "learning_rate": 9.009999654707458e-06, + "loss": 0.1248, + "step": 21632 + }, + { + "epoch": 1.187376509330406, + "grad_norm": 0.9857807755470276, + "learning_rate": 9.00597506032943e-06, + "loss": 0.2096, + "step": 21634 + }, + { + "epoch": 1.1874862788144895, + "grad_norm": 1.1522738933563232, + "learning_rate": 9.00195116752783e-06, + "loss": 0.158, + "step": 21636 + }, + { + "epoch": 1.187596048298573, + "grad_norm": 1.1542292833328247, + "learning_rate": 8.997927976479185e-06, + "loss": 0.1426, + "step": 21638 + }, + { + "epoch": 1.1877058177826565, + "grad_norm": 1.4884346723556519, + "learning_rate": 8.99390548735996e-06, + "loss": 0.1646, + "step": 21640 + }, + { + "epoch": 1.1878155872667397, + "grad_norm": 1.0461558103561401, + "learning_rate": 8.989883700346604e-06, + "loss": 0.1811, + "step": 21642 + }, + { + "epoch": 1.1879253567508232, + "grad_norm": 1.1453019380569458, + "learning_rate": 8.985862615615522e-06, + "loss": 0.2607, + "step": 21644 + }, + { + "epoch": 1.1880351262349067, + "grad_norm": 1.0591260194778442, + "learning_rate": 8.981842233343116e-06, + "loss": 0.2271, + "step": 21646 + }, + { + "epoch": 1.1881448957189902, + "grad_norm": 1.2068780660629272, + "learning_rate": 8.977822553705733e-06, + "loss": 0.2247, + "step": 21648 + }, + { + "epoch": 1.1882546652030737, + "grad_norm": 0.7099742293357849, + "learning_rate": 8.973803576879683e-06, + "loss": 0.2509, + "step": 21650 + }, + { + "epoch": 1.188364434687157, + "grad_norm": 1.0539127588272095, + "learning_rate": 8.96978530304128e-06, + "loss": 0.2371, + "step": 21652 + }, + { + "epoch": 1.1884742041712404, + "grad_norm": 1.2445940971374512, + "learning_rate": 8.965767732366776e-06, + "loss": 0.1621, + "step": 21654 + }, + { + "epoch": 1.1885839736553239, + "grad_norm": 1.3376693725585938, + "learning_rate": 8.961750865032395e-06, + "loss": 0.1298, + "step": 21656 + }, + { + "epoch": 1.1886937431394071, + "grad_norm": 2.7795228958129883, + "learning_rate": 8.957734701214345e-06, + "loss": 0.2436, + "step": 21658 + }, + { + "epoch": 1.1888035126234906, + "grad_norm": 3.3253142833709717, + "learning_rate": 8.953719241088781e-06, + "loss": 0.2044, + "step": 21660 + }, + { + "epoch": 1.188913282107574, + "grad_norm": 2.25099778175354, + "learning_rate": 8.949704484831862e-06, + "loss": 0.2783, + "step": 21662 + }, + { + "epoch": 1.1890230515916576, + "grad_norm": 1.4404644966125488, + "learning_rate": 8.945690432619683e-06, + "loss": 0.2043, + "step": 21664 + }, + { + "epoch": 1.189132821075741, + "grad_norm": 0.8120037317276001, + "learning_rate": 8.941677084628319e-06, + "loss": 0.1531, + "step": 21666 + }, + { + "epoch": 1.1892425905598243, + "grad_norm": 1.430140733718872, + "learning_rate": 8.937664441033817e-06, + "loss": 0.189, + "step": 21668 + }, + { + "epoch": 1.1893523600439078, + "grad_norm": 0.7554320096969604, + "learning_rate": 8.93365250201219e-06, + "loss": 0.2003, + "step": 21670 + }, + { + "epoch": 1.1894621295279912, + "grad_norm": 1.4472010135650635, + "learning_rate": 8.929641267739419e-06, + "loss": 0.1849, + "step": 21672 + }, + { + "epoch": 1.1895718990120747, + "grad_norm": 1.5497850179672241, + "learning_rate": 8.925630738391455e-06, + "loss": 0.2251, + "step": 21674 + }, + { + "epoch": 1.189681668496158, + "grad_norm": 0.8230393528938293, + "learning_rate": 8.921620914144236e-06, + "loss": 0.138, + "step": 21676 + }, + { + "epoch": 1.1897914379802415, + "grad_norm": 2.014617681503296, + "learning_rate": 8.917611795173639e-06, + "loss": 0.1682, + "step": 21678 + }, + { + "epoch": 1.189901207464325, + "grad_norm": 1.833726167678833, + "learning_rate": 8.913603381655528e-06, + "loss": 0.1604, + "step": 21680 + }, + { + "epoch": 1.1900109769484084, + "grad_norm": 1.3274638652801514, + "learning_rate": 8.909595673765727e-06, + "loss": 0.2014, + "step": 21682 + }, + { + "epoch": 1.1901207464324917, + "grad_norm": 0.9675564765930176, + "learning_rate": 8.905588671680038e-06, + "loss": 0.2398, + "step": 21684 + }, + { + "epoch": 1.1902305159165751, + "grad_norm": 1.78130042552948, + "learning_rate": 8.901582375574216e-06, + "loss": 0.2351, + "step": 21686 + }, + { + "epoch": 1.1903402854006586, + "grad_norm": 1.1504240036010742, + "learning_rate": 8.897576785624014e-06, + "loss": 0.267, + "step": 21688 + }, + { + "epoch": 1.190450054884742, + "grad_norm": 0.9777896404266357, + "learning_rate": 8.893571902005133e-06, + "loss": 0.1876, + "step": 21690 + }, + { + "epoch": 1.1905598243688256, + "grad_norm": 1.160194754600525, + "learning_rate": 8.889567724893241e-06, + "loss": 0.1752, + "step": 21692 + }, + { + "epoch": 1.1906695938529088, + "grad_norm": 1.0241316556930542, + "learning_rate": 8.885564254463985e-06, + "loss": 0.1462, + "step": 21694 + }, + { + "epoch": 1.1907793633369923, + "grad_norm": 0.9759255647659302, + "learning_rate": 8.881561490892964e-06, + "loss": 0.1479, + "step": 21696 + }, + { + "epoch": 1.1908891328210758, + "grad_norm": 1.297611117362976, + "learning_rate": 8.87755943435578e-06, + "loss": 0.289, + "step": 21698 + }, + { + "epoch": 1.1909989023051593, + "grad_norm": 0.9902594685554504, + "learning_rate": 8.873558085027975e-06, + "loss": 0.2844, + "step": 21700 + }, + { + "epoch": 1.1911086717892425, + "grad_norm": 1.0751450061798096, + "learning_rate": 8.86955744308506e-06, + "loss": 0.2049, + "step": 21702 + }, + { + "epoch": 1.191218441273326, + "grad_norm": 0.807340145111084, + "learning_rate": 8.865557508702527e-06, + "loss": 0.1938, + "step": 21704 + }, + { + "epoch": 1.1913282107574095, + "grad_norm": 1.4927301406860352, + "learning_rate": 8.861558282055837e-06, + "loss": 0.1658, + "step": 21706 + }, + { + "epoch": 1.191437980241493, + "grad_norm": 1.2346006631851196, + "learning_rate": 8.857559763320414e-06, + "loss": 0.217, + "step": 21708 + }, + { + "epoch": 1.1915477497255762, + "grad_norm": 1.3909077644348145, + "learning_rate": 8.853561952671647e-06, + "loss": 0.1204, + "step": 21710 + }, + { + "epoch": 1.1916575192096597, + "grad_norm": 1.2820391654968262, + "learning_rate": 8.849564850284909e-06, + "loss": 0.1642, + "step": 21712 + }, + { + "epoch": 1.1917672886937432, + "grad_norm": 1.9138834476470947, + "learning_rate": 8.845568456335525e-06, + "loss": 0.1934, + "step": 21714 + }, + { + "epoch": 1.1918770581778266, + "grad_norm": 1.4238265752792358, + "learning_rate": 8.841572770998805e-06, + "loss": 0.1192, + "step": 21716 + }, + { + "epoch": 1.19198682766191, + "grad_norm": 0.8842300176620483, + "learning_rate": 8.837577794450011e-06, + "loss": 0.2017, + "step": 21718 + }, + { + "epoch": 1.1920965971459934, + "grad_norm": 1.2854647636413574, + "learning_rate": 8.833583526864383e-06, + "loss": 0.1562, + "step": 21720 + }, + { + "epoch": 1.1922063666300768, + "grad_norm": 0.9148006439208984, + "learning_rate": 8.829589968417123e-06, + "loss": 0.1642, + "step": 21722 + }, + { + "epoch": 1.1923161361141603, + "grad_norm": 0.7779569625854492, + "learning_rate": 8.825597119283427e-06, + "loss": 0.0997, + "step": 21724 + }, + { + "epoch": 1.1924259055982436, + "grad_norm": 1.0628381967544556, + "learning_rate": 8.821604979638428e-06, + "loss": 0.126, + "step": 21726 + }, + { + "epoch": 1.192535675082327, + "grad_norm": 1.2418732643127441, + "learning_rate": 8.817613549657244e-06, + "loss": 0.1416, + "step": 21728 + }, + { + "epoch": 1.1926454445664105, + "grad_norm": 1.2863520383834839, + "learning_rate": 8.813622829514956e-06, + "loss": 0.23, + "step": 21730 + }, + { + "epoch": 1.192755214050494, + "grad_norm": 1.011170506477356, + "learning_rate": 8.809632819386612e-06, + "loss": 0.2007, + "step": 21732 + }, + { + "epoch": 1.1928649835345775, + "grad_norm": 1.1886088848114014, + "learning_rate": 8.805643519447246e-06, + "loss": 0.1827, + "step": 21734 + }, + { + "epoch": 1.1929747530186607, + "grad_norm": 0.8857535123825073, + "learning_rate": 8.801654929871833e-06, + "loss": 0.169, + "step": 21736 + }, + { + "epoch": 1.1930845225027442, + "grad_norm": 1.1996556520462036, + "learning_rate": 8.797667050835352e-06, + "loss": 0.1554, + "step": 21738 + }, + { + "epoch": 1.1931942919868277, + "grad_norm": 1.1748217344284058, + "learning_rate": 8.793679882512717e-06, + "loss": 0.1799, + "step": 21740 + }, + { + "epoch": 1.1933040614709112, + "grad_norm": 2.1878929138183594, + "learning_rate": 8.78969342507883e-06, + "loss": 0.2211, + "step": 21742 + }, + { + "epoch": 1.1934138309549944, + "grad_norm": 1.6063700914382935, + "learning_rate": 8.785707678708551e-06, + "loss": 0.2916, + "step": 21744 + }, + { + "epoch": 1.193523600439078, + "grad_norm": 1.0848360061645508, + "learning_rate": 8.781722643576712e-06, + "loss": 0.2158, + "step": 21746 + }, + { + "epoch": 1.1936333699231614, + "grad_norm": 1.2022745609283447, + "learning_rate": 8.777738319858131e-06, + "loss": 0.2728, + "step": 21748 + }, + { + "epoch": 1.1937431394072449, + "grad_norm": 1.0966284275054932, + "learning_rate": 8.773754707727569e-06, + "loss": 0.1469, + "step": 21750 + }, + { + "epoch": 1.1938529088913281, + "grad_norm": 1.5022597312927246, + "learning_rate": 8.76977180735977e-06, + "loss": 0.1477, + "step": 21752 + }, + { + "epoch": 1.1939626783754116, + "grad_norm": 1.102828025817871, + "learning_rate": 8.765789618929441e-06, + "loss": 0.2348, + "step": 21754 + }, + { + "epoch": 1.194072447859495, + "grad_norm": 1.3505358695983887, + "learning_rate": 8.761808142611261e-06, + "loss": 0.2092, + "step": 21756 + }, + { + "epoch": 1.1941822173435785, + "grad_norm": 1.3196601867675781, + "learning_rate": 8.75782737857987e-06, + "loss": 0.2288, + "step": 21758 + }, + { + "epoch": 1.194291986827662, + "grad_norm": 0.997677743434906, + "learning_rate": 8.753847327009898e-06, + "loss": 0.1939, + "step": 21760 + }, + { + "epoch": 1.1944017563117453, + "grad_norm": 1.2920193672180176, + "learning_rate": 8.749867988075924e-06, + "loss": 0.1808, + "step": 21762 + }, + { + "epoch": 1.1945115257958288, + "grad_norm": 1.3136460781097412, + "learning_rate": 8.745889361952503e-06, + "loss": 0.2292, + "step": 21764 + }, + { + "epoch": 1.1946212952799122, + "grad_norm": 1.0829336643218994, + "learning_rate": 8.741911448814144e-06, + "loss": 0.1614, + "step": 21766 + }, + { + "epoch": 1.1947310647639955, + "grad_norm": 1.477675437927246, + "learning_rate": 8.737934248835359e-06, + "loss": 0.1848, + "step": 21768 + }, + { + "epoch": 1.194840834248079, + "grad_norm": 1.7300772666931152, + "learning_rate": 8.733957762190592e-06, + "loss": 0.3028, + "step": 21770 + }, + { + "epoch": 1.1949506037321624, + "grad_norm": 1.2421337366104126, + "learning_rate": 8.72998198905427e-06, + "loss": 0.1627, + "step": 21772 + }, + { + "epoch": 1.195060373216246, + "grad_norm": 1.575250267982483, + "learning_rate": 8.726006929600805e-06, + "loss": 0.2243, + "step": 21774 + }, + { + "epoch": 1.1951701427003294, + "grad_norm": 1.1981754302978516, + "learning_rate": 8.722032584004553e-06, + "loss": 0.2302, + "step": 21776 + }, + { + "epoch": 1.1952799121844127, + "grad_norm": 0.9347860813140869, + "learning_rate": 8.71805895243985e-06, + "loss": 0.1278, + "step": 21778 + }, + { + "epoch": 1.1953896816684961, + "grad_norm": 1.2746185064315796, + "learning_rate": 8.714086035080996e-06, + "loss": 0.1622, + "step": 21780 + }, + { + "epoch": 1.1954994511525796, + "grad_norm": 1.7640119791030884, + "learning_rate": 8.710113832102257e-06, + "loss": 0.3443, + "step": 21782 + }, + { + "epoch": 1.195609220636663, + "grad_norm": 1.4384195804595947, + "learning_rate": 8.70614234367789e-06, + "loss": 0.3015, + "step": 21784 + }, + { + "epoch": 1.1957189901207463, + "grad_norm": 1.4194837808609009, + "learning_rate": 8.702171569982092e-06, + "loss": 0.2896, + "step": 21786 + }, + { + "epoch": 1.1958287596048298, + "grad_norm": 1.133504867553711, + "learning_rate": 8.698201511189048e-06, + "loss": 0.2803, + "step": 21788 + }, + { + "epoch": 1.1959385290889133, + "grad_norm": 0.9581324458122253, + "learning_rate": 8.694232167472896e-06, + "loss": 0.1968, + "step": 21790 + }, + { + "epoch": 1.1960482985729968, + "grad_norm": 1.3073043823242188, + "learning_rate": 8.690263539007753e-06, + "loss": 0.249, + "step": 21792 + }, + { + "epoch": 1.19615806805708, + "grad_norm": 1.0776110887527466, + "learning_rate": 8.6862956259677e-06, + "loss": 0.2496, + "step": 21794 + }, + { + "epoch": 1.1962678375411635, + "grad_norm": 4.156874656677246, + "learning_rate": 8.68232842852679e-06, + "loss": 0.2561, + "step": 21796 + }, + { + "epoch": 1.196377607025247, + "grad_norm": 1.2375268936157227, + "learning_rate": 8.678361946859056e-06, + "loss": 0.2408, + "step": 21798 + }, + { + "epoch": 1.1964873765093305, + "grad_norm": 0.7755662798881531, + "learning_rate": 8.67439618113848e-06, + "loss": 0.1263, + "step": 21800 + }, + { + "epoch": 1.196597145993414, + "grad_norm": 1.1744569540023804, + "learning_rate": 8.670431131539017e-06, + "loss": 0.1622, + "step": 21802 + }, + { + "epoch": 1.1967069154774972, + "grad_norm": 1.2702306509017944, + "learning_rate": 8.666466798234596e-06, + "loss": 0.1995, + "step": 21804 + }, + { + "epoch": 1.1968166849615807, + "grad_norm": 1.2203043699264526, + "learning_rate": 8.66250318139911e-06, + "loss": 0.2081, + "step": 21806 + }, + { + "epoch": 1.1969264544456641, + "grad_norm": 1.179502248764038, + "learning_rate": 8.658540281206415e-06, + "loss": 0.1837, + "step": 21808 + }, + { + "epoch": 1.1970362239297476, + "grad_norm": 1.43866765499115, + "learning_rate": 8.65457809783036e-06, + "loss": 0.2853, + "step": 21810 + }, + { + "epoch": 1.1971459934138309, + "grad_norm": 1.1492358446121216, + "learning_rate": 8.650616631444738e-06, + "loss": 0.1561, + "step": 21812 + }, + { + "epoch": 1.1972557628979144, + "grad_norm": 1.504473090171814, + "learning_rate": 8.646655882223317e-06, + "loss": 0.1652, + "step": 21814 + }, + { + "epoch": 1.1973655323819978, + "grad_norm": 1.0884531736373901, + "learning_rate": 8.642695850339838e-06, + "loss": 0.2059, + "step": 21816 + }, + { + "epoch": 1.1974753018660813, + "grad_norm": 1.2922759056091309, + "learning_rate": 8.638736535967998e-06, + "loss": 0.2475, + "step": 21818 + }, + { + "epoch": 1.1975850713501646, + "grad_norm": 1.0807870626449585, + "learning_rate": 8.634777939281486e-06, + "loss": 0.1649, + "step": 21820 + }, + { + "epoch": 1.197694840834248, + "grad_norm": 1.1419838666915894, + "learning_rate": 8.630820060453938e-06, + "loss": 0.1277, + "step": 21822 + }, + { + "epoch": 1.1978046103183315, + "grad_norm": 1.2381740808486938, + "learning_rate": 8.626862899658967e-06, + "loss": 0.2405, + "step": 21824 + }, + { + "epoch": 1.197914379802415, + "grad_norm": 1.3593482971191406, + "learning_rate": 8.622906457070145e-06, + "loss": 0.175, + "step": 21826 + }, + { + "epoch": 1.1980241492864985, + "grad_norm": 2.5066657066345215, + "learning_rate": 8.618950732861039e-06, + "loss": 0.1846, + "step": 21828 + }, + { + "epoch": 1.1981339187705817, + "grad_norm": 0.8112918734550476, + "learning_rate": 8.614995727205156e-06, + "loss": 0.2323, + "step": 21830 + }, + { + "epoch": 1.1982436882546652, + "grad_norm": 1.0458881855010986, + "learning_rate": 8.611041440275974e-06, + "loss": 0.1494, + "step": 21832 + }, + { + "epoch": 1.1983534577387487, + "grad_norm": 1.6462427377700806, + "learning_rate": 8.607087872246964e-06, + "loss": 0.2996, + "step": 21834 + }, + { + "epoch": 1.198463227222832, + "grad_norm": 0.721185028553009, + "learning_rate": 8.603135023291542e-06, + "loss": 0.1066, + "step": 21836 + }, + { + "epoch": 1.1985729967069154, + "grad_norm": 1.4197113513946533, + "learning_rate": 8.5991828935831e-06, + "loss": 0.2751, + "step": 21838 + }, + { + "epoch": 1.198682766190999, + "grad_norm": 1.299497127532959, + "learning_rate": 8.595231483294994e-06, + "loss": 0.2425, + "step": 21840 + }, + { + "epoch": 1.1987925356750824, + "grad_norm": 1.12531316280365, + "learning_rate": 8.591280792600554e-06, + "loss": 0.2297, + "step": 21842 + }, + { + "epoch": 1.1989023051591658, + "grad_norm": 1.6553175449371338, + "learning_rate": 8.58733082167307e-06, + "loss": 0.1983, + "step": 21844 + }, + { + "epoch": 1.199012074643249, + "grad_norm": 1.1125798225402832, + "learning_rate": 8.583381570685822e-06, + "loss": 0.1934, + "step": 21846 + }, + { + "epoch": 1.1991218441273326, + "grad_norm": 0.9473298788070679, + "learning_rate": 8.579433039812037e-06, + "loss": 0.1477, + "step": 21848 + }, + { + "epoch": 1.199231613611416, + "grad_norm": 1.036441445350647, + "learning_rate": 8.575485229224913e-06, + "loss": 0.1471, + "step": 21850 + }, + { + "epoch": 1.1993413830954995, + "grad_norm": 1.3260780572891235, + "learning_rate": 8.571538139097626e-06, + "loss": 0.217, + "step": 21852 + }, + { + "epoch": 1.1994511525795828, + "grad_norm": 1.0920575857162476, + "learning_rate": 8.567591769603312e-06, + "loss": 0.1803, + "step": 21854 + }, + { + "epoch": 1.1995609220636663, + "grad_norm": 0.9921234250068665, + "learning_rate": 8.56364612091507e-06, + "loss": 0.1322, + "step": 21856 + }, + { + "epoch": 1.1996706915477497, + "grad_norm": 1.0421377420425415, + "learning_rate": 8.559701193205979e-06, + "loss": 0.2272, + "step": 21858 + }, + { + "epoch": 1.1997804610318332, + "grad_norm": 1.3754888772964478, + "learning_rate": 8.5557569866491e-06, + "loss": 0.1717, + "step": 21860 + }, + { + "epoch": 1.1998902305159165, + "grad_norm": 1.1846561431884766, + "learning_rate": 8.551813501417432e-06, + "loss": 0.3529, + "step": 21862 + }, + { + "epoch": 1.2, + "grad_norm": 0.9599493741989136, + "learning_rate": 8.547870737683955e-06, + "loss": 0.1956, + "step": 21864 + }, + { + "epoch": 1.2001097694840834, + "grad_norm": 1.5845088958740234, + "learning_rate": 8.543928695621623e-06, + "loss": 0.2174, + "step": 21866 + }, + { + "epoch": 1.200219538968167, + "grad_norm": 1.4528452157974243, + "learning_rate": 8.539987375403336e-06, + "loss": 0.2209, + "step": 21868 + }, + { + "epoch": 1.2003293084522504, + "grad_norm": 1.0845441818237305, + "learning_rate": 8.536046777202003e-06, + "loss": 0.2054, + "step": 21870 + }, + { + "epoch": 1.2004390779363336, + "grad_norm": 1.1918920278549194, + "learning_rate": 8.532106901190471e-06, + "loss": 0.1512, + "step": 21872 + }, + { + "epoch": 1.2005488474204171, + "grad_norm": 1.4256116151809692, + "learning_rate": 8.528167747541557e-06, + "loss": 0.2004, + "step": 21874 + }, + { + "epoch": 1.2006586169045006, + "grad_norm": 1.306269645690918, + "learning_rate": 8.524229316428053e-06, + "loss": 0.1546, + "step": 21876 + }, + { + "epoch": 1.2007683863885839, + "grad_norm": 1.2248835563659668, + "learning_rate": 8.520291608022724e-06, + "loss": 0.1618, + "step": 21878 + }, + { + "epoch": 1.2008781558726673, + "grad_norm": 0.9571784138679504, + "learning_rate": 8.51635462249828e-06, + "loss": 0.2063, + "step": 21880 + }, + { + "epoch": 1.2009879253567508, + "grad_norm": 1.0409471988677979, + "learning_rate": 8.512418360027435e-06, + "loss": 0.1176, + "step": 21882 + }, + { + "epoch": 1.2010976948408343, + "grad_norm": 1.5231738090515137, + "learning_rate": 8.508482820782851e-06, + "loss": 0.1892, + "step": 21884 + }, + { + "epoch": 1.2012074643249178, + "grad_norm": 1.419978141784668, + "learning_rate": 8.504548004937146e-06, + "loss": 0.2158, + "step": 21886 + }, + { + "epoch": 1.201317233809001, + "grad_norm": 1.7707877159118652, + "learning_rate": 8.500613912662939e-06, + "loss": 0.2167, + "step": 21888 + }, + { + "epoch": 1.2014270032930845, + "grad_norm": 1.7153817415237427, + "learning_rate": 8.496680544132788e-06, + "loss": 0.2285, + "step": 21890 + }, + { + "epoch": 1.201536772777168, + "grad_norm": 1.2374581098556519, + "learning_rate": 8.49274789951923e-06, + "loss": 0.2134, + "step": 21892 + }, + { + "epoch": 1.2016465422612514, + "grad_norm": 1.2152448892593384, + "learning_rate": 8.488815978994767e-06, + "loss": 0.2506, + "step": 21894 + }, + { + "epoch": 1.2017563117453347, + "grad_norm": 1.3391697406768799, + "learning_rate": 8.484884782731881e-06, + "loss": 0.1716, + "step": 21896 + }, + { + "epoch": 1.2018660812294182, + "grad_norm": 1.0295867919921875, + "learning_rate": 8.480954310903011e-06, + "loss": 0.139, + "step": 21898 + }, + { + "epoch": 1.2019758507135017, + "grad_norm": 1.1190587282180786, + "learning_rate": 8.477024563680566e-06, + "loss": 0.1639, + "step": 21900 + }, + { + "epoch": 1.2020856201975851, + "grad_norm": 1.3988004922866821, + "learning_rate": 8.473095541236923e-06, + "loss": 0.1247, + "step": 21902 + }, + { + "epoch": 1.2021953896816684, + "grad_norm": 1.9593034982681274, + "learning_rate": 8.469167243744427e-06, + "loss": 0.2262, + "step": 21904 + }, + { + "epoch": 1.2023051591657519, + "grad_norm": 1.1591572761535645, + "learning_rate": 8.465239671375388e-06, + "loss": 0.1536, + "step": 21906 + }, + { + "epoch": 1.2024149286498353, + "grad_norm": 2.184990882873535, + "learning_rate": 8.4613128243021e-06, + "loss": 0.2317, + "step": 21908 + }, + { + "epoch": 1.2025246981339188, + "grad_norm": 1.6944230794906616, + "learning_rate": 8.45738670269681e-06, + "loss": 0.2842, + "step": 21910 + }, + { + "epoch": 1.2026344676180023, + "grad_norm": 1.2919009923934937, + "learning_rate": 8.453461306731736e-06, + "loss": 0.2263, + "step": 21912 + }, + { + "epoch": 1.2027442371020856, + "grad_norm": 1.1147829294204712, + "learning_rate": 8.449536636579061e-06, + "loss": 0.2089, + "step": 21914 + }, + { + "epoch": 1.202854006586169, + "grad_norm": 2.0396053791046143, + "learning_rate": 8.445612692410939e-06, + "loss": 0.2303, + "step": 21916 + }, + { + "epoch": 1.2029637760702525, + "grad_norm": 1.7220747470855713, + "learning_rate": 8.441689474399495e-06, + "loss": 0.2155, + "step": 21918 + }, + { + "epoch": 1.203073545554336, + "grad_norm": 0.8174144625663757, + "learning_rate": 8.437766982716835e-06, + "loss": 0.263, + "step": 21920 + }, + { + "epoch": 1.2031833150384192, + "grad_norm": 1.5029464960098267, + "learning_rate": 8.433845217535005e-06, + "loss": 0.1567, + "step": 21922 + }, + { + "epoch": 1.2032930845225027, + "grad_norm": 1.239644169807434, + "learning_rate": 8.429924179026035e-06, + "loss": 0.1621, + "step": 21924 + }, + { + "epoch": 1.2034028540065862, + "grad_norm": 0.9054602384567261, + "learning_rate": 8.426003867361923e-06, + "loss": 0.1014, + "step": 21926 + }, + { + "epoch": 1.2035126234906697, + "grad_norm": 0.944786012172699, + "learning_rate": 8.422084282714629e-06, + "loss": 0.1431, + "step": 21928 + }, + { + "epoch": 1.203622392974753, + "grad_norm": 1.0678530931472778, + "learning_rate": 8.41816542525608e-06, + "loss": 0.1648, + "step": 21930 + }, + { + "epoch": 1.2037321624588364, + "grad_norm": 2.219252586364746, + "learning_rate": 8.414247295158195e-06, + "loss": 0.2064, + "step": 21932 + }, + { + "epoch": 1.2038419319429199, + "grad_norm": 1.1862952709197998, + "learning_rate": 8.410329892592828e-06, + "loss": 0.1623, + "step": 21934 + }, + { + "epoch": 1.2039517014270034, + "grad_norm": 1.1085330247879028, + "learning_rate": 8.40641321773182e-06, + "loss": 0.2274, + "step": 21936 + }, + { + "epoch": 1.2040614709110868, + "grad_norm": 1.352101445198059, + "learning_rate": 8.402497270746976e-06, + "loss": 0.2061, + "step": 21938 + }, + { + "epoch": 1.20417124039517, + "grad_norm": 1.4321503639221191, + "learning_rate": 8.398582051810063e-06, + "loss": 0.211, + "step": 21940 + }, + { + "epoch": 1.2042810098792536, + "grad_norm": 1.3641996383666992, + "learning_rate": 8.39466756109282e-06, + "loss": 0.1725, + "step": 21942 + }, + { + "epoch": 1.204390779363337, + "grad_norm": 1.4118974208831787, + "learning_rate": 8.390753798766971e-06, + "loss": 0.163, + "step": 21944 + }, + { + "epoch": 1.2045005488474203, + "grad_norm": 1.2131003141403198, + "learning_rate": 8.386840765004173e-06, + "loss": 0.2552, + "step": 21946 + }, + { + "epoch": 1.2046103183315038, + "grad_norm": 0.7364380359649658, + "learning_rate": 8.382928459976091e-06, + "loss": 0.1561, + "step": 21948 + }, + { + "epoch": 1.2047200878155873, + "grad_norm": 0.9598454236984253, + "learning_rate": 8.379016883854326e-06, + "loss": 0.1494, + "step": 21950 + }, + { + "epoch": 1.2048298572996707, + "grad_norm": 0.8547390699386597, + "learning_rate": 8.375106036810461e-06, + "loss": 0.1631, + "step": 21952 + }, + { + "epoch": 1.2049396267837542, + "grad_norm": 0.763831377029419, + "learning_rate": 8.371195919016048e-06, + "loss": 0.1482, + "step": 21954 + }, + { + "epoch": 1.2050493962678375, + "grad_norm": 1.4288592338562012, + "learning_rate": 8.367286530642591e-06, + "loss": 0.1843, + "step": 21956 + }, + { + "epoch": 1.205159165751921, + "grad_norm": 1.1824764013290405, + "learning_rate": 8.363377871861594e-06, + "loss": 0.1706, + "step": 21958 + }, + { + "epoch": 1.2052689352360044, + "grad_norm": 1.5637600421905518, + "learning_rate": 8.3594699428445e-06, + "loss": 0.2033, + "step": 21960 + }, + { + "epoch": 1.205378704720088, + "grad_norm": 1.082497477531433, + "learning_rate": 8.355562743762732e-06, + "loss": 0.145, + "step": 21962 + }, + { + "epoch": 1.2054884742041712, + "grad_norm": 0.9699558615684509, + "learning_rate": 8.351656274787676e-06, + "loss": 0.0886, + "step": 21964 + }, + { + "epoch": 1.2055982436882546, + "grad_norm": 0.7757745981216431, + "learning_rate": 8.347750536090684e-06, + "loss": 0.1155, + "step": 21966 + }, + { + "epoch": 1.2057080131723381, + "grad_norm": 1.4792490005493164, + "learning_rate": 8.343845527843094e-06, + "loss": 0.2044, + "step": 21968 + }, + { + "epoch": 1.2058177826564216, + "grad_norm": 0.7988273501396179, + "learning_rate": 8.339941250216194e-06, + "loss": 0.2128, + "step": 21970 + }, + { + "epoch": 1.2059275521405048, + "grad_norm": 1.2657846212387085, + "learning_rate": 8.336037703381242e-06, + "loss": 0.1504, + "step": 21972 + }, + { + "epoch": 1.2060373216245883, + "grad_norm": 1.1693496704101562, + "learning_rate": 8.332134887509468e-06, + "loss": 0.3356, + "step": 21974 + }, + { + "epoch": 1.2061470911086718, + "grad_norm": 1.3719147443771362, + "learning_rate": 8.328232802772062e-06, + "loss": 0.3778, + "step": 21976 + }, + { + "epoch": 1.2062568605927553, + "grad_norm": 1.378046989440918, + "learning_rate": 8.3243314493402e-06, + "loss": 0.1595, + "step": 21978 + }, + { + "epoch": 1.2063666300768388, + "grad_norm": 0.9586505889892578, + "learning_rate": 8.320430827385003e-06, + "loss": 0.179, + "step": 21980 + }, + { + "epoch": 1.206476399560922, + "grad_norm": 1.2247897386550903, + "learning_rate": 8.316530937077587e-06, + "loss": 0.2387, + "step": 21982 + }, + { + "epoch": 1.2065861690450055, + "grad_norm": 1.3845628499984741, + "learning_rate": 8.312631778589015e-06, + "loss": 0.1558, + "step": 21984 + }, + { + "epoch": 1.206695938529089, + "grad_norm": 0.9908838272094727, + "learning_rate": 8.308733352090314e-06, + "loss": 0.217, + "step": 21986 + }, + { + "epoch": 1.2068057080131722, + "grad_norm": 1.2420998811721802, + "learning_rate": 8.304835657752494e-06, + "loss": 0.2022, + "step": 21988 + }, + { + "epoch": 1.2069154774972557, + "grad_norm": 1.5018311738967896, + "learning_rate": 8.30093869574653e-06, + "loss": 0.2076, + "step": 21990 + }, + { + "epoch": 1.2070252469813392, + "grad_norm": 2.227069854736328, + "learning_rate": 8.297042466243348e-06, + "loss": 0.3116, + "step": 21992 + }, + { + "epoch": 1.2071350164654227, + "grad_norm": 1.2927231788635254, + "learning_rate": 8.293146969413876e-06, + "loss": 0.1948, + "step": 21994 + }, + { + "epoch": 1.2072447859495061, + "grad_norm": 1.0332118272781372, + "learning_rate": 8.289252205428979e-06, + "loss": 0.1284, + "step": 21996 + }, + { + "epoch": 1.2073545554335894, + "grad_norm": 1.1496751308441162, + "learning_rate": 8.2853581744595e-06, + "loss": 0.1599, + "step": 21998 + }, + { + "epoch": 1.2074643249176729, + "grad_norm": 1.2489020824432373, + "learning_rate": 8.281464876676254e-06, + "loss": 0.1825, + "step": 22000 + }, + { + "epoch": 1.2075740944017563, + "grad_norm": 1.2059346437454224, + "learning_rate": 8.277572312250009e-06, + "loss": 0.1617, + "step": 22002 + }, + { + "epoch": 1.2076838638858398, + "grad_norm": 2.107996702194214, + "learning_rate": 8.273680481351528e-06, + "loss": 0.31, + "step": 22004 + }, + { + "epoch": 1.207793633369923, + "grad_norm": 0.9312357902526855, + "learning_rate": 8.269789384151522e-06, + "loss": 0.1434, + "step": 22006 + }, + { + "epoch": 1.2079034028540065, + "grad_norm": 1.4657055139541626, + "learning_rate": 8.265899020820658e-06, + "loss": 0.2471, + "step": 22008 + }, + { + "epoch": 1.20801317233809, + "grad_norm": 0.9182456731796265, + "learning_rate": 8.26200939152961e-06, + "loss": 0.1678, + "step": 22010 + }, + { + "epoch": 1.2081229418221735, + "grad_norm": 1.5240914821624756, + "learning_rate": 8.258120496448985e-06, + "loss": 0.2279, + "step": 22012 + }, + { + "epoch": 1.2082327113062568, + "grad_norm": 1.3925137519836426, + "learning_rate": 8.254232335749368e-06, + "loss": 0.3317, + "step": 22014 + }, + { + "epoch": 1.2083424807903402, + "grad_norm": 1.2716200351715088, + "learning_rate": 8.250344909601304e-06, + "loss": 0.1785, + "step": 22016 + }, + { + "epoch": 1.2084522502744237, + "grad_norm": 1.6136281490325928, + "learning_rate": 8.246458218175335e-06, + "loss": 0.2912, + "step": 22018 + }, + { + "epoch": 1.2085620197585072, + "grad_norm": 1.022262454032898, + "learning_rate": 8.24257226164194e-06, + "loss": 0.2056, + "step": 22020 + }, + { + "epoch": 1.2086717892425907, + "grad_norm": 0.9197710156440735, + "learning_rate": 8.238687040171575e-06, + "loss": 0.1805, + "step": 22022 + }, + { + "epoch": 1.208781558726674, + "grad_norm": 1.1225719451904297, + "learning_rate": 8.234802553934665e-06, + "loss": 0.2394, + "step": 22024 + }, + { + "epoch": 1.2088913282107574, + "grad_norm": 0.7620855569839478, + "learning_rate": 8.230918803101602e-06, + "loss": 0.1824, + "step": 22026 + }, + { + "epoch": 1.2090010976948409, + "grad_norm": 1.7190279960632324, + "learning_rate": 8.227035787842744e-06, + "loss": 0.2009, + "step": 22028 + }, + { + "epoch": 1.2091108671789244, + "grad_norm": 0.8949450254440308, + "learning_rate": 8.22315350832843e-06, + "loss": 0.1223, + "step": 22030 + }, + { + "epoch": 1.2092206366630076, + "grad_norm": 1.2216793298721313, + "learning_rate": 8.219271964728944e-06, + "loss": 0.2121, + "step": 22032 + }, + { + "epoch": 1.209330406147091, + "grad_norm": 1.287189245223999, + "learning_rate": 8.215391157214558e-06, + "loss": 0.166, + "step": 22034 + }, + { + "epoch": 1.2094401756311746, + "grad_norm": 0.8308677077293396, + "learning_rate": 8.2115110859555e-06, + "loss": 0.1181, + "step": 22036 + }, + { + "epoch": 1.209549945115258, + "grad_norm": 1.0979619026184082, + "learning_rate": 8.20763175112196e-06, + "loss": 0.1808, + "step": 22038 + }, + { + "epoch": 1.2096597145993413, + "grad_norm": 1.109156608581543, + "learning_rate": 8.203753152884122e-06, + "loss": 0.2068, + "step": 22040 + }, + { + "epoch": 1.2097694840834248, + "grad_norm": 1.2874906063079834, + "learning_rate": 8.199875291412101e-06, + "loss": 0.1239, + "step": 22042 + }, + { + "epoch": 1.2098792535675083, + "grad_norm": 1.2952908277511597, + "learning_rate": 8.195998166876021e-06, + "loss": 0.1457, + "step": 22044 + }, + { + "epoch": 1.2099890230515917, + "grad_norm": 1.1489067077636719, + "learning_rate": 8.192121779445939e-06, + "loss": 0.1709, + "step": 22046 + }, + { + "epoch": 1.2100987925356752, + "grad_norm": 1.4263367652893066, + "learning_rate": 8.18824612929189e-06, + "loss": 0.1486, + "step": 22048 + }, + { + "epoch": 1.2102085620197585, + "grad_norm": 1.3153542280197144, + "learning_rate": 8.184371216583888e-06, + "loss": 0.2004, + "step": 22050 + }, + { + "epoch": 1.210318331503842, + "grad_norm": 0.8657707571983337, + "learning_rate": 8.180497041491888e-06, + "loss": 0.1355, + "step": 22052 + }, + { + "epoch": 1.2104281009879254, + "grad_norm": 1.8101886510849, + "learning_rate": 8.176623604185854e-06, + "loss": 0.2194, + "step": 22054 + }, + { + "epoch": 1.2105378704720087, + "grad_norm": 0.9177037477493286, + "learning_rate": 8.172750904835682e-06, + "loss": 0.3208, + "step": 22056 + }, + { + "epoch": 1.2106476399560921, + "grad_norm": 1.498416543006897, + "learning_rate": 8.16887894361125e-06, + "loss": 0.2325, + "step": 22058 + }, + { + "epoch": 1.2107574094401756, + "grad_norm": 0.8145471215248108, + "learning_rate": 8.165007720682396e-06, + "loss": 0.1754, + "step": 22060 + }, + { + "epoch": 1.210867178924259, + "grad_norm": 1.427649736404419, + "learning_rate": 8.161137236218936e-06, + "loss": 0.1718, + "step": 22062 + }, + { + "epoch": 1.2109769484083426, + "grad_norm": 1.8922340869903564, + "learning_rate": 8.157267490390638e-06, + "loss": 0.2817, + "step": 22064 + }, + { + "epoch": 1.2110867178924258, + "grad_norm": 1.1061434745788574, + "learning_rate": 8.153398483367266e-06, + "loss": 0.1367, + "step": 22066 + }, + { + "epoch": 1.2111964873765093, + "grad_norm": 1.4532684087753296, + "learning_rate": 8.149530215318515e-06, + "loss": 0.151, + "step": 22068 + }, + { + "epoch": 1.2113062568605928, + "grad_norm": 1.5456165075302124, + "learning_rate": 8.145662686414086e-06, + "loss": 0.2158, + "step": 22070 + }, + { + "epoch": 1.2114160263446763, + "grad_norm": 1.88516366481781, + "learning_rate": 8.141795896823618e-06, + "loss": 0.222, + "step": 22072 + }, + { + "epoch": 1.2115257958287595, + "grad_norm": 0.9077252745628357, + "learning_rate": 8.137929846716724e-06, + "loss": 0.1054, + "step": 22074 + }, + { + "epoch": 1.211635565312843, + "grad_norm": 1.066731333732605, + "learning_rate": 8.134064536262992e-06, + "loss": 0.1467, + "step": 22076 + }, + { + "epoch": 1.2117453347969265, + "grad_norm": 1.1259316205978394, + "learning_rate": 8.130199965631966e-06, + "loss": 0.1545, + "step": 22078 + }, + { + "epoch": 1.21185510428101, + "grad_norm": 0.96626877784729, + "learning_rate": 8.126336134993176e-06, + "loss": 0.1598, + "step": 22080 + }, + { + "epoch": 1.2119648737650932, + "grad_norm": 1.5110892057418823, + "learning_rate": 8.122473044516108e-06, + "loss": 0.1771, + "step": 22082 + }, + { + "epoch": 1.2120746432491767, + "grad_norm": 1.260026216506958, + "learning_rate": 8.118610694370206e-06, + "loss": 0.1547, + "step": 22084 + }, + { + "epoch": 1.2121844127332602, + "grad_norm": 1.5162503719329834, + "learning_rate": 8.114749084724904e-06, + "loss": 0.2773, + "step": 22086 + }, + { + "epoch": 1.2122941822173436, + "grad_norm": 1.296227216720581, + "learning_rate": 8.110888215749574e-06, + "loss": 0.1404, + "step": 22088 + }, + { + "epoch": 1.2124039517014271, + "grad_norm": 0.9479511380195618, + "learning_rate": 8.107028087613592e-06, + "loss": 0.1204, + "step": 22090 + }, + { + "epoch": 1.2125137211855104, + "grad_norm": 0.9573137164115906, + "learning_rate": 8.103168700486272e-06, + "loss": 0.0954, + "step": 22092 + }, + { + "epoch": 1.2126234906695939, + "grad_norm": 0.7378578186035156, + "learning_rate": 8.09931005453691e-06, + "loss": 0.1507, + "step": 22094 + }, + { + "epoch": 1.2127332601536773, + "grad_norm": 1.2016175985336304, + "learning_rate": 8.095452149934762e-06, + "loss": 0.2451, + "step": 22096 + }, + { + "epoch": 1.2128430296377606, + "grad_norm": 0.8993279337882996, + "learning_rate": 8.091594986849046e-06, + "loss": 0.1719, + "step": 22098 + }, + { + "epoch": 1.212952799121844, + "grad_norm": 0.702985942363739, + "learning_rate": 8.087738565448975e-06, + "loss": 0.0893, + "step": 22100 + }, + { + "epoch": 1.2130625686059275, + "grad_norm": 0.8082519173622131, + "learning_rate": 8.08388288590369e-06, + "loss": 0.1275, + "step": 22102 + }, + { + "epoch": 1.213172338090011, + "grad_norm": 1.2482757568359375, + "learning_rate": 8.080027948382341e-06, + "loss": 0.1772, + "step": 22104 + }, + { + "epoch": 1.2132821075740945, + "grad_norm": 1.0125595331192017, + "learning_rate": 8.076173753054012e-06, + "loss": 0.1953, + "step": 22106 + }, + { + "epoch": 1.2133918770581777, + "grad_norm": 1.4490855932235718, + "learning_rate": 8.072320300087771e-06, + "loss": 0.1871, + "step": 22108 + }, + { + "epoch": 1.2135016465422612, + "grad_norm": 1.067406177520752, + "learning_rate": 8.068467589652647e-06, + "loss": 0.1981, + "step": 22110 + }, + { + "epoch": 1.2136114160263447, + "grad_norm": 1.2252931594848633, + "learning_rate": 8.06461562191764e-06, + "loss": 0.2182, + "step": 22112 + }, + { + "epoch": 1.2137211855104282, + "grad_norm": 0.9949693083763123, + "learning_rate": 8.060764397051707e-06, + "loss": 0.1576, + "step": 22114 + }, + { + "epoch": 1.2138309549945114, + "grad_norm": 1.116363763809204, + "learning_rate": 8.056913915223798e-06, + "loss": 0.197, + "step": 22116 + }, + { + "epoch": 1.213940724478595, + "grad_norm": 1.587402582168579, + "learning_rate": 8.053064176602806e-06, + "loss": 0.2122, + "step": 22118 + }, + { + "epoch": 1.2140504939626784, + "grad_norm": 2.343574285507202, + "learning_rate": 8.049215181357603e-06, + "loss": 0.2521, + "step": 22120 + }, + { + "epoch": 1.2141602634467619, + "grad_norm": 1.1993948221206665, + "learning_rate": 8.045366929657016e-06, + "loss": 0.1886, + "step": 22122 + }, + { + "epoch": 1.2142700329308451, + "grad_norm": 1.1109566688537598, + "learning_rate": 8.041519421669858e-06, + "loss": 0.2003, + "step": 22124 + }, + { + "epoch": 1.2143798024149286, + "grad_norm": 0.9894227385520935, + "learning_rate": 8.037672657564885e-06, + "loss": 0.102, + "step": 22126 + }, + { + "epoch": 1.214489571899012, + "grad_norm": 0.894315242767334, + "learning_rate": 8.033826637510847e-06, + "loss": 0.169, + "step": 22128 + }, + { + "epoch": 1.2145993413830956, + "grad_norm": 1.2108283042907715, + "learning_rate": 8.029981361676456e-06, + "loss": 0.2045, + "step": 22130 + }, + { + "epoch": 1.214709110867179, + "grad_norm": 1.3837058544158936, + "learning_rate": 8.026136830230374e-06, + "loss": 0.1442, + "step": 22132 + }, + { + "epoch": 1.2148188803512623, + "grad_norm": 1.2390414476394653, + "learning_rate": 8.022293043341245e-06, + "loss": 0.3442, + "step": 22134 + }, + { + "epoch": 1.2149286498353458, + "grad_norm": 1.2370190620422363, + "learning_rate": 8.018450001177672e-06, + "loss": 0.3524, + "step": 22136 + }, + { + "epoch": 1.2150384193194292, + "grad_norm": 1.0056626796722412, + "learning_rate": 8.014607703908225e-06, + "loss": 0.2506, + "step": 22138 + }, + { + "epoch": 1.2151481888035127, + "grad_norm": 1.0633704662322998, + "learning_rate": 8.010766151701463e-06, + "loss": 0.1607, + "step": 22140 + }, + { + "epoch": 1.215257958287596, + "grad_norm": 0.7300794124603271, + "learning_rate": 8.006925344725885e-06, + "loss": 0.1304, + "step": 22142 + }, + { + "epoch": 1.2153677277716795, + "grad_norm": 2.3737661838531494, + "learning_rate": 8.003085283149969e-06, + "loss": 0.1805, + "step": 22144 + }, + { + "epoch": 1.215477497255763, + "grad_norm": 0.8535043001174927, + "learning_rate": 7.999245967142158e-06, + "loss": 0.2057, + "step": 22146 + }, + { + "epoch": 1.2155872667398464, + "grad_norm": 0.8446755409240723, + "learning_rate": 7.995407396870862e-06, + "loss": 0.1352, + "step": 22148 + }, + { + "epoch": 1.2156970362239297, + "grad_norm": 1.0827590227127075, + "learning_rate": 7.991569572504453e-06, + "loss": 0.3087, + "step": 22150 + }, + { + "epoch": 1.2158068057080131, + "grad_norm": 1.3066380023956299, + "learning_rate": 7.987732494211294e-06, + "loss": 0.2045, + "step": 22152 + }, + { + "epoch": 1.2159165751920966, + "grad_norm": 1.1745339632034302, + "learning_rate": 7.98389616215969e-06, + "loss": 0.1715, + "step": 22154 + }, + { + "epoch": 1.21602634467618, + "grad_norm": 0.7728831768035889, + "learning_rate": 7.98006057651792e-06, + "loss": 0.1539, + "step": 22156 + }, + { + "epoch": 1.2161361141602636, + "grad_norm": 1.6303759813308716, + "learning_rate": 7.976225737454224e-06, + "loss": 0.1907, + "step": 22158 + }, + { + "epoch": 1.2162458836443468, + "grad_norm": 0.9896330833435059, + "learning_rate": 7.97239164513683e-06, + "loss": 0.1429, + "step": 22160 + }, + { + "epoch": 1.2163556531284303, + "grad_norm": 1.2814276218414307, + "learning_rate": 7.968558299733917e-06, + "loss": 0.2323, + "step": 22162 + }, + { + "epoch": 1.2164654226125138, + "grad_norm": 1.5461326837539673, + "learning_rate": 7.964725701413625e-06, + "loss": 0.357, + "step": 22164 + }, + { + "epoch": 1.216575192096597, + "grad_norm": 0.835347592830658, + "learning_rate": 7.960893850344083e-06, + "loss": 0.2024, + "step": 22166 + }, + { + "epoch": 1.2166849615806805, + "grad_norm": 2.2044155597686768, + "learning_rate": 7.957062746693372e-06, + "loss": 0.2107, + "step": 22168 + }, + { + "epoch": 1.216794731064764, + "grad_norm": 0.9788275361061096, + "learning_rate": 7.95323239062954e-06, + "loss": 0.1798, + "step": 22170 + }, + { + "epoch": 1.2169045005488475, + "grad_norm": 0.9332573413848877, + "learning_rate": 7.949402782320605e-06, + "loss": 0.2498, + "step": 22172 + }, + { + "epoch": 1.217014270032931, + "grad_norm": 1.019320011138916, + "learning_rate": 7.945573921934546e-06, + "loss": 0.1354, + "step": 22174 + }, + { + "epoch": 1.2171240395170142, + "grad_norm": 1.2558879852294922, + "learning_rate": 7.941745809639331e-06, + "loss": 0.1816, + "step": 22176 + }, + { + "epoch": 1.2172338090010977, + "grad_norm": 1.1481804847717285, + "learning_rate": 7.937918445602871e-06, + "loss": 0.2537, + "step": 22178 + }, + { + "epoch": 1.2173435784851812, + "grad_norm": 0.8094334602355957, + "learning_rate": 7.934091829993055e-06, + "loss": 0.0864, + "step": 22180 + }, + { + "epoch": 1.2174533479692646, + "grad_norm": 0.9897680878639221, + "learning_rate": 7.930265962977734e-06, + "loss": 0.1055, + "step": 22182 + }, + { + "epoch": 1.2175631174533479, + "grad_norm": 0.9460722804069519, + "learning_rate": 7.92644084472473e-06, + "loss": 0.1466, + "step": 22184 + }, + { + "epoch": 1.2176728869374314, + "grad_norm": 1.1787959337234497, + "learning_rate": 7.922616475401829e-06, + "loss": 0.2128, + "step": 22186 + }, + { + "epoch": 1.2177826564215148, + "grad_norm": 1.58228600025177, + "learning_rate": 7.918792855176787e-06, + "loss": 0.2378, + "step": 22188 + }, + { + "epoch": 1.2178924259055983, + "grad_norm": 0.9362415671348572, + "learning_rate": 7.914969984217338e-06, + "loss": 0.2297, + "step": 22190 + }, + { + "epoch": 1.2180021953896816, + "grad_norm": 0.7300733923912048, + "learning_rate": 7.911147862691163e-06, + "loss": 0.186, + "step": 22192 + }, + { + "epoch": 1.218111964873765, + "grad_norm": 2.061777353286743, + "learning_rate": 7.90732649076592e-06, + "loss": 0.1883, + "step": 22194 + }, + { + "epoch": 1.2182217343578485, + "grad_norm": 1.197977066040039, + "learning_rate": 7.903505868609235e-06, + "loss": 0.1789, + "step": 22196 + }, + { + "epoch": 1.218331503841932, + "grad_norm": 1.2569094896316528, + "learning_rate": 7.899685996388695e-06, + "loss": 0.1858, + "step": 22198 + }, + { + "epoch": 1.2184412733260155, + "grad_norm": 1.32020902633667, + "learning_rate": 7.895866874271854e-06, + "loss": 0.1492, + "step": 22200 + }, + { + "epoch": 1.2185510428100987, + "grad_norm": 1.3041179180145264, + "learning_rate": 7.892048502426252e-06, + "loss": 0.1636, + "step": 22202 + }, + { + "epoch": 1.2186608122941822, + "grad_norm": 1.1499178409576416, + "learning_rate": 7.88823088101937e-06, + "loss": 0.2743, + "step": 22204 + }, + { + "epoch": 1.2187705817782657, + "grad_norm": 0.9702976942062378, + "learning_rate": 7.884414010218677e-06, + "loss": 0.2162, + "step": 22206 + }, + { + "epoch": 1.218880351262349, + "grad_norm": 1.2140719890594482, + "learning_rate": 7.880597890191587e-06, + "loss": 0.1679, + "step": 22208 + }, + { + "epoch": 1.2189901207464324, + "grad_norm": 0.9292429089546204, + "learning_rate": 7.876782521105506e-06, + "loss": 0.2306, + "step": 22210 + }, + { + "epoch": 1.219099890230516, + "grad_norm": 1.053452730178833, + "learning_rate": 7.872967903127779e-06, + "loss": 0.2085, + "step": 22212 + }, + { + "epoch": 1.2192096597145994, + "grad_norm": 1.23482346534729, + "learning_rate": 7.869154036425752e-06, + "loss": 0.3297, + "step": 22214 + }, + { + "epoch": 1.2193194291986829, + "grad_norm": 0.9289733171463013, + "learning_rate": 7.865340921166714e-06, + "loss": 0.1712, + "step": 22216 + }, + { + "epoch": 1.2194291986827661, + "grad_norm": 1.3929475545883179, + "learning_rate": 7.861528557517913e-06, + "loss": 0.2846, + "step": 22218 + }, + { + "epoch": 1.2195389681668496, + "grad_norm": 1.171831727027893, + "learning_rate": 7.857716945646603e-06, + "loss": 0.1837, + "step": 22220 + }, + { + "epoch": 1.219648737650933, + "grad_norm": 2.8091135025024414, + "learning_rate": 7.853906085719964e-06, + "loss": 0.3234, + "step": 22222 + }, + { + "epoch": 1.2197585071350165, + "grad_norm": 1.1397370100021362, + "learning_rate": 7.850095977905156e-06, + "loss": 0.1741, + "step": 22224 + }, + { + "epoch": 1.2198682766190998, + "grad_norm": 1.2711912393569946, + "learning_rate": 7.84628662236932e-06, + "loss": 0.1903, + "step": 22226 + }, + { + "epoch": 1.2199780461031833, + "grad_norm": 1.4691537618637085, + "learning_rate": 7.84247801927955e-06, + "loss": 0.2695, + "step": 22228 + }, + { + "epoch": 1.2200878155872668, + "grad_norm": 1.118097186088562, + "learning_rate": 7.838670168802909e-06, + "loss": 0.1958, + "step": 22230 + }, + { + "epoch": 1.2201975850713502, + "grad_norm": 0.9531015157699585, + "learning_rate": 7.834863071106428e-06, + "loss": 0.1795, + "step": 22232 + }, + { + "epoch": 1.2203073545554335, + "grad_norm": 1.3530153036117554, + "learning_rate": 7.8310567263571e-06, + "loss": 0.2336, + "step": 22234 + }, + { + "epoch": 1.220417124039517, + "grad_norm": 1.3874478340148926, + "learning_rate": 7.82725113472189e-06, + "loss": 0.1923, + "step": 22236 + }, + { + "epoch": 1.2205268935236004, + "grad_norm": 1.6515049934387207, + "learning_rate": 7.823446296367739e-06, + "loss": 0.3033, + "step": 22238 + }, + { + "epoch": 1.220636663007684, + "grad_norm": 0.8000200986862183, + "learning_rate": 7.819642211461545e-06, + "loss": 0.1355, + "step": 22240 + }, + { + "epoch": 1.2207464324917674, + "grad_norm": 1.718191385269165, + "learning_rate": 7.815838880170168e-06, + "loss": 0.2229, + "step": 22242 + }, + { + "epoch": 1.2208562019758507, + "grad_norm": 0.8089155554771423, + "learning_rate": 7.812036302660441e-06, + "loss": 0.1177, + "step": 22244 + }, + { + "epoch": 1.2209659714599341, + "grad_norm": 0.9607914090156555, + "learning_rate": 7.808234479099166e-06, + "loss": 0.1949, + "step": 22246 + }, + { + "epoch": 1.2210757409440176, + "grad_norm": 1.0021240711212158, + "learning_rate": 7.8044334096531e-06, + "loss": 0.1471, + "step": 22248 + }, + { + "epoch": 1.221185510428101, + "grad_norm": 1.2361302375793457, + "learning_rate": 7.800633094488988e-06, + "loss": 0.1962, + "step": 22250 + }, + { + "epoch": 1.2212952799121843, + "grad_norm": 1.064537525177002, + "learning_rate": 7.796833533773534e-06, + "loss": 0.1498, + "step": 22252 + }, + { + "epoch": 1.2214050493962678, + "grad_norm": 1.0152764320373535, + "learning_rate": 7.793034727673399e-06, + "loss": 0.1611, + "step": 22254 + }, + { + "epoch": 1.2215148188803513, + "grad_norm": 1.4022328853607178, + "learning_rate": 7.789236676355219e-06, + "loss": 0.2407, + "step": 22256 + }, + { + "epoch": 1.2216245883644348, + "grad_norm": 2.132408380508423, + "learning_rate": 7.785439379985592e-06, + "loss": 0.2203, + "step": 22258 + }, + { + "epoch": 1.221734357848518, + "grad_norm": 1.4302828311920166, + "learning_rate": 7.781642838731091e-06, + "loss": 0.1898, + "step": 22260 + }, + { + "epoch": 1.2218441273326015, + "grad_norm": 1.482479453086853, + "learning_rate": 7.777847052758235e-06, + "loss": 0.2532, + "step": 22262 + }, + { + "epoch": 1.221953896816685, + "grad_norm": 1.6549948453903198, + "learning_rate": 7.774052022233549e-06, + "loss": 0.2864, + "step": 22264 + }, + { + "epoch": 1.2220636663007685, + "grad_norm": 0.9626025557518005, + "learning_rate": 7.770257747323493e-06, + "loss": 0.1394, + "step": 22266 + }, + { + "epoch": 1.222173435784852, + "grad_norm": 1.017223834991455, + "learning_rate": 7.7664642281945e-06, + "loss": 0.1434, + "step": 22268 + }, + { + "epoch": 1.2222832052689352, + "grad_norm": 1.2666746377944946, + "learning_rate": 7.762671465012972e-06, + "loss": 0.165, + "step": 22270 + }, + { + "epoch": 1.2223929747530187, + "grad_norm": 1.1137181520462036, + "learning_rate": 7.75887945794527e-06, + "loss": 0.148, + "step": 22272 + }, + { + "epoch": 1.2225027442371021, + "grad_norm": 1.139574646949768, + "learning_rate": 7.755088207157752e-06, + "loss": 0.1385, + "step": 22274 + }, + { + "epoch": 1.2226125137211854, + "grad_norm": 1.132820963859558, + "learning_rate": 7.751297712816704e-06, + "loss": 0.2295, + "step": 22276 + }, + { + "epoch": 1.2227222832052689, + "grad_norm": 0.8475651144981384, + "learning_rate": 7.7475079750884e-06, + "loss": 0.1658, + "step": 22278 + }, + { + "epoch": 1.2228320526893524, + "grad_norm": 1.3448560237884521, + "learning_rate": 7.743718994139071e-06, + "loss": 0.2059, + "step": 22280 + }, + { + "epoch": 1.2229418221734358, + "grad_norm": 0.9579096436500549, + "learning_rate": 7.739930770134931e-06, + "loss": 0.1167, + "step": 22282 + }, + { + "epoch": 1.2230515916575193, + "grad_norm": 1.419018268585205, + "learning_rate": 7.736143303242147e-06, + "loss": 0.2656, + "step": 22284 + }, + { + "epoch": 1.2231613611416026, + "grad_norm": 1.1205527782440186, + "learning_rate": 7.732356593626846e-06, + "loss": 0.2281, + "step": 22286 + }, + { + "epoch": 1.223271130625686, + "grad_norm": 1.5129234790802002, + "learning_rate": 7.728570641455146e-06, + "loss": 0.2441, + "step": 22288 + }, + { + "epoch": 1.2233809001097695, + "grad_norm": 0.8381415605545044, + "learning_rate": 7.724785446893112e-06, + "loss": 0.1637, + "step": 22290 + }, + { + "epoch": 1.223490669593853, + "grad_norm": 0.8101715445518494, + "learning_rate": 7.721001010106779e-06, + "loss": 0.1485, + "step": 22292 + }, + { + "epoch": 1.2236004390779363, + "grad_norm": 0.8838652968406677, + "learning_rate": 7.717217331262152e-06, + "loss": 0.2216, + "step": 22294 + }, + { + "epoch": 1.2237102085620197, + "grad_norm": 1.3878686428070068, + "learning_rate": 7.713434410525203e-06, + "loss": 0.1552, + "step": 22296 + }, + { + "epoch": 1.2238199780461032, + "grad_norm": 1.1297528743743896, + "learning_rate": 7.709652248061858e-06, + "loss": 0.2649, + "step": 22298 + }, + { + "epoch": 1.2239297475301867, + "grad_norm": 1.2418804168701172, + "learning_rate": 7.70587084403804e-06, + "loss": 0.1782, + "step": 22300 + }, + { + "epoch": 1.22403951701427, + "grad_norm": 0.9490138292312622, + "learning_rate": 7.702090198619614e-06, + "loss": 0.152, + "step": 22302 + }, + { + "epoch": 1.2241492864983534, + "grad_norm": 0.8506868481636047, + "learning_rate": 7.698310311972412e-06, + "loss": 0.1519, + "step": 22304 + }, + { + "epoch": 1.224259055982437, + "grad_norm": 0.9114393591880798, + "learning_rate": 7.694531184262242e-06, + "loss": 0.1762, + "step": 22306 + }, + { + "epoch": 1.2243688254665204, + "grad_norm": 1.5770505666732788, + "learning_rate": 7.690752815654867e-06, + "loss": 0.1543, + "step": 22308 + }, + { + "epoch": 1.2244785949506038, + "grad_norm": 0.9539921283721924, + "learning_rate": 7.686975206316041e-06, + "loss": 0.1759, + "step": 22310 + }, + { + "epoch": 1.224588364434687, + "grad_norm": 0.6109926700592041, + "learning_rate": 7.68319835641145e-06, + "loss": 0.0938, + "step": 22312 + }, + { + "epoch": 1.2246981339187706, + "grad_norm": 1.0333402156829834, + "learning_rate": 7.679422266106784e-06, + "loss": 0.1549, + "step": 22314 + }, + { + "epoch": 1.224807903402854, + "grad_norm": 1.0642307996749878, + "learning_rate": 7.67564693556767e-06, + "loss": 0.2719, + "step": 22316 + }, + { + "epoch": 1.2249176728869373, + "grad_norm": 1.0209956169128418, + "learning_rate": 7.671872364959717e-06, + "loss": 0.3101, + "step": 22318 + }, + { + "epoch": 1.2250274423710208, + "grad_norm": 1.2270736694335938, + "learning_rate": 7.668098554448494e-06, + "loss": 0.1989, + "step": 22320 + }, + { + "epoch": 1.2251372118551043, + "grad_norm": 0.8602864146232605, + "learning_rate": 7.664325504199529e-06, + "loss": 0.1434, + "step": 22322 + }, + { + "epoch": 1.2252469813391877, + "grad_norm": 0.9013287425041199, + "learning_rate": 7.660553214378344e-06, + "loss": 0.1687, + "step": 22324 + }, + { + "epoch": 1.2253567508232712, + "grad_norm": 1.2793965339660645, + "learning_rate": 7.656781685150401e-06, + "loss": 0.1814, + "step": 22326 + }, + { + "epoch": 1.2254665203073545, + "grad_norm": 1.3326586484909058, + "learning_rate": 7.653010916681141e-06, + "loss": 0.2462, + "step": 22328 + }, + { + "epoch": 1.225576289791438, + "grad_norm": 1.1691997051239014, + "learning_rate": 7.649240909135965e-06, + "loss": 0.2769, + "step": 22330 + }, + { + "epoch": 1.2256860592755214, + "grad_norm": 0.8784566521644592, + "learning_rate": 7.645471662680245e-06, + "loss": 0.1507, + "step": 22332 + }, + { + "epoch": 1.225795828759605, + "grad_norm": 1.1763269901275635, + "learning_rate": 7.641703177479311e-06, + "loss": 0.1682, + "step": 22334 + }, + { + "epoch": 1.2259055982436882, + "grad_norm": 1.0608071088790894, + "learning_rate": 7.637935453698483e-06, + "loss": 0.1684, + "step": 22336 + }, + { + "epoch": 1.2260153677277716, + "grad_norm": 2.190580368041992, + "learning_rate": 7.634168491503024e-06, + "loss": 0.2144, + "step": 22338 + }, + { + "epoch": 1.2261251372118551, + "grad_norm": 0.7966528534889221, + "learning_rate": 7.630402291058164e-06, + "loss": 0.128, + "step": 22340 + }, + { + "epoch": 1.2262349066959386, + "grad_norm": 0.8376215100288391, + "learning_rate": 7.626636852529123e-06, + "loss": 0.179, + "step": 22342 + }, + { + "epoch": 1.2263446761800219, + "grad_norm": 1.8186345100402832, + "learning_rate": 7.622872176081061e-06, + "loss": 0.1725, + "step": 22344 + }, + { + "epoch": 1.2264544456641053, + "grad_norm": 0.8661434054374695, + "learning_rate": 7.619108261879121e-06, + "loss": 0.1537, + "step": 22346 + }, + { + "epoch": 1.2265642151481888, + "grad_norm": 1.2595815658569336, + "learning_rate": 7.615345110088393e-06, + "loss": 0.1593, + "step": 22348 + }, + { + "epoch": 1.2266739846322723, + "grad_norm": 1.2762573957443237, + "learning_rate": 7.611582720873964e-06, + "loss": 0.1693, + "step": 22350 + }, + { + "epoch": 1.2267837541163558, + "grad_norm": 0.9074845910072327, + "learning_rate": 7.607821094400866e-06, + "loss": 0.1861, + "step": 22352 + }, + { + "epoch": 1.226893523600439, + "grad_norm": 1.058994174003601, + "learning_rate": 7.604060230834101e-06, + "loss": 0.1648, + "step": 22354 + }, + { + "epoch": 1.2270032930845225, + "grad_norm": 1.1437963247299194, + "learning_rate": 7.600300130338636e-06, + "loss": 0.1991, + "step": 22356 + }, + { + "epoch": 1.227113062568606, + "grad_norm": 1.2059959173202515, + "learning_rate": 7.596540793079404e-06, + "loss": 0.2602, + "step": 22358 + }, + { + "epoch": 1.2272228320526892, + "grad_norm": 1.627271294593811, + "learning_rate": 7.5927822192213225e-06, + "loss": 0.238, + "step": 22360 + }, + { + "epoch": 1.2273326015367727, + "grad_norm": 0.6948056221008301, + "learning_rate": 7.589024408929252e-06, + "loss": 0.1885, + "step": 22362 + }, + { + "epoch": 1.2274423710208562, + "grad_norm": 1.177951455116272, + "learning_rate": 7.585267362368029e-06, + "loss": 0.2495, + "step": 22364 + }, + { + "epoch": 1.2275521405049397, + "grad_norm": 0.978990375995636, + "learning_rate": 7.581511079702455e-06, + "loss": 0.1669, + "step": 22366 + }, + { + "epoch": 1.2276619099890231, + "grad_norm": 0.6941247582435608, + "learning_rate": 7.577755561097303e-06, + "loss": 0.2116, + "step": 22368 + }, + { + "epoch": 1.2277716794731064, + "grad_norm": 1.2950292825698853, + "learning_rate": 7.574000806717293e-06, + "loss": 0.178, + "step": 22370 + }, + { + "epoch": 1.2278814489571899, + "grad_norm": 1.636172890663147, + "learning_rate": 7.570246816727142e-06, + "loss": 0.4488, + "step": 22372 + }, + { + "epoch": 1.2279912184412733, + "grad_norm": 1.1622148752212524, + "learning_rate": 7.566493591291524e-06, + "loss": 0.1565, + "step": 22374 + }, + { + "epoch": 1.2281009879253568, + "grad_norm": 1.4851046800613403, + "learning_rate": 7.5627411305750654e-06, + "loss": 0.2096, + "step": 22376 + }, + { + "epoch": 1.22821075740944, + "grad_norm": 2.028287649154663, + "learning_rate": 7.55898943474237e-06, + "loss": 0.199, + "step": 22378 + }, + { + "epoch": 1.2283205268935236, + "grad_norm": 1.681339979171753, + "learning_rate": 7.555238503958001e-06, + "loss": 0.3332, + "step": 22380 + }, + { + "epoch": 1.228430296377607, + "grad_norm": 2.0000712871551514, + "learning_rate": 7.551488338386495e-06, + "loss": 0.2164, + "step": 22382 + }, + { + "epoch": 1.2285400658616905, + "grad_norm": 1.5962966680526733, + "learning_rate": 7.547738938192344e-06, + "loss": 0.2382, + "step": 22384 + }, + { + "epoch": 1.2286498353457738, + "grad_norm": 1.0065568685531616, + "learning_rate": 7.543990303540035e-06, + "loss": 0.1659, + "step": 22386 + }, + { + "epoch": 1.2287596048298572, + "grad_norm": 1.0175297260284424, + "learning_rate": 7.5402424345939884e-06, + "loss": 0.148, + "step": 22388 + }, + { + "epoch": 1.2288693743139407, + "grad_norm": 2.0929620265960693, + "learning_rate": 7.536495331518606e-06, + "loss": 0.1843, + "step": 22390 + }, + { + "epoch": 1.2289791437980242, + "grad_norm": 1.1281238794326782, + "learning_rate": 7.5327489944782555e-06, + "loss": 0.1603, + "step": 22392 + }, + { + "epoch": 1.2290889132821077, + "grad_norm": 0.828996479511261, + "learning_rate": 7.52900342363726e-06, + "loss": 0.1281, + "step": 22394 + }, + { + "epoch": 1.229198682766191, + "grad_norm": 1.3581618070602417, + "learning_rate": 7.525258619159933e-06, + "loss": 0.1551, + "step": 22396 + }, + { + "epoch": 1.2293084522502744, + "grad_norm": 1.397492527961731, + "learning_rate": 7.521514581210537e-06, + "loss": 0.1471, + "step": 22398 + }, + { + "epoch": 1.2294182217343579, + "grad_norm": 1.2445082664489746, + "learning_rate": 7.517771309953292e-06, + "loss": 0.289, + "step": 22400 + }, + { + "epoch": 1.2295279912184414, + "grad_norm": 1.3243659734725952, + "learning_rate": 7.514028805552414e-06, + "loss": 0.1894, + "step": 22402 + }, + { + "epoch": 1.2296377607025246, + "grad_norm": 1.132788896560669, + "learning_rate": 7.510287068172056e-06, + "loss": 0.2505, + "step": 22404 + }, + { + "epoch": 1.229747530186608, + "grad_norm": 1.8177902698516846, + "learning_rate": 7.506546097976355e-06, + "loss": 0.2464, + "step": 22406 + }, + { + "epoch": 1.2298572996706916, + "grad_norm": 0.8561745882034302, + "learning_rate": 7.502805895129395e-06, + "loss": 0.1933, + "step": 22408 + }, + { + "epoch": 1.229967069154775, + "grad_norm": 0.6619571447372437, + "learning_rate": 7.499066459795259e-06, + "loss": 0.1117, + "step": 22410 + }, + { + "epoch": 1.2300768386388583, + "grad_norm": 1.159580111503601, + "learning_rate": 7.495327792137966e-06, + "loss": 0.1441, + "step": 22412 + }, + { + "epoch": 1.2301866081229418, + "grad_norm": 1.1785235404968262, + "learning_rate": 7.491589892321516e-06, + "loss": 0.2417, + "step": 22414 + }, + { + "epoch": 1.2302963776070253, + "grad_norm": 0.8387810587882996, + "learning_rate": 7.487852760509867e-06, + "loss": 0.1326, + "step": 22416 + }, + { + "epoch": 1.2304061470911087, + "grad_norm": 0.8782972693443298, + "learning_rate": 7.4841163968669524e-06, + "loss": 0.0969, + "step": 22418 + }, + { + "epoch": 1.2305159165751922, + "grad_norm": 1.29960298538208, + "learning_rate": 7.480380801556658e-06, + "loss": 0.1731, + "step": 22420 + }, + { + "epoch": 1.2306256860592755, + "grad_norm": 1.5303568840026855, + "learning_rate": 7.47664597474286e-06, + "loss": 0.1756, + "step": 22422 + }, + { + "epoch": 1.230735455543359, + "grad_norm": 1.3596357107162476, + "learning_rate": 7.472911916589381e-06, + "loss": 0.2777, + "step": 22424 + }, + { + "epoch": 1.2308452250274424, + "grad_norm": 1.5225411653518677, + "learning_rate": 7.469178627260012e-06, + "loss": 0.1442, + "step": 22426 + }, + { + "epoch": 1.2309549945115257, + "grad_norm": 1.1301796436309814, + "learning_rate": 7.465446106918514e-06, + "loss": 0.2476, + "step": 22428 + }, + { + "epoch": 1.2310647639956092, + "grad_norm": 0.9643179178237915, + "learning_rate": 7.461714355728608e-06, + "loss": 0.1214, + "step": 22430 + }, + { + "epoch": 1.2311745334796926, + "grad_norm": 1.1293494701385498, + "learning_rate": 7.457983373853999e-06, + "loss": 0.1213, + "step": 22432 + }, + { + "epoch": 1.2312843029637761, + "grad_norm": 1.0410995483398438, + "learning_rate": 7.454253161458335e-06, + "loss": 0.1353, + "step": 22434 + }, + { + "epoch": 1.2313940724478596, + "grad_norm": 1.1179898977279663, + "learning_rate": 7.450523718705255e-06, + "loss": 0.122, + "step": 22436 + }, + { + "epoch": 1.2315038419319428, + "grad_norm": 1.096619725227356, + "learning_rate": 7.446795045758343e-06, + "loss": 0.2002, + "step": 22438 + }, + { + "epoch": 1.2316136114160263, + "grad_norm": 0.9831221699714661, + "learning_rate": 7.4430671427811545e-06, + "loss": 0.1941, + "step": 22440 + }, + { + "epoch": 1.2317233809001098, + "grad_norm": 1.3079792261123657, + "learning_rate": 7.4393400099372155e-06, + "loss": 0.2279, + "step": 22442 + }, + { + "epoch": 1.2318331503841933, + "grad_norm": 0.7920869588851929, + "learning_rate": 7.4356136473900075e-06, + "loss": 0.1812, + "step": 22444 + }, + { + "epoch": 1.2319429198682765, + "grad_norm": 1.0678983926773071, + "learning_rate": 7.431888055303008e-06, + "loss": 0.096, + "step": 22446 + }, + { + "epoch": 1.23205268935236, + "grad_norm": 1.0902137756347656, + "learning_rate": 7.428163233839624e-06, + "loss": 0.139, + "step": 22448 + }, + { + "epoch": 1.2321624588364435, + "grad_norm": 0.7973085045814514, + "learning_rate": 7.424439183163251e-06, + "loss": 0.1194, + "step": 22450 + }, + { + "epoch": 1.232272228320527, + "grad_norm": 1.161370038986206, + "learning_rate": 7.420715903437239e-06, + "loss": 0.1638, + "step": 22452 + }, + { + "epoch": 1.2323819978046102, + "grad_norm": 0.7523512244224548, + "learning_rate": 7.416993394824914e-06, + "loss": 0.1405, + "step": 22454 + }, + { + "epoch": 1.2324917672886937, + "grad_norm": 0.800523579120636, + "learning_rate": 7.413271657489551e-06, + "loss": 0.1502, + "step": 22456 + }, + { + "epoch": 1.2326015367727772, + "grad_norm": 1.31660795211792, + "learning_rate": 7.409550691594422e-06, + "loss": 0.2024, + "step": 22458 + }, + { + "epoch": 1.2327113062568607, + "grad_norm": 1.014224648475647, + "learning_rate": 7.405830497302732e-06, + "loss": 0.22, + "step": 22460 + }, + { + "epoch": 1.2328210757409441, + "grad_norm": 1.123534917831421, + "learning_rate": 7.402111074777682e-06, + "loss": 0.1516, + "step": 22462 + }, + { + "epoch": 1.2329308452250274, + "grad_norm": 1.3211567401885986, + "learning_rate": 7.398392424182415e-06, + "loss": 0.2266, + "step": 22464 + }, + { + "epoch": 1.2330406147091109, + "grad_norm": 1.2530754804611206, + "learning_rate": 7.39467454568005e-06, + "loss": 0.1652, + "step": 22466 + }, + { + "epoch": 1.2331503841931943, + "grad_norm": 1.1518827676773071, + "learning_rate": 7.390957439433671e-06, + "loss": 0.2171, + "step": 22468 + }, + { + "epoch": 1.2332601536772776, + "grad_norm": 1.597799301147461, + "learning_rate": 7.387241105606321e-06, + "loss": 0.1932, + "step": 22470 + }, + { + "epoch": 1.233369923161361, + "grad_norm": 1.0699152946472168, + "learning_rate": 7.3835255443610334e-06, + "loss": 0.2899, + "step": 22472 + }, + { + "epoch": 1.2334796926454445, + "grad_norm": 1.6222678422927856, + "learning_rate": 7.3798107558607805e-06, + "loss": 0.2118, + "step": 22474 + }, + { + "epoch": 1.233589462129528, + "grad_norm": 1.1321250200271606, + "learning_rate": 7.3760967402685145e-06, + "loss": 0.1604, + "step": 22476 + }, + { + "epoch": 1.2336992316136115, + "grad_norm": 1.1980235576629639, + "learning_rate": 7.372383497747149e-06, + "loss": 0.1065, + "step": 22478 + }, + { + "epoch": 1.2338090010976948, + "grad_norm": 0.984845757484436, + "learning_rate": 7.368671028459564e-06, + "loss": 0.1384, + "step": 22480 + }, + { + "epoch": 1.2339187705817782, + "grad_norm": 3.398038864135742, + "learning_rate": 7.3649593325686e-06, + "loss": 0.1899, + "step": 22482 + }, + { + "epoch": 1.2340285400658617, + "grad_norm": 1.98301100730896, + "learning_rate": 7.3612484102370845e-06, + "loss": 0.3225, + "step": 22484 + }, + { + "epoch": 1.2341383095499452, + "grad_norm": 1.3404027223587036, + "learning_rate": 7.357538261627789e-06, + "loss": 0.2363, + "step": 22486 + }, + { + "epoch": 1.2342480790340284, + "grad_norm": 1.2663217782974243, + "learning_rate": 7.35382888690346e-06, + "loss": 0.2292, + "step": 22488 + }, + { + "epoch": 1.234357848518112, + "grad_norm": 1.8795892000198364, + "learning_rate": 7.350120286226803e-06, + "loss": 0.276, + "step": 22490 + }, + { + "epoch": 1.2344676180021954, + "grad_norm": 1.2929178476333618, + "learning_rate": 7.346412459760507e-06, + "loss": 0.1559, + "step": 22492 + }, + { + "epoch": 1.2345773874862789, + "grad_norm": 1.0620266199111938, + "learning_rate": 7.342705407667203e-06, + "loss": 0.1587, + "step": 22494 + }, + { + "epoch": 1.2346871569703621, + "grad_norm": 1.0875873565673828, + "learning_rate": 7.338999130109514e-06, + "loss": 0.2645, + "step": 22496 + }, + { + "epoch": 1.2347969264544456, + "grad_norm": 1.510339379310608, + "learning_rate": 7.335293627250006e-06, + "loss": 0.251, + "step": 22498 + }, + { + "epoch": 1.234906695938529, + "grad_norm": 1.3749611377716064, + "learning_rate": 7.3315888992512265e-06, + "loss": 0.1984, + "step": 22500 + }, + { + "epoch": 1.2350164654226126, + "grad_norm": 1.3025180101394653, + "learning_rate": 7.327884946275679e-06, + "loss": 0.1681, + "step": 22502 + }, + { + "epoch": 1.235126234906696, + "grad_norm": 1.5302987098693848, + "learning_rate": 7.324181768485835e-06, + "loss": 0.2804, + "step": 22504 + }, + { + "epoch": 1.2352360043907793, + "grad_norm": 1.9406379461288452, + "learning_rate": 7.320479366044131e-06, + "loss": 0.1999, + "step": 22506 + }, + { + "epoch": 1.2353457738748628, + "grad_norm": 1.3592156171798706, + "learning_rate": 7.316777739112985e-06, + "loss": 0.2553, + "step": 22508 + }, + { + "epoch": 1.2354555433589463, + "grad_norm": 1.1882072687149048, + "learning_rate": 7.313076887854764e-06, + "loss": 0.2351, + "step": 22510 + }, + { + "epoch": 1.2355653128430297, + "grad_norm": 1.298119306564331, + "learning_rate": 7.3093768124318005e-06, + "loss": 0.1706, + "step": 22512 + }, + { + "epoch": 1.235675082327113, + "grad_norm": 0.9508122205734253, + "learning_rate": 7.305677513006401e-06, + "loss": 0.2248, + "step": 22514 + }, + { + "epoch": 1.2357848518111965, + "grad_norm": 1.259103536605835, + "learning_rate": 7.301978989740835e-06, + "loss": 0.1368, + "step": 22516 + }, + { + "epoch": 1.23589462129528, + "grad_norm": 2.2603349685668945, + "learning_rate": 7.29828124279733e-06, + "loss": 0.1724, + "step": 22518 + }, + { + "epoch": 1.2360043907793634, + "grad_norm": 1.2321622371673584, + "learning_rate": 7.2945842723381035e-06, + "loss": 0.2769, + "step": 22520 + }, + { + "epoch": 1.2361141602634467, + "grad_norm": 0.9441007971763611, + "learning_rate": 7.290888078525307e-06, + "loss": 0.1754, + "step": 22522 + }, + { + "epoch": 1.2362239297475301, + "grad_norm": 1.0779701471328735, + "learning_rate": 7.287192661521086e-06, + "loss": 0.1588, + "step": 22524 + }, + { + "epoch": 1.2363336992316136, + "grad_norm": 1.1289056539535522, + "learning_rate": 7.283498021487536e-06, + "loss": 0.1809, + "step": 22526 + }, + { + "epoch": 1.236443468715697, + "grad_norm": 1.0833888053894043, + "learning_rate": 7.27980415858672e-06, + "loss": 0.1823, + "step": 22528 + }, + { + "epoch": 1.2365532381997806, + "grad_norm": 0.9970762729644775, + "learning_rate": 7.276111072980663e-06, + "loss": 0.1889, + "step": 22530 + }, + { + "epoch": 1.2366630076838638, + "grad_norm": 0.9372761845588684, + "learning_rate": 7.272418764831374e-06, + "loss": 0.2051, + "step": 22532 + }, + { + "epoch": 1.2367727771679473, + "grad_norm": 1.6096197366714478, + "learning_rate": 7.268727234300809e-06, + "loss": 0.175, + "step": 22534 + }, + { + "epoch": 1.2368825466520308, + "grad_norm": 1.4783194065093994, + "learning_rate": 7.265036481550902e-06, + "loss": 0.1898, + "step": 22536 + }, + { + "epoch": 1.236992316136114, + "grad_norm": 1.4647748470306396, + "learning_rate": 7.261346506743538e-06, + "loss": 0.1693, + "step": 22538 + }, + { + "epoch": 1.2371020856201975, + "grad_norm": 1.6127893924713135, + "learning_rate": 7.257657310040586e-06, + "loss": 0.1763, + "step": 22540 + }, + { + "epoch": 1.237211855104281, + "grad_norm": 1.072981834411621, + "learning_rate": 7.253968891603863e-06, + "loss": 0.1734, + "step": 22542 + }, + { + "epoch": 1.2373216245883645, + "grad_norm": 1.5545307397842407, + "learning_rate": 7.25028125159517e-06, + "loss": 0.1453, + "step": 22544 + }, + { + "epoch": 1.237431394072448, + "grad_norm": 1.199006199836731, + "learning_rate": 7.2465943901762646e-06, + "loss": 0.2102, + "step": 22546 + }, + { + "epoch": 1.2375411635565312, + "grad_norm": 1.5265530347824097, + "learning_rate": 7.24290830750887e-06, + "loss": 0.1989, + "step": 22548 + }, + { + "epoch": 1.2376509330406147, + "grad_norm": 1.3471190929412842, + "learning_rate": 7.239223003754672e-06, + "loss": 0.2244, + "step": 22550 + }, + { + "epoch": 1.2377607025246982, + "grad_norm": 1.174824833869934, + "learning_rate": 7.235538479075318e-06, + "loss": 0.1866, + "step": 22552 + }, + { + "epoch": 1.2378704720087816, + "grad_norm": 1.1422619819641113, + "learning_rate": 7.231854733632451e-06, + "loss": 0.2248, + "step": 22554 + }, + { + "epoch": 1.237980241492865, + "grad_norm": 1.523517370223999, + "learning_rate": 7.228171767587638e-06, + "loss": 0.1626, + "step": 22556 + }, + { + "epoch": 1.2380900109769484, + "grad_norm": 1.1472866535186768, + "learning_rate": 7.224489581102448e-06, + "loss": 0.2013, + "step": 22558 + }, + { + "epoch": 1.2381997804610319, + "grad_norm": 0.8469519019126892, + "learning_rate": 7.220808174338389e-06, + "loss": 0.1419, + "step": 22560 + }, + { + "epoch": 1.2383095499451153, + "grad_norm": 1.209791660308838, + "learning_rate": 7.217127547456953e-06, + "loss": 0.164, + "step": 22562 + }, + { + "epoch": 1.2384193194291986, + "grad_norm": 1.2592628002166748, + "learning_rate": 7.213447700619585e-06, + "loss": 0.2269, + "step": 22564 + }, + { + "epoch": 1.238529088913282, + "grad_norm": 1.3774809837341309, + "learning_rate": 7.209768633987704e-06, + "loss": 0.1866, + "step": 22566 + }, + { + "epoch": 1.2386388583973655, + "grad_norm": 1.309134602546692, + "learning_rate": 7.20609034772268e-06, + "loss": 0.1479, + "step": 22568 + }, + { + "epoch": 1.238748627881449, + "grad_norm": 1.0980141162872314, + "learning_rate": 7.202412841985878e-06, + "loss": 0.1829, + "step": 22570 + }, + { + "epoch": 1.2388583973655325, + "grad_norm": 0.8612945079803467, + "learning_rate": 7.198736116938609e-06, + "loss": 0.1403, + "step": 22572 + }, + { + "epoch": 1.2389681668496157, + "grad_norm": 1.1639747619628906, + "learning_rate": 7.195060172742144e-06, + "loss": 0.2067, + "step": 22574 + }, + { + "epoch": 1.2390779363336992, + "grad_norm": 0.8608453273773193, + "learning_rate": 7.191385009557733e-06, + "loss": 0.1553, + "step": 22576 + }, + { + "epoch": 1.2391877058177827, + "grad_norm": 1.5431439876556396, + "learning_rate": 7.187710627546576e-06, + "loss": 0.1717, + "step": 22578 + }, + { + "epoch": 1.239297475301866, + "grad_norm": 1.525649905204773, + "learning_rate": 7.184037026869867e-06, + "loss": 0.2687, + "step": 22580 + }, + { + "epoch": 1.2394072447859494, + "grad_norm": 1.8375293016433716, + "learning_rate": 7.180364207688734e-06, + "loss": 0.2192, + "step": 22582 + }, + { + "epoch": 1.239517014270033, + "grad_norm": 1.086472749710083, + "learning_rate": 7.176692170164295e-06, + "loss": 0.1355, + "step": 22584 + }, + { + "epoch": 1.2396267837541164, + "grad_norm": 1.2411589622497559, + "learning_rate": 7.173020914457623e-06, + "loss": 0.2047, + "step": 22586 + }, + { + "epoch": 1.2397365532381999, + "grad_norm": 1.352355718612671, + "learning_rate": 7.169350440729752e-06, + "loss": 0.2601, + "step": 22588 + }, + { + "epoch": 1.2398463227222831, + "grad_norm": 1.207548975944519, + "learning_rate": 7.16568074914169e-06, + "loss": 0.17, + "step": 22590 + }, + { + "epoch": 1.2399560922063666, + "grad_norm": 0.8392630815505981, + "learning_rate": 7.162011839854396e-06, + "loss": 0.2301, + "step": 22592 + }, + { + "epoch": 1.24006586169045, + "grad_norm": 0.8876519799232483, + "learning_rate": 7.158343713028826e-06, + "loss": 0.1608, + "step": 22594 + }, + { + "epoch": 1.2401756311745336, + "grad_norm": 1.3766453266143799, + "learning_rate": 7.154676368825872e-06, + "loss": 0.1834, + "step": 22596 + }, + { + "epoch": 1.2402854006586168, + "grad_norm": 1.358787178993225, + "learning_rate": 7.151009807406403e-06, + "loss": 0.1728, + "step": 22598 + }, + { + "epoch": 1.2403951701427003, + "grad_norm": 1.0698051452636719, + "learning_rate": 7.147344028931252e-06, + "loss": 0.2533, + "step": 22600 + }, + { + "epoch": 1.2405049396267838, + "grad_norm": 1.0800745487213135, + "learning_rate": 7.143679033561218e-06, + "loss": 0.208, + "step": 22602 + }, + { + "epoch": 1.2406147091108672, + "grad_norm": 1.299265742301941, + "learning_rate": 7.140014821457059e-06, + "loss": 0.187, + "step": 22604 + }, + { + "epoch": 1.2407244785949505, + "grad_norm": 0.9455879330635071, + "learning_rate": 7.136351392779517e-06, + "loss": 0.178, + "step": 22606 + }, + { + "epoch": 1.240834248079034, + "grad_norm": 1.2288038730621338, + "learning_rate": 7.132688747689284e-06, + "loss": 0.1481, + "step": 22608 + }, + { + "epoch": 1.2409440175631175, + "grad_norm": 1.1268054246902466, + "learning_rate": 7.12902688634702e-06, + "loss": 0.1275, + "step": 22610 + }, + { + "epoch": 1.241053787047201, + "grad_norm": 1.4840776920318604, + "learning_rate": 7.125365808913348e-06, + "loss": 0.2924, + "step": 22612 + }, + { + "epoch": 1.2411635565312844, + "grad_norm": 2.0230023860931396, + "learning_rate": 7.121705515548871e-06, + "loss": 0.1763, + "step": 22614 + }, + { + "epoch": 1.2412733260153677, + "grad_norm": 1.5775225162506104, + "learning_rate": 7.118046006414142e-06, + "loss": 0.3182, + "step": 22616 + }, + { + "epoch": 1.2413830954994511, + "grad_norm": 0.8596591949462891, + "learning_rate": 7.114387281669677e-06, + "loss": 0.1524, + "step": 22618 + }, + { + "epoch": 1.2414928649835346, + "grad_norm": 1.410813331604004, + "learning_rate": 7.11072934147598e-06, + "loss": 0.2111, + "step": 22620 + }, + { + "epoch": 1.241602634467618, + "grad_norm": 1.3795112371444702, + "learning_rate": 7.107072185993502e-06, + "loss": 0.193, + "step": 22622 + }, + { + "epoch": 1.2417124039517013, + "grad_norm": 0.9224848747253418, + "learning_rate": 7.103415815382661e-06, + "loss": 0.1338, + "step": 22624 + }, + { + "epoch": 1.2418221734357848, + "grad_norm": 1.089451551437378, + "learning_rate": 7.0997602298038426e-06, + "loss": 0.2218, + "step": 22626 + }, + { + "epoch": 1.2419319429198683, + "grad_norm": 2.046229362487793, + "learning_rate": 7.096105429417393e-06, + "loss": 0.1381, + "step": 22628 + }, + { + "epoch": 1.2420417124039518, + "grad_norm": 0.9029183983802795, + "learning_rate": 7.092451414383644e-06, + "loss": 0.1185, + "step": 22630 + }, + { + "epoch": 1.242151481888035, + "grad_norm": 1.1369997262954712, + "learning_rate": 7.088798184862872e-06, + "loss": 0.2026, + "step": 22632 + }, + { + "epoch": 1.2422612513721185, + "grad_norm": 1.4041637182235718, + "learning_rate": 7.085145741015325e-06, + "loss": 0.2117, + "step": 22634 + }, + { + "epoch": 1.242371020856202, + "grad_norm": 1.3276965618133545, + "learning_rate": 7.081494083001217e-06, + "loss": 0.1835, + "step": 22636 + }, + { + "epoch": 1.2424807903402855, + "grad_norm": 0.7801879048347473, + "learning_rate": 7.077843210980728e-06, + "loss": 0.13, + "step": 22638 + }, + { + "epoch": 1.242590559824369, + "grad_norm": 1.297910451889038, + "learning_rate": 7.074193125113996e-06, + "loss": 0.1895, + "step": 22640 + }, + { + "epoch": 1.2427003293084522, + "grad_norm": 0.8887372612953186, + "learning_rate": 7.070543825561138e-06, + "loss": 0.1488, + "step": 22642 + }, + { + "epoch": 1.2428100987925357, + "grad_norm": 0.7226913571357727, + "learning_rate": 7.066895312482239e-06, + "loss": 0.1028, + "step": 22644 + }, + { + "epoch": 1.2429198682766192, + "grad_norm": 1.4463465213775635, + "learning_rate": 7.0632475860373305e-06, + "loss": 0.2677, + "step": 22646 + }, + { + "epoch": 1.2430296377607024, + "grad_norm": 1.3438541889190674, + "learning_rate": 7.059600646386422e-06, + "loss": 0.2568, + "step": 22648 + }, + { + "epoch": 1.2431394072447859, + "grad_norm": 1.0453544855117798, + "learning_rate": 7.055954493689487e-06, + "loss": 0.1882, + "step": 22650 + }, + { + "epoch": 1.2432491767288694, + "grad_norm": 1.7929611206054688, + "learning_rate": 7.052309128106463e-06, + "loss": 0.1869, + "step": 22652 + }, + { + "epoch": 1.2433589462129528, + "grad_norm": 0.9726617336273193, + "learning_rate": 7.0486645497972435e-06, + "loss": 0.1178, + "step": 22654 + }, + { + "epoch": 1.2434687156970363, + "grad_norm": 1.3030788898468018, + "learning_rate": 7.045020758921716e-06, + "loss": 0.129, + "step": 22656 + }, + { + "epoch": 1.2435784851811196, + "grad_norm": 1.0782709121704102, + "learning_rate": 7.0413777556397055e-06, + "loss": 0.1991, + "step": 22658 + }, + { + "epoch": 1.243688254665203, + "grad_norm": 1.5485323667526245, + "learning_rate": 7.037735540111015e-06, + "loss": 0.2453, + "step": 22660 + }, + { + "epoch": 1.2437980241492865, + "grad_norm": 1.285422444343567, + "learning_rate": 7.034094112495404e-06, + "loss": 0.2012, + "step": 22662 + }, + { + "epoch": 1.24390779363337, + "grad_norm": 0.9944800734519958, + "learning_rate": 7.0304534729526025e-06, + "loss": 0.1143, + "step": 22664 + }, + { + "epoch": 1.2440175631174533, + "grad_norm": 1.177180528640747, + "learning_rate": 7.026813621642317e-06, + "loss": 0.1463, + "step": 22666 + }, + { + "epoch": 1.2441273326015367, + "grad_norm": 1.3479441404342651, + "learning_rate": 7.023174558724205e-06, + "loss": 0.145, + "step": 22668 + }, + { + "epoch": 1.2442371020856202, + "grad_norm": 1.231103777885437, + "learning_rate": 7.019536284357892e-06, + "loss": 0.2044, + "step": 22670 + }, + { + "epoch": 1.2443468715697037, + "grad_norm": 1.278333306312561, + "learning_rate": 7.015898798702964e-06, + "loss": 0.1642, + "step": 22672 + }, + { + "epoch": 1.244456641053787, + "grad_norm": 1.4221760034561157, + "learning_rate": 7.012262101918993e-06, + "loss": 0.2059, + "step": 22674 + }, + { + "epoch": 1.2445664105378704, + "grad_norm": 1.5365806818008423, + "learning_rate": 7.008626194165493e-06, + "loss": 0.4055, + "step": 22676 + }, + { + "epoch": 1.244676180021954, + "grad_norm": 1.4940308332443237, + "learning_rate": 7.00499107560195e-06, + "loss": 0.2205, + "step": 22678 + }, + { + "epoch": 1.2447859495060374, + "grad_norm": 0.9019755721092224, + "learning_rate": 7.00135674638783e-06, + "loss": 0.1582, + "step": 22680 + }, + { + "epoch": 1.2448957189901209, + "grad_norm": 1.299117922782898, + "learning_rate": 6.997723206682546e-06, + "loss": 0.1872, + "step": 22682 + }, + { + "epoch": 1.2450054884742041, + "grad_norm": 1.0099722146987915, + "learning_rate": 6.994090456645483e-06, + "loss": 0.1724, + "step": 22684 + }, + { + "epoch": 1.2451152579582876, + "grad_norm": 1.08290696144104, + "learning_rate": 6.9904584964359895e-06, + "loss": 0.1921, + "step": 22686 + }, + { + "epoch": 1.245225027442371, + "grad_norm": 1.497186541557312, + "learning_rate": 6.986827326213383e-06, + "loss": 0.1896, + "step": 22688 + }, + { + "epoch": 1.2453347969264543, + "grad_norm": 1.3472979068756104, + "learning_rate": 6.983196946136936e-06, + "loss": 0.2398, + "step": 22690 + }, + { + "epoch": 1.2454445664105378, + "grad_norm": 1.3595190048217773, + "learning_rate": 6.9795673563659096e-06, + "loss": 0.2619, + "step": 22692 + }, + { + "epoch": 1.2455543358946213, + "grad_norm": 1.5245474576950073, + "learning_rate": 6.975938557059508e-06, + "loss": 0.2413, + "step": 22694 + }, + { + "epoch": 1.2456641053787048, + "grad_norm": 0.8398270010948181, + "learning_rate": 6.972310548376909e-06, + "loss": 0.1622, + "step": 22696 + }, + { + "epoch": 1.2457738748627882, + "grad_norm": 1.1644445657730103, + "learning_rate": 6.9686833304772575e-06, + "loss": 0.277, + "step": 22698 + }, + { + "epoch": 1.2458836443468715, + "grad_norm": 0.8608121871948242, + "learning_rate": 6.9650569035196484e-06, + "loss": 0.1806, + "step": 22700 + }, + { + "epoch": 1.245993413830955, + "grad_norm": 1.5064254999160767, + "learning_rate": 6.961431267663171e-06, + "loss": 0.2134, + "step": 22702 + }, + { + "epoch": 1.2461031833150384, + "grad_norm": 0.7873255610466003, + "learning_rate": 6.9578064230668485e-06, + "loss": 0.1962, + "step": 22704 + }, + { + "epoch": 1.246212952799122, + "grad_norm": 0.9920089840888977, + "learning_rate": 6.9541823698897e-06, + "loss": 0.1953, + "step": 22706 + }, + { + "epoch": 1.2463227222832052, + "grad_norm": 1.641886591911316, + "learning_rate": 6.9505591082906885e-06, + "loss": 0.2457, + "step": 22708 + }, + { + "epoch": 1.2464324917672887, + "grad_norm": 1.1249969005584717, + "learning_rate": 6.946936638428747e-06, + "loss": 0.1973, + "step": 22710 + }, + { + "epoch": 1.2465422612513721, + "grad_norm": 0.8354945182800293, + "learning_rate": 6.943314960462774e-06, + "loss": 0.1521, + "step": 22712 + }, + { + "epoch": 1.2466520307354556, + "grad_norm": 1.4751533269882202, + "learning_rate": 6.939694074551625e-06, + "loss": 0.329, + "step": 22714 + }, + { + "epoch": 1.2467618002195389, + "grad_norm": 1.1273638010025024, + "learning_rate": 6.936073980854147e-06, + "loss": 0.2232, + "step": 22716 + }, + { + "epoch": 1.2468715697036223, + "grad_norm": 1.2940133810043335, + "learning_rate": 6.932454679529129e-06, + "loss": 0.2099, + "step": 22718 + }, + { + "epoch": 1.2469813391877058, + "grad_norm": 1.7768298387527466, + "learning_rate": 6.928836170735328e-06, + "loss": 0.1383, + "step": 22720 + }, + { + "epoch": 1.2470911086717893, + "grad_norm": 1.0622460842132568, + "learning_rate": 6.925218454631471e-06, + "loss": 0.1491, + "step": 22722 + }, + { + "epoch": 1.2472008781558728, + "grad_norm": 1.326055884361267, + "learning_rate": 6.921601531376248e-06, + "loss": 0.1743, + "step": 22724 + }, + { + "epoch": 1.247310647639956, + "grad_norm": 0.9458357691764832, + "learning_rate": 6.91798540112831e-06, + "loss": 0.1531, + "step": 22726 + }, + { + "epoch": 1.2474204171240395, + "grad_norm": 1.0175588130950928, + "learning_rate": 6.914370064046291e-06, + "loss": 0.2441, + "step": 22728 + }, + { + "epoch": 1.247530186608123, + "grad_norm": 1.0304728746414185, + "learning_rate": 6.91075552028877e-06, + "loss": 0.1344, + "step": 22730 + }, + { + "epoch": 1.2476399560922065, + "grad_norm": 1.1122769117355347, + "learning_rate": 6.907141770014291e-06, + "loss": 0.1931, + "step": 22732 + }, + { + "epoch": 1.2477497255762897, + "grad_norm": 1.1295298337936401, + "learning_rate": 6.903528813381388e-06, + "loss": 0.1868, + "step": 22734 + }, + { + "epoch": 1.2478594950603732, + "grad_norm": 1.2190971374511719, + "learning_rate": 6.899916650548535e-06, + "loss": 0.1527, + "step": 22736 + }, + { + "epoch": 1.2479692645444567, + "grad_norm": 4.402713298797607, + "learning_rate": 6.896305281674176e-06, + "loss": 0.1557, + "step": 22738 + }, + { + "epoch": 1.2480790340285401, + "grad_norm": 0.7555407881736755, + "learning_rate": 6.892694706916719e-06, + "loss": 0.1466, + "step": 22740 + }, + { + "epoch": 1.2481888035126234, + "grad_norm": 1.3049806356430054, + "learning_rate": 6.889084926434555e-06, + "loss": 0.1107, + "step": 22742 + }, + { + "epoch": 1.2482985729967069, + "grad_norm": 1.2263044118881226, + "learning_rate": 6.885475940386022e-06, + "loss": 0.1544, + "step": 22744 + }, + { + "epoch": 1.2484083424807904, + "grad_norm": 1.3911317586898804, + "learning_rate": 6.881867748929422e-06, + "loss": 0.2374, + "step": 22746 + }, + { + "epoch": 1.2485181119648738, + "grad_norm": 0.9015282988548279, + "learning_rate": 6.8782603522230314e-06, + "loss": 0.1133, + "step": 22748 + }, + { + "epoch": 1.2486278814489573, + "grad_norm": 0.6570688486099243, + "learning_rate": 6.874653750425083e-06, + "loss": 0.1258, + "step": 22750 + }, + { + "epoch": 1.2487376509330406, + "grad_norm": 1.4683079719543457, + "learning_rate": 6.871047943693793e-06, + "loss": 0.1864, + "step": 22752 + }, + { + "epoch": 1.248847420417124, + "grad_norm": 1.034024715423584, + "learning_rate": 6.867442932187324e-06, + "loss": 0.1731, + "step": 22754 + }, + { + "epoch": 1.2489571899012075, + "grad_norm": 0.8927130699157715, + "learning_rate": 6.863838716063803e-06, + "loss": 0.1614, + "step": 22756 + }, + { + "epoch": 1.2490669593852908, + "grad_norm": 3.168154001235962, + "learning_rate": 6.860235295481337e-06, + "loss": 0.1799, + "step": 22758 + }, + { + "epoch": 1.2491767288693743, + "grad_norm": 1.0853086709976196, + "learning_rate": 6.856632670597987e-06, + "loss": 0.2235, + "step": 22760 + }, + { + "epoch": 1.2492864983534577, + "grad_norm": 1.0335357189178467, + "learning_rate": 6.85303084157177e-06, + "loss": 0.1725, + "step": 22762 + }, + { + "epoch": 1.2493962678375412, + "grad_norm": 1.000270962715149, + "learning_rate": 6.849429808560692e-06, + "loss": 0.2461, + "step": 22764 + }, + { + "epoch": 1.2495060373216247, + "grad_norm": 1.3950073719024658, + "learning_rate": 6.84582957172272e-06, + "loss": 0.2994, + "step": 22766 + }, + { + "epoch": 1.249615806805708, + "grad_norm": 0.9382147192955017, + "learning_rate": 6.8422301312157686e-06, + "loss": 0.1823, + "step": 22768 + }, + { + "epoch": 1.2497255762897914, + "grad_norm": 1.0983797311782837, + "learning_rate": 6.838631487197728e-06, + "loss": 0.1708, + "step": 22770 + }, + { + "epoch": 1.249835345773875, + "grad_norm": 0.6640132665634155, + "learning_rate": 6.8350336398264496e-06, + "loss": 0.1488, + "step": 22772 + }, + { + "epoch": 1.2499451152579584, + "grad_norm": 1.56241774559021, + "learning_rate": 6.8314365892597545e-06, + "loss": 0.1807, + "step": 22774 + }, + { + "epoch": 1.2500548847420418, + "grad_norm": 1.364633321762085, + "learning_rate": 6.827840335655419e-06, + "loss": 0.1619, + "step": 22776 + }, + { + "epoch": 1.250164654226125, + "grad_norm": 1.0089436769485474, + "learning_rate": 6.82424487917121e-06, + "loss": 0.2167, + "step": 22778 + }, + { + "epoch": 1.2502744237102086, + "grad_norm": 1.0758774280548096, + "learning_rate": 6.820650219964833e-06, + "loss": 0.1916, + "step": 22780 + }, + { + "epoch": 1.250384193194292, + "grad_norm": 1.284642219543457, + "learning_rate": 6.817056358193965e-06, + "loss": 0.2267, + "step": 22782 + }, + { + "epoch": 1.2504939626783753, + "grad_norm": 0.911766529083252, + "learning_rate": 6.813463294016254e-06, + "loss": 0.1165, + "step": 22784 + }, + { + "epoch": 1.2506037321624588, + "grad_norm": 2.0156757831573486, + "learning_rate": 6.809871027589307e-06, + "loss": 0.2632, + "step": 22786 + }, + { + "epoch": 1.2507135016465423, + "grad_norm": 0.9454084038734436, + "learning_rate": 6.806279559070691e-06, + "loss": 0.24, + "step": 22788 + }, + { + "epoch": 1.2508232711306257, + "grad_norm": 1.2474169731140137, + "learning_rate": 6.802688888617961e-06, + "loss": 0.1921, + "step": 22790 + }, + { + "epoch": 1.2509330406147092, + "grad_norm": 1.4269804954528809, + "learning_rate": 6.799099016388613e-06, + "loss": 0.3041, + "step": 22792 + }, + { + "epoch": 1.2510428100987925, + "grad_norm": 1.7602440118789673, + "learning_rate": 6.7955099425401125e-06, + "loss": 0.2369, + "step": 22794 + }, + { + "epoch": 1.251152579582876, + "grad_norm": 1.4332729578018188, + "learning_rate": 6.791921667229906e-06, + "loss": 0.2019, + "step": 22796 + }, + { + "epoch": 1.2512623490669594, + "grad_norm": 1.4810290336608887, + "learning_rate": 6.7883341906153834e-06, + "loss": 0.1832, + "step": 22798 + }, + { + "epoch": 1.2513721185510427, + "grad_norm": 0.8385092616081238, + "learning_rate": 6.784747512853906e-06, + "loss": 0.1341, + "step": 22800 + }, + { + "epoch": 1.2514818880351262, + "grad_norm": 1.2766426801681519, + "learning_rate": 6.7811616341028145e-06, + "loss": 0.2142, + "step": 22802 + }, + { + "epoch": 1.2515916575192096, + "grad_norm": 1.284990668296814, + "learning_rate": 6.777576554519399e-06, + "loss": 0.1641, + "step": 22804 + }, + { + "epoch": 1.2517014270032931, + "grad_norm": 1.3312206268310547, + "learning_rate": 6.773992274260915e-06, + "loss": 0.3782, + "step": 22806 + }, + { + "epoch": 1.2518111964873766, + "grad_norm": 1.4951950311660767, + "learning_rate": 6.77040879348459e-06, + "loss": 0.2048, + "step": 22808 + }, + { + "epoch": 1.2519209659714599, + "grad_norm": 0.9521315097808838, + "learning_rate": 6.7668261123476105e-06, + "loss": 0.2638, + "step": 22810 + }, + { + "epoch": 1.2520307354555433, + "grad_norm": 1.1795544624328613, + "learning_rate": 6.7632442310071235e-06, + "loss": 0.1865, + "step": 22812 + }, + { + "epoch": 1.2521405049396268, + "grad_norm": 1.1806374788284302, + "learning_rate": 6.759663149620263e-06, + "loss": 0.1315, + "step": 22814 + }, + { + "epoch": 1.2522502744237103, + "grad_norm": 1.2754676342010498, + "learning_rate": 6.757872908958534e-06, + "loss": 0.2274, + "step": 22816 + }, + { + "epoch": 1.2523600439077938, + "grad_norm": 1.1309142112731934, + "learning_rate": 6.754293027796615e-06, + "loss": 0.2231, + "step": 22818 + }, + { + "epoch": 1.252469813391877, + "grad_norm": 0.9044373631477356, + "learning_rate": 6.750713946980969e-06, + "loss": 0.1184, + "step": 22820 + }, + { + "epoch": 1.2525795828759605, + "grad_norm": 1.1964008808135986, + "learning_rate": 6.747135666668581e-06, + "loss": 0.1403, + "step": 22822 + }, + { + "epoch": 1.252689352360044, + "grad_norm": 1.2907620668411255, + "learning_rate": 6.743558187016405e-06, + "loss": 0.2439, + "step": 22824 + }, + { + "epoch": 1.2527991218441272, + "grad_norm": 1.1444555521011353, + "learning_rate": 6.739981508181384e-06, + "loss": 0.2038, + "step": 22826 + }, + { + "epoch": 1.2529088913282107, + "grad_norm": 1.150033950805664, + "learning_rate": 6.7364056303204e-06, + "loss": 0.2646, + "step": 22828 + }, + { + "epoch": 1.2530186608122942, + "grad_norm": 0.8444405198097229, + "learning_rate": 6.732830553590305e-06, + "loss": 0.2478, + "step": 22830 + }, + { + "epoch": 1.2531284302963777, + "grad_norm": 1.094275951385498, + "learning_rate": 6.729256278147917e-06, + "loss": 0.1666, + "step": 22832 + }, + { + "epoch": 1.2532381997804611, + "grad_norm": 1.2355315685272217, + "learning_rate": 6.725682804150032e-06, + "loss": 0.2393, + "step": 22834 + }, + { + "epoch": 1.2533479692645444, + "grad_norm": 1.0444546937942505, + "learning_rate": 6.722110131753398e-06, + "loss": 0.22, + "step": 22836 + }, + { + "epoch": 1.2534577387486279, + "grad_norm": 0.9059786200523376, + "learning_rate": 6.718538261114726e-06, + "loss": 0.1285, + "step": 22838 + }, + { + "epoch": 1.2535675082327113, + "grad_norm": 1.4999918937683105, + "learning_rate": 6.7149671923906945e-06, + "loss": 0.294, + "step": 22840 + }, + { + "epoch": 1.2536772777167946, + "grad_norm": 0.9400800466537476, + "learning_rate": 6.711396925737953e-06, + "loss": 0.2032, + "step": 22842 + }, + { + "epoch": 1.253787047200878, + "grad_norm": 1.2806663513183594, + "learning_rate": 6.707827461313099e-06, + "loss": 0.147, + "step": 22844 + }, + { + "epoch": 1.2538968166849616, + "grad_norm": 0.9769928455352783, + "learning_rate": 6.704258799272722e-06, + "loss": 0.2922, + "step": 22846 + }, + { + "epoch": 1.254006586169045, + "grad_norm": 0.9513059258460999, + "learning_rate": 6.7006909397733575e-06, + "loss": 0.2408, + "step": 22848 + }, + { + "epoch": 1.2541163556531285, + "grad_norm": 1.3629800081253052, + "learning_rate": 6.697123882971507e-06, + "loss": 0.1861, + "step": 22850 + }, + { + "epoch": 1.2542261251372118, + "grad_norm": 1.0251595973968506, + "learning_rate": 6.6935576290236366e-06, + "loss": 0.1601, + "step": 22852 + }, + { + "epoch": 1.2543358946212952, + "grad_norm": 1.2647504806518555, + "learning_rate": 6.689992178086174e-06, + "loss": 0.1727, + "step": 22854 + }, + { + "epoch": 1.2544456641053787, + "grad_norm": 1.3246192932128906, + "learning_rate": 6.686427530315534e-06, + "loss": 0.1903, + "step": 22856 + }, + { + "epoch": 1.2545554335894622, + "grad_norm": 1.3038630485534668, + "learning_rate": 6.6828636858680625e-06, + "loss": 0.2261, + "step": 22858 + }, + { + "epoch": 1.2546652030735457, + "grad_norm": 1.9382150173187256, + "learning_rate": 6.679300644900105e-06, + "loss": 0.2266, + "step": 22860 + }, + { + "epoch": 1.254774972557629, + "grad_norm": 1.3346787691116333, + "learning_rate": 6.675738407567941e-06, + "loss": 0.2327, + "step": 22862 + }, + { + "epoch": 1.2548847420417124, + "grad_norm": 1.1529620885849, + "learning_rate": 6.672176974027836e-06, + "loss": 0.4199, + "step": 22864 + }, + { + "epoch": 1.2549945115257959, + "grad_norm": 0.9398683309555054, + "learning_rate": 6.668616344436004e-06, + "loss": 0.1561, + "step": 22866 + }, + { + "epoch": 1.2551042810098791, + "grad_norm": 0.9538170099258423, + "learning_rate": 6.665056518948629e-06, + "loss": 0.1191, + "step": 22868 + }, + { + "epoch": 1.2552140504939626, + "grad_norm": 0.9826436042785645, + "learning_rate": 6.661497497721872e-06, + "loss": 0.1674, + "step": 22870 + }, + { + "epoch": 1.255323819978046, + "grad_norm": 1.0639485120773315, + "learning_rate": 6.657939280911848e-06, + "loss": 0.1436, + "step": 22872 + }, + { + "epoch": 1.2554335894621296, + "grad_norm": 0.9920739531517029, + "learning_rate": 6.654381868674636e-06, + "loss": 0.2433, + "step": 22874 + }, + { + "epoch": 1.255543358946213, + "grad_norm": 0.7758328318595886, + "learning_rate": 6.650825261166283e-06, + "loss": 0.2313, + "step": 22876 + }, + { + "epoch": 1.2556531284302963, + "grad_norm": 1.4143918752670288, + "learning_rate": 6.647269458542793e-06, + "loss": 0.2551, + "step": 22878 + }, + { + "epoch": 1.2557628979143798, + "grad_norm": 1.0887744426727295, + "learning_rate": 6.643714460960137e-06, + "loss": 0.2265, + "step": 22880 + }, + { + "epoch": 1.2558726673984633, + "grad_norm": 2.6310880184173584, + "learning_rate": 6.640160268574275e-06, + "loss": 0.1938, + "step": 22882 + }, + { + "epoch": 1.2559824368825465, + "grad_norm": 1.0396640300750732, + "learning_rate": 6.636606881541094e-06, + "loss": 0.1336, + "step": 22884 + }, + { + "epoch": 1.2560922063666302, + "grad_norm": 1.3371444940567017, + "learning_rate": 6.6330543000164645e-06, + "loss": 0.2104, + "step": 22886 + }, + { + "epoch": 1.2562019758507135, + "grad_norm": 1.0331941843032837, + "learning_rate": 6.629502524156228e-06, + "loss": 0.1176, + "step": 22888 + }, + { + "epoch": 1.256311745334797, + "grad_norm": 1.1655104160308838, + "learning_rate": 6.625951554116181e-06, + "loss": 0.1694, + "step": 22890 + }, + { + "epoch": 1.2564215148188804, + "grad_norm": 1.177101731300354, + "learning_rate": 6.622401390052083e-06, + "loss": 0.2277, + "step": 22892 + }, + { + "epoch": 1.2565312843029637, + "grad_norm": 1.4985707998275757, + "learning_rate": 6.618852032119654e-06, + "loss": 0.2745, + "step": 22894 + }, + { + "epoch": 1.2566410537870472, + "grad_norm": 0.687092125415802, + "learning_rate": 6.615303480474603e-06, + "loss": 0.1215, + "step": 22896 + }, + { + "epoch": 1.2567508232711306, + "grad_norm": 1.5492537021636963, + "learning_rate": 6.611755735272579e-06, + "loss": 0.1958, + "step": 22898 + }, + { + "epoch": 1.2568605927552141, + "grad_norm": 1.44817054271698, + "learning_rate": 6.608208796669205e-06, + "loss": 0.1389, + "step": 22900 + }, + { + "epoch": 1.2569703622392976, + "grad_norm": 0.7165293097496033, + "learning_rate": 6.604662664820063e-06, + "loss": 0.1289, + "step": 22902 + }, + { + "epoch": 1.2570801317233808, + "grad_norm": 1.5996320247650146, + "learning_rate": 6.6011173398807e-06, + "loss": 0.2411, + "step": 22904 + }, + { + "epoch": 1.2571899012074643, + "grad_norm": 1.3125206232070923, + "learning_rate": 6.5975728220066425e-06, + "loss": 0.228, + "step": 22906 + }, + { + "epoch": 1.2572996706915478, + "grad_norm": 1.6210341453552246, + "learning_rate": 6.594029111353367e-06, + "loss": 0.2131, + "step": 22908 + }, + { + "epoch": 1.257409440175631, + "grad_norm": 1.3700881004333496, + "learning_rate": 6.590486208076318e-06, + "loss": 0.2271, + "step": 22910 + }, + { + "epoch": 1.2575192096597145, + "grad_norm": 1.1112316846847534, + "learning_rate": 6.5869441123309005e-06, + "loss": 0.1541, + "step": 22912 + }, + { + "epoch": 1.257628979143798, + "grad_norm": 0.9159445762634277, + "learning_rate": 6.583402824272494e-06, + "loss": 0.1342, + "step": 22914 + }, + { + "epoch": 1.2577387486278815, + "grad_norm": 1.0326000452041626, + "learning_rate": 6.579862344056423e-06, + "loss": 0.215, + "step": 22916 + }, + { + "epoch": 1.257848518111965, + "grad_norm": 0.9849622249603271, + "learning_rate": 6.576322671838003e-06, + "loss": 0.1882, + "step": 22918 + }, + { + "epoch": 1.2579582875960482, + "grad_norm": 0.9829431772232056, + "learning_rate": 6.572783807772506e-06, + "loss": 0.1488, + "step": 22920 + }, + { + "epoch": 1.2580680570801317, + "grad_norm": 1.4877673387527466, + "learning_rate": 6.569245752015157e-06, + "loss": 0.1808, + "step": 22922 + }, + { + "epoch": 1.2581778265642152, + "grad_norm": 2.393845558166504, + "learning_rate": 6.5657085047211546e-06, + "loss": 0.15, + "step": 22924 + }, + { + "epoch": 1.2582875960482987, + "grad_norm": 1.433558702468872, + "learning_rate": 6.562172066045655e-06, + "loss": 0.1812, + "step": 22926 + }, + { + "epoch": 1.2583973655323821, + "grad_norm": 1.041784644126892, + "learning_rate": 6.558636436143789e-06, + "loss": 0.1732, + "step": 22928 + }, + { + "epoch": 1.2585071350164654, + "grad_norm": 1.2233123779296875, + "learning_rate": 6.555101615170636e-06, + "loss": 0.2643, + "step": 22930 + }, + { + "epoch": 1.2586169045005489, + "grad_norm": 1.2941150665283203, + "learning_rate": 6.551567603281267e-06, + "loss": 0.1907, + "step": 22932 + }, + { + "epoch": 1.2587266739846323, + "grad_norm": 0.9223312139511108, + "learning_rate": 6.548034400630692e-06, + "loss": 0.127, + "step": 22934 + }, + { + "epoch": 1.2588364434687156, + "grad_norm": 1.5569124221801758, + "learning_rate": 6.544502007373898e-06, + "loss": 0.1278, + "step": 22936 + }, + { + "epoch": 1.258946212952799, + "grad_norm": 1.3826022148132324, + "learning_rate": 6.540970423665829e-06, + "loss": 0.2345, + "step": 22938 + }, + { + "epoch": 1.2590559824368825, + "grad_norm": 0.9459128975868225, + "learning_rate": 6.5374396496614e-06, + "loss": 0.2588, + "step": 22940 + }, + { + "epoch": 1.259165751920966, + "grad_norm": 1.3980733156204224, + "learning_rate": 6.533909685515483e-06, + "loss": 0.2155, + "step": 22942 + }, + { + "epoch": 1.2592755214050495, + "grad_norm": 0.7193322777748108, + "learning_rate": 6.530380531382927e-06, + "loss": 0.1334, + "step": 22944 + }, + { + "epoch": 1.2593852908891328, + "grad_norm": 1.1445949077606201, + "learning_rate": 6.5268521874185304e-06, + "loss": 0.1663, + "step": 22946 + }, + { + "epoch": 1.2594950603732162, + "grad_norm": 1.2603946924209595, + "learning_rate": 6.523324653777074e-06, + "loss": 0.15, + "step": 22948 + }, + { + "epoch": 1.2596048298572997, + "grad_norm": 1.5287141799926758, + "learning_rate": 6.519797930613289e-06, + "loss": 0.285, + "step": 22950 + }, + { + "epoch": 1.259714599341383, + "grad_norm": 1.3621352910995483, + "learning_rate": 6.5162720180818705e-06, + "loss": 0.1622, + "step": 22952 + }, + { + "epoch": 1.2598243688254664, + "grad_norm": 0.7698501348495483, + "learning_rate": 6.512746916337481e-06, + "loss": 0.1764, + "step": 22954 + }, + { + "epoch": 1.25993413830955, + "grad_norm": 1.2796804904937744, + "learning_rate": 6.509222625534755e-06, + "loss": 0.3042, + "step": 22956 + }, + { + "epoch": 1.2600439077936334, + "grad_norm": 1.9934591054916382, + "learning_rate": 6.505699145828286e-06, + "loss": 0.345, + "step": 22958 + }, + { + "epoch": 1.2601536772777169, + "grad_norm": 0.8450225591659546, + "learning_rate": 6.502176477372629e-06, + "loss": 0.1197, + "step": 22960 + }, + { + "epoch": 1.2602634467618001, + "grad_norm": 1.2364450693130493, + "learning_rate": 6.498654620322303e-06, + "loss": 0.1383, + "step": 22962 + }, + { + "epoch": 1.2603732162458836, + "grad_norm": 0.8545176386833191, + "learning_rate": 6.495133574831794e-06, + "loss": 0.1816, + "step": 22964 + }, + { + "epoch": 1.260482985729967, + "grad_norm": 1.048844337463379, + "learning_rate": 6.4916133410555466e-06, + "loss": 0.1232, + "step": 22966 + }, + { + "epoch": 1.2605927552140506, + "grad_norm": 2.713144540786743, + "learning_rate": 6.488093919147991e-06, + "loss": 0.2896, + "step": 22968 + }, + { + "epoch": 1.260702524698134, + "grad_norm": 1.6585410833358765, + "learning_rate": 6.484575309263499e-06, + "loss": 0.2269, + "step": 22970 + }, + { + "epoch": 1.2608122941822173, + "grad_norm": 0.9012830853462219, + "learning_rate": 6.48105751155641e-06, + "loss": 0.1712, + "step": 22972 + }, + { + "epoch": 1.2609220636663008, + "grad_norm": 1.1511582136154175, + "learning_rate": 6.4775405261810364e-06, + "loss": 0.1144, + "step": 22974 + }, + { + "epoch": 1.2610318331503843, + "grad_norm": 1.3702363967895508, + "learning_rate": 6.474024353291641e-06, + "loss": 0.2378, + "step": 22976 + }, + { + "epoch": 1.2611416026344675, + "grad_norm": 1.1917412281036377, + "learning_rate": 6.470508993042476e-06, + "loss": 0.1906, + "step": 22978 + }, + { + "epoch": 1.261251372118551, + "grad_norm": 1.7677218914031982, + "learning_rate": 6.466994445587729e-06, + "loss": 0.2442, + "step": 22980 + }, + { + "epoch": 1.2613611416026345, + "grad_norm": 0.9686455130577087, + "learning_rate": 6.463480711081576e-06, + "loss": 0.1938, + "step": 22982 + }, + { + "epoch": 1.261470911086718, + "grad_norm": 0.9799404740333557, + "learning_rate": 6.459967789678142e-06, + "loss": 0.1571, + "step": 22984 + }, + { + "epoch": 1.2615806805708014, + "grad_norm": 1.0808769464492798, + "learning_rate": 6.456455681531523e-06, + "loss": 0.0881, + "step": 22986 + }, + { + "epoch": 1.2616904500548847, + "grad_norm": 0.8687312006950378, + "learning_rate": 6.452944386795773e-06, + "loss": 0.2388, + "step": 22988 + }, + { + "epoch": 1.2618002195389681, + "grad_norm": 1.0714765787124634, + "learning_rate": 6.449433905624916e-06, + "loss": 0.1306, + "step": 22990 + }, + { + "epoch": 1.2619099890230516, + "grad_norm": 1.092360496520996, + "learning_rate": 6.4459242381729315e-06, + "loss": 0.1614, + "step": 22992 + }, + { + "epoch": 1.2620197585071349, + "grad_norm": 0.914472222328186, + "learning_rate": 6.4424153845937864e-06, + "loss": 0.1278, + "step": 22994 + }, + { + "epoch": 1.2621295279912184, + "grad_norm": 1.4491647481918335, + "learning_rate": 6.43890734504139e-06, + "loss": 0.1741, + "step": 22996 + }, + { + "epoch": 1.2622392974753018, + "grad_norm": 0.8660205006599426, + "learning_rate": 6.435400119669618e-06, + "loss": 0.1839, + "step": 22998 + }, + { + "epoch": 1.2623490669593853, + "grad_norm": 1.1846678256988525, + "learning_rate": 6.431893708632319e-06, + "loss": 0.1826, + "step": 23000 + }, + { + "epoch": 1.2624588364434688, + "grad_norm": 0.7308275699615479, + "learning_rate": 6.428388112083295e-06, + "loss": 0.1467, + "step": 23002 + }, + { + "epoch": 1.262568605927552, + "grad_norm": 1.161937952041626, + "learning_rate": 6.424883330176326e-06, + "loss": 0.1774, + "step": 23004 + }, + { + "epoch": 1.2626783754116355, + "grad_norm": 1.627557635307312, + "learning_rate": 6.421379363065142e-06, + "loss": 0.2233, + "step": 23006 + }, + { + "epoch": 1.262788144895719, + "grad_norm": 1.1150251626968384, + "learning_rate": 6.417876210903454e-06, + "loss": 0.1516, + "step": 23008 + }, + { + "epoch": 1.2628979143798025, + "grad_norm": 1.7750741243362427, + "learning_rate": 6.4143738738449225e-06, + "loss": 0.2372, + "step": 23010 + }, + { + "epoch": 1.263007683863886, + "grad_norm": 1.0814695358276367, + "learning_rate": 6.410872352043176e-06, + "loss": 0.1722, + "step": 23012 + }, + { + "epoch": 1.2631174533479692, + "grad_norm": 1.2036547660827637, + "learning_rate": 6.407371645651808e-06, + "loss": 0.1825, + "step": 23014 + }, + { + "epoch": 1.2632272228320527, + "grad_norm": 1.4383811950683594, + "learning_rate": 6.403871754824373e-06, + "loss": 0.1211, + "step": 23016 + }, + { + "epoch": 1.2633369923161362, + "grad_norm": 0.8779545426368713, + "learning_rate": 6.400372679714403e-06, + "loss": 0.1864, + "step": 23018 + }, + { + "epoch": 1.2634467618002194, + "grad_norm": 1.149872899055481, + "learning_rate": 6.396874420475379e-06, + "loss": 0.1998, + "step": 23020 + }, + { + "epoch": 1.263556531284303, + "grad_norm": 1.1726359128952026, + "learning_rate": 6.3933769772607535e-06, + "loss": 0.268, + "step": 23022 + }, + { + "epoch": 1.2636663007683864, + "grad_norm": 1.156317114830017, + "learning_rate": 6.3898803502239425e-06, + "loss": 0.2249, + "step": 23024 + }, + { + "epoch": 1.2637760702524699, + "grad_norm": 0.8038113117218018, + "learning_rate": 6.38638453951832e-06, + "loss": 0.1513, + "step": 23026 + }, + { + "epoch": 1.2638858397365533, + "grad_norm": 1.324406385421753, + "learning_rate": 6.382889545297227e-06, + "loss": 0.2027, + "step": 23028 + }, + { + "epoch": 1.2639956092206366, + "grad_norm": 1.1190769672393799, + "learning_rate": 6.379395367713984e-06, + "loss": 0.2025, + "step": 23030 + }, + { + "epoch": 1.26410537870472, + "grad_norm": 1.0101279020309448, + "learning_rate": 6.375902006921855e-06, + "loss": 0.1781, + "step": 23032 + }, + { + "epoch": 1.2642151481888035, + "grad_norm": 1.1026091575622559, + "learning_rate": 6.3724094630740776e-06, + "loss": 0.2676, + "step": 23034 + }, + { + "epoch": 1.264324917672887, + "grad_norm": 0.9664697051048279, + "learning_rate": 6.368917736323843e-06, + "loss": 0.1714, + "step": 23036 + }, + { + "epoch": 1.2644346871569705, + "grad_norm": 1.3497592210769653, + "learning_rate": 6.365426826824328e-06, + "loss": 0.1643, + "step": 23038 + }, + { + "epoch": 1.2645444566410537, + "grad_norm": 1.1937869787216187, + "learning_rate": 6.361936734728652e-06, + "loss": 0.1836, + "step": 23040 + }, + { + "epoch": 1.2646542261251372, + "grad_norm": 1.1122510433197021, + "learning_rate": 6.358447460189917e-06, + "loss": 0.1593, + "step": 23042 + }, + { + "epoch": 1.2647639956092207, + "grad_norm": 1.1940782070159912, + "learning_rate": 6.354959003361177e-06, + "loss": 0.1546, + "step": 23044 + }, + { + "epoch": 1.264873765093304, + "grad_norm": 1.1308106184005737, + "learning_rate": 6.3514713643954475e-06, + "loss": 0.1736, + "step": 23046 + }, + { + "epoch": 1.2649835345773874, + "grad_norm": 1.4155635833740234, + "learning_rate": 6.347984543445718e-06, + "loss": 0.4073, + "step": 23048 + }, + { + "epoch": 1.265093304061471, + "grad_norm": 1.1438958644866943, + "learning_rate": 6.344498540664936e-06, + "loss": 0.1639, + "step": 23050 + }, + { + "epoch": 1.2652030735455544, + "grad_norm": 0.959804117679596, + "learning_rate": 6.341013356206007e-06, + "loss": 0.1178, + "step": 23052 + }, + { + "epoch": 1.2653128430296379, + "grad_norm": 0.87181156873703, + "learning_rate": 6.337528990221822e-06, + "loss": 0.0971, + "step": 23054 + }, + { + "epoch": 1.2654226125137211, + "grad_norm": 0.5879222750663757, + "learning_rate": 6.334045442865219e-06, + "loss": 0.2227, + "step": 23056 + }, + { + "epoch": 1.2655323819978046, + "grad_norm": 1.2488840818405151, + "learning_rate": 6.330562714288998e-06, + "loss": 0.2605, + "step": 23058 + }, + { + "epoch": 1.265642151481888, + "grad_norm": 1.7545677423477173, + "learning_rate": 6.327080804645932e-06, + "loss": 0.2505, + "step": 23060 + }, + { + "epoch": 1.2657519209659713, + "grad_norm": 1.1677242517471313, + "learning_rate": 6.323599714088754e-06, + "loss": 0.2238, + "step": 23062 + }, + { + "epoch": 1.2658616904500548, + "grad_norm": 1.3675458431243896, + "learning_rate": 6.320119442770156e-06, + "loss": 0.3341, + "step": 23064 + }, + { + "epoch": 1.2659714599341383, + "grad_norm": 1.4394285678863525, + "learning_rate": 6.316639990842804e-06, + "loss": 0.2326, + "step": 23066 + }, + { + "epoch": 1.2660812294182218, + "grad_norm": 1.3118089437484741, + "learning_rate": 6.3131613584593356e-06, + "loss": 0.1648, + "step": 23068 + }, + { + "epoch": 1.2661909989023052, + "grad_norm": 1.351765513420105, + "learning_rate": 6.309683545772327e-06, + "loss": 0.1693, + "step": 23070 + }, + { + "epoch": 1.2663007683863885, + "grad_norm": 0.8442266583442688, + "learning_rate": 6.306206552934335e-06, + "loss": 0.1244, + "step": 23072 + }, + { + "epoch": 1.266410537870472, + "grad_norm": 1.2235186100006104, + "learning_rate": 6.302730380097879e-06, + "loss": 0.2821, + "step": 23074 + }, + { + "epoch": 1.2665203073545555, + "grad_norm": 1.2093472480773926, + "learning_rate": 6.299255027415443e-06, + "loss": 0.1223, + "step": 23076 + }, + { + "epoch": 1.266630076838639, + "grad_norm": 1.4108092784881592, + "learning_rate": 6.295780495039461e-06, + "loss": 0.1577, + "step": 23078 + }, + { + "epoch": 1.2667398463227224, + "grad_norm": 0.9648683667182922, + "learning_rate": 6.292306783122356e-06, + "loss": 0.1729, + "step": 23080 + }, + { + "epoch": 1.2668496158068057, + "grad_norm": 1.086460828781128, + "learning_rate": 6.288833891816503e-06, + "loss": 0.1623, + "step": 23082 + }, + { + "epoch": 1.2669593852908891, + "grad_norm": 1.3299927711486816, + "learning_rate": 6.285361821274232e-06, + "loss": 0.2054, + "step": 23084 + }, + { + "epoch": 1.2670691547749726, + "grad_norm": 1.3607919216156006, + "learning_rate": 6.281890571647853e-06, + "loss": 0.181, + "step": 23086 + }, + { + "epoch": 1.2671789242590559, + "grad_norm": 1.2776161432266235, + "learning_rate": 6.278420143089617e-06, + "loss": 0.1225, + "step": 23088 + }, + { + "epoch": 1.2672886937431393, + "grad_norm": 2.2562718391418457, + "learning_rate": 6.2749505357517696e-06, + "loss": 0.2153, + "step": 23090 + }, + { + "epoch": 1.2673984632272228, + "grad_norm": 1.0163761377334595, + "learning_rate": 6.271481749786504e-06, + "loss": 0.2999, + "step": 23092 + }, + { + "epoch": 1.2675082327113063, + "grad_norm": 1.1847866773605347, + "learning_rate": 6.268013785345969e-06, + "loss": 0.1534, + "step": 23094 + }, + { + "epoch": 1.2676180021953898, + "grad_norm": 1.2850310802459717, + "learning_rate": 6.264546642582289e-06, + "loss": 0.2526, + "step": 23096 + }, + { + "epoch": 1.267727771679473, + "grad_norm": 1.3142634630203247, + "learning_rate": 6.261080321647555e-06, + "loss": 0.1641, + "step": 23098 + }, + { + "epoch": 1.2678375411635565, + "grad_norm": 0.9382002949714661, + "learning_rate": 6.257614822693819e-06, + "loss": 0.2844, + "step": 23100 + }, + { + "epoch": 1.26794731064764, + "grad_norm": 1.523788571357727, + "learning_rate": 6.254150145873081e-06, + "loss": 0.1986, + "step": 23102 + }, + { + "epoch": 1.2680570801317232, + "grad_norm": 1.2017215490341187, + "learning_rate": 6.250686291337332e-06, + "loss": 0.2125, + "step": 23104 + }, + { + "epoch": 1.2681668496158067, + "grad_norm": 1.2514103651046753, + "learning_rate": 6.247223259238511e-06, + "loss": 0.1995, + "step": 23106 + }, + { + "epoch": 1.2682766190998902, + "grad_norm": 0.888541042804718, + "learning_rate": 6.243761049728522e-06, + "loss": 0.138, + "step": 23108 + }, + { + "epoch": 1.2683863885839737, + "grad_norm": 1.0518275499343872, + "learning_rate": 6.240299662959237e-06, + "loss": 0.1451, + "step": 23110 + }, + { + "epoch": 1.2684961580680572, + "grad_norm": 0.9255789518356323, + "learning_rate": 6.236839099082484e-06, + "loss": 0.1824, + "step": 23112 + }, + { + "epoch": 1.2686059275521404, + "grad_norm": 1.464766502380371, + "learning_rate": 6.233379358250055e-06, + "loss": 0.2625, + "step": 23114 + }, + { + "epoch": 1.2687156970362239, + "grad_norm": 1.3818556070327759, + "learning_rate": 6.2299204406137295e-06, + "loss": 0.3496, + "step": 23116 + }, + { + "epoch": 1.2688254665203074, + "grad_norm": 0.9904506206512451, + "learning_rate": 6.226462346325221e-06, + "loss": 0.1741, + "step": 23118 + }, + { + "epoch": 1.2689352360043908, + "grad_norm": 1.296554446220398, + "learning_rate": 6.22300507553622e-06, + "loss": 0.1861, + "step": 23120 + }, + { + "epoch": 1.2690450054884743, + "grad_norm": 1.1809357404708862, + "learning_rate": 6.2195486283983775e-06, + "loss": 0.1634, + "step": 23122 + }, + { + "epoch": 1.2691547749725576, + "grad_norm": 0.976459264755249, + "learning_rate": 6.216093005063306e-06, + "loss": 0.1171, + "step": 23124 + }, + { + "epoch": 1.269264544456641, + "grad_norm": 1.0006442070007324, + "learning_rate": 6.2126382056826e-06, + "loss": 0.15, + "step": 23126 + }, + { + "epoch": 1.2693743139407245, + "grad_norm": 1.2194504737854004, + "learning_rate": 6.209184230407788e-06, + "loss": 0.2012, + "step": 23128 + }, + { + "epoch": 1.2694840834248078, + "grad_norm": 1.0423314571380615, + "learning_rate": 6.205731079390395e-06, + "loss": 0.1515, + "step": 23130 + }, + { + "epoch": 1.2695938529088913, + "grad_norm": 0.9589484930038452, + "learning_rate": 6.202278752781884e-06, + "loss": 0.1875, + "step": 23132 + }, + { + "epoch": 1.2697036223929747, + "grad_norm": 1.204648494720459, + "learning_rate": 6.198827250733693e-06, + "loss": 0.2236, + "step": 23134 + }, + { + "epoch": 1.2698133918770582, + "grad_norm": 1.5051931142807007, + "learning_rate": 6.195376573397218e-06, + "loss": 0.28, + "step": 23136 + }, + { + "epoch": 1.2699231613611417, + "grad_norm": 1.0881651639938354, + "learning_rate": 6.1919267209238185e-06, + "loss": 0.2138, + "step": 23138 + }, + { + "epoch": 1.270032930845225, + "grad_norm": 1.071103572845459, + "learning_rate": 6.188477693464836e-06, + "loss": 0.135, + "step": 23140 + }, + { + "epoch": 1.2701427003293084, + "grad_norm": 1.0938364267349243, + "learning_rate": 6.185029491171554e-06, + "loss": 0.2018, + "step": 23142 + }, + { + "epoch": 1.270252469813392, + "grad_norm": 1.1196719408035278, + "learning_rate": 6.181582114195228e-06, + "loss": 0.1567, + "step": 23144 + }, + { + "epoch": 1.2703622392974754, + "grad_norm": 0.9884875416755676, + "learning_rate": 6.178135562687076e-06, + "loss": 0.1841, + "step": 23146 + }, + { + "epoch": 1.2704720087815589, + "grad_norm": 1.2936534881591797, + "learning_rate": 6.174689836798284e-06, + "loss": 0.2181, + "step": 23148 + }, + { + "epoch": 1.2705817782656421, + "grad_norm": 1.098739743232727, + "learning_rate": 6.171244936679984e-06, + "loss": 0.1901, + "step": 23150 + }, + { + "epoch": 1.2706915477497256, + "grad_norm": 1.0515583753585815, + "learning_rate": 6.167800862483308e-06, + "loss": 0.1623, + "step": 23152 + }, + { + "epoch": 1.270801317233809, + "grad_norm": 1.044304609298706, + "learning_rate": 6.16435761435932e-06, + "loss": 0.1367, + "step": 23154 + }, + { + "epoch": 1.2709110867178923, + "grad_norm": 0.9155198335647583, + "learning_rate": 6.160915192459058e-06, + "loss": 0.1522, + "step": 23156 + }, + { + "epoch": 1.2710208562019758, + "grad_norm": 0.893255889415741, + "learning_rate": 6.157473596933516e-06, + "loss": 0.1542, + "step": 23158 + }, + { + "epoch": 1.2711306256860593, + "grad_norm": 1.1729373931884766, + "learning_rate": 6.154032827933673e-06, + "loss": 0.1491, + "step": 23160 + }, + { + "epoch": 1.2712403951701428, + "grad_norm": 1.1721806526184082, + "learning_rate": 6.150592885610454e-06, + "loss": 0.1999, + "step": 23162 + }, + { + "epoch": 1.2713501646542262, + "grad_norm": 1.4698349237442017, + "learning_rate": 6.147153770114738e-06, + "loss": 0.2341, + "step": 23164 + }, + { + "epoch": 1.2714599341383095, + "grad_norm": 0.7893222570419312, + "learning_rate": 6.143715481597404e-06, + "loss": 0.1646, + "step": 23166 + }, + { + "epoch": 1.271569703622393, + "grad_norm": 0.919308066368103, + "learning_rate": 6.1402780202092584e-06, + "loss": 0.1519, + "step": 23168 + }, + { + "epoch": 1.2716794731064764, + "grad_norm": 1.1650086641311646, + "learning_rate": 6.136841386101091e-06, + "loss": 0.1684, + "step": 23170 + }, + { + "epoch": 1.2717892425905597, + "grad_norm": 1.0809457302093506, + "learning_rate": 6.133405579423643e-06, + "loss": 0.2335, + "step": 23172 + }, + { + "epoch": 1.2718990120746432, + "grad_norm": 0.9395485520362854, + "learning_rate": 6.129970600327623e-06, + "loss": 0.1371, + "step": 23174 + }, + { + "epoch": 1.2720087815587267, + "grad_norm": 0.7581261992454529, + "learning_rate": 6.126536448963718e-06, + "loss": 0.1632, + "step": 23176 + }, + { + "epoch": 1.2721185510428101, + "grad_norm": 1.1615118980407715, + "learning_rate": 6.123103125482562e-06, + "loss": 0.2036, + "step": 23178 + }, + { + "epoch": 1.2722283205268936, + "grad_norm": 1.4940807819366455, + "learning_rate": 6.119670630034758e-06, + "loss": 0.3174, + "step": 23180 + }, + { + "epoch": 1.2723380900109769, + "grad_norm": 1.201619267463684, + "learning_rate": 6.116238962770867e-06, + "loss": 0.2192, + "step": 23182 + }, + { + "epoch": 1.2724478594950603, + "grad_norm": 1.7848868370056152, + "learning_rate": 6.112808123841424e-06, + "loss": 0.1158, + "step": 23184 + }, + { + "epoch": 1.2725576289791438, + "grad_norm": 0.8290392756462097, + "learning_rate": 6.109378113396913e-06, + "loss": 0.1231, + "step": 23186 + }, + { + "epoch": 1.2726673984632273, + "grad_norm": 0.9080179929733276, + "learning_rate": 6.1059489315877985e-06, + "loss": 0.1216, + "step": 23188 + }, + { + "epoch": 1.2727771679473108, + "grad_norm": 1.9438694715499878, + "learning_rate": 6.102520578564508e-06, + "loss": 0.2559, + "step": 23190 + }, + { + "epoch": 1.272886937431394, + "grad_norm": 0.8968314528465271, + "learning_rate": 6.09909305447742e-06, + "loss": 0.1386, + "step": 23192 + }, + { + "epoch": 1.2729967069154775, + "grad_norm": 1.211384654045105, + "learning_rate": 6.095666359476882e-06, + "loss": 0.1466, + "step": 23194 + }, + { + "epoch": 1.273106476399561, + "grad_norm": 1.0917036533355713, + "learning_rate": 6.092240493713205e-06, + "loss": 0.1708, + "step": 23196 + }, + { + "epoch": 1.2732162458836442, + "grad_norm": 1.0899348258972168, + "learning_rate": 6.088815457336663e-06, + "loss": 0.2386, + "step": 23198 + }, + { + "epoch": 1.2733260153677277, + "grad_norm": 1.170902967453003, + "learning_rate": 6.085391250497491e-06, + "loss": 0.1762, + "step": 23200 + }, + { + "epoch": 1.2734357848518112, + "grad_norm": 1.1519453525543213, + "learning_rate": 6.081967873345903e-06, + "loss": 0.2678, + "step": 23202 + }, + { + "epoch": 1.2735455543358947, + "grad_norm": 1.2593401670455933, + "learning_rate": 6.078545326032062e-06, + "loss": 0.1637, + "step": 23204 + }, + { + "epoch": 1.2736553238199781, + "grad_norm": 0.9065418243408203, + "learning_rate": 6.075123608706093e-06, + "loss": 0.1293, + "step": 23206 + }, + { + "epoch": 1.2737650933040614, + "grad_norm": 0.9728071093559265, + "learning_rate": 6.071702721518091e-06, + "loss": 0.0928, + "step": 23208 + }, + { + "epoch": 1.2738748627881449, + "grad_norm": 1.3879566192626953, + "learning_rate": 6.068282664618108e-06, + "loss": 0.2834, + "step": 23210 + }, + { + "epoch": 1.2739846322722284, + "grad_norm": 0.9300850033760071, + "learning_rate": 6.064863438156173e-06, + "loss": 0.1983, + "step": 23212 + }, + { + "epoch": 1.2740944017563116, + "grad_norm": 1.3382232189178467, + "learning_rate": 6.061445042282271e-06, + "loss": 0.2589, + "step": 23214 + }, + { + "epoch": 1.274204171240395, + "grad_norm": 0.5956278443336487, + "learning_rate": 6.058027477146344e-06, + "loss": 0.1403, + "step": 23216 + }, + { + "epoch": 1.2743139407244786, + "grad_norm": 1.0763506889343262, + "learning_rate": 6.054610742898295e-06, + "loss": 0.1199, + "step": 23218 + }, + { + "epoch": 1.274423710208562, + "grad_norm": 0.9448535442352295, + "learning_rate": 6.051194839688018e-06, + "loss": 0.2117, + "step": 23220 + }, + { + "epoch": 1.2745334796926455, + "grad_norm": 1.2881247997283936, + "learning_rate": 6.04777976766534e-06, + "loss": 0.1796, + "step": 23222 + }, + { + "epoch": 1.2746432491767288, + "grad_norm": 1.3306645154953003, + "learning_rate": 6.044365526980058e-06, + "loss": 0.1963, + "step": 23224 + }, + { + "epoch": 1.2747530186608123, + "grad_norm": 1.041815161705017, + "learning_rate": 6.040952117781953e-06, + "loss": 0.1416, + "step": 23226 + }, + { + "epoch": 1.2748627881448957, + "grad_norm": 0.965446412563324, + "learning_rate": 6.037539540220741e-06, + "loss": 0.1981, + "step": 23228 + }, + { + "epoch": 1.2749725576289792, + "grad_norm": 1.2213730812072754, + "learning_rate": 6.034127794446121e-06, + "loss": 0.1765, + "step": 23230 + }, + { + "epoch": 1.2750823271130627, + "grad_norm": 1.2222223281860352, + "learning_rate": 6.030716880607745e-06, + "loss": 0.2406, + "step": 23232 + }, + { + "epoch": 1.275192096597146, + "grad_norm": 1.2637525796890259, + "learning_rate": 6.0273067988552316e-06, + "loss": 0.1617, + "step": 23234 + }, + { + "epoch": 1.2753018660812294, + "grad_norm": 1.7010056972503662, + "learning_rate": 6.023897549338159e-06, + "loss": 0.2215, + "step": 23236 + }, + { + "epoch": 1.275411635565313, + "grad_norm": 1.3282968997955322, + "learning_rate": 6.020489132206089e-06, + "loss": 0.1714, + "step": 23238 + }, + { + "epoch": 1.2755214050493962, + "grad_norm": 1.0586895942687988, + "learning_rate": 6.017081547608522e-06, + "loss": 0.1759, + "step": 23240 + }, + { + "epoch": 1.2756311745334796, + "grad_norm": 1.3775080442428589, + "learning_rate": 6.013674795694929e-06, + "loss": 0.2084, + "step": 23242 + }, + { + "epoch": 1.275740944017563, + "grad_norm": 1.400465488433838, + "learning_rate": 6.010268876614753e-06, + "loss": 0.1935, + "step": 23244 + }, + { + "epoch": 1.2758507135016466, + "grad_norm": 0.8572198748588562, + "learning_rate": 6.006863790517392e-06, + "loss": 0.2138, + "step": 23246 + }, + { + "epoch": 1.27596048298573, + "grad_norm": 1.0221226215362549, + "learning_rate": 6.003459537552197e-06, + "loss": 0.1853, + "step": 23248 + }, + { + "epoch": 1.2760702524698133, + "grad_norm": 0.8100926876068115, + "learning_rate": 6.000056117868511e-06, + "loss": 0.2123, + "step": 23250 + }, + { + "epoch": 1.2761800219538968, + "grad_norm": 1.6765930652618408, + "learning_rate": 5.996653531615628e-06, + "loss": 0.1995, + "step": 23252 + }, + { + "epoch": 1.2762897914379803, + "grad_norm": 1.2381603717803955, + "learning_rate": 5.9932517789427935e-06, + "loss": 0.192, + "step": 23254 + }, + { + "epoch": 1.2763995609220637, + "grad_norm": 0.9923169016838074, + "learning_rate": 5.989850859999227e-06, + "loss": 0.2054, + "step": 23256 + }, + { + "epoch": 1.2765093304061472, + "grad_norm": 1.0197464227676392, + "learning_rate": 5.98645077493411e-06, + "loss": 0.2187, + "step": 23258 + }, + { + "epoch": 1.2766190998902305, + "grad_norm": 1.5005149841308594, + "learning_rate": 5.983051523896579e-06, + "loss": 0.2348, + "step": 23260 + }, + { + "epoch": 1.276728869374314, + "grad_norm": 0.6815064549446106, + "learning_rate": 5.979653107035754e-06, + "loss": 0.1011, + "step": 23262 + }, + { + "epoch": 1.2768386388583974, + "grad_norm": 1.3809537887573242, + "learning_rate": 5.976255524500704e-06, + "loss": 0.2319, + "step": 23264 + }, + { + "epoch": 1.2769484083424807, + "grad_norm": 1.1621806621551514, + "learning_rate": 5.97285877644046e-06, + "loss": 0.1499, + "step": 23266 + }, + { + "epoch": 1.2770581778265642, + "grad_norm": 0.9396860599517822, + "learning_rate": 5.969462863004019e-06, + "loss": 0.1385, + "step": 23268 + }, + { + "epoch": 1.2771679473106476, + "grad_norm": 0.8673804998397827, + "learning_rate": 5.966067784340346e-06, + "loss": 0.1348, + "step": 23270 + }, + { + "epoch": 1.2772777167947311, + "grad_norm": 1.274000883102417, + "learning_rate": 5.962673540598354e-06, + "loss": 0.2352, + "step": 23272 + }, + { + "epoch": 1.2773874862788146, + "grad_norm": 0.8105905652046204, + "learning_rate": 5.95928013192695e-06, + "loss": 0.1952, + "step": 23274 + }, + { + "epoch": 1.2774972557628979, + "grad_norm": 1.0778669118881226, + "learning_rate": 5.955887558474979e-06, + "loss": 0.1897, + "step": 23276 + }, + { + "epoch": 1.2776070252469813, + "grad_norm": 1.006577968597412, + "learning_rate": 5.9524958203912435e-06, + "loss": 0.2476, + "step": 23278 + }, + { + "epoch": 1.2777167947310648, + "grad_norm": 1.2649755477905273, + "learning_rate": 5.949104917824541e-06, + "loss": 0.232, + "step": 23280 + }, + { + "epoch": 1.277826564215148, + "grad_norm": 1.1557133197784424, + "learning_rate": 5.945714850923603e-06, + "loss": 0.1637, + "step": 23282 + }, + { + "epoch": 1.2779363336992315, + "grad_norm": 2.1382980346679688, + "learning_rate": 5.942325619837136e-06, + "loss": 0.3229, + "step": 23284 + }, + { + "epoch": 1.278046103183315, + "grad_norm": 1.403801441192627, + "learning_rate": 5.9389372247138e-06, + "loss": 0.1823, + "step": 23286 + }, + { + "epoch": 1.2781558726673985, + "grad_norm": 0.9555289149284363, + "learning_rate": 5.935549665702245e-06, + "loss": 0.1439, + "step": 23288 + }, + { + "epoch": 1.278265642151482, + "grad_norm": 1.294988751411438, + "learning_rate": 5.93216294295105e-06, + "loss": 0.1981, + "step": 23290 + }, + { + "epoch": 1.2783754116355652, + "grad_norm": 1.1649757623672485, + "learning_rate": 5.928777056608783e-06, + "loss": 0.2168, + "step": 23292 + }, + { + "epoch": 1.2784851811196487, + "grad_norm": 1.6048589944839478, + "learning_rate": 5.92539200682396e-06, + "loss": 0.2637, + "step": 23294 + }, + { + "epoch": 1.2785949506037322, + "grad_norm": 1.2602078914642334, + "learning_rate": 5.922007793745069e-06, + "loss": 0.217, + "step": 23296 + }, + { + "epoch": 1.2787047200878157, + "grad_norm": 0.8355303406715393, + "learning_rate": 5.918624417520546e-06, + "loss": 0.1809, + "step": 23298 + }, + { + "epoch": 1.2788144895718991, + "grad_norm": 0.965125560760498, + "learning_rate": 5.915241878298822e-06, + "loss": 0.1778, + "step": 23300 + }, + { + "epoch": 1.2789242590559824, + "grad_norm": 1.5889322757720947, + "learning_rate": 5.911860176228262e-06, + "loss": 0.1678, + "step": 23302 + }, + { + "epoch": 1.2790340285400659, + "grad_norm": 0.9683637619018555, + "learning_rate": 5.908479311457205e-06, + "loss": 0.1627, + "step": 23304 + }, + { + "epoch": 1.2791437980241493, + "grad_norm": 0.7340697646141052, + "learning_rate": 5.905099284133952e-06, + "loss": 0.0797, + "step": 23306 + }, + { + "epoch": 1.2792535675082326, + "grad_norm": 0.9280164837837219, + "learning_rate": 5.901720094406762e-06, + "loss": 0.1069, + "step": 23308 + }, + { + "epoch": 1.279363336992316, + "grad_norm": 1.1060140132904053, + "learning_rate": 5.898341742423865e-06, + "loss": 0.133, + "step": 23310 + }, + { + "epoch": 1.2794731064763996, + "grad_norm": 1.4163380861282349, + "learning_rate": 5.8949642283334664e-06, + "loss": 0.184, + "step": 23312 + }, + { + "epoch": 1.279582875960483, + "grad_norm": 1.1273077726364136, + "learning_rate": 5.891587552283709e-06, + "loss": 0.255, + "step": 23314 + }, + { + "epoch": 1.2796926454445665, + "grad_norm": 1.3914817571640015, + "learning_rate": 5.8882117144227115e-06, + "loss": 0.1974, + "step": 23316 + }, + { + "epoch": 1.2798024149286498, + "grad_norm": 1.123073697090149, + "learning_rate": 5.884836714898554e-06, + "loss": 0.1721, + "step": 23318 + }, + { + "epoch": 1.2799121844127332, + "grad_norm": 1.0570420026779175, + "learning_rate": 5.88146255385928e-06, + "loss": 0.164, + "step": 23320 + }, + { + "epoch": 1.2800219538968167, + "grad_norm": 0.980033814907074, + "learning_rate": 5.878089231452891e-06, + "loss": 0.1768, + "step": 23322 + }, + { + "epoch": 1.2801317233809, + "grad_norm": 0.8704567551612854, + "learning_rate": 5.874716747827372e-06, + "loss": 0.086, + "step": 23324 + }, + { + "epoch": 1.2802414928649835, + "grad_norm": 0.9611850380897522, + "learning_rate": 5.871345103130646e-06, + "loss": 0.179, + "step": 23326 + }, + { + "epoch": 1.280351262349067, + "grad_norm": 2.118617296218872, + "learning_rate": 5.8679742975106175e-06, + "loss": 0.1583, + "step": 23328 + }, + { + "epoch": 1.2804610318331504, + "grad_norm": 0.8144082427024841, + "learning_rate": 5.864604331115137e-06, + "loss": 0.1911, + "step": 23330 + }, + { + "epoch": 1.2805708013172339, + "grad_norm": 0.803827166557312, + "learning_rate": 5.861235204092036e-06, + "loss": 0.1748, + "step": 23332 + }, + { + "epoch": 1.2806805708013171, + "grad_norm": 0.9425869584083557, + "learning_rate": 5.857866916589089e-06, + "loss": 0.1449, + "step": 23334 + }, + { + "epoch": 1.2807903402854006, + "grad_norm": 1.0624184608459473, + "learning_rate": 5.854499468754062e-06, + "loss": 0.163, + "step": 23336 + }, + { + "epoch": 1.280900109769484, + "grad_norm": 1.2684760093688965, + "learning_rate": 5.851132860734651e-06, + "loss": 0.2222, + "step": 23338 + }, + { + "epoch": 1.2810098792535676, + "grad_norm": 1.670613408088684, + "learning_rate": 5.847767092678549e-06, + "loss": 0.2647, + "step": 23340 + }, + { + "epoch": 1.281119648737651, + "grad_norm": 1.0078892707824707, + "learning_rate": 5.84440216473339e-06, + "loss": 0.2019, + "step": 23342 + }, + { + "epoch": 1.2812294182217343, + "grad_norm": 0.969865083694458, + "learning_rate": 5.84103807704677e-06, + "loss": 0.1341, + "step": 23344 + }, + { + "epoch": 1.2813391877058178, + "grad_norm": 0.9308078289031982, + "learning_rate": 5.837674829766257e-06, + "loss": 0.1387, + "step": 23346 + }, + { + "epoch": 1.2814489571899013, + "grad_norm": 1.422803521156311, + "learning_rate": 5.834312423039376e-06, + "loss": 0.2815, + "step": 23348 + }, + { + "epoch": 1.2815587266739845, + "grad_norm": 0.9665438532829285, + "learning_rate": 5.8309508570136284e-06, + "loss": 0.1293, + "step": 23350 + }, + { + "epoch": 1.281668496158068, + "grad_norm": 1.015917420387268, + "learning_rate": 5.8275901318364635e-06, + "loss": 0.1962, + "step": 23352 + }, + { + "epoch": 1.2817782656421515, + "grad_norm": 1.32011878490448, + "learning_rate": 5.824230247655299e-06, + "loss": 0.1845, + "step": 23354 + }, + { + "epoch": 1.281888035126235, + "grad_norm": 1.0984340906143188, + "learning_rate": 5.820871204617515e-06, + "loss": 0.2002, + "step": 23356 + }, + { + "epoch": 1.2819978046103184, + "grad_norm": 1.7504445314407349, + "learning_rate": 5.81751300287045e-06, + "loss": 0.2436, + "step": 23358 + }, + { + "epoch": 1.2821075740944017, + "grad_norm": 0.89743971824646, + "learning_rate": 5.814155642561428e-06, + "loss": 0.1739, + "step": 23360 + }, + { + "epoch": 1.2822173435784852, + "grad_norm": 1.2185487747192383, + "learning_rate": 5.810799123837707e-06, + "loss": 0.2282, + "step": 23362 + }, + { + "epoch": 1.2823271130625686, + "grad_norm": 1.2249624729156494, + "learning_rate": 5.807443446846522e-06, + "loss": 0.1884, + "step": 23364 + }, + { + "epoch": 1.2824368825466521, + "grad_norm": 1.2157038450241089, + "learning_rate": 5.80408861173507e-06, + "loss": 0.1871, + "step": 23366 + }, + { + "epoch": 1.2825466520307356, + "grad_norm": 2.196192502975464, + "learning_rate": 5.8007346186505054e-06, + "loss": 0.3088, + "step": 23368 + }, + { + "epoch": 1.2826564215148188, + "grad_norm": 1.1063224077224731, + "learning_rate": 5.79738146773996e-06, + "loss": 0.1653, + "step": 23370 + }, + { + "epoch": 1.2827661909989023, + "grad_norm": 1.3183587789535522, + "learning_rate": 5.7940291591505105e-06, + "loss": 0.1692, + "step": 23372 + }, + { + "epoch": 1.2828759604829858, + "grad_norm": 0.9989046454429626, + "learning_rate": 5.790677693029217e-06, + "loss": 0.1627, + "step": 23374 + }, + { + "epoch": 1.282985729967069, + "grad_norm": 1.2480027675628662, + "learning_rate": 5.7873270695230855e-06, + "loss": 0.1406, + "step": 23376 + }, + { + "epoch": 1.2830954994511525, + "grad_norm": 1.434544563293457, + "learning_rate": 5.783977288779088e-06, + "loss": 0.2214, + "step": 23378 + }, + { + "epoch": 1.283205268935236, + "grad_norm": 1.2481199502944946, + "learning_rate": 5.780628350944167e-06, + "loss": 0.1185, + "step": 23380 + }, + { + "epoch": 1.2833150384193195, + "grad_norm": 1.1552455425262451, + "learning_rate": 5.777280256165218e-06, + "loss": 0.1515, + "step": 23382 + }, + { + "epoch": 1.283424807903403, + "grad_norm": 1.0925575494766235, + "learning_rate": 5.773933004589102e-06, + "loss": 0.1895, + "step": 23384 + }, + { + "epoch": 1.2835345773874862, + "grad_norm": 1.706652283668518, + "learning_rate": 5.770586596362659e-06, + "loss": 0.1627, + "step": 23386 + }, + { + "epoch": 1.2836443468715697, + "grad_norm": 1.7257699966430664, + "learning_rate": 5.7672410316326725e-06, + "loss": 0.1689, + "step": 23388 + }, + { + "epoch": 1.2837541163556532, + "grad_norm": 1.4368462562561035, + "learning_rate": 5.763896310545894e-06, + "loss": 0.1527, + "step": 23390 + }, + { + "epoch": 1.2838638858397364, + "grad_norm": 1.341834545135498, + "learning_rate": 5.760552433249036e-06, + "loss": 0.1757, + "step": 23392 + }, + { + "epoch": 1.28397365532382, + "grad_norm": 1.1996554136276245, + "learning_rate": 5.757209399888777e-06, + "loss": 0.1414, + "step": 23394 + }, + { + "epoch": 1.2840834248079034, + "grad_norm": 1.231575846672058, + "learning_rate": 5.753867210611772e-06, + "loss": 0.2201, + "step": 23396 + }, + { + "epoch": 1.2841931942919869, + "grad_norm": 1.2910856008529663, + "learning_rate": 5.750525865564613e-06, + "loss": 0.1827, + "step": 23398 + }, + { + "epoch": 1.2843029637760703, + "grad_norm": 0.9681555032730103, + "learning_rate": 5.747185364893864e-06, + "loss": 0.1356, + "step": 23400 + }, + { + "epoch": 1.2844127332601536, + "grad_norm": 1.0996633768081665, + "learning_rate": 5.7438457087460736e-06, + "loss": 0.13, + "step": 23402 + }, + { + "epoch": 1.284522502744237, + "grad_norm": 1.4453059434890747, + "learning_rate": 5.740506897267725e-06, + "loss": 0.2626, + "step": 23404 + }, + { + "epoch": 1.2846322722283205, + "grad_norm": 1.0785410404205322, + "learning_rate": 5.737168930605272e-06, + "loss": 0.1872, + "step": 23406 + }, + { + "epoch": 1.284742041712404, + "grad_norm": 1.2074999809265137, + "learning_rate": 5.733831808905132e-06, + "loss": 0.1845, + "step": 23408 + }, + { + "epoch": 1.2848518111964875, + "grad_norm": 3.324423313140869, + "learning_rate": 5.730495532313701e-06, + "loss": 0.2315, + "step": 23410 + }, + { + "epoch": 1.2849615806805708, + "grad_norm": 0.887468159198761, + "learning_rate": 5.727160100977313e-06, + "loss": 0.1684, + "step": 23412 + }, + { + "epoch": 1.2850713501646542, + "grad_norm": 1.0050255060195923, + "learning_rate": 5.7238255150422835e-06, + "loss": 0.2517, + "step": 23414 + }, + { + "epoch": 1.2851811196487377, + "grad_norm": 1.5039687156677246, + "learning_rate": 5.7204917746548765e-06, + "loss": 0.28, + "step": 23416 + }, + { + "epoch": 1.285290889132821, + "grad_norm": 1.065346598625183, + "learning_rate": 5.717158879961332e-06, + "loss": 0.183, + "step": 23418 + }, + { + "epoch": 1.2854006586169044, + "grad_norm": 1.3665186166763306, + "learning_rate": 5.713826831107838e-06, + "loss": 0.2175, + "step": 23420 + }, + { + "epoch": 1.285510428100988, + "grad_norm": 1.252219557762146, + "learning_rate": 5.710495628240567e-06, + "loss": 0.1001, + "step": 23422 + }, + { + "epoch": 1.2856201975850714, + "grad_norm": 1.5053726434707642, + "learning_rate": 5.707165271505635e-06, + "loss": 0.1664, + "step": 23424 + }, + { + "epoch": 1.2857299670691549, + "grad_norm": 1.3649559020996094, + "learning_rate": 5.703835761049131e-06, + "loss": 0.2331, + "step": 23426 + }, + { + "epoch": 1.2858397365532381, + "grad_norm": 1.0136942863464355, + "learning_rate": 5.700507097017102e-06, + "loss": 0.1423, + "step": 23428 + }, + { + "epoch": 1.2859495060373216, + "grad_norm": 4.048413276672363, + "learning_rate": 5.6971792795555505e-06, + "loss": 0.2294, + "step": 23430 + }, + { + "epoch": 1.286059275521405, + "grad_norm": 0.9188183546066284, + "learning_rate": 5.693852308810468e-06, + "loss": 0.2053, + "step": 23432 + }, + { + "epoch": 1.2861690450054883, + "grad_norm": 1.969804286956787, + "learning_rate": 5.690526184927775e-06, + "loss": 0.239, + "step": 23434 + }, + { + "epoch": 1.2862788144895718, + "grad_norm": 1.1995539665222168, + "learning_rate": 5.687200908053389e-06, + "loss": 0.16, + "step": 23436 + }, + { + "epoch": 1.2863885839736553, + "grad_norm": 1.2566829919815063, + "learning_rate": 5.683876478333161e-06, + "loss": 0.2075, + "step": 23438 + }, + { + "epoch": 1.2864983534577388, + "grad_norm": 1.0292329788208008, + "learning_rate": 5.680552895912921e-06, + "loss": 0.1396, + "step": 23440 + }, + { + "epoch": 1.2866081229418223, + "grad_norm": 0.9973472356796265, + "learning_rate": 5.67723016093846e-06, + "loss": 0.1611, + "step": 23442 + }, + { + "epoch": 1.2867178924259055, + "grad_norm": 0.8700010180473328, + "learning_rate": 5.6739082735555145e-06, + "loss": 0.1467, + "step": 23444 + }, + { + "epoch": 1.286827661909989, + "grad_norm": 0.9651421308517456, + "learning_rate": 5.6705872339098186e-06, + "loss": 0.1975, + "step": 23446 + }, + { + "epoch": 1.2869374313940725, + "grad_norm": 0.9723232388496399, + "learning_rate": 5.667267042147043e-06, + "loss": 0.2337, + "step": 23448 + }, + { + "epoch": 1.287047200878156, + "grad_norm": 1.4304261207580566, + "learning_rate": 5.663947698412822e-06, + "loss": 0.2018, + "step": 23450 + }, + { + "epoch": 1.2871569703622394, + "grad_norm": 0.8324978351593018, + "learning_rate": 5.660629202852763e-06, + "loss": 0.1994, + "step": 23452 + }, + { + "epoch": 1.2872667398463227, + "grad_norm": 1.1744462251663208, + "learning_rate": 5.6573115556124325e-06, + "loss": 0.1701, + "step": 23454 + }, + { + "epoch": 1.2873765093304061, + "grad_norm": 1.0849318504333496, + "learning_rate": 5.653994756837347e-06, + "loss": 0.1628, + "step": 23456 + }, + { + "epoch": 1.2874862788144896, + "grad_norm": 1.1692720651626587, + "learning_rate": 5.6506788066730155e-06, + "loss": 0.1412, + "step": 23458 + }, + { + "epoch": 1.2875960482985729, + "grad_norm": 1.5888921022415161, + "learning_rate": 5.647363705264877e-06, + "loss": 0.2038, + "step": 23460 + }, + { + "epoch": 1.2877058177826564, + "grad_norm": 1.0992275476455688, + "learning_rate": 5.6440494527583595e-06, + "loss": 0.1194, + "step": 23462 + }, + { + "epoch": 1.2878155872667398, + "grad_norm": 0.8813158273696899, + "learning_rate": 5.640736049298839e-06, + "loss": 0.1178, + "step": 23464 + }, + { + "epoch": 1.2879253567508233, + "grad_norm": 1.2953912019729614, + "learning_rate": 5.637423495031658e-06, + "loss": 0.1545, + "step": 23466 + }, + { + "epoch": 1.2880351262349068, + "grad_norm": 2.907351493835449, + "learning_rate": 5.634111790102117e-06, + "loss": 0.1391, + "step": 23468 + }, + { + "epoch": 1.28814489571899, + "grad_norm": 1.1587961912155151, + "learning_rate": 5.63080093465548e-06, + "loss": 0.1695, + "step": 23470 + }, + { + "epoch": 1.2882546652030735, + "grad_norm": 0.8283339142799377, + "learning_rate": 5.62749092883699e-06, + "loss": 0.1807, + "step": 23472 + }, + { + "epoch": 1.288364434687157, + "grad_norm": 1.0640286207199097, + "learning_rate": 5.624181772791837e-06, + "loss": 0.1909, + "step": 23474 + }, + { + "epoch": 1.2884742041712405, + "grad_norm": 1.4805943965911865, + "learning_rate": 5.620873466665169e-06, + "loss": 0.232, + "step": 23476 + }, + { + "epoch": 1.288583973655324, + "grad_norm": 1.4303619861602783, + "learning_rate": 5.617566010602113e-06, + "loss": 0.2506, + "step": 23478 + }, + { + "epoch": 1.2886937431394072, + "grad_norm": 1.3101751804351807, + "learning_rate": 5.614259404747737e-06, + "loss": 0.1635, + "step": 23480 + }, + { + "epoch": 1.2888035126234907, + "grad_norm": 1.1947191953659058, + "learning_rate": 5.610953649247102e-06, + "loss": 0.2329, + "step": 23482 + }, + { + "epoch": 1.2889132821075742, + "grad_norm": 0.9146214723587036, + "learning_rate": 5.607648744245206e-06, + "loss": 0.2838, + "step": 23484 + }, + { + "epoch": 1.2890230515916574, + "grad_norm": 1.1159437894821167, + "learning_rate": 5.60434468988702e-06, + "loss": 0.2189, + "step": 23486 + }, + { + "epoch": 1.289132821075741, + "grad_norm": 2.5319435596466064, + "learning_rate": 5.601041486317477e-06, + "loss": 0.1903, + "step": 23488 + }, + { + "epoch": 1.2892425905598244, + "grad_norm": 0.9078552722930908, + "learning_rate": 5.597739133681462e-06, + "loss": 0.2585, + "step": 23490 + }, + { + "epoch": 1.2893523600439079, + "grad_norm": 0.8137244582176208, + "learning_rate": 5.5944376321238475e-06, + "loss": 0.1108, + "step": 23492 + }, + { + "epoch": 1.2894621295279913, + "grad_norm": 1.813266634941101, + "learning_rate": 5.591136981789438e-06, + "loss": 0.2203, + "step": 23494 + }, + { + "epoch": 1.2895718990120746, + "grad_norm": 1.0734801292419434, + "learning_rate": 5.587837182823033e-06, + "loss": 0.1488, + "step": 23496 + }, + { + "epoch": 1.289681668496158, + "grad_norm": 0.9794149994850159, + "learning_rate": 5.584538235369371e-06, + "loss": 0.1606, + "step": 23498 + }, + { + "epoch": 1.2897914379802415, + "grad_norm": 0.9601576924324036, + "learning_rate": 5.581240139573158e-06, + "loss": 0.1326, + "step": 23500 + }, + { + "epoch": 1.2899012074643248, + "grad_norm": 1.2648704051971436, + "learning_rate": 5.577942895579064e-06, + "loss": 0.2346, + "step": 23502 + }, + { + "epoch": 1.2900109769484083, + "grad_norm": 1.1786131858825684, + "learning_rate": 5.574646503531725e-06, + "loss": 0.1607, + "step": 23504 + }, + { + "epoch": 1.2901207464324917, + "grad_norm": 1.1906673908233643, + "learning_rate": 5.571350963575728e-06, + "loss": 0.213, + "step": 23506 + }, + { + "epoch": 1.2902305159165752, + "grad_norm": 1.087997317314148, + "learning_rate": 5.5680562758556456e-06, + "loss": 0.2179, + "step": 23508 + }, + { + "epoch": 1.2903402854006587, + "grad_norm": 0.8641610741615295, + "learning_rate": 5.5647624405159945e-06, + "loss": 0.2506, + "step": 23510 + }, + { + "epoch": 1.290450054884742, + "grad_norm": 2.377028465270996, + "learning_rate": 5.561469457701254e-06, + "loss": 0.3866, + "step": 23512 + }, + { + "epoch": 1.2905598243688254, + "grad_norm": 1.1083922386169434, + "learning_rate": 5.558177327555875e-06, + "loss": 0.2551, + "step": 23514 + }, + { + "epoch": 1.290669593852909, + "grad_norm": 1.1780130863189697, + "learning_rate": 5.554886050224256e-06, + "loss": 0.1799, + "step": 23516 + }, + { + "epoch": 1.2907793633369924, + "grad_norm": 1.3402988910675049, + "learning_rate": 5.551595625850786e-06, + "loss": 0.224, + "step": 23518 + }, + { + "epoch": 1.2908891328210759, + "grad_norm": 1.151113748550415, + "learning_rate": 5.54830605457978e-06, + "loss": 0.1558, + "step": 23520 + }, + { + "epoch": 1.2909989023051591, + "grad_norm": 3.426621198654175, + "learning_rate": 5.545017336555555e-06, + "loss": 0.3042, + "step": 23522 + }, + { + "epoch": 1.2911086717892426, + "grad_norm": 0.916437566280365, + "learning_rate": 5.541729471922361e-06, + "loss": 0.1606, + "step": 23524 + }, + { + "epoch": 1.291218441273326, + "grad_norm": 1.285712718963623, + "learning_rate": 5.538442460824417e-06, + "loss": 0.1702, + "step": 23526 + }, + { + "epoch": 1.2913282107574093, + "grad_norm": 1.169562816619873, + "learning_rate": 5.535156303405909e-06, + "loss": 0.1494, + "step": 23528 + }, + { + "epoch": 1.2914379802414928, + "grad_norm": 1.1866297721862793, + "learning_rate": 5.531870999810979e-06, + "loss": 0.2492, + "step": 23530 + }, + { + "epoch": 1.2915477497255763, + "grad_norm": 0.9808924198150635, + "learning_rate": 5.528586550183748e-06, + "loss": 0.1661, + "step": 23532 + }, + { + "epoch": 1.2916575192096598, + "grad_norm": 1.1654547452926636, + "learning_rate": 5.525302954668285e-06, + "loss": 0.1425, + "step": 23534 + }, + { + "epoch": 1.2917672886937432, + "grad_norm": 1.1531641483306885, + "learning_rate": 5.522020213408618e-06, + "loss": 0.1898, + "step": 23536 + }, + { + "epoch": 1.2918770581778265, + "grad_norm": 1.3900545835494995, + "learning_rate": 5.51873832654875e-06, + "loss": 0.1793, + "step": 23538 + }, + { + "epoch": 1.29198682766191, + "grad_norm": 1.4978998899459839, + "learning_rate": 5.5154572942326396e-06, + "loss": 0.2399, + "step": 23540 + }, + { + "epoch": 1.2920965971459935, + "grad_norm": 0.9231959581375122, + "learning_rate": 5.5121771166042e-06, + "loss": 0.2236, + "step": 23542 + }, + { + "epoch": 1.2922063666300767, + "grad_norm": 1.8448257446289062, + "learning_rate": 5.50889779380733e-06, + "loss": 0.1879, + "step": 23544 + }, + { + "epoch": 1.2923161361141602, + "grad_norm": 0.8447260856628418, + "learning_rate": 5.505619325985873e-06, + "loss": 0.1626, + "step": 23546 + }, + { + "epoch": 1.2924259055982437, + "grad_norm": 1.6726934909820557, + "learning_rate": 5.502341713283635e-06, + "loss": 0.2235, + "step": 23548 + }, + { + "epoch": 1.2925356750823271, + "grad_norm": 1.1647109985351562, + "learning_rate": 5.499064955844382e-06, + "loss": 0.1578, + "step": 23550 + }, + { + "epoch": 1.2926454445664106, + "grad_norm": 1.4337265491485596, + "learning_rate": 5.495789053811864e-06, + "loss": 0.2932, + "step": 23552 + }, + { + "epoch": 1.2927552140504939, + "grad_norm": 1.607235074043274, + "learning_rate": 5.492514007329771e-06, + "loss": 0.198, + "step": 23554 + }, + { + "epoch": 1.2928649835345773, + "grad_norm": 1.1766141653060913, + "learning_rate": 5.489239816541755e-06, + "loss": 0.1325, + "step": 23556 + }, + { + "epoch": 1.2929747530186608, + "grad_norm": 0.9344847798347473, + "learning_rate": 5.4859664815914504e-06, + "loss": 0.1965, + "step": 23558 + }, + { + "epoch": 1.2930845225027443, + "grad_norm": 0.9591060876846313, + "learning_rate": 5.482694002622438e-06, + "loss": 0.2143, + "step": 23560 + }, + { + "epoch": 1.2931942919868278, + "grad_norm": 0.8812858462333679, + "learning_rate": 5.479422379778263e-06, + "loss": 0.2143, + "step": 23562 + }, + { + "epoch": 1.293304061470911, + "grad_norm": 0.7833424210548401, + "learning_rate": 5.476151613202435e-06, + "loss": 0.1303, + "step": 23564 + }, + { + "epoch": 1.2934138309549945, + "grad_norm": 1.1391388177871704, + "learning_rate": 5.472881703038418e-06, + "loss": 0.1313, + "step": 23566 + }, + { + "epoch": 1.293523600439078, + "grad_norm": 1.564424753189087, + "learning_rate": 5.4696126494296614e-06, + "loss": 0.2217, + "step": 23568 + }, + { + "epoch": 1.2936333699231612, + "grad_norm": 1.356347918510437, + "learning_rate": 5.466344452519556e-06, + "loss": 0.2082, + "step": 23570 + }, + { + "epoch": 1.2937431394072447, + "grad_norm": 1.280949592590332, + "learning_rate": 5.463077112451459e-06, + "loss": 0.129, + "step": 23572 + }, + { + "epoch": 1.2938529088913282, + "grad_norm": 0.9986249804496765, + "learning_rate": 5.4598106293686916e-06, + "loss": 0.1568, + "step": 23574 + }, + { + "epoch": 1.2939626783754117, + "grad_norm": 1.096179723739624, + "learning_rate": 5.456545003414543e-06, + "loss": 0.1691, + "step": 23576 + }, + { + "epoch": 1.2940724478594952, + "grad_norm": 1.8558167219161987, + "learning_rate": 5.453280234732244e-06, + "loss": 0.1935, + "step": 23578 + }, + { + "epoch": 1.2941822173435784, + "grad_norm": 0.7439819574356079, + "learning_rate": 5.4500163234650155e-06, + "loss": 0.1381, + "step": 23580 + }, + { + "epoch": 1.2942919868276619, + "grad_norm": 1.1078290939331055, + "learning_rate": 5.446753269756036e-06, + "loss": 0.1458, + "step": 23582 + }, + { + "epoch": 1.2944017563117454, + "grad_norm": 0.6425333023071289, + "learning_rate": 5.443491073748433e-06, + "loss": 0.1469, + "step": 23584 + }, + { + "epoch": 1.2945115257958286, + "grad_norm": 1.0713952779769897, + "learning_rate": 5.440229735585298e-06, + "loss": 0.167, + "step": 23586 + }, + { + "epoch": 1.2946212952799123, + "grad_norm": 1.0936673879623413, + "learning_rate": 5.436969255409691e-06, + "loss": 0.1751, + "step": 23588 + }, + { + "epoch": 1.2947310647639956, + "grad_norm": 1.2598016262054443, + "learning_rate": 5.4337096333646365e-06, + "loss": 0.1411, + "step": 23590 + }, + { + "epoch": 1.294840834248079, + "grad_norm": 0.9443237781524658, + "learning_rate": 5.430450869593104e-06, + "loss": 0.2008, + "step": 23592 + }, + { + "epoch": 1.2949506037321625, + "grad_norm": 1.0448405742645264, + "learning_rate": 5.427192964238059e-06, + "loss": 0.2568, + "step": 23594 + }, + { + "epoch": 1.2950603732162458, + "grad_norm": 0.990874171257019, + "learning_rate": 5.423935917442399e-06, + "loss": 0.2225, + "step": 23596 + }, + { + "epoch": 1.2951701427003293, + "grad_norm": 0.8938705921173096, + "learning_rate": 5.420679729348993e-06, + "loss": 0.2159, + "step": 23598 + }, + { + "epoch": 1.2952799121844127, + "grad_norm": 0.8229356408119202, + "learning_rate": 5.417424400100679e-06, + "loss": 0.1662, + "step": 23600 + }, + { + "epoch": 1.2953896816684962, + "grad_norm": 1.5195497274398804, + "learning_rate": 5.414169929840243e-06, + "loss": 0.2465, + "step": 23602 + }, + { + "epoch": 1.2954994511525797, + "grad_norm": 1.0122394561767578, + "learning_rate": 5.410916318710443e-06, + "loss": 0.1901, + "step": 23604 + }, + { + "epoch": 1.295609220636663, + "grad_norm": 1.0396394729614258, + "learning_rate": 5.4076635668540075e-06, + "loss": 0.1655, + "step": 23606 + }, + { + "epoch": 1.2957189901207464, + "grad_norm": 1.3065135478973389, + "learning_rate": 5.404411674413615e-06, + "loss": 0.2138, + "step": 23608 + }, + { + "epoch": 1.29582875960483, + "grad_norm": 1.0907328128814697, + "learning_rate": 5.401160641531897e-06, + "loss": 0.1788, + "step": 23610 + }, + { + "epoch": 1.2959385290889132, + "grad_norm": 1.4162603616714478, + "learning_rate": 5.3979104683514804e-06, + "loss": 0.2581, + "step": 23612 + }, + { + "epoch": 1.2960482985729966, + "grad_norm": 1.2690160274505615, + "learning_rate": 5.394661155014921e-06, + "loss": 0.2083, + "step": 23614 + }, + { + "epoch": 1.2961580680570801, + "grad_norm": 1.0439306497573853, + "learning_rate": 5.391412701664744e-06, + "loss": 0.2392, + "step": 23616 + }, + { + "epoch": 1.2962678375411636, + "grad_norm": 1.1640063524246216, + "learning_rate": 5.38816510844346e-06, + "loss": 0.1675, + "step": 23618 + }, + { + "epoch": 1.296377607025247, + "grad_norm": 0.7355219125747681, + "learning_rate": 5.384918375493514e-06, + "loss": 0.1191, + "step": 23620 + }, + { + "epoch": 1.2964873765093303, + "grad_norm": 1.045365333557129, + "learning_rate": 5.381672502957324e-06, + "loss": 0.3421, + "step": 23622 + }, + { + "epoch": 1.2965971459934138, + "grad_norm": 1.5629167556762695, + "learning_rate": 5.378427490977272e-06, + "loss": 0.3032, + "step": 23624 + }, + { + "epoch": 1.2967069154774973, + "grad_norm": 0.9939965009689331, + "learning_rate": 5.3751833396956966e-06, + "loss": 0.1157, + "step": 23626 + }, + { + "epoch": 1.2968166849615808, + "grad_norm": 0.9549160003662109, + "learning_rate": 5.3719400492549e-06, + "loss": 0.127, + "step": 23628 + }, + { + "epoch": 1.2969264544456642, + "grad_norm": 0.9909381866455078, + "learning_rate": 5.368697619797159e-06, + "loss": 0.1356, + "step": 23630 + }, + { + "epoch": 1.2970362239297475, + "grad_norm": 0.9763997197151184, + "learning_rate": 5.3654560514646955e-06, + "loss": 0.2069, + "step": 23632 + }, + { + "epoch": 1.297145993413831, + "grad_norm": 0.7046853303909302, + "learning_rate": 5.362215344399701e-06, + "loss": 0.1284, + "step": 23634 + }, + { + "epoch": 1.2972557628979144, + "grad_norm": 1.195788860321045, + "learning_rate": 5.358975498744332e-06, + "loss": 0.15, + "step": 23636 + }, + { + "epoch": 1.2973655323819977, + "grad_norm": 1.0428532361984253, + "learning_rate": 5.355736514640697e-06, + "loss": 0.1313, + "step": 23638 + }, + { + "epoch": 1.2974753018660812, + "grad_norm": 0.7572205066680908, + "learning_rate": 5.352498392230876e-06, + "loss": 0.1378, + "step": 23640 + }, + { + "epoch": 1.2975850713501647, + "grad_norm": 0.9889045357704163, + "learning_rate": 5.349261131656907e-06, + "loss": 0.2762, + "step": 23642 + }, + { + "epoch": 1.2976948408342481, + "grad_norm": 0.8300843238830566, + "learning_rate": 5.346024733060806e-06, + "loss": 0.1617, + "step": 23644 + }, + { + "epoch": 1.2978046103183316, + "grad_norm": 1.5260168313980103, + "learning_rate": 5.342789196584527e-06, + "loss": 0.2141, + "step": 23646 + }, + { + "epoch": 1.2979143798024149, + "grad_norm": 0.7687512040138245, + "learning_rate": 5.339554522369996e-06, + "loss": 0.1692, + "step": 23648 + }, + { + "epoch": 1.2980241492864983, + "grad_norm": 1.5653799772262573, + "learning_rate": 5.336320710559106e-06, + "loss": 0.2115, + "step": 23650 + }, + { + "epoch": 1.2981339187705818, + "grad_norm": 1.2154031991958618, + "learning_rate": 5.333087761293701e-06, + "loss": 0.283, + "step": 23652 + }, + { + "epoch": 1.298243688254665, + "grad_norm": 1.6347066164016724, + "learning_rate": 5.3298556747155916e-06, + "loss": 0.1618, + "step": 23654 + }, + { + "epoch": 1.2983534577387486, + "grad_norm": 1.1519339084625244, + "learning_rate": 5.326624450966569e-06, + "loss": 0.1895, + "step": 23656 + }, + { + "epoch": 1.298463227222832, + "grad_norm": 1.0464649200439453, + "learning_rate": 5.323394090188358e-06, + "loss": 0.143, + "step": 23658 + }, + { + "epoch": 1.2985729967069155, + "grad_norm": 0.9269464015960693, + "learning_rate": 5.3201645925226615e-06, + "loss": 0.1512, + "step": 23660 + }, + { + "epoch": 1.298682766190999, + "grad_norm": 1.1034209728240967, + "learning_rate": 5.316935958111138e-06, + "loss": 0.1714, + "step": 23662 + }, + { + "epoch": 1.2987925356750822, + "grad_norm": 1.1949801445007324, + "learning_rate": 5.3137081870954096e-06, + "loss": 0.1255, + "step": 23664 + }, + { + "epoch": 1.2989023051591657, + "grad_norm": 0.9199396967887878, + "learning_rate": 5.310481279617072e-06, + "loss": 0.1867, + "step": 23666 + }, + { + "epoch": 1.2990120746432492, + "grad_norm": 0.9466255307197571, + "learning_rate": 5.307255235817665e-06, + "loss": 0.1552, + "step": 23668 + }, + { + "epoch": 1.2991218441273327, + "grad_norm": 1.706075668334961, + "learning_rate": 5.304030055838705e-06, + "loss": 0.1433, + "step": 23670 + }, + { + "epoch": 1.2992316136114161, + "grad_norm": 0.7261245250701904, + "learning_rate": 5.3008057398216515e-06, + "loss": 0.1318, + "step": 23672 + }, + { + "epoch": 1.2993413830954994, + "grad_norm": 1.2028870582580566, + "learning_rate": 5.297582287907954e-06, + "loss": 0.1577, + "step": 23674 + }, + { + "epoch": 1.2994511525795829, + "grad_norm": 1.517799735069275, + "learning_rate": 5.294359700239002e-06, + "loss": 0.1875, + "step": 23676 + }, + { + "epoch": 1.2995609220636664, + "grad_norm": 0.8145080804824829, + "learning_rate": 5.291137976956148e-06, + "loss": 0.1513, + "step": 23678 + }, + { + "epoch": 1.2996706915477496, + "grad_norm": 1.2671691179275513, + "learning_rate": 5.287917118200728e-06, + "loss": 0.2147, + "step": 23680 + }, + { + "epoch": 1.299780461031833, + "grad_norm": 1.175578236579895, + "learning_rate": 5.284697124114013e-06, + "loss": 0.226, + "step": 23682 + }, + { + "epoch": 1.2998902305159166, + "grad_norm": 0.7141181826591492, + "learning_rate": 5.281477994837253e-06, + "loss": 0.1327, + "step": 23684 + }, + { + "epoch": 1.3, + "grad_norm": 0.9057248830795288, + "learning_rate": 5.2782597305116504e-06, + "loss": 0.1378, + "step": 23686 + }, + { + "epoch": 1.3001097694840835, + "grad_norm": 1.317520022392273, + "learning_rate": 5.275042331278376e-06, + "loss": 0.1726, + "step": 23688 + }, + { + "epoch": 1.3002195389681668, + "grad_norm": 1.2112908363342285, + "learning_rate": 5.2718257972785565e-06, + "loss": 0.1509, + "step": 23690 + }, + { + "epoch": 1.3003293084522503, + "grad_norm": 0.8492406010627747, + "learning_rate": 5.2686101286532955e-06, + "loss": 0.2183, + "step": 23692 + }, + { + "epoch": 1.3004390779363337, + "grad_norm": 1.2611212730407715, + "learning_rate": 5.26539532554364e-06, + "loss": 0.1848, + "step": 23694 + }, + { + "epoch": 1.300548847420417, + "grad_norm": 1.3502955436706543, + "learning_rate": 5.262181388090609e-06, + "loss": 0.2072, + "step": 23696 + }, + { + "epoch": 1.3006586169045007, + "grad_norm": 1.0008183717727661, + "learning_rate": 5.258968316435181e-06, + "loss": 0.1089, + "step": 23698 + }, + { + "epoch": 1.300768386388584, + "grad_norm": 1.0707182884216309, + "learning_rate": 5.255756110718291e-06, + "loss": 0.1672, + "step": 23700 + }, + { + "epoch": 1.3008781558726674, + "grad_norm": 1.3460172414779663, + "learning_rate": 5.252544771080853e-06, + "loss": 0.2142, + "step": 23702 + }, + { + "epoch": 1.300987925356751, + "grad_norm": 1.1852099895477295, + "learning_rate": 5.249334297663724e-06, + "loss": 0.1455, + "step": 23704 + }, + { + "epoch": 1.3010976948408342, + "grad_norm": 0.707247257232666, + "learning_rate": 5.24612469060774e-06, + "loss": 0.1533, + "step": 23706 + }, + { + "epoch": 1.3012074643249176, + "grad_norm": 1.2197465896606445, + "learning_rate": 5.242915950053684e-06, + "loss": 0.2299, + "step": 23708 + }, + { + "epoch": 1.301317233809001, + "grad_norm": 1.1568851470947266, + "learning_rate": 5.239708076142311e-06, + "loss": 0.2171, + "step": 23710 + }, + { + "epoch": 1.3014270032930846, + "grad_norm": 2.0273327827453613, + "learning_rate": 5.236501069014327e-06, + "loss": 0.2333, + "step": 23712 + }, + { + "epoch": 1.301536772777168, + "grad_norm": 0.8940352201461792, + "learning_rate": 5.233294928810406e-06, + "loss": 0.1656, + "step": 23714 + }, + { + "epoch": 1.3016465422612513, + "grad_norm": 1.356488585472107, + "learning_rate": 5.230089655671197e-06, + "loss": 0.1378, + "step": 23716 + }, + { + "epoch": 1.3017563117453348, + "grad_norm": 0.9538145065307617, + "learning_rate": 5.226885249737293e-06, + "loss": 0.1394, + "step": 23718 + }, + { + "epoch": 1.3018660812294183, + "grad_norm": 1.6716994047164917, + "learning_rate": 5.223681711149253e-06, + "loss": 0.234, + "step": 23720 + }, + { + "epoch": 1.3019758507135015, + "grad_norm": 0.9893573522567749, + "learning_rate": 5.220479040047602e-06, + "loss": 0.2258, + "step": 23722 + }, + { + "epoch": 1.302085620197585, + "grad_norm": 1.1419767141342163, + "learning_rate": 5.217277236572824e-06, + "loss": 0.1607, + "step": 23724 + }, + { + "epoch": 1.3021953896816685, + "grad_norm": 1.0118169784545898, + "learning_rate": 5.214076300865359e-06, + "loss": 0.1847, + "step": 23726 + }, + { + "epoch": 1.302305159165752, + "grad_norm": 0.9150269627571106, + "learning_rate": 5.210876233065628e-06, + "loss": 0.138, + "step": 23728 + }, + { + "epoch": 1.3024149286498354, + "grad_norm": 1.1208136081695557, + "learning_rate": 5.207677033313996e-06, + "loss": 0.1559, + "step": 23730 + }, + { + "epoch": 1.3025246981339187, + "grad_norm": 0.8646210432052612, + "learning_rate": 5.204478701750789e-06, + "loss": 0.1793, + "step": 23732 + }, + { + "epoch": 1.3026344676180022, + "grad_norm": 0.9937525987625122, + "learning_rate": 5.201281238516317e-06, + "loss": 0.1111, + "step": 23734 + }, + { + "epoch": 1.3027442371020856, + "grad_norm": 0.9152743220329285, + "learning_rate": 5.198084643750825e-06, + "loss": 0.1357, + "step": 23736 + }, + { + "epoch": 1.3028540065861691, + "grad_norm": 1.2445369958877563, + "learning_rate": 5.194888917594537e-06, + "loss": 0.1494, + "step": 23738 + }, + { + "epoch": 1.3029637760702526, + "grad_norm": 1.378575086593628, + "learning_rate": 5.191694060187621e-06, + "loss": 0.2578, + "step": 23740 + }, + { + "epoch": 1.3030735455543359, + "grad_norm": 0.9208903908729553, + "learning_rate": 5.1885000716702355e-06, + "loss": 0.1498, + "step": 23742 + }, + { + "epoch": 1.3031833150384193, + "grad_norm": 1.3425859212875366, + "learning_rate": 5.185306952182478e-06, + "loss": 0.1186, + "step": 23744 + }, + { + "epoch": 1.3032930845225028, + "grad_norm": 1.2804268598556519, + "learning_rate": 5.1821147018644155e-06, + "loss": 0.1953, + "step": 23746 + }, + { + "epoch": 1.303402854006586, + "grad_norm": 1.073896050453186, + "learning_rate": 5.178923320856072e-06, + "loss": 0.2506, + "step": 23748 + }, + { + "epoch": 1.3035126234906695, + "grad_norm": 1.148991584777832, + "learning_rate": 5.175732809297434e-06, + "loss": 0.2127, + "step": 23750 + }, + { + "epoch": 1.303622392974753, + "grad_norm": 1.567937970161438, + "learning_rate": 5.172543167328464e-06, + "loss": 0.2886, + "step": 23752 + }, + { + "epoch": 1.3037321624588365, + "grad_norm": 1.3211654424667358, + "learning_rate": 5.169354395089068e-06, + "loss": 0.1599, + "step": 23754 + }, + { + "epoch": 1.30384193194292, + "grad_norm": 1.0097514390945435, + "learning_rate": 5.166166492719124e-06, + "loss": 0.1216, + "step": 23756 + }, + { + "epoch": 1.3039517014270032, + "grad_norm": 0.6105688810348511, + "learning_rate": 5.162979460358469e-06, + "loss": 0.1084, + "step": 23758 + }, + { + "epoch": 1.3040614709110867, + "grad_norm": 0.6841163635253906, + "learning_rate": 5.1597932981469e-06, + "loss": 0.1666, + "step": 23760 + }, + { + "epoch": 1.3041712403951702, + "grad_norm": 0.8662197589874268, + "learning_rate": 5.15660800622417e-06, + "loss": 0.1318, + "step": 23762 + }, + { + "epoch": 1.3042810098792534, + "grad_norm": 1.4811702966690063, + "learning_rate": 5.153423584730013e-06, + "loss": 0.1702, + "step": 23764 + }, + { + "epoch": 1.304390779363337, + "grad_norm": 0.9358521699905396, + "learning_rate": 5.150240033804116e-06, + "loss": 0.112, + "step": 23766 + }, + { + "epoch": 1.3045005488474204, + "grad_norm": 1.0363785028457642, + "learning_rate": 5.147057353586118e-06, + "loss": 0.1571, + "step": 23768 + }, + { + "epoch": 1.3046103183315039, + "grad_norm": 1.3735319375991821, + "learning_rate": 5.1438755442156286e-06, + "loss": 0.2471, + "step": 23770 + }, + { + "epoch": 1.3047200878155873, + "grad_norm": 0.9663985371589661, + "learning_rate": 5.14069460583222e-06, + "loss": 0.1983, + "step": 23772 + }, + { + "epoch": 1.3048298572996706, + "grad_norm": 1.133302927017212, + "learning_rate": 5.1375145385754195e-06, + "loss": 0.1534, + "step": 23774 + }, + { + "epoch": 1.304939626783754, + "grad_norm": 1.3959980010986328, + "learning_rate": 5.134335342584714e-06, + "loss": 0.3208, + "step": 23776 + }, + { + "epoch": 1.3050493962678376, + "grad_norm": 1.7660083770751953, + "learning_rate": 5.131157017999575e-06, + "loss": 0.2043, + "step": 23778 + }, + { + "epoch": 1.305159165751921, + "grad_norm": 1.0167826414108276, + "learning_rate": 5.127979564959412e-06, + "loss": 0.2139, + "step": 23780 + }, + { + "epoch": 1.3052689352360045, + "grad_norm": 0.8582643270492554, + "learning_rate": 5.124802983603602e-06, + "loss": 0.2703, + "step": 23782 + }, + { + "epoch": 1.3053787047200878, + "grad_norm": 0.9541704058647156, + "learning_rate": 5.121627274071486e-06, + "loss": 0.1595, + "step": 23784 + }, + { + "epoch": 1.3054884742041712, + "grad_norm": 1.1416553258895874, + "learning_rate": 5.118452436502361e-06, + "loss": 0.1887, + "step": 23786 + }, + { + "epoch": 1.3055982436882547, + "grad_norm": 0.9541401863098145, + "learning_rate": 5.1152784710355e-06, + "loss": 0.1314, + "step": 23788 + }, + { + "epoch": 1.305708013172338, + "grad_norm": 1.1381250619888306, + "learning_rate": 5.112105377810128e-06, + "loss": 0.1936, + "step": 23790 + }, + { + "epoch": 1.3058177826564215, + "grad_norm": 1.007686972618103, + "learning_rate": 5.1089331569654235e-06, + "loss": 0.1627, + "step": 23792 + }, + { + "epoch": 1.305927552140505, + "grad_norm": 2.425828456878662, + "learning_rate": 5.1057618086405465e-06, + "loss": 0.135, + "step": 23794 + }, + { + "epoch": 1.3060373216245884, + "grad_norm": 1.3872735500335693, + "learning_rate": 5.102591332974604e-06, + "loss": 0.2541, + "step": 23796 + }, + { + "epoch": 1.3061470911086719, + "grad_norm": 1.099391222000122, + "learning_rate": 5.099421730106668e-06, + "loss": 0.1597, + "step": 23798 + }, + { + "epoch": 1.3062568605927551, + "grad_norm": 1.0468538999557495, + "learning_rate": 5.096253000175766e-06, + "loss": 0.1575, + "step": 23800 + }, + { + "epoch": 1.3063666300768386, + "grad_norm": 0.7992756366729736, + "learning_rate": 5.093085143320908e-06, + "loss": 0.1342, + "step": 23802 + }, + { + "epoch": 1.306476399560922, + "grad_norm": 1.0581645965576172, + "learning_rate": 5.0899181596810455e-06, + "loss": 0.2152, + "step": 23804 + }, + { + "epoch": 1.3065861690450054, + "grad_norm": 1.1703697443008423, + "learning_rate": 5.086752049395094e-06, + "loss": 0.2512, + "step": 23806 + }, + { + "epoch": 1.306695938529089, + "grad_norm": 1.1715534925460815, + "learning_rate": 5.08358681260194e-06, + "loss": 0.1751, + "step": 23808 + }, + { + "epoch": 1.3068057080131723, + "grad_norm": 0.6800723671913147, + "learning_rate": 5.0804224494404225e-06, + "loss": 0.0732, + "step": 23810 + }, + { + "epoch": 1.3069154774972558, + "grad_norm": 1.1806825399398804, + "learning_rate": 5.077258960049341e-06, + "loss": 0.146, + "step": 23812 + }, + { + "epoch": 1.3070252469813393, + "grad_norm": 0.6447306275367737, + "learning_rate": 5.074096344567475e-06, + "loss": 0.0898, + "step": 23814 + }, + { + "epoch": 1.3071350164654225, + "grad_norm": 1.1008762121200562, + "learning_rate": 5.070934603133548e-06, + "loss": 0.2909, + "step": 23816 + }, + { + "epoch": 1.307244785949506, + "grad_norm": 1.5878667831420898, + "learning_rate": 5.067773735886244e-06, + "loss": 0.2741, + "step": 23818 + }, + { + "epoch": 1.3073545554335895, + "grad_norm": 0.9243111610412598, + "learning_rate": 5.064613742964216e-06, + "loss": 0.2298, + "step": 23820 + }, + { + "epoch": 1.307464324917673, + "grad_norm": 1.5540001392364502, + "learning_rate": 5.061454624506074e-06, + "loss": 0.2565, + "step": 23822 + }, + { + "epoch": 1.3075740944017564, + "grad_norm": 0.950980544090271, + "learning_rate": 5.058296380650401e-06, + "loss": 0.1528, + "step": 23824 + }, + { + "epoch": 1.3076838638858397, + "grad_norm": 0.8359047174453735, + "learning_rate": 5.055139011535723e-06, + "loss": 0.1929, + "step": 23826 + }, + { + "epoch": 1.3077936333699232, + "grad_norm": 0.8248084783554077, + "learning_rate": 5.051982517300549e-06, + "loss": 0.2106, + "step": 23828 + }, + { + "epoch": 1.3079034028540066, + "grad_norm": 1.3262176513671875, + "learning_rate": 5.048826898083331e-06, + "loss": 0.1979, + "step": 23830 + }, + { + "epoch": 1.30801317233809, + "grad_norm": 0.9878385663032532, + "learning_rate": 5.045672154022493e-06, + "loss": 0.1524, + "step": 23832 + }, + { + "epoch": 1.3081229418221734, + "grad_norm": 1.668274998664856, + "learning_rate": 5.042518285256417e-06, + "loss": 0.2494, + "step": 23834 + }, + { + "epoch": 1.3082327113062568, + "grad_norm": 1.5254573822021484, + "learning_rate": 5.0393652919234395e-06, + "loss": 0.2423, + "step": 23836 + }, + { + "epoch": 1.3083424807903403, + "grad_norm": 1.4510525465011597, + "learning_rate": 5.036213174161877e-06, + "loss": 0.2414, + "step": 23838 + }, + { + "epoch": 1.3084522502744238, + "grad_norm": 1.0447702407836914, + "learning_rate": 5.0330619321099945e-06, + "loss": 0.1118, + "step": 23840 + }, + { + "epoch": 1.308562019758507, + "grad_norm": 0.8989855051040649, + "learning_rate": 5.029911565906017e-06, + "loss": 0.1536, + "step": 23842 + }, + { + "epoch": 1.3086717892425905, + "grad_norm": 1.4491370916366577, + "learning_rate": 5.02676207568814e-06, + "loss": 0.2287, + "step": 23844 + }, + { + "epoch": 1.308781558726674, + "grad_norm": 1.0924657583236694, + "learning_rate": 5.023613461594512e-06, + "loss": 0.2112, + "step": 23846 + }, + { + "epoch": 1.3088913282107575, + "grad_norm": 1.0074573755264282, + "learning_rate": 5.02046572376324e-06, + "loss": 0.1701, + "step": 23848 + }, + { + "epoch": 1.309001097694841, + "grad_norm": 1.2231853008270264, + "learning_rate": 5.017318862332415e-06, + "loss": 0.1832, + "step": 23850 + }, + { + "epoch": 1.3091108671789242, + "grad_norm": 0.9692211151123047, + "learning_rate": 5.014172877440057e-06, + "loss": 0.0877, + "step": 23852 + }, + { + "epoch": 1.3092206366630077, + "grad_norm": 0.9551820755004883, + "learning_rate": 5.01102776922418e-06, + "loss": 0.1319, + "step": 23854 + }, + { + "epoch": 1.3093304061470912, + "grad_norm": 1.3670860528945923, + "learning_rate": 5.007883537822736e-06, + "loss": 0.2223, + "step": 23856 + }, + { + "epoch": 1.3094401756311744, + "grad_norm": 0.8855987787246704, + "learning_rate": 5.004740183373649e-06, + "loss": 0.1808, + "step": 23858 + }, + { + "epoch": 1.309549945115258, + "grad_norm": 0.9936245679855347, + "learning_rate": 5.001597706014799e-06, + "loss": 0.1425, + "step": 23860 + }, + { + "epoch": 1.3096597145993414, + "grad_norm": 1.3215302228927612, + "learning_rate": 4.998456105884025e-06, + "loss": 0.2218, + "step": 23862 + }, + { + "epoch": 1.3097694840834249, + "grad_norm": 1.9145407676696777, + "learning_rate": 4.995315383119145e-06, + "loss": 0.2729, + "step": 23864 + }, + { + "epoch": 1.3098792535675083, + "grad_norm": 1.3710832595825195, + "learning_rate": 4.992175537857924e-06, + "loss": 0.2193, + "step": 23866 + }, + { + "epoch": 1.3099890230515916, + "grad_norm": 1.1869066953659058, + "learning_rate": 4.989036570238084e-06, + "loss": 0.137, + "step": 23868 + }, + { + "epoch": 1.310098792535675, + "grad_norm": 2.7481062412261963, + "learning_rate": 4.9858984803973215e-06, + "loss": 0.2337, + "step": 23870 + }, + { + "epoch": 1.3102085620197585, + "grad_norm": 1.0801188945770264, + "learning_rate": 4.982761268473282e-06, + "loss": 0.2151, + "step": 23872 + }, + { + "epoch": 1.3103183315038418, + "grad_norm": 1.760369062423706, + "learning_rate": 4.979624934603589e-06, + "loss": 0.2627, + "step": 23874 + }, + { + "epoch": 1.3104281009879253, + "grad_norm": 0.9451481699943542, + "learning_rate": 4.976489478925811e-06, + "loss": 0.1488, + "step": 23876 + }, + { + "epoch": 1.3105378704720088, + "grad_norm": 2.4272916316986084, + "learning_rate": 4.973354901577487e-06, + "loss": 0.1879, + "step": 23878 + }, + { + "epoch": 1.3106476399560922, + "grad_norm": 0.997668445110321, + "learning_rate": 4.970221202696113e-06, + "loss": 0.1472, + "step": 23880 + }, + { + "epoch": 1.3107574094401757, + "grad_norm": 0.8983832001686096, + "learning_rate": 4.967088382419149e-06, + "loss": 0.0978, + "step": 23882 + }, + { + "epoch": 1.310867178924259, + "grad_norm": 1.1091961860656738, + "learning_rate": 4.96395644088401e-06, + "loss": 0.1341, + "step": 23884 + }, + { + "epoch": 1.3109769484083424, + "grad_norm": 0.6899910569190979, + "learning_rate": 4.960825378228082e-06, + "loss": 0.1426, + "step": 23886 + }, + { + "epoch": 1.311086717892426, + "grad_norm": 0.6947973966598511, + "learning_rate": 4.9576951945887185e-06, + "loss": 0.0842, + "step": 23888 + }, + { + "epoch": 1.3111964873765094, + "grad_norm": 1.0826239585876465, + "learning_rate": 4.954565890103219e-06, + "loss": 0.1461, + "step": 23890 + }, + { + "epoch": 1.3113062568605929, + "grad_norm": 1.2649060487747192, + "learning_rate": 4.951437464908848e-06, + "loss": 0.268, + "step": 23892 + }, + { + "epoch": 1.3114160263446761, + "grad_norm": 1.0992891788482666, + "learning_rate": 4.948309919142832e-06, + "loss": 0.1484, + "step": 23894 + }, + { + "epoch": 1.3115257958287596, + "grad_norm": 1.3371132612228394, + "learning_rate": 4.945183252942362e-06, + "loss": 0.1502, + "step": 23896 + }, + { + "epoch": 1.311635565312843, + "grad_norm": 1.2528496980667114, + "learning_rate": 4.942057466444586e-06, + "loss": 0.1841, + "step": 23898 + }, + { + "epoch": 1.3117453347969263, + "grad_norm": 0.9484922289848328, + "learning_rate": 4.9389325597866246e-06, + "loss": 0.1326, + "step": 23900 + }, + { + "epoch": 1.3118551042810098, + "grad_norm": 0.8972317576408386, + "learning_rate": 4.935808533105546e-06, + "loss": 0.2113, + "step": 23902 + }, + { + "epoch": 1.3119648737650933, + "grad_norm": 1.0167473554611206, + "learning_rate": 4.9326853865383855e-06, + "loss": 0.2798, + "step": 23904 + }, + { + "epoch": 1.3120746432491768, + "grad_norm": 0.9718087315559387, + "learning_rate": 4.929563120222141e-06, + "loss": 0.264, + "step": 23906 + }, + { + "epoch": 1.3121844127332603, + "grad_norm": 0.8806958198547363, + "learning_rate": 4.926441734293771e-06, + "loss": 0.1932, + "step": 23908 + }, + { + "epoch": 1.3122941822173435, + "grad_norm": 0.9883530139923096, + "learning_rate": 4.9233212288901845e-06, + "loss": 0.2431, + "step": 23910 + }, + { + "epoch": 1.312403951701427, + "grad_norm": 1.4837760925292969, + "learning_rate": 4.920201604148278e-06, + "loss": 0.1687, + "step": 23912 + }, + { + "epoch": 1.3125137211855105, + "grad_norm": 1.1160368919372559, + "learning_rate": 4.917082860204883e-06, + "loss": 0.1329, + "step": 23914 + }, + { + "epoch": 1.3126234906695937, + "grad_norm": 1.2928029298782349, + "learning_rate": 4.91396499719681e-06, + "loss": 0.1619, + "step": 23916 + }, + { + "epoch": 1.3127332601536772, + "grad_norm": 1.4263111352920532, + "learning_rate": 4.910848015260822e-06, + "loss": 0.224, + "step": 23918 + }, + { + "epoch": 1.3128430296377607, + "grad_norm": 0.7698242664337158, + "learning_rate": 4.907731914533642e-06, + "loss": 0.1335, + "step": 23920 + }, + { + "epoch": 1.3129527991218441, + "grad_norm": 1.175144076347351, + "learning_rate": 4.904616695151951e-06, + "loss": 0.1839, + "step": 23922 + }, + { + "epoch": 1.3130625686059276, + "grad_norm": 1.3917574882507324, + "learning_rate": 4.901502357252413e-06, + "loss": 0.1831, + "step": 23924 + }, + { + "epoch": 1.3131723380900109, + "grad_norm": 1.3287227153778076, + "learning_rate": 4.898388900971634e-06, + "loss": 0.1871, + "step": 23926 + }, + { + "epoch": 1.3132821075740944, + "grad_norm": 0.8843628168106079, + "learning_rate": 4.89527632644618e-06, + "loss": 0.1672, + "step": 23928 + }, + { + "epoch": 1.3133918770581778, + "grad_norm": 1.1868821382522583, + "learning_rate": 4.892164633812585e-06, + "loss": 0.2462, + "step": 23930 + }, + { + "epoch": 1.3135016465422613, + "grad_norm": 1.0401101112365723, + "learning_rate": 4.889053823207345e-06, + "loss": 0.1859, + "step": 23932 + }, + { + "epoch": 1.3136114160263448, + "grad_norm": 1.2726964950561523, + "learning_rate": 4.885943894766909e-06, + "loss": 0.2659, + "step": 23934 + }, + { + "epoch": 1.313721185510428, + "grad_norm": 1.420711874961853, + "learning_rate": 4.882834848627707e-06, + "loss": 0.2611, + "step": 23936 + }, + { + "epoch": 1.3138309549945115, + "grad_norm": 0.7695111632347107, + "learning_rate": 4.879726684926106e-06, + "loss": 0.1164, + "step": 23938 + }, + { + "epoch": 1.313940724478595, + "grad_norm": 1.5024917125701904, + "learning_rate": 4.876619403798452e-06, + "loss": 0.1454, + "step": 23940 + }, + { + "epoch": 1.3140504939626783, + "grad_norm": 1.3095502853393555, + "learning_rate": 4.873513005381042e-06, + "loss": 0.2554, + "step": 23942 + }, + { + "epoch": 1.3141602634467617, + "grad_norm": 0.8638771772384644, + "learning_rate": 4.870407489810131e-06, + "loss": 0.1331, + "step": 23944 + }, + { + "epoch": 1.3142700329308452, + "grad_norm": 1.3210923671722412, + "learning_rate": 4.867302857221953e-06, + "loss": 0.1408, + "step": 23946 + }, + { + "epoch": 1.3143798024149287, + "grad_norm": 0.9037335515022278, + "learning_rate": 4.864199107752685e-06, + "loss": 0.1443, + "step": 23948 + }, + { + "epoch": 1.3144895718990122, + "grad_norm": 0.7857005596160889, + "learning_rate": 4.861096241538482e-06, + "loss": 0.1532, + "step": 23950 + }, + { + "epoch": 1.3145993413830954, + "grad_norm": 0.8775054216384888, + "learning_rate": 4.857994258715448e-06, + "loss": 0.122, + "step": 23952 + }, + { + "epoch": 1.314709110867179, + "grad_norm": 1.1294033527374268, + "learning_rate": 4.8548931594196465e-06, + "loss": 0.2088, + "step": 23954 + }, + { + "epoch": 1.3148188803512624, + "grad_norm": 1.3511689901351929, + "learning_rate": 4.851792943787109e-06, + "loss": 0.2565, + "step": 23956 + }, + { + "epoch": 1.3149286498353459, + "grad_norm": 1.172079086303711, + "learning_rate": 4.848693611953825e-06, + "loss": 0.1669, + "step": 23958 + }, + { + "epoch": 1.3150384193194293, + "grad_norm": 0.9800037741661072, + "learning_rate": 4.845595164055744e-06, + "loss": 0.1849, + "step": 23960 + }, + { + "epoch": 1.3151481888035126, + "grad_norm": 1.452405333518982, + "learning_rate": 4.842497600228787e-06, + "loss": 0.2556, + "step": 23962 + }, + { + "epoch": 1.315257958287596, + "grad_norm": 1.793170690536499, + "learning_rate": 4.839400920608825e-06, + "loss": 0.3933, + "step": 23964 + }, + { + "epoch": 1.3153677277716795, + "grad_norm": 1.3178679943084717, + "learning_rate": 4.836305125331694e-06, + "loss": 0.2332, + "step": 23966 + }, + { + "epoch": 1.3154774972557628, + "grad_norm": 0.7007404565811157, + "learning_rate": 4.8332102145331875e-06, + "loss": 0.1892, + "step": 23968 + }, + { + "epoch": 1.3155872667398463, + "grad_norm": 1.6087769269943237, + "learning_rate": 4.83011618834906e-06, + "loss": 0.2305, + "step": 23970 + }, + { + "epoch": 1.3156970362239298, + "grad_norm": 1.3642092943191528, + "learning_rate": 4.827023046915041e-06, + "loss": 0.1733, + "step": 23972 + }, + { + "epoch": 1.3158068057080132, + "grad_norm": 1.4948949813842773, + "learning_rate": 4.8239307903668015e-06, + "loss": 0.2457, + "step": 23974 + }, + { + "epoch": 1.3159165751920967, + "grad_norm": 0.9922282695770264, + "learning_rate": 4.8208394188399925e-06, + "loss": 0.1337, + "step": 23976 + }, + { + "epoch": 1.31602634467618, + "grad_norm": 1.1139355897903442, + "learning_rate": 4.817748932470212e-06, + "loss": 0.2241, + "step": 23978 + }, + { + "epoch": 1.3161361141602634, + "grad_norm": 0.6774288415908813, + "learning_rate": 4.814659331393023e-06, + "loss": 0.0857, + "step": 23980 + }, + { + "epoch": 1.316245883644347, + "grad_norm": 0.9997815489768982, + "learning_rate": 4.811570615743952e-06, + "loss": 0.205, + "step": 23982 + }, + { + "epoch": 1.3163556531284302, + "grad_norm": 1.1953554153442383, + "learning_rate": 4.808482785658478e-06, + "loss": 0.1924, + "step": 23984 + }, + { + "epoch": 1.3164654226125136, + "grad_norm": 1.5491431951522827, + "learning_rate": 4.805395841272062e-06, + "loss": 0.1675, + "step": 23986 + }, + { + "epoch": 1.3165751920965971, + "grad_norm": 0.575518786907196, + "learning_rate": 4.802309782720105e-06, + "loss": 0.0943, + "step": 23988 + }, + { + "epoch": 1.3166849615806806, + "grad_norm": 1.0465850830078125, + "learning_rate": 4.799224610137976e-06, + "loss": 0.1497, + "step": 23990 + }, + { + "epoch": 1.316794731064764, + "grad_norm": 0.991252064704895, + "learning_rate": 4.796140323661008e-06, + "loss": 0.2088, + "step": 23992 + }, + { + "epoch": 1.3169045005488473, + "grad_norm": 1.070685625076294, + "learning_rate": 4.793056923424491e-06, + "loss": 0.1559, + "step": 23994 + }, + { + "epoch": 1.3170142700329308, + "grad_norm": 1.7220211029052734, + "learning_rate": 4.789974409563675e-06, + "loss": 0.2619, + "step": 23996 + }, + { + "epoch": 1.3171240395170143, + "grad_norm": 1.3093239068984985, + "learning_rate": 4.786892782213781e-06, + "loss": 0.2368, + "step": 23998 + }, + { + "epoch": 1.3172338090010978, + "grad_norm": 0.817278265953064, + "learning_rate": 4.783812041509983e-06, + "loss": 0.2814, + "step": 24000 + }, + { + "epoch": 1.3173435784851812, + "grad_norm": 1.1649715900421143, + "learning_rate": 4.780732187587414e-06, + "loss": 0.1595, + "step": 24002 + }, + { + "epoch": 1.3174533479692645, + "grad_norm": 1.074806571006775, + "learning_rate": 4.777653220581169e-06, + "loss": 0.1694, + "step": 24004 + }, + { + "epoch": 1.317563117453348, + "grad_norm": 1.3994566202163696, + "learning_rate": 4.7745751406263165e-06, + "loss": 0.1732, + "step": 24006 + }, + { + "epoch": 1.3176728869374315, + "grad_norm": 1.237486720085144, + "learning_rate": 4.77149794785787e-06, + "loss": 0.1939, + "step": 24008 + }, + { + "epoch": 1.3177826564215147, + "grad_norm": 1.3944286108016968, + "learning_rate": 4.768421642410806e-06, + "loss": 0.2096, + "step": 24010 + }, + { + "epoch": 1.3178924259055982, + "grad_norm": 2.26076340675354, + "learning_rate": 4.765346224420075e-06, + "loss": 0.135, + "step": 24012 + }, + { + "epoch": 1.3180021953896817, + "grad_norm": 0.7147594690322876, + "learning_rate": 4.762271694020579e-06, + "loss": 0.1356, + "step": 24014 + }, + { + "epoch": 1.3181119648737651, + "grad_norm": 1.3274922370910645, + "learning_rate": 4.7591980513471775e-06, + "loss": 0.2378, + "step": 24016 + }, + { + "epoch": 1.3182217343578486, + "grad_norm": 1.3529608249664307, + "learning_rate": 4.756125296534697e-06, + "loss": 0.191, + "step": 24018 + }, + { + "epoch": 1.3183315038419319, + "grad_norm": 1.0289264917373657, + "learning_rate": 4.753053429717919e-06, + "loss": 0.1334, + "step": 24020 + }, + { + "epoch": 1.3184412733260154, + "grad_norm": 1.3758149147033691, + "learning_rate": 4.749982451031601e-06, + "loss": 0.1618, + "step": 24022 + }, + { + "epoch": 1.3185510428100988, + "grad_norm": 1.6865191459655762, + "learning_rate": 4.746912360610445e-06, + "loss": 0.1568, + "step": 24024 + }, + { + "epoch": 1.318660812294182, + "grad_norm": 1.4214261770248413, + "learning_rate": 4.74384315858912e-06, + "loss": 0.1423, + "step": 24026 + }, + { + "epoch": 1.3187705817782656, + "grad_norm": 1.3876397609710693, + "learning_rate": 4.740774845102259e-06, + "loss": 0.3013, + "step": 24028 + }, + { + "epoch": 1.318880351262349, + "grad_norm": 1.3848800659179688, + "learning_rate": 4.737707420284451e-06, + "loss": 0.1477, + "step": 24030 + }, + { + "epoch": 1.3189901207464325, + "grad_norm": 1.4656909704208374, + "learning_rate": 4.734640884270242e-06, + "loss": 0.2264, + "step": 24032 + }, + { + "epoch": 1.319099890230516, + "grad_norm": 1.2451902627944946, + "learning_rate": 4.731575237194152e-06, + "loss": 0.1878, + "step": 24034 + }, + { + "epoch": 1.3192096597145992, + "grad_norm": 1.1120564937591553, + "learning_rate": 4.728510479190662e-06, + "loss": 0.2431, + "step": 24036 + }, + { + "epoch": 1.3193194291986827, + "grad_norm": 0.8772464990615845, + "learning_rate": 4.725446610394199e-06, + "loss": 0.2178, + "step": 24038 + }, + { + "epoch": 1.3194291986827662, + "grad_norm": 1.0413357019424438, + "learning_rate": 4.722383630939165e-06, + "loss": 0.147, + "step": 24040 + }, + { + "epoch": 1.3195389681668497, + "grad_norm": 1.139452338218689, + "learning_rate": 4.719321540959909e-06, + "loss": 0.1316, + "step": 24042 + }, + { + "epoch": 1.3196487376509332, + "grad_norm": 1.0594350099563599, + "learning_rate": 4.716260340590756e-06, + "loss": 0.2661, + "step": 24044 + }, + { + "epoch": 1.3197585071350164, + "grad_norm": 1.099553108215332, + "learning_rate": 4.713200029965978e-06, + "loss": 0.1332, + "step": 24046 + }, + { + "epoch": 1.3198682766191, + "grad_norm": 1.666438102722168, + "learning_rate": 4.710140609219824e-06, + "loss": 0.1605, + "step": 24048 + }, + { + "epoch": 1.3199780461031834, + "grad_norm": 0.8565510511398315, + "learning_rate": 4.7070820784864915e-06, + "loss": 0.1196, + "step": 24050 + }, + { + "epoch": 1.3200878155872666, + "grad_norm": 1.139443039894104, + "learning_rate": 4.7040244379001426e-06, + "loss": 0.1477, + "step": 24052 + }, + { + "epoch": 1.32019758507135, + "grad_norm": 1.078379511833191, + "learning_rate": 4.700967687594901e-06, + "loss": 0.1701, + "step": 24054 + }, + { + "epoch": 1.3203073545554336, + "grad_norm": 1.1617422103881836, + "learning_rate": 4.6979118277048426e-06, + "loss": 0.1344, + "step": 24056 + }, + { + "epoch": 1.320417124039517, + "grad_norm": 1.3255950212478638, + "learning_rate": 4.69485685836403e-06, + "loss": 0.2295, + "step": 24058 + }, + { + "epoch": 1.3205268935236005, + "grad_norm": 1.4289840459823608, + "learning_rate": 4.691802779706456e-06, + "loss": 0.2368, + "step": 24060 + }, + { + "epoch": 1.3206366630076838, + "grad_norm": 1.215240716934204, + "learning_rate": 4.68874959186609e-06, + "loss": 0.3377, + "step": 24062 + }, + { + "epoch": 1.3207464324917673, + "grad_norm": 1.0121140480041504, + "learning_rate": 4.6856972949768564e-06, + "loss": 0.218, + "step": 24064 + }, + { + "epoch": 1.3208562019758507, + "grad_norm": 1.0050491094589233, + "learning_rate": 4.682645889172651e-06, + "loss": 0.201, + "step": 24066 + }, + { + "epoch": 1.3209659714599342, + "grad_norm": 1.0011959075927734, + "learning_rate": 4.679595374587323e-06, + "loss": 0.1706, + "step": 24068 + }, + { + "epoch": 1.3210757409440177, + "grad_norm": 0.9711260795593262, + "learning_rate": 4.676545751354674e-06, + "loss": 0.1609, + "step": 24070 + }, + { + "epoch": 1.321185510428101, + "grad_norm": 0.9160823822021484, + "learning_rate": 4.673497019608486e-06, + "loss": 0.0955, + "step": 24072 + }, + { + "epoch": 1.3212952799121844, + "grad_norm": 1.0091995000839233, + "learning_rate": 4.670449179482489e-06, + "loss": 0.1087, + "step": 24074 + }, + { + "epoch": 1.321405049396268, + "grad_norm": 1.0022690296173096, + "learning_rate": 4.667402231110374e-06, + "loss": 0.2596, + "step": 24076 + }, + { + "epoch": 1.3215148188803512, + "grad_norm": 0.9699361324310303, + "learning_rate": 4.664356174625795e-06, + "loss": 0.1672, + "step": 24078 + }, + { + "epoch": 1.3216245883644346, + "grad_norm": 3.4241435527801514, + "learning_rate": 4.6613110101623666e-06, + "loss": 0.321, + "step": 24080 + }, + { + "epoch": 1.3217343578485181, + "grad_norm": 0.9034178256988525, + "learning_rate": 4.658266737853662e-06, + "loss": 0.1141, + "step": 24082 + }, + { + "epoch": 1.3218441273326016, + "grad_norm": 2.483285903930664, + "learning_rate": 4.6552233578332244e-06, + "loss": 0.2525, + "step": 24084 + }, + { + "epoch": 1.321953896816685, + "grad_norm": 1.6715065240859985, + "learning_rate": 4.6521808702345514e-06, + "loss": 0.1482, + "step": 24086 + }, + { + "epoch": 1.3220636663007683, + "grad_norm": 1.2999857664108276, + "learning_rate": 4.649139275191094e-06, + "loss": 0.2737, + "step": 24088 + }, + { + "epoch": 1.3221734357848518, + "grad_norm": 1.2513134479522705, + "learning_rate": 4.64609857283628e-06, + "loss": 0.217, + "step": 24090 + }, + { + "epoch": 1.3222832052689353, + "grad_norm": 1.1795434951782227, + "learning_rate": 4.643058763303479e-06, + "loss": 0.1851, + "step": 24092 + }, + { + "epoch": 1.3223929747530185, + "grad_norm": 1.1806303262710571, + "learning_rate": 4.6400198467260434e-06, + "loss": 0.2158, + "step": 24094 + }, + { + "epoch": 1.322502744237102, + "grad_norm": 0.657080352306366, + "learning_rate": 4.636981823237263e-06, + "loss": 0.1146, + "step": 24096 + }, + { + "epoch": 1.3226125137211855, + "grad_norm": 1.713627815246582, + "learning_rate": 4.633944692970413e-06, + "loss": 0.1979, + "step": 24098 + }, + { + "epoch": 1.322722283205269, + "grad_norm": 0.9723204970359802, + "learning_rate": 4.630908456058713e-06, + "loss": 0.1145, + "step": 24100 + }, + { + "epoch": 1.3228320526893524, + "grad_norm": 1.5488935708999634, + "learning_rate": 4.627873112635345e-06, + "loss": 0.2253, + "step": 24102 + }, + { + "epoch": 1.3229418221734357, + "grad_norm": 0.9803104400634766, + "learning_rate": 4.624838662833456e-06, + "loss": 0.1, + "step": 24104 + }, + { + "epoch": 1.3230515916575192, + "grad_norm": 1.0123344659805298, + "learning_rate": 4.621805106786142e-06, + "loss": 0.2292, + "step": 24106 + }, + { + "epoch": 1.3231613611416027, + "grad_norm": 0.9677920341491699, + "learning_rate": 4.618772444626484e-06, + "loss": 0.1149, + "step": 24108 + }, + { + "epoch": 1.3232711306256861, + "grad_norm": 1.030728816986084, + "learning_rate": 4.615740676487507e-06, + "loss": 0.1015, + "step": 24110 + }, + { + "epoch": 1.3233809001097696, + "grad_norm": 1.0511585474014282, + "learning_rate": 4.612709802502194e-06, + "loss": 0.2577, + "step": 24112 + }, + { + "epoch": 1.3234906695938529, + "grad_norm": 1.1598782539367676, + "learning_rate": 4.6096798228034946e-06, + "loss": 0.2102, + "step": 24114 + }, + { + "epoch": 1.3236004390779363, + "grad_norm": 1.517000675201416, + "learning_rate": 4.606650737524321e-06, + "loss": 0.2346, + "step": 24116 + }, + { + "epoch": 1.3237102085620198, + "grad_norm": 2.1837384700775146, + "learning_rate": 4.603622546797534e-06, + "loss": 0.2262, + "step": 24118 + }, + { + "epoch": 1.323819978046103, + "grad_norm": 1.1319648027420044, + "learning_rate": 4.600595250755982e-06, + "loss": 0.279, + "step": 24120 + }, + { + "epoch": 1.3239297475301866, + "grad_norm": 1.1546404361724854, + "learning_rate": 4.597568849532449e-06, + "loss": 0.1928, + "step": 24122 + }, + { + "epoch": 1.32403951701427, + "grad_norm": 1.384430170059204, + "learning_rate": 4.59454334325968e-06, + "loss": 0.1986, + "step": 24124 + }, + { + "epoch": 1.3241492864983535, + "grad_norm": 1.1120132207870483, + "learning_rate": 4.591518732070402e-06, + "loss": 0.0941, + "step": 24126 + }, + { + "epoch": 1.324259055982437, + "grad_norm": 0.7110294103622437, + "learning_rate": 4.5884950160972825e-06, + "loss": 0.0972, + "step": 24128 + }, + { + "epoch": 1.3243688254665202, + "grad_norm": 1.4895631074905396, + "learning_rate": 4.585472195472959e-06, + "loss": 0.2284, + "step": 24130 + }, + { + "epoch": 1.3244785949506037, + "grad_norm": 0.9187166690826416, + "learning_rate": 4.582450270330016e-06, + "loss": 0.1536, + "step": 24132 + }, + { + "epoch": 1.3245883644346872, + "grad_norm": 1.0313091278076172, + "learning_rate": 4.579429240801028e-06, + "loss": 0.1592, + "step": 24134 + }, + { + "epoch": 1.3246981339187704, + "grad_norm": 0.850597620010376, + "learning_rate": 4.5764091070185035e-06, + "loss": 0.1631, + "step": 24136 + }, + { + "epoch": 1.324807903402854, + "grad_norm": 0.9220597147941589, + "learning_rate": 4.573389869114919e-06, + "loss": 0.1261, + "step": 24138 + }, + { + "epoch": 1.3249176728869374, + "grad_norm": 1.0859458446502686, + "learning_rate": 4.570371527222716e-06, + "loss": 0.1584, + "step": 24140 + }, + { + "epoch": 1.3250274423710209, + "grad_norm": 1.5029306411743164, + "learning_rate": 4.567354081474287e-06, + "loss": 0.1413, + "step": 24142 + }, + { + "epoch": 1.3251372118551044, + "grad_norm": 1.6081899404525757, + "learning_rate": 4.564337532002002e-06, + "loss": 0.3134, + "step": 24144 + }, + { + "epoch": 1.3252469813391876, + "grad_norm": 1.1538290977478027, + "learning_rate": 4.561321878938177e-06, + "loss": 0.2085, + "step": 24146 + }, + { + "epoch": 1.325356750823271, + "grad_norm": 0.9225503206253052, + "learning_rate": 4.558307122415092e-06, + "loss": 0.1071, + "step": 24148 + }, + { + "epoch": 1.3254665203073546, + "grad_norm": 1.059936285018921, + "learning_rate": 4.5552932625649944e-06, + "loss": 0.1587, + "step": 24150 + }, + { + "epoch": 1.325576289791438, + "grad_norm": 1.5777184963226318, + "learning_rate": 4.5522802995200785e-06, + "loss": 0.145, + "step": 24152 + }, + { + "epoch": 1.3256860592755215, + "grad_norm": 1.263278841972351, + "learning_rate": 4.549268233412507e-06, + "loss": 0.2149, + "step": 24154 + }, + { + "epoch": 1.3257958287596048, + "grad_norm": 1.1571640968322754, + "learning_rate": 4.54625706437441e-06, + "loss": 0.1525, + "step": 24156 + }, + { + "epoch": 1.3259055982436883, + "grad_norm": 1.3535066843032837, + "learning_rate": 4.543246792537878e-06, + "loss": 0.1913, + "step": 24158 + }, + { + "epoch": 1.3260153677277717, + "grad_norm": 1.0370092391967773, + "learning_rate": 4.540237418034948e-06, + "loss": 0.2142, + "step": 24160 + }, + { + "epoch": 1.326125137211855, + "grad_norm": 0.7231248021125793, + "learning_rate": 4.537228940997626e-06, + "loss": 0.162, + "step": 24162 + }, + { + "epoch": 1.3262349066959385, + "grad_norm": 1.0021847486495972, + "learning_rate": 4.5342213615578834e-06, + "loss": 0.1126, + "step": 24164 + }, + { + "epoch": 1.326344676180022, + "grad_norm": 1.149411678314209, + "learning_rate": 4.53121467984764e-06, + "loss": 0.1728, + "step": 24166 + }, + { + "epoch": 1.3264544456641054, + "grad_norm": 1.4058623313903809, + "learning_rate": 4.528208895998784e-06, + "loss": 0.1356, + "step": 24168 + }, + { + "epoch": 1.326564215148189, + "grad_norm": 1.071162462234497, + "learning_rate": 4.525204010143172e-06, + "loss": 0.1953, + "step": 24170 + }, + { + "epoch": 1.3266739846322722, + "grad_norm": 1.0048936605453491, + "learning_rate": 4.522200022412609e-06, + "loss": 0.2389, + "step": 24172 + }, + { + "epoch": 1.3267837541163556, + "grad_norm": 1.0246403217315674, + "learning_rate": 4.5191969329388625e-06, + "loss": 0.1731, + "step": 24174 + }, + { + "epoch": 1.326893523600439, + "grad_norm": 1.2368513345718384, + "learning_rate": 4.5161947418536656e-06, + "loss": 0.1552, + "step": 24176 + }, + { + "epoch": 1.3270032930845226, + "grad_norm": 1.1464347839355469, + "learning_rate": 4.513193449288708e-06, + "loss": 0.2135, + "step": 24178 + }, + { + "epoch": 1.327113062568606, + "grad_norm": 0.8827232718467712, + "learning_rate": 4.510193055375633e-06, + "loss": 0.1197, + "step": 24180 + }, + { + "epoch": 1.3272228320526893, + "grad_norm": 1.5175777673721313, + "learning_rate": 4.50719356024607e-06, + "loss": 0.2726, + "step": 24182 + }, + { + "epoch": 1.3273326015367728, + "grad_norm": 1.1605594158172607, + "learning_rate": 4.50419496403158e-06, + "loss": 0.1738, + "step": 24184 + }, + { + "epoch": 1.3274423710208563, + "grad_norm": 1.4887847900390625, + "learning_rate": 4.501197266863691e-06, + "loss": 0.2317, + "step": 24186 + }, + { + "epoch": 1.3275521405049395, + "grad_norm": 1.5391385555267334, + "learning_rate": 4.498200468873912e-06, + "loss": 0.2309, + "step": 24188 + }, + { + "epoch": 1.327661909989023, + "grad_norm": 1.2255005836486816, + "learning_rate": 4.495204570193687e-06, + "loss": 0.1809, + "step": 24190 + }, + { + "epoch": 1.3277716794731065, + "grad_norm": 1.3205136060714722, + "learning_rate": 4.492209570954428e-06, + "loss": 0.1867, + "step": 24192 + }, + { + "epoch": 1.32788144895719, + "grad_norm": 1.147684931755066, + "learning_rate": 4.489215471287522e-06, + "loss": 0.2326, + "step": 24194 + }, + { + "epoch": 1.3279912184412734, + "grad_norm": 1.5801656246185303, + "learning_rate": 4.486222271324297e-06, + "loss": 0.1995, + "step": 24196 + }, + { + "epoch": 1.3281009879253567, + "grad_norm": 1.1339629888534546, + "learning_rate": 4.483229971196054e-06, + "loss": 0.1573, + "step": 24198 + }, + { + "epoch": 1.3282107574094402, + "grad_norm": 1.191543459892273, + "learning_rate": 4.480238571034042e-06, + "loss": 0.2071, + "step": 24200 + }, + { + "epoch": 1.3283205268935236, + "grad_norm": 1.0877708196640015, + "learning_rate": 4.477248070969486e-06, + "loss": 0.166, + "step": 24202 + }, + { + "epoch": 1.328430296377607, + "grad_norm": 1.443060040473938, + "learning_rate": 4.474258471133555e-06, + "loss": 0.1496, + "step": 24204 + }, + { + "epoch": 1.3285400658616904, + "grad_norm": 1.2410392761230469, + "learning_rate": 4.4712697716574e-06, + "loss": 0.1959, + "step": 24206 + }, + { + "epoch": 1.3286498353457739, + "grad_norm": 1.5989184379577637, + "learning_rate": 4.468281972672114e-06, + "loss": 0.1771, + "step": 24208 + }, + { + "epoch": 1.3287596048298573, + "grad_norm": 1.2971152067184448, + "learning_rate": 4.465295074308756e-06, + "loss": 0.1589, + "step": 24210 + }, + { + "epoch": 1.3288693743139408, + "grad_norm": 1.007961630821228, + "learning_rate": 4.4623090766983456e-06, + "loss": 0.1736, + "step": 24212 + }, + { + "epoch": 1.328979143798024, + "grad_norm": 1.4114140272140503, + "learning_rate": 4.459323979971863e-06, + "loss": 0.1712, + "step": 24214 + }, + { + "epoch": 1.3290889132821075, + "grad_norm": 0.9281047582626343, + "learning_rate": 4.456339784260247e-06, + "loss": 0.1743, + "step": 24216 + }, + { + "epoch": 1.329198682766191, + "grad_norm": 1.023132085800171, + "learning_rate": 4.453356489694399e-06, + "loss": 0.1472, + "step": 24218 + }, + { + "epoch": 1.3293084522502745, + "grad_norm": 0.9654538035392761, + "learning_rate": 4.450374096405194e-06, + "loss": 0.2046, + "step": 24220 + }, + { + "epoch": 1.329418221734358, + "grad_norm": 0.9202544093132019, + "learning_rate": 4.4473926045234425e-06, + "loss": 0.1162, + "step": 24222 + }, + { + "epoch": 1.3295279912184412, + "grad_norm": 1.4956307411193848, + "learning_rate": 4.4444120141799275e-06, + "loss": 0.2325, + "step": 24224 + }, + { + "epoch": 1.3296377607025247, + "grad_norm": 1.2410635948181152, + "learning_rate": 4.441432325505399e-06, + "loss": 0.1782, + "step": 24226 + }, + { + "epoch": 1.3297475301866082, + "grad_norm": 1.2251121997833252, + "learning_rate": 4.438453538630546e-06, + "loss": 0.2232, + "step": 24228 + }, + { + "epoch": 1.3298572996706914, + "grad_norm": 1.687294602394104, + "learning_rate": 4.43547565368605e-06, + "loss": 0.1881, + "step": 24230 + }, + { + "epoch": 1.329967069154775, + "grad_norm": 0.7205924391746521, + "learning_rate": 4.432498670802529e-06, + "loss": 0.1845, + "step": 24232 + }, + { + "epoch": 1.3300768386388584, + "grad_norm": 1.7685058116912842, + "learning_rate": 4.429522590110569e-06, + "loss": 0.2183, + "step": 24234 + }, + { + "epoch": 1.3301866081229419, + "grad_norm": 0.9663395285606384, + "learning_rate": 4.426547411740712e-06, + "loss": 0.1651, + "step": 24236 + }, + { + "epoch": 1.3302963776070253, + "grad_norm": 1.173393726348877, + "learning_rate": 4.423573135823464e-06, + "loss": 0.2061, + "step": 24238 + }, + { + "epoch": 1.3304061470911086, + "grad_norm": 1.2506417036056519, + "learning_rate": 4.420599762489291e-06, + "loss": 0.2032, + "step": 24240 + }, + { + "epoch": 1.330515916575192, + "grad_norm": 1.7695624828338623, + "learning_rate": 4.417627291868625e-06, + "loss": 0.2277, + "step": 24242 + }, + { + "epoch": 1.3306256860592756, + "grad_norm": 0.9502527117729187, + "learning_rate": 4.414655724091854e-06, + "loss": 0.2693, + "step": 24244 + }, + { + "epoch": 1.3307354555433588, + "grad_norm": 1.1890915632247925, + "learning_rate": 4.411685059289314e-06, + "loss": 0.1632, + "step": 24246 + }, + { + "epoch": 1.3308452250274423, + "grad_norm": 1.314743161201477, + "learning_rate": 4.408715297591326e-06, + "loss": 0.235, + "step": 24248 + }, + { + "epoch": 1.3309549945115258, + "grad_norm": 1.1638237237930298, + "learning_rate": 4.405746439128153e-06, + "loss": 0.2715, + "step": 24250 + }, + { + "epoch": 1.3310647639956092, + "grad_norm": 1.0100369453430176, + "learning_rate": 4.402778484030024e-06, + "loss": 0.1218, + "step": 24252 + }, + { + "epoch": 1.3311745334796927, + "grad_norm": 1.8394379615783691, + "learning_rate": 4.399811432427123e-06, + "loss": 0.2653, + "step": 24254 + }, + { + "epoch": 1.331284302963776, + "grad_norm": 1.1825472116470337, + "learning_rate": 4.396845284449608e-06, + "loss": 0.191, + "step": 24256 + }, + { + "epoch": 1.3313940724478595, + "grad_norm": 0.949614942073822, + "learning_rate": 4.393880040227585e-06, + "loss": 0.1612, + "step": 24258 + }, + { + "epoch": 1.331503841931943, + "grad_norm": 1.6997607946395874, + "learning_rate": 4.390915699891127e-06, + "loss": 0.2231, + "step": 24260 + }, + { + "epoch": 1.3316136114160264, + "grad_norm": 0.9649055600166321, + "learning_rate": 4.387952263570261e-06, + "loss": 0.2178, + "step": 24262 + }, + { + "epoch": 1.3317233809001099, + "grad_norm": 1.3128737211227417, + "learning_rate": 4.384989731394979e-06, + "loss": 0.1899, + "step": 24264 + }, + { + "epoch": 1.3318331503841931, + "grad_norm": 0.9686983823776245, + "learning_rate": 4.382028103495223e-06, + "loss": 0.1453, + "step": 24266 + }, + { + "epoch": 1.3319429198682766, + "grad_norm": 0.8943983912467957, + "learning_rate": 4.37906738000092e-06, + "loss": 0.1671, + "step": 24268 + }, + { + "epoch": 1.33205268935236, + "grad_norm": 1.6961795091629028, + "learning_rate": 4.376107561041937e-06, + "loss": 0.1752, + "step": 24270 + }, + { + "epoch": 1.3321624588364434, + "grad_norm": 1.0856343507766724, + "learning_rate": 4.373148646748104e-06, + "loss": 0.1956, + "step": 24272 + }, + { + "epoch": 1.3322722283205268, + "grad_norm": 1.1743844747543335, + "learning_rate": 4.370190637249213e-06, + "loss": 0.1761, + "step": 24274 + }, + { + "epoch": 1.3323819978046103, + "grad_norm": 1.1684168577194214, + "learning_rate": 4.367233532675011e-06, + "loss": 0.2346, + "step": 24276 + }, + { + "epoch": 1.3324917672886938, + "grad_norm": 1.201521873474121, + "learning_rate": 4.36427733315522e-06, + "loss": 0.2211, + "step": 24278 + }, + { + "epoch": 1.3326015367727773, + "grad_norm": 1.333533525466919, + "learning_rate": 4.361322038819515e-06, + "loss": 0.2002, + "step": 24280 + }, + { + "epoch": 1.3327113062568605, + "grad_norm": 0.9098895788192749, + "learning_rate": 4.358367649797529e-06, + "loss": 0.2448, + "step": 24282 + }, + { + "epoch": 1.332821075740944, + "grad_norm": 1.6085678339004517, + "learning_rate": 4.355414166218852e-06, + "loss": 0.2122, + "step": 24284 + }, + { + "epoch": 1.3329308452250275, + "grad_norm": 0.9698820114135742, + "learning_rate": 4.352461588213036e-06, + "loss": 0.1417, + "step": 24286 + }, + { + "epoch": 1.333040614709111, + "grad_norm": 1.7485911846160889, + "learning_rate": 4.3495099159096025e-06, + "loss": 0.1331, + "step": 24288 + }, + { + "epoch": 1.3331503841931944, + "grad_norm": 1.004012107849121, + "learning_rate": 4.346559149438015e-06, + "loss": 0.1952, + "step": 24290 + }, + { + "epoch": 1.3332601536772777, + "grad_norm": 1.8045719861984253, + "learning_rate": 4.343609288927722e-06, + "loss": 0.2281, + "step": 24292 + }, + { + "epoch": 1.3333699231613612, + "grad_norm": 0.8950126767158508, + "learning_rate": 4.340660334508115e-06, + "loss": 0.142, + "step": 24294 + }, + { + "epoch": 1.3334796926454446, + "grad_norm": 1.1587237119674683, + "learning_rate": 4.337712286308546e-06, + "loss": 0.1408, + "step": 24296 + }, + { + "epoch": 1.333589462129528, + "grad_norm": 1.2083708047866821, + "learning_rate": 4.334765144458333e-06, + "loss": 0.1842, + "step": 24298 + }, + { + "epoch": 1.3336992316136114, + "grad_norm": 1.2249212265014648, + "learning_rate": 4.331818909086752e-06, + "loss": 0.2386, + "step": 24300 + }, + { + "epoch": 1.3338090010976948, + "grad_norm": 1.3491860628128052, + "learning_rate": 4.328873580323034e-06, + "loss": 0.217, + "step": 24302 + }, + { + "epoch": 1.3339187705817783, + "grad_norm": 1.4385184049606323, + "learning_rate": 4.325929158296385e-06, + "loss": 0.1382, + "step": 24304 + }, + { + "epoch": 1.3340285400658618, + "grad_norm": 1.2395132780075073, + "learning_rate": 4.322985643135952e-06, + "loss": 0.2287, + "step": 24306 + }, + { + "epoch": 1.334138309549945, + "grad_norm": 0.9235969185829163, + "learning_rate": 4.320043034970863e-06, + "loss": 0.1749, + "step": 24308 + }, + { + "epoch": 1.3342480790340285, + "grad_norm": 1.294796347618103, + "learning_rate": 4.317101333930191e-06, + "loss": 0.1657, + "step": 24310 + }, + { + "epoch": 1.334357848518112, + "grad_norm": 2.6191749572753906, + "learning_rate": 4.314160540142972e-06, + "loss": 0.2345, + "step": 24312 + }, + { + "epoch": 1.3344676180021953, + "grad_norm": 1.5241650342941284, + "learning_rate": 4.311220653738202e-06, + "loss": 0.2208, + "step": 24314 + }, + { + "epoch": 1.3345773874862787, + "grad_norm": 1.0333937406539917, + "learning_rate": 4.308281674844836e-06, + "loss": 0.1853, + "step": 24316 + }, + { + "epoch": 1.3346871569703622, + "grad_norm": 0.610299289226532, + "learning_rate": 4.305343603591802e-06, + "loss": 0.1484, + "step": 24318 + }, + { + "epoch": 1.3347969264544457, + "grad_norm": 1.948992371559143, + "learning_rate": 4.302406440107973e-06, + "loss": 0.2373, + "step": 24320 + }, + { + "epoch": 1.3349066959385292, + "grad_norm": 0.8603988289833069, + "learning_rate": 4.299470184522189e-06, + "loss": 0.2714, + "step": 24322 + }, + { + "epoch": 1.3350164654226124, + "grad_norm": 0.9795128107070923, + "learning_rate": 4.296534836963245e-06, + "loss": 0.1889, + "step": 24324 + }, + { + "epoch": 1.335126234906696, + "grad_norm": 0.9065436720848083, + "learning_rate": 4.293600397559897e-06, + "loss": 0.1661, + "step": 24326 + }, + { + "epoch": 1.3352360043907794, + "grad_norm": 1.1657428741455078, + "learning_rate": 4.290666866440873e-06, + "loss": 0.2654, + "step": 24328 + }, + { + "epoch": 1.3353457738748629, + "grad_norm": 1.5397062301635742, + "learning_rate": 4.287734243734848e-06, + "loss": 0.2071, + "step": 24330 + }, + { + "epoch": 1.3354555433589463, + "grad_norm": 0.8644483685493469, + "learning_rate": 4.284802529570462e-06, + "loss": 0.151, + "step": 24332 + }, + { + "epoch": 1.3355653128430296, + "grad_norm": 0.9871540069580078, + "learning_rate": 4.281871724076311e-06, + "loss": 0.162, + "step": 24334 + }, + { + "epoch": 1.335675082327113, + "grad_norm": 1.2505099773406982, + "learning_rate": 4.278941827380953e-06, + "loss": 0.2018, + "step": 24336 + }, + { + "epoch": 1.3357848518111965, + "grad_norm": 2.845838785171509, + "learning_rate": 4.276012839612917e-06, + "loss": 0.2072, + "step": 24338 + }, + { + "epoch": 1.3358946212952798, + "grad_norm": 1.3989380598068237, + "learning_rate": 4.27308476090067e-06, + "loss": 0.2912, + "step": 24340 + }, + { + "epoch": 1.3360043907793633, + "grad_norm": 1.233127236366272, + "learning_rate": 4.270157591372664e-06, + "loss": 0.1687, + "step": 24342 + }, + { + "epoch": 1.3361141602634468, + "grad_norm": 1.183792233467102, + "learning_rate": 4.267231331157298e-06, + "loss": 0.1037, + "step": 24344 + }, + { + "epoch": 1.3362239297475302, + "grad_norm": 1.1962990760803223, + "learning_rate": 4.264305980382927e-06, + "loss": 0.1663, + "step": 24346 + }, + { + "epoch": 1.3363336992316137, + "grad_norm": 1.049517035484314, + "learning_rate": 4.26138153917787e-06, + "loss": 0.1179, + "step": 24348 + }, + { + "epoch": 1.336443468715697, + "grad_norm": 0.730621874332428, + "learning_rate": 4.258458007670413e-06, + "loss": 0.2177, + "step": 24350 + }, + { + "epoch": 1.3365532381997804, + "grad_norm": 1.5578571557998657, + "learning_rate": 4.255535385988785e-06, + "loss": 0.2053, + "step": 24352 + }, + { + "epoch": 1.336663007683864, + "grad_norm": 0.9415145516395569, + "learning_rate": 4.252613674261202e-06, + "loss": 0.2165, + "step": 24354 + }, + { + "epoch": 1.3367727771679472, + "grad_norm": 1.1276663541793823, + "learning_rate": 4.2496928726158155e-06, + "loss": 0.1244, + "step": 24356 + }, + { + "epoch": 1.3368825466520307, + "grad_norm": 1.5302811861038208, + "learning_rate": 4.246772981180749e-06, + "loss": 0.1724, + "step": 24358 + }, + { + "epoch": 1.3369923161361141, + "grad_norm": 0.9840185046195984, + "learning_rate": 4.2438540000840826e-06, + "loss": 0.0846, + "step": 24360 + }, + { + "epoch": 1.3371020856201976, + "grad_norm": 1.069810152053833, + "learning_rate": 4.240935929453852e-06, + "loss": 0.173, + "step": 24362 + }, + { + "epoch": 1.337211855104281, + "grad_norm": 1.0172481536865234, + "learning_rate": 4.238018769418067e-06, + "loss": 0.2129, + "step": 24364 + }, + { + "epoch": 1.3373216245883643, + "grad_norm": 0.9087870121002197, + "learning_rate": 4.235102520104681e-06, + "loss": 0.1353, + "step": 24366 + }, + { + "epoch": 1.3374313940724478, + "grad_norm": 1.0686419010162354, + "learning_rate": 4.232187181641622e-06, + "loss": 0.2396, + "step": 24368 + }, + { + "epoch": 1.3375411635565313, + "grad_norm": 1.3456130027770996, + "learning_rate": 4.229272754156771e-06, + "loss": 0.2534, + "step": 24370 + }, + { + "epoch": 1.3376509330406148, + "grad_norm": 1.1034433841705322, + "learning_rate": 4.2263592377779635e-06, + "loss": 0.1477, + "step": 24372 + }, + { + "epoch": 1.3377607025246983, + "grad_norm": 1.1692465543746948, + "learning_rate": 4.223446632633002e-06, + "loss": 0.1501, + "step": 24374 + }, + { + "epoch": 1.3378704720087815, + "grad_norm": 0.7894482612609863, + "learning_rate": 4.2205349388496435e-06, + "loss": 0.1133, + "step": 24376 + }, + { + "epoch": 1.337980241492865, + "grad_norm": 1.350296139717102, + "learning_rate": 4.2176241565556215e-06, + "loss": 0.2827, + "step": 24378 + }, + { + "epoch": 1.3380900109769485, + "grad_norm": 1.133414387702942, + "learning_rate": 4.21471428587861e-06, + "loss": 0.2173, + "step": 24380 + }, + { + "epoch": 1.3381997804610317, + "grad_norm": 1.9193333387374878, + "learning_rate": 4.211805326946247e-06, + "loss": 0.1626, + "step": 24382 + }, + { + "epoch": 1.3383095499451152, + "grad_norm": 0.8650577664375305, + "learning_rate": 4.20889727988614e-06, + "loss": 0.2429, + "step": 24384 + }, + { + "epoch": 1.3384193194291987, + "grad_norm": 1.5100544691085815, + "learning_rate": 4.205990144825844e-06, + "loss": 0.2659, + "step": 24386 + }, + { + "epoch": 1.3385290889132822, + "grad_norm": 1.7247694730758667, + "learning_rate": 4.203083921892881e-06, + "loss": 0.1644, + "step": 24388 + }, + { + "epoch": 1.3386388583973656, + "grad_norm": 0.9616646766662598, + "learning_rate": 4.200178611214736e-06, + "loss": 0.1906, + "step": 24390 + }, + { + "epoch": 1.3387486278814489, + "grad_norm": 0.9760344624519348, + "learning_rate": 4.197274212918853e-06, + "loss": 0.1171, + "step": 24392 + }, + { + "epoch": 1.3388583973655324, + "grad_norm": 1.2719979286193848, + "learning_rate": 4.194370727132627e-06, + "loss": 0.1404, + "step": 24394 + }, + { + "epoch": 1.3389681668496158, + "grad_norm": 1.3248647451400757, + "learning_rate": 4.191468153983419e-06, + "loss": 0.1742, + "step": 24396 + }, + { + "epoch": 1.3390779363336993, + "grad_norm": 1.1546847820281982, + "learning_rate": 4.18856649359855e-06, + "loss": 0.2522, + "step": 24398 + }, + { + "epoch": 1.3391877058177828, + "grad_norm": 0.9896048903465271, + "learning_rate": 4.185665746105305e-06, + "loss": 0.1444, + "step": 24400 + }, + { + "epoch": 1.339297475301866, + "grad_norm": 1.1092828512191772, + "learning_rate": 4.182765911630921e-06, + "loss": 0.1645, + "step": 24402 + }, + { + "epoch": 1.3394072447859495, + "grad_norm": 0.967924952507019, + "learning_rate": 4.179866990302605e-06, + "loss": 0.253, + "step": 24404 + }, + { + "epoch": 1.339517014270033, + "grad_norm": 1.0165975093841553, + "learning_rate": 4.176968982247514e-06, + "loss": 0.1511, + "step": 24406 + }, + { + "epoch": 1.3396267837541163, + "grad_norm": 1.161014199256897, + "learning_rate": 4.174071887592768e-06, + "loss": 0.1403, + "step": 24408 + }, + { + "epoch": 1.3397365532381997, + "grad_norm": 1.3130981922149658, + "learning_rate": 4.171175706465449e-06, + "loss": 0.1855, + "step": 24410 + }, + { + "epoch": 1.3398463227222832, + "grad_norm": 1.0214314460754395, + "learning_rate": 4.168280438992595e-06, + "loss": 0.1632, + "step": 24412 + }, + { + "epoch": 1.3399560922063667, + "grad_norm": 1.5437718629837036, + "learning_rate": 4.165386085301212e-06, + "loss": 0.2156, + "step": 24414 + }, + { + "epoch": 1.3400658616904502, + "grad_norm": 0.9528448581695557, + "learning_rate": 4.162492645518257e-06, + "loss": 0.1102, + "step": 24416 + }, + { + "epoch": 1.3401756311745334, + "grad_norm": 0.9696421027183533, + "learning_rate": 4.159600119770651e-06, + "loss": 0.1967, + "step": 24418 + }, + { + "epoch": 1.340285400658617, + "grad_norm": 1.1655305624008179, + "learning_rate": 4.156708508185278e-06, + "loss": 0.146, + "step": 24420 + }, + { + "epoch": 1.3403951701427004, + "grad_norm": 1.3411773443222046, + "learning_rate": 4.153817810888971e-06, + "loss": 0.2155, + "step": 24422 + }, + { + "epoch": 1.3405049396267836, + "grad_norm": 0.9352236986160278, + "learning_rate": 4.15092802800853e-06, + "loss": 0.1858, + "step": 24424 + }, + { + "epoch": 1.340614709110867, + "grad_norm": 0.7885242700576782, + "learning_rate": 4.1480391596707215e-06, + "loss": 0.168, + "step": 24426 + }, + { + "epoch": 1.3407244785949506, + "grad_norm": 1.0639941692352295, + "learning_rate": 4.14515120600226e-06, + "loss": 0.2028, + "step": 24428 + }, + { + "epoch": 1.340834248079034, + "grad_norm": 2.3371403217315674, + "learning_rate": 4.1422641671298335e-06, + "loss": 0.1525, + "step": 24430 + }, + { + "epoch": 1.3409440175631175, + "grad_norm": 1.0134456157684326, + "learning_rate": 4.139378043180078e-06, + "loss": 0.1443, + "step": 24432 + }, + { + "epoch": 1.3410537870472008, + "grad_norm": 1.3812296390533447, + "learning_rate": 4.13649283427959e-06, + "loss": 0.2189, + "step": 24434 + }, + { + "epoch": 1.3411635565312843, + "grad_norm": 1.6623270511627197, + "learning_rate": 4.133608540554932e-06, + "loss": 0.186, + "step": 24436 + }, + { + "epoch": 1.3412733260153678, + "grad_norm": 0.8991780877113342, + "learning_rate": 4.130725162132612e-06, + "loss": 0.1412, + "step": 24438 + }, + { + "epoch": 1.3413830954994512, + "grad_norm": 1.1184110641479492, + "learning_rate": 4.127842699139128e-06, + "loss": 0.1624, + "step": 24440 + }, + { + "epoch": 1.3414928649835347, + "grad_norm": 1.037178874015808, + "learning_rate": 4.124961151700907e-06, + "loss": 0.1812, + "step": 24442 + }, + { + "epoch": 1.341602634467618, + "grad_norm": 0.9837933778762817, + "learning_rate": 4.1220805199443545e-06, + "loss": 0.1504, + "step": 24444 + }, + { + "epoch": 1.3417124039517014, + "grad_norm": 1.299500584602356, + "learning_rate": 4.1192008039958235e-06, + "loss": 0.1683, + "step": 24446 + }, + { + "epoch": 1.341822173435785, + "grad_norm": 0.9119054079055786, + "learning_rate": 4.116322003981627e-06, + "loss": 0.1788, + "step": 24448 + }, + { + "epoch": 1.3419319429198682, + "grad_norm": 0.9068105816841125, + "learning_rate": 4.1134441200280565e-06, + "loss": 0.1324, + "step": 24450 + }, + { + "epoch": 1.3420417124039516, + "grad_norm": 1.096848726272583, + "learning_rate": 4.1105671522613464e-06, + "loss": 0.2399, + "step": 24452 + }, + { + "epoch": 1.3421514818880351, + "grad_norm": 1.230672001838684, + "learning_rate": 4.10769110080769e-06, + "loss": 0.2306, + "step": 24454 + }, + { + "epoch": 1.3422612513721186, + "grad_norm": 1.527328610420227, + "learning_rate": 4.104815965793249e-06, + "loss": 0.1572, + "step": 24456 + }, + { + "epoch": 1.342371020856202, + "grad_norm": 0.938847005367279, + "learning_rate": 4.101941747344132e-06, + "loss": 0.2108, + "step": 24458 + }, + { + "epoch": 1.3424807903402853, + "grad_norm": 0.942038893699646, + "learning_rate": 4.099068445586432e-06, + "loss": 0.1659, + "step": 24460 + }, + { + "epoch": 1.3425905598243688, + "grad_norm": 1.5510683059692383, + "learning_rate": 4.096196060646168e-06, + "loss": 0.2054, + "step": 24462 + }, + { + "epoch": 1.3427003293084523, + "grad_norm": 1.305490255355835, + "learning_rate": 4.0933245926493555e-06, + "loss": 0.129, + "step": 24464 + }, + { + "epoch": 1.3428100987925355, + "grad_norm": 1.154881477355957, + "learning_rate": 4.090454041721942e-06, + "loss": 0.1564, + "step": 24466 + }, + { + "epoch": 1.342919868276619, + "grad_norm": 1.0465236902236938, + "learning_rate": 4.087584407989845e-06, + "loss": 0.1194, + "step": 24468 + }, + { + "epoch": 1.3430296377607025, + "grad_norm": 1.2417266368865967, + "learning_rate": 4.084715691578939e-06, + "loss": 0.178, + "step": 24470 + }, + { + "epoch": 1.343139407244786, + "grad_norm": 1.5814549922943115, + "learning_rate": 4.081847892615062e-06, + "loss": 0.1198, + "step": 24472 + }, + { + "epoch": 1.3432491767288695, + "grad_norm": 1.8697961568832397, + "learning_rate": 4.0789810112240005e-06, + "loss": 0.1861, + "step": 24474 + }, + { + "epoch": 1.3433589462129527, + "grad_norm": 1.2516237497329712, + "learning_rate": 4.076115047531526e-06, + "loss": 0.1967, + "step": 24476 + }, + { + "epoch": 1.3434687156970362, + "grad_norm": 0.8283421397209167, + "learning_rate": 4.073250001663345e-06, + "loss": 0.123, + "step": 24478 + }, + { + "epoch": 1.3435784851811197, + "grad_norm": 1.0833951234817505, + "learning_rate": 4.0703858737451365e-06, + "loss": 0.1844, + "step": 24480 + }, + { + "epoch": 1.3436882546652031, + "grad_norm": 1.0240237712860107, + "learning_rate": 4.067522663902529e-06, + "loss": 0.1846, + "step": 24482 + }, + { + "epoch": 1.3437980241492866, + "grad_norm": 1.0477408170700073, + "learning_rate": 4.06466037226112e-06, + "loss": 0.239, + "step": 24484 + }, + { + "epoch": 1.3439077936333699, + "grad_norm": 1.3664398193359375, + "learning_rate": 4.061798998946459e-06, + "loss": 0.18, + "step": 24486 + }, + { + "epoch": 1.3440175631174534, + "grad_norm": 0.9990288019180298, + "learning_rate": 4.058938544084065e-06, + "loss": 0.1803, + "step": 24488 + }, + { + "epoch": 1.3441273326015368, + "grad_norm": 1.7355740070343018, + "learning_rate": 4.056079007799415e-06, + "loss": 0.2521, + "step": 24490 + }, + { + "epoch": 1.34423710208562, + "grad_norm": 1.2414687871932983, + "learning_rate": 4.053220390217941e-06, + "loss": 0.1554, + "step": 24492 + }, + { + "epoch": 1.3443468715697036, + "grad_norm": 0.8591800332069397, + "learning_rate": 4.050362691465032e-06, + "loss": 0.1452, + "step": 24494 + }, + { + "epoch": 1.344456641053787, + "grad_norm": 1.4372341632843018, + "learning_rate": 4.047505911666044e-06, + "loss": 0.2282, + "step": 24496 + }, + { + "epoch": 1.3445664105378705, + "grad_norm": 1.286455512046814, + "learning_rate": 4.04465005094628e-06, + "loss": 0.1864, + "step": 24498 + }, + { + "epoch": 1.344676180021954, + "grad_norm": 1.2489323616027832, + "learning_rate": 4.041795109431026e-06, + "loss": 0.1285, + "step": 24500 + }, + { + "epoch": 1.3447859495060372, + "grad_norm": 1.0980077981948853, + "learning_rate": 4.038941087245507e-06, + "loss": 0.2448, + "step": 24502 + }, + { + "epoch": 1.3448957189901207, + "grad_norm": 1.2506109476089478, + "learning_rate": 4.036087984514916e-06, + "loss": 0.1347, + "step": 24504 + }, + { + "epoch": 1.3450054884742042, + "grad_norm": 1.0317400693893433, + "learning_rate": 4.0332358013644016e-06, + "loss": 0.1702, + "step": 24506 + }, + { + "epoch": 1.3451152579582875, + "grad_norm": 1.5061907768249512, + "learning_rate": 4.030384537919077e-06, + "loss": 0.2477, + "step": 24508 + }, + { + "epoch": 1.3452250274423712, + "grad_norm": 0.9649812579154968, + "learning_rate": 4.027534194304005e-06, + "loss": 0.1737, + "step": 24510 + }, + { + "epoch": 1.3453347969264544, + "grad_norm": 1.084213137626648, + "learning_rate": 4.02468477064423e-06, + "loss": 0.2531, + "step": 24512 + }, + { + "epoch": 1.345444566410538, + "grad_norm": 1.2763932943344116, + "learning_rate": 4.0218362670647325e-06, + "loss": 0.1965, + "step": 24514 + }, + { + "epoch": 1.3455543358946214, + "grad_norm": 0.92915940284729, + "learning_rate": 4.018988683690461e-06, + "loss": 0.1266, + "step": 24516 + }, + { + "epoch": 1.3456641053787046, + "grad_norm": 0.9896421432495117, + "learning_rate": 4.016142020646324e-06, + "loss": 0.1504, + "step": 24518 + }, + { + "epoch": 1.345773874862788, + "grad_norm": 1.3144782781600952, + "learning_rate": 4.013296278057196e-06, + "loss": 0.1853, + "step": 24520 + }, + { + "epoch": 1.3458836443468716, + "grad_norm": 0.9666076898574829, + "learning_rate": 4.010451456047903e-06, + "loss": 0.1441, + "step": 24522 + }, + { + "epoch": 1.345993413830955, + "grad_norm": 1.1237962245941162, + "learning_rate": 4.007607554743229e-06, + "loss": 0.1922, + "step": 24524 + }, + { + "epoch": 1.3461031833150385, + "grad_norm": 1.2775568962097168, + "learning_rate": 4.004764574267927e-06, + "loss": 0.2017, + "step": 24526 + }, + { + "epoch": 1.3462129527991218, + "grad_norm": 1.738600492477417, + "learning_rate": 4.0019225147467035e-06, + "loss": 0.1403, + "step": 24528 + }, + { + "epoch": 1.3463227222832053, + "grad_norm": 1.566342830657959, + "learning_rate": 3.99908137630422e-06, + "loss": 0.2588, + "step": 24530 + }, + { + "epoch": 1.3464324917672887, + "grad_norm": 1.047993779182434, + "learning_rate": 3.996241159065109e-06, + "loss": 0.1629, + "step": 24532 + }, + { + "epoch": 1.346542261251372, + "grad_norm": 1.490138053894043, + "learning_rate": 3.99340186315395e-06, + "loss": 0.2246, + "step": 24534 + }, + { + "epoch": 1.3466520307354555, + "grad_norm": 2.330181837081909, + "learning_rate": 3.99056348869529e-06, + "loss": 0.1999, + "step": 24536 + }, + { + "epoch": 1.346761800219539, + "grad_norm": 0.9912280440330505, + "learning_rate": 3.987726035813638e-06, + "loss": 0.1491, + "step": 24538 + }, + { + "epoch": 1.3468715697036224, + "grad_norm": 0.7853619456291199, + "learning_rate": 3.984889504633458e-06, + "loss": 0.2037, + "step": 24540 + }, + { + "epoch": 1.346981339187706, + "grad_norm": 1.1715177297592163, + "learning_rate": 3.9820538952791724e-06, + "loss": 0.1473, + "step": 24542 + }, + { + "epoch": 1.3470911086717892, + "grad_norm": 0.957222580909729, + "learning_rate": 3.979219207875165e-06, + "loss": 0.1331, + "step": 24544 + }, + { + "epoch": 1.3472008781558726, + "grad_norm": 0.7865654826164246, + "learning_rate": 3.976385442545774e-06, + "loss": 0.1843, + "step": 24546 + }, + { + "epoch": 1.3473106476399561, + "grad_norm": 1.358111023902893, + "learning_rate": 3.973552599415306e-06, + "loss": 0.2048, + "step": 24548 + }, + { + "epoch": 1.3474204171240396, + "grad_norm": 1.6101115942001343, + "learning_rate": 3.970720678608034e-06, + "loss": 0.2091, + "step": 24550 + }, + { + "epoch": 1.347530186608123, + "grad_norm": 1.4030404090881348, + "learning_rate": 3.967889680248168e-06, + "loss": 0.1856, + "step": 24552 + }, + { + "epoch": 1.3476399560922063, + "grad_norm": 0.9016121625900269, + "learning_rate": 3.965059604459892e-06, + "loss": 0.1789, + "step": 24554 + }, + { + "epoch": 1.3477497255762898, + "grad_norm": 1.321167230606079, + "learning_rate": 3.962230451367349e-06, + "loss": 0.195, + "step": 24556 + }, + { + "epoch": 1.3478594950603733, + "grad_norm": 1.322283148765564, + "learning_rate": 3.959402221094635e-06, + "loss": 0.1581, + "step": 24558 + }, + { + "epoch": 1.3479692645444565, + "grad_norm": 0.978014349937439, + "learning_rate": 3.95657491376581e-06, + "loss": 0.1089, + "step": 24560 + }, + { + "epoch": 1.34807903402854, + "grad_norm": 1.4820494651794434, + "learning_rate": 3.953748529504903e-06, + "loss": 0.255, + "step": 24562 + }, + { + "epoch": 1.3481888035126235, + "grad_norm": 0.9225842356681824, + "learning_rate": 3.950923068435883e-06, + "loss": 0.2096, + "step": 24564 + }, + { + "epoch": 1.348298572996707, + "grad_norm": 2.0144572257995605, + "learning_rate": 3.948098530682695e-06, + "loss": 0.1309, + "step": 24566 + }, + { + "epoch": 1.3484083424807904, + "grad_norm": 1.5968713760375977, + "learning_rate": 3.945274916369232e-06, + "loss": 0.1737, + "step": 24568 + }, + { + "epoch": 1.3485181119648737, + "grad_norm": 0.8127391934394836, + "learning_rate": 3.942452225619356e-06, + "loss": 0.2823, + "step": 24570 + }, + { + "epoch": 1.3486278814489572, + "grad_norm": 1.3149160146713257, + "learning_rate": 3.9396304585568756e-06, + "loss": 0.2946, + "step": 24572 + }, + { + "epoch": 1.3487376509330407, + "grad_norm": 0.5780813097953796, + "learning_rate": 3.936809615305578e-06, + "loss": 0.1146, + "step": 24574 + }, + { + "epoch": 1.348847420417124, + "grad_norm": 0.9029669761657715, + "learning_rate": 3.9339896959891985e-06, + "loss": 0.1374, + "step": 24576 + }, + { + "epoch": 1.3489571899012074, + "grad_norm": 1.2991396188735962, + "learning_rate": 3.931170700731421e-06, + "loss": 0.2167, + "step": 24578 + }, + { + "epoch": 1.3490669593852909, + "grad_norm": 1.0424085855484009, + "learning_rate": 3.928352629655912e-06, + "loss": 0.1665, + "step": 24580 + }, + { + "epoch": 1.3491767288693743, + "grad_norm": 1.7601234912872314, + "learning_rate": 3.925535482886286e-06, + "loss": 0.1593, + "step": 24582 + }, + { + "epoch": 1.3492864983534578, + "grad_norm": 0.7850868701934814, + "learning_rate": 3.922719260546106e-06, + "loss": 0.1217, + "step": 24584 + }, + { + "epoch": 1.349396267837541, + "grad_norm": 0.6844419240951538, + "learning_rate": 3.919903962758917e-06, + "loss": 0.166, + "step": 24586 + }, + { + "epoch": 1.3495060373216246, + "grad_norm": 1.0403687953948975, + "learning_rate": 3.917089589648209e-06, + "loss": 0.1365, + "step": 24588 + }, + { + "epoch": 1.349615806805708, + "grad_norm": 1.154565691947937, + "learning_rate": 3.914276141337434e-06, + "loss": 0.1818, + "step": 24590 + }, + { + "epoch": 1.3497255762897915, + "grad_norm": 1.7322036027908325, + "learning_rate": 3.911463617950001e-06, + "loss": 0.1811, + "step": 24592 + }, + { + "epoch": 1.349835345773875, + "grad_norm": 1.661406397819519, + "learning_rate": 3.908652019609279e-06, + "loss": 0.3583, + "step": 24594 + }, + { + "epoch": 1.3499451152579582, + "grad_norm": 1.170336365699768, + "learning_rate": 3.905841346438602e-06, + "loss": 0.301, + "step": 24596 + }, + { + "epoch": 1.3500548847420417, + "grad_norm": 0.5991685390472412, + "learning_rate": 3.90303159856126e-06, + "loss": 0.0816, + "step": 24598 + }, + { + "epoch": 1.3501646542261252, + "grad_norm": 1.2938929796218872, + "learning_rate": 3.900222776100506e-06, + "loss": 0.1853, + "step": 24600 + }, + { + "epoch": 1.3502744237102084, + "grad_norm": 1.438370943069458, + "learning_rate": 3.897414879179543e-06, + "loss": 0.2151, + "step": 24602 + }, + { + "epoch": 1.350384193194292, + "grad_norm": 1.0710002183914185, + "learning_rate": 3.8946079079215404e-06, + "loss": 0.1765, + "step": 24604 + }, + { + "epoch": 1.3504939626783754, + "grad_norm": 2.338789701461792, + "learning_rate": 3.891801862449629e-06, + "loss": 0.2139, + "step": 24606 + }, + { + "epoch": 1.3506037321624589, + "grad_norm": 1.6704784631729126, + "learning_rate": 3.888996742886886e-06, + "loss": 0.1918, + "step": 24608 + }, + { + "epoch": 1.3507135016465424, + "grad_norm": 1.056626558303833, + "learning_rate": 3.8861925493563655e-06, + "loss": 0.203, + "step": 24610 + }, + { + "epoch": 1.3508232711306256, + "grad_norm": 1.1393766403198242, + "learning_rate": 3.883389281981075e-06, + "loss": 0.1473, + "step": 24612 + }, + { + "epoch": 1.350933040614709, + "grad_norm": 0.7816056609153748, + "learning_rate": 3.880586940883979e-06, + "loss": 0.1355, + "step": 24614 + }, + { + "epoch": 1.3510428100987926, + "grad_norm": 0.946938693523407, + "learning_rate": 3.877785526188002e-06, + "loss": 0.1175, + "step": 24616 + }, + { + "epoch": 1.3511525795828758, + "grad_norm": 2.0881781578063965, + "learning_rate": 3.8749850380160245e-06, + "loss": 0.1648, + "step": 24618 + }, + { + "epoch": 1.3512623490669595, + "grad_norm": 1.3690091371536255, + "learning_rate": 3.872185476490889e-06, + "loss": 0.2608, + "step": 24620 + }, + { + "epoch": 1.3513721185510428, + "grad_norm": 1.0426712036132812, + "learning_rate": 3.8693868417353955e-06, + "loss": 0.1426, + "step": 24622 + }, + { + "epoch": 1.3514818880351263, + "grad_norm": 1.6304348707199097, + "learning_rate": 3.866589133872317e-06, + "loss": 0.2589, + "step": 24624 + }, + { + "epoch": 1.3515916575192097, + "grad_norm": 1.135677695274353, + "learning_rate": 3.863792353024367e-06, + "loss": 0.2181, + "step": 24626 + }, + { + "epoch": 1.351701427003293, + "grad_norm": 1.4776116609573364, + "learning_rate": 3.8609964993142264e-06, + "loss": 0.2447, + "step": 24628 + }, + { + "epoch": 1.3518111964873765, + "grad_norm": 0.8991222381591797, + "learning_rate": 3.858201572864537e-06, + "loss": 0.1785, + "step": 24630 + }, + { + "epoch": 1.35192096597146, + "grad_norm": 1.6791359186172485, + "learning_rate": 3.85540757379789e-06, + "loss": 0.1631, + "step": 24632 + }, + { + "epoch": 1.3520307354555434, + "grad_norm": 1.1187779903411865, + "learning_rate": 3.852614502236856e-06, + "loss": 0.1654, + "step": 24634 + }, + { + "epoch": 1.352140504939627, + "grad_norm": 1.224444031715393, + "learning_rate": 3.849822358303948e-06, + "loss": 0.1978, + "step": 24636 + }, + { + "epoch": 1.3522502744237102, + "grad_norm": 0.9958744645118713, + "learning_rate": 3.8470311421216435e-06, + "loss": 0.2185, + "step": 24638 + }, + { + "epoch": 1.3523600439077936, + "grad_norm": 1.0156440734863281, + "learning_rate": 3.84424085381237e-06, + "loss": 0.2515, + "step": 24640 + }, + { + "epoch": 1.352469813391877, + "grad_norm": 1.1217641830444336, + "learning_rate": 3.841451493498538e-06, + "loss": 0.1504, + "step": 24642 + }, + { + "epoch": 1.3525795828759604, + "grad_norm": 0.9768620729446411, + "learning_rate": 3.838663061302497e-06, + "loss": 0.1807, + "step": 24644 + }, + { + "epoch": 1.3526893523600438, + "grad_norm": 1.51337730884552, + "learning_rate": 3.835875557346552e-06, + "loss": 0.2107, + "step": 24646 + }, + { + "epoch": 1.3527991218441273, + "grad_norm": 0.8037508130073547, + "learning_rate": 3.833088981752994e-06, + "loss": 0.2182, + "step": 24648 + }, + { + "epoch": 1.3529088913282108, + "grad_norm": 0.8661147356033325, + "learning_rate": 3.830303334644045e-06, + "loss": 0.1795, + "step": 24650 + }, + { + "epoch": 1.3530186608122943, + "grad_norm": 1.1726608276367188, + "learning_rate": 3.827518616141895e-06, + "loss": 0.128, + "step": 24652 + }, + { + "epoch": 1.3531284302963775, + "grad_norm": 1.3276362419128418, + "learning_rate": 3.824734826368703e-06, + "loss": 0.1873, + "step": 24654 + }, + { + "epoch": 1.353238199780461, + "grad_norm": 1.246276617050171, + "learning_rate": 3.821951965446577e-06, + "loss": 0.2138, + "step": 24656 + }, + { + "epoch": 1.3533479692645445, + "grad_norm": 1.2674943208694458, + "learning_rate": 3.819170033497577e-06, + "loss": 0.2982, + "step": 24658 + }, + { + "epoch": 1.353457738748628, + "grad_norm": 1.1962229013442993, + "learning_rate": 3.8163890306437465e-06, + "loss": 0.1891, + "step": 24660 + }, + { + "epoch": 1.3535675082327114, + "grad_norm": 0.885600745677948, + "learning_rate": 3.8136089570070697e-06, + "loss": 0.1858, + "step": 24662 + }, + { + "epoch": 1.3536772777167947, + "grad_norm": 1.445882797241211, + "learning_rate": 3.810829812709493e-06, + "loss": 0.1836, + "step": 24664 + }, + { + "epoch": 1.3537870472008782, + "grad_norm": 1.3260852098464966, + "learning_rate": 3.8080515978729247e-06, + "loss": 0.1843, + "step": 24666 + }, + { + "epoch": 1.3538968166849616, + "grad_norm": 1.9155681133270264, + "learning_rate": 3.8052743126192224e-06, + "loss": 0.2919, + "step": 24668 + }, + { + "epoch": 1.354006586169045, + "grad_norm": 1.5002565383911133, + "learning_rate": 3.802497957070225e-06, + "loss": 0.2574, + "step": 24670 + }, + { + "epoch": 1.3541163556531284, + "grad_norm": 1.0613090991973877, + "learning_rate": 3.799722531347702e-06, + "loss": 0.1531, + "step": 24672 + }, + { + "epoch": 1.3542261251372119, + "grad_norm": 1.0260789394378662, + "learning_rate": 3.7969480355734137e-06, + "loss": 0.169, + "step": 24674 + }, + { + "epoch": 1.3543358946212953, + "grad_norm": 1.1829969882965088, + "learning_rate": 3.794174469869058e-06, + "loss": 0.1644, + "step": 24676 + }, + { + "epoch": 1.3544456641053788, + "grad_norm": 1.1199666261672974, + "learning_rate": 3.7914018343562895e-06, + "loss": 0.188, + "step": 24678 + }, + { + "epoch": 1.354555433589462, + "grad_norm": 1.0353151559829712, + "learning_rate": 3.7886301291567366e-06, + "loss": 0.1851, + "step": 24680 + }, + { + "epoch": 1.3546652030735455, + "grad_norm": 0.9638563394546509, + "learning_rate": 3.785859354391974e-06, + "loss": 0.3011, + "step": 24682 + }, + { + "epoch": 1.354774972557629, + "grad_norm": 1.256449580192566, + "learning_rate": 3.7830895101835488e-06, + "loss": 0.1252, + "step": 24684 + }, + { + "epoch": 1.3548847420417123, + "grad_norm": 0.8521527051925659, + "learning_rate": 3.780320596652956e-06, + "loss": 0.1255, + "step": 24686 + }, + { + "epoch": 1.3549945115257958, + "grad_norm": 1.5744932889938354, + "learning_rate": 3.777552613921656e-06, + "loss": 0.176, + "step": 24688 + }, + { + "epoch": 1.3551042810098792, + "grad_norm": 1.1200425624847412, + "learning_rate": 3.7747855621110636e-06, + "loss": 0.1774, + "step": 24690 + }, + { + "epoch": 1.3552140504939627, + "grad_norm": 1.4111144542694092, + "learning_rate": 3.772019441342556e-06, + "loss": 0.2116, + "step": 24692 + }, + { + "epoch": 1.3553238199780462, + "grad_norm": 1.8854197263717651, + "learning_rate": 3.7692542517374614e-06, + "loss": 0.2425, + "step": 24694 + }, + { + "epoch": 1.3554335894621294, + "grad_norm": 1.3256973028182983, + "learning_rate": 3.766489993417088e-06, + "loss": 0.2388, + "step": 24696 + }, + { + "epoch": 1.355543358946213, + "grad_norm": 1.1390867233276367, + "learning_rate": 3.7637266665026844e-06, + "loss": 0.1179, + "step": 24698 + }, + { + "epoch": 1.3556531284302964, + "grad_norm": 0.9113049507141113, + "learning_rate": 3.760964271115458e-06, + "loss": 0.1396, + "step": 24700 + }, + { + "epoch": 1.3557628979143799, + "grad_norm": 1.4018163681030273, + "learning_rate": 3.75820280737659e-06, + "loss": 0.1293, + "step": 24702 + }, + { + "epoch": 1.3558726673984633, + "grad_norm": 1.652750015258789, + "learning_rate": 3.7554422754072088e-06, + "loss": 0.1278, + "step": 24704 + }, + { + "epoch": 1.3559824368825466, + "grad_norm": 1.5268131494522095, + "learning_rate": 3.752682675328406e-06, + "loss": 0.2723, + "step": 24706 + }, + { + "epoch": 1.35609220636663, + "grad_norm": 1.3739348649978638, + "learning_rate": 3.7499240072612214e-06, + "loss": 0.2199, + "step": 24708 + }, + { + "epoch": 1.3562019758507136, + "grad_norm": 1.1221805810928345, + "learning_rate": 3.7471662713266744e-06, + "loss": 0.199, + "step": 24710 + }, + { + "epoch": 1.3563117453347968, + "grad_norm": 1.0653008222579956, + "learning_rate": 3.744409467645732e-06, + "loss": 0.1818, + "step": 24712 + }, + { + "epoch": 1.3564215148188803, + "grad_norm": 1.0878005027770996, + "learning_rate": 3.74165359633932e-06, + "loss": 0.2035, + "step": 24714 + }, + { + "epoch": 1.3565312843029638, + "grad_norm": 1.1544712781906128, + "learning_rate": 3.7388986575283224e-06, + "loss": 0.2013, + "step": 24716 + }, + { + "epoch": 1.3566410537870472, + "grad_norm": 0.8732955455780029, + "learning_rate": 3.7361446513335815e-06, + "loss": 0.1242, + "step": 24718 + }, + { + "epoch": 1.3567508232711307, + "grad_norm": 1.2243345975875854, + "learning_rate": 3.7333915778759083e-06, + "loss": 0.162, + "step": 24720 + }, + { + "epoch": 1.356860592755214, + "grad_norm": 1.9573017358779907, + "learning_rate": 3.730639437276065e-06, + "loss": 0.216, + "step": 24722 + }, + { + "epoch": 1.3569703622392975, + "grad_norm": 1.5038691759109497, + "learning_rate": 3.727888229654769e-06, + "loss": 0.2881, + "step": 24724 + }, + { + "epoch": 1.357080131723381, + "grad_norm": 0.7356289029121399, + "learning_rate": 3.725137955132707e-06, + "loss": 0.1062, + "step": 24726 + }, + { + "epoch": 1.3571899012074642, + "grad_norm": 1.5064448118209839, + "learning_rate": 3.7223886138305186e-06, + "loss": 0.309, + "step": 24728 + }, + { + "epoch": 1.3572996706915479, + "grad_norm": 1.256463646888733, + "learning_rate": 3.719640205868796e-06, + "loss": 0.157, + "step": 24730 + }, + { + "epoch": 1.3574094401756311, + "grad_norm": 0.821398913860321, + "learning_rate": 3.7168927313681044e-06, + "loss": 0.1558, + "step": 24732 + }, + { + "epoch": 1.3575192096597146, + "grad_norm": 0.8830507397651672, + "learning_rate": 3.7141461904489666e-06, + "loss": 0.2496, + "step": 24734 + }, + { + "epoch": 1.357628979143798, + "grad_norm": 1.3686814308166504, + "learning_rate": 3.7114005832318527e-06, + "loss": 0.2193, + "step": 24736 + }, + { + "epoch": 1.3577387486278814, + "grad_norm": 1.5336196422576904, + "learning_rate": 3.7086559098372e-06, + "loss": 0.185, + "step": 24738 + }, + { + "epoch": 1.3578485181119648, + "grad_norm": 1.9781068563461304, + "learning_rate": 3.705912170385406e-06, + "loss": 0.2668, + "step": 24740 + }, + { + "epoch": 1.3579582875960483, + "grad_norm": 1.1135574579238892, + "learning_rate": 3.7031693649968195e-06, + "loss": 0.1848, + "step": 24742 + }, + { + "epoch": 1.3580680570801318, + "grad_norm": 0.8988064527511597, + "learning_rate": 3.70042749379175e-06, + "loss": 0.1731, + "step": 24744 + }, + { + "epoch": 1.3581778265642153, + "grad_norm": 1.204689621925354, + "learning_rate": 3.697686556890481e-06, + "loss": 0.1415, + "step": 24746 + }, + { + "epoch": 1.3582875960482985, + "grad_norm": 0.9599860906600952, + "learning_rate": 3.6949465544132365e-06, + "loss": 0.1229, + "step": 24748 + }, + { + "epoch": 1.358397365532382, + "grad_norm": 0.7475808262825012, + "learning_rate": 3.692207486480209e-06, + "loss": 0.1015, + "step": 24750 + }, + { + "epoch": 1.3585071350164655, + "grad_norm": 0.7562744617462158, + "learning_rate": 3.6894693532115445e-06, + "loss": 0.2335, + "step": 24752 + }, + { + "epoch": 1.3586169045005487, + "grad_norm": 2.1194007396698, + "learning_rate": 3.6867321547273486e-06, + "loss": 0.145, + "step": 24754 + }, + { + "epoch": 1.3587266739846322, + "grad_norm": 1.4801843166351318, + "learning_rate": 3.6839958911476957e-06, + "loss": 0.1941, + "step": 24756 + }, + { + "epoch": 1.3588364434687157, + "grad_norm": 1.4100135564804077, + "learning_rate": 3.681260562592609e-06, + "loss": 0.1931, + "step": 24758 + }, + { + "epoch": 1.3589462129527992, + "grad_norm": 1.367929458618164, + "learning_rate": 3.678526169182067e-06, + "loss": 0.1951, + "step": 24760 + }, + { + "epoch": 1.3590559824368826, + "grad_norm": 1.0284589529037476, + "learning_rate": 3.6757927110360247e-06, + "loss": 0.2208, + "step": 24762 + }, + { + "epoch": 1.359165751920966, + "grad_norm": 1.6597124338150024, + "learning_rate": 3.67306018827438e-06, + "loss": 0.1536, + "step": 24764 + }, + { + "epoch": 1.3592755214050494, + "grad_norm": 0.935300350189209, + "learning_rate": 3.670328601016995e-06, + "loss": 0.1721, + "step": 24766 + }, + { + "epoch": 1.3593852908891328, + "grad_norm": 0.924692690372467, + "learning_rate": 3.667597949383683e-06, + "loss": 0.1494, + "step": 24768 + }, + { + "epoch": 1.3594950603732163, + "grad_norm": 1.4231899976730347, + "learning_rate": 3.664868233494234e-06, + "loss": 0.1833, + "step": 24770 + }, + { + "epoch": 1.3596048298572998, + "grad_norm": 1.453900933265686, + "learning_rate": 3.6621394534683854e-06, + "loss": 0.2087, + "step": 24772 + }, + { + "epoch": 1.359714599341383, + "grad_norm": 0.850770115852356, + "learning_rate": 3.6594116094258337e-06, + "loss": 0.0949, + "step": 24774 + }, + { + "epoch": 1.3598243688254665, + "grad_norm": 1.2213594913482666, + "learning_rate": 3.656684701486235e-06, + "loss": 0.2592, + "step": 24776 + }, + { + "epoch": 1.35993413830955, + "grad_norm": 0.6824683547019958, + "learning_rate": 3.6539587297692022e-06, + "loss": 0.0912, + "step": 24778 + }, + { + "epoch": 1.3600439077936333, + "grad_norm": 0.8130699396133423, + "learning_rate": 3.651233694394307e-06, + "loss": 0.1705, + "step": 24780 + }, + { + "epoch": 1.3601536772777167, + "grad_norm": 1.2398380041122437, + "learning_rate": 3.648509595481095e-06, + "loss": 0.1886, + "step": 24782 + }, + { + "epoch": 1.3602634467618002, + "grad_norm": 1.224895715713501, + "learning_rate": 3.645786433149051e-06, + "loss": 0.203, + "step": 24784 + }, + { + "epoch": 1.3603732162458837, + "grad_norm": 0.782403290271759, + "learning_rate": 3.643064207517624e-06, + "loss": 0.1906, + "step": 24786 + }, + { + "epoch": 1.3604829857299672, + "grad_norm": 1.0867652893066406, + "learning_rate": 3.64034291870623e-06, + "loss": 0.2027, + "step": 24788 + }, + { + "epoch": 1.3605927552140504, + "grad_norm": 1.3392882347106934, + "learning_rate": 3.6376225668342284e-06, + "loss": 0.2176, + "step": 24790 + }, + { + "epoch": 1.360702524698134, + "grad_norm": 0.8095707297325134, + "learning_rate": 3.6349031520209574e-06, + "loss": 0.1271, + "step": 24792 + }, + { + "epoch": 1.3608122941822174, + "grad_norm": 1.1264045238494873, + "learning_rate": 3.6321846743856967e-06, + "loss": 0.1943, + "step": 24794 + }, + { + "epoch": 1.3609220636663006, + "grad_norm": 1.0647480487823486, + "learning_rate": 3.629467134047701e-06, + "loss": 0.1653, + "step": 24796 + }, + { + "epoch": 1.3610318331503841, + "grad_norm": 1.0590792894363403, + "learning_rate": 3.626750531126169e-06, + "loss": 0.1413, + "step": 24798 + }, + { + "epoch": 1.3611416026344676, + "grad_norm": 1.101534128189087, + "learning_rate": 3.6240348657402667e-06, + "loss": 0.1541, + "step": 24800 + }, + { + "epoch": 1.361251372118551, + "grad_norm": 0.9089385867118835, + "learning_rate": 3.6213201380091128e-06, + "loss": 0.2418, + "step": 24802 + }, + { + "epoch": 1.3613611416026346, + "grad_norm": 1.1943855285644531, + "learning_rate": 3.618606348051784e-06, + "loss": 0.1381, + "step": 24804 + }, + { + "epoch": 1.3614709110867178, + "grad_norm": 1.076633334159851, + "learning_rate": 3.6158934959873353e-06, + "loss": 0.1588, + "step": 24806 + }, + { + "epoch": 1.3615806805708013, + "grad_norm": 1.1437445878982544, + "learning_rate": 3.6131815819347546e-06, + "loss": 0.2445, + "step": 24808 + }, + { + "epoch": 1.3616904500548848, + "grad_norm": 0.9933814406394958, + "learning_rate": 3.6104706060130027e-06, + "loss": 0.2019, + "step": 24810 + }, + { + "epoch": 1.3618002195389682, + "grad_norm": 1.490579605102539, + "learning_rate": 3.607760568340998e-06, + "loss": 0.1875, + "step": 24812 + }, + { + "epoch": 1.3619099890230517, + "grad_norm": 1.440854787826538, + "learning_rate": 3.6050514690376125e-06, + "loss": 0.1832, + "step": 24814 + }, + { + "epoch": 1.362019758507135, + "grad_norm": 1.0082072019577026, + "learning_rate": 3.6023433082216755e-06, + "loss": 0.1672, + "step": 24816 + }, + { + "epoch": 1.3621295279912184, + "grad_norm": 1.2921476364135742, + "learning_rate": 3.5996360860119928e-06, + "loss": 0.153, + "step": 24818 + }, + { + "epoch": 1.362239297475302, + "grad_norm": 0.8867775797843933, + "learning_rate": 3.5969298025273074e-06, + "loss": 0.1602, + "step": 24820 + }, + { + "epoch": 1.3623490669593852, + "grad_norm": 1.0324116945266724, + "learning_rate": 3.594224457886336e-06, + "loss": 0.1743, + "step": 24822 + }, + { + "epoch": 1.3624588364434687, + "grad_norm": 1.0582752227783203, + "learning_rate": 3.5915200522077473e-06, + "loss": 0.2299, + "step": 24824 + }, + { + "epoch": 1.3625686059275521, + "grad_norm": 1.509742259979248, + "learning_rate": 3.5888165856101693e-06, + "loss": 0.2062, + "step": 24826 + }, + { + "epoch": 1.3626783754116356, + "grad_norm": 1.110603928565979, + "learning_rate": 3.5861140582121866e-06, + "loss": 0.1699, + "step": 24828 + }, + { + "epoch": 1.362788144895719, + "grad_norm": 0.934787392616272, + "learning_rate": 3.5834124701323413e-06, + "loss": 0.2341, + "step": 24830 + }, + { + "epoch": 1.3628979143798023, + "grad_norm": 1.2431366443634033, + "learning_rate": 3.5807118214891495e-06, + "loss": 0.1824, + "step": 24832 + }, + { + "epoch": 1.3630076838638858, + "grad_norm": 1.3908947706222534, + "learning_rate": 3.578012112401069e-06, + "loss": 0.3122, + "step": 24834 + }, + { + "epoch": 1.3631174533479693, + "grad_norm": 1.3095437288284302, + "learning_rate": 3.575313342986522e-06, + "loss": 0.2732, + "step": 24836 + }, + { + "epoch": 1.3632272228320526, + "grad_norm": 1.1566542387008667, + "learning_rate": 3.572615513363892e-06, + "loss": 0.1361, + "step": 24838 + }, + { + "epoch": 1.363336992316136, + "grad_norm": 0.9328665733337402, + "learning_rate": 3.569918623651514e-06, + "loss": 0.1784, + "step": 24840 + }, + { + "epoch": 1.3634467618002195, + "grad_norm": 1.8601171970367432, + "learning_rate": 3.567222673967688e-06, + "loss": 0.1703, + "step": 24842 + }, + { + "epoch": 1.363556531284303, + "grad_norm": 0.8215374946594238, + "learning_rate": 3.564527664430678e-06, + "loss": 0.1106, + "step": 24844 + }, + { + "epoch": 1.3636663007683865, + "grad_norm": 1.0907158851623535, + "learning_rate": 3.561833595158698e-06, + "loss": 0.1714, + "step": 24846 + }, + { + "epoch": 1.3637760702524697, + "grad_norm": 1.1481480598449707, + "learning_rate": 3.5591404662699197e-06, + "loss": 0.1449, + "step": 24848 + }, + { + "epoch": 1.3638858397365532, + "grad_norm": 0.9849326610565186, + "learning_rate": 3.5564482778824707e-06, + "loss": 0.1391, + "step": 24850 + }, + { + "epoch": 1.3639956092206367, + "grad_norm": 1.1124745607376099, + "learning_rate": 3.5537570301144604e-06, + "loss": 0.1898, + "step": 24852 + }, + { + "epoch": 1.3641053787047202, + "grad_norm": 0.9710575938224792, + "learning_rate": 3.5510667230839235e-06, + "loss": 0.21, + "step": 24854 + }, + { + "epoch": 1.3642151481888036, + "grad_norm": 1.1818861961364746, + "learning_rate": 3.5483773569088856e-06, + "loss": 0.2595, + "step": 24856 + }, + { + "epoch": 1.3643249176728869, + "grad_norm": 1.1999197006225586, + "learning_rate": 3.5456889317073055e-06, + "loss": 0.1704, + "step": 24858 + }, + { + "epoch": 1.3644346871569704, + "grad_norm": 1.0469551086425781, + "learning_rate": 3.5430014475971134e-06, + "loss": 0.2023, + "step": 24860 + }, + { + "epoch": 1.3645444566410538, + "grad_norm": 1.0053905248641968, + "learning_rate": 3.540314904696196e-06, + "loss": 0.1723, + "step": 24862 + }, + { + "epoch": 1.364654226125137, + "grad_norm": 1.052781105041504, + "learning_rate": 3.5376293031223945e-06, + "loss": 0.2038, + "step": 24864 + }, + { + "epoch": 1.3647639956092206, + "grad_norm": 2.4087626934051514, + "learning_rate": 3.5349446429935122e-06, + "loss": 0.1322, + "step": 24866 + }, + { + "epoch": 1.364873765093304, + "grad_norm": 1.3553179502487183, + "learning_rate": 3.532260924427319e-06, + "loss": 0.175, + "step": 24868 + }, + { + "epoch": 1.3649835345773875, + "grad_norm": 1.1650621891021729, + "learning_rate": 3.529578147541532e-06, + "loss": 0.2366, + "step": 24870 + }, + { + "epoch": 1.365093304061471, + "grad_norm": 1.2392632961273193, + "learning_rate": 3.526896312453831e-06, + "loss": 0.2301, + "step": 24872 + }, + { + "epoch": 1.3652030735455543, + "grad_norm": 1.6048245429992676, + "learning_rate": 3.524215419281854e-06, + "loss": 0.272, + "step": 24874 + }, + { + "epoch": 1.3653128430296377, + "grad_norm": 0.9441759586334229, + "learning_rate": 3.5215354681431973e-06, + "loss": 0.1652, + "step": 24876 + }, + { + "epoch": 1.3654226125137212, + "grad_norm": 1.1539028882980347, + "learning_rate": 3.518856459155409e-06, + "loss": 0.1685, + "step": 24878 + }, + { + "epoch": 1.3655323819978047, + "grad_norm": 2.488471746444702, + "learning_rate": 3.5161783924360177e-06, + "loss": 0.2443, + "step": 24880 + }, + { + "epoch": 1.3656421514818882, + "grad_norm": 1.4575706720352173, + "learning_rate": 3.5135012681024925e-06, + "loss": 0.2605, + "step": 24882 + }, + { + "epoch": 1.3657519209659714, + "grad_norm": 1.0022945404052734, + "learning_rate": 3.5108250862722653e-06, + "loss": 0.1612, + "step": 24884 + }, + { + "epoch": 1.365861690450055, + "grad_norm": 0.9488061666488647, + "learning_rate": 3.508149847062725e-06, + "loss": 0.2322, + "step": 24886 + }, + { + "epoch": 1.3659714599341384, + "grad_norm": 1.1243531703948975, + "learning_rate": 3.5054755505912194e-06, + "loss": 0.2028, + "step": 24888 + }, + { + "epoch": 1.3660812294182216, + "grad_norm": 1.3315694332122803, + "learning_rate": 3.502802196975058e-06, + "loss": 0.2027, + "step": 24890 + }, + { + "epoch": 1.366190998902305, + "grad_norm": 1.5191864967346191, + "learning_rate": 3.500129786331502e-06, + "loss": 0.1896, + "step": 24892 + }, + { + "epoch": 1.3663007683863886, + "grad_norm": 1.1617282629013062, + "learning_rate": 3.4974583187777852e-06, + "loss": 0.1739, + "step": 24894 + }, + { + "epoch": 1.366410537870472, + "grad_norm": 0.8901249170303345, + "learning_rate": 3.494787794431087e-06, + "loss": 0.1662, + "step": 24896 + }, + { + "epoch": 1.3665203073545555, + "grad_norm": 1.729038953781128, + "learning_rate": 3.492118213408552e-06, + "loss": 0.2997, + "step": 24898 + }, + { + "epoch": 1.3666300768386388, + "grad_norm": 0.9744307398796082, + "learning_rate": 3.4894495758272775e-06, + "loss": 0.1724, + "step": 24900 + }, + { + "epoch": 1.3667398463227223, + "grad_norm": 0.9630973935127258, + "learning_rate": 3.4867818818043213e-06, + "loss": 0.2028, + "step": 24902 + }, + { + "epoch": 1.3668496158068058, + "grad_norm": 1.280426263809204, + "learning_rate": 3.4841151314567085e-06, + "loss": 0.1896, + "step": 24904 + }, + { + "epoch": 1.366959385290889, + "grad_norm": 1.0706660747528076, + "learning_rate": 3.4814493249014116e-06, + "loss": 0.1713, + "step": 24906 + }, + { + "epoch": 1.3670691547749725, + "grad_norm": 2.067239999771118, + "learning_rate": 3.4787844622553663e-06, + "loss": 0.3314, + "step": 24908 + }, + { + "epoch": 1.367178924259056, + "grad_norm": 0.8204110264778137, + "learning_rate": 3.4761205436354692e-06, + "loss": 0.1202, + "step": 24910 + }, + { + "epoch": 1.3672886937431394, + "grad_norm": 0.9458566904067993, + "learning_rate": 3.473457569158564e-06, + "loss": 0.1537, + "step": 24912 + }, + { + "epoch": 1.367398463227223, + "grad_norm": 0.9336760640144348, + "learning_rate": 3.4707955389414737e-06, + "loss": 0.1494, + "step": 24914 + }, + { + "epoch": 1.3675082327113062, + "grad_norm": 0.9934253692626953, + "learning_rate": 3.4681344531009572e-06, + "loss": 0.18, + "step": 24916 + }, + { + "epoch": 1.3676180021953896, + "grad_norm": 1.224623203277588, + "learning_rate": 3.4654743117537524e-06, + "loss": 0.1589, + "step": 24918 + }, + { + "epoch": 1.3677277716794731, + "grad_norm": 0.9147771000862122, + "learning_rate": 3.4628151150165434e-06, + "loss": 0.1287, + "step": 24920 + }, + { + "epoch": 1.3678375411635566, + "grad_norm": 0.9134629964828491, + "learning_rate": 3.4601568630059727e-06, + "loss": 0.1422, + "step": 24922 + }, + { + "epoch": 1.36794731064764, + "grad_norm": 0.913806676864624, + "learning_rate": 3.4574995558386474e-06, + "loss": 0.1125, + "step": 24924 + }, + { + "epoch": 1.3680570801317233, + "grad_norm": 1.2040327787399292, + "learning_rate": 3.454843193631127e-06, + "loss": 0.1393, + "step": 24926 + }, + { + "epoch": 1.3681668496158068, + "grad_norm": 1.4803904294967651, + "learning_rate": 3.4521877764999293e-06, + "loss": 0.1537, + "step": 24928 + }, + { + "epoch": 1.3682766190998903, + "grad_norm": 0.8445658087730408, + "learning_rate": 3.4495333045615446e-06, + "loss": 0.1015, + "step": 24930 + }, + { + "epoch": 1.3683863885839735, + "grad_norm": 0.7808338403701782, + "learning_rate": 3.4468797779324048e-06, + "loss": 0.1179, + "step": 24932 + }, + { + "epoch": 1.368496158068057, + "grad_norm": 1.1321508884429932, + "learning_rate": 3.4442271967289082e-06, + "loss": 0.1652, + "step": 24934 + }, + { + "epoch": 1.3686059275521405, + "grad_norm": 0.9103202819824219, + "learning_rate": 3.441575561067406e-06, + "loss": 0.1683, + "step": 24936 + }, + { + "epoch": 1.368715697036224, + "grad_norm": 1.8758172988891602, + "learning_rate": 3.4389248710642114e-06, + "loss": 0.2002, + "step": 24938 + }, + { + "epoch": 1.3688254665203075, + "grad_norm": 1.0404164791107178, + "learning_rate": 3.436275126835603e-06, + "loss": 0.1728, + "step": 24940 + }, + { + "epoch": 1.3689352360043907, + "grad_norm": 1.0433833599090576, + "learning_rate": 3.433626328497805e-06, + "loss": 0.2387, + "step": 24942 + }, + { + "epoch": 1.3690450054884742, + "grad_norm": 1.0034652948379517, + "learning_rate": 3.4309784761670155e-06, + "loss": 0.1523, + "step": 24944 + }, + { + "epoch": 1.3691547749725577, + "grad_norm": 1.0842713117599487, + "learning_rate": 3.4283315699593753e-06, + "loss": 0.0974, + "step": 24946 + }, + { + "epoch": 1.369264544456641, + "grad_norm": 0.9782703518867493, + "learning_rate": 3.4256856099909945e-06, + "loss": 0.2046, + "step": 24948 + }, + { + "epoch": 1.3693743139407244, + "grad_norm": 1.149101734161377, + "learning_rate": 3.4230405963779355e-06, + "loss": 0.1518, + "step": 24950 + }, + { + "epoch": 1.3694840834248079, + "grad_norm": 1.3779314756393433, + "learning_rate": 3.4203965292362166e-06, + "loss": 0.1341, + "step": 24952 + }, + { + "epoch": 1.3695938529088914, + "grad_norm": 1.1084446907043457, + "learning_rate": 3.4177534086818286e-06, + "loss": 0.1824, + "step": 24954 + }, + { + "epoch": 1.3697036223929748, + "grad_norm": 1.5981369018554688, + "learning_rate": 3.415111234830709e-06, + "loss": 0.2361, + "step": 24956 + }, + { + "epoch": 1.369813391877058, + "grad_norm": 1.191451907157898, + "learning_rate": 3.4124700077987572e-06, + "loss": 0.1973, + "step": 24958 + }, + { + "epoch": 1.3699231613611416, + "grad_norm": 1.3477113246917725, + "learning_rate": 3.4098297277018273e-06, + "loss": 0.2332, + "step": 24960 + }, + { + "epoch": 1.370032930845225, + "grad_norm": 1.3228932619094849, + "learning_rate": 3.407190394655735e-06, + "loss": 0.2547, + "step": 24962 + }, + { + "epoch": 1.3701427003293085, + "grad_norm": 1.3604941368103027, + "learning_rate": 3.404552008776252e-06, + "loss": 0.2412, + "step": 24964 + }, + { + "epoch": 1.370252469813392, + "grad_norm": 1.1967891454696655, + "learning_rate": 3.4019145701791184e-06, + "loss": 0.1422, + "step": 24966 + }, + { + "epoch": 1.3703622392974752, + "grad_norm": 1.2761555910110474, + "learning_rate": 3.3992780789800227e-06, + "loss": 0.2703, + "step": 24968 + }, + { + "epoch": 1.3704720087815587, + "grad_norm": 1.1361639499664307, + "learning_rate": 3.396642535294614e-06, + "loss": 0.1997, + "step": 24970 + }, + { + "epoch": 1.3705817782656422, + "grad_norm": 1.1251635551452637, + "learning_rate": 3.3940079392384916e-06, + "loss": 0.1841, + "step": 24972 + }, + { + "epoch": 1.3706915477497255, + "grad_norm": 1.1045714616775513, + "learning_rate": 3.391374290927235e-06, + "loss": 0.2742, + "step": 24974 + }, + { + "epoch": 1.370801317233809, + "grad_norm": 0.9641996622085571, + "learning_rate": 3.388741590476366e-06, + "loss": 0.0998, + "step": 24976 + }, + { + "epoch": 1.3709110867178924, + "grad_norm": 0.9216663241386414, + "learning_rate": 3.386109838001358e-06, + "loss": 0.1886, + "step": 24978 + }, + { + "epoch": 1.371020856201976, + "grad_norm": 1.409403920173645, + "learning_rate": 3.383479033617665e-06, + "loss": 0.2736, + "step": 24980 + }, + { + "epoch": 1.3711306256860594, + "grad_norm": 0.8967812657356262, + "learning_rate": 3.3808491774406815e-06, + "loss": 0.1696, + "step": 24982 + }, + { + "epoch": 1.3712403951701426, + "grad_norm": 1.096610188484192, + "learning_rate": 3.3782202695857663e-06, + "loss": 0.0946, + "step": 24984 + }, + { + "epoch": 1.371350164654226, + "grad_norm": 0.9044569730758667, + "learning_rate": 3.3755923101682353e-06, + "loss": 0.2393, + "step": 24986 + }, + { + "epoch": 1.3714599341383096, + "grad_norm": 1.4497736692428589, + "learning_rate": 3.37296529930336e-06, + "loss": 0.212, + "step": 24988 + }, + { + "epoch": 1.371569703622393, + "grad_norm": 0.9997216463088989, + "learning_rate": 3.3703392371063845e-06, + "loss": 0.26, + "step": 24990 + }, + { + "epoch": 1.3716794731064765, + "grad_norm": 0.7615695595741272, + "learning_rate": 3.367714123692495e-06, + "loss": 0.1086, + "step": 24992 + }, + { + "epoch": 1.3717892425905598, + "grad_norm": 0.7328476905822754, + "learning_rate": 3.3650899591768396e-06, + "loss": 0.0984, + "step": 24994 + }, + { + "epoch": 1.3718990120746433, + "grad_norm": 1.4000142812728882, + "learning_rate": 3.3624667436745306e-06, + "loss": 0.1846, + "step": 24996 + }, + { + "epoch": 1.3720087815587267, + "grad_norm": 1.1007521152496338, + "learning_rate": 3.359844477300633e-06, + "loss": 0.1349, + "step": 24998 + }, + { + "epoch": 1.37211855104281, + "grad_norm": 1.3064454793930054, + "learning_rate": 3.357223160170167e-06, + "loss": 0.1178, + "step": 25000 + }, + { + "epoch": 1.3722283205268935, + "grad_norm": 1.1810780763626099, + "learning_rate": 3.3546027923981203e-06, + "loss": 0.1314, + "step": 25002 + }, + { + "epoch": 1.372338090010977, + "grad_norm": 1.547308087348938, + "learning_rate": 3.351983374099446e-06, + "loss": 0.2346, + "step": 25004 + }, + { + "epoch": 1.3724478594950604, + "grad_norm": 0.7554607391357422, + "learning_rate": 3.3493649053890326e-06, + "loss": 0.118, + "step": 25006 + }, + { + "epoch": 1.372557628979144, + "grad_norm": 1.0181615352630615, + "learning_rate": 3.346747386381743e-06, + "loss": 0.1709, + "step": 25008 + }, + { + "epoch": 1.3726673984632272, + "grad_norm": 1.0318996906280518, + "learning_rate": 3.3441308171923952e-06, + "loss": 0.1717, + "step": 25010 + }, + { + "epoch": 1.3727771679473106, + "grad_norm": 1.2734767198562622, + "learning_rate": 3.3415151979357627e-06, + "loss": 0.2292, + "step": 25012 + }, + { + "epoch": 1.3728869374313941, + "grad_norm": 1.8286205530166626, + "learning_rate": 3.338900528726571e-06, + "loss": 0.2435, + "step": 25014 + }, + { + "epoch": 1.3729967069154774, + "grad_norm": 1.1701308488845825, + "learning_rate": 3.3362868096795314e-06, + "loss": 0.3281, + "step": 25016 + }, + { + "epoch": 1.3731064763995608, + "grad_norm": 1.4125494956970215, + "learning_rate": 3.333674040909279e-06, + "loss": 0.2531, + "step": 25018 + }, + { + "epoch": 1.3732162458836443, + "grad_norm": 1.086449146270752, + "learning_rate": 3.3310622225304317e-06, + "loss": 0.1425, + "step": 25020 + }, + { + "epoch": 1.3733260153677278, + "grad_norm": 1.0601242780685425, + "learning_rate": 3.3284513546575495e-06, + "loss": 0.1249, + "step": 25022 + }, + { + "epoch": 1.3734357848518113, + "grad_norm": 0.8183299899101257, + "learning_rate": 3.325841437405158e-06, + "loss": 0.2051, + "step": 25024 + }, + { + "epoch": 1.3735455543358945, + "grad_norm": 0.8572509288787842, + "learning_rate": 3.3232324708877495e-06, + "loss": 0.1502, + "step": 25026 + }, + { + "epoch": 1.373655323819978, + "grad_norm": 1.4010385274887085, + "learning_rate": 3.320624455219759e-06, + "loss": 0.2857, + "step": 25028 + }, + { + "epoch": 1.3737650933040615, + "grad_norm": 2.1100685596466064, + "learning_rate": 3.3180173905155902e-06, + "loss": 0.1789, + "step": 25030 + }, + { + "epoch": 1.373874862788145, + "grad_norm": 1.3672178983688354, + "learning_rate": 3.3154112768895962e-06, + "loss": 0.2057, + "step": 25032 + }, + { + "epoch": 1.3739846322722284, + "grad_norm": 1.0777212381362915, + "learning_rate": 3.312806114456099e-06, + "loss": 0.1928, + "step": 25034 + }, + { + "epoch": 1.3740944017563117, + "grad_norm": 1.0198791027069092, + "learning_rate": 3.3102019033293767e-06, + "loss": 0.2056, + "step": 25036 + }, + { + "epoch": 1.3742041712403952, + "grad_norm": 0.8770959973335266, + "learning_rate": 3.3075986436236493e-06, + "loss": 0.1346, + "step": 25038 + }, + { + "epoch": 1.3743139407244787, + "grad_norm": 1.6057510375976562, + "learning_rate": 3.3049963354531254e-06, + "loss": 0.1883, + "step": 25040 + }, + { + "epoch": 1.374423710208562, + "grad_norm": 2.468398332595825, + "learning_rate": 3.30239497893195e-06, + "loss": 0.1361, + "step": 25042 + }, + { + "epoch": 1.3745334796926454, + "grad_norm": 1.2312275171279907, + "learning_rate": 3.2997945741742255e-06, + "loss": 0.1551, + "step": 25044 + }, + { + "epoch": 1.3746432491767289, + "grad_norm": 1.2964121103286743, + "learning_rate": 3.297195121294022e-06, + "loss": 0.1836, + "step": 25046 + }, + { + "epoch": 1.3747530186608123, + "grad_norm": 0.99634850025177, + "learning_rate": 3.2945966204053656e-06, + "loss": 0.2062, + "step": 25048 + }, + { + "epoch": 1.3748627881448958, + "grad_norm": 1.154209852218628, + "learning_rate": 3.2919990716222314e-06, + "loss": 0.2031, + "step": 25050 + }, + { + "epoch": 1.374972557628979, + "grad_norm": 1.888702392578125, + "learning_rate": 3.2894024750585727e-06, + "loss": 0.184, + "step": 25052 + }, + { + "epoch": 1.3750823271130626, + "grad_norm": 1.3310695886611938, + "learning_rate": 3.2868068308282847e-06, + "loss": 0.207, + "step": 25054 + }, + { + "epoch": 1.375192096597146, + "grad_norm": 1.0364031791687012, + "learning_rate": 3.284212139045223e-06, + "loss": 0.1256, + "step": 25056 + }, + { + "epoch": 1.3753018660812293, + "grad_norm": 1.2376701831817627, + "learning_rate": 3.2816183998232025e-06, + "loss": 0.1436, + "step": 25058 + }, + { + "epoch": 1.3754116355653128, + "grad_norm": 1.3136298656463623, + "learning_rate": 3.2790256132759963e-06, + "loss": 0.1446, + "step": 25060 + }, + { + "epoch": 1.3755214050493962, + "grad_norm": 1.4377809762954712, + "learning_rate": 3.2764337795173435e-06, + "loss": 0.1646, + "step": 25062 + }, + { + "epoch": 1.3756311745334797, + "grad_norm": 0.925686776638031, + "learning_rate": 3.2738428986609253e-06, + "loss": 0.2113, + "step": 25064 + }, + { + "epoch": 1.3757409440175632, + "grad_norm": 1.2347978353500366, + "learning_rate": 3.2712529708204016e-06, + "loss": 0.1512, + "step": 25066 + }, + { + "epoch": 1.3758507135016464, + "grad_norm": 1.072522521018982, + "learning_rate": 3.2686639961093723e-06, + "loss": 0.2357, + "step": 25068 + }, + { + "epoch": 1.37596048298573, + "grad_norm": 1.5607064962387085, + "learning_rate": 3.2660759746414054e-06, + "loss": 0.2848, + "step": 25070 + }, + { + "epoch": 1.3760702524698134, + "grad_norm": 1.247881531715393, + "learning_rate": 3.263488906530021e-06, + "loss": 0.2719, + "step": 25072 + }, + { + "epoch": 1.3761800219538969, + "grad_norm": 1.8928552865982056, + "learning_rate": 3.260902791888698e-06, + "loss": 0.2739, + "step": 25074 + }, + { + "epoch": 1.3762897914379804, + "grad_norm": 1.397435188293457, + "learning_rate": 3.2583176308308872e-06, + "loss": 0.234, + "step": 25076 + }, + { + "epoch": 1.3763995609220636, + "grad_norm": 0.877361536026001, + "learning_rate": 3.255733423469978e-06, + "loss": 0.1484, + "step": 25078 + }, + { + "epoch": 1.376509330406147, + "grad_norm": 0.7737651467323303, + "learning_rate": 3.253150169919328e-06, + "loss": 0.1394, + "step": 25080 + }, + { + "epoch": 1.3766190998902306, + "grad_norm": 2.056795358657837, + "learning_rate": 3.250567870292251e-06, + "loss": 0.1439, + "step": 25082 + }, + { + "epoch": 1.3767288693743138, + "grad_norm": 1.12824285030365, + "learning_rate": 3.247986524702018e-06, + "loss": 0.1946, + "step": 25084 + }, + { + "epoch": 1.3768386388583973, + "grad_norm": 1.224236011505127, + "learning_rate": 3.245406133261858e-06, + "loss": 0.1591, + "step": 25086 + }, + { + "epoch": 1.3769484083424808, + "grad_norm": 1.1047134399414062, + "learning_rate": 3.2428266960849692e-06, + "loss": 0.2129, + "step": 25088 + }, + { + "epoch": 1.3770581778265643, + "grad_norm": 1.2294948101043701, + "learning_rate": 3.2402482132844915e-06, + "loss": 0.1632, + "step": 25090 + }, + { + "epoch": 1.3771679473106477, + "grad_norm": 1.3041163682937622, + "learning_rate": 3.2376706849735232e-06, + "loss": 0.1793, + "step": 25092 + }, + { + "epoch": 1.377277716794731, + "grad_norm": 1.1758006811141968, + "learning_rate": 3.2350941112651413e-06, + "loss": 0.2216, + "step": 25094 + }, + { + "epoch": 1.3773874862788145, + "grad_norm": 1.3917361497879028, + "learning_rate": 3.23251849227236e-06, + "loss": 0.1778, + "step": 25096 + }, + { + "epoch": 1.377497255762898, + "grad_norm": 1.0807558298110962, + "learning_rate": 3.229943828108159e-06, + "loss": 0.1471, + "step": 25098 + }, + { + "epoch": 1.3776070252469814, + "grad_norm": 0.9720017313957214, + "learning_rate": 3.227370118885467e-06, + "loss": 0.2715, + "step": 25100 + }, + { + "epoch": 1.377716794731065, + "grad_norm": 0.9980533719062805, + "learning_rate": 3.224797364717197e-06, + "loss": 0.1156, + "step": 25102 + }, + { + "epoch": 1.3778265642151482, + "grad_norm": 1.7079724073410034, + "learning_rate": 3.2222255657161915e-06, + "loss": 0.3443, + "step": 25104 + }, + { + "epoch": 1.3779363336992316, + "grad_norm": 1.1409592628479004, + "learning_rate": 3.219654721995266e-06, + "loss": 0.1958, + "step": 25106 + }, + { + "epoch": 1.378046103183315, + "grad_norm": 1.3382164239883423, + "learning_rate": 3.217084833667189e-06, + "loss": 0.1937, + "step": 25108 + }, + { + "epoch": 1.3781558726673984, + "grad_norm": 1.512818694114685, + "learning_rate": 3.2145159008446807e-06, + "loss": 0.1929, + "step": 25110 + }, + { + "epoch": 1.3782656421514818, + "grad_norm": 1.6781805753707886, + "learning_rate": 3.2119479236404403e-06, + "loss": 0.2021, + "step": 25112 + }, + { + "epoch": 1.3783754116355653, + "grad_norm": 1.3267979621887207, + "learning_rate": 3.2093809021671057e-06, + "loss": 0.3725, + "step": 25114 + }, + { + "epoch": 1.3784851811196488, + "grad_norm": 1.5915157794952393, + "learning_rate": 3.2068148365372806e-06, + "loss": 0.1997, + "step": 25116 + }, + { + "epoch": 1.3785949506037323, + "grad_norm": 0.7796862125396729, + "learning_rate": 3.204249726863523e-06, + "loss": 0.1673, + "step": 25118 + }, + { + "epoch": 1.3787047200878155, + "grad_norm": 1.2313042879104614, + "learning_rate": 3.2016855732583534e-06, + "loss": 0.1405, + "step": 25120 + }, + { + "epoch": 1.378814489571899, + "grad_norm": 1.0457288026809692, + "learning_rate": 3.19912237583424e-06, + "loss": 0.2117, + "step": 25122 + }, + { + "epoch": 1.3789242590559825, + "grad_norm": 1.0636109113693237, + "learning_rate": 3.1965601347036244e-06, + "loss": 0.2077, + "step": 25124 + }, + { + "epoch": 1.3790340285400657, + "grad_norm": 1.4563157558441162, + "learning_rate": 3.1939988499789077e-06, + "loss": 0.1602, + "step": 25126 + }, + { + "epoch": 1.3791437980241492, + "grad_norm": 1.082155704498291, + "learning_rate": 3.1914385217724275e-06, + "loss": 0.165, + "step": 25128 + }, + { + "epoch": 1.3792535675082327, + "grad_norm": 0.7761269211769104, + "learning_rate": 3.1888791501964973e-06, + "loss": 0.1885, + "step": 25130 + }, + { + "epoch": 1.3793633369923162, + "grad_norm": 1.182141661643982, + "learning_rate": 3.1863207353633825e-06, + "loss": 0.1681, + "step": 25132 + }, + { + "epoch": 1.3794731064763996, + "grad_norm": 0.9524211883544922, + "learning_rate": 3.1837632773853098e-06, + "loss": 0.2048, + "step": 25134 + }, + { + "epoch": 1.379582875960483, + "grad_norm": 1.2037568092346191, + "learning_rate": 3.181206776374454e-06, + "loss": 0.1744, + "step": 25136 + }, + { + "epoch": 1.3796926454445664, + "grad_norm": 1.874370813369751, + "learning_rate": 3.1786512324429668e-06, + "loss": 0.2123, + "step": 25138 + }, + { + "epoch": 1.3798024149286499, + "grad_norm": 0.8577399849891663, + "learning_rate": 3.1760966457029416e-06, + "loss": 0.1461, + "step": 25140 + }, + { + "epoch": 1.3799121844127333, + "grad_norm": 1.117014765739441, + "learning_rate": 3.1735430162664363e-06, + "loss": 0.1961, + "step": 25142 + }, + { + "epoch": 1.3800219538968168, + "grad_norm": 0.932227373123169, + "learning_rate": 3.170990344245467e-06, + "loss": 0.1715, + "step": 25144 + }, + { + "epoch": 1.3801317233809, + "grad_norm": 1.0576876401901245, + "learning_rate": 3.168438629752002e-06, + "loss": 0.1853, + "step": 25146 + }, + { + "epoch": 1.3802414928649835, + "grad_norm": 0.9769771099090576, + "learning_rate": 3.1658878728979685e-06, + "loss": 0.1314, + "step": 25148 + }, + { + "epoch": 1.380351262349067, + "grad_norm": 1.1058828830718994, + "learning_rate": 3.1633380737952663e-06, + "loss": 0.1878, + "step": 25150 + }, + { + "epoch": 1.3804610318331503, + "grad_norm": 1.2934647798538208, + "learning_rate": 3.1607892325557385e-06, + "loss": 0.1118, + "step": 25152 + }, + { + "epoch": 1.3805708013172338, + "grad_norm": 1.558555006980896, + "learning_rate": 3.158241349291183e-06, + "loss": 0.1775, + "step": 25154 + }, + { + "epoch": 1.3806805708013172, + "grad_norm": 0.8227429389953613, + "learning_rate": 3.15569442411337e-06, + "loss": 0.1324, + "step": 25156 + }, + { + "epoch": 1.3807903402854007, + "grad_norm": 1.2213778495788574, + "learning_rate": 3.15314845713402e-06, + "loss": 0.2491, + "step": 25158 + }, + { + "epoch": 1.3809001097694842, + "grad_norm": 1.3862905502319336, + "learning_rate": 3.1506034484648036e-06, + "loss": 0.161, + "step": 25160 + }, + { + "epoch": 1.3810098792535674, + "grad_norm": 1.3851770162582397, + "learning_rate": 3.148059398217368e-06, + "loss": 0.1944, + "step": 25162 + }, + { + "epoch": 1.381119648737651, + "grad_norm": 1.6647175550460815, + "learning_rate": 3.1455163065033017e-06, + "loss": 0.2981, + "step": 25164 + }, + { + "epoch": 1.3812294182217344, + "grad_norm": 1.3459877967834473, + "learning_rate": 3.14297417343416e-06, + "loss": 0.1159, + "step": 25166 + }, + { + "epoch": 1.3813391877058177, + "grad_norm": 1.2513920068740845, + "learning_rate": 3.140432999121454e-06, + "loss": 0.2304, + "step": 25168 + }, + { + "epoch": 1.3814489571899011, + "grad_norm": 0.907501757144928, + "learning_rate": 3.1378927836766466e-06, + "loss": 0.0943, + "step": 25170 + }, + { + "epoch": 1.3815587266739846, + "grad_norm": 1.0345567464828491, + "learning_rate": 3.13535352721116e-06, + "loss": 0.1289, + "step": 25172 + }, + { + "epoch": 1.381668496158068, + "grad_norm": 1.2548577785491943, + "learning_rate": 3.1328152298363946e-06, + "loss": 0.2299, + "step": 25174 + }, + { + "epoch": 1.3817782656421516, + "grad_norm": 1.1443941593170166, + "learning_rate": 3.1302778916636828e-06, + "loss": 0.1455, + "step": 25176 + }, + { + "epoch": 1.3818880351262348, + "grad_norm": 0.8911463022232056, + "learning_rate": 3.1277415128043247e-06, + "loss": 0.1495, + "step": 25178 + }, + { + "epoch": 1.3819978046103183, + "grad_norm": 0.8817422389984131, + "learning_rate": 3.1252060933695814e-06, + "loss": 0.1226, + "step": 25180 + }, + { + "epoch": 1.3821075740944018, + "grad_norm": 1.9678324460983276, + "learning_rate": 3.122671633470664e-06, + "loss": 0.2121, + "step": 25182 + }, + { + "epoch": 1.3822173435784852, + "grad_norm": 1.1312243938446045, + "learning_rate": 3.1201381332187447e-06, + "loss": 0.2397, + "step": 25184 + }, + { + "epoch": 1.3823271130625687, + "grad_norm": 1.4478353261947632, + "learning_rate": 3.1176055927249594e-06, + "loss": 0.193, + "step": 25186 + }, + { + "epoch": 1.382436882546652, + "grad_norm": 0.8950369358062744, + "learning_rate": 3.115074012100405e-06, + "loss": 0.0913, + "step": 25188 + }, + { + "epoch": 1.3825466520307355, + "grad_norm": 1.1627404689788818, + "learning_rate": 3.1125433914561186e-06, + "loss": 0.2186, + "step": 25190 + }, + { + "epoch": 1.382656421514819, + "grad_norm": 1.4409645795822144, + "learning_rate": 3.1100137309031107e-06, + "loss": 0.2068, + "step": 25192 + }, + { + "epoch": 1.3827661909989022, + "grad_norm": 1.0222835540771484, + "learning_rate": 3.107485030552343e-06, + "loss": 0.1887, + "step": 25194 + }, + { + "epoch": 1.3828759604829857, + "grad_norm": 0.955814003944397, + "learning_rate": 3.1049572905147375e-06, + "loss": 0.1798, + "step": 25196 + }, + { + "epoch": 1.3829857299670691, + "grad_norm": 1.8289318084716797, + "learning_rate": 3.1024305109011665e-06, + "loss": 0.2104, + "step": 25198 + }, + { + "epoch": 1.3830954994511526, + "grad_norm": 1.1536868810653687, + "learning_rate": 3.099904691822478e-06, + "loss": 0.1889, + "step": 25200 + }, + { + "epoch": 1.383205268935236, + "grad_norm": 1.2563878297805786, + "learning_rate": 3.0973798333894634e-06, + "loss": 0.1286, + "step": 25202 + }, + { + "epoch": 1.3833150384193194, + "grad_norm": 1.746656894683838, + "learning_rate": 3.0948559357128703e-06, + "loss": 0.1844, + "step": 25204 + }, + { + "epoch": 1.3834248079034028, + "grad_norm": 1.0924710035324097, + "learning_rate": 3.092332998903416e-06, + "loss": 0.1673, + "step": 25206 + }, + { + "epoch": 1.3835345773874863, + "grad_norm": 1.2787973880767822, + "learning_rate": 3.0898110230717563e-06, + "loss": 0.1322, + "step": 25208 + }, + { + "epoch": 1.3836443468715698, + "grad_norm": 0.7991275787353516, + "learning_rate": 3.087290008328536e-06, + "loss": 0.1949, + "step": 25210 + }, + { + "epoch": 1.3837541163556533, + "grad_norm": 1.0117003917694092, + "learning_rate": 3.0847699547843277e-06, + "loss": 0.1821, + "step": 25212 + }, + { + "epoch": 1.3838638858397365, + "grad_norm": 1.0005524158477783, + "learning_rate": 3.082250862549671e-06, + "loss": 0.2214, + "step": 25214 + }, + { + "epoch": 1.38397365532382, + "grad_norm": 1.800553798675537, + "learning_rate": 3.0797327317350744e-06, + "loss": 0.1838, + "step": 25216 + }, + { + "epoch": 1.3840834248079035, + "grad_norm": 1.1316392421722412, + "learning_rate": 3.077215562450991e-06, + "loss": 0.1151, + "step": 25218 + }, + { + "epoch": 1.3841931942919867, + "grad_norm": 1.148484706878662, + "learning_rate": 3.074699354807839e-06, + "loss": 0.1703, + "step": 25220 + }, + { + "epoch": 1.3843029637760702, + "grad_norm": 1.6213096380233765, + "learning_rate": 3.072184108915982e-06, + "loss": 0.1943, + "step": 25222 + }, + { + "epoch": 1.3844127332601537, + "grad_norm": 1.1214381456375122, + "learning_rate": 3.0696698248857625e-06, + "loss": 0.1579, + "step": 25224 + }, + { + "epoch": 1.3845225027442372, + "grad_norm": 0.9669640064239502, + "learning_rate": 3.0671565028274644e-06, + "loss": 0.3176, + "step": 25226 + }, + { + "epoch": 1.3846322722283206, + "grad_norm": 1.3443794250488281, + "learning_rate": 3.064644142851336e-06, + "loss": 0.1843, + "step": 25228 + }, + { + "epoch": 1.384742041712404, + "grad_norm": 1.1723788976669312, + "learning_rate": 3.062132745067581e-06, + "loss": 0.1301, + "step": 25230 + }, + { + "epoch": 1.3848518111964874, + "grad_norm": 1.1726394891738892, + "learning_rate": 3.059622309586363e-06, + "loss": 0.2432, + "step": 25232 + }, + { + "epoch": 1.3849615806805708, + "grad_norm": 1.1322449445724487, + "learning_rate": 3.057112836517792e-06, + "loss": 0.2073, + "step": 25234 + }, + { + "epoch": 1.385071350164654, + "grad_norm": 1.2458994388580322, + "learning_rate": 3.0546043259719604e-06, + "loss": 0.161, + "step": 25236 + }, + { + "epoch": 1.3851811196487376, + "grad_norm": 1.1891599893569946, + "learning_rate": 3.0520967780588967e-06, + "loss": 0.2303, + "step": 25238 + }, + { + "epoch": 1.385290889132821, + "grad_norm": 1.0145503282546997, + "learning_rate": 3.049590192888596e-06, + "loss": 0.144, + "step": 25240 + }, + { + "epoch": 1.3854006586169045, + "grad_norm": 1.3024203777313232, + "learning_rate": 3.047084570571007e-06, + "loss": 0.2386, + "step": 25242 + }, + { + "epoch": 1.385510428100988, + "grad_norm": 0.903030276298523, + "learning_rate": 3.0445799112160363e-06, + "loss": 0.1371, + "step": 25244 + }, + { + "epoch": 1.3856201975850713, + "grad_norm": 1.109222173690796, + "learning_rate": 3.0420762149335565e-06, + "loss": 0.1268, + "step": 25246 + }, + { + "epoch": 1.3857299670691547, + "grad_norm": 1.2149862051010132, + "learning_rate": 3.039573481833388e-06, + "loss": 0.2178, + "step": 25248 + }, + { + "epoch": 1.3858397365532382, + "grad_norm": 1.3312721252441406, + "learning_rate": 3.037071712025319e-06, + "loss": 0.2069, + "step": 25250 + }, + { + "epoch": 1.3859495060373217, + "grad_norm": 0.8509261012077332, + "learning_rate": 3.0345709056190828e-06, + "loss": 0.1623, + "step": 25252 + }, + { + "epoch": 1.3860592755214052, + "grad_norm": 0.7189674377441406, + "learning_rate": 3.0320710627243813e-06, + "loss": 0.1647, + "step": 25254 + }, + { + "epoch": 1.3861690450054884, + "grad_norm": 1.2181371450424194, + "learning_rate": 3.029572183450868e-06, + "loss": 0.2688, + "step": 25256 + }, + { + "epoch": 1.386278814489572, + "grad_norm": 1.4603638648986816, + "learning_rate": 3.0270742679081503e-06, + "loss": 0.2054, + "step": 25258 + }, + { + "epoch": 1.3863885839736554, + "grad_norm": 0.83466637134552, + "learning_rate": 3.024577316205812e-06, + "loss": 0.1722, + "step": 25260 + }, + { + "epoch": 1.3864983534577386, + "grad_norm": 1.0978069305419922, + "learning_rate": 3.0220813284533717e-06, + "loss": 0.2253, + "step": 25262 + }, + { + "epoch": 1.3866081229418221, + "grad_norm": 1.481748342514038, + "learning_rate": 3.019586304760319e-06, + "loss": 0.2692, + "step": 25264 + }, + { + "epoch": 1.3867178924259056, + "grad_norm": 1.2543483972549438, + "learning_rate": 3.0170922452360973e-06, + "loss": 0.1524, + "step": 25266 + }, + { + "epoch": 1.386827661909989, + "grad_norm": 2.00091290473938, + "learning_rate": 3.014599149990108e-06, + "loss": 0.17, + "step": 25268 + }, + { + "epoch": 1.3869374313940726, + "grad_norm": 1.722878098487854, + "learning_rate": 3.0121070191317074e-06, + "loss": 0.15, + "step": 25270 + }, + { + "epoch": 1.3870472008781558, + "grad_norm": 0.7122257351875305, + "learning_rate": 3.0096158527702175e-06, + "loss": 0.115, + "step": 25272 + }, + { + "epoch": 1.3871569703622393, + "grad_norm": 1.1127490997314453, + "learning_rate": 3.007125651014908e-06, + "loss": 0.1452, + "step": 25274 + }, + { + "epoch": 1.3872667398463228, + "grad_norm": 1.3027896881103516, + "learning_rate": 3.0046364139750203e-06, + "loss": 0.2356, + "step": 25276 + }, + { + "epoch": 1.387376509330406, + "grad_norm": 1.7220734357833862, + "learning_rate": 3.002148141759739e-06, + "loss": 0.2533, + "step": 25278 + }, + { + "epoch": 1.3874862788144895, + "grad_norm": 1.1097291707992554, + "learning_rate": 2.999660834478213e-06, + "loss": 0.1639, + "step": 25280 + }, + { + "epoch": 1.387596048298573, + "grad_norm": 1.102419376373291, + "learning_rate": 2.997174492239546e-06, + "loss": 0.131, + "step": 25282 + }, + { + "epoch": 1.3877058177826564, + "grad_norm": 1.1725177764892578, + "learning_rate": 2.994689115152796e-06, + "loss": 0.1675, + "step": 25284 + }, + { + "epoch": 1.38781558726674, + "grad_norm": 1.0669447183609009, + "learning_rate": 2.992204703326995e-06, + "loss": 0.1952, + "step": 25286 + }, + { + "epoch": 1.3879253567508232, + "grad_norm": 1.2745202779769897, + "learning_rate": 2.989721256871117e-06, + "loss": 0.296, + "step": 25288 + }, + { + "epoch": 1.3880351262349067, + "grad_norm": 1.5085421800613403, + "learning_rate": 2.9872387758940968e-06, + "loss": 0.1569, + "step": 25290 + }, + { + "epoch": 1.3881448957189901, + "grad_norm": 0.9011393189430237, + "learning_rate": 2.9847572605048305e-06, + "loss": 0.1678, + "step": 25292 + }, + { + "epoch": 1.3882546652030736, + "grad_norm": 1.5767500400543213, + "learning_rate": 2.9822767108121625e-06, + "loss": 0.2548, + "step": 25294 + }, + { + "epoch": 1.388364434687157, + "grad_norm": 1.402385950088501, + "learning_rate": 2.97979712692491e-06, + "loss": 0.2251, + "step": 25296 + }, + { + "epoch": 1.3884742041712403, + "grad_norm": 1.2186506986618042, + "learning_rate": 2.9773185089518402e-06, + "loss": 0.1859, + "step": 25298 + }, + { + "epoch": 1.3885839736553238, + "grad_norm": 0.8738197088241577, + "learning_rate": 2.9748408570016735e-06, + "loss": 0.282, + "step": 25300 + }, + { + "epoch": 1.3886937431394073, + "grad_norm": 1.2963403463363647, + "learning_rate": 2.97236417118309e-06, + "loss": 0.165, + "step": 25302 + }, + { + "epoch": 1.3888035126234906, + "grad_norm": 0.9181370735168457, + "learning_rate": 2.9698884516047297e-06, + "loss": 0.1908, + "step": 25304 + }, + { + "epoch": 1.388913282107574, + "grad_norm": 1.3791862726211548, + "learning_rate": 2.967413698375196e-06, + "loss": 0.2234, + "step": 25306 + }, + { + "epoch": 1.3890230515916575, + "grad_norm": 1.631308674812317, + "learning_rate": 2.9649399116030348e-06, + "loss": 0.2221, + "step": 25308 + }, + { + "epoch": 1.389132821075741, + "grad_norm": 1.1339292526245117, + "learning_rate": 2.96246709139677e-06, + "loss": 0.222, + "step": 25310 + }, + { + "epoch": 1.3892425905598245, + "grad_norm": 1.0094521045684814, + "learning_rate": 2.9599952378648626e-06, + "loss": 0.2086, + "step": 25312 + }, + { + "epoch": 1.3893523600439077, + "grad_norm": 0.907394289970398, + "learning_rate": 2.9575243511157453e-06, + "loss": 0.101, + "step": 25314 + }, + { + "epoch": 1.3894621295279912, + "grad_norm": 0.9675107002258301, + "learning_rate": 2.9550544312577977e-06, + "loss": 0.2052, + "step": 25316 + }, + { + "epoch": 1.3895718990120747, + "grad_norm": 1.0474447011947632, + "learning_rate": 2.9525854783993696e-06, + "loss": 0.1924, + "step": 25318 + }, + { + "epoch": 1.3896816684961582, + "grad_norm": 0.9866352677345276, + "learning_rate": 2.950117492648749e-06, + "loss": 0.1811, + "step": 25320 + }, + { + "epoch": 1.3897914379802416, + "grad_norm": 0.8395014405250549, + "learning_rate": 2.947650474114211e-06, + "loss": 0.1498, + "step": 25322 + }, + { + "epoch": 1.3899012074643249, + "grad_norm": 1.0068355798721313, + "learning_rate": 2.9451844229039597e-06, + "loss": 0.1694, + "step": 25324 + }, + { + "epoch": 1.3900109769484084, + "grad_norm": 1.1650705337524414, + "learning_rate": 2.942719339126171e-06, + "loss": 0.2049, + "step": 25326 + }, + { + "epoch": 1.3901207464324918, + "grad_norm": 1.4011671543121338, + "learning_rate": 2.9402552228889768e-06, + "loss": 0.2971, + "step": 25328 + }, + { + "epoch": 1.390230515916575, + "grad_norm": 1.0011652708053589, + "learning_rate": 2.9377920743004577e-06, + "loss": 0.179, + "step": 25330 + }, + { + "epoch": 1.3903402854006586, + "grad_norm": 1.182221531867981, + "learning_rate": 2.935329893468672e-06, + "loss": 0.1747, + "step": 25332 + }, + { + "epoch": 1.390450054884742, + "grad_norm": 1.3278443813323975, + "learning_rate": 2.932868680501613e-06, + "loss": 0.175, + "step": 25334 + }, + { + "epoch": 1.3905598243688255, + "grad_norm": 1.2954154014587402, + "learning_rate": 2.930408435507248e-06, + "loss": 0.1758, + "step": 25336 + }, + { + "epoch": 1.390669593852909, + "grad_norm": 1.0509729385375977, + "learning_rate": 2.927949158593496e-06, + "loss": 0.248, + "step": 25338 + }, + { + "epoch": 1.3907793633369923, + "grad_norm": 1.0103240013122559, + "learning_rate": 2.925490849868226e-06, + "loss": 0.1652, + "step": 25340 + }, + { + "epoch": 1.3908891328210757, + "grad_norm": 1.9865977764129639, + "learning_rate": 2.92303350943928e-06, + "loss": 0.2314, + "step": 25342 + }, + { + "epoch": 1.3909989023051592, + "grad_norm": 1.446858525276184, + "learning_rate": 2.9205771374144346e-06, + "loss": 0.2138, + "step": 25344 + }, + { + "epoch": 1.3911086717892425, + "grad_norm": 1.3001270294189453, + "learning_rate": 2.918121733901458e-06, + "loss": 0.2291, + "step": 25346 + }, + { + "epoch": 1.391218441273326, + "grad_norm": 1.1419093608856201, + "learning_rate": 2.915667299008043e-06, + "loss": 0.1453, + "step": 25348 + }, + { + "epoch": 1.3913282107574094, + "grad_norm": 1.7578098773956299, + "learning_rate": 2.9132138328418573e-06, + "loss": 0.268, + "step": 25350 + }, + { + "epoch": 1.391437980241493, + "grad_norm": 1.185549020767212, + "learning_rate": 2.91076133551052e-06, + "loss": 0.1407, + "step": 25352 + }, + { + "epoch": 1.3915477497255764, + "grad_norm": 1.458108901977539, + "learning_rate": 2.9083098071216118e-06, + "loss": 0.1912, + "step": 25354 + }, + { + "epoch": 1.3916575192096596, + "grad_norm": 1.3288604021072388, + "learning_rate": 2.9058592477826636e-06, + "loss": 0.1231, + "step": 25356 + }, + { + "epoch": 1.391767288693743, + "grad_norm": 0.9051675200462341, + "learning_rate": 2.9034096576011806e-06, + "loss": 0.1541, + "step": 25358 + }, + { + "epoch": 1.3918770581778266, + "grad_norm": 2.065622568130493, + "learning_rate": 2.900961036684602e-06, + "loss": 0.2262, + "step": 25360 + }, + { + "epoch": 1.39198682766191, + "grad_norm": 1.5377312898635864, + "learning_rate": 2.898513385140342e-06, + "loss": 0.1399, + "step": 25362 + }, + { + "epoch": 1.3920965971459935, + "grad_norm": 2.397195816040039, + "learning_rate": 2.8960667030757617e-06, + "loss": 0.209, + "step": 25364 + }, + { + "epoch": 1.3922063666300768, + "grad_norm": 0.9353189468383789, + "learning_rate": 2.893620990598192e-06, + "loss": 0.1583, + "step": 25366 + }, + { + "epoch": 1.3923161361141603, + "grad_norm": 0.6921894550323486, + "learning_rate": 2.891176247814911e-06, + "loss": 0.0945, + "step": 25368 + }, + { + "epoch": 1.3924259055982438, + "grad_norm": 1.391879677772522, + "learning_rate": 2.888732474833153e-06, + "loss": 0.2063, + "step": 25370 + }, + { + "epoch": 1.392535675082327, + "grad_norm": 1.5439285039901733, + "learning_rate": 2.8862896717601203e-06, + "loss": 0.2579, + "step": 25372 + }, + { + "epoch": 1.3926454445664105, + "grad_norm": 1.0797113180160522, + "learning_rate": 2.8838478387029606e-06, + "loss": 0.1196, + "step": 25374 + }, + { + "epoch": 1.392755214050494, + "grad_norm": 1.2068244218826294, + "learning_rate": 2.8814069757687913e-06, + "loss": 0.1975, + "step": 25376 + }, + { + "epoch": 1.3928649835345774, + "grad_norm": 0.807682991027832, + "learning_rate": 2.8789670830646735e-06, + "loss": 0.1747, + "step": 25378 + }, + { + "epoch": 1.392974753018661, + "grad_norm": 1.2596168518066406, + "learning_rate": 2.8765281606976337e-06, + "loss": 0.1588, + "step": 25380 + }, + { + "epoch": 1.3930845225027442, + "grad_norm": 1.5071220397949219, + "learning_rate": 2.87409020877466e-06, + "loss": 0.2124, + "step": 25382 + }, + { + "epoch": 1.3931942919868276, + "grad_norm": 0.92131108045578, + "learning_rate": 2.8716532274026903e-06, + "loss": 0.1691, + "step": 25384 + }, + { + "epoch": 1.3933040614709111, + "grad_norm": 0.8874691128730774, + "learning_rate": 2.8692172166886215e-06, + "loss": 0.2049, + "step": 25386 + }, + { + "epoch": 1.3934138309549944, + "grad_norm": 0.9885433316230774, + "learning_rate": 2.8667821767393106e-06, + "loss": 0.1344, + "step": 25388 + }, + { + "epoch": 1.3935236004390779, + "grad_norm": 0.7893283367156982, + "learning_rate": 2.8643481076615712e-06, + "loss": 0.2783, + "step": 25390 + }, + { + "epoch": 1.3936333699231613, + "grad_norm": 2.020979881286621, + "learning_rate": 2.8619150095621662e-06, + "loss": 0.2793, + "step": 25392 + }, + { + "epoch": 1.3937431394072448, + "grad_norm": 1.2633970975875854, + "learning_rate": 2.859482882547834e-06, + "loss": 0.136, + "step": 25394 + }, + { + "epoch": 1.3938529088913283, + "grad_norm": 1.0970240831375122, + "learning_rate": 2.857051726725249e-06, + "loss": 0.2319, + "step": 25396 + }, + { + "epoch": 1.3939626783754115, + "grad_norm": 1.475576400756836, + "learning_rate": 2.8546215422010638e-06, + "loss": 0.1392, + "step": 25398 + }, + { + "epoch": 1.394072447859495, + "grad_norm": 1.7989784479141235, + "learning_rate": 2.8521923290818738e-06, + "loss": 0.2153, + "step": 25400 + }, + { + "epoch": 1.3941822173435785, + "grad_norm": 1.012597918510437, + "learning_rate": 2.849764087474238e-06, + "loss": 0.1359, + "step": 25402 + }, + { + "epoch": 1.394291986827662, + "grad_norm": 0.7305771708488464, + "learning_rate": 2.8473368174846666e-06, + "loss": 0.1459, + "step": 25404 + }, + { + "epoch": 1.3944017563117455, + "grad_norm": 1.397918462753296, + "learning_rate": 2.8449105192196316e-06, + "loss": 0.1954, + "step": 25406 + }, + { + "epoch": 1.3945115257958287, + "grad_norm": 1.1413283348083496, + "learning_rate": 2.8424851927855684e-06, + "loss": 0.1726, + "step": 25408 + }, + { + "epoch": 1.3946212952799122, + "grad_norm": 0.9203509092330933, + "learning_rate": 2.8400608382888604e-06, + "loss": 0.1447, + "step": 25410 + }, + { + "epoch": 1.3947310647639957, + "grad_norm": 0.7976059913635254, + "learning_rate": 2.837637455835851e-06, + "loss": 0.1456, + "step": 25412 + }, + { + "epoch": 1.394840834248079, + "grad_norm": 0.9931081533432007, + "learning_rate": 2.8352150455328407e-06, + "loss": 0.1007, + "step": 25414 + }, + { + "epoch": 1.3949506037321624, + "grad_norm": 1.2339450120925903, + "learning_rate": 2.832793607486087e-06, + "loss": 0.1758, + "step": 25416 + }, + { + "epoch": 1.3950603732162459, + "grad_norm": 1.1818089485168457, + "learning_rate": 2.830373141801815e-06, + "loss": 0.2151, + "step": 25418 + }, + { + "epoch": 1.3951701427003294, + "grad_norm": 0.8428771495819092, + "learning_rate": 2.8279536485861873e-06, + "loss": 0.0835, + "step": 25420 + }, + { + "epoch": 1.3952799121844128, + "grad_norm": 1.6915504932403564, + "learning_rate": 2.8255351279453446e-06, + "loss": 0.2219, + "step": 25422 + }, + { + "epoch": 1.395389681668496, + "grad_norm": 1.0357279777526855, + "learning_rate": 2.8231175799853653e-06, + "loss": 0.1926, + "step": 25424 + }, + { + "epoch": 1.3954994511525796, + "grad_norm": 1.6345123052597046, + "learning_rate": 2.8207010048122955e-06, + "loss": 0.2276, + "step": 25426 + }, + { + "epoch": 1.395609220636663, + "grad_norm": 1.5068416595458984, + "learning_rate": 2.8182854025321505e-06, + "loss": 0.2884, + "step": 25428 + }, + { + "epoch": 1.3957189901207463, + "grad_norm": 1.6819874048233032, + "learning_rate": 2.8158707732508727e-06, + "loss": 0.2024, + "step": 25430 + }, + { + "epoch": 1.39582875960483, + "grad_norm": 1.2776376008987427, + "learning_rate": 2.8134571170743975e-06, + "loss": 0.134, + "step": 25432 + }, + { + "epoch": 1.3959385290889132, + "grad_norm": 0.8689044713973999, + "learning_rate": 2.8110444341085895e-06, + "loss": 0.0991, + "step": 25434 + }, + { + "epoch": 1.3960482985729967, + "grad_norm": 1.7606955766677856, + "learning_rate": 2.8086327244592814e-06, + "loss": 0.1364, + "step": 25436 + }, + { + "epoch": 1.3961580680570802, + "grad_norm": 1.41842782497406, + "learning_rate": 2.8062219882322633e-06, + "loss": 0.1751, + "step": 25438 + }, + { + "epoch": 1.3962678375411635, + "grad_norm": 1.0721135139465332, + "learning_rate": 2.803812225533284e-06, + "loss": 0.2372, + "step": 25440 + }, + { + "epoch": 1.396377607025247, + "grad_norm": 1.6296305656433105, + "learning_rate": 2.8014034364680392e-06, + "loss": 0.2151, + "step": 25442 + }, + { + "epoch": 1.3964873765093304, + "grad_norm": 1.0791923999786377, + "learning_rate": 2.798995621142203e-06, + "loss": 0.1412, + "step": 25444 + }, + { + "epoch": 1.396597145993414, + "grad_norm": 1.247780203819275, + "learning_rate": 2.7965887796613884e-06, + "loss": 0.1898, + "step": 25446 + }, + { + "epoch": 1.3967069154774974, + "grad_norm": 1.321718454360962, + "learning_rate": 2.7941829121311712e-06, + "loss": 0.1763, + "step": 25448 + }, + { + "epoch": 1.3968166849615806, + "grad_norm": 1.267077088356018, + "learning_rate": 2.7917780186570818e-06, + "loss": 0.265, + "step": 25450 + }, + { + "epoch": 1.396926454445664, + "grad_norm": 1.314070224761963, + "learning_rate": 2.7893740993446133e-06, + "loss": 0.1261, + "step": 25452 + }, + { + "epoch": 1.3970362239297476, + "grad_norm": 1.4471367597579956, + "learning_rate": 2.786971154299209e-06, + "loss": 0.2062, + "step": 25454 + }, + { + "epoch": 1.3971459934138308, + "grad_norm": 1.4355390071868896, + "learning_rate": 2.784569183626276e-06, + "loss": 0.1, + "step": 25456 + }, + { + "epoch": 1.3972557628979143, + "grad_norm": 1.5720523595809937, + "learning_rate": 2.782168187431186e-06, + "loss": 0.2333, + "step": 25458 + }, + { + "epoch": 1.3973655323819978, + "grad_norm": 1.4084042310714722, + "learning_rate": 2.779768165819249e-06, + "loss": 0.19, + "step": 25460 + }, + { + "epoch": 1.3974753018660813, + "grad_norm": 1.1412091255187988, + "learning_rate": 2.7773691188957453e-06, + "loss": 0.1397, + "step": 25462 + }, + { + "epoch": 1.3975850713501647, + "grad_norm": 1.4691317081451416, + "learning_rate": 2.774971046765906e-06, + "loss": 0.2839, + "step": 25464 + }, + { + "epoch": 1.397694840834248, + "grad_norm": 1.1431854963302612, + "learning_rate": 2.772573949534918e-06, + "loss": 0.2173, + "step": 25466 + }, + { + "epoch": 1.3978046103183315, + "grad_norm": 0.7510189414024353, + "learning_rate": 2.7701778273079404e-06, + "loss": 0.127, + "step": 25468 + }, + { + "epoch": 1.397914379802415, + "grad_norm": 0.8827678561210632, + "learning_rate": 2.767782680190073e-06, + "loss": 0.1399, + "step": 25470 + }, + { + "epoch": 1.3980241492864984, + "grad_norm": 1.4484754800796509, + "learning_rate": 2.765388508286382e-06, + "loss": 0.1882, + "step": 25472 + }, + { + "epoch": 1.398133918770582, + "grad_norm": 1.0158131122589111, + "learning_rate": 2.7629953117018825e-06, + "loss": 0.2463, + "step": 25474 + }, + { + "epoch": 1.3982436882546652, + "grad_norm": 1.7137362957000732, + "learning_rate": 2.7606030905415555e-06, + "loss": 0.1217, + "step": 25476 + }, + { + "epoch": 1.3983534577387486, + "grad_norm": 1.3108166456222534, + "learning_rate": 2.7582118449103274e-06, + "loss": 0.2024, + "step": 25478 + }, + { + "epoch": 1.3984632272228321, + "grad_norm": 1.0900871753692627, + "learning_rate": 2.7558215749131033e-06, + "loss": 0.1937, + "step": 25480 + }, + { + "epoch": 1.3985729967069154, + "grad_norm": 1.3493828773498535, + "learning_rate": 2.7534322806547276e-06, + "loss": 0.2046, + "step": 25482 + }, + { + "epoch": 1.3986827661909988, + "grad_norm": 0.7561604380607605, + "learning_rate": 2.751043962240002e-06, + "loss": 0.1091, + "step": 25484 + }, + { + "epoch": 1.3987925356750823, + "grad_norm": 1.0868313312530518, + "learning_rate": 2.7486566197736874e-06, + "loss": 0.1834, + "step": 25486 + }, + { + "epoch": 1.3989023051591658, + "grad_norm": 0.8621016144752502, + "learning_rate": 2.7462702533605166e-06, + "loss": 0.1731, + "step": 25488 + }, + { + "epoch": 1.3990120746432493, + "grad_norm": 1.0028456449508667, + "learning_rate": 2.743884863105159e-06, + "loss": 0.1215, + "step": 25490 + }, + { + "epoch": 1.3991218441273325, + "grad_norm": 1.264963150024414, + "learning_rate": 2.741500449112247e-06, + "loss": 0.143, + "step": 25492 + }, + { + "epoch": 1.399231613611416, + "grad_norm": 1.2775754928588867, + "learning_rate": 2.739117011486378e-06, + "loss": 0.2779, + "step": 25494 + }, + { + "epoch": 1.3993413830954995, + "grad_norm": 1.0157629251480103, + "learning_rate": 2.736734550332104e-06, + "loss": 0.1258, + "step": 25496 + }, + { + "epoch": 1.3994511525795827, + "grad_norm": 1.5325264930725098, + "learning_rate": 2.7343530657539253e-06, + "loss": 0.1901, + "step": 25498 + }, + { + "epoch": 1.3995609220636662, + "grad_norm": 1.0083065032958984, + "learning_rate": 2.7319725578563047e-06, + "loss": 0.19, + "step": 25500 + }, + { + "epoch": 1.3996706915477497, + "grad_norm": 0.9322168231010437, + "learning_rate": 2.7295930267436682e-06, + "loss": 0.2775, + "step": 25502 + }, + { + "epoch": 1.3997804610318332, + "grad_norm": 1.555165410041809, + "learning_rate": 2.727214472520387e-06, + "loss": 0.1649, + "step": 25504 + }, + { + "epoch": 1.3998902305159167, + "grad_norm": 1.2748427391052246, + "learning_rate": 2.7248368952908053e-06, + "loss": 0.1898, + "step": 25506 + }, + { + "epoch": 1.4, + "grad_norm": 0.9134858846664429, + "learning_rate": 2.72246029515921e-06, + "loss": 0.1893, + "step": 25508 + }, + { + "epoch": 1.4001097694840834, + "grad_norm": 1.5127187967300415, + "learning_rate": 2.72008467222985e-06, + "loss": 0.2103, + "step": 25510 + }, + { + "epoch": 1.4002195389681669, + "grad_norm": 0.9846867918968201, + "learning_rate": 2.7177100266069317e-06, + "loss": 0.1921, + "step": 25512 + }, + { + "epoch": 1.4003293084522503, + "grad_norm": 1.0726747512817383, + "learning_rate": 2.7153363583946157e-06, + "loss": 0.2154, + "step": 25514 + }, + { + "epoch": 1.4004390779363338, + "grad_norm": 0.9220674633979797, + "learning_rate": 2.712963667697027e-06, + "loss": 0.1979, + "step": 25516 + }, + { + "epoch": 1.400548847420417, + "grad_norm": 1.04811692237854, + "learning_rate": 2.710591954618247e-06, + "loss": 0.2039, + "step": 25518 + }, + { + "epoch": 1.4006586169045006, + "grad_norm": 1.3930264711380005, + "learning_rate": 2.708221219262308e-06, + "loss": 0.1861, + "step": 25520 + }, + { + "epoch": 1.400768386388584, + "grad_norm": 1.2522395849227905, + "learning_rate": 2.7058514617332e-06, + "loss": 0.2527, + "step": 25522 + }, + { + "epoch": 1.4008781558726673, + "grad_norm": 1.0239448547363281, + "learning_rate": 2.7034826821348723e-06, + "loss": 0.1605, + "step": 25524 + }, + { + "epoch": 1.4009879253567508, + "grad_norm": 0.8225777745246887, + "learning_rate": 2.7011148805712314e-06, + "loss": 0.1128, + "step": 25526 + }, + { + "epoch": 1.4010976948408342, + "grad_norm": 0.9086242318153381, + "learning_rate": 2.698748057146139e-06, + "loss": 0.1389, + "step": 25528 + }, + { + "epoch": 1.4012074643249177, + "grad_norm": 1.2567485570907593, + "learning_rate": 2.6963822119634217e-06, + "loss": 0.2409, + "step": 25530 + }, + { + "epoch": 1.4013172338090012, + "grad_norm": 0.9948658347129822, + "learning_rate": 2.694017345126851e-06, + "loss": 0.197, + "step": 25532 + }, + { + "epoch": 1.4014270032930845, + "grad_norm": 1.2945151329040527, + "learning_rate": 2.6916534567401676e-06, + "loss": 0.1978, + "step": 25534 + }, + { + "epoch": 1.401536772777168, + "grad_norm": 1.139045238494873, + "learning_rate": 2.6892905469070554e-06, + "loss": 0.1373, + "step": 25536 + }, + { + "epoch": 1.4016465422612514, + "grad_norm": 0.9375502467155457, + "learning_rate": 2.6869286157311684e-06, + "loss": 0.1405, + "step": 25538 + }, + { + "epoch": 1.4017563117453347, + "grad_norm": 1.5185505151748657, + "learning_rate": 2.6845676633161095e-06, + "loss": 0.1994, + "step": 25540 + }, + { + "epoch": 1.4018660812294184, + "grad_norm": 1.022322416305542, + "learning_rate": 2.6822076897654452e-06, + "loss": 0.1248, + "step": 25542 + }, + { + "epoch": 1.4019758507135016, + "grad_norm": 1.386643409729004, + "learning_rate": 2.6798486951826934e-06, + "loss": 0.2002, + "step": 25544 + }, + { + "epoch": 1.402085620197585, + "grad_norm": 1.210630178451538, + "learning_rate": 2.6774906796713295e-06, + "loss": 0.1249, + "step": 25546 + }, + { + "epoch": 1.4021953896816686, + "grad_norm": 1.3120509386062622, + "learning_rate": 2.675133643334793e-06, + "loss": 0.2195, + "step": 25548 + }, + { + "epoch": 1.4023051591657518, + "grad_norm": 1.2700819969177246, + "learning_rate": 2.67277758627647e-06, + "loss": 0.2929, + "step": 25550 + }, + { + "epoch": 1.4024149286498353, + "grad_norm": 1.0146713256835938, + "learning_rate": 2.6704225085997127e-06, + "loss": 0.1628, + "step": 25552 + }, + { + "epoch": 1.4025246981339188, + "grad_norm": 1.0378186702728271, + "learning_rate": 2.66806841040782e-06, + "loss": 0.1698, + "step": 25554 + }, + { + "epoch": 1.4026344676180023, + "grad_norm": 1.1633683443069458, + "learning_rate": 2.6657152918040607e-06, + "loss": 0.1805, + "step": 25556 + }, + { + "epoch": 1.4027442371020857, + "grad_norm": 1.3715457916259766, + "learning_rate": 2.663363152891654e-06, + "loss": 0.1665, + "step": 25558 + }, + { + "epoch": 1.402854006586169, + "grad_norm": 0.8496752977371216, + "learning_rate": 2.6610119937737737e-06, + "loss": 0.1392, + "step": 25560 + }, + { + "epoch": 1.4029637760702525, + "grad_norm": 1.3368816375732422, + "learning_rate": 2.6586618145535536e-06, + "loss": 0.1226, + "step": 25562 + }, + { + "epoch": 1.403073545554336, + "grad_norm": 1.1834592819213867, + "learning_rate": 2.656312615334078e-06, + "loss": 0.095, + "step": 25564 + }, + { + "epoch": 1.4031833150384192, + "grad_norm": 2.1844077110290527, + "learning_rate": 2.6539643962184057e-06, + "loss": 0.2091, + "step": 25566 + }, + { + "epoch": 1.4032930845225027, + "grad_norm": 0.9886869192123413, + "learning_rate": 2.6516171573095355e-06, + "loss": 0.1925, + "step": 25568 + }, + { + "epoch": 1.4034028540065862, + "grad_norm": 1.0112859010696411, + "learning_rate": 2.649270898710432e-06, + "loss": 0.1778, + "step": 25570 + }, + { + "epoch": 1.4035126234906696, + "grad_norm": 1.4271631240844727, + "learning_rate": 2.6469256205240074e-06, + "loss": 0.2968, + "step": 25572 + }, + { + "epoch": 1.403622392974753, + "grad_norm": 0.7789452075958252, + "learning_rate": 2.6445813228531403e-06, + "loss": 0.1404, + "step": 25574 + }, + { + "epoch": 1.4037321624588364, + "grad_norm": 1.328885555267334, + "learning_rate": 2.6422380058006596e-06, + "loss": 0.172, + "step": 25576 + }, + { + "epoch": 1.4038419319429198, + "grad_norm": 0.9927303791046143, + "learning_rate": 2.639895669469358e-06, + "loss": 0.2513, + "step": 25578 + }, + { + "epoch": 1.4039517014270033, + "grad_norm": 1.1247769594192505, + "learning_rate": 2.6375543139619873e-06, + "loss": 0.2497, + "step": 25580 + }, + { + "epoch": 1.4040614709110868, + "grad_norm": 1.2992596626281738, + "learning_rate": 2.635213939381248e-06, + "loss": 0.1615, + "step": 25582 + }, + { + "epoch": 1.4041712403951703, + "grad_norm": 0.6197126507759094, + "learning_rate": 2.6328745458297943e-06, + "loss": 0.0769, + "step": 25584 + }, + { + "epoch": 1.4042810098792535, + "grad_norm": 0.9026857018470764, + "learning_rate": 2.6305361334102496e-06, + "loss": 0.1328, + "step": 25586 + }, + { + "epoch": 1.404390779363337, + "grad_norm": 0.8771283030509949, + "learning_rate": 2.628198702225185e-06, + "loss": 0.1929, + "step": 25588 + }, + { + "epoch": 1.4045005488474205, + "grad_norm": 1.5811684131622314, + "learning_rate": 2.6258622523771287e-06, + "loss": 0.1472, + "step": 25590 + }, + { + "epoch": 1.4046103183315037, + "grad_norm": 4.047150135040283, + "learning_rate": 2.6235267839685773e-06, + "loss": 0.2154, + "step": 25592 + }, + { + "epoch": 1.4047200878155872, + "grad_norm": 1.3214548826217651, + "learning_rate": 2.621192297101971e-06, + "loss": 0.292, + "step": 25594 + }, + { + "epoch": 1.4048298572996707, + "grad_norm": 1.0610817670822144, + "learning_rate": 2.6188587918797108e-06, + "loss": 0.146, + "step": 25596 + }, + { + "epoch": 1.4049396267837542, + "grad_norm": 0.8432288765907288, + "learning_rate": 2.6165262684041595e-06, + "loss": 0.1306, + "step": 25598 + }, + { + "epoch": 1.4050493962678376, + "grad_norm": 1.0697656869888306, + "learning_rate": 2.6141947267776246e-06, + "loss": 0.1358, + "step": 25600 + }, + { + "epoch": 1.405159165751921, + "grad_norm": 1.0958396196365356, + "learning_rate": 2.6118641671023903e-06, + "loss": 0.1765, + "step": 25602 + }, + { + "epoch": 1.4052689352360044, + "grad_norm": 1.9820125102996826, + "learning_rate": 2.6095345894806804e-06, + "loss": 0.1674, + "step": 25604 + }, + { + "epoch": 1.4053787047200879, + "grad_norm": 1.1171467304229736, + "learning_rate": 2.6072059940146775e-06, + "loss": 0.1439, + "step": 25606 + }, + { + "epoch": 1.4054884742041711, + "grad_norm": 1.1018532514572144, + "learning_rate": 2.6048783808065356e-06, + "loss": 0.1799, + "step": 25608 + }, + { + "epoch": 1.4055982436882546, + "grad_norm": 1.1548925638198853, + "learning_rate": 2.6025517499583484e-06, + "loss": 0.1417, + "step": 25610 + }, + { + "epoch": 1.405708013172338, + "grad_norm": 1.4297549724578857, + "learning_rate": 2.6002261015721757e-06, + "loss": 0.2039, + "step": 25612 + }, + { + "epoch": 1.4058177826564215, + "grad_norm": 1.214587688446045, + "learning_rate": 2.5979014357500248e-06, + "loss": 0.2435, + "step": 25614 + }, + { + "epoch": 1.405927552140505, + "grad_norm": 1.3948701620101929, + "learning_rate": 2.595577752593878e-06, + "loss": 0.1875, + "step": 25616 + }, + { + "epoch": 1.4060373216245883, + "grad_norm": 1.1333588361740112, + "learning_rate": 2.5932550522056594e-06, + "loss": 0.1339, + "step": 25618 + }, + { + "epoch": 1.4061470911086718, + "grad_norm": 1.0027536153793335, + "learning_rate": 2.5909333346872538e-06, + "loss": 0.1677, + "step": 25620 + }, + { + "epoch": 1.4062568605927552, + "grad_norm": 9.651357650756836, + "learning_rate": 2.5886126001405e-06, + "loss": 0.2589, + "step": 25622 + }, + { + "epoch": 1.4063666300768387, + "grad_norm": 1.350694179534912, + "learning_rate": 2.586292848667199e-06, + "loss": 0.2323, + "step": 25624 + }, + { + "epoch": 1.4064763995609222, + "grad_norm": 1.3547216653823853, + "learning_rate": 2.5839740803691032e-06, + "loss": 0.1724, + "step": 25626 + }, + { + "epoch": 1.4065861690450054, + "grad_norm": 1.2100008726119995, + "learning_rate": 2.5816562953479338e-06, + "loss": 0.203, + "step": 25628 + }, + { + "epoch": 1.406695938529089, + "grad_norm": 2.4218387603759766, + "learning_rate": 2.5793394937053544e-06, + "loss": 0.3048, + "step": 25630 + }, + { + "epoch": 1.4068057080131724, + "grad_norm": 1.5493968725204468, + "learning_rate": 2.577023675542992e-06, + "loss": 0.2543, + "step": 25632 + }, + { + "epoch": 1.4069154774972557, + "grad_norm": 1.1595954895019531, + "learning_rate": 2.574708840962428e-06, + "loss": 0.1424, + "step": 25634 + }, + { + "epoch": 1.4070252469813391, + "grad_norm": 0.9314596056938171, + "learning_rate": 2.5723949900652024e-06, + "loss": 0.1283, + "step": 25636 + }, + { + "epoch": 1.4071350164654226, + "grad_norm": 1.233547329902649, + "learning_rate": 2.570082122952816e-06, + "loss": 0.206, + "step": 25638 + }, + { + "epoch": 1.407244785949506, + "grad_norm": 1.2707229852676392, + "learning_rate": 2.567770239726716e-06, + "loss": 0.1879, + "step": 25640 + }, + { + "epoch": 1.4073545554335896, + "grad_norm": 0.7406114935874939, + "learning_rate": 2.5654593404883214e-06, + "loss": 0.1345, + "step": 25642 + }, + { + "epoch": 1.4074643249176728, + "grad_norm": 1.4668694734573364, + "learning_rate": 2.5631494253389954e-06, + "loss": 0.2982, + "step": 25644 + }, + { + "epoch": 1.4075740944017563, + "grad_norm": 1.3583672046661377, + "learning_rate": 2.5608404943800622e-06, + "loss": 0.1729, + "step": 25646 + }, + { + "epoch": 1.4076838638858398, + "grad_norm": 1.5771548748016357, + "learning_rate": 2.558532547712805e-06, + "loss": 0.2727, + "step": 25648 + }, + { + "epoch": 1.407793633369923, + "grad_norm": 0.881144106388092, + "learning_rate": 2.5562255854384515e-06, + "loss": 0.1343, + "step": 25650 + }, + { + "epoch": 1.4079034028540067, + "grad_norm": 0.845094621181488, + "learning_rate": 2.553919607658212e-06, + "loss": 0.1188, + "step": 25652 + }, + { + "epoch": 1.40801317233809, + "grad_norm": 1.0578701496124268, + "learning_rate": 2.5516146144732273e-06, + "loss": 0.1583, + "step": 25654 + }, + { + "epoch": 1.4081229418221735, + "grad_norm": 1.0840396881103516, + "learning_rate": 2.5493106059846116e-06, + "loss": 0.1264, + "step": 25656 + }, + { + "epoch": 1.408232711306257, + "grad_norm": 1.0334722995758057, + "learning_rate": 2.547007582293426e-06, + "loss": 0.1771, + "step": 25658 + }, + { + "epoch": 1.4083424807903402, + "grad_norm": 0.9395206570625305, + "learning_rate": 2.5447055435006946e-06, + "loss": 0.1351, + "step": 25660 + }, + { + "epoch": 1.4084522502744237, + "grad_norm": 0.9063157439231873, + "learning_rate": 2.5424044897073895e-06, + "loss": 0.1735, + "step": 25662 + }, + { + "epoch": 1.4085620197585071, + "grad_norm": 1.5637446641921997, + "learning_rate": 2.5401044210144553e-06, + "loss": 0.2959, + "step": 25664 + }, + { + "epoch": 1.4086717892425906, + "grad_norm": 1.387553334236145, + "learning_rate": 2.5378053375227835e-06, + "loss": 0.2939, + "step": 25666 + }, + { + "epoch": 1.408781558726674, + "grad_norm": 1.3365942239761353, + "learning_rate": 2.5355072393332124e-06, + "loss": 0.1592, + "step": 25668 + }, + { + "epoch": 1.4088913282107574, + "grad_norm": 0.9267249703407288, + "learning_rate": 2.5332101265465648e-06, + "loss": 0.1274, + "step": 25670 + }, + { + "epoch": 1.4090010976948408, + "grad_norm": 1.143927812576294, + "learning_rate": 2.53091399926359e-06, + "loss": 0.1604, + "step": 25672 + }, + { + "epoch": 1.4091108671789243, + "grad_norm": 1.1154345273971558, + "learning_rate": 2.5286188575850164e-06, + "loss": 0.1588, + "step": 25674 + }, + { + "epoch": 1.4092206366630076, + "grad_norm": 1.6667636632919312, + "learning_rate": 2.526324701611507e-06, + "loss": 0.2066, + "step": 25676 + }, + { + "epoch": 1.409330406147091, + "grad_norm": 0.9546225666999817, + "learning_rate": 2.52403153144371e-06, + "loss": 0.1892, + "step": 25678 + }, + { + "epoch": 1.4094401756311745, + "grad_norm": 0.9384045600891113, + "learning_rate": 2.5217393471822117e-06, + "loss": 0.152, + "step": 25680 + }, + { + "epoch": 1.409549945115258, + "grad_norm": 1.1357172727584839, + "learning_rate": 2.51944814892755e-06, + "loss": 0.1478, + "step": 25682 + }, + { + "epoch": 1.4096597145993415, + "grad_norm": 2.2471485137939453, + "learning_rate": 2.5171579367802373e-06, + "loss": 0.186, + "step": 25684 + }, + { + "epoch": 1.4097694840834247, + "grad_norm": 1.1916868686676025, + "learning_rate": 2.514868710840723e-06, + "loss": 0.2338, + "step": 25686 + }, + { + "epoch": 1.4098792535675082, + "grad_norm": 0.7735419869422913, + "learning_rate": 2.512580471209436e-06, + "loss": 0.1006, + "step": 25688 + }, + { + "epoch": 1.4099890230515917, + "grad_norm": 0.9571002721786499, + "learning_rate": 2.510293217986745e-06, + "loss": 0.1558, + "step": 25690 + }, + { + "epoch": 1.4100987925356752, + "grad_norm": 1.0274168252944946, + "learning_rate": 2.5080069512729787e-06, + "loss": 0.1045, + "step": 25692 + }, + { + "epoch": 1.4102085620197586, + "grad_norm": 1.3805367946624756, + "learning_rate": 2.505721671168426e-06, + "loss": 0.2814, + "step": 25694 + }, + { + "epoch": 1.410318331503842, + "grad_norm": 0.7889387011528015, + "learning_rate": 2.5034373777733267e-06, + "loss": 0.1526, + "step": 25696 + }, + { + "epoch": 1.4104281009879254, + "grad_norm": 1.3642230033874512, + "learning_rate": 2.5011540711878806e-06, + "loss": 0.1983, + "step": 25698 + }, + { + "epoch": 1.4105378704720088, + "grad_norm": 1.4813374280929565, + "learning_rate": 2.4988717515122466e-06, + "loss": 0.2037, + "step": 25700 + }, + { + "epoch": 1.410647639956092, + "grad_norm": 1.0618765354156494, + "learning_rate": 2.496590418846545e-06, + "loss": 0.131, + "step": 25702 + }, + { + "epoch": 1.4107574094401756, + "grad_norm": 1.3467164039611816, + "learning_rate": 2.4943100732908427e-06, + "loss": 0.2007, + "step": 25704 + }, + { + "epoch": 1.410867178924259, + "grad_norm": 1.3675401210784912, + "learning_rate": 2.492030714945162e-06, + "loss": 0.174, + "step": 25706 + }, + { + "epoch": 1.4109769484083425, + "grad_norm": 2.3067550659179688, + "learning_rate": 2.4897523439094935e-06, + "loss": 0.176, + "step": 25708 + }, + { + "epoch": 1.411086717892426, + "grad_norm": 1.0979701280593872, + "learning_rate": 2.4874749602837697e-06, + "loss": 0.1761, + "step": 25710 + }, + { + "epoch": 1.4111964873765093, + "grad_norm": 0.7051856517791748, + "learning_rate": 2.48519856416789e-06, + "loss": 0.1241, + "step": 25712 + }, + { + "epoch": 1.4113062568605927, + "grad_norm": 0.840392529964447, + "learning_rate": 2.482923155661715e-06, + "loss": 0.151, + "step": 25714 + }, + { + "epoch": 1.4114160263446762, + "grad_norm": 1.048718810081482, + "learning_rate": 2.4806487348650485e-06, + "loss": 0.2352, + "step": 25716 + }, + { + "epoch": 1.4115257958287595, + "grad_norm": 1.2160241603851318, + "learning_rate": 2.478375301877664e-06, + "loss": 0.1689, + "step": 25718 + }, + { + "epoch": 1.411635565312843, + "grad_norm": 1.2721765041351318, + "learning_rate": 2.476102856799278e-06, + "loss": 0.2305, + "step": 25720 + }, + { + "epoch": 1.4117453347969264, + "grad_norm": 0.9666181802749634, + "learning_rate": 2.4738313997295757e-06, + "loss": 0.1129, + "step": 25722 + }, + { + "epoch": 1.41185510428101, + "grad_norm": 1.835404396057129, + "learning_rate": 2.471560930768188e-06, + "loss": 0.2104, + "step": 25724 + }, + { + "epoch": 1.4119648737650934, + "grad_norm": 2.051481246948242, + "learning_rate": 2.4692914500147184e-06, + "loss": 0.2607, + "step": 25726 + }, + { + "epoch": 1.4120746432491766, + "grad_norm": 1.184136152267456, + "learning_rate": 2.4670229575687098e-06, + "loss": 0.2248, + "step": 25728 + }, + { + "epoch": 1.4121844127332601, + "grad_norm": 0.8972625732421875, + "learning_rate": 2.464755453529677e-06, + "loss": 0.1648, + "step": 25730 + }, + { + "epoch": 1.4122941822173436, + "grad_norm": 1.2694960832595825, + "learning_rate": 2.462488937997079e-06, + "loss": 0.1682, + "step": 25732 + }, + { + "epoch": 1.412403951701427, + "grad_norm": 0.8753219246864319, + "learning_rate": 2.4602234110703364e-06, + "loss": 0.1117, + "step": 25734 + }, + { + "epoch": 1.4125137211855106, + "grad_norm": 4.66762113571167, + "learning_rate": 2.457958872848823e-06, + "loss": 0.1677, + "step": 25736 + }, + { + "epoch": 1.4126234906695938, + "grad_norm": 0.9889683723449707, + "learning_rate": 2.455695323431878e-06, + "loss": 0.1741, + "step": 25738 + }, + { + "epoch": 1.4127332601536773, + "grad_norm": 1.4028398990631104, + "learning_rate": 2.4534327629187946e-06, + "loss": 0.208, + "step": 25740 + }, + { + "epoch": 1.4128430296377608, + "grad_norm": 1.141016960144043, + "learning_rate": 2.451171191408813e-06, + "loss": 0.173, + "step": 25742 + }, + { + "epoch": 1.412952799121844, + "grad_norm": 1.3565670251846313, + "learning_rate": 2.44891060900114e-06, + "loss": 0.2259, + "step": 25744 + }, + { + "epoch": 1.4130625686059275, + "grad_norm": 1.0358763933181763, + "learning_rate": 2.446651015794932e-06, + "loss": 0.1874, + "step": 25746 + }, + { + "epoch": 1.413172338090011, + "grad_norm": 1.196617841720581, + "learning_rate": 2.444392411889307e-06, + "loss": 0.1451, + "step": 25748 + }, + { + "epoch": 1.4132821075740944, + "grad_norm": 2.316559314727783, + "learning_rate": 2.442134797383344e-06, + "loss": 0.1809, + "step": 25750 + }, + { + "epoch": 1.413391877058178, + "grad_norm": 1.4569990634918213, + "learning_rate": 2.43987817237607e-06, + "loss": 0.244, + "step": 25752 + }, + { + "epoch": 1.4135016465422612, + "grad_norm": 1.1352146863937378, + "learning_rate": 2.437622536966472e-06, + "loss": 0.1585, + "step": 25754 + }, + { + "epoch": 1.4136114160263447, + "grad_norm": 1.0525645017623901, + "learning_rate": 2.43536789125349e-06, + "loss": 0.1696, + "step": 25756 + }, + { + "epoch": 1.4137211855104281, + "grad_norm": 1.253947377204895, + "learning_rate": 2.43311423533602e-06, + "loss": 0.2519, + "step": 25758 + }, + { + "epoch": 1.4138309549945114, + "grad_norm": 0.8948521614074707, + "learning_rate": 2.430861569312931e-06, + "loss": 0.1868, + "step": 25760 + }, + { + "epoch": 1.4139407244785949, + "grad_norm": 0.4118186831474304, + "learning_rate": 2.4286098932830264e-06, + "loss": 0.1353, + "step": 25762 + }, + { + "epoch": 1.4140504939626783, + "grad_norm": 1.1531094312667847, + "learning_rate": 2.426359207345083e-06, + "loss": 0.273, + "step": 25764 + }, + { + "epoch": 1.4141602634467618, + "grad_norm": 1.2985079288482666, + "learning_rate": 2.424109511597822e-06, + "loss": 0.1843, + "step": 25766 + }, + { + "epoch": 1.4142700329308453, + "grad_norm": 0.9675482511520386, + "learning_rate": 2.421860806139925e-06, + "loss": 0.2001, + "step": 25768 + }, + { + "epoch": 1.4143798024149286, + "grad_norm": 0.9869803190231323, + "learning_rate": 2.419613091070036e-06, + "loss": 0.1207, + "step": 25770 + }, + { + "epoch": 1.414489571899012, + "grad_norm": 1.2401366233825684, + "learning_rate": 2.4173663664867395e-06, + "loss": 0.1479, + "step": 25772 + }, + { + "epoch": 1.4145993413830955, + "grad_norm": 1.17876398563385, + "learning_rate": 2.4151206324886044e-06, + "loss": 0.199, + "step": 25774 + }, + { + "epoch": 1.414709110867179, + "grad_norm": 1.1770098209381104, + "learning_rate": 2.412875889174129e-06, + "loss": 0.1805, + "step": 25776 + }, + { + "epoch": 1.4148188803512625, + "grad_norm": 1.2775338888168335, + "learning_rate": 2.410632136641783e-06, + "loss": 0.1869, + "step": 25778 + }, + { + "epoch": 1.4149286498353457, + "grad_norm": 1.5220993757247925, + "learning_rate": 2.408389374989986e-06, + "loss": 0.3118, + "step": 25780 + }, + { + "epoch": 1.4150384193194292, + "grad_norm": 1.4438174962997437, + "learning_rate": 2.406147604317119e-06, + "loss": 0.247, + "step": 25782 + }, + { + "epoch": 1.4151481888035127, + "grad_norm": 1.0787731409072876, + "learning_rate": 2.403906824721508e-06, + "loss": 0.1606, + "step": 25784 + }, + { + "epoch": 1.415257958287596, + "grad_norm": 1.4500129222869873, + "learning_rate": 2.4016670363014583e-06, + "loss": 0.1838, + "step": 25786 + }, + { + "epoch": 1.4153677277716794, + "grad_norm": 1.489082932472229, + "learning_rate": 2.3994282391552047e-06, + "loss": 0.2693, + "step": 25788 + }, + { + "epoch": 1.4154774972557629, + "grad_norm": 1.0072025060653687, + "learning_rate": 2.3971904333809637e-06, + "loss": 0.1761, + "step": 25790 + }, + { + "epoch": 1.4155872667398464, + "grad_norm": 1.2366124391555786, + "learning_rate": 2.3949536190768923e-06, + "loss": 0.159, + "step": 25792 + }, + { + "epoch": 1.4156970362239298, + "grad_norm": 0.9271854758262634, + "learning_rate": 2.3927177963411096e-06, + "loss": 0.3598, + "step": 25794 + }, + { + "epoch": 1.415806805708013, + "grad_norm": 1.3847509622573853, + "learning_rate": 2.3904829652716843e-06, + "loss": 0.2216, + "step": 25796 + }, + { + "epoch": 1.4159165751920966, + "grad_norm": 1.2472063302993774, + "learning_rate": 2.3882491259666464e-06, + "loss": 0.1275, + "step": 25798 + }, + { + "epoch": 1.41602634467618, + "grad_norm": 1.185097575187683, + "learning_rate": 2.386016278523992e-06, + "loss": 0.1853, + "step": 25800 + }, + { + "epoch": 1.4161361141602635, + "grad_norm": 1.010351538658142, + "learning_rate": 2.3837844230416574e-06, + "loss": 0.1504, + "step": 25802 + }, + { + "epoch": 1.416245883644347, + "grad_norm": 1.2413389682769775, + "learning_rate": 2.381553559617547e-06, + "loss": 0.1927, + "step": 25804 + }, + { + "epoch": 1.4163556531284303, + "grad_norm": 0.7385395765304565, + "learning_rate": 2.379323688349516e-06, + "loss": 0.123, + "step": 25806 + }, + { + "epoch": 1.4164654226125137, + "grad_norm": 1.1661362648010254, + "learning_rate": 2.3770948093353757e-06, + "loss": 0.2167, + "step": 25808 + }, + { + "epoch": 1.4165751920965972, + "grad_norm": 1.2755669355392456, + "learning_rate": 2.3748669226728914e-06, + "loss": 0.1591, + "step": 25810 + }, + { + "epoch": 1.4166849615806805, + "grad_norm": 1.134486436843872, + "learning_rate": 2.3726400284597994e-06, + "loss": 0.3, + "step": 25812 + }, + { + "epoch": 1.416794731064764, + "grad_norm": 1.0350393056869507, + "learning_rate": 2.3704141267937796e-06, + "loss": 0.1072, + "step": 25814 + }, + { + "epoch": 1.4169045005488474, + "grad_norm": 1.5150959491729736, + "learning_rate": 2.368189217772465e-06, + "loss": 0.2422, + "step": 25816 + }, + { + "epoch": 1.417014270032931, + "grad_norm": 1.2834060192108154, + "learning_rate": 2.365965301493453e-06, + "loss": 0.2074, + "step": 25818 + }, + { + "epoch": 1.4171240395170144, + "grad_norm": 1.9352822303771973, + "learning_rate": 2.3637423780543013e-06, + "loss": 0.2611, + "step": 25820 + }, + { + "epoch": 1.4172338090010976, + "grad_norm": 0.8719683885574341, + "learning_rate": 2.3615204475525093e-06, + "loss": 0.1852, + "step": 25822 + }, + { + "epoch": 1.417343578485181, + "grad_norm": 1.0001835823059082, + "learning_rate": 2.3592995100855526e-06, + "loss": 0.1866, + "step": 25824 + }, + { + "epoch": 1.4174533479692646, + "grad_norm": 1.1538043022155762, + "learning_rate": 2.3570795657508445e-06, + "loss": 0.1845, + "step": 25826 + }, + { + "epoch": 1.4175631174533478, + "grad_norm": 1.4563089609146118, + "learning_rate": 2.3548606146457654e-06, + "loss": 0.2466, + "step": 25828 + }, + { + "epoch": 1.4176728869374313, + "grad_norm": 1.1343213319778442, + "learning_rate": 2.3526426568676483e-06, + "loss": 0.1786, + "step": 25830 + }, + { + "epoch": 1.4177826564215148, + "grad_norm": 1.5349342823028564, + "learning_rate": 2.350425692513783e-06, + "loss": 0.1459, + "step": 25832 + }, + { + "epoch": 1.4178924259055983, + "grad_norm": 0.7878022789955139, + "learning_rate": 2.348209721681416e-06, + "loss": 0.1777, + "step": 25834 + }, + { + "epoch": 1.4180021953896818, + "grad_norm": 2.4830665588378906, + "learning_rate": 2.3459947444677554e-06, + "loss": 0.2756, + "step": 25836 + }, + { + "epoch": 1.418111964873765, + "grad_norm": 1.3704235553741455, + "learning_rate": 2.3437807609699573e-06, + "loss": 0.1348, + "step": 25838 + }, + { + "epoch": 1.4182217343578485, + "grad_norm": 1.3599541187286377, + "learning_rate": 2.3415677712851386e-06, + "loss": 0.1959, + "step": 25840 + }, + { + "epoch": 1.418331503841932, + "grad_norm": 0.8139934539794922, + "learning_rate": 2.3393557755103714e-06, + "loss": 0.1243, + "step": 25842 + }, + { + "epoch": 1.4184412733260154, + "grad_norm": 0.7026658058166504, + "learning_rate": 2.3371447737426833e-06, + "loss": 0.1389, + "step": 25844 + }, + { + "epoch": 1.418551042810099, + "grad_norm": 0.9770546555519104, + "learning_rate": 2.3349347660790582e-06, + "loss": 0.1079, + "step": 25846 + }, + { + "epoch": 1.4186608122941822, + "grad_norm": 0.9805997610092163, + "learning_rate": 2.332725752616441e-06, + "loss": 0.1641, + "step": 25848 + }, + { + "epoch": 1.4187705817782656, + "grad_norm": 1.2612676620483398, + "learning_rate": 2.3305177334517343e-06, + "loss": 0.2035, + "step": 25850 + }, + { + "epoch": 1.4188803512623491, + "grad_norm": 1.0458204746246338, + "learning_rate": 2.3283107086817884e-06, + "loss": 0.2459, + "step": 25852 + }, + { + "epoch": 1.4189901207464324, + "grad_norm": 1.2175368070602417, + "learning_rate": 2.326104678403415e-06, + "loss": 0.2123, + "step": 25854 + }, + { + "epoch": 1.4190998902305159, + "grad_norm": 0.7663483023643494, + "learning_rate": 2.3238996427133784e-06, + "loss": 0.1245, + "step": 25856 + }, + { + "epoch": 1.4192096597145993, + "grad_norm": 0.9408507347106934, + "learning_rate": 2.321695601708404e-06, + "loss": 0.103, + "step": 25858 + }, + { + "epoch": 1.4193194291986828, + "grad_norm": 1.4333471059799194, + "learning_rate": 2.3194925554851696e-06, + "loss": 0.2632, + "step": 25860 + }, + { + "epoch": 1.4194291986827663, + "grad_norm": 1.105607032775879, + "learning_rate": 2.3172905041403183e-06, + "loss": 0.1489, + "step": 25862 + }, + { + "epoch": 1.4195389681668495, + "grad_norm": 0.9781045913696289, + "learning_rate": 2.3150894477704383e-06, + "loss": 0.1382, + "step": 25864 + }, + { + "epoch": 1.419648737650933, + "grad_norm": 1.0438085794448853, + "learning_rate": 2.312889386472078e-06, + "loss": 0.1896, + "step": 25866 + }, + { + "epoch": 1.4197585071350165, + "grad_norm": 1.3582128286361694, + "learning_rate": 2.3106903203417462e-06, + "loss": 0.2464, + "step": 25868 + }, + { + "epoch": 1.4198682766190998, + "grad_norm": 1.3743165731430054, + "learning_rate": 2.3084922494758963e-06, + "loss": 0.1668, + "step": 25870 + }, + { + "epoch": 1.4199780461031832, + "grad_norm": 0.8402392268180847, + "learning_rate": 2.30629517397096e-06, + "loss": 0.148, + "step": 25872 + }, + { + "epoch": 1.4200878155872667, + "grad_norm": 2.593665599822998, + "learning_rate": 2.3040990939233037e-06, + "loss": 0.1854, + "step": 25874 + }, + { + "epoch": 1.4201975850713502, + "grad_norm": 1.3634395599365234, + "learning_rate": 2.3019040094292564e-06, + "loss": 0.1924, + "step": 25876 + }, + { + "epoch": 1.4203073545554337, + "grad_norm": 1.3013806343078613, + "learning_rate": 2.299709920585108e-06, + "loss": 0.203, + "step": 25878 + }, + { + "epoch": 1.420417124039517, + "grad_norm": 0.9304802417755127, + "learning_rate": 2.2975168274871034e-06, + "loss": 0.1807, + "step": 25880 + }, + { + "epoch": 1.4205268935236004, + "grad_norm": 1.4089161157608032, + "learning_rate": 2.2953247302314407e-06, + "loss": 0.3366, + "step": 25882 + }, + { + "epoch": 1.4206366630076839, + "grad_norm": 0.805533230304718, + "learning_rate": 2.2931336289142735e-06, + "loss": 0.178, + "step": 25884 + }, + { + "epoch": 1.4207464324917674, + "grad_norm": 1.0121461153030396, + "learning_rate": 2.2909435236317222e-06, + "loss": 0.1344, + "step": 25886 + }, + { + "epoch": 1.4208562019758508, + "grad_norm": 1.0204451084136963, + "learning_rate": 2.288754414479849e-06, + "loss": 0.1201, + "step": 25888 + }, + { + "epoch": 1.420965971459934, + "grad_norm": 1.0321581363677979, + "learning_rate": 2.2865663015546824e-06, + "loss": 0.1582, + "step": 25890 + }, + { + "epoch": 1.4210757409440176, + "grad_norm": 1.4047870635986328, + "learning_rate": 2.284379184952201e-06, + "loss": 0.2468, + "step": 25892 + }, + { + "epoch": 1.421185510428101, + "grad_norm": 1.202754259109497, + "learning_rate": 2.2821930647683425e-06, + "loss": 0.1871, + "step": 25894 + }, + { + "epoch": 1.4212952799121843, + "grad_norm": 1.184661865234375, + "learning_rate": 2.2800079410989966e-06, + "loss": 0.1504, + "step": 25896 + }, + { + "epoch": 1.4214050493962678, + "grad_norm": 0.9644410610198975, + "learning_rate": 2.2778238140400255e-06, + "loss": 0.1648, + "step": 25898 + }, + { + "epoch": 1.4215148188803512, + "grad_norm": 1.2728420495986938, + "learning_rate": 2.2756406836872272e-06, + "loss": 0.1704, + "step": 25900 + }, + { + "epoch": 1.4216245883644347, + "grad_norm": 1.0959441661834717, + "learning_rate": 2.2734585501363673e-06, + "loss": 0.1416, + "step": 25902 + }, + { + "epoch": 1.4217343578485182, + "grad_norm": 0.7609245181083679, + "learning_rate": 2.271277413483164e-06, + "loss": 0.1433, + "step": 25904 + }, + { + "epoch": 1.4218441273326015, + "grad_norm": 1.4126769304275513, + "learning_rate": 2.269097273823287e-06, + "loss": 0.1637, + "step": 25906 + }, + { + "epoch": 1.421953896816685, + "grad_norm": 1.1881325244903564, + "learning_rate": 2.2669181312523795e-06, + "loss": 0.1586, + "step": 25908 + }, + { + "epoch": 1.4220636663007684, + "grad_norm": 1.1802077293395996, + "learning_rate": 2.2647399858660156e-06, + "loss": 0.1849, + "step": 25910 + }, + { + "epoch": 1.422173435784852, + "grad_norm": 2.2810909748077393, + "learning_rate": 2.2625628377597546e-06, + "loss": 0.2292, + "step": 25912 + }, + { + "epoch": 1.4222832052689354, + "grad_norm": 1.0426061153411865, + "learning_rate": 2.2603866870290897e-06, + "loss": 0.2732, + "step": 25914 + }, + { + "epoch": 1.4223929747530186, + "grad_norm": 0.8149722814559937, + "learning_rate": 2.258211533769475e-06, + "loss": 0.1571, + "step": 25916 + }, + { + "epoch": 1.422502744237102, + "grad_norm": 1.0341298580169678, + "learning_rate": 2.2560373780763257e-06, + "loss": 0.1763, + "step": 25918 + }, + { + "epoch": 1.4226125137211856, + "grad_norm": 1.6401679515838623, + "learning_rate": 2.253864220045007e-06, + "loss": 0.2012, + "step": 25920 + }, + { + "epoch": 1.4227222832052688, + "grad_norm": 1.077752947807312, + "learning_rate": 2.2516920597708547e-06, + "loss": 0.1967, + "step": 25922 + }, + { + "epoch": 1.4228320526893523, + "grad_norm": 0.9032753705978394, + "learning_rate": 2.2495208973491416e-06, + "loss": 0.1339, + "step": 25924 + }, + { + "epoch": 1.4229418221734358, + "grad_norm": 1.1303924322128296, + "learning_rate": 2.2473507328751086e-06, + "loss": 0.1384, + "step": 25926 + }, + { + "epoch": 1.4230515916575193, + "grad_norm": 1.0514073371887207, + "learning_rate": 2.245181566443952e-06, + "loss": 0.1196, + "step": 25928 + }, + { + "epoch": 1.4231613611416027, + "grad_norm": 0.8722531199455261, + "learning_rate": 2.2430133981508143e-06, + "loss": 0.2272, + "step": 25930 + }, + { + "epoch": 1.423271130625686, + "grad_norm": 1.125431776046753, + "learning_rate": 2.240846228090804e-06, + "loss": 0.2529, + "step": 25932 + }, + { + "epoch": 1.4233809001097695, + "grad_norm": 1.1755071878433228, + "learning_rate": 2.238680056358991e-06, + "loss": 0.1957, + "step": 25934 + }, + { + "epoch": 1.423490669593853, + "grad_norm": 0.9489601850509644, + "learning_rate": 2.2365148830503917e-06, + "loss": 0.1375, + "step": 25936 + }, + { + "epoch": 1.4236004390779362, + "grad_norm": 1.1806501150131226, + "learning_rate": 2.2343507082599774e-06, + "loss": 0.1821, + "step": 25938 + }, + { + "epoch": 1.4237102085620197, + "grad_norm": 2.065587043762207, + "learning_rate": 2.2321875320826773e-06, + "loss": 0.2971, + "step": 25940 + }, + { + "epoch": 1.4238199780461032, + "grad_norm": 0.7846449017524719, + "learning_rate": 2.2300253546133883e-06, + "loss": 0.1068, + "step": 25942 + }, + { + "epoch": 1.4239297475301866, + "grad_norm": 1.201686143875122, + "learning_rate": 2.2278641759469477e-06, + "loss": 0.2037, + "step": 25944 + }, + { + "epoch": 1.4240395170142701, + "grad_norm": 1.1514983177185059, + "learning_rate": 2.225703996178155e-06, + "loss": 0.1735, + "step": 25946 + }, + { + "epoch": 1.4241492864983534, + "grad_norm": 1.15680992603302, + "learning_rate": 2.223544815401768e-06, + "loss": 0.1324, + "step": 25948 + }, + { + "epoch": 1.4242590559824369, + "grad_norm": 1.2098270654678345, + "learning_rate": 2.2213866337125022e-06, + "loss": 0.3192, + "step": 25950 + }, + { + "epoch": 1.4243688254665203, + "grad_norm": 0.9041783809661865, + "learning_rate": 2.219229451205021e-06, + "loss": 0.1696, + "step": 25952 + }, + { + "epoch": 1.4244785949506038, + "grad_norm": 1.186061143875122, + "learning_rate": 2.2170732679739515e-06, + "loss": 0.1608, + "step": 25954 + }, + { + "epoch": 1.4245883644346873, + "grad_norm": 1.210775375366211, + "learning_rate": 2.2149180841138676e-06, + "loss": 0.1869, + "step": 25956 + }, + { + "epoch": 1.4246981339187705, + "grad_norm": 1.271100401878357, + "learning_rate": 2.2127638997193197e-06, + "loss": 0.1826, + "step": 25958 + }, + { + "epoch": 1.424807903402854, + "grad_norm": 1.037030577659607, + "learning_rate": 2.210610714884789e-06, + "loss": 0.2096, + "step": 25960 + }, + { + "epoch": 1.4249176728869375, + "grad_norm": 0.8226131200790405, + "learning_rate": 2.208458529704732e-06, + "loss": 0.1414, + "step": 25962 + }, + { + "epoch": 1.4250274423710207, + "grad_norm": 1.1781182289123535, + "learning_rate": 2.2063073442735534e-06, + "loss": 0.1416, + "step": 25964 + }, + { + "epoch": 1.4251372118551042, + "grad_norm": 1.3804582357406616, + "learning_rate": 2.2041571586856103e-06, + "loss": 0.2521, + "step": 25966 + }, + { + "epoch": 1.4252469813391877, + "grad_norm": 0.8621755838394165, + "learning_rate": 2.2020079730352195e-06, + "loss": 0.183, + "step": 25968 + }, + { + "epoch": 1.4253567508232712, + "grad_norm": 1.573302984237671, + "learning_rate": 2.1998597874166553e-06, + "loss": 0.2547, + "step": 25970 + }, + { + "epoch": 1.4254665203073547, + "grad_norm": 1.8919614553451538, + "learning_rate": 2.1977126019241566e-06, + "loss": 0.1714, + "step": 25972 + }, + { + "epoch": 1.425576289791438, + "grad_norm": 1.151404619216919, + "learning_rate": 2.1955664166519036e-06, + "loss": 0.147, + "step": 25974 + }, + { + "epoch": 1.4256860592755214, + "grad_norm": 1.3145928382873535, + "learning_rate": 2.19342123169404e-06, + "loss": 0.2026, + "step": 25976 + }, + { + "epoch": 1.4257958287596049, + "grad_norm": 1.3821475505828857, + "learning_rate": 2.1912770471446605e-06, + "loss": 0.1225, + "step": 25978 + }, + { + "epoch": 1.4259055982436881, + "grad_norm": 1.1339459419250488, + "learning_rate": 2.1891338630978226e-06, + "loss": 0.1754, + "step": 25980 + }, + { + "epoch": 1.4260153677277716, + "grad_norm": 1.6313952207565308, + "learning_rate": 2.186991679647529e-06, + "loss": 0.2858, + "step": 25982 + }, + { + "epoch": 1.426125137211855, + "grad_norm": 1.2048596143722534, + "learning_rate": 2.184850496887761e-06, + "loss": 0.2142, + "step": 25984 + }, + { + "epoch": 1.4262349066959386, + "grad_norm": 1.142225980758667, + "learning_rate": 2.1827103149124313e-06, + "loss": 0.201, + "step": 25986 + }, + { + "epoch": 1.426344676180022, + "grad_norm": 0.6627106070518494, + "learning_rate": 2.1805711338154205e-06, + "loss": 0.1135, + "step": 25988 + }, + { + "epoch": 1.4264544456641053, + "grad_norm": 0.7189264297485352, + "learning_rate": 2.178432953690565e-06, + "loss": 0.1659, + "step": 25990 + }, + { + "epoch": 1.4265642151481888, + "grad_norm": 1.3519121408462524, + "learning_rate": 2.176295774631651e-06, + "loss": 0.1466, + "step": 25992 + }, + { + "epoch": 1.4266739846322722, + "grad_norm": 1.4787371158599854, + "learning_rate": 2.1741595967324335e-06, + "loss": 0.2417, + "step": 25994 + }, + { + "epoch": 1.4267837541163557, + "grad_norm": 0.9230902194976807, + "learning_rate": 2.172024420086613e-06, + "loss": 0.1513, + "step": 25996 + }, + { + "epoch": 1.4268935236004392, + "grad_norm": 0.9721460938453674, + "learning_rate": 2.1698902447878477e-06, + "loss": 0.1993, + "step": 25998 + }, + { + "epoch": 1.4270032930845225, + "grad_norm": 1.0949411392211914, + "learning_rate": 2.1677570709297463e-06, + "loss": 0.1726, + "step": 26000 + }, + { + "epoch": 1.427113062568606, + "grad_norm": 0.9191784858703613, + "learning_rate": 2.1656248986058945e-06, + "loss": 0.2656, + "step": 26002 + }, + { + "epoch": 1.4272228320526894, + "grad_norm": 1.3499864339828491, + "learning_rate": 2.16349372790981e-06, + "loss": 0.187, + "step": 26004 + }, + { + "epoch": 1.4273326015367727, + "grad_norm": 1.2286324501037598, + "learning_rate": 2.1613635589349756e-06, + "loss": 0.1314, + "step": 26006 + }, + { + "epoch": 1.4274423710208561, + "grad_norm": 0.9074335694313049, + "learning_rate": 2.1592343917748394e-06, + "loss": 0.2146, + "step": 26008 + }, + { + "epoch": 1.4275521405049396, + "grad_norm": 0.9694872498512268, + "learning_rate": 2.15710622652279e-06, + "loss": 0.1542, + "step": 26010 + }, + { + "epoch": 1.427661909989023, + "grad_norm": 0.9723356366157532, + "learning_rate": 2.1549790632721806e-06, + "loss": 0.1386, + "step": 26012 + }, + { + "epoch": 1.4277716794731066, + "grad_norm": 1.0178453922271729, + "learning_rate": 2.1528529021163203e-06, + "loss": 0.17, + "step": 26014 + }, + { + "epoch": 1.4278814489571898, + "grad_norm": 1.6260790824890137, + "learning_rate": 2.150727743148473e-06, + "loss": 0.1955, + "step": 26016 + }, + { + "epoch": 1.4279912184412733, + "grad_norm": 1.0599018335342407, + "learning_rate": 2.148603586461853e-06, + "loss": 0.1588, + "step": 26018 + }, + { + "epoch": 1.4281009879253568, + "grad_norm": 1.4901167154312134, + "learning_rate": 2.1464804321496447e-06, + "loss": 0.1782, + "step": 26020 + }, + { + "epoch": 1.4282107574094403, + "grad_norm": 2.1984617710113525, + "learning_rate": 2.1443582803049755e-06, + "loss": 0.2082, + "step": 26022 + }, + { + "epoch": 1.4283205268935237, + "grad_norm": 0.963657557964325, + "learning_rate": 2.1422371310209354e-06, + "loss": 0.1041, + "step": 26024 + }, + { + "epoch": 1.428430296377607, + "grad_norm": 1.0620731115341187, + "learning_rate": 2.1401169843905693e-06, + "loss": 0.2405, + "step": 26026 + }, + { + "epoch": 1.4285400658616905, + "grad_norm": 1.0970995426177979, + "learning_rate": 2.1379978405068725e-06, + "loss": 0.1399, + "step": 26028 + }, + { + "epoch": 1.428649835345774, + "grad_norm": 1.1171379089355469, + "learning_rate": 2.1358796994628005e-06, + "loss": 0.1845, + "step": 26030 + }, + { + "epoch": 1.4287596048298572, + "grad_norm": 1.0374550819396973, + "learning_rate": 2.1337625613512657e-06, + "loss": 0.184, + "step": 26032 + }, + { + "epoch": 1.4288693743139407, + "grad_norm": 1.0260268449783325, + "learning_rate": 2.1316464262651464e-06, + "loss": 0.1608, + "step": 26034 + }, + { + "epoch": 1.4289791437980242, + "grad_norm": 0.9644562602043152, + "learning_rate": 2.12953129429726e-06, + "loss": 0.1878, + "step": 26036 + }, + { + "epoch": 1.4290889132821076, + "grad_norm": 1.0441715717315674, + "learning_rate": 2.127417165540385e-06, + "loss": 0.2062, + "step": 26038 + }, + { + "epoch": 1.429198682766191, + "grad_norm": 1.085955262184143, + "learning_rate": 2.125304040087256e-06, + "loss": 0.1502, + "step": 26040 + }, + { + "epoch": 1.4293084522502744, + "grad_norm": 1.2268882989883423, + "learning_rate": 2.1231919180305647e-06, + "loss": 0.1348, + "step": 26042 + }, + { + "epoch": 1.4294182217343578, + "grad_norm": 1.0687134265899658, + "learning_rate": 2.1210807994629685e-06, + "loss": 0.1337, + "step": 26044 + }, + { + "epoch": 1.4295279912184413, + "grad_norm": 1.237374186515808, + "learning_rate": 2.118970684477062e-06, + "loss": 0.1499, + "step": 26046 + }, + { + "epoch": 1.4296377607025246, + "grad_norm": 0.9219495058059692, + "learning_rate": 2.116861573165407e-06, + "loss": 0.1714, + "step": 26048 + }, + { + "epoch": 1.429747530186608, + "grad_norm": 1.6081827878952026, + "learning_rate": 2.1147534656205225e-06, + "loss": 0.2497, + "step": 26050 + }, + { + "epoch": 1.4298572996706915, + "grad_norm": 1.077365756034851, + "learning_rate": 2.1126463619348807e-06, + "loss": 0.1618, + "step": 26052 + }, + { + "epoch": 1.429967069154775, + "grad_norm": 1.00556218624115, + "learning_rate": 2.1105402622009e-06, + "loss": 0.1847, + "step": 26054 + }, + { + "epoch": 1.4300768386388585, + "grad_norm": 1.2713344097137451, + "learning_rate": 2.1084351665109746e-06, + "loss": 0.1781, + "step": 26056 + }, + { + "epoch": 1.4301866081229417, + "grad_norm": 1.3372353315353394, + "learning_rate": 2.106331074957443e-06, + "loss": 0.2461, + "step": 26058 + }, + { + "epoch": 1.4302963776070252, + "grad_norm": 1.7180782556533813, + "learning_rate": 2.1042279876325952e-06, + "loss": 0.175, + "step": 26060 + }, + { + "epoch": 1.4304061470911087, + "grad_norm": 1.123822808265686, + "learning_rate": 2.102125904628691e-06, + "loss": 0.2486, + "step": 26062 + }, + { + "epoch": 1.4305159165751922, + "grad_norm": 1.7753612995147705, + "learning_rate": 2.100024826037933e-06, + "loss": 0.3472, + "step": 26064 + }, + { + "epoch": 1.4306256860592756, + "grad_norm": 1.3173108100891113, + "learning_rate": 2.0979247519524877e-06, + "loss": 0.119, + "step": 26066 + }, + { + "epoch": 1.430735455543359, + "grad_norm": 1.0638139247894287, + "learning_rate": 2.095825682464467e-06, + "loss": 0.21, + "step": 26068 + }, + { + "epoch": 1.4308452250274424, + "grad_norm": 1.3625450134277344, + "learning_rate": 2.093727617665955e-06, + "loss": 0.1289, + "step": 26070 + }, + { + "epoch": 1.4309549945115259, + "grad_norm": 1.427020788192749, + "learning_rate": 2.0916305576489825e-06, + "loss": 0.1449, + "step": 26072 + }, + { + "epoch": 1.4310647639956091, + "grad_norm": 1.3675041198730469, + "learning_rate": 2.089534502505533e-06, + "loss": 0.2168, + "step": 26074 + }, + { + "epoch": 1.4311745334796926, + "grad_norm": 1.1272896528244019, + "learning_rate": 2.0874394523275524e-06, + "loss": 0.157, + "step": 26076 + }, + { + "epoch": 1.431284302963776, + "grad_norm": 1.0702511072158813, + "learning_rate": 2.08534540720694e-06, + "loss": 0.174, + "step": 26078 + }, + { + "epoch": 1.4313940724478595, + "grad_norm": 1.0745900869369507, + "learning_rate": 2.0832523672355443e-06, + "loss": 0.1582, + "step": 26080 + }, + { + "epoch": 1.431503841931943, + "grad_norm": 1.4225718975067139, + "learning_rate": 2.081160332505186e-06, + "loss": 0.2742, + "step": 26082 + }, + { + "epoch": 1.4316136114160263, + "grad_norm": 1.037737488746643, + "learning_rate": 2.079069303107628e-06, + "loss": 0.1818, + "step": 26084 + }, + { + "epoch": 1.4317233809001098, + "grad_norm": 2.0075645446777344, + "learning_rate": 2.0769792791345945e-06, + "loss": 0.258, + "step": 26086 + }, + { + "epoch": 1.4318331503841932, + "grad_norm": 1.5185439586639404, + "learning_rate": 2.0748902606777615e-06, + "loss": 0.1954, + "step": 26088 + }, + { + "epoch": 1.4319429198682765, + "grad_norm": 1.2707138061523438, + "learning_rate": 2.072802247828759e-06, + "loss": 0.1854, + "step": 26090 + }, + { + "epoch": 1.43205268935236, + "grad_norm": 0.8160680532455444, + "learning_rate": 2.070715240679183e-06, + "loss": 0.128, + "step": 26092 + }, + { + "epoch": 1.4321624588364434, + "grad_norm": 1.1955398321151733, + "learning_rate": 2.068629239320588e-06, + "loss": 0.153, + "step": 26094 + }, + { + "epoch": 1.432272228320527, + "grad_norm": 1.072709321975708, + "learning_rate": 2.066544243844465e-06, + "loss": 0.1867, + "step": 26096 + }, + { + "epoch": 1.4323819978046104, + "grad_norm": 1.541612148284912, + "learning_rate": 2.0644602543422766e-06, + "loss": 0.1267, + "step": 26098 + }, + { + "epoch": 1.4324917672886937, + "grad_norm": 1.0554053783416748, + "learning_rate": 2.0623772709054357e-06, + "loss": 0.1255, + "step": 26100 + }, + { + "epoch": 1.4326015367727771, + "grad_norm": 1.8284366130828857, + "learning_rate": 2.0602952936253112e-06, + "loss": 0.1952, + "step": 26102 + }, + { + "epoch": 1.4327113062568606, + "grad_norm": 1.4723619222640991, + "learning_rate": 2.0582143225932273e-06, + "loss": 0.1382, + "step": 26104 + }, + { + "epoch": 1.432821075740944, + "grad_norm": 1.0602238178253174, + "learning_rate": 2.0561343579004715e-06, + "loss": 0.1319, + "step": 26106 + }, + { + "epoch": 1.4329308452250276, + "grad_norm": 0.9114440679550171, + "learning_rate": 2.054055399638277e-06, + "loss": 0.1873, + "step": 26108 + }, + { + "epoch": 1.4330406147091108, + "grad_norm": 1.1592540740966797, + "learning_rate": 2.0519774478978406e-06, + "loss": 0.2728, + "step": 26110 + }, + { + "epoch": 1.4331503841931943, + "grad_norm": 1.1771351099014282, + "learning_rate": 2.049900502770308e-06, + "loss": 0.1564, + "step": 26112 + }, + { + "epoch": 1.4332601536772778, + "grad_norm": 1.3879199028015137, + "learning_rate": 2.047824564346784e-06, + "loss": 0.209, + "step": 26114 + }, + { + "epoch": 1.433369923161361, + "grad_norm": 1.4381271600723267, + "learning_rate": 2.0457496327183244e-06, + "loss": 0.2583, + "step": 26116 + }, + { + "epoch": 1.4334796926454445, + "grad_norm": 1.065099835395813, + "learning_rate": 2.043675707975959e-06, + "loss": 0.2339, + "step": 26118 + }, + { + "epoch": 1.433589462129528, + "grad_norm": 0.7132042646408081, + "learning_rate": 2.0416027902106476e-06, + "loss": 0.0768, + "step": 26120 + }, + { + "epoch": 1.4336992316136115, + "grad_norm": 0.907162070274353, + "learning_rate": 2.0395308795133295e-06, + "loss": 0.3202, + "step": 26122 + }, + { + "epoch": 1.433809001097695, + "grad_norm": 1.2154972553253174, + "learning_rate": 2.0374599759748843e-06, + "loss": 0.2121, + "step": 26124 + }, + { + "epoch": 1.4339187705817782, + "grad_norm": 1.168525218963623, + "learning_rate": 2.03539007968615e-06, + "loss": 0.1543, + "step": 26126 + }, + { + "epoch": 1.4340285400658617, + "grad_norm": 0.7885262370109558, + "learning_rate": 2.0333211907379216e-06, + "loss": 0.1466, + "step": 26128 + }, + { + "epoch": 1.4341383095499451, + "grad_norm": 1.3577710390090942, + "learning_rate": 2.031253309220954e-06, + "loss": 0.2092, + "step": 26130 + }, + { + "epoch": 1.4342480790340286, + "grad_norm": 1.5612983703613281, + "learning_rate": 2.029186435225955e-06, + "loss": 0.1879, + "step": 26132 + }, + { + "epoch": 1.434357848518112, + "grad_norm": 1.3170157670974731, + "learning_rate": 2.0271205688435878e-06, + "loss": 0.1268, + "step": 26134 + }, + { + "epoch": 1.4344676180021954, + "grad_norm": 1.3815399408340454, + "learning_rate": 2.0250557101644696e-06, + "loss": 0.1767, + "step": 26136 + }, + { + "epoch": 1.4345773874862788, + "grad_norm": 1.0194406509399414, + "learning_rate": 2.0229918592791746e-06, + "loss": 0.1402, + "step": 26138 + }, + { + "epoch": 1.4346871569703623, + "grad_norm": 1.1641865968704224, + "learning_rate": 2.020929016278228e-06, + "loss": 0.1912, + "step": 26140 + }, + { + "epoch": 1.4347969264544456, + "grad_norm": 1.702541470527649, + "learning_rate": 2.0188671812521292e-06, + "loss": 0.2782, + "step": 26142 + }, + { + "epoch": 1.434906695938529, + "grad_norm": 1.2098541259765625, + "learning_rate": 2.0168063542913144e-06, + "loss": 0.2151, + "step": 26144 + }, + { + "epoch": 1.4350164654226125, + "grad_norm": 1.0544580221176147, + "learning_rate": 2.014746535486181e-06, + "loss": 0.2205, + "step": 26146 + }, + { + "epoch": 1.435126234906696, + "grad_norm": 1.017006516456604, + "learning_rate": 2.0126877249270815e-06, + "loss": 0.1601, + "step": 26148 + }, + { + "epoch": 1.4352360043907795, + "grad_norm": 0.9218233823776245, + "learning_rate": 2.0106299227043297e-06, + "loss": 0.1684, + "step": 26150 + }, + { + "epoch": 1.4353457738748627, + "grad_norm": 0.8674120306968689, + "learning_rate": 2.008573128908181e-06, + "loss": 0.1285, + "step": 26152 + }, + { + "epoch": 1.4354555433589462, + "grad_norm": 0.7251468300819397, + "learning_rate": 2.0065173436288636e-06, + "loss": 0.1729, + "step": 26154 + }, + { + "epoch": 1.4355653128430297, + "grad_norm": 1.1414440870285034, + "learning_rate": 2.004462566956558e-06, + "loss": 0.2295, + "step": 26156 + }, + { + "epoch": 1.435675082327113, + "grad_norm": 1.555380940437317, + "learning_rate": 2.002408798981395e-06, + "loss": 0.1616, + "step": 26158 + }, + { + "epoch": 1.4357848518111964, + "grad_norm": 1.1153229475021362, + "learning_rate": 2.000356039793458e-06, + "loss": 0.1097, + "step": 26160 + }, + { + "epoch": 1.43589462129528, + "grad_norm": 1.2493692636489868, + "learning_rate": 1.998304289482797e-06, + "loss": 0.1758, + "step": 26162 + }, + { + "epoch": 1.4360043907793634, + "grad_norm": 0.9050414562225342, + "learning_rate": 1.9962535481394067e-06, + "loss": 0.2537, + "step": 26164 + }, + { + "epoch": 1.4361141602634468, + "grad_norm": 1.5525981187820435, + "learning_rate": 1.9942038158532407e-06, + "loss": 0.2578, + "step": 26166 + }, + { + "epoch": 1.43622392974753, + "grad_norm": 1.0079044103622437, + "learning_rate": 1.9921550927142206e-06, + "loss": 0.1773, + "step": 26168 + }, + { + "epoch": 1.4363336992316136, + "grad_norm": 1.203122615814209, + "learning_rate": 1.990107378812206e-06, + "loss": 0.2515, + "step": 26170 + }, + { + "epoch": 1.436443468715697, + "grad_norm": 0.6620283722877502, + "learning_rate": 1.9880606742370218e-06, + "loss": 0.1357, + "step": 26172 + }, + { + "epoch": 1.4365532381997805, + "grad_norm": 1.459726095199585, + "learning_rate": 1.9860149790784432e-06, + "loss": 0.2708, + "step": 26174 + }, + { + "epoch": 1.436663007683864, + "grad_norm": 2.0821025371551514, + "learning_rate": 1.9839702934262046e-06, + "loss": 0.199, + "step": 26176 + }, + { + "epoch": 1.4367727771679473, + "grad_norm": 1.0362776517868042, + "learning_rate": 1.981926617370003e-06, + "loss": 0.1799, + "step": 26178 + }, + { + "epoch": 1.4368825466520307, + "grad_norm": 1.0162185430526733, + "learning_rate": 1.979883950999478e-06, + "loss": 0.1693, + "step": 26180 + }, + { + "epoch": 1.4369923161361142, + "grad_norm": 0.8213239908218384, + "learning_rate": 1.977842294404228e-06, + "loss": 0.1556, + "step": 26182 + }, + { + "epoch": 1.4371020856201975, + "grad_norm": 1.2015243768692017, + "learning_rate": 1.9758016476738193e-06, + "loss": 0.1794, + "step": 26184 + }, + { + "epoch": 1.437211855104281, + "grad_norm": 1.0752750635147095, + "learning_rate": 1.973762010897759e-06, + "loss": 0.1448, + "step": 26186 + }, + { + "epoch": 1.4373216245883644, + "grad_norm": 1.109422206878662, + "learning_rate": 1.9717233841655164e-06, + "loss": 0.1723, + "step": 26188 + }, + { + "epoch": 1.437431394072448, + "grad_norm": 0.9843521118164062, + "learning_rate": 1.969685767566512e-06, + "loss": 0.1772, + "step": 26190 + }, + { + "epoch": 1.4375411635565314, + "grad_norm": 0.6500905156135559, + "learning_rate": 1.9676491611901355e-06, + "loss": 0.079, + "step": 26192 + }, + { + "epoch": 1.4376509330406146, + "grad_norm": 1.2993695735931396, + "learning_rate": 1.9656135651257122e-06, + "loss": 0.2376, + "step": 26194 + }, + { + "epoch": 1.4377607025246981, + "grad_norm": 0.9624186158180237, + "learning_rate": 1.9635789794625408e-06, + "loss": 0.173, + "step": 26196 + }, + { + "epoch": 1.4378704720087816, + "grad_norm": 1.3415189981460571, + "learning_rate": 1.9615454042898633e-06, + "loss": 0.288, + "step": 26198 + }, + { + "epoch": 1.4379802414928649, + "grad_norm": 1.4255074262619019, + "learning_rate": 1.9595128396968838e-06, + "loss": 0.2579, + "step": 26200 + }, + { + "epoch": 1.4380900109769483, + "grad_norm": 0.759901225566864, + "learning_rate": 1.9574812857727554e-06, + "loss": 0.2301, + "step": 26202 + }, + { + "epoch": 1.4381997804610318, + "grad_norm": 0.6325985789299011, + "learning_rate": 1.9554507426066014e-06, + "loss": 0.2203, + "step": 26204 + }, + { + "epoch": 1.4383095499451153, + "grad_norm": 1.1330595016479492, + "learning_rate": 1.95342121028749e-06, + "loss": 0.1513, + "step": 26206 + }, + { + "epoch": 1.4384193194291988, + "grad_norm": 1.3833036422729492, + "learning_rate": 1.951392688904441e-06, + "loss": 0.1896, + "step": 26208 + }, + { + "epoch": 1.438529088913282, + "grad_norm": 1.9746001958847046, + "learning_rate": 1.9493651785464388e-06, + "loss": 0.2959, + "step": 26210 + }, + { + "epoch": 1.4386388583973655, + "grad_norm": 1.0812538862228394, + "learning_rate": 1.947338679302413e-06, + "loss": 0.1549, + "step": 26212 + }, + { + "epoch": 1.438748627881449, + "grad_norm": 1.2926385402679443, + "learning_rate": 1.9453131912612694e-06, + "loss": 0.1065, + "step": 26214 + }, + { + "epoch": 1.4388583973655324, + "grad_norm": 1.4309215545654297, + "learning_rate": 1.94328871451184e-06, + "loss": 0.2315, + "step": 26216 + }, + { + "epoch": 1.438968166849616, + "grad_norm": 1.1792852878570557, + "learning_rate": 1.9412652491429427e-06, + "loss": 0.1821, + "step": 26218 + }, + { + "epoch": 1.4390779363336992, + "grad_norm": 1.6567305326461792, + "learning_rate": 1.9392427952433317e-06, + "loss": 0.2626, + "step": 26220 + }, + { + "epoch": 1.4391877058177827, + "grad_norm": 0.9582017064094543, + "learning_rate": 1.9372213529017193e-06, + "loss": 0.2693, + "step": 26222 + }, + { + "epoch": 1.4392974753018661, + "grad_norm": 0.9439077377319336, + "learning_rate": 1.9352009222067784e-06, + "loss": 0.1821, + "step": 26224 + }, + { + "epoch": 1.4394072447859494, + "grad_norm": 1.2130334377288818, + "learning_rate": 1.9331815032471277e-06, + "loss": 0.1161, + "step": 26226 + }, + { + "epoch": 1.4395170142700329, + "grad_norm": 1.787797212600708, + "learning_rate": 1.93116309611136e-06, + "loss": 0.1485, + "step": 26228 + }, + { + "epoch": 1.4396267837541163, + "grad_norm": 1.2243422269821167, + "learning_rate": 1.929145700888008e-06, + "loss": 0.2065, + "step": 26230 + }, + { + "epoch": 1.4397365532381998, + "grad_norm": 1.3566913604736328, + "learning_rate": 1.927129317665563e-06, + "loss": 0.2331, + "step": 26232 + }, + { + "epoch": 1.4398463227222833, + "grad_norm": 0.9714477062225342, + "learning_rate": 1.9251139465324754e-06, + "loss": 0.2239, + "step": 26234 + }, + { + "epoch": 1.4399560922063666, + "grad_norm": 0.8969460725784302, + "learning_rate": 1.9230995875771485e-06, + "loss": 0.1773, + "step": 26236 + }, + { + "epoch": 1.44006586169045, + "grad_norm": 0.8343444466590881, + "learning_rate": 1.921086240887937e-06, + "loss": 0.0727, + "step": 26238 + }, + { + "epoch": 1.4401756311745335, + "grad_norm": 1.0144158601760864, + "learning_rate": 1.919073906553165e-06, + "loss": 0.1733, + "step": 26240 + }, + { + "epoch": 1.440285400658617, + "grad_norm": 1.121248722076416, + "learning_rate": 1.9170625846610944e-06, + "loss": 0.1369, + "step": 26242 + }, + { + "epoch": 1.4403951701427005, + "grad_norm": 2.639207601547241, + "learning_rate": 1.915052275299961e-06, + "loss": 0.1068, + "step": 26244 + }, + { + "epoch": 1.4405049396267837, + "grad_norm": 1.3356473445892334, + "learning_rate": 1.913042978557944e-06, + "loss": 0.2039, + "step": 26246 + }, + { + "epoch": 1.4406147091108672, + "grad_norm": 1.2450517416000366, + "learning_rate": 1.9110346945231782e-06, + "loss": 0.1591, + "step": 26248 + }, + { + "epoch": 1.4407244785949507, + "grad_norm": 0.8165706396102905, + "learning_rate": 1.9090274232837572e-06, + "loss": 0.1464, + "step": 26250 + }, + { + "epoch": 1.440834248079034, + "grad_norm": 1.6223633289337158, + "learning_rate": 1.9070211649277275e-06, + "loss": 0.1805, + "step": 26252 + }, + { + "epoch": 1.4409440175631174, + "grad_norm": 1.0221372842788696, + "learning_rate": 1.9050159195431017e-06, + "loss": 0.1681, + "step": 26254 + }, + { + "epoch": 1.4410537870472009, + "grad_norm": 0.872528612613678, + "learning_rate": 1.9030116872178316e-06, + "loss": 0.1524, + "step": 26256 + }, + { + "epoch": 1.4411635565312844, + "grad_norm": 0.8324972987174988, + "learning_rate": 1.9010084680398388e-06, + "loss": 0.16, + "step": 26258 + }, + { + "epoch": 1.4412733260153678, + "grad_norm": 1.0275459289550781, + "learning_rate": 1.8990062620969889e-06, + "loss": 0.1701, + "step": 26260 + }, + { + "epoch": 1.441383095499451, + "grad_norm": 1.1504892110824585, + "learning_rate": 1.8970050694771064e-06, + "loss": 0.1903, + "step": 26262 + }, + { + "epoch": 1.4414928649835346, + "grad_norm": 1.1567645072937012, + "learning_rate": 1.8950048902679845e-06, + "loss": 0.1311, + "step": 26264 + }, + { + "epoch": 1.441602634467618, + "grad_norm": 0.6357282996177673, + "learning_rate": 1.8930057245573508e-06, + "loss": 0.1413, + "step": 26266 + }, + { + "epoch": 1.4417124039517013, + "grad_norm": 1.4272253513336182, + "learning_rate": 1.891007572432904e-06, + "loss": 0.1849, + "step": 26268 + }, + { + "epoch": 1.4418221734357848, + "grad_norm": 0.6971812844276428, + "learning_rate": 1.889010433982291e-06, + "loss": 0.1338, + "step": 26270 + }, + { + "epoch": 1.4419319429198683, + "grad_norm": 0.8656797409057617, + "learning_rate": 1.8870143092931086e-06, + "loss": 0.2432, + "step": 26272 + }, + { + "epoch": 1.4420417124039517, + "grad_norm": 1.452065348625183, + "learning_rate": 1.8850191984529309e-06, + "loss": 0.2244, + "step": 26274 + }, + { + "epoch": 1.4421514818880352, + "grad_norm": 0.8837994933128357, + "learning_rate": 1.8830251015492601e-06, + "loss": 0.189, + "step": 26276 + }, + { + "epoch": 1.4422612513721185, + "grad_norm": 1.1856615543365479, + "learning_rate": 1.881032018669579e-06, + "loss": 0.1744, + "step": 26278 + }, + { + "epoch": 1.442371020856202, + "grad_norm": 0.8308027982711792, + "learning_rate": 1.8790399499013094e-06, + "loss": 0.1509, + "step": 26280 + }, + { + "epoch": 1.4424807903402854, + "grad_norm": 1.1791191101074219, + "learning_rate": 1.8770488953318288e-06, + "loss": 0.1295, + "step": 26282 + }, + { + "epoch": 1.442590559824369, + "grad_norm": 1.7787764072418213, + "learning_rate": 1.8750588550484782e-06, + "loss": 0.2699, + "step": 26284 + }, + { + "epoch": 1.4427003293084524, + "grad_norm": 1.7481608390808105, + "learning_rate": 1.8730698291385518e-06, + "loss": 0.2747, + "step": 26286 + }, + { + "epoch": 1.4428100987925356, + "grad_norm": 1.1972765922546387, + "learning_rate": 1.8710818176892908e-06, + "loss": 0.1526, + "step": 26288 + }, + { + "epoch": 1.442919868276619, + "grad_norm": 3.001065492630005, + "learning_rate": 1.8690948207879089e-06, + "loss": 0.1707, + "step": 26290 + }, + { + "epoch": 1.4430296377607026, + "grad_norm": 1.118263602256775, + "learning_rate": 1.867108838521564e-06, + "loss": 0.2407, + "step": 26292 + }, + { + "epoch": 1.4431394072447858, + "grad_norm": 1.9297783374786377, + "learning_rate": 1.8651238709773643e-06, + "loss": 0.1412, + "step": 26294 + }, + { + "epoch": 1.4432491767288693, + "grad_norm": 0.929878830909729, + "learning_rate": 1.8631399182423875e-06, + "loss": 0.1778, + "step": 26296 + }, + { + "epoch": 1.4433589462129528, + "grad_norm": 0.9293321967124939, + "learning_rate": 1.86115698040365e-06, + "loss": 0.1939, + "step": 26298 + }, + { + "epoch": 1.4434687156970363, + "grad_norm": 1.7907410860061646, + "learning_rate": 1.8591750575481431e-06, + "loss": 0.287, + "step": 26300 + }, + { + "epoch": 1.4435784851811198, + "grad_norm": 1.342749834060669, + "learning_rate": 1.8571941497627977e-06, + "loss": 0.2085, + "step": 26302 + }, + { + "epoch": 1.443688254665203, + "grad_norm": 0.9567713141441345, + "learning_rate": 1.8552142571345133e-06, + "loss": 0.096, + "step": 26304 + }, + { + "epoch": 1.4437980241492865, + "grad_norm": 1.2286808490753174, + "learning_rate": 1.8532353797501318e-06, + "loss": 0.2138, + "step": 26306 + }, + { + "epoch": 1.44390779363337, + "grad_norm": 1.2536342144012451, + "learning_rate": 1.8512575176964558e-06, + "loss": 0.1484, + "step": 26308 + }, + { + "epoch": 1.4440175631174532, + "grad_norm": 1.3829014301300049, + "learning_rate": 1.8492806710602496e-06, + "loss": 0.1567, + "step": 26310 + }, + { + "epoch": 1.4441273326015367, + "grad_norm": 1.2923473119735718, + "learning_rate": 1.8473048399282155e-06, + "loss": 0.1965, + "step": 26312 + }, + { + "epoch": 1.4442371020856202, + "grad_norm": 1.1568387746810913, + "learning_rate": 1.8453300243870375e-06, + "loss": 0.1929, + "step": 26314 + }, + { + "epoch": 1.4443468715697036, + "grad_norm": 1.149668574333191, + "learning_rate": 1.8433562245233349e-06, + "loss": 0.1532, + "step": 26316 + }, + { + "epoch": 1.4444566410537871, + "grad_norm": 0.8475110530853271, + "learning_rate": 1.8413834404236858e-06, + "loss": 0.1871, + "step": 26318 + }, + { + "epoch": 1.4445664105378704, + "grad_norm": 0.8697116374969482, + "learning_rate": 1.839411672174629e-06, + "loss": 0.1385, + "step": 26320 + }, + { + "epoch": 1.4446761800219539, + "grad_norm": 1.7211955785751343, + "learning_rate": 1.837440919862657e-06, + "loss": 0.273, + "step": 26322 + }, + { + "epoch": 1.4447859495060373, + "grad_norm": 0.9746912717819214, + "learning_rate": 1.8354711835742083e-06, + "loss": 0.1107, + "step": 26324 + }, + { + "epoch": 1.4448957189901208, + "grad_norm": 1.5592868328094482, + "learning_rate": 1.8335024633956976e-06, + "loss": 0.1904, + "step": 26326 + }, + { + "epoch": 1.4450054884742043, + "grad_norm": 0.9151173830032349, + "learning_rate": 1.8315347594134753e-06, + "loss": 0.1824, + "step": 26328 + }, + { + "epoch": 1.4451152579582875, + "grad_norm": 1.3096834421157837, + "learning_rate": 1.8295680717138552e-06, + "loss": 0.1695, + "step": 26330 + }, + { + "epoch": 1.445225027442371, + "grad_norm": 0.9965806007385254, + "learning_rate": 1.8276024003831049e-06, + "loss": 0.1862, + "step": 26332 + }, + { + "epoch": 1.4453347969264545, + "grad_norm": 1.0655782222747803, + "learning_rate": 1.8256377455074525e-06, + "loss": 0.1783, + "step": 26334 + }, + { + "epoch": 1.4454445664105378, + "grad_norm": 5.104814529418945, + "learning_rate": 1.8236741071730761e-06, + "loss": 0.1916, + "step": 26336 + }, + { + "epoch": 1.4455543358946212, + "grad_norm": 1.208648920059204, + "learning_rate": 1.8217114854661043e-06, + "loss": 0.1914, + "step": 26338 + }, + { + "epoch": 1.4456641053787047, + "grad_norm": 1.4047218561172485, + "learning_rate": 1.8197498804726376e-06, + "loss": 0.1718, + "step": 26340 + }, + { + "epoch": 1.4457738748627882, + "grad_norm": 0.8101232051849365, + "learning_rate": 1.8177892922787154e-06, + "loss": 0.1525, + "step": 26342 + }, + { + "epoch": 1.4458836443468717, + "grad_norm": 1.3231416940689087, + "learning_rate": 1.8158297209703411e-06, + "loss": 0.2029, + "step": 26344 + }, + { + "epoch": 1.445993413830955, + "grad_norm": 1.0843749046325684, + "learning_rate": 1.8138711666334685e-06, + "loss": 0.1577, + "step": 26346 + }, + { + "epoch": 1.4461031833150384, + "grad_norm": 0.8602085709571838, + "learning_rate": 1.811913629354009e-06, + "loss": 0.1441, + "step": 26348 + }, + { + "epoch": 1.4462129527991219, + "grad_norm": 1.317800760269165, + "learning_rate": 1.809957109217833e-06, + "loss": 0.2725, + "step": 26350 + }, + { + "epoch": 1.4463227222832051, + "grad_norm": 0.88285893201828, + "learning_rate": 1.8080016063107635e-06, + "loss": 0.2552, + "step": 26352 + }, + { + "epoch": 1.4464324917672888, + "grad_norm": 2.2908949851989746, + "learning_rate": 1.8060471207185764e-06, + "loss": 0.273, + "step": 26354 + }, + { + "epoch": 1.446542261251372, + "grad_norm": 1.2847442626953125, + "learning_rate": 1.804093652527003e-06, + "loss": 0.1528, + "step": 26356 + }, + { + "epoch": 1.4466520307354556, + "grad_norm": 1.781882882118225, + "learning_rate": 1.802141201821736e-06, + "loss": 0.2238, + "step": 26358 + }, + { + "epoch": 1.446761800219539, + "grad_norm": 1.2916877269744873, + "learning_rate": 1.800189768688415e-06, + "loss": 0.1052, + "step": 26360 + }, + { + "epoch": 1.4468715697036223, + "grad_norm": 0.7340002655982971, + "learning_rate": 1.7982393532126384e-06, + "loss": 0.119, + "step": 26362 + }, + { + "epoch": 1.4469813391877058, + "grad_norm": 1.3045052289962769, + "learning_rate": 1.7962899554799712e-06, + "loss": 0.2587, + "step": 26364 + }, + { + "epoch": 1.4470911086717893, + "grad_norm": 1.5758659839630127, + "learning_rate": 1.7943415755759168e-06, + "loss": 0.2722, + "step": 26366 + }, + { + "epoch": 1.4472008781558727, + "grad_norm": 0.8282353281974792, + "learning_rate": 1.7923942135859406e-06, + "loss": 0.182, + "step": 26368 + }, + { + "epoch": 1.4473106476399562, + "grad_norm": 0.8120018839836121, + "learning_rate": 1.7904478695954658e-06, + "loss": 0.1177, + "step": 26370 + }, + { + "epoch": 1.4474204171240395, + "grad_norm": 2.4130523204803467, + "learning_rate": 1.7885025436898657e-06, + "loss": 0.1784, + "step": 26372 + }, + { + "epoch": 1.447530186608123, + "grad_norm": 1.6285886764526367, + "learning_rate": 1.7865582359544664e-06, + "loss": 0.2175, + "step": 26374 + }, + { + "epoch": 1.4476399560922064, + "grad_norm": 1.2647143602371216, + "learning_rate": 1.7846149464745666e-06, + "loss": 0.2219, + "step": 26376 + }, + { + "epoch": 1.4477497255762897, + "grad_norm": 1.44660484790802, + "learning_rate": 1.7826726753354034e-06, + "loss": 0.1891, + "step": 26378 + }, + { + "epoch": 1.4478594950603731, + "grad_norm": 1.1942027807235718, + "learning_rate": 1.7807314226221756e-06, + "loss": 0.1632, + "step": 26380 + }, + { + "epoch": 1.4479692645444566, + "grad_norm": 1.0176414251327515, + "learning_rate": 1.7787911884200314e-06, + "loss": 0.1514, + "step": 26382 + }, + { + "epoch": 1.44807903402854, + "grad_norm": 0.7441948652267456, + "learning_rate": 1.7768519728140837e-06, + "loss": 0.1196, + "step": 26384 + }, + { + "epoch": 1.4481888035126236, + "grad_norm": 1.669590711593628, + "learning_rate": 1.774913775889389e-06, + "loss": 0.1506, + "step": 26386 + }, + { + "epoch": 1.4482985729967068, + "grad_norm": 1.4243440628051758, + "learning_rate": 1.772976597730977e-06, + "loss": 0.1344, + "step": 26388 + }, + { + "epoch": 1.4484083424807903, + "grad_norm": 1.0301966667175293, + "learning_rate": 1.7710404384238155e-06, + "loss": 0.1986, + "step": 26390 + }, + { + "epoch": 1.4485181119648738, + "grad_norm": 0.866507887840271, + "learning_rate": 1.7691052980528367e-06, + "loss": 0.1744, + "step": 26392 + }, + { + "epoch": 1.4486278814489573, + "grad_norm": 1.2266067266464233, + "learning_rate": 1.767171176702917e-06, + "loss": 0.1715, + "step": 26394 + }, + { + "epoch": 1.4487376509330407, + "grad_norm": 1.1809755563735962, + "learning_rate": 1.7652380744589082e-06, + "loss": 0.1311, + "step": 26396 + }, + { + "epoch": 1.448847420417124, + "grad_norm": 1.035554051399231, + "learning_rate": 1.7633059914055976e-06, + "loss": 0.1645, + "step": 26398 + }, + { + "epoch": 1.4489571899012075, + "grad_norm": 1.6240004301071167, + "learning_rate": 1.7613749276277403e-06, + "loss": 0.1103, + "step": 26400 + }, + { + "epoch": 1.449066959385291, + "grad_norm": 0.670484185218811, + "learning_rate": 1.7594448832100402e-06, + "loss": 0.1035, + "step": 26402 + }, + { + "epoch": 1.4491767288693742, + "grad_norm": 1.5084171295166016, + "learning_rate": 1.7575158582371604e-06, + "loss": 0.2444, + "step": 26404 + }, + { + "epoch": 1.4492864983534577, + "grad_norm": 1.0774987936019897, + "learning_rate": 1.7555878527937164e-06, + "loss": 0.1578, + "step": 26406 + }, + { + "epoch": 1.4493962678375412, + "grad_norm": 0.9885256290435791, + "learning_rate": 1.7536608669642796e-06, + "loss": 0.1365, + "step": 26408 + }, + { + "epoch": 1.4495060373216246, + "grad_norm": 1.0494972467422485, + "learning_rate": 1.751734900833371e-06, + "loss": 0.1363, + "step": 26410 + }, + { + "epoch": 1.4496158068057081, + "grad_norm": 0.949584424495697, + "learning_rate": 1.7498099544854846e-06, + "loss": 0.2356, + "step": 26412 + }, + { + "epoch": 1.4497255762897914, + "grad_norm": 0.8611241579055786, + "learning_rate": 1.7478860280050525e-06, + "loss": 0.1113, + "step": 26414 + }, + { + "epoch": 1.4498353457738749, + "grad_norm": 1.2409745454788208, + "learning_rate": 1.7459631214764654e-06, + "loss": 0.2501, + "step": 26416 + }, + { + "epoch": 1.4499451152579583, + "grad_norm": 1.636663556098938, + "learning_rate": 1.744041234984073e-06, + "loss": 0.1914, + "step": 26418 + }, + { + "epoch": 1.4500548847420416, + "grad_norm": 0.9733678698539734, + "learning_rate": 1.7421203686121823e-06, + "loss": 0.1687, + "step": 26420 + }, + { + "epoch": 1.450164654226125, + "grad_norm": 1.4930219650268555, + "learning_rate": 1.740200522445043e-06, + "loss": 0.2415, + "step": 26422 + }, + { + "epoch": 1.4502744237102085, + "grad_norm": 1.316404104232788, + "learning_rate": 1.7382816965668737e-06, + "loss": 0.1823, + "step": 26424 + }, + { + "epoch": 1.450384193194292, + "grad_norm": 0.7899768352508545, + "learning_rate": 1.7363638910618484e-06, + "loss": 0.1368, + "step": 26426 + }, + { + "epoch": 1.4504939626783755, + "grad_norm": 1.0976325273513794, + "learning_rate": 1.7344471060140866e-06, + "loss": 0.1344, + "step": 26428 + }, + { + "epoch": 1.4506037321624587, + "grad_norm": 1.167510986328125, + "learning_rate": 1.7325313415076705e-06, + "loss": 0.3298, + "step": 26430 + }, + { + "epoch": 1.4507135016465422, + "grad_norm": 1.7083032131195068, + "learning_rate": 1.730616597626633e-06, + "loss": 0.1359, + "step": 26432 + }, + { + "epoch": 1.4508232711306257, + "grad_norm": 1.2855041027069092, + "learning_rate": 1.72870287445496e-06, + "loss": 0.1631, + "step": 26434 + }, + { + "epoch": 1.4509330406147092, + "grad_norm": 0.9978513717651367, + "learning_rate": 1.7267901720766061e-06, + "loss": 0.1293, + "step": 26436 + }, + { + "epoch": 1.4510428100987927, + "grad_norm": 1.0856876373291016, + "learning_rate": 1.7248784905754656e-06, + "loss": 0.1754, + "step": 26438 + }, + { + "epoch": 1.451152579582876, + "grad_norm": 1.0855844020843506, + "learning_rate": 1.7229678300353963e-06, + "loss": 0.1721, + "step": 26440 + }, + { + "epoch": 1.4512623490669594, + "grad_norm": 0.9124547243118286, + "learning_rate": 1.721058190540209e-06, + "loss": 0.1835, + "step": 26442 + }, + { + "epoch": 1.4513721185510429, + "grad_norm": 1.3742215633392334, + "learning_rate": 1.71914957217367e-06, + "loss": 0.237, + "step": 26444 + }, + { + "epoch": 1.4514818880351261, + "grad_norm": 0.9184341430664062, + "learning_rate": 1.717241975019493e-06, + "loss": 0.2123, + "step": 26446 + }, + { + "epoch": 1.4515916575192096, + "grad_norm": 1.387346863746643, + "learning_rate": 1.7153353991613662e-06, + "loss": 0.2319, + "step": 26448 + }, + { + "epoch": 1.451701427003293, + "grad_norm": 0.8814169764518738, + "learning_rate": 1.7134298446829178e-06, + "loss": 0.1398, + "step": 26450 + }, + { + "epoch": 1.4518111964873766, + "grad_norm": 1.3035951852798462, + "learning_rate": 1.7115253116677333e-06, + "loss": 0.2213, + "step": 26452 + }, + { + "epoch": 1.45192096597146, + "grad_norm": 1.1244709491729736, + "learning_rate": 1.7096218001993513e-06, + "loss": 0.2286, + "step": 26454 + }, + { + "epoch": 1.4520307354555433, + "grad_norm": 0.809855043888092, + "learning_rate": 1.7077193103612777e-06, + "loss": 0.1212, + "step": 26456 + }, + { + "epoch": 1.4521405049396268, + "grad_norm": 1.0846669673919678, + "learning_rate": 1.7058178422369591e-06, + "loss": 0.2209, + "step": 26458 + }, + { + "epoch": 1.4522502744237102, + "grad_norm": 1.463449239730835, + "learning_rate": 1.7039173959098015e-06, + "loss": 0.1493, + "step": 26460 + }, + { + "epoch": 1.4523600439077935, + "grad_norm": 1.1760647296905518, + "learning_rate": 1.702017971463174e-06, + "loss": 0.131, + "step": 26462 + }, + { + "epoch": 1.4524698133918772, + "grad_norm": 1.1624823808670044, + "learning_rate": 1.7001195689803905e-06, + "loss": 0.2137, + "step": 26464 + }, + { + "epoch": 1.4525795828759605, + "grad_norm": 1.3396048545837402, + "learning_rate": 1.6982221885447264e-06, + "loss": 0.217, + "step": 26466 + }, + { + "epoch": 1.452689352360044, + "grad_norm": 0.8498256802558899, + "learning_rate": 1.6963258302394093e-06, + "loss": 0.138, + "step": 26468 + }, + { + "epoch": 1.4527991218441274, + "grad_norm": 1.2472745180130005, + "learning_rate": 1.6944304941476225e-06, + "loss": 0.205, + "step": 26470 + }, + { + "epoch": 1.4529088913282107, + "grad_norm": 0.9887480735778809, + "learning_rate": 1.6925361803524998e-06, + "loss": 0.1691, + "step": 26472 + }, + { + "epoch": 1.4530186608122941, + "grad_norm": 1.1515259742736816, + "learning_rate": 1.6906428889371468e-06, + "loss": 0.2306, + "step": 26474 + }, + { + "epoch": 1.4531284302963776, + "grad_norm": 1.023733139038086, + "learning_rate": 1.6887506199846027e-06, + "loss": 0.1167, + "step": 26476 + }, + { + "epoch": 1.453238199780461, + "grad_norm": 1.0667529106140137, + "learning_rate": 1.686859373577876e-06, + "loss": 0.1541, + "step": 26478 + }, + { + "epoch": 1.4533479692645446, + "grad_norm": 1.3923649787902832, + "learning_rate": 1.6849691497999226e-06, + "loss": 0.2344, + "step": 26480 + }, + { + "epoch": 1.4534577387486278, + "grad_norm": 1.5926679372787476, + "learning_rate": 1.6830799487336569e-06, + "loss": 0.1679, + "step": 26482 + }, + { + "epoch": 1.4535675082327113, + "grad_norm": 1.1773158311843872, + "learning_rate": 1.6811917704619511e-06, + "loss": 0.1953, + "step": 26484 + }, + { + "epoch": 1.4536772777167948, + "grad_norm": 0.8450037240982056, + "learning_rate": 1.679304615067634e-06, + "loss": 0.1738, + "step": 26486 + }, + { + "epoch": 1.453787047200878, + "grad_norm": 0.7903428077697754, + "learning_rate": 1.6774184826334804e-06, + "loss": 0.1493, + "step": 26488 + }, + { + "epoch": 1.4538968166849615, + "grad_norm": 0.9237900972366333, + "learning_rate": 1.6755333732422274e-06, + "loss": 0.1216, + "step": 26490 + }, + { + "epoch": 1.454006586169045, + "grad_norm": 1.513506531715393, + "learning_rate": 1.673649286976564e-06, + "loss": 0.1908, + "step": 26492 + }, + { + "epoch": 1.4541163556531285, + "grad_norm": 1.0039139986038208, + "learning_rate": 1.6717662239191328e-06, + "loss": 0.1804, + "step": 26494 + }, + { + "epoch": 1.454226125137212, + "grad_norm": 1.3439607620239258, + "learning_rate": 1.6698841841525342e-06, + "loss": 0.2138, + "step": 26496 + }, + { + "epoch": 1.4543358946212952, + "grad_norm": 1.0172927379608154, + "learning_rate": 1.66800316775933e-06, + "loss": 0.2046, + "step": 26498 + }, + { + "epoch": 1.4544456641053787, + "grad_norm": 1.419175148010254, + "learning_rate": 1.666123174822029e-06, + "loss": 0.1789, + "step": 26500 + }, + { + "epoch": 1.4545554335894622, + "grad_norm": 1.4084453582763672, + "learning_rate": 1.6642442054230934e-06, + "loss": 0.1832, + "step": 26502 + }, + { + "epoch": 1.4546652030735456, + "grad_norm": 1.3530234098434448, + "learning_rate": 1.662366259644943e-06, + "loss": 0.264, + "step": 26504 + }, + { + "epoch": 1.454774972557629, + "grad_norm": 1.3998732566833496, + "learning_rate": 1.6604893375699594e-06, + "loss": 0.1607, + "step": 26506 + }, + { + "epoch": 1.4548847420417124, + "grad_norm": 1.36220121383667, + "learning_rate": 1.6586134392804653e-06, + "loss": 0.2571, + "step": 26508 + }, + { + "epoch": 1.4549945115257958, + "grad_norm": 1.082810878753662, + "learning_rate": 1.6567385648587564e-06, + "loss": 0.1409, + "step": 26510 + }, + { + "epoch": 1.4551042810098793, + "grad_norm": 1.3655352592468262, + "learning_rate": 1.6548647143870722e-06, + "loss": 0.1675, + "step": 26512 + }, + { + "epoch": 1.4552140504939626, + "grad_norm": 0.853126585483551, + "learning_rate": 1.6529918879475997e-06, + "loss": 0.1744, + "step": 26514 + }, + { + "epoch": 1.455323819978046, + "grad_norm": 1.3436528444290161, + "learning_rate": 1.651120085622501e-06, + "loss": 0.1455, + "step": 26516 + }, + { + "epoch": 1.4554335894621295, + "grad_norm": 0.7492475509643555, + "learning_rate": 1.6492493074938774e-06, + "loss": 0.1051, + "step": 26518 + }, + { + "epoch": 1.455543358946213, + "grad_norm": 1.4466936588287354, + "learning_rate": 1.6473795536437907e-06, + "loss": 0.1794, + "step": 26520 + }, + { + "epoch": 1.4556531284302965, + "grad_norm": 1.0015491247177124, + "learning_rate": 1.645510824154256e-06, + "loss": 0.2014, + "step": 26522 + }, + { + "epoch": 1.4557628979143797, + "grad_norm": 1.2127909660339355, + "learning_rate": 1.643643119107252e-06, + "loss": 0.2909, + "step": 26524 + }, + { + "epoch": 1.4558726673984632, + "grad_norm": 1.3854966163635254, + "learning_rate": 1.6417764385846996e-06, + "loss": 0.1751, + "step": 26526 + }, + { + "epoch": 1.4559824368825467, + "grad_norm": 1.3568652868270874, + "learning_rate": 1.639910782668483e-06, + "loss": 0.2551, + "step": 26528 + }, + { + "epoch": 1.45609220636663, + "grad_norm": 1.02922785282135, + "learning_rate": 1.6380461514404339e-06, + "loss": 0.1989, + "step": 26530 + }, + { + "epoch": 1.4562019758507134, + "grad_norm": 0.7830355167388916, + "learning_rate": 1.6361825449823453e-06, + "loss": 0.1304, + "step": 26532 + }, + { + "epoch": 1.456311745334797, + "grad_norm": 1.2502368688583374, + "learning_rate": 1.6343199633759715e-06, + "loss": 0.154, + "step": 26534 + }, + { + "epoch": 1.4564215148188804, + "grad_norm": 1.1812602281570435, + "learning_rate": 1.6324584067030108e-06, + "loss": 0.2175, + "step": 26536 + }, + { + "epoch": 1.4565312843029639, + "grad_norm": 1.3908249139785767, + "learning_rate": 1.6305978750451173e-06, + "loss": 0.2147, + "step": 26538 + }, + { + "epoch": 1.4566410537870471, + "grad_norm": 0.866954505443573, + "learning_rate": 1.6287383684839036e-06, + "loss": 0.131, + "step": 26540 + }, + { + "epoch": 1.4567508232711306, + "grad_norm": 3.1799347400665283, + "learning_rate": 1.6268798871009405e-06, + "loss": 0.1844, + "step": 26542 + }, + { + "epoch": 1.456860592755214, + "grad_norm": 1.1964441537857056, + "learning_rate": 1.6250224309777434e-06, + "loss": 0.2045, + "step": 26544 + }, + { + "epoch": 1.4569703622392975, + "grad_norm": 0.9349489808082581, + "learning_rate": 1.6231660001957916e-06, + "loss": 0.1485, + "step": 26546 + }, + { + "epoch": 1.457080131723381, + "grad_norm": 1.5939592123031616, + "learning_rate": 1.6213105948365254e-06, + "loss": 0.2614, + "step": 26548 + }, + { + "epoch": 1.4571899012074643, + "grad_norm": 0.9760693311691284, + "learning_rate": 1.6194562149813242e-06, + "loss": 0.1799, + "step": 26550 + }, + { + "epoch": 1.4572996706915478, + "grad_norm": 1.1485466957092285, + "learning_rate": 1.6176028607115312e-06, + "loss": 0.1703, + "step": 26552 + }, + { + "epoch": 1.4574094401756312, + "grad_norm": 0.9018387198448181, + "learning_rate": 1.6157505321084454e-06, + "loss": 0.1213, + "step": 26554 + }, + { + "epoch": 1.4575192096597145, + "grad_norm": 1.1253011226654053, + "learning_rate": 1.6138992292533183e-06, + "loss": 0.2789, + "step": 26556 + }, + { + "epoch": 1.457628979143798, + "grad_norm": 0.9740609526634216, + "learning_rate": 1.6120489522273545e-06, + "loss": 0.145, + "step": 26558 + }, + { + "epoch": 1.4577387486278814, + "grad_norm": 1.21720552444458, + "learning_rate": 1.6101997011117198e-06, + "loss": 0.1625, + "step": 26560 + }, + { + "epoch": 1.457848518111965, + "grad_norm": 1.178999662399292, + "learning_rate": 1.6083514759875296e-06, + "loss": 0.173, + "step": 26562 + }, + { + "epoch": 1.4579582875960484, + "grad_norm": 1.098272681236267, + "learning_rate": 1.606504276935858e-06, + "loss": 0.1636, + "step": 26564 + }, + { + "epoch": 1.4580680570801317, + "grad_norm": 1.0750492811203003, + "learning_rate": 1.6046581040377319e-06, + "loss": 0.1781, + "step": 26566 + }, + { + "epoch": 1.4581778265642151, + "grad_norm": 1.411992073059082, + "learning_rate": 1.602812957374128e-06, + "loss": 0.2599, + "step": 26568 + }, + { + "epoch": 1.4582875960482986, + "grad_norm": 0.9861490726470947, + "learning_rate": 1.60096883702599e-06, + "loss": 0.1103, + "step": 26570 + }, + { + "epoch": 1.4583973655323819, + "grad_norm": 1.39292573928833, + "learning_rate": 1.599125743074209e-06, + "loss": 0.2346, + "step": 26572 + }, + { + "epoch": 1.4585071350164656, + "grad_norm": 0.934270441532135, + "learning_rate": 1.5972836755996285e-06, + "loss": 0.1965, + "step": 26574 + }, + { + "epoch": 1.4586169045005488, + "grad_norm": 0.6963056325912476, + "learning_rate": 1.595442634683056e-06, + "loss": 0.1318, + "step": 26576 + }, + { + "epoch": 1.4587266739846323, + "grad_norm": 1.023980975151062, + "learning_rate": 1.5936026204052462e-06, + "loss": 0.2045, + "step": 26578 + }, + { + "epoch": 1.4588364434687158, + "grad_norm": 0.811961829662323, + "learning_rate": 1.5917636328469099e-06, + "loss": 0.1951, + "step": 26580 + }, + { + "epoch": 1.458946212952799, + "grad_norm": 1.062258243560791, + "learning_rate": 1.589925672088713e-06, + "loss": 0.2472, + "step": 26582 + }, + { + "epoch": 1.4590559824368825, + "grad_norm": 0.8865541219711304, + "learning_rate": 1.5880887382112825e-06, + "loss": 0.1425, + "step": 26584 + }, + { + "epoch": 1.459165751920966, + "grad_norm": 0.9561091065406799, + "learning_rate": 1.586252831295193e-06, + "loss": 0.1132, + "step": 26586 + }, + { + "epoch": 1.4592755214050495, + "grad_norm": 1.4022736549377441, + "learning_rate": 1.5844179514209746e-06, + "loss": 0.2531, + "step": 26588 + }, + { + "epoch": 1.459385290889133, + "grad_norm": 1.0814213752746582, + "learning_rate": 1.5825840986691154e-06, + "loss": 0.1248, + "step": 26590 + }, + { + "epoch": 1.4594950603732162, + "grad_norm": 0.9813456535339355, + "learning_rate": 1.5807512731200569e-06, + "loss": 0.278, + "step": 26592 + }, + { + "epoch": 1.4596048298572997, + "grad_norm": 1.4320112466812134, + "learning_rate": 1.578919474854193e-06, + "loss": 0.2786, + "step": 26594 + }, + { + "epoch": 1.4597145993413831, + "grad_norm": 1.7721041440963745, + "learning_rate": 1.5770887039518818e-06, + "loss": 0.2096, + "step": 26596 + }, + { + "epoch": 1.4598243688254664, + "grad_norm": 1.7153207063674927, + "learning_rate": 1.5752589604934255e-06, + "loss": 0.2531, + "step": 26598 + }, + { + "epoch": 1.4599341383095499, + "grad_norm": 0.9598408937454224, + "learning_rate": 1.5734302445590876e-06, + "loss": 0.2041, + "step": 26600 + }, + { + "epoch": 1.4600439077936334, + "grad_norm": 0.9473633170127869, + "learning_rate": 1.5716025562290848e-06, + "loss": 0.1585, + "step": 26602 + }, + { + "epoch": 1.4601536772777168, + "grad_norm": 1.6555675268173218, + "learning_rate": 1.5697758955835806e-06, + "loss": 0.1738, + "step": 26604 + }, + { + "epoch": 1.4602634467618003, + "grad_norm": 1.4840168952941895, + "learning_rate": 1.5679502627027136e-06, + "loss": 0.164, + "step": 26606 + }, + { + "epoch": 1.4603732162458836, + "grad_norm": 0.9930263757705688, + "learning_rate": 1.5661256576665562e-06, + "loss": 0.1061, + "step": 26608 + }, + { + "epoch": 1.460482985729967, + "grad_norm": 1.1403794288635254, + "learning_rate": 1.5643020805551495e-06, + "loss": 0.241, + "step": 26610 + }, + { + "epoch": 1.4605927552140505, + "grad_norm": 0.8993105292320251, + "learning_rate": 1.5624795314484851e-06, + "loss": 0.1839, + "step": 26612 + }, + { + "epoch": 1.460702524698134, + "grad_norm": 0.977264404296875, + "learning_rate": 1.560658010426505e-06, + "loss": 0.1977, + "step": 26614 + }, + { + "epoch": 1.4608122941822175, + "grad_norm": 1.5518821477890015, + "learning_rate": 1.5588375175691117e-06, + "loss": 0.1878, + "step": 26616 + }, + { + "epoch": 1.4609220636663007, + "grad_norm": 1.2602059841156006, + "learning_rate": 1.557018052956155e-06, + "loss": 0.1737, + "step": 26618 + }, + { + "epoch": 1.4610318331503842, + "grad_norm": 1.6099419593811035, + "learning_rate": 1.5551996166674576e-06, + "loss": 0.2767, + "step": 26620 + }, + { + "epoch": 1.4611416026344677, + "grad_norm": 1.0277612209320068, + "learning_rate": 1.5533822087827805e-06, + "loss": 0.148, + "step": 26622 + }, + { + "epoch": 1.461251372118551, + "grad_norm": 1.43793523311615, + "learning_rate": 1.5515658293818403e-06, + "loss": 0.1476, + "step": 26624 + }, + { + "epoch": 1.4613611416026344, + "grad_norm": 1.8986238241195679, + "learning_rate": 1.5497504785443151e-06, + "loss": 0.2326, + "step": 26626 + }, + { + "epoch": 1.461470911086718, + "grad_norm": 0.8554832935333252, + "learning_rate": 1.547936156349833e-06, + "loss": 0.1705, + "step": 26628 + }, + { + "epoch": 1.4615806805708014, + "grad_norm": 1.3233057260513306, + "learning_rate": 1.54612286287798e-06, + "loss": 0.2335, + "step": 26630 + }, + { + "epoch": 1.4616904500548848, + "grad_norm": 0.6640121340751648, + "learning_rate": 1.5443105982082984e-06, + "loss": 0.1851, + "step": 26632 + }, + { + "epoch": 1.461800219538968, + "grad_norm": 0.9966079592704773, + "learning_rate": 1.54249936242028e-06, + "loss": 0.2982, + "step": 26634 + }, + { + "epoch": 1.4619099890230516, + "grad_norm": 1.1274452209472656, + "learning_rate": 1.540689155593375e-06, + "loss": 0.2474, + "step": 26636 + }, + { + "epoch": 1.462019758507135, + "grad_norm": 1.0609385967254639, + "learning_rate": 1.5388799778069896e-06, + "loss": 0.2089, + "step": 26638 + }, + { + "epoch": 1.4621295279912183, + "grad_norm": 1.368825912475586, + "learning_rate": 1.5370718291404851e-06, + "loss": 0.2857, + "step": 26640 + }, + { + "epoch": 1.4622392974753018, + "grad_norm": 1.4740619659423828, + "learning_rate": 1.5352647096731705e-06, + "loss": 0.2358, + "step": 26642 + }, + { + "epoch": 1.4623490669593853, + "grad_norm": 1.1310924291610718, + "learning_rate": 1.5334586194843154e-06, + "loss": 0.1312, + "step": 26644 + }, + { + "epoch": 1.4624588364434687, + "grad_norm": 1.3792551755905151, + "learning_rate": 1.5316535586531483e-06, + "loss": 0.2157, + "step": 26646 + }, + { + "epoch": 1.4625686059275522, + "grad_norm": 1.0386961698532104, + "learning_rate": 1.5298495272588447e-06, + "loss": 0.1458, + "step": 26648 + }, + { + "epoch": 1.4626783754116355, + "grad_norm": 1.0713084936141968, + "learning_rate": 1.5280465253805415e-06, + "loss": 0.1256, + "step": 26650 + }, + { + "epoch": 1.462788144895719, + "grad_norm": 0.9941332340240479, + "learning_rate": 1.5262445530973224e-06, + "loss": 0.1215, + "step": 26652 + }, + { + "epoch": 1.4628979143798024, + "grad_norm": 1.0326941013336182, + "learning_rate": 1.5244436104882325e-06, + "loss": 0.1691, + "step": 26654 + }, + { + "epoch": 1.463007683863886, + "grad_norm": 1.3646533489227295, + "learning_rate": 1.5226436976322727e-06, + "loss": 0.1144, + "step": 26656 + }, + { + "epoch": 1.4631174533479694, + "grad_norm": 0.8900880813598633, + "learning_rate": 1.5208448146083937e-06, + "loss": 0.162, + "step": 26658 + }, + { + "epoch": 1.4632272228320526, + "grad_norm": 1.4820599555969238, + "learning_rate": 1.5190469614955045e-06, + "loss": 0.2032, + "step": 26660 + }, + { + "epoch": 1.4633369923161361, + "grad_norm": 1.1366525888442993, + "learning_rate": 1.5172501383724669e-06, + "loss": 0.1479, + "step": 26662 + }, + { + "epoch": 1.4634467618002196, + "grad_norm": 1.0803377628326416, + "learning_rate": 1.5154543453180958e-06, + "loss": 0.1476, + "step": 26664 + }, + { + "epoch": 1.4635565312843029, + "grad_norm": 1.4948070049285889, + "learning_rate": 1.5136595824111643e-06, + "loss": 0.1456, + "step": 26666 + }, + { + "epoch": 1.4636663007683863, + "grad_norm": 1.0903656482696533, + "learning_rate": 1.5118658497304011e-06, + "loss": 0.2583, + "step": 26668 + }, + { + "epoch": 1.4637760702524698, + "grad_norm": 0.8883850574493408, + "learning_rate": 1.5100731473544933e-06, + "loss": 0.212, + "step": 26670 + }, + { + "epoch": 1.4638858397365533, + "grad_norm": 1.5296192169189453, + "learning_rate": 1.5082814753620721e-06, + "loss": 0.1431, + "step": 26672 + }, + { + "epoch": 1.4639956092206368, + "grad_norm": 1.1197798252105713, + "learning_rate": 1.506490833831728e-06, + "loss": 0.1957, + "step": 26674 + }, + { + "epoch": 1.46410537870472, + "grad_norm": 1.7187947034835815, + "learning_rate": 1.5047012228420087e-06, + "loss": 0.2049, + "step": 26676 + }, + { + "epoch": 1.4642151481888035, + "grad_norm": 1.0281203985214233, + "learning_rate": 1.5029126424714185e-06, + "loss": 0.1835, + "step": 26678 + }, + { + "epoch": 1.464324917672887, + "grad_norm": 1.0081864595413208, + "learning_rate": 1.5011250927984028e-06, + "loss": 0.2031, + "step": 26680 + }, + { + "epoch": 1.4644346871569702, + "grad_norm": 1.0143005847930908, + "learning_rate": 1.499338573901385e-06, + "loss": 0.2807, + "step": 26682 + }, + { + "epoch": 1.4645444566410537, + "grad_norm": 1.928246259689331, + "learning_rate": 1.4975530858587271e-06, + "loss": 0.1676, + "step": 26684 + }, + { + "epoch": 1.4646542261251372, + "grad_norm": 1.2822461128234863, + "learning_rate": 1.495768628748745e-06, + "loss": 0.2991, + "step": 26686 + }, + { + "epoch": 1.4647639956092207, + "grad_norm": 1.2263319492340088, + "learning_rate": 1.4939852026497197e-06, + "loss": 0.144, + "step": 26688 + }, + { + "epoch": 1.4648737650933041, + "grad_norm": 1.6366894245147705, + "learning_rate": 1.4922028076398754e-06, + "loss": 0.2387, + "step": 26690 + }, + { + "epoch": 1.4649835345773874, + "grad_norm": 1.0322483777999878, + "learning_rate": 1.4904214437973963e-06, + "loss": 0.2395, + "step": 26692 + }, + { + "epoch": 1.4650933040614709, + "grad_norm": 0.829612135887146, + "learning_rate": 1.4886411112004255e-06, + "loss": 0.1248, + "step": 26694 + }, + { + "epoch": 1.4652030735455543, + "grad_norm": 0.8245943188667297, + "learning_rate": 1.4868618099270538e-06, + "loss": 0.1316, + "step": 26696 + }, + { + "epoch": 1.4653128430296378, + "grad_norm": 1.029578685760498, + "learning_rate": 1.4850835400553375e-06, + "loss": 0.1682, + "step": 26698 + }, + { + "epoch": 1.4654226125137213, + "grad_norm": 1.6003351211547852, + "learning_rate": 1.4833063016632759e-06, + "loss": 0.2669, + "step": 26700 + }, + { + "epoch": 1.4655323819978046, + "grad_norm": 0.7679964303970337, + "learning_rate": 1.481530094828823e-06, + "loss": 0.2464, + "step": 26702 + }, + { + "epoch": 1.465642151481888, + "grad_norm": 1.560452938079834, + "learning_rate": 1.4797549196298944e-06, + "loss": 0.1437, + "step": 26704 + }, + { + "epoch": 1.4657519209659715, + "grad_norm": 1.2305071353912354, + "learning_rate": 1.4779807761443636e-06, + "loss": 0.2372, + "step": 26706 + }, + { + "epoch": 1.4658616904500548, + "grad_norm": 1.1105999946594238, + "learning_rate": 1.4762076644500466e-06, + "loss": 0.1914, + "step": 26708 + }, + { + "epoch": 1.4659714599341382, + "grad_norm": 1.1923067569732666, + "learning_rate": 1.4744355846247255e-06, + "loss": 0.2426, + "step": 26710 + }, + { + "epoch": 1.4660812294182217, + "grad_norm": 1.3831418752670288, + "learning_rate": 1.4726645367461268e-06, + "loss": 0.204, + "step": 26712 + }, + { + "epoch": 1.4661909989023052, + "grad_norm": 1.0041900873184204, + "learning_rate": 1.470894520891944e-06, + "loss": 0.2001, + "step": 26714 + }, + { + "epoch": 1.4663007683863887, + "grad_norm": 0.8195383548736572, + "learning_rate": 1.4691255371398094e-06, + "loss": 0.1676, + "step": 26716 + }, + { + "epoch": 1.466410537870472, + "grad_norm": 0.8363613486289978, + "learning_rate": 1.4673575855673277e-06, + "loss": 0.1681, + "step": 26718 + }, + { + "epoch": 1.4665203073545554, + "grad_norm": 1.3259809017181396, + "learning_rate": 1.465590666252048e-06, + "loss": 0.229, + "step": 26720 + }, + { + "epoch": 1.4666300768386389, + "grad_norm": 1.085882544517517, + "learning_rate": 1.4638247792714777e-06, + "loss": 0.1154, + "step": 26722 + }, + { + "epoch": 1.4667398463227224, + "grad_norm": 1.7856123447418213, + "learning_rate": 1.4620599247030715e-06, + "loss": 0.2917, + "step": 26724 + }, + { + "epoch": 1.4668496158068058, + "grad_norm": 3.2295703887939453, + "learning_rate": 1.4602961026242479e-06, + "loss": 0.1964, + "step": 26726 + }, + { + "epoch": 1.466959385290889, + "grad_norm": 0.7550786733627319, + "learning_rate": 1.4585333131123785e-06, + "loss": 0.0882, + "step": 26728 + }, + { + "epoch": 1.4670691547749726, + "grad_norm": 0.9080488085746765, + "learning_rate": 1.456771556244782e-06, + "loss": 0.1353, + "step": 26730 + }, + { + "epoch": 1.467178924259056, + "grad_norm": 0.6337489485740662, + "learning_rate": 1.4550108320987466e-06, + "loss": 0.1838, + "step": 26732 + }, + { + "epoch": 1.4672886937431393, + "grad_norm": 0.9052376747131348, + "learning_rate": 1.453251140751502e-06, + "loss": 0.1688, + "step": 26734 + }, + { + "epoch": 1.4673984632272228, + "grad_norm": 0.621684193611145, + "learning_rate": 1.4514924822802367e-06, + "loss": 0.1073, + "step": 26736 + }, + { + "epoch": 1.4675082327113063, + "grad_norm": 1.2672823667526245, + "learning_rate": 1.4497348567620917e-06, + "loss": 0.2473, + "step": 26738 + }, + { + "epoch": 1.4676180021953897, + "grad_norm": 0.9363964200019836, + "learning_rate": 1.447978264274169e-06, + "loss": 0.1469, + "step": 26740 + }, + { + "epoch": 1.4677277716794732, + "grad_norm": 1.0533329248428345, + "learning_rate": 1.4462227048935183e-06, + "loss": 0.1592, + "step": 26742 + }, + { + "epoch": 1.4678375411635565, + "grad_norm": 1.2505762577056885, + "learning_rate": 1.4444681786971503e-06, + "loss": 0.1854, + "step": 26744 + }, + { + "epoch": 1.46794731064764, + "grad_norm": 0.9517371654510498, + "learning_rate": 1.4427146857620228e-06, + "loss": 0.1309, + "step": 26746 + }, + { + "epoch": 1.4680570801317234, + "grad_norm": 1.1632301807403564, + "learning_rate": 1.44096222616506e-06, + "loss": 0.1786, + "step": 26748 + }, + { + "epoch": 1.4681668496158067, + "grad_norm": 1.3756755590438843, + "learning_rate": 1.4392107999831262e-06, + "loss": 0.2519, + "step": 26750 + }, + { + "epoch": 1.4682766190998902, + "grad_norm": 0.7556758522987366, + "learning_rate": 1.4374604072930453e-06, + "loss": 0.1841, + "step": 26752 + }, + { + "epoch": 1.4683863885839736, + "grad_norm": 1.370928168296814, + "learning_rate": 1.4357110481716063e-06, + "loss": 0.2291, + "step": 26754 + }, + { + "epoch": 1.4684961580680571, + "grad_norm": 1.1435896158218384, + "learning_rate": 1.4339627226955392e-06, + "loss": 0.2546, + "step": 26756 + }, + { + "epoch": 1.4686059275521406, + "grad_norm": 0.9976693987846375, + "learning_rate": 1.4322154309415386e-06, + "loss": 0.2787, + "step": 26758 + }, + { + "epoch": 1.4687156970362238, + "grad_norm": 1.0085357427597046, + "learning_rate": 1.4304691729862457e-06, + "loss": 0.151, + "step": 26760 + }, + { + "epoch": 1.4688254665203073, + "grad_norm": 0.6654718518257141, + "learning_rate": 1.4287239489062632e-06, + "loss": 0.1313, + "step": 26762 + }, + { + "epoch": 1.4689352360043908, + "grad_norm": 1.7367043495178223, + "learning_rate": 1.426979758778141e-06, + "loss": 0.1745, + "step": 26764 + }, + { + "epoch": 1.4690450054884743, + "grad_norm": 1.563833475112915, + "learning_rate": 1.425236602678387e-06, + "loss": 0.2244, + "step": 26766 + }, + { + "epoch": 1.4691547749725578, + "grad_norm": 0.9361318349838257, + "learning_rate": 1.423494480683471e-06, + "loss": 0.157, + "step": 26768 + }, + { + "epoch": 1.469264544456641, + "grad_norm": 0.9715034365653992, + "learning_rate": 1.4217533928698067e-06, + "loss": 0.2423, + "step": 26770 + }, + { + "epoch": 1.4693743139407245, + "grad_norm": 1.821405053138733, + "learning_rate": 1.4200133393137693e-06, + "loss": 0.2459, + "step": 26772 + }, + { + "epoch": 1.469484083424808, + "grad_norm": 0.6861125826835632, + "learning_rate": 1.4182743200916838e-06, + "loss": 0.1534, + "step": 26774 + }, + { + "epoch": 1.4695938529088912, + "grad_norm": 1.8633393049240112, + "learning_rate": 1.4165363352798306e-06, + "loss": 0.2441, + "step": 26776 + }, + { + "epoch": 1.4697036223929747, + "grad_norm": 2.04132080078125, + "learning_rate": 1.4147993849544461e-06, + "loss": 0.2443, + "step": 26778 + }, + { + "epoch": 1.4698133918770582, + "grad_norm": 1.6223881244659424, + "learning_rate": 1.4130634691917249e-06, + "loss": 0.1959, + "step": 26780 + }, + { + "epoch": 1.4699231613611417, + "grad_norm": 0.8275705575942993, + "learning_rate": 1.4113285880678146e-06, + "loss": 0.1859, + "step": 26782 + }, + { + "epoch": 1.4700329308452251, + "grad_norm": 1.3209701776504517, + "learning_rate": 1.4095947416588124e-06, + "loss": 0.1586, + "step": 26784 + }, + { + "epoch": 1.4701427003293084, + "grad_norm": 1.0271977186203003, + "learning_rate": 1.4078619300407686e-06, + "loss": 0.1723, + "step": 26786 + }, + { + "epoch": 1.4702524698133919, + "grad_norm": 1.5682048797607422, + "learning_rate": 1.4061301532897003e-06, + "loss": 0.2142, + "step": 26788 + }, + { + "epoch": 1.4703622392974753, + "grad_norm": 1.4464682340621948, + "learning_rate": 1.4043994114815661e-06, + "loss": 0.183, + "step": 26790 + }, + { + "epoch": 1.4704720087815586, + "grad_norm": 1.4031857252120972, + "learning_rate": 1.4026697046922916e-06, + "loss": 0.1494, + "step": 26792 + }, + { + "epoch": 1.470581778265642, + "grad_norm": 1.4671005010604858, + "learning_rate": 1.4009410329977463e-06, + "loss": 0.1405, + "step": 26794 + }, + { + "epoch": 1.4706915477497255, + "grad_norm": 1.1511638164520264, + "learning_rate": 1.3992133964737586e-06, + "loss": 0.1674, + "step": 26796 + }, + { + "epoch": 1.470801317233809, + "grad_norm": 1.3584562540054321, + "learning_rate": 1.3974867951961095e-06, + "loss": 0.2296, + "step": 26798 + }, + { + "epoch": 1.4709110867178925, + "grad_norm": 0.8486929535865784, + "learning_rate": 1.3957612292405386e-06, + "loss": 0.1208, + "step": 26800 + }, + { + "epoch": 1.4710208562019758, + "grad_norm": 1.4564415216445923, + "learning_rate": 1.3940366986827325e-06, + "loss": 0.2363, + "step": 26802 + }, + { + "epoch": 1.4711306256860592, + "grad_norm": 1.0511703491210938, + "learning_rate": 1.3923132035983444e-06, + "loss": 0.2112, + "step": 26804 + }, + { + "epoch": 1.4712403951701427, + "grad_norm": 0.8785459399223328, + "learning_rate": 1.3905907440629752e-06, + "loss": 0.1335, + "step": 26806 + }, + { + "epoch": 1.4713501646542262, + "grad_norm": 1.336578607559204, + "learning_rate": 1.3888693201521752e-06, + "loss": 0.2224, + "step": 26808 + }, + { + "epoch": 1.4714599341383097, + "grad_norm": 1.1956374645233154, + "learning_rate": 1.3871489319414593e-06, + "loss": 0.2614, + "step": 26810 + }, + { + "epoch": 1.471569703622393, + "grad_norm": 1.2453058958053589, + "learning_rate": 1.3854295795062893e-06, + "loss": 0.1607, + "step": 26812 + }, + { + "epoch": 1.4716794731064764, + "grad_norm": 1.464855432510376, + "learning_rate": 1.38371126292208e-06, + "loss": 0.2074, + "step": 26814 + }, + { + "epoch": 1.4717892425905599, + "grad_norm": 1.2607585191726685, + "learning_rate": 1.3819939822642125e-06, + "loss": 0.1083, + "step": 26816 + }, + { + "epoch": 1.4718990120746431, + "grad_norm": 0.6276435256004333, + "learning_rate": 1.380277737608013e-06, + "loss": 0.128, + "step": 26818 + }, + { + "epoch": 1.4720087815587266, + "grad_norm": 1.1079487800598145, + "learning_rate": 1.378562529028768e-06, + "loss": 0.2376, + "step": 26820 + }, + { + "epoch": 1.47211855104281, + "grad_norm": 1.208394169807434, + "learning_rate": 1.3768483566017093e-06, + "loss": 0.1522, + "step": 26822 + }, + { + "epoch": 1.4722283205268936, + "grad_norm": 1.0854976177215576, + "learning_rate": 1.3751352204020324e-06, + "loss": 0.1258, + "step": 26824 + }, + { + "epoch": 1.472338090010977, + "grad_norm": 1.2688270807266235, + "learning_rate": 1.3734231205048825e-06, + "loss": 0.1793, + "step": 26826 + }, + { + "epoch": 1.4724478594950603, + "grad_norm": 1.458911657333374, + "learning_rate": 1.3717120569853554e-06, + "loss": 0.1494, + "step": 26828 + }, + { + "epoch": 1.4725576289791438, + "grad_norm": 0.7704915404319763, + "learning_rate": 1.3700020299185156e-06, + "loss": 0.2187, + "step": 26830 + }, + { + "epoch": 1.4726673984632273, + "grad_norm": 1.7400728464126587, + "learning_rate": 1.3682930393793703e-06, + "loss": 0.268, + "step": 26832 + }, + { + "epoch": 1.4727771679473107, + "grad_norm": 0.9438616037368774, + "learning_rate": 1.366585085442884e-06, + "loss": 0.2286, + "step": 26834 + }, + { + "epoch": 1.4728869374313942, + "grad_norm": 0.9427353143692017, + "learning_rate": 1.364878168183975e-06, + "loss": 0.2296, + "step": 26836 + }, + { + "epoch": 1.4729967069154775, + "grad_norm": 0.8964154124259949, + "learning_rate": 1.3631722876775138e-06, + "loss": 0.1145, + "step": 26838 + }, + { + "epoch": 1.473106476399561, + "grad_norm": 0.8259122371673584, + "learning_rate": 1.361467443998335e-06, + "loss": 0.1214, + "step": 26840 + }, + { + "epoch": 1.4732162458836444, + "grad_norm": 1.8962656259536743, + "learning_rate": 1.3597636372212202e-06, + "loss": 0.2001, + "step": 26842 + }, + { + "epoch": 1.4733260153677277, + "grad_norm": 1.2231433391571045, + "learning_rate": 1.3580608674209072e-06, + "loss": 0.1446, + "step": 26844 + }, + { + "epoch": 1.4734357848518111, + "grad_norm": 1.1352862119674683, + "learning_rate": 1.3563591346720804e-06, + "loss": 0.1586, + "step": 26846 + }, + { + "epoch": 1.4735455543358946, + "grad_norm": 0.8721756935119629, + "learning_rate": 1.3546584390493943e-06, + "loss": 0.1582, + "step": 26848 + }, + { + "epoch": 1.473655323819978, + "grad_norm": 1.2151457071304321, + "learning_rate": 1.3529587806274475e-06, + "loss": 0.191, + "step": 26850 + }, + { + "epoch": 1.4737650933040616, + "grad_norm": 1.0952259302139282, + "learning_rate": 1.3512601594807938e-06, + "loss": 0.1839, + "step": 26852 + }, + { + "epoch": 1.4738748627881448, + "grad_norm": 1.2113728523254395, + "learning_rate": 1.3495625756839463e-06, + "loss": 0.1954, + "step": 26854 + }, + { + "epoch": 1.4739846322722283, + "grad_norm": 0.8478478193283081, + "learning_rate": 1.3478660293113676e-06, + "loss": 0.115, + "step": 26856 + }, + { + "epoch": 1.4740944017563118, + "grad_norm": 1.0972820520401, + "learning_rate": 1.346170520437476e-06, + "loss": 0.1967, + "step": 26858 + }, + { + "epoch": 1.474204171240395, + "grad_norm": 1.0062940120697021, + "learning_rate": 1.3444760491366449e-06, + "loss": 0.1462, + "step": 26860 + }, + { + "epoch": 1.4743139407244785, + "grad_norm": 1.1012015342712402, + "learning_rate": 1.3427826154832042e-06, + "loss": 0.1703, + "step": 26862 + }, + { + "epoch": 1.474423710208562, + "grad_norm": 1.1821331977844238, + "learning_rate": 1.3410902195514303e-06, + "loss": 0.2138, + "step": 26864 + }, + { + "epoch": 1.4745334796926455, + "grad_norm": 0.966488778591156, + "learning_rate": 1.3393988614155667e-06, + "loss": 0.1472, + "step": 26866 + }, + { + "epoch": 1.474643249176729, + "grad_norm": 0.9935032725334167, + "learning_rate": 1.3377085411498015e-06, + "loss": 0.1122, + "step": 26868 + }, + { + "epoch": 1.4747530186608122, + "grad_norm": 1.4284394979476929, + "learning_rate": 1.3360192588282832e-06, + "loss": 0.2039, + "step": 26870 + }, + { + "epoch": 1.4748627881448957, + "grad_norm": 1.1037102937698364, + "learning_rate": 1.3343310145251087e-06, + "loss": 0.2414, + "step": 26872 + }, + { + "epoch": 1.4749725576289792, + "grad_norm": 1.346912145614624, + "learning_rate": 1.3326438083143295e-06, + "loss": 0.1782, + "step": 26874 + }, + { + "epoch": 1.4750823271130626, + "grad_norm": 1.1936615705490112, + "learning_rate": 1.3309576402699641e-06, + "loss": 0.1261, + "step": 26876 + }, + { + "epoch": 1.4751920965971461, + "grad_norm": 1.177504539489746, + "learning_rate": 1.3292725104659676e-06, + "loss": 0.1764, + "step": 26878 + }, + { + "epoch": 1.4753018660812294, + "grad_norm": 1.3031938076019287, + "learning_rate": 1.3275884189762638e-06, + "loss": 0.1174, + "step": 26880 + }, + { + "epoch": 1.4754116355653129, + "grad_norm": 0.9558658003807068, + "learning_rate": 1.3259053658747245e-06, + "loss": 0.1704, + "step": 26882 + }, + { + "epoch": 1.4755214050493963, + "grad_norm": 0.6472469568252563, + "learning_rate": 1.3242233512351737e-06, + "loss": 0.2185, + "step": 26884 + }, + { + "epoch": 1.4756311745334796, + "grad_norm": 0.8335785865783691, + "learning_rate": 1.3225423751313942e-06, + "loss": 0.1308, + "step": 26886 + }, + { + "epoch": 1.475740944017563, + "grad_norm": 0.8600618243217468, + "learning_rate": 1.3208624376371186e-06, + "loss": 0.2339, + "step": 26888 + }, + { + "epoch": 1.4758507135016465, + "grad_norm": 1.5197803974151611, + "learning_rate": 1.3191835388260464e-06, + "loss": 0.1925, + "step": 26890 + }, + { + "epoch": 1.47596048298573, + "grad_norm": 1.0327397584915161, + "learning_rate": 1.3175056787718132e-06, + "loss": 0.1081, + "step": 26892 + }, + { + "epoch": 1.4760702524698135, + "grad_norm": 0.9578825831413269, + "learning_rate": 1.3158288575480237e-06, + "loss": 0.1875, + "step": 26894 + }, + { + "epoch": 1.4761800219538967, + "grad_norm": 1.8068606853485107, + "learning_rate": 1.3141530752282277e-06, + "loss": 0.324, + "step": 26896 + }, + { + "epoch": 1.4762897914379802, + "grad_norm": 1.23947012424469, + "learning_rate": 1.3124783318859357e-06, + "loss": 0.1866, + "step": 26898 + }, + { + "epoch": 1.4763995609220637, + "grad_norm": 1.4690587520599365, + "learning_rate": 1.3108046275946057e-06, + "loss": 0.1912, + "step": 26900 + }, + { + "epoch": 1.476509330406147, + "grad_norm": 1.151584506034851, + "learning_rate": 1.309131962427662e-06, + "loss": 0.1347, + "step": 26902 + }, + { + "epoch": 1.4766190998902304, + "grad_norm": 0.6556332111358643, + "learning_rate": 1.3074603364584715e-06, + "loss": 0.1034, + "step": 26904 + }, + { + "epoch": 1.476728869374314, + "grad_norm": 1.1973562240600586, + "learning_rate": 1.305789749760361e-06, + "loss": 0.1734, + "step": 26906 + }, + { + "epoch": 1.4768386388583974, + "grad_norm": 1.3555978536605835, + "learning_rate": 1.3041202024066056e-06, + "loss": 0.2272, + "step": 26908 + }, + { + "epoch": 1.4769484083424809, + "grad_norm": 1.187575340270996, + "learning_rate": 1.3024516944704496e-06, + "loss": 0.279, + "step": 26910 + }, + { + "epoch": 1.4770581778265641, + "grad_norm": 1.0206180810928345, + "learning_rate": 1.300784226025073e-06, + "loss": 0.2041, + "step": 26912 + }, + { + "epoch": 1.4771679473106476, + "grad_norm": 1.1733254194259644, + "learning_rate": 1.299117797143623e-06, + "loss": 0.1934, + "step": 26914 + }, + { + "epoch": 1.477277716794731, + "grad_norm": 1.1014409065246582, + "learning_rate": 1.2974524078991995e-06, + "loss": 0.1743, + "step": 26916 + }, + { + "epoch": 1.4773874862788146, + "grad_norm": 1.835199236869812, + "learning_rate": 1.2957880583648525e-06, + "loss": 0.31, + "step": 26918 + }, + { + "epoch": 1.477497255762898, + "grad_norm": 1.014581561088562, + "learning_rate": 1.294124748613587e-06, + "loss": 0.1511, + "step": 26920 + }, + { + "epoch": 1.4776070252469813, + "grad_norm": 1.229236364364624, + "learning_rate": 1.2924624787183675e-06, + "loss": 0.2648, + "step": 26922 + }, + { + "epoch": 1.4777167947310648, + "grad_norm": 0.9826124906539917, + "learning_rate": 1.290801248752102e-06, + "loss": 0.1057, + "step": 26924 + }, + { + "epoch": 1.4778265642151482, + "grad_norm": 1.4734761714935303, + "learning_rate": 1.2891410587876711e-06, + "loss": 0.1935, + "step": 26926 + }, + { + "epoch": 1.4779363336992315, + "grad_norm": 0.8179731369018555, + "learning_rate": 1.2874819088978917e-06, + "loss": 0.13, + "step": 26928 + }, + { + "epoch": 1.478046103183315, + "grad_norm": 1.8006935119628906, + "learning_rate": 1.2858237991555443e-06, + "loss": 0.2037, + "step": 26930 + }, + { + "epoch": 1.4781558726673985, + "grad_norm": 1.3867557048797607, + "learning_rate": 1.2841667296333598e-06, + "loss": 0.1957, + "step": 26932 + }, + { + "epoch": 1.478265642151482, + "grad_norm": 1.268653154373169, + "learning_rate": 1.2825107004040272e-06, + "loss": 0.1406, + "step": 26934 + }, + { + "epoch": 1.4783754116355654, + "grad_norm": 1.0732353925704956, + "learning_rate": 1.2808557115401826e-06, + "loss": 0.199, + "step": 26936 + }, + { + "epoch": 1.4784851811196487, + "grad_norm": 1.1515387296676636, + "learning_rate": 1.2792017631144294e-06, + "loss": 0.3025, + "step": 26938 + }, + { + "epoch": 1.4785949506037321, + "grad_norm": 1.3125108480453491, + "learning_rate": 1.2775488551993176e-06, + "loss": 0.214, + "step": 26940 + }, + { + "epoch": 1.4787047200878156, + "grad_norm": 1.5566582679748535, + "learning_rate": 1.2758969878673504e-06, + "loss": 0.2775, + "step": 26942 + }, + { + "epoch": 1.478814489571899, + "grad_norm": 0.7478396892547607, + "learning_rate": 1.274246161190984e-06, + "loss": 0.1393, + "step": 26944 + }, + { + "epoch": 1.4789242590559826, + "grad_norm": 0.8608283996582031, + "learning_rate": 1.2725963752426379e-06, + "loss": 0.1617, + "step": 26946 + }, + { + "epoch": 1.4790340285400658, + "grad_norm": 1.6883043050765991, + "learning_rate": 1.270947630094671e-06, + "loss": 0.2721, + "step": 26948 + }, + { + "epoch": 1.4791437980241493, + "grad_norm": 1.4052581787109375, + "learning_rate": 1.2692999258194088e-06, + "loss": 0.1939, + "step": 26950 + }, + { + "epoch": 1.4792535675082328, + "grad_norm": 1.39873206615448, + "learning_rate": 1.2676532624891323e-06, + "loss": 0.2053, + "step": 26952 + }, + { + "epoch": 1.479363336992316, + "grad_norm": 1.3116319179534912, + "learning_rate": 1.26600764017607e-06, + "loss": 0.245, + "step": 26954 + }, + { + "epoch": 1.4794731064763995, + "grad_norm": 1.681062936782837, + "learning_rate": 1.264363058952406e-06, + "loss": 0.1818, + "step": 26956 + }, + { + "epoch": 1.479582875960483, + "grad_norm": 1.2001434564590454, + "learning_rate": 1.2627195188902791e-06, + "loss": 0.1597, + "step": 26958 + }, + { + "epoch": 1.4796926454445665, + "grad_norm": 1.18952214717865, + "learning_rate": 1.2610770200617794e-06, + "loss": 0.1907, + "step": 26960 + }, + { + "epoch": 1.47980241492865, + "grad_norm": 0.8632125854492188, + "learning_rate": 1.259435562538963e-06, + "loss": 0.1217, + "step": 26962 + }, + { + "epoch": 1.4799121844127332, + "grad_norm": 1.2401578426361084, + "learning_rate": 1.2577951463938282e-06, + "loss": 0.1696, + "step": 26964 + }, + { + "epoch": 1.4800219538968167, + "grad_norm": 0.7965257167816162, + "learning_rate": 1.2561557716983307e-06, + "loss": 0.105, + "step": 26966 + }, + { + "epoch": 1.4801317233809002, + "grad_norm": 1.8447635173797607, + "learning_rate": 1.2545174385243803e-06, + "loss": 0.1955, + "step": 26968 + }, + { + "epoch": 1.4802414928649834, + "grad_norm": 1.1331177949905396, + "learning_rate": 1.2528801469438495e-06, + "loss": 0.1215, + "step": 26970 + }, + { + "epoch": 1.4803512623490669, + "grad_norm": 1.280591368675232, + "learning_rate": 1.2512438970285506e-06, + "loss": 0.1577, + "step": 26972 + }, + { + "epoch": 1.4804610318331504, + "grad_norm": 1.2183266878128052, + "learning_rate": 1.2496086888502595e-06, + "loss": 0.131, + "step": 26974 + }, + { + "epoch": 1.4805708013172338, + "grad_norm": 2.2679970264434814, + "learning_rate": 1.2479745224807048e-06, + "loss": 0.1307, + "step": 26976 + }, + { + "epoch": 1.4806805708013173, + "grad_norm": 0.8130825161933899, + "learning_rate": 1.246341397991574e-06, + "loss": 0.1413, + "step": 26978 + }, + { + "epoch": 1.4807903402854006, + "grad_norm": 0.9964786767959595, + "learning_rate": 1.2447093154544954e-06, + "loss": 0.1661, + "step": 26980 + }, + { + "epoch": 1.480900109769484, + "grad_norm": 1.1431753635406494, + "learning_rate": 1.2430782749410673e-06, + "loss": 0.1539, + "step": 26982 + }, + { + "epoch": 1.4810098792535675, + "grad_norm": 1.0625287294387817, + "learning_rate": 1.2414482765228303e-06, + "loss": 0.1766, + "step": 26984 + }, + { + "epoch": 1.481119648737651, + "grad_norm": 0.9816773533821106, + "learning_rate": 1.2398193202712822e-06, + "loss": 0.1827, + "step": 26986 + }, + { + "epoch": 1.4812294182217345, + "grad_norm": 1.316699504852295, + "learning_rate": 1.2381914062578826e-06, + "loss": 0.2103, + "step": 26988 + }, + { + "epoch": 1.4813391877058177, + "grad_norm": 1.6896017789840698, + "learning_rate": 1.2365645345540384e-06, + "loss": 0.1839, + "step": 26990 + }, + { + "epoch": 1.4814489571899012, + "grad_norm": 1.4708582162857056, + "learning_rate": 1.2349387052311118e-06, + "loss": 0.2351, + "step": 26992 + }, + { + "epoch": 1.4815587266739847, + "grad_norm": 1.5132577419281006, + "learning_rate": 1.2333139183604208e-06, + "loss": 0.3254, + "step": 26994 + }, + { + "epoch": 1.481668496158068, + "grad_norm": 1.0325897932052612, + "learning_rate": 1.2316901740132335e-06, + "loss": 0.2104, + "step": 26996 + }, + { + "epoch": 1.4817782656421514, + "grad_norm": 1.0132081508636475, + "learning_rate": 1.2300674722607735e-06, + "loss": 0.2019, + "step": 26998 + }, + { + "epoch": 1.481888035126235, + "grad_norm": 1.1200238466262817, + "learning_rate": 1.2284458131742254e-06, + "loss": 0.1628, + "step": 27000 + }, + { + "epoch": 1.4819978046103184, + "grad_norm": 1.223275065422058, + "learning_rate": 1.226825196824724e-06, + "loss": 0.1676, + "step": 27002 + }, + { + "epoch": 1.4821075740944019, + "grad_norm": 1.0615131855010986, + "learning_rate": 1.2252056232833542e-06, + "loss": 0.2876, + "step": 27004 + }, + { + "epoch": 1.4822173435784851, + "grad_norm": 0.8231148719787598, + "learning_rate": 1.2235870926211619e-06, + "loss": 0.1734, + "step": 27006 + }, + { + "epoch": 1.4823271130625686, + "grad_norm": 1.1064118146896362, + "learning_rate": 1.2219696049091401e-06, + "loss": 0.1594, + "step": 27008 + }, + { + "epoch": 1.482436882546652, + "grad_norm": 2.162719249725342, + "learning_rate": 1.220353160218235e-06, + "loss": 0.1852, + "step": 27010 + }, + { + "epoch": 1.4825466520307353, + "grad_norm": 0.6460651159286499, + "learning_rate": 1.218737758619365e-06, + "loss": 0.1399, + "step": 27012 + }, + { + "epoch": 1.4826564215148188, + "grad_norm": 0.8508589863777161, + "learning_rate": 1.2171234001833787e-06, + "loss": 0.1644, + "step": 27014 + }, + { + "epoch": 1.4827661909989023, + "grad_norm": 1.0394654273986816, + "learning_rate": 1.2155100849810945e-06, + "loss": 0.1492, + "step": 27016 + }, + { + "epoch": 1.4828759604829858, + "grad_norm": 1.2426762580871582, + "learning_rate": 1.2138978130832813e-06, + "loss": 0.2007, + "step": 27018 + }, + { + "epoch": 1.4829857299670692, + "grad_norm": 1.189575433731079, + "learning_rate": 1.2122865845606569e-06, + "loss": 0.2058, + "step": 27020 + }, + { + "epoch": 1.4830954994511525, + "grad_norm": 1.0102990865707397, + "learning_rate": 1.2106763994838955e-06, + "loss": 0.1962, + "step": 27022 + }, + { + "epoch": 1.483205268935236, + "grad_norm": 1.459942102432251, + "learning_rate": 1.2090672579236379e-06, + "loss": 0.3576, + "step": 27024 + }, + { + "epoch": 1.4833150384193194, + "grad_norm": 1.233527421951294, + "learning_rate": 1.2074591599504608e-06, + "loss": 0.2082, + "step": 27026 + }, + { + "epoch": 1.483424807903403, + "grad_norm": 1.1816048622131348, + "learning_rate": 1.2058521056349054e-06, + "loss": 0.179, + "step": 27028 + }, + { + "epoch": 1.4835345773874864, + "grad_norm": 1.221389889717102, + "learning_rate": 1.2042460950474648e-06, + "loss": 0.185, + "step": 27030 + }, + { + "epoch": 1.4836443468715697, + "grad_norm": 1.2092076539993286, + "learning_rate": 1.2026411282585886e-06, + "loss": 0.1458, + "step": 27032 + }, + { + "epoch": 1.4837541163556531, + "grad_norm": 1.8764684200286865, + "learning_rate": 1.2010372053386787e-06, + "loss": 0.3049, + "step": 27034 + }, + { + "epoch": 1.4838638858397366, + "grad_norm": 1.0024229288101196, + "learning_rate": 1.1994343263580844e-06, + "loss": 0.1347, + "step": 27036 + }, + { + "epoch": 1.4839736553238199, + "grad_norm": 0.887673556804657, + "learning_rate": 1.1978324913871214e-06, + "loss": 0.1543, + "step": 27038 + }, + { + "epoch": 1.4840834248079033, + "grad_norm": 2.400355577468872, + "learning_rate": 1.196231700496056e-06, + "loss": 0.2126, + "step": 27040 + }, + { + "epoch": 1.4841931942919868, + "grad_norm": 1.1767361164093018, + "learning_rate": 1.1946319537551042e-06, + "loss": 0.162, + "step": 27042 + }, + { + "epoch": 1.4843029637760703, + "grad_norm": 0.8465344309806824, + "learning_rate": 1.1930332512344378e-06, + "loss": 0.151, + "step": 27044 + }, + { + "epoch": 1.4844127332601538, + "grad_norm": 1.6664787530899048, + "learning_rate": 1.1914355930041837e-06, + "loss": 0.183, + "step": 27046 + }, + { + "epoch": 1.484522502744237, + "grad_norm": 0.9323128461837769, + "learning_rate": 1.1898389791344223e-06, + "loss": 0.1931, + "step": 27048 + }, + { + "epoch": 1.4846322722283205, + "grad_norm": 1.373579978942871, + "learning_rate": 1.1882434096951916e-06, + "loss": 0.2281, + "step": 27050 + }, + { + "epoch": 1.484742041712404, + "grad_norm": 1.879095435142517, + "learning_rate": 1.1866488847564805e-06, + "loss": 0.2452, + "step": 27052 + }, + { + "epoch": 1.4848518111964875, + "grad_norm": 1.0517116785049438, + "learning_rate": 1.1850554043882328e-06, + "loss": 0.1782, + "step": 27054 + }, + { + "epoch": 1.484961580680571, + "grad_norm": 1.0708121061325073, + "learning_rate": 1.1834629686603455e-06, + "loss": 0.1415, + "step": 27056 + }, + { + "epoch": 1.4850713501646542, + "grad_norm": 1.0714521408081055, + "learning_rate": 1.181871577642668e-06, + "loss": 0.1545, + "step": 27058 + }, + { + "epoch": 1.4851811196487377, + "grad_norm": 0.6802189350128174, + "learning_rate": 1.1802812314050087e-06, + "loss": 0.1711, + "step": 27060 + }, + { + "epoch": 1.4852908891328211, + "grad_norm": 1.198815107345581, + "learning_rate": 1.178691930017134e-06, + "loss": 0.1923, + "step": 27062 + }, + { + "epoch": 1.4854006586169044, + "grad_norm": 1.312330722808838, + "learning_rate": 1.177103673548749e-06, + "loss": 0.1984, + "step": 27064 + }, + { + "epoch": 1.4855104281009879, + "grad_norm": 1.5203088521957397, + "learning_rate": 1.1755164620695315e-06, + "loss": 0.1632, + "step": 27066 + }, + { + "epoch": 1.4856201975850714, + "grad_norm": 1.109151005744934, + "learning_rate": 1.1739302956490949e-06, + "loss": 0.174, + "step": 27068 + }, + { + "epoch": 1.4857299670691548, + "grad_norm": 1.2100623846054077, + "learning_rate": 1.1723451743570229e-06, + "loss": 0.1811, + "step": 27070 + }, + { + "epoch": 1.4858397365532383, + "grad_norm": 1.415311336517334, + "learning_rate": 1.1707610982628402e-06, + "loss": 0.147, + "step": 27072 + }, + { + "epoch": 1.4859495060373216, + "grad_norm": 1.004027009010315, + "learning_rate": 1.1691780674360415e-06, + "loss": 0.2261, + "step": 27074 + }, + { + "epoch": 1.486059275521405, + "grad_norm": 1.3991968631744385, + "learning_rate": 1.1675960819460597e-06, + "loss": 0.2164, + "step": 27076 + }, + { + "epoch": 1.4861690450054885, + "grad_norm": 0.9029610753059387, + "learning_rate": 1.1660151418622922e-06, + "loss": 0.1117, + "step": 27078 + }, + { + "epoch": 1.4862788144895718, + "grad_norm": 1.622280240058899, + "learning_rate": 1.1644352472540837e-06, + "loss": 0.1798, + "step": 27080 + }, + { + "epoch": 1.4863885839736553, + "grad_norm": 0.9043530225753784, + "learning_rate": 1.162856398190737e-06, + "loss": 0.1496, + "step": 27082 + }, + { + "epoch": 1.4864983534577387, + "grad_norm": 1.740635633468628, + "learning_rate": 1.1612785947415022e-06, + "loss": 0.1451, + "step": 27084 + }, + { + "epoch": 1.4866081229418222, + "grad_norm": 1.1528083086013794, + "learning_rate": 1.159701836975602e-06, + "loss": 0.329, + "step": 27086 + }, + { + "epoch": 1.4867178924259057, + "grad_norm": 1.0484943389892578, + "learning_rate": 1.1581261249621916e-06, + "loss": 0.1167, + "step": 27088 + }, + { + "epoch": 1.486827661909989, + "grad_norm": 1.0265759229660034, + "learning_rate": 1.156551458770394e-06, + "loss": 0.1336, + "step": 27090 + }, + { + "epoch": 1.4869374313940724, + "grad_norm": 1.0782312154769897, + "learning_rate": 1.154977838469279e-06, + "loss": 0.1683, + "step": 27092 + }, + { + "epoch": 1.487047200878156, + "grad_norm": 1.0665804147720337, + "learning_rate": 1.153405264127877e-06, + "loss": 0.1696, + "step": 27094 + }, + { + "epoch": 1.4871569703622394, + "grad_norm": 1.0046521425247192, + "learning_rate": 1.1518337358151638e-06, + "loss": 0.2693, + "step": 27096 + }, + { + "epoch": 1.4872667398463228, + "grad_norm": 0.8330484628677368, + "learning_rate": 1.1502632536000729e-06, + "loss": 0.1336, + "step": 27098 + }, + { + "epoch": 1.487376509330406, + "grad_norm": 1.0901243686676025, + "learning_rate": 1.1486938175515021e-06, + "loss": 0.1699, + "step": 27100 + }, + { + "epoch": 1.4874862788144896, + "grad_norm": 1.3924877643585205, + "learning_rate": 1.1471254277382881e-06, + "loss": 0.2589, + "step": 27102 + }, + { + "epoch": 1.487596048298573, + "grad_norm": 0.9784572720527649, + "learning_rate": 1.1455580842292313e-06, + "loss": 0.1946, + "step": 27104 + }, + { + "epoch": 1.4877058177826563, + "grad_norm": 1.4969885349273682, + "learning_rate": 1.1439917870930793e-06, + "loss": 0.2841, + "step": 27106 + }, + { + "epoch": 1.4878155872667398, + "grad_norm": 1.1839015483856201, + "learning_rate": 1.1424265363985387e-06, + "loss": 0.2299, + "step": 27108 + }, + { + "epoch": 1.4879253567508233, + "grad_norm": 0.9580636620521545, + "learning_rate": 1.1408623322142736e-06, + "loss": 0.2814, + "step": 27110 + }, + { + "epoch": 1.4880351262349067, + "grad_norm": 0.822931170463562, + "learning_rate": 1.1392991746088932e-06, + "loss": 0.2425, + "step": 27112 + }, + { + "epoch": 1.4881448957189902, + "grad_norm": 1.3189200162887573, + "learning_rate": 1.137737063650965e-06, + "loss": 0.1936, + "step": 27114 + }, + { + "epoch": 1.4882546652030735, + "grad_norm": 1.182602882385254, + "learning_rate": 1.1361759994090116e-06, + "loss": 0.2103, + "step": 27116 + }, + { + "epoch": 1.488364434687157, + "grad_norm": 0.5868790149688721, + "learning_rate": 1.134615981951509e-06, + "loss": 0.0936, + "step": 27118 + }, + { + "epoch": 1.4884742041712404, + "grad_norm": 1.1096614599227905, + "learning_rate": 1.1330570113468886e-06, + "loss": 0.351, + "step": 27120 + }, + { + "epoch": 1.4885839736553237, + "grad_norm": 0.8171179294586182, + "learning_rate": 1.1314990876635318e-06, + "loss": 0.1065, + "step": 27122 + }, + { + "epoch": 1.4886937431394072, + "grad_norm": 1.3857829570770264, + "learning_rate": 1.1299422109697811e-06, + "loss": 0.1999, + "step": 27124 + }, + { + "epoch": 1.4888035126234906, + "grad_norm": 1.2680124044418335, + "learning_rate": 1.1283863813339263e-06, + "loss": 0.1634, + "step": 27126 + }, + { + "epoch": 1.4889132821075741, + "grad_norm": 1.0118238925933838, + "learning_rate": 1.1268315988242128e-06, + "loss": 0.2328, + "step": 27128 + }, + { + "epoch": 1.4890230515916576, + "grad_norm": 1.0853267908096313, + "learning_rate": 1.1252778635088418e-06, + "loss": 0.15, + "step": 27130 + }, + { + "epoch": 1.4891328210757409, + "grad_norm": 1.2866238355636597, + "learning_rate": 1.1237251754559696e-06, + "loss": 0.2197, + "step": 27132 + }, + { + "epoch": 1.4892425905598243, + "grad_norm": 0.7274439334869385, + "learning_rate": 1.1221735347336976e-06, + "loss": 0.1708, + "step": 27134 + }, + { + "epoch": 1.4893523600439078, + "grad_norm": 1.0099068880081177, + "learning_rate": 1.1206229414100988e-06, + "loss": 0.1531, + "step": 27136 + }, + { + "epoch": 1.4894621295279913, + "grad_norm": 1.7417556047439575, + "learning_rate": 1.119073395553183e-06, + "loss": 0.3166, + "step": 27138 + }, + { + "epoch": 1.4895718990120748, + "grad_norm": 0.8588268756866455, + "learning_rate": 1.1175248972309233e-06, + "loss": 0.1251, + "step": 27140 + }, + { + "epoch": 1.489681668496158, + "grad_norm": 1.274351954460144, + "learning_rate": 1.1159774465112432e-06, + "loss": 0.1829, + "step": 27142 + }, + { + "epoch": 1.4897914379802415, + "grad_norm": 1.0284122228622437, + "learning_rate": 1.1144310434620191e-06, + "loss": 0.1845, + "step": 27144 + }, + { + "epoch": 1.489901207464325, + "grad_norm": 1.4026669263839722, + "learning_rate": 1.1128856881510913e-06, + "loss": 0.1844, + "step": 27146 + }, + { + "epoch": 1.4900109769484082, + "grad_norm": 1.2346528768539429, + "learning_rate": 1.1113413806462385e-06, + "loss": 0.2051, + "step": 27148 + }, + { + "epoch": 1.4901207464324917, + "grad_norm": 1.2330061197280884, + "learning_rate": 1.1097981210152043e-06, + "loss": 0.1616, + "step": 27150 + }, + { + "epoch": 1.4902305159165752, + "grad_norm": 1.2900241613388062, + "learning_rate": 1.1090268841735308e-06, + "loss": 0.1826, + "step": 27152 + }, + { + "epoch": 1.4903402854006587, + "grad_norm": 1.2929307222366333, + "learning_rate": 1.1074851964801375e-06, + "loss": 0.1381, + "step": 27154 + }, + { + "epoch": 1.4904500548847421, + "grad_norm": 1.1488211154937744, + "learning_rate": 1.1059445568297216e-06, + "loss": 0.222, + "step": 27156 + }, + { + "epoch": 1.4905598243688254, + "grad_norm": 0.9024020433425903, + "learning_rate": 1.1044049652898541e-06, + "loss": 0.1404, + "step": 27158 + }, + { + "epoch": 1.4906695938529089, + "grad_norm": 1.18501615524292, + "learning_rate": 1.1028664219280727e-06, + "loss": 0.2328, + "step": 27160 + }, + { + "epoch": 1.4907793633369923, + "grad_norm": 1.2989379167556763, + "learning_rate": 1.101328926811862e-06, + "loss": 0.1559, + "step": 27162 + }, + { + "epoch": 1.4908891328210756, + "grad_norm": 1.2808988094329834, + "learning_rate": 1.0997924800086656e-06, + "loss": 0.1919, + "step": 27164 + }, + { + "epoch": 1.4909989023051593, + "grad_norm": 0.9049554467201233, + "learning_rate": 1.0982570815858795e-06, + "loss": 0.1506, + "step": 27166 + }, + { + "epoch": 1.4911086717892426, + "grad_norm": 0.8545461893081665, + "learning_rate": 1.0967227316108581e-06, + "loss": 0.1878, + "step": 27168 + }, + { + "epoch": 1.491218441273326, + "grad_norm": 1.2152410745620728, + "learning_rate": 1.0951894301509003e-06, + "loss": 0.2499, + "step": 27170 + }, + { + "epoch": 1.4913282107574095, + "grad_norm": 0.750538170337677, + "learning_rate": 1.0936571772732662e-06, + "loss": 0.1152, + "step": 27172 + }, + { + "epoch": 1.4914379802414928, + "grad_norm": 1.0543169975280762, + "learning_rate": 1.0921259730451689e-06, + "loss": 0.1375, + "step": 27174 + }, + { + "epoch": 1.4915477497255762, + "grad_norm": 0.9950021505355835, + "learning_rate": 1.0905958175337684e-06, + "loss": 0.1958, + "step": 27176 + }, + { + "epoch": 1.4916575192096597, + "grad_norm": 1.30376136302948, + "learning_rate": 1.0890667108061914e-06, + "loss": 0.1674, + "step": 27178 + }, + { + "epoch": 1.4917672886937432, + "grad_norm": 0.9384199976921082, + "learning_rate": 1.0875386529295096e-06, + "loss": 0.1164, + "step": 27180 + }, + { + "epoch": 1.4918770581778267, + "grad_norm": 2.699928045272827, + "learning_rate": 1.086011643970755e-06, + "loss": 0.1959, + "step": 27182 + }, + { + "epoch": 1.49198682766191, + "grad_norm": 1.4551725387573242, + "learning_rate": 1.0844856839969053e-06, + "loss": 0.1819, + "step": 27184 + }, + { + "epoch": 1.4920965971459934, + "grad_norm": 0.6867823600769043, + "learning_rate": 1.082960773074898e-06, + "loss": 0.1559, + "step": 27186 + }, + { + "epoch": 1.4922063666300769, + "grad_norm": 1.2927204370498657, + "learning_rate": 1.0814369112716215e-06, + "loss": 0.1786, + "step": 27188 + }, + { + "epoch": 1.4923161361141601, + "grad_norm": 0.8156511783599854, + "learning_rate": 1.0799140986539197e-06, + "loss": 0.1162, + "step": 27190 + }, + { + "epoch": 1.4924259055982436, + "grad_norm": 0.8498172760009766, + "learning_rate": 1.0783923352885945e-06, + "loss": 0.1364, + "step": 27192 + }, + { + "epoch": 1.492535675082327, + "grad_norm": 1.6390297412872314, + "learning_rate": 1.0768716212423978e-06, + "loss": 0.2048, + "step": 27194 + }, + { + "epoch": 1.4926454445664106, + "grad_norm": 1.3107185363769531, + "learning_rate": 1.0753519565820324e-06, + "loss": 0.2346, + "step": 27196 + }, + { + "epoch": 1.492755214050494, + "grad_norm": 1.5228681564331055, + "learning_rate": 1.0738333413741585e-06, + "loss": 0.1934, + "step": 27198 + }, + { + "epoch": 1.4928649835345773, + "grad_norm": 0.9767048358917236, + "learning_rate": 1.0723157756853897e-06, + "loss": 0.1723, + "step": 27200 + }, + { + "epoch": 1.4929747530186608, + "grad_norm": 1.2236653566360474, + "learning_rate": 1.0707992595822946e-06, + "loss": 0.1263, + "step": 27202 + }, + { + "epoch": 1.4930845225027443, + "grad_norm": 1.234204888343811, + "learning_rate": 1.0692837931313954e-06, + "loss": 0.1116, + "step": 27204 + }, + { + "epoch": 1.4931942919868277, + "grad_norm": 0.9713797569274902, + "learning_rate": 1.0677693763991692e-06, + "loss": 0.1363, + "step": 27206 + }, + { + "epoch": 1.4933040614709112, + "grad_norm": 1.1356507539749146, + "learning_rate": 1.0662560094520463e-06, + "loss": 0.1435, + "step": 27208 + }, + { + "epoch": 1.4934138309549945, + "grad_norm": 1.2614386081695557, + "learning_rate": 1.0647436923564042e-06, + "loss": 0.1769, + "step": 27210 + }, + { + "epoch": 1.493523600439078, + "grad_norm": 0.856023371219635, + "learning_rate": 1.063232425178587e-06, + "loss": 0.159, + "step": 27212 + }, + { + "epoch": 1.4936333699231614, + "grad_norm": 0.985474705696106, + "learning_rate": 1.0617222079848832e-06, + "loss": 0.1838, + "step": 27214 + }, + { + "epoch": 1.4937431394072447, + "grad_norm": 1.109570026397705, + "learning_rate": 1.06021304084154e-06, + "loss": 0.1696, + "step": 27216 + }, + { + "epoch": 1.4938529088913282, + "grad_norm": 0.9986222982406616, + "learning_rate": 1.0587049238147573e-06, + "loss": 0.1314, + "step": 27218 + }, + { + "epoch": 1.4939626783754116, + "grad_norm": 1.683457851409912, + "learning_rate": 1.0571978569706876e-06, + "loss": 0.1841, + "step": 27220 + }, + { + "epoch": 1.4940724478594951, + "grad_norm": 3.724536180496216, + "learning_rate": 1.055691840375439e-06, + "loss": 0.2178, + "step": 27222 + }, + { + "epoch": 1.4941822173435786, + "grad_norm": 1.4891070127487183, + "learning_rate": 1.05418687409507e-06, + "loss": 0.249, + "step": 27224 + }, + { + "epoch": 1.4942919868276618, + "grad_norm": 1.2858988046646118, + "learning_rate": 1.0526829581955972e-06, + "loss": 0.194, + "step": 27226 + }, + { + "epoch": 1.4944017563117453, + "grad_norm": 1.040900707244873, + "learning_rate": 1.05118009274299e-06, + "loss": 0.2228, + "step": 27228 + }, + { + "epoch": 1.4945115257958288, + "grad_norm": 1.3624581098556519, + "learning_rate": 1.0496782778031733e-06, + "loss": 0.2273, + "step": 27230 + }, + { + "epoch": 1.494621295279912, + "grad_norm": 1.4266327619552612, + "learning_rate": 1.0481775134420225e-06, + "loss": 0.2553, + "step": 27232 + }, + { + "epoch": 1.4947310647639955, + "grad_norm": 1.1887348890304565, + "learning_rate": 1.046677799725368e-06, + "loss": 0.1614, + "step": 27234 + }, + { + "epoch": 1.494840834248079, + "grad_norm": 1.278795599937439, + "learning_rate": 1.0451791367189962e-06, + "loss": 0.1867, + "step": 27236 + }, + { + "epoch": 1.4949506037321625, + "grad_norm": 1.0233500003814697, + "learning_rate": 1.0436815244886406e-06, + "loss": 0.1512, + "step": 27238 + }, + { + "epoch": 1.495060373216246, + "grad_norm": 0.9863272905349731, + "learning_rate": 1.0421849630999985e-06, + "loss": 0.2338, + "step": 27240 + }, + { + "epoch": 1.4951701427003292, + "grad_norm": 1.3618078231811523, + "learning_rate": 1.0406894526187177e-06, + "loss": 0.2185, + "step": 27242 + }, + { + "epoch": 1.4952799121844127, + "grad_norm": 1.2610585689544678, + "learning_rate": 1.0391949931103983e-06, + "loss": 0.2761, + "step": 27244 + }, + { + "epoch": 1.4953896816684962, + "grad_norm": 0.8963082432746887, + "learning_rate": 1.0377015846405908e-06, + "loss": 0.1525, + "step": 27246 + }, + { + "epoch": 1.4954994511525797, + "grad_norm": 0.9912958741188049, + "learning_rate": 1.0362092272748063e-06, + "loss": 0.1386, + "step": 27248 + }, + { + "epoch": 1.4956092206366631, + "grad_norm": 0.7719630002975464, + "learning_rate": 1.034717921078507e-06, + "loss": 0.1707, + "step": 27250 + }, + { + "epoch": 1.4957189901207464, + "grad_norm": 0.9465572834014893, + "learning_rate": 1.0332276661171064e-06, + "loss": 0.2293, + "step": 27252 + }, + { + "epoch": 1.4958287596048299, + "grad_norm": 1.5513575077056885, + "learning_rate": 1.031738462455978e-06, + "loss": 0.1791, + "step": 27254 + }, + { + "epoch": 1.4959385290889133, + "grad_norm": 1.8956751823425293, + "learning_rate": 1.0302503101604438e-06, + "loss": 0.1378, + "step": 27256 + }, + { + "epoch": 1.4960482985729966, + "grad_norm": 1.016487956047058, + "learning_rate": 1.02876320929578e-06, + "loss": 0.1157, + "step": 27258 + }, + { + "epoch": 1.49615806805708, + "grad_norm": 0.7628695964813232, + "learning_rate": 1.027277159927223e-06, + "loss": 0.217, + "step": 27260 + }, + { + "epoch": 1.4962678375411635, + "grad_norm": 1.0079882144927979, + "learning_rate": 1.0257921621199485e-06, + "loss": 0.1277, + "step": 27262 + }, + { + "epoch": 1.496377607025247, + "grad_norm": 1.5400042533874512, + "learning_rate": 1.024308215939107e-06, + "loss": 0.1727, + "step": 27264 + }, + { + "epoch": 1.4964873765093305, + "grad_norm": 1.1895121335983276, + "learning_rate": 1.0228253214497857e-06, + "loss": 0.2351, + "step": 27266 + }, + { + "epoch": 1.4965971459934138, + "grad_norm": 1.0138276815414429, + "learning_rate": 1.0213434787170323e-06, + "loss": 0.1341, + "step": 27268 + }, + { + "epoch": 1.4967069154774972, + "grad_norm": 1.0216845273971558, + "learning_rate": 1.0198626878058505e-06, + "loss": 0.1624, + "step": 27270 + }, + { + "epoch": 1.4968166849615807, + "grad_norm": 0.9549197554588318, + "learning_rate": 1.018382948781188e-06, + "loss": 0.1612, + "step": 27272 + }, + { + "epoch": 1.496926454445664, + "grad_norm": 2.4435067176818848, + "learning_rate": 1.01690426170796e-06, + "loss": 0.1888, + "step": 27274 + }, + { + "epoch": 1.4970362239297477, + "grad_norm": 1.0061026811599731, + "learning_rate": 1.0154266266510254e-06, + "loss": 0.1144, + "step": 27276 + }, + { + "epoch": 1.497145993413831, + "grad_norm": 1.1011693477630615, + "learning_rate": 1.0139500436752075e-06, + "loss": 0.1741, + "step": 27278 + }, + { + "epoch": 1.4972557628979144, + "grad_norm": 1.776753544807434, + "learning_rate": 1.0124745128452685e-06, + "loss": 0.1539, + "step": 27280 + }, + { + "epoch": 1.4973655323819979, + "grad_norm": 1.0300239324569702, + "learning_rate": 1.011000034225934e-06, + "loss": 0.1794, + "step": 27282 + }, + { + "epoch": 1.4974753018660811, + "grad_norm": 0.9554561376571655, + "learning_rate": 1.0095266078818832e-06, + "loss": 0.2153, + "step": 27284 + }, + { + "epoch": 1.4975850713501646, + "grad_norm": 1.0216975212097168, + "learning_rate": 1.0080542338777504e-06, + "loss": 0.138, + "step": 27286 + }, + { + "epoch": 1.497694840834248, + "grad_norm": 1.4572941064834595, + "learning_rate": 1.0065829122781117e-06, + "loss": 0.1421, + "step": 27288 + }, + { + "epoch": 1.4978046103183316, + "grad_norm": 1.246800422668457, + "learning_rate": 1.0051126431475183e-06, + "loss": 0.2843, + "step": 27290 + }, + { + "epoch": 1.497914379802415, + "grad_norm": 0.9721777439117432, + "learning_rate": 1.0036434265504575e-06, + "loss": 0.2492, + "step": 27292 + }, + { + "epoch": 1.4980241492864983, + "grad_norm": 1.3420954942703247, + "learning_rate": 1.0021752625513775e-06, + "loss": 0.2548, + "step": 27294 + }, + { + "epoch": 1.4981339187705818, + "grad_norm": 1.569533109664917, + "learning_rate": 1.00070815121468e-06, + "loss": 0.162, + "step": 27296 + }, + { + "epoch": 1.4982436882546653, + "grad_norm": 0.9957597255706787, + "learning_rate": 9.99242092604713e-07, + "loss": 0.2268, + "step": 27298 + }, + { + "epoch": 1.4983534577387485, + "grad_norm": 1.2686924934387207, + "learning_rate": 9.97777086785795e-07, + "loss": 0.1614, + "step": 27300 + }, + { + "epoch": 1.498463227222832, + "grad_norm": 0.8217695951461792, + "learning_rate": 9.9631313382218e-07, + "loss": 0.1444, + "step": 27302 + }, + { + "epoch": 1.4985729967069155, + "grad_norm": 1.294048547744751, + "learning_rate": 9.948502337780912e-07, + "loss": 0.1432, + "step": 27304 + }, + { + "epoch": 1.498682766190999, + "grad_norm": 0.47164973616600037, + "learning_rate": 9.933883867176975e-07, + "loss": 0.1486, + "step": 27306 + }, + { + "epoch": 1.4987925356750824, + "grad_norm": 0.9670827388763428, + "learning_rate": 9.919275927051163e-07, + "loss": 0.1418, + "step": 27308 + }, + { + "epoch": 1.4989023051591657, + "grad_norm": 1.5098158121109009, + "learning_rate": 9.90467851804433e-07, + "loss": 0.2589, + "step": 27310 + }, + { + "epoch": 1.4990120746432491, + "grad_norm": 0.9621433019638062, + "learning_rate": 9.89009164079671e-07, + "loss": 0.1798, + "step": 27312 + }, + { + "epoch": 1.4991218441273326, + "grad_norm": 0.8505073189735413, + "learning_rate": 9.87551529594824e-07, + "loss": 0.1562, + "step": 27314 + }, + { + "epoch": 1.499231613611416, + "grad_norm": 1.3205729722976685, + "learning_rate": 9.860949484138237e-07, + "loss": 0.1969, + "step": 27316 + }, + { + "epoch": 1.4993413830954996, + "grad_norm": 1.3811004161834717, + "learning_rate": 9.846394206005694e-07, + "loss": 0.2892, + "step": 27318 + }, + { + "epoch": 1.4994511525795828, + "grad_norm": 1.3128674030303955, + "learning_rate": 9.83184946218904e-07, + "loss": 0.1613, + "step": 27320 + }, + { + "epoch": 1.4995609220636663, + "grad_norm": 0.9915310740470886, + "learning_rate": 9.81731525332627e-07, + "loss": 0.1623, + "step": 27322 + }, + { + "epoch": 1.4996706915477498, + "grad_norm": 0.814268171787262, + "learning_rate": 9.802791580054925e-07, + "loss": 0.1496, + "step": 27324 + }, + { + "epoch": 1.499780461031833, + "grad_norm": 0.6719660758972168, + "learning_rate": 9.788278443012111e-07, + "loss": 0.1278, + "step": 27326 + }, + { + "epoch": 1.4998902305159165, + "grad_norm": 1.2484791278839111, + "learning_rate": 9.773775842834453e-07, + "loss": 0.2736, + "step": 27328 + }, + { + "epoch": 1.5, + "grad_norm": 1.159603476524353, + "learning_rate": 9.759283780158085e-07, + "loss": 0.1613, + "step": 27330 + }, + { + "epoch": 1.5001097694840835, + "grad_norm": 1.2633953094482422, + "learning_rate": 9.744802255618663e-07, + "loss": 0.2426, + "step": 27332 + }, + { + "epoch": 1.500219538968167, + "grad_norm": 1.3607152700424194, + "learning_rate": 9.730331269851484e-07, + "loss": 0.2532, + "step": 27334 + }, + { + "epoch": 1.5003293084522502, + "grad_norm": 2.288706064224243, + "learning_rate": 9.71587082349129e-07, + "loss": 0.2472, + "step": 27336 + }, + { + "epoch": 1.5004390779363337, + "grad_norm": 0.6265134811401367, + "learning_rate": 9.701420917172355e-07, + "loss": 0.1505, + "step": 27338 + }, + { + "epoch": 1.5005488474204172, + "grad_norm": 1.5842210054397583, + "learning_rate": 9.686981551528584e-07, + "loss": 0.2105, + "step": 27340 + }, + { + "epoch": 1.5006586169045004, + "grad_norm": 1.4393211603164673, + "learning_rate": 9.672552727193307e-07, + "loss": 0.1855, + "step": 27342 + }, + { + "epoch": 1.5007683863885841, + "grad_norm": 0.933541476726532, + "learning_rate": 9.658134444799488e-07, + "loss": 0.2392, + "step": 27344 + }, + { + "epoch": 1.5008781558726674, + "grad_norm": 0.9138675332069397, + "learning_rate": 9.643726704979567e-07, + "loss": 0.1643, + "step": 27346 + }, + { + "epoch": 1.5009879253567509, + "grad_norm": 1.101987600326538, + "learning_rate": 9.629329508365482e-07, + "loss": 0.1577, + "step": 27348 + }, + { + "epoch": 1.5010976948408343, + "grad_norm": 0.8095799088478088, + "learning_rate": 9.614942855588865e-07, + "loss": 0.216, + "step": 27350 + }, + { + "epoch": 1.5012074643249176, + "grad_norm": 0.8269360661506653, + "learning_rate": 9.600566747280715e-07, + "loss": 0.1543, + "step": 27352 + }, + { + "epoch": 1.501317233809001, + "grad_norm": 1.084173560142517, + "learning_rate": 9.586201184071664e-07, + "loss": 0.1632, + "step": 27354 + }, + { + "epoch": 1.5014270032930845, + "grad_norm": 0.8464735150337219, + "learning_rate": 9.571846166591848e-07, + "loss": 0.1432, + "step": 27356 + }, + { + "epoch": 1.5015367727771678, + "grad_norm": 1.058132290840149, + "learning_rate": 9.557501695470956e-07, + "loss": 0.2252, + "step": 27358 + }, + { + "epoch": 1.5016465422612515, + "grad_norm": 0.8725492358207703, + "learning_rate": 9.543167771338152e-07, + "loss": 0.1319, + "step": 27360 + }, + { + "epoch": 1.5017563117453347, + "grad_norm": 1.2819215059280396, + "learning_rate": 9.528844394822239e-07, + "loss": 0.1333, + "step": 27362 + }, + { + "epoch": 1.5018660812294182, + "grad_norm": 1.343977689743042, + "learning_rate": 9.514531566551577e-07, + "loss": 0.2817, + "step": 27364 + }, + { + "epoch": 1.5019758507135017, + "grad_norm": 1.4672225713729858, + "learning_rate": 9.500229287153911e-07, + "loss": 0.1467, + "step": 27366 + }, + { + "epoch": 1.502085620197585, + "grad_norm": 1.2318096160888672, + "learning_rate": 9.48593755725663e-07, + "loss": 0.1935, + "step": 27368 + }, + { + "epoch": 1.5021953896816687, + "grad_norm": 0.8166566491127014, + "learning_rate": 9.471656377486649e-07, + "loss": 0.2232, + "step": 27370 + }, + { + "epoch": 1.502305159165752, + "grad_norm": 1.1351999044418335, + "learning_rate": 9.457385748470382e-07, + "loss": 0.1669, + "step": 27372 + }, + { + "epoch": 1.5024149286498354, + "grad_norm": 1.0981196165084839, + "learning_rate": 9.44312567083383e-07, + "loss": 0.1519, + "step": 27374 + }, + { + "epoch": 1.5025246981339189, + "grad_norm": 0.6043209433555603, + "learning_rate": 9.428876145202519e-07, + "loss": 0.1456, + "step": 27376 + }, + { + "epoch": 1.5026344676180021, + "grad_norm": 0.9321527481079102, + "learning_rate": 9.414637172201479e-07, + "loss": 0.2246, + "step": 27378 + }, + { + "epoch": 1.5027442371020856, + "grad_norm": 1.2576979398727417, + "learning_rate": 9.400408752455348e-07, + "loss": 0.1649, + "step": 27380 + }, + { + "epoch": 1.502854006586169, + "grad_norm": 0.8702871203422546, + "learning_rate": 9.386190886588208e-07, + "loss": 0.1434, + "step": 27382 + }, + { + "epoch": 1.5029637760702523, + "grad_norm": 1.2184314727783203, + "learning_rate": 9.371983575223703e-07, + "loss": 0.1783, + "step": 27384 + }, + { + "epoch": 1.503073545554336, + "grad_norm": 1.3795442581176758, + "learning_rate": 9.357786818985109e-07, + "loss": 0.1472, + "step": 27386 + }, + { + "epoch": 1.5031833150384193, + "grad_norm": 1.2044508457183838, + "learning_rate": 9.343600618495096e-07, + "loss": 0.085, + "step": 27388 + }, + { + "epoch": 1.5032930845225028, + "grad_norm": 1.4222536087036133, + "learning_rate": 9.329424974376e-07, + "loss": 0.3114, + "step": 27390 + }, + { + "epoch": 1.5034028540065862, + "grad_norm": 0.9731217622756958, + "learning_rate": 9.315259887249572e-07, + "loss": 0.1997, + "step": 27392 + }, + { + "epoch": 1.5035126234906695, + "grad_norm": 3.6699540615081787, + "learning_rate": 9.301105357737233e-07, + "loss": 0.1832, + "step": 27394 + }, + { + "epoch": 1.503622392974753, + "grad_norm": 1.0038509368896484, + "learning_rate": 9.286961386459819e-07, + "loss": 0.1448, + "step": 27396 + }, + { + "epoch": 1.5037321624588365, + "grad_norm": 0.8960392475128174, + "learning_rate": 9.272827974037751e-07, + "loss": 0.1498, + "step": 27398 + }, + { + "epoch": 1.5038419319429197, + "grad_norm": 1.0786572694778442, + "learning_rate": 9.258705121091032e-07, + "loss": 0.1794, + "step": 27400 + }, + { + "epoch": 1.5039517014270034, + "grad_norm": 0.9405278563499451, + "learning_rate": 9.244592828239113e-07, + "loss": 0.1123, + "step": 27402 + }, + { + "epoch": 1.5040614709110867, + "grad_norm": 0.996332049369812, + "learning_rate": 9.230491096101079e-07, + "loss": 0.1756, + "step": 27404 + }, + { + "epoch": 1.5041712403951701, + "grad_norm": 1.0309271812438965, + "learning_rate": 9.216399925295466e-07, + "loss": 0.1631, + "step": 27406 + }, + { + "epoch": 1.5042810098792536, + "grad_norm": 1.224302887916565, + "learning_rate": 9.202319316440361e-07, + "loss": 0.1709, + "step": 27408 + }, + { + "epoch": 1.5043907793633369, + "grad_norm": 1.3035422563552856, + "learning_rate": 9.188249270153437e-07, + "loss": 0.1452, + "step": 27410 + }, + { + "epoch": 1.5045005488474206, + "grad_norm": 1.3694438934326172, + "learning_rate": 9.174189787051896e-07, + "loss": 0.373, + "step": 27412 + }, + { + "epoch": 1.5046103183315038, + "grad_norm": 1.1225193738937378, + "learning_rate": 9.160140867752409e-07, + "loss": 0.1581, + "step": 27414 + }, + { + "epoch": 1.5047200878155873, + "grad_norm": 0.979350745677948, + "learning_rate": 9.14610251287129e-07, + "loss": 0.2387, + "step": 27416 + }, + { + "epoch": 1.5048298572996708, + "grad_norm": 1.0691943168640137, + "learning_rate": 9.13207472302427e-07, + "loss": 0.1333, + "step": 27418 + }, + { + "epoch": 1.504939626783754, + "grad_norm": 1.4082533121109009, + "learning_rate": 9.118057498826715e-07, + "loss": 0.2288, + "step": 27420 + }, + { + "epoch": 1.5050493962678375, + "grad_norm": 0.9993407130241394, + "learning_rate": 9.104050840893441e-07, + "loss": 0.2031, + "step": 27422 + }, + { + "epoch": 1.505159165751921, + "grad_norm": 1.1543638706207275, + "learning_rate": 9.0900547498389e-07, + "loss": 0.2177, + "step": 27424 + }, + { + "epoch": 1.5052689352360042, + "grad_norm": 1.0955753326416016, + "learning_rate": 9.076069226277017e-07, + "loss": 0.1438, + "step": 27426 + }, + { + "epoch": 1.505378704720088, + "grad_norm": 1.3903436660766602, + "learning_rate": 9.062094270821303e-07, + "loss": 0.2163, + "step": 27428 + }, + { + "epoch": 1.5054884742041712, + "grad_norm": 1.2857383489608765, + "learning_rate": 9.048129884084683e-07, + "loss": 0.1943, + "step": 27430 + }, + { + "epoch": 1.5055982436882547, + "grad_norm": 1.6430370807647705, + "learning_rate": 9.034176066679778e-07, + "loss": 0.1377, + "step": 27432 + }, + { + "epoch": 1.5057080131723382, + "grad_norm": 0.9128066301345825, + "learning_rate": 9.020232819218599e-07, + "loss": 0.1384, + "step": 27434 + }, + { + "epoch": 1.5058177826564214, + "grad_norm": 1.2326922416687012, + "learning_rate": 9.006300142312824e-07, + "loss": 0.1641, + "step": 27436 + }, + { + "epoch": 1.5059275521405049, + "grad_norm": 1.3023549318313599, + "learning_rate": 8.992378036573601e-07, + "loss": 0.1401, + "step": 27438 + }, + { + "epoch": 1.5060373216245884, + "grad_norm": 0.8813126087188721, + "learning_rate": 8.97846650261161e-07, + "loss": 0.2049, + "step": 27440 + }, + { + "epoch": 1.5061470911086718, + "grad_norm": 1.176814079284668, + "learning_rate": 8.964565541037085e-07, + "loss": 0.1709, + "step": 27442 + }, + { + "epoch": 1.5062568605927553, + "grad_norm": 1.0348336696624756, + "learning_rate": 8.950675152459786e-07, + "loss": 0.1551, + "step": 27444 + }, + { + "epoch": 1.5063666300768386, + "grad_norm": 1.0497983694076538, + "learning_rate": 8.936795337489007e-07, + "loss": 0.1956, + "step": 27446 + }, + { + "epoch": 1.506476399560922, + "grad_norm": 1.2334604263305664, + "learning_rate": 8.922926096733591e-07, + "loss": 0.1576, + "step": 27448 + }, + { + "epoch": 1.5065861690450055, + "grad_norm": 1.4686611890792847, + "learning_rate": 8.909067430801915e-07, + "loss": 0.1876, + "step": 27450 + }, + { + "epoch": 1.5066959385290888, + "grad_norm": 0.7323606610298157, + "learning_rate": 8.89521934030188e-07, + "loss": 0.2735, + "step": 27452 + }, + { + "epoch": 1.5068057080131725, + "grad_norm": 1.569042682647705, + "learning_rate": 8.881381825840946e-07, + "loss": 0.3207, + "step": 27454 + }, + { + "epoch": 1.5069154774972557, + "grad_norm": 0.7385985255241394, + "learning_rate": 8.867554888026097e-07, + "loss": 0.114, + "step": 27456 + }, + { + "epoch": 1.5070252469813392, + "grad_norm": 0.7860503792762756, + "learning_rate": 8.853738527463823e-07, + "loss": 0.1115, + "step": 27458 + }, + { + "epoch": 1.5071350164654227, + "grad_norm": 0.9607559442520142, + "learning_rate": 8.839932744760165e-07, + "loss": 0.2369, + "step": 27460 + }, + { + "epoch": 1.507244785949506, + "grad_norm": 1.0615379810333252, + "learning_rate": 8.826137540520779e-07, + "loss": 0.1487, + "step": 27462 + }, + { + "epoch": 1.5073545554335894, + "grad_norm": 2.9506731033325195, + "learning_rate": 8.812352915350736e-07, + "loss": 0.1609, + "step": 27464 + }, + { + "epoch": 1.507464324917673, + "grad_norm": 0.8877100944519043, + "learning_rate": 8.798578869854717e-07, + "loss": 0.2639, + "step": 27466 + }, + { + "epoch": 1.5075740944017562, + "grad_norm": 0.7494379878044128, + "learning_rate": 8.784815404636937e-07, + "loss": 0.1133, + "step": 27468 + }, + { + "epoch": 1.5076838638858399, + "grad_norm": 1.2285692691802979, + "learning_rate": 8.771062520301049e-07, + "loss": 0.2517, + "step": 27470 + }, + { + "epoch": 1.5077936333699231, + "grad_norm": 0.9394497275352478, + "learning_rate": 8.757320217450432e-07, + "loss": 0.1972, + "step": 27472 + }, + { + "epoch": 1.5079034028540066, + "grad_norm": 0.9773696660995483, + "learning_rate": 8.743588496687827e-07, + "loss": 0.2059, + "step": 27474 + }, + { + "epoch": 1.50801317233809, + "grad_norm": 1.1193927526474, + "learning_rate": 8.729867358615613e-07, + "loss": 0.167, + "step": 27476 + }, + { + "epoch": 1.5081229418221733, + "grad_norm": 1.1916924715042114, + "learning_rate": 8.716156803835613e-07, + "loss": 0.1239, + "step": 27478 + }, + { + "epoch": 1.508232711306257, + "grad_norm": 0.6764519214630127, + "learning_rate": 8.702456832949262e-07, + "loss": 0.0967, + "step": 27480 + }, + { + "epoch": 1.5083424807903403, + "grad_norm": 1.1566054821014404, + "learning_rate": 8.6887674465575e-07, + "loss": 0.1826, + "step": 27482 + }, + { + "epoch": 1.5084522502744238, + "grad_norm": 1.1914937496185303, + "learning_rate": 8.675088645260815e-07, + "loss": 0.1478, + "step": 27484 + }, + { + "epoch": 1.5085620197585072, + "grad_norm": 0.8898447155952454, + "learning_rate": 8.661420429659256e-07, + "loss": 0.1532, + "step": 27486 + }, + { + "epoch": 1.5086717892425905, + "grad_norm": 0.9363105893135071, + "learning_rate": 8.647762800352371e-07, + "loss": 0.108, + "step": 27488 + }, + { + "epoch": 1.508781558726674, + "grad_norm": 0.9121467471122742, + "learning_rate": 8.634115757939209e-07, + "loss": 0.1036, + "step": 27490 + }, + { + "epoch": 1.5088913282107574, + "grad_norm": 1.2824140787124634, + "learning_rate": 8.620479303018458e-07, + "loss": 0.1434, + "step": 27492 + }, + { + "epoch": 1.5090010976948407, + "grad_norm": 0.9173287749290466, + "learning_rate": 8.606853436188222e-07, + "loss": 0.1824, + "step": 27494 + }, + { + "epoch": 1.5091108671789244, + "grad_norm": 0.9987452030181885, + "learning_rate": 8.593238158046219e-07, + "loss": 0.1702, + "step": 27496 + }, + { + "epoch": 1.5092206366630077, + "grad_norm": 1.1171623468399048, + "learning_rate": 8.57963346918969e-07, + "loss": 0.1498, + "step": 27498 + }, + { + "epoch": 1.5093304061470911, + "grad_norm": 1.0110690593719482, + "learning_rate": 8.566039370215412e-07, + "loss": 0.1242, + "step": 27500 + }, + { + "epoch": 1.5094401756311746, + "grad_norm": 1.2426797151565552, + "learning_rate": 8.552455861719654e-07, + "loss": 0.1631, + "step": 27502 + }, + { + "epoch": 1.5095499451152579, + "grad_norm": 2.016388177871704, + "learning_rate": 8.538882944298277e-07, + "loss": 0.1814, + "step": 27504 + }, + { + "epoch": 1.5096597145993413, + "grad_norm": 1.0213723182678223, + "learning_rate": 8.52532061854669e-07, + "loss": 0.1171, + "step": 27506 + }, + { + "epoch": 1.5097694840834248, + "grad_norm": 2.1398260593414307, + "learning_rate": 8.511768885059695e-07, + "loss": 0.2423, + "step": 27508 + }, + { + "epoch": 1.509879253567508, + "grad_norm": 1.1266463994979858, + "learning_rate": 8.498227744431875e-07, + "loss": 0.2547, + "step": 27510 + }, + { + "epoch": 1.5099890230515918, + "grad_norm": 0.8976377248764038, + "learning_rate": 8.484697197257141e-07, + "loss": 0.124, + "step": 27512 + }, + { + "epoch": 1.510098792535675, + "grad_norm": 1.036623239517212, + "learning_rate": 8.471177244128964e-07, + "loss": 0.1243, + "step": 27514 + }, + { + "epoch": 1.5102085620197585, + "grad_norm": 0.8843521475791931, + "learning_rate": 8.457667885640481e-07, + "loss": 0.1118, + "step": 27516 + }, + { + "epoch": 1.510318331503842, + "grad_norm": 1.351015567779541, + "learning_rate": 8.444169122384271e-07, + "loss": 0.2018, + "step": 27518 + }, + { + "epoch": 1.5104281009879252, + "grad_norm": 0.9789005517959595, + "learning_rate": 8.430680954952364e-07, + "loss": 0.1788, + "step": 27520 + }, + { + "epoch": 1.510537870472009, + "grad_norm": 1.889992117881775, + "learning_rate": 8.417203383936534e-07, + "loss": 0.174, + "step": 27522 + }, + { + "epoch": 1.5106476399560922, + "grad_norm": 0.9638047218322754, + "learning_rate": 8.40373640992792e-07, + "loss": 0.1643, + "step": 27524 + }, + { + "epoch": 1.5107574094401757, + "grad_norm": 2.0035829544067383, + "learning_rate": 8.390280033517245e-07, + "loss": 0.1804, + "step": 27526 + }, + { + "epoch": 1.5108671789242591, + "grad_norm": 1.0007193088531494, + "learning_rate": 8.376834255294786e-07, + "loss": 0.132, + "step": 27528 + }, + { + "epoch": 1.5109769484083424, + "grad_norm": 1.429442286491394, + "learning_rate": 8.36339907585032e-07, + "loss": 0.1535, + "step": 27530 + }, + { + "epoch": 1.5110867178924259, + "grad_norm": 1.2739988565444946, + "learning_rate": 8.349974495773183e-07, + "loss": 0.2833, + "step": 27532 + }, + { + "epoch": 1.5111964873765094, + "grad_norm": 1.1414618492126465, + "learning_rate": 8.336560515652264e-07, + "loss": 0.2282, + "step": 27534 + }, + { + "epoch": 1.5113062568605926, + "grad_norm": 1.340187907218933, + "learning_rate": 8.323157136075955e-07, + "loss": 0.2254, + "step": 27536 + }, + { + "epoch": 1.5114160263446763, + "grad_norm": 1.7623372077941895, + "learning_rate": 8.309764357632199e-07, + "loss": 0.3134, + "step": 27538 + }, + { + "epoch": 1.5115257958287596, + "grad_norm": 1.129217505455017, + "learning_rate": 8.296382180908474e-07, + "loss": 0.17, + "step": 27540 + }, + { + "epoch": 1.511635565312843, + "grad_norm": 0.8465173244476318, + "learning_rate": 8.283010606491754e-07, + "loss": 0.1535, + "step": 27542 + }, + { + "epoch": 1.5117453347969265, + "grad_norm": 1.015540361404419, + "learning_rate": 8.269649634968596e-07, + "loss": 0.1248, + "step": 27544 + }, + { + "epoch": 1.5118551042810098, + "grad_norm": 1.6158629655838013, + "learning_rate": 8.256299266925088e-07, + "loss": 0.1777, + "step": 27546 + }, + { + "epoch": 1.5119648737650933, + "grad_norm": 1.2293801307678223, + "learning_rate": 8.242959502946846e-07, + "loss": 0.2948, + "step": 27548 + }, + { + "epoch": 1.5120746432491767, + "grad_norm": 1.133963942527771, + "learning_rate": 8.229630343619038e-07, + "loss": 0.2666, + "step": 27550 + }, + { + "epoch": 1.5121844127332602, + "grad_norm": 0.8805772662162781, + "learning_rate": 8.21631178952631e-07, + "loss": 0.1176, + "step": 27552 + }, + { + "epoch": 1.5122941822173437, + "grad_norm": 1.4347418546676636, + "learning_rate": 8.203003841252915e-07, + "loss": 0.1694, + "step": 27554 + }, + { + "epoch": 1.512403951701427, + "grad_norm": 1.4436856508255005, + "learning_rate": 8.189706499382555e-07, + "loss": 0.2507, + "step": 27556 + }, + { + "epoch": 1.5125137211855104, + "grad_norm": 1.330896258354187, + "learning_rate": 8.176419764498538e-07, + "loss": 0.1963, + "step": 27558 + }, + { + "epoch": 1.512623490669594, + "grad_norm": 1.1735531091690063, + "learning_rate": 8.163143637183679e-07, + "loss": 0.1381, + "step": 27560 + }, + { + "epoch": 1.5127332601536772, + "grad_norm": 0.8928473591804504, + "learning_rate": 8.14987811802037e-07, + "loss": 0.1679, + "step": 27562 + }, + { + "epoch": 1.5128430296377608, + "grad_norm": 1.1944893598556519, + "learning_rate": 8.136623207590483e-07, + "loss": 0.1742, + "step": 27564 + }, + { + "epoch": 1.512952799121844, + "grad_norm": 0.9508421421051025, + "learning_rate": 8.123378906475437e-07, + "loss": 0.1637, + "step": 27566 + }, + { + "epoch": 1.5130625686059276, + "grad_norm": 1.4458036422729492, + "learning_rate": 8.110145215256159e-07, + "loss": 0.2447, + "step": 27568 + }, + { + "epoch": 1.513172338090011, + "grad_norm": 1.0139144659042358, + "learning_rate": 8.096922134513213e-07, + "loss": 0.1121, + "step": 27570 + }, + { + "epoch": 1.5132821075740943, + "grad_norm": 1.4105784893035889, + "learning_rate": 8.083709664826578e-07, + "loss": 0.1704, + "step": 27572 + }, + { + "epoch": 1.5133918770581778, + "grad_norm": 1.5702275037765503, + "learning_rate": 8.070507806775818e-07, + "loss": 0.2421, + "step": 27574 + }, + { + "epoch": 1.5135016465422613, + "grad_norm": 0.9534794688224792, + "learning_rate": 8.057316560940053e-07, + "loss": 0.1702, + "step": 27576 + }, + { + "epoch": 1.5136114160263445, + "grad_norm": 1.154630422592163, + "learning_rate": 8.044135927897934e-07, + "loss": 0.1793, + "step": 27578 + }, + { + "epoch": 1.5137211855104282, + "grad_norm": 1.1876474618911743, + "learning_rate": 8.030965908227578e-07, + "loss": 0.2103, + "step": 27580 + }, + { + "epoch": 1.5138309549945115, + "grad_norm": 1.1286059617996216, + "learning_rate": 8.017806502506691e-07, + "loss": 0.1166, + "step": 27582 + }, + { + "epoch": 1.513940724478595, + "grad_norm": 0.8938579559326172, + "learning_rate": 8.004657711312564e-07, + "loss": 0.1842, + "step": 27584 + }, + { + "epoch": 1.5140504939626784, + "grad_norm": 0.9943583011627197, + "learning_rate": 7.991519535221925e-07, + "loss": 0.224, + "step": 27586 + }, + { + "epoch": 1.5141602634467617, + "grad_norm": 1.4085503816604614, + "learning_rate": 7.978391974811094e-07, + "loss": 0.1861, + "step": 27588 + }, + { + "epoch": 1.5142700329308454, + "grad_norm": 0.872281014919281, + "learning_rate": 7.965275030655889e-07, + "loss": 0.1157, + "step": 27590 + }, + { + "epoch": 1.5143798024149286, + "grad_norm": 0.7606185674667358, + "learning_rate": 7.952168703331708e-07, + "loss": 0.1019, + "step": 27592 + }, + { + "epoch": 1.5144895718990121, + "grad_norm": 0.8269579410552979, + "learning_rate": 7.939072993413427e-07, + "loss": 0.1319, + "step": 27594 + }, + { + "epoch": 1.5145993413830956, + "grad_norm": 1.1715437173843384, + "learning_rate": 7.925987901475529e-07, + "loss": 0.1577, + "step": 27596 + }, + { + "epoch": 1.5147091108671789, + "grad_norm": 0.983806848526001, + "learning_rate": 7.912913428091972e-07, + "loss": 0.1837, + "step": 27598 + }, + { + "epoch": 1.5148188803512623, + "grad_norm": 1.2233318090438843, + "learning_rate": 7.899849573836271e-07, + "loss": 0.1716, + "step": 27600 + }, + { + "epoch": 1.5149286498353458, + "grad_norm": 1.0270434617996216, + "learning_rate": 7.886796339281466e-07, + "loss": 0.1666, + "step": 27602 + }, + { + "epoch": 1.515038419319429, + "grad_norm": 1.784755825996399, + "learning_rate": 7.873753725000099e-07, + "loss": 0.1597, + "step": 27604 + }, + { + "epoch": 1.5151481888035128, + "grad_norm": 0.9210883378982544, + "learning_rate": 7.860721731564352e-07, + "loss": 0.1325, + "step": 27606 + }, + { + "epoch": 1.515257958287596, + "grad_norm": 1.28836989402771, + "learning_rate": 7.847700359545823e-07, + "loss": 0.2632, + "step": 27608 + }, + { + "epoch": 1.5153677277716795, + "grad_norm": 1.3793734312057495, + "learning_rate": 7.834689609515722e-07, + "loss": 0.1796, + "step": 27610 + }, + { + "epoch": 1.515477497255763, + "grad_norm": 1.0012307167053223, + "learning_rate": 7.821689482044759e-07, + "loss": 0.2462, + "step": 27612 + }, + { + "epoch": 1.5155872667398462, + "grad_norm": 0.773057758808136, + "learning_rate": 7.808699977703171e-07, + "loss": 0.144, + "step": 27614 + }, + { + "epoch": 1.5156970362239297, + "grad_norm": 0.7916688323020935, + "learning_rate": 7.795721097060726e-07, + "loss": 0.103, + "step": 27616 + }, + { + "epoch": 1.5158068057080132, + "grad_norm": 1.1618679761886597, + "learning_rate": 7.782752840686775e-07, + "loss": 0.1643, + "step": 27618 + }, + { + "epoch": 1.5159165751920964, + "grad_norm": 1.1776341199874878, + "learning_rate": 7.769795209150166e-07, + "loss": 0.1443, + "step": 27620 + }, + { + "epoch": 1.5160263446761801, + "grad_norm": 1.1711186170578003, + "learning_rate": 7.756848203019279e-07, + "loss": 0.2105, + "step": 27622 + }, + { + "epoch": 1.5161361141602634, + "grad_norm": 0.6659111976623535, + "learning_rate": 7.743911822862021e-07, + "loss": 0.1695, + "step": 27624 + }, + { + "epoch": 1.5162458836443469, + "grad_norm": 1.1256535053253174, + "learning_rate": 7.730986069245855e-07, + "loss": 0.2494, + "step": 27626 + }, + { + "epoch": 1.5163556531284303, + "grad_norm": 0.8121740221977234, + "learning_rate": 7.718070942737771e-07, + "loss": 0.1321, + "step": 27628 + }, + { + "epoch": 1.5164654226125136, + "grad_norm": 1.214327335357666, + "learning_rate": 7.705166443904261e-07, + "loss": 0.3058, + "step": 27630 + }, + { + "epoch": 1.5165751920965973, + "grad_norm": 1.1723743677139282, + "learning_rate": 7.692272573311426e-07, + "loss": 0.1851, + "step": 27632 + }, + { + "epoch": 1.5166849615806806, + "grad_norm": 1.3747386932373047, + "learning_rate": 7.679389331524789e-07, + "loss": 0.1611, + "step": 27634 + }, + { + "epoch": 1.516794731064764, + "grad_norm": 0.9581610560417175, + "learning_rate": 7.666516719109562e-07, + "loss": 0.1281, + "step": 27636 + }, + { + "epoch": 1.5169045005488475, + "grad_norm": 1.2672735452651978, + "learning_rate": 7.65365473663035e-07, + "loss": 0.1571, + "step": 27638 + }, + { + "epoch": 1.5170142700329308, + "grad_norm": 1.3636596202850342, + "learning_rate": 7.64080338465134e-07, + "loss": 0.1954, + "step": 27640 + }, + { + "epoch": 1.5171240395170142, + "grad_norm": 1.4751129150390625, + "learning_rate": 7.627962663736276e-07, + "loss": 0.2211, + "step": 27642 + }, + { + "epoch": 1.5172338090010977, + "grad_norm": 1.1909788846969604, + "learning_rate": 7.615132574448375e-07, + "loss": 0.1378, + "step": 27644 + }, + { + "epoch": 1.517343578485181, + "grad_norm": 0.9192255139350891, + "learning_rate": 7.602313117350462e-07, + "loss": 0.1288, + "step": 27646 + }, + { + "epoch": 1.5174533479692647, + "grad_norm": 1.0587973594665527, + "learning_rate": 7.589504293004868e-07, + "loss": 0.1436, + "step": 27648 + }, + { + "epoch": 1.517563117453348, + "grad_norm": 1.379966139793396, + "learning_rate": 7.576706101973447e-07, + "loss": 0.1447, + "step": 27650 + }, + { + "epoch": 1.5176728869374314, + "grad_norm": 0.849219799041748, + "learning_rate": 7.563918544817556e-07, + "loss": 0.2129, + "step": 27652 + }, + { + "epoch": 1.5177826564215149, + "grad_norm": 0.9579060077667236, + "learning_rate": 7.551141622098134e-07, + "loss": 0.1451, + "step": 27654 + }, + { + "epoch": 1.5178924259055981, + "grad_norm": 1.7255120277404785, + "learning_rate": 7.53837533437568e-07, + "loss": 0.2386, + "step": 27656 + }, + { + "epoch": 1.5180021953896816, + "grad_norm": 1.5450365543365479, + "learning_rate": 7.525619682210133e-07, + "loss": 0.1777, + "step": 27658 + }, + { + "epoch": 1.518111964873765, + "grad_norm": 1.2828203439712524, + "learning_rate": 7.512874666161074e-07, + "loss": 0.3157, + "step": 27660 + }, + { + "epoch": 1.5182217343578486, + "grad_norm": 1.4227356910705566, + "learning_rate": 7.500140286787499e-07, + "loss": 0.217, + "step": 27662 + }, + { + "epoch": 1.518331503841932, + "grad_norm": 1.2243882417678833, + "learning_rate": 7.487416544648018e-07, + "loss": 0.1899, + "step": 27664 + }, + { + "epoch": 1.5184412733260153, + "grad_norm": 0.9529960751533508, + "learning_rate": 7.474703440300796e-07, + "loss": 0.1939, + "step": 27666 + }, + { + "epoch": 1.5185510428100988, + "grad_norm": 1.365634799003601, + "learning_rate": 7.462000974303441e-07, + "loss": 0.2589, + "step": 27668 + }, + { + "epoch": 1.5186608122941823, + "grad_norm": 1.4836251735687256, + "learning_rate": 7.449309147213173e-07, + "loss": 0.1696, + "step": 27670 + }, + { + "epoch": 1.5187705817782655, + "grad_norm": 0.7399176359176636, + "learning_rate": 7.436627959586745e-07, + "loss": 0.1297, + "step": 27672 + }, + { + "epoch": 1.5188803512623492, + "grad_norm": 1.205418586730957, + "learning_rate": 7.423957411980376e-07, + "loss": 0.1341, + "step": 27674 + }, + { + "epoch": 1.5189901207464325, + "grad_norm": 1.1911473274230957, + "learning_rate": 7.411297504949844e-07, + "loss": 0.2951, + "step": 27676 + }, + { + "epoch": 1.519099890230516, + "grad_norm": 0.918478786945343, + "learning_rate": 7.398648239050537e-07, + "loss": 0.1511, + "step": 27678 + }, + { + "epoch": 1.5192096597145994, + "grad_norm": 1.0206599235534668, + "learning_rate": 7.386009614837236e-07, + "loss": 0.1581, + "step": 27680 + }, + { + "epoch": 1.5193194291986827, + "grad_norm": 1.1492506265640259, + "learning_rate": 7.373381632864384e-07, + "loss": 0.1887, + "step": 27682 + }, + { + "epoch": 1.5194291986827662, + "grad_norm": 1.0596339702606201, + "learning_rate": 7.360764293685901e-07, + "loss": 0.2241, + "step": 27684 + }, + { + "epoch": 1.5195389681668496, + "grad_norm": 0.7994253635406494, + "learning_rate": 7.34815759785526e-07, + "loss": 0.0816, + "step": 27686 + }, + { + "epoch": 1.519648737650933, + "grad_norm": 0.8501916527748108, + "learning_rate": 7.335561545925407e-07, + "loss": 0.164, + "step": 27688 + }, + { + "epoch": 1.5197585071350166, + "grad_norm": 1.068621277809143, + "learning_rate": 7.322976138448873e-07, + "loss": 0.1149, + "step": 27690 + }, + { + "epoch": 1.5198682766190998, + "grad_norm": 1.3822431564331055, + "learning_rate": 7.310401375977771e-07, + "loss": 0.185, + "step": 27692 + }, + { + "epoch": 1.5199780461031833, + "grad_norm": 1.243014931678772, + "learning_rate": 7.297837259063634e-07, + "loss": 0.1654, + "step": 27694 + }, + { + "epoch": 1.5200878155872668, + "grad_norm": 1.1881693601608276, + "learning_rate": 7.285283788257602e-07, + "loss": 0.1891, + "step": 27696 + }, + { + "epoch": 1.52019758507135, + "grad_norm": 1.129004716873169, + "learning_rate": 7.272740964110375e-07, + "loss": 0.1046, + "step": 27698 + }, + { + "epoch": 1.5203073545554338, + "grad_norm": 1.1311862468719482, + "learning_rate": 7.260208787172068e-07, + "loss": 0.1485, + "step": 27700 + }, + { + "epoch": 1.520417124039517, + "grad_norm": 1.116018295288086, + "learning_rate": 7.247687257992464e-07, + "loss": 0.1363, + "step": 27702 + }, + { + "epoch": 1.5205268935236005, + "grad_norm": 1.7817881107330322, + "learning_rate": 7.235176377120761e-07, + "loss": 0.1927, + "step": 27704 + }, + { + "epoch": 1.520636663007684, + "grad_norm": 1.05472731590271, + "learning_rate": 7.222676145105828e-07, + "loss": 0.2162, + "step": 27706 + }, + { + "epoch": 1.5207464324917672, + "grad_norm": 1.504442811012268, + "learning_rate": 7.210186562495919e-07, + "loss": 0.122, + "step": 27708 + }, + { + "epoch": 1.5208562019758507, + "grad_norm": 1.3204855918884277, + "learning_rate": 7.197707629838901e-07, + "loss": 0.1414, + "step": 27710 + }, + { + "epoch": 1.5209659714599342, + "grad_norm": 1.0383588075637817, + "learning_rate": 7.185239347682199e-07, + "loss": 0.2255, + "step": 27712 + }, + { + "epoch": 1.5210757409440174, + "grad_norm": 1.3674553632736206, + "learning_rate": 7.172781716572679e-07, + "loss": 0.1454, + "step": 27714 + }, + { + "epoch": 1.5211855104281011, + "grad_norm": 1.9309853315353394, + "learning_rate": 7.160334737056823e-07, + "loss": 0.234, + "step": 27716 + }, + { + "epoch": 1.5212952799121844, + "grad_norm": 1.2367507219314575, + "learning_rate": 7.147898409680609e-07, + "loss": 0.211, + "step": 27718 + }, + { + "epoch": 1.5214050493962679, + "grad_norm": 0.8922054171562195, + "learning_rate": 7.135472734989573e-07, + "loss": 0.1964, + "step": 27720 + }, + { + "epoch": 1.5215148188803513, + "grad_norm": 1.1431175470352173, + "learning_rate": 7.123057713528752e-07, + "loss": 0.1762, + "step": 27722 + }, + { + "epoch": 1.5216245883644346, + "grad_norm": 1.0797624588012695, + "learning_rate": 7.110653345842683e-07, + "loss": 0.1229, + "step": 27724 + }, + { + "epoch": 1.521734357848518, + "grad_norm": 1.054906964302063, + "learning_rate": 7.098259632475568e-07, + "loss": 0.13, + "step": 27726 + }, + { + "epoch": 1.5218441273326015, + "grad_norm": 0.9465566873550415, + "learning_rate": 7.085876573971029e-07, + "loss": 0.1484, + "step": 27728 + }, + { + "epoch": 1.5219538968166848, + "grad_norm": 1.2685863971710205, + "learning_rate": 7.073504170872213e-07, + "loss": 0.1441, + "step": 27730 + }, + { + "epoch": 1.5220636663007685, + "grad_norm": 1.501059651374817, + "learning_rate": 7.061142423721856e-07, + "loss": 0.1946, + "step": 27732 + }, + { + "epoch": 1.5221734357848518, + "grad_norm": 1.1322863101959229, + "learning_rate": 7.048791333062244e-07, + "loss": 0.1397, + "step": 27734 + }, + { + "epoch": 1.5222832052689352, + "grad_norm": 1.2910445928573608, + "learning_rate": 7.036450899435082e-07, + "loss": 0.1682, + "step": 27736 + }, + { + "epoch": 1.5223929747530187, + "grad_norm": 0.9676533937454224, + "learning_rate": 7.024121123381743e-07, + "loss": 0.2036, + "step": 27738 + }, + { + "epoch": 1.522502744237102, + "grad_norm": 1.3169355392456055, + "learning_rate": 7.011802005443019e-07, + "loss": 0.2047, + "step": 27740 + }, + { + "epoch": 1.5226125137211857, + "grad_norm": 1.5424646139144897, + "learning_rate": 6.999493546159336e-07, + "loss": 0.1614, + "step": 27742 + }, + { + "epoch": 1.522722283205269, + "grad_norm": 1.8025678396224976, + "learning_rate": 6.987195746070568e-07, + "loss": 0.206, + "step": 27744 + }, + { + "epoch": 1.5228320526893524, + "grad_norm": 1.0611546039581299, + "learning_rate": 6.974908605716202e-07, + "loss": 0.158, + "step": 27746 + }, + { + "epoch": 1.5229418221734359, + "grad_norm": 1.088887333869934, + "learning_rate": 6.962632125635166e-07, + "loss": 0.1309, + "step": 27748 + }, + { + "epoch": 1.5230515916575191, + "grad_norm": 1.0491576194763184, + "learning_rate": 6.950366306365974e-07, + "loss": 0.1554, + "step": 27750 + }, + { + "epoch": 1.5231613611416026, + "grad_norm": 1.1944104433059692, + "learning_rate": 6.938111148446669e-07, + "loss": 0.1751, + "step": 27752 + }, + { + "epoch": 1.523271130625686, + "grad_norm": 1.1338839530944824, + "learning_rate": 6.92586665241482e-07, + "loss": 0.1366, + "step": 27754 + }, + { + "epoch": 1.5233809001097693, + "grad_norm": 1.1557068824768066, + "learning_rate": 6.913632818807553e-07, + "loss": 0.191, + "step": 27756 + }, + { + "epoch": 1.523490669593853, + "grad_norm": 1.399821162223816, + "learning_rate": 6.901409648161494e-07, + "loss": 0.3108, + "step": 27758 + }, + { + "epoch": 1.5236004390779363, + "grad_norm": 1.0594792366027832, + "learning_rate": 6.889197141012799e-07, + "loss": 0.1706, + "step": 27760 + }, + { + "epoch": 1.5237102085620198, + "grad_norm": 1.7868503332138062, + "learning_rate": 6.876995297897176e-07, + "loss": 0.168, + "step": 27762 + }, + { + "epoch": 1.5238199780461033, + "grad_norm": 1.1786540746688843, + "learning_rate": 6.864804119349866e-07, + "loss": 0.244, + "step": 27764 + }, + { + "epoch": 1.5239297475301865, + "grad_norm": 0.8982097506523132, + "learning_rate": 6.852623605905606e-07, + "loss": 0.1482, + "step": 27766 + }, + { + "epoch": 1.52403951701427, + "grad_norm": 1.4360437393188477, + "learning_rate": 6.840453758098719e-07, + "loss": 0.2454, + "step": 27768 + }, + { + "epoch": 1.5241492864983535, + "grad_norm": 0.911723792552948, + "learning_rate": 6.828294576463029e-07, + "loss": 0.1089, + "step": 27770 + }, + { + "epoch": 1.5242590559824367, + "grad_norm": 1.1042686700820923, + "learning_rate": 6.816146061531914e-07, + "loss": 0.2179, + "step": 27772 + }, + { + "epoch": 1.5243688254665204, + "grad_norm": 0.7197868227958679, + "learning_rate": 6.804008213838253e-07, + "loss": 0.1127, + "step": 27774 + }, + { + "epoch": 1.5244785949506037, + "grad_norm": 0.7843053340911865, + "learning_rate": 6.791881033914454e-07, + "loss": 0.195, + "step": 27776 + }, + { + "epoch": 1.5245883644346871, + "grad_norm": 2.0096232891082764, + "learning_rate": 6.779764522292454e-07, + "loss": 0.2173, + "step": 27778 + }, + { + "epoch": 1.5246981339187706, + "grad_norm": 0.8828389048576355, + "learning_rate": 6.767658679503825e-07, + "loss": 0.1736, + "step": 27780 + }, + { + "epoch": 1.5248079034028539, + "grad_norm": 1.0398470163345337, + "learning_rate": 6.755563506079504e-07, + "loss": 0.1499, + "step": 27782 + }, + { + "epoch": 1.5249176728869376, + "grad_norm": 0.7932868003845215, + "learning_rate": 6.743479002550124e-07, + "loss": 0.2002, + "step": 27784 + }, + { + "epoch": 1.5250274423710208, + "grad_norm": 1.5103216171264648, + "learning_rate": 6.731405169445676e-07, + "loss": 0.2244, + "step": 27786 + }, + { + "epoch": 1.5251372118551043, + "grad_norm": 0.6999813318252563, + "learning_rate": 6.719342007295875e-07, + "loss": 0.1065, + "step": 27788 + }, + { + "epoch": 1.5252469813391878, + "grad_norm": 1.3184428215026855, + "learning_rate": 6.707289516629772e-07, + "loss": 0.1422, + "step": 27790 + }, + { + "epoch": 1.525356750823271, + "grad_norm": 0.8963886499404907, + "learning_rate": 6.695247697976165e-07, + "loss": 0.1479, + "step": 27792 + }, + { + "epoch": 1.5254665203073545, + "grad_norm": 1.0132999420166016, + "learning_rate": 6.683216551863158e-07, + "loss": 0.1924, + "step": 27794 + }, + { + "epoch": 1.525576289791438, + "grad_norm": 0.7074255347251892, + "learning_rate": 6.671196078818581e-07, + "loss": 0.0879, + "step": 27796 + }, + { + "epoch": 1.5256860592755213, + "grad_norm": 1.2487363815307617, + "learning_rate": 6.65918627936965e-07, + "loss": 0.1545, + "step": 27798 + }, + { + "epoch": 1.525795828759605, + "grad_norm": 1.1447410583496094, + "learning_rate": 6.647187154043222e-07, + "loss": 0.1688, + "step": 27800 + }, + { + "epoch": 1.5259055982436882, + "grad_norm": 2.077861785888672, + "learning_rate": 6.63519870336557e-07, + "loss": 0.2663, + "step": 27802 + }, + { + "epoch": 1.5260153677277717, + "grad_norm": 1.1612441539764404, + "learning_rate": 6.623220927862605e-07, + "loss": 0.129, + "step": 27804 + }, + { + "epoch": 1.5261251372118552, + "grad_norm": 1.1634206771850586, + "learning_rate": 6.611253828059771e-07, + "loss": 0.2601, + "step": 27806 + }, + { + "epoch": 1.5262349066959384, + "grad_norm": 0.9833105206489563, + "learning_rate": 6.599297404481952e-07, + "loss": 0.1899, + "step": 27808 + }, + { + "epoch": 1.5263446761800221, + "grad_norm": 1.3493568897247314, + "learning_rate": 6.587351657653645e-07, + "loss": 0.1886, + "step": 27810 + }, + { + "epoch": 1.5264544456641054, + "grad_norm": 1.244808554649353, + "learning_rate": 6.575416588098821e-07, + "loss": 0.2009, + "step": 27812 + }, + { + "epoch": 1.5265642151481889, + "grad_norm": 1.2475641965866089, + "learning_rate": 6.563492196341003e-07, + "loss": 0.1821, + "step": 27814 + }, + { + "epoch": 1.5266739846322723, + "grad_norm": 1.1809413433074951, + "learning_rate": 6.551578482903248e-07, + "loss": 0.1914, + "step": 27816 + }, + { + "epoch": 1.5267837541163556, + "grad_norm": 1.2443811893463135, + "learning_rate": 6.539675448308247e-07, + "loss": 0.176, + "step": 27818 + }, + { + "epoch": 1.526893523600439, + "grad_norm": 1.2827754020690918, + "learning_rate": 6.527783093078027e-07, + "loss": 0.2196, + "step": 27820 + }, + { + "epoch": 1.5270032930845225, + "grad_norm": 1.351997971534729, + "learning_rate": 6.515901417734255e-07, + "loss": 0.1243, + "step": 27822 + }, + { + "epoch": 1.5271130625686058, + "grad_norm": 1.1669329404830933, + "learning_rate": 6.504030422798179e-07, + "loss": 0.1428, + "step": 27824 + }, + { + "epoch": 1.5272228320526895, + "grad_norm": 1.5004547834396362, + "learning_rate": 6.492170108790413e-07, + "loss": 0.172, + "step": 27826 + }, + { + "epoch": 1.5273326015367727, + "grad_norm": 1.0063812732696533, + "learning_rate": 6.480320476231316e-07, + "loss": 0.2341, + "step": 27828 + }, + { + "epoch": 1.5274423710208562, + "grad_norm": 1.2123178243637085, + "learning_rate": 6.468481525640641e-07, + "loss": 0.1455, + "step": 27830 + }, + { + "epoch": 1.5275521405049397, + "grad_norm": 1.0021284818649292, + "learning_rate": 6.456653257537665e-07, + "loss": 0.1109, + "step": 27832 + }, + { + "epoch": 1.527661909989023, + "grad_norm": 1.2440036535263062, + "learning_rate": 6.44483567244128e-07, + "loss": 0.1414, + "step": 27834 + }, + { + "epoch": 1.5277716794731064, + "grad_norm": 1.1792737245559692, + "learning_rate": 6.43302877086982e-07, + "loss": 0.1217, + "step": 27836 + }, + { + "epoch": 1.52788144895719, + "grad_norm": 1.041571855545044, + "learning_rate": 6.421232553341205e-07, + "loss": 0.1326, + "step": 27838 + }, + { + "epoch": 1.5279912184412732, + "grad_norm": 1.0442571640014648, + "learning_rate": 6.409447020372911e-07, + "loss": 0.1966, + "step": 27840 + }, + { + "epoch": 1.5281009879253569, + "grad_norm": 3.0768959522247314, + "learning_rate": 6.397672172481884e-07, + "loss": 0.177, + "step": 27842 + }, + { + "epoch": 1.5282107574094401, + "grad_norm": 1.1524931192398071, + "learning_rate": 6.385908010184627e-07, + "loss": 0.1173, + "step": 27844 + }, + { + "epoch": 1.5283205268935236, + "grad_norm": 0.7188471555709839, + "learning_rate": 6.374154533997145e-07, + "loss": 0.1179, + "step": 27846 + }, + { + "epoch": 1.528430296377607, + "grad_norm": 1.250964641571045, + "learning_rate": 6.362411744435054e-07, + "loss": 0.2692, + "step": 27848 + }, + { + "epoch": 1.5285400658616903, + "grad_norm": 1.176631212234497, + "learning_rate": 6.350679642013413e-07, + "loss": 0.1919, + "step": 27850 + }, + { + "epoch": 1.528649835345774, + "grad_norm": 1.016512393951416, + "learning_rate": 6.338958227246866e-07, + "loss": 0.2509, + "step": 27852 + }, + { + "epoch": 1.5287596048298573, + "grad_norm": 1.5000348091125488, + "learning_rate": 6.327247500649585e-07, + "loss": 0.205, + "step": 27854 + }, + { + "epoch": 1.5288693743139408, + "grad_norm": 0.9286444187164307, + "learning_rate": 6.315547462735244e-07, + "loss": 0.2329, + "step": 27856 + }, + { + "epoch": 1.5289791437980242, + "grad_norm": 1.5048102140426636, + "learning_rate": 6.303858114017041e-07, + "loss": 0.1781, + "step": 27858 + }, + { + "epoch": 1.5290889132821075, + "grad_norm": 1.117926001548767, + "learning_rate": 6.29217945500779e-07, + "loss": 0.1937, + "step": 27860 + }, + { + "epoch": 1.529198682766191, + "grad_norm": 1.2606921195983887, + "learning_rate": 6.280511486219721e-07, + "loss": 0.169, + "step": 27862 + }, + { + "epoch": 1.5293084522502745, + "grad_norm": 1.0531545877456665, + "learning_rate": 6.268854208164643e-07, + "loss": 0.1904, + "step": 27864 + }, + { + "epoch": 1.5294182217343577, + "grad_norm": 1.5289292335510254, + "learning_rate": 6.257207621353927e-07, + "loss": 0.3371, + "step": 27866 + }, + { + "epoch": 1.5295279912184414, + "grad_norm": 0.9454666376113892, + "learning_rate": 6.245571726298471e-07, + "loss": 0.1372, + "step": 27868 + }, + { + "epoch": 1.5296377607025247, + "grad_norm": 0.8915786147117615, + "learning_rate": 6.233946523508644e-07, + "loss": 0.1254, + "step": 27870 + }, + { + "epoch": 1.5297475301866081, + "grad_norm": 1.0990098714828491, + "learning_rate": 6.222332013494397e-07, + "loss": 0.2015, + "step": 27872 + }, + { + "epoch": 1.5298572996706916, + "grad_norm": 0.7199308276176453, + "learning_rate": 6.210728196765186e-07, + "loss": 0.2475, + "step": 27874 + }, + { + "epoch": 1.5299670691547749, + "grad_norm": 0.8975943922996521, + "learning_rate": 6.199135073829992e-07, + "loss": 0.1338, + "step": 27876 + }, + { + "epoch": 1.5300768386388583, + "grad_norm": 1.1437098979949951, + "learning_rate": 6.187552645197436e-07, + "loss": 0.2324, + "step": 27878 + }, + { + "epoch": 1.5301866081229418, + "grad_norm": 1.0357623100280762, + "learning_rate": 6.175980911375528e-07, + "loss": 0.146, + "step": 27880 + }, + { + "epoch": 1.530296377607025, + "grad_norm": 0.8329336643218994, + "learning_rate": 6.164419872871835e-07, + "loss": 0.216, + "step": 27882 + }, + { + "epoch": 1.5304061470911088, + "grad_norm": 1.142948865890503, + "learning_rate": 6.152869530193505e-07, + "loss": 0.2798, + "step": 27884 + }, + { + "epoch": 1.530515916575192, + "grad_norm": 2.2418572902679443, + "learning_rate": 6.141329883847191e-07, + "loss": 0.3082, + "step": 27886 + }, + { + "epoch": 1.5306256860592755, + "grad_norm": 1.2511647939682007, + "learning_rate": 6.129800934339042e-07, + "loss": 0.1932, + "step": 27888 + }, + { + "epoch": 1.530735455543359, + "grad_norm": 0.9071027040481567, + "learning_rate": 6.118282682174848e-07, + "loss": 0.2718, + "step": 27890 + }, + { + "epoch": 1.5308452250274422, + "grad_norm": 1.5108518600463867, + "learning_rate": 6.106775127859816e-07, + "loss": 0.2681, + "step": 27892 + }, + { + "epoch": 1.530954994511526, + "grad_norm": 0.9228238463401794, + "learning_rate": 6.095278271898735e-07, + "loss": 0.1672, + "step": 27894 + }, + { + "epoch": 1.5310647639956092, + "grad_norm": 0.9627511501312256, + "learning_rate": 6.0837921147959e-07, + "loss": 0.1652, + "step": 27896 + }, + { + "epoch": 1.5311745334796927, + "grad_norm": 0.8994134068489075, + "learning_rate": 6.072316657055127e-07, + "loss": 0.1716, + "step": 27898 + }, + { + "epoch": 1.5312843029637762, + "grad_norm": 1.9213521480560303, + "learning_rate": 6.06085189917982e-07, + "loss": 0.139, + "step": 27900 + }, + { + "epoch": 1.5313940724478594, + "grad_norm": 0.7938262820243835, + "learning_rate": 6.049397841672883e-07, + "loss": 0.202, + "step": 27902 + }, + { + "epoch": 1.5315038419319429, + "grad_norm": 2.981656789779663, + "learning_rate": 6.037954485036745e-07, + "loss": 0.2034, + "step": 27904 + }, + { + "epoch": 1.5316136114160264, + "grad_norm": 0.8102681040763855, + "learning_rate": 6.026521829773341e-07, + "loss": 0.1561, + "step": 27906 + }, + { + "epoch": 1.5317233809001096, + "grad_norm": 2.2515642642974854, + "learning_rate": 6.015099876384184e-07, + "loss": 0.253, + "step": 27908 + }, + { + "epoch": 1.5318331503841933, + "grad_norm": 1.0686041116714478, + "learning_rate": 6.003688625370291e-07, + "loss": 0.1697, + "step": 27910 + }, + { + "epoch": 1.5319429198682766, + "grad_norm": 1.7737513780593872, + "learning_rate": 5.992288077232261e-07, + "loss": 0.1977, + "step": 27912 + }, + { + "epoch": 1.53205268935236, + "grad_norm": 0.8673962950706482, + "learning_rate": 5.980898232470083e-07, + "loss": 0.1431, + "step": 27914 + }, + { + "epoch": 1.5321624588364435, + "grad_norm": 1.126725196838379, + "learning_rate": 5.969519091583442e-07, + "loss": 0.1737, + "step": 27916 + }, + { + "epoch": 1.5322722283205268, + "grad_norm": 1.3228033781051636, + "learning_rate": 5.958150655071465e-07, + "loss": 0.2268, + "step": 27918 + }, + { + "epoch": 1.5323819978046103, + "grad_norm": 1.947332739830017, + "learning_rate": 5.946792923432837e-07, + "loss": 0.1865, + "step": 27920 + }, + { + "epoch": 1.5324917672886937, + "grad_norm": 0.822090744972229, + "learning_rate": 5.93544589716577e-07, + "loss": 0.1181, + "step": 27922 + }, + { + "epoch": 1.5326015367727772, + "grad_norm": 1.7060623168945312, + "learning_rate": 5.92410957676795e-07, + "loss": 0.2086, + "step": 27924 + }, + { + "epoch": 1.5327113062568607, + "grad_norm": 1.1806353330612183, + "learning_rate": 5.912783962736701e-07, + "loss": 0.158, + "step": 27926 + }, + { + "epoch": 1.532821075740944, + "grad_norm": 1.2925390005111694, + "learning_rate": 5.901469055568792e-07, + "loss": 0.2672, + "step": 27928 + }, + { + "epoch": 1.5329308452250274, + "grad_norm": 1.03067147731781, + "learning_rate": 5.890164855760549e-07, + "loss": 0.1437, + "step": 27930 + }, + { + "epoch": 1.533040614709111, + "grad_norm": 1.611633539199829, + "learning_rate": 5.878871363807853e-07, + "loss": 0.1831, + "step": 27932 + }, + { + "epoch": 1.5331503841931942, + "grad_norm": 0.7891833782196045, + "learning_rate": 5.867588580206085e-07, + "loss": 0.158, + "step": 27934 + }, + { + "epoch": 1.5332601536772779, + "grad_norm": 1.145735263824463, + "learning_rate": 5.856316505450126e-07, + "loss": 0.1643, + "step": 27936 + }, + { + "epoch": 1.5333699231613611, + "grad_norm": 0.9612054228782654, + "learning_rate": 5.845055140034444e-07, + "loss": 0.1725, + "step": 27938 + }, + { + "epoch": 1.5334796926454446, + "grad_norm": 0.9691546559333801, + "learning_rate": 5.833804484453031e-07, + "loss": 0.1681, + "step": 27940 + }, + { + "epoch": 1.533589462129528, + "grad_norm": 1.0141102075576782, + "learning_rate": 5.82256453919941e-07, + "loss": 0.1537, + "step": 27942 + }, + { + "epoch": 1.5336992316136113, + "grad_norm": 1.181300401687622, + "learning_rate": 5.811335304766602e-07, + "loss": 0.2434, + "step": 27944 + }, + { + "epoch": 1.5338090010976948, + "grad_norm": 0.7564032077789307, + "learning_rate": 5.800116781647185e-07, + "loss": 0.1745, + "step": 27946 + }, + { + "epoch": 1.5339187705817783, + "grad_norm": 0.9613982439041138, + "learning_rate": 5.78890897033324e-07, + "loss": 0.1245, + "step": 27948 + }, + { + "epoch": 1.5340285400658615, + "grad_norm": 1.3952115774154663, + "learning_rate": 5.777711871316399e-07, + "loss": 0.136, + "step": 27950 + }, + { + "epoch": 1.5341383095499452, + "grad_norm": 0.6984164118766785, + "learning_rate": 5.766525485087826e-07, + "loss": 0.1247, + "step": 27952 + }, + { + "epoch": 1.5342480790340285, + "grad_norm": 1.2336242198944092, + "learning_rate": 5.755349812138239e-07, + "loss": 0.1821, + "step": 27954 + }, + { + "epoch": 1.534357848518112, + "grad_norm": 1.1790575981140137, + "learning_rate": 5.744184852957801e-07, + "loss": 0.1618, + "step": 27956 + }, + { + "epoch": 1.5344676180021954, + "grad_norm": 1.0023109912872314, + "learning_rate": 5.733030608036316e-07, + "loss": 0.155, + "step": 27958 + }, + { + "epoch": 1.5345773874862787, + "grad_norm": 1.1609718799591064, + "learning_rate": 5.72188707786303e-07, + "loss": 0.1908, + "step": 27960 + }, + { + "epoch": 1.5346871569703624, + "grad_norm": 1.3943250179290771, + "learning_rate": 5.710754262926777e-07, + "loss": 0.2466, + "step": 27962 + }, + { + "epoch": 1.5347969264544457, + "grad_norm": 0.9691044092178345, + "learning_rate": 5.699632163715856e-07, + "loss": 0.1674, + "step": 27964 + }, + { + "epoch": 1.5349066959385291, + "grad_norm": 0.9775404930114746, + "learning_rate": 5.688520780718187e-07, + "loss": 0.1335, + "step": 27966 + }, + { + "epoch": 1.5350164654226126, + "grad_norm": 1.2713466882705688, + "learning_rate": 5.677420114421128e-07, + "loss": 0.1391, + "step": 27968 + }, + { + "epoch": 1.5351262349066959, + "grad_norm": 0.8638578653335571, + "learning_rate": 5.666330165311651e-07, + "loss": 0.1716, + "step": 27970 + }, + { + "epoch": 1.5352360043907793, + "grad_norm": 0.8765590190887451, + "learning_rate": 5.6552509338762e-07, + "loss": 0.2307, + "step": 27972 + }, + { + "epoch": 1.5353457738748628, + "grad_norm": 1.0348143577575684, + "learning_rate": 5.644182420600718e-07, + "loss": 0.1477, + "step": 27974 + }, + { + "epoch": 1.535455543358946, + "grad_norm": 0.9769943952560425, + "learning_rate": 5.633124625970793e-07, + "loss": 0.1175, + "step": 27976 + }, + { + "epoch": 1.5355653128430298, + "grad_norm": 0.6732799410820007, + "learning_rate": 5.62207755047145e-07, + "loss": 0.146, + "step": 27978 + }, + { + "epoch": 1.535675082327113, + "grad_norm": 0.6664646863937378, + "learning_rate": 5.611041194587274e-07, + "loss": 0.0906, + "step": 27980 + }, + { + "epoch": 1.5357848518111965, + "grad_norm": 1.3018503189086914, + "learning_rate": 5.600015558802352e-07, + "loss": 0.1998, + "step": 27982 + }, + { + "epoch": 1.53589462129528, + "grad_norm": 0.9440849423408508, + "learning_rate": 5.589000643600323e-07, + "loss": 0.1677, + "step": 27984 + }, + { + "epoch": 1.5360043907793632, + "grad_norm": 1.0994720458984375, + "learning_rate": 5.577996449464356e-07, + "loss": 0.1576, + "step": 27986 + }, + { + "epoch": 1.5361141602634467, + "grad_norm": 1.1458121538162231, + "learning_rate": 5.567002976877178e-07, + "loss": 0.1721, + "step": 27988 + }, + { + "epoch": 1.5362239297475302, + "grad_norm": 0.9917593002319336, + "learning_rate": 5.556020226320985e-07, + "loss": 0.1556, + "step": 27990 + }, + { + "epoch": 1.5363336992316134, + "grad_norm": 1.4067636728286743, + "learning_rate": 5.545048198277558e-07, + "loss": 0.2016, + "step": 27992 + }, + { + "epoch": 1.5364434687156971, + "grad_norm": 1.1455186605453491, + "learning_rate": 5.53408689322818e-07, + "loss": 0.1666, + "step": 27994 + }, + { + "epoch": 1.5365532381997804, + "grad_norm": 0.9995589852333069, + "learning_rate": 5.523136311653604e-07, + "loss": 0.2694, + "step": 27996 + }, + { + "epoch": 1.5366630076838639, + "grad_norm": 0.94254070520401, + "learning_rate": 5.512196454034279e-07, + "loss": 0.1515, + "step": 27998 + }, + { + "epoch": 1.5367727771679474, + "grad_norm": 1.1240383386611938, + "learning_rate": 5.501267320850018e-07, + "loss": 0.2634, + "step": 28000 + }, + { + "epoch": 1.5368825466520306, + "grad_norm": 0.8478116393089294, + "learning_rate": 5.490348912580268e-07, + "loss": 0.0724, + "step": 28002 + }, + { + "epoch": 1.5369923161361143, + "grad_norm": 1.3499317169189453, + "learning_rate": 5.479441229703924e-07, + "loss": 0.1315, + "step": 28004 + }, + { + "epoch": 1.5371020856201976, + "grad_norm": 1.4005146026611328, + "learning_rate": 5.468544272699466e-07, + "loss": 0.3217, + "step": 28006 + }, + { + "epoch": 1.537211855104281, + "grad_norm": 0.7447348237037659, + "learning_rate": 5.457658042044899e-07, + "loss": 0.2057, + "step": 28008 + }, + { + "epoch": 1.5373216245883645, + "grad_norm": 1.2826619148254395, + "learning_rate": 5.446782538217704e-07, + "loss": 0.1964, + "step": 28010 + }, + { + "epoch": 1.5374313940724478, + "grad_norm": 0.9997040033340454, + "learning_rate": 5.435917761694998e-07, + "loss": 0.1775, + "step": 28012 + }, + { + "epoch": 1.5375411635565313, + "grad_norm": 0.9243218302726746, + "learning_rate": 5.425063712953315e-07, + "loss": 0.1284, + "step": 28014 + }, + { + "epoch": 1.5376509330406147, + "grad_norm": 0.7407538890838623, + "learning_rate": 5.414220392468805e-07, + "loss": 0.1198, + "step": 28016 + }, + { + "epoch": 1.537760702524698, + "grad_norm": 1.2110129594802856, + "learning_rate": 5.403387800717058e-07, + "loss": 0.1722, + "step": 28018 + }, + { + "epoch": 1.5378704720087817, + "grad_norm": 0.6745911836624146, + "learning_rate": 5.392565938173278e-07, + "loss": 0.125, + "step": 28020 + }, + { + "epoch": 1.537980241492865, + "grad_norm": 1.0080435276031494, + "learning_rate": 5.381754805312139e-07, + "loss": 0.1474, + "step": 28022 + }, + { + "epoch": 1.5380900109769484, + "grad_norm": 1.3893017768859863, + "learning_rate": 5.370954402607931e-07, + "loss": 0.2079, + "step": 28024 + }, + { + "epoch": 1.538199780461032, + "grad_norm": 1.0594772100448608, + "learning_rate": 5.360164730534356e-07, + "loss": 0.1612, + "step": 28026 + }, + { + "epoch": 1.5383095499451152, + "grad_norm": 1.3010754585266113, + "learning_rate": 5.349385789564704e-07, + "loss": 0.2077, + "step": 28028 + }, + { + "epoch": 1.5384193194291986, + "grad_norm": 0.925370991230011, + "learning_rate": 5.338617580171817e-07, + "loss": 0.1765, + "step": 28030 + }, + { + "epoch": 1.538529088913282, + "grad_norm": 1.292809009552002, + "learning_rate": 5.327860102828042e-07, + "loss": 0.1711, + "step": 28032 + }, + { + "epoch": 1.5386388583973656, + "grad_norm": 1.0537371635437012, + "learning_rate": 5.317113358005249e-07, + "loss": 0.205, + "step": 28034 + }, + { + "epoch": 1.538748627881449, + "grad_norm": 1.414884328842163, + "learning_rate": 5.306377346174812e-07, + "loss": 0.2224, + "step": 28036 + }, + { + "epoch": 1.5388583973655323, + "grad_norm": 0.7570416331291199, + "learning_rate": 5.295652067807715e-07, + "loss": 0.1257, + "step": 28038 + }, + { + "epoch": 1.5389681668496158, + "grad_norm": 1.1016026735305786, + "learning_rate": 5.28493752337439e-07, + "loss": 0.1848, + "step": 28040 + }, + { + "epoch": 1.5390779363336993, + "grad_norm": 1.2006727457046509, + "learning_rate": 5.274233713344845e-07, + "loss": 0.1468, + "step": 28042 + }, + { + "epoch": 1.5391877058177825, + "grad_norm": 1.0334924459457397, + "learning_rate": 5.263540638188596e-07, + "loss": 0.1278, + "step": 28044 + }, + { + "epoch": 1.5392974753018662, + "grad_norm": 1.02210533618927, + "learning_rate": 5.252858298374657e-07, + "loss": 0.1059, + "step": 28046 + }, + { + "epoch": 1.5394072447859495, + "grad_norm": 1.3762140274047852, + "learning_rate": 5.242186694371682e-07, + "loss": 0.1566, + "step": 28048 + }, + { + "epoch": 1.539517014270033, + "grad_norm": 1.0024489164352417, + "learning_rate": 5.231525826647738e-07, + "loss": 0.1322, + "step": 28050 + }, + { + "epoch": 1.5396267837541164, + "grad_norm": 1.7777718305587769, + "learning_rate": 5.220875695670452e-07, + "loss": 0.2473, + "step": 28052 + }, + { + "epoch": 1.5397365532381997, + "grad_norm": 1.1046221256256104, + "learning_rate": 5.210236301907035e-07, + "loss": 0.1921, + "step": 28054 + }, + { + "epoch": 1.5398463227222832, + "grad_norm": 1.2039786577224731, + "learning_rate": 5.199607645824111e-07, + "loss": 0.2446, + "step": 28056 + }, + { + "epoch": 1.5399560922063666, + "grad_norm": 1.174843192100525, + "learning_rate": 5.18898972788795e-07, + "loss": 0.1523, + "step": 28058 + }, + { + "epoch": 1.54006586169045, + "grad_norm": 2.2158091068267822, + "learning_rate": 5.178382548564287e-07, + "loss": 0.1821, + "step": 28060 + }, + { + "epoch": 1.5401756311745336, + "grad_norm": 1.0725935697555542, + "learning_rate": 5.167786108318473e-07, + "loss": 0.0936, + "step": 28062 + }, + { + "epoch": 1.5402854006586169, + "grad_norm": 0.707046627998352, + "learning_rate": 5.157200407615248e-07, + "loss": 0.0939, + "step": 28064 + }, + { + "epoch": 1.5403951701427003, + "grad_norm": 0.9835708737373352, + "learning_rate": 5.146625446918962e-07, + "loss": 0.1524, + "step": 28066 + }, + { + "epoch": 1.5405049396267838, + "grad_norm": 1.014140009880066, + "learning_rate": 5.136061226693495e-07, + "loss": 0.2128, + "step": 28068 + }, + { + "epoch": 1.540614709110867, + "grad_norm": 1.076727032661438, + "learning_rate": 5.125507747402253e-07, + "loss": 0.1583, + "step": 28070 + }, + { + "epoch": 1.5407244785949508, + "grad_norm": 1.1687750816345215, + "learning_rate": 5.114965009508144e-07, + "loss": 0.1804, + "step": 28072 + }, + { + "epoch": 1.540834248079034, + "grad_norm": 1.1889114379882812, + "learning_rate": 5.104433013473658e-07, + "loss": 0.1483, + "step": 28074 + }, + { + "epoch": 1.5409440175631175, + "grad_norm": 1.2640986442565918, + "learning_rate": 5.093911759760761e-07, + "loss": 0.2286, + "step": 28076 + }, + { + "epoch": 1.541053787047201, + "grad_norm": 1.0337620973587036, + "learning_rate": 5.083401248830944e-07, + "loss": 0.1934, + "step": 28078 + }, + { + "epoch": 1.5411635565312842, + "grad_norm": 0.810384213924408, + "learning_rate": 5.072901481145281e-07, + "loss": 0.1145, + "step": 28080 + }, + { + "epoch": 1.5412733260153677, + "grad_norm": 1.3372855186462402, + "learning_rate": 5.062412457164322e-07, + "loss": 0.1895, + "step": 28082 + }, + { + "epoch": 1.5413830954994512, + "grad_norm": 4.770977973937988, + "learning_rate": 5.051934177348172e-07, + "loss": 0.1805, + "step": 28084 + }, + { + "epoch": 1.5414928649835344, + "grad_norm": 1.118371844291687, + "learning_rate": 5.041466642156462e-07, + "loss": 0.1446, + "step": 28086 + }, + { + "epoch": 1.5416026344676181, + "grad_norm": 0.7991020083427429, + "learning_rate": 5.031009852048352e-07, + "loss": 0.102, + "step": 28088 + }, + { + "epoch": 1.5417124039517014, + "grad_norm": 1.222517490386963, + "learning_rate": 5.020563807482559e-07, + "loss": 0.3031, + "step": 28090 + }, + { + "epoch": 1.5418221734357849, + "grad_norm": 1.4787869453430176, + "learning_rate": 5.010128508917245e-07, + "loss": 0.1708, + "step": 28092 + }, + { + "epoch": 1.5419319429198683, + "grad_norm": 1.1271768808364868, + "learning_rate": 4.999703956810181e-07, + "loss": 0.1768, + "step": 28094 + }, + { + "epoch": 1.5420417124039516, + "grad_norm": 1.3211973905563354, + "learning_rate": 4.989290151618614e-07, + "loss": 0.1894, + "step": 28096 + }, + { + "epoch": 1.542151481888035, + "grad_norm": 1.3459482192993164, + "learning_rate": 4.978887093799401e-07, + "loss": 0.2308, + "step": 28098 + }, + { + "epoch": 1.5422612513721186, + "grad_norm": 1.1847347021102905, + "learning_rate": 4.968494783808814e-07, + "loss": 0.1357, + "step": 28100 + }, + { + "epoch": 1.5423710208562018, + "grad_norm": 0.9767933487892151, + "learning_rate": 4.958113222102739e-07, + "loss": 0.1833, + "step": 28102 + }, + { + "epoch": 1.5424807903402855, + "grad_norm": 1.4089782238006592, + "learning_rate": 4.947742409136535e-07, + "loss": 0.2283, + "step": 28104 + }, + { + "epoch": 1.5425905598243688, + "grad_norm": 1.2953860759735107, + "learning_rate": 4.937382345365172e-07, + "loss": 0.2705, + "step": 28106 + }, + { + "epoch": 1.5427003293084522, + "grad_norm": 0.8515232801437378, + "learning_rate": 4.927033031243006e-07, + "loss": 0.1592, + "step": 28108 + }, + { + "epoch": 1.5428100987925357, + "grad_norm": 0.6719789505004883, + "learning_rate": 4.916694467224092e-07, + "loss": 0.1619, + "step": 28110 + }, + { + "epoch": 1.542919868276619, + "grad_norm": 1.0359382629394531, + "learning_rate": 4.906366653761901e-07, + "loss": 0.2422, + "step": 28112 + }, + { + "epoch": 1.5430296377607027, + "grad_norm": 0.7707464098930359, + "learning_rate": 4.896049591309459e-07, + "loss": 0.2391, + "step": 28114 + }, + { + "epoch": 1.543139407244786, + "grad_norm": 1.0443133115768433, + "learning_rate": 4.885743280319321e-07, + "loss": 0.1808, + "step": 28116 + }, + { + "epoch": 1.5432491767288694, + "grad_norm": 0.9318675994873047, + "learning_rate": 4.87544772124357e-07, + "loss": 0.1718, + "step": 28118 + }, + { + "epoch": 1.5433589462129529, + "grad_norm": 0.8661987781524658, + "learning_rate": 4.865162914533816e-07, + "loss": 0.0859, + "step": 28120 + }, + { + "epoch": 1.5434687156970361, + "grad_norm": 1.3587747812271118, + "learning_rate": 4.854888860641199e-07, + "loss": 0.1806, + "step": 28122 + }, + { + "epoch": 1.5435784851811196, + "grad_norm": 1.119644045829773, + "learning_rate": 4.844625560016442e-07, + "loss": 0.2502, + "step": 28124 + }, + { + "epoch": 1.543688254665203, + "grad_norm": 0.8434059619903564, + "learning_rate": 4.834373013109683e-07, + "loss": 0.1225, + "step": 28126 + }, + { + "epoch": 1.5437980241492864, + "grad_norm": 0.9999926090240479, + "learning_rate": 4.824131220370648e-07, + "loss": 0.1073, + "step": 28128 + }, + { + "epoch": 1.54390779363337, + "grad_norm": 1.656816005706787, + "learning_rate": 4.813900182248643e-07, + "loss": 0.2314, + "step": 28130 + }, + { + "epoch": 1.5440175631174533, + "grad_norm": 1.1145657300949097, + "learning_rate": 4.803679899192392e-07, + "loss": 0.1192, + "step": 28132 + }, + { + "epoch": 1.5441273326015368, + "grad_norm": 1.4388474225997925, + "learning_rate": 4.793470371650233e-07, + "loss": 0.2641, + "step": 28134 + }, + { + "epoch": 1.5442371020856203, + "grad_norm": 1.0736171007156372, + "learning_rate": 4.783271600070027e-07, + "loss": 0.1291, + "step": 28136 + }, + { + "epoch": 1.5443468715697035, + "grad_norm": 1.688371181488037, + "learning_rate": 4.77308358489914e-07, + "loss": 0.2737, + "step": 28138 + }, + { + "epoch": 1.544456641053787, + "grad_norm": 1.1203593015670776, + "learning_rate": 4.7629063265844096e-07, + "loss": 0.1903, + "step": 28140 + }, + { + "epoch": 1.5445664105378705, + "grad_norm": 0.8131622672080994, + "learning_rate": 4.752739825572339e-07, + "loss": 0.1263, + "step": 28142 + }, + { + "epoch": 1.544676180021954, + "grad_norm": 1.4857317209243774, + "learning_rate": 4.742584082308821e-07, + "loss": 0.3195, + "step": 28144 + }, + { + "epoch": 1.5447859495060374, + "grad_norm": 1.343831181526184, + "learning_rate": 4.7324390972393604e-07, + "loss": 0.1761, + "step": 28146 + }, + { + "epoch": 1.5448957189901207, + "grad_norm": 0.8597972393035889, + "learning_rate": 4.7223048708089637e-07, + "loss": 0.1995, + "step": 28148 + }, + { + "epoch": 1.5450054884742042, + "grad_norm": 1.163215160369873, + "learning_rate": 4.7121814034621623e-07, + "loss": 0.1874, + "step": 28150 + }, + { + "epoch": 1.5451152579582876, + "grad_norm": 1.4775395393371582, + "learning_rate": 4.7020686956430473e-07, + "loss": 0.3239, + "step": 28152 + }, + { + "epoch": 1.545225027442371, + "grad_norm": 0.9727715849876404, + "learning_rate": 4.6919667477952066e-07, + "loss": 0.1791, + "step": 28154 + }, + { + "epoch": 1.5453347969264546, + "grad_norm": 1.1199527978897095, + "learning_rate": 4.6818755603617316e-07, + "loss": 0.162, + "step": 28156 + }, + { + "epoch": 1.5454445664105378, + "grad_norm": 0.9024277925491333, + "learning_rate": 4.671795133785267e-07, + "loss": 0.1661, + "step": 28158 + }, + { + "epoch": 1.5455543358946213, + "grad_norm": 1.0946967601776123, + "learning_rate": 4.661725468508016e-07, + "loss": 0.1328, + "step": 28160 + }, + { + "epoch": 1.5456641053787048, + "grad_norm": 1.1371290683746338, + "learning_rate": 4.6516665649716797e-07, + "loss": 0.2591, + "step": 28162 + }, + { + "epoch": 1.545773874862788, + "grad_norm": 1.235611081123352, + "learning_rate": 4.6416184236175163e-07, + "loss": 0.1411, + "step": 28164 + }, + { + "epoch": 1.5458836443468715, + "grad_norm": 1.1756800413131714, + "learning_rate": 4.6315810448862295e-07, + "loss": 0.1806, + "step": 28166 + }, + { + "epoch": 1.545993413830955, + "grad_norm": 1.132108211517334, + "learning_rate": 4.621554429218161e-07, + "loss": 0.209, + "step": 28168 + }, + { + "epoch": 1.5461031833150383, + "grad_norm": 1.119802474975586, + "learning_rate": 4.6115385770530706e-07, + "loss": 0.2838, + "step": 28170 + }, + { + "epoch": 1.546212952799122, + "grad_norm": 0.8155277967453003, + "learning_rate": 4.6015334888303563e-07, + "loss": 0.1674, + "step": 28172 + }, + { + "epoch": 1.5463227222832052, + "grad_norm": 0.7923842072486877, + "learning_rate": 4.5915391649888615e-07, + "loss": 0.1383, + "step": 28174 + }, + { + "epoch": 1.5464324917672887, + "grad_norm": 1.2734798192977905, + "learning_rate": 4.5815556059670137e-07, + "loss": 0.1924, + "step": 28176 + }, + { + "epoch": 1.5465422612513722, + "grad_norm": 0.9863306283950806, + "learning_rate": 4.571582812202685e-07, + "loss": 0.1338, + "step": 28178 + }, + { + "epoch": 1.5466520307354554, + "grad_norm": 1.1609776020050049, + "learning_rate": 4.561620784133386e-07, + "loss": 0.1601, + "step": 28180 + }, + { + "epoch": 1.5467618002195391, + "grad_norm": 1.6203643083572388, + "learning_rate": 4.5516695221960724e-07, + "loss": 0.1192, + "step": 28182 + }, + { + "epoch": 1.5468715697036224, + "grad_norm": 1.1054646968841553, + "learning_rate": 4.541729026827285e-07, + "loss": 0.185, + "step": 28184 + }, + { + "epoch": 1.5469813391877059, + "grad_norm": 1.001572847366333, + "learning_rate": 4.5317992984630354e-07, + "loss": 0.2206, + "step": 28186 + }, + { + "epoch": 1.5470911086717893, + "grad_norm": 1.5263299942016602, + "learning_rate": 4.5218803375388917e-07, + "loss": 0.2361, + "step": 28188 + }, + { + "epoch": 1.5472008781558726, + "grad_norm": 2.1067862510681152, + "learning_rate": 4.511972144489979e-07, + "loss": 0.2644, + "step": 28190 + }, + { + "epoch": 1.547310647639956, + "grad_norm": 1.8534255027770996, + "learning_rate": 4.502074719750865e-07, + "loss": 0.2015, + "step": 28192 + }, + { + "epoch": 1.5474204171240395, + "grad_norm": 1.1607203483581543, + "learning_rate": 4.4921880637557314e-07, + "loss": 0.114, + "step": 28194 + }, + { + "epoch": 1.5475301866081228, + "grad_norm": 0.9252521991729736, + "learning_rate": 4.482312176938258e-07, + "loss": 0.1166, + "step": 28196 + }, + { + "epoch": 1.5476399560922065, + "grad_norm": 0.8317095041275024, + "learning_rate": 4.4724470597316546e-07, + "loss": 0.1249, + "step": 28198 + }, + { + "epoch": 1.5477497255762898, + "grad_norm": 1.0635161399841309, + "learning_rate": 4.462592712568631e-07, + "loss": 0.115, + "step": 28200 + }, + { + "epoch": 1.5478594950603732, + "grad_norm": 1.9092531204223633, + "learning_rate": 4.4527491358814786e-07, + "loss": 0.1755, + "step": 28202 + }, + { + "epoch": 1.5479692645444567, + "grad_norm": 1.0793535709381104, + "learning_rate": 4.4429163301019373e-07, + "loss": 0.1435, + "step": 28204 + }, + { + "epoch": 1.54807903402854, + "grad_norm": 0.8457536697387695, + "learning_rate": 4.4330942956613555e-07, + "loss": 0.1343, + "step": 28206 + }, + { + "epoch": 1.5481888035126234, + "grad_norm": 1.194759726524353, + "learning_rate": 4.423283032990555e-07, + "loss": 0.1922, + "step": 28208 + }, + { + "epoch": 1.548298572996707, + "grad_norm": 1.3483673334121704, + "learning_rate": 4.41348254251997e-07, + "loss": 0.3144, + "step": 28210 + }, + { + "epoch": 1.5484083424807902, + "grad_norm": 0.938708484172821, + "learning_rate": 4.403692824679423e-07, + "loss": 0.1156, + "step": 28212 + }, + { + "epoch": 1.5485181119648739, + "grad_norm": 1.0521138906478882, + "learning_rate": 4.393913879898376e-07, + "loss": 0.1521, + "step": 28214 + }, + { + "epoch": 1.5486278814489571, + "grad_norm": 1.5318354368209839, + "learning_rate": 4.3841457086057634e-07, + "loss": 0.2141, + "step": 28216 + }, + { + "epoch": 1.5487376509330406, + "grad_norm": 1.118517279624939, + "learning_rate": 4.3743883112300763e-07, + "loss": 0.2633, + "step": 28218 + }, + { + "epoch": 1.548847420417124, + "grad_norm": 1.2170153856277466, + "learning_rate": 4.3646416881993055e-07, + "loss": 0.1698, + "step": 28220 + }, + { + "epoch": 1.5489571899012073, + "grad_norm": 1.1763687133789062, + "learning_rate": 4.354905839941026e-07, + "loss": 0.1309, + "step": 28222 + }, + { + "epoch": 1.549066959385291, + "grad_norm": 1.060018539428711, + "learning_rate": 4.3451807668822566e-07, + "loss": 0.112, + "step": 28224 + }, + { + "epoch": 1.5491767288693743, + "grad_norm": 0.9429810047149658, + "learning_rate": 4.3354664694496284e-07, + "loss": 0.14, + "step": 28226 + }, + { + "epoch": 1.5492864983534578, + "grad_norm": 1.426296353340149, + "learning_rate": 4.3257629480692173e-07, + "loss": 0.2142, + "step": 28228 + }, + { + "epoch": 1.5493962678375413, + "grad_norm": 0.7807531952857971, + "learning_rate": 4.3160702031666557e-07, + "loss": 0.1102, + "step": 28230 + }, + { + "epoch": 1.5495060373216245, + "grad_norm": 0.9539024829864502, + "learning_rate": 4.306388235167158e-07, + "loss": 0.1406, + "step": 28232 + }, + { + "epoch": 1.549615806805708, + "grad_norm": 1.0059711933135986, + "learning_rate": 4.296717044495441e-07, + "loss": 0.198, + "step": 28234 + }, + { + "epoch": 1.5497255762897915, + "grad_norm": 1.0348507165908813, + "learning_rate": 4.287056631575664e-07, + "loss": 0.1438, + "step": 28236 + }, + { + "epoch": 1.5498353457738747, + "grad_norm": 1.394868016242981, + "learning_rate": 4.277406996831601e-07, + "loss": 0.1977, + "step": 28238 + }, + { + "epoch": 1.5499451152579584, + "grad_norm": 1.127304196357727, + "learning_rate": 4.267768140686579e-07, + "loss": 0.2528, + "step": 28240 + }, + { + "epoch": 1.5500548847420417, + "grad_norm": 1.037698745727539, + "learning_rate": 4.2581400635633427e-07, + "loss": 0.133, + "step": 28242 + }, + { + "epoch": 1.5501646542261251, + "grad_norm": 1.2879490852355957, + "learning_rate": 4.2485227658842775e-07, + "loss": 0.205, + "step": 28244 + }, + { + "epoch": 1.5502744237102086, + "grad_norm": 1.1362242698669434, + "learning_rate": 4.238916248071212e-07, + "loss": 0.134, + "step": 28246 + }, + { + "epoch": 1.5503841931942919, + "grad_norm": 1.251625895500183, + "learning_rate": 4.2293205105455593e-07, + "loss": 0.1384, + "step": 28248 + }, + { + "epoch": 1.5504939626783754, + "grad_norm": 0.8830093741416931, + "learning_rate": 4.219735553728205e-07, + "loss": 0.1573, + "step": 28250 + }, + { + "epoch": 1.5506037321624588, + "grad_norm": 1.1083581447601318, + "learning_rate": 4.2101613780396176e-07, + "loss": 0.1313, + "step": 28252 + }, + { + "epoch": 1.5507135016465423, + "grad_norm": 1.2442582845687866, + "learning_rate": 4.200597983899768e-07, + "loss": 0.1228, + "step": 28254 + }, + { + "epoch": 1.5508232711306258, + "grad_norm": 1.316465973854065, + "learning_rate": 4.191045371728125e-07, + "loss": 0.3186, + "step": 28256 + }, + { + "epoch": 1.550933040614709, + "grad_norm": 0.9952749013900757, + "learning_rate": 4.181503541943743e-07, + "loss": 0.2843, + "step": 28258 + }, + { + "epoch": 1.5510428100987925, + "grad_norm": 1.2398207187652588, + "learning_rate": 4.171972494965176e-07, + "loss": 0.1815, + "step": 28260 + }, + { + "epoch": 1.551152579582876, + "grad_norm": 2.9251646995544434, + "learning_rate": 4.1624522312104786e-07, + "loss": 0.2238, + "step": 28262 + }, + { + "epoch": 1.5512623490669593, + "grad_norm": 3.3407769203186035, + "learning_rate": 4.1529427510972883e-07, + "loss": 0.1808, + "step": 28264 + }, + { + "epoch": 1.551372118551043, + "grad_norm": 0.8669270873069763, + "learning_rate": 4.143444055042661e-07, + "loss": 0.22, + "step": 28266 + }, + { + "epoch": 1.5514818880351262, + "grad_norm": 1.1892039775848389, + "learning_rate": 4.1339561434633743e-07, + "loss": 0.1594, + "step": 28268 + }, + { + "epoch": 1.5515916575192097, + "grad_norm": 1.017317295074463, + "learning_rate": 4.124479016775512e-07, + "loss": 0.2505, + "step": 28270 + }, + { + "epoch": 1.5517014270032932, + "grad_norm": 1.0012966394424438, + "learning_rate": 4.1150126753948247e-07, + "loss": 0.1521, + "step": 28272 + }, + { + "epoch": 1.5518111964873764, + "grad_norm": 1.589140772819519, + "learning_rate": 4.105557119736592e-07, + "loss": 0.2295, + "step": 28274 + }, + { + "epoch": 1.55192096597146, + "grad_norm": 0.9552419185638428, + "learning_rate": 4.096112350215536e-07, + "loss": 0.2032, + "step": 28276 + }, + { + "epoch": 1.5520307354555434, + "grad_norm": 1.0236040353775024, + "learning_rate": 4.086678367245966e-07, + "loss": 0.1393, + "step": 28278 + }, + { + "epoch": 1.5521405049396266, + "grad_norm": 2.898850917816162, + "learning_rate": 4.077255171241662e-07, + "loss": 0.1921, + "step": 28280 + }, + { + "epoch": 1.5522502744237103, + "grad_norm": 1.175774097442627, + "learning_rate": 4.067842762616014e-07, + "loss": 0.172, + "step": 28282 + }, + { + "epoch": 1.5523600439077936, + "grad_norm": 1.1581766605377197, + "learning_rate": 4.0584411417819157e-07, + "loss": 0.1729, + "step": 28284 + }, + { + "epoch": 1.552469813391877, + "grad_norm": 1.46867036819458, + "learning_rate": 4.0490503091517306e-07, + "loss": 0.2177, + "step": 28286 + }, + { + "epoch": 1.5525795828759605, + "grad_norm": 0.6559618711471558, + "learning_rate": 4.039670265137407e-07, + "loss": 0.0982, + "step": 28288 + }, + { + "epoch": 1.5526893523600438, + "grad_norm": 1.5782201290130615, + "learning_rate": 4.030301010150367e-07, + "loss": 0.1244, + "step": 28290 + }, + { + "epoch": 1.5527991218441275, + "grad_norm": 1.5223668813705444, + "learning_rate": 4.0209425446016134e-07, + "loss": 0.2181, + "step": 28292 + }, + { + "epoch": 1.5529088913282107, + "grad_norm": 1.8177438974380493, + "learning_rate": 4.01159486890168e-07, + "loss": 0.1745, + "step": 28294 + }, + { + "epoch": 1.5530186608122942, + "grad_norm": 0.9552173614501953, + "learning_rate": 4.002257983460572e-07, + "loss": 0.1364, + "step": 28296 + }, + { + "epoch": 1.5531284302963777, + "grad_norm": 1.216094970703125, + "learning_rate": 3.9929318886878507e-07, + "loss": 0.1837, + "step": 28298 + }, + { + "epoch": 1.553238199780461, + "grad_norm": 1.118329405784607, + "learning_rate": 3.983616584992578e-07, + "loss": 0.2479, + "step": 28300 + }, + { + "epoch": 1.5533479692645444, + "grad_norm": 0.9765486717224121, + "learning_rate": 3.9743120727834537e-07, + "loss": 0.1707, + "step": 28302 + }, + { + "epoch": 1.553457738748628, + "grad_norm": 0.9236847758293152, + "learning_rate": 3.965018352468541e-07, + "loss": 0.1341, + "step": 28304 + }, + { + "epoch": 1.5535675082327112, + "grad_norm": 1.0868184566497803, + "learning_rate": 3.9557354244555423e-07, + "loss": 0.1604, + "step": 28306 + }, + { + "epoch": 1.5536772777167949, + "grad_norm": 2.0598769187927246, + "learning_rate": 3.9464632891516306e-07, + "loss": 0.1999, + "step": 28308 + }, + { + "epoch": 1.5537870472008781, + "grad_norm": 0.9478192329406738, + "learning_rate": 3.9372019469635645e-07, + "loss": 0.1184, + "step": 28310 + }, + { + "epoch": 1.5538968166849616, + "grad_norm": 1.5994267463684082, + "learning_rate": 3.927951398297547e-07, + "loss": 0.2829, + "step": 28312 + }, + { + "epoch": 1.554006586169045, + "grad_norm": 0.9449111819267273, + "learning_rate": 3.918711643559392e-07, + "loss": 0.1357, + "step": 28314 + }, + { + "epoch": 1.5541163556531283, + "grad_norm": 1.6350457668304443, + "learning_rate": 3.9094826831543587e-07, + "loss": 0.2645, + "step": 28316 + }, + { + "epoch": 1.5542261251372118, + "grad_norm": 4.476511478424072, + "learning_rate": 3.9002645174873175e-07, + "loss": 0.2397, + "step": 28318 + }, + { + "epoch": 1.5543358946212953, + "grad_norm": 1.2588614225387573, + "learning_rate": 3.8910571469626123e-07, + "loss": 0.1574, + "step": 28320 + }, + { + "epoch": 1.5544456641053785, + "grad_norm": 1.30306875705719, + "learning_rate": 3.881860571984086e-07, + "loss": 0.2228, + "step": 28322 + }, + { + "epoch": 1.5545554335894622, + "grad_norm": 0.8720991611480713, + "learning_rate": 3.8726747929551943e-07, + "loss": 0.1144, + "step": 28324 + }, + { + "epoch": 1.5546652030735455, + "grad_norm": 1.0359739065170288, + "learning_rate": 3.8634998102788644e-07, + "loss": 0.1652, + "step": 28326 + }, + { + "epoch": 1.554774972557629, + "grad_norm": 1.2136998176574707, + "learning_rate": 3.854335624357497e-07, + "loss": 0.1364, + "step": 28328 + }, + { + "epoch": 1.5548847420417125, + "grad_norm": 1.5403825044631958, + "learning_rate": 3.8451822355931313e-07, + "loss": 0.2528, + "step": 28330 + }, + { + "epoch": 1.5549945115257957, + "grad_norm": 1.430885910987854, + "learning_rate": 3.836039644387307e-07, + "loss": 0.15, + "step": 28332 + }, + { + "epoch": 1.5551042810098794, + "grad_norm": 1.663984775543213, + "learning_rate": 3.8269078511410093e-07, + "loss": 0.1209, + "step": 28334 + }, + { + "epoch": 1.5552140504939627, + "grad_norm": 1.352622389793396, + "learning_rate": 3.817786856254807e-07, + "loss": 0.2739, + "step": 28336 + }, + { + "epoch": 1.5553238199780461, + "grad_norm": 0.8448898792266846, + "learning_rate": 3.8086766601288236e-07, + "loss": 0.1364, + "step": 28338 + }, + { + "epoch": 1.5554335894621296, + "grad_norm": 1.2724559307098389, + "learning_rate": 3.799577263162685e-07, + "loss": 0.1498, + "step": 28340 + }, + { + "epoch": 1.5555433589462129, + "grad_norm": 1.152758240699768, + "learning_rate": 3.7904886657554593e-07, + "loss": 0.2359, + "step": 28342 + }, + { + "epoch": 1.5556531284302964, + "grad_norm": 1.0433661937713623, + "learning_rate": 3.781410868305885e-07, + "loss": 0.2036, + "step": 28344 + }, + { + "epoch": 1.5557628979143798, + "grad_norm": 1.5175403356552124, + "learning_rate": 3.7723438712121696e-07, + "loss": 0.1542, + "step": 28346 + }, + { + "epoch": 1.555872667398463, + "grad_norm": 1.2343670129776, + "learning_rate": 3.763287674871996e-07, + "loss": 0.1554, + "step": 28348 + }, + { + "epoch": 1.5559824368825468, + "grad_norm": 1.9255826473236084, + "learning_rate": 3.7542422796826013e-07, + "loss": 0.186, + "step": 28350 + }, + { + "epoch": 1.55609220636663, + "grad_norm": 0.8030907511711121, + "learning_rate": 3.745207686040808e-07, + "loss": 0.1217, + "step": 28352 + }, + { + "epoch": 1.5562019758507135, + "grad_norm": 1.1092188358306885, + "learning_rate": 3.7361838943428815e-07, + "loss": 0.217, + "step": 28354 + }, + { + "epoch": 1.556311745334797, + "grad_norm": 1.173978567123413, + "learning_rate": 3.7271709049846724e-07, + "loss": 0.2352, + "step": 28356 + }, + { + "epoch": 1.5564215148188802, + "grad_norm": 0.9608809351921082, + "learning_rate": 3.718168718361531e-07, + "loss": 0.1908, + "step": 28358 + }, + { + "epoch": 1.5565312843029637, + "grad_norm": 1.1521832942962646, + "learning_rate": 3.709177334868308e-07, + "loss": 0.14, + "step": 28360 + }, + { + "epoch": 1.5566410537870472, + "grad_norm": 1.5232665538787842, + "learning_rate": 3.7001967548994653e-07, + "loss": 0.2545, + "step": 28362 + }, + { + "epoch": 1.5567508232711307, + "grad_norm": 1.0586833953857422, + "learning_rate": 3.691226978848883e-07, + "loss": 0.1973, + "step": 28364 + }, + { + "epoch": 1.5568605927552142, + "grad_norm": 1.4468789100646973, + "learning_rate": 3.6822680071100236e-07, + "loss": 0.198, + "step": 28366 + }, + { + "epoch": 1.5569703622392974, + "grad_norm": 1.2374430894851685, + "learning_rate": 3.673319840075934e-07, + "loss": 0.2287, + "step": 28368 + }, + { + "epoch": 1.557080131723381, + "grad_norm": 0.971501350402832, + "learning_rate": 3.66438247813905e-07, + "loss": 0.113, + "step": 28370 + }, + { + "epoch": 1.5571899012074644, + "grad_norm": 1.420649528503418, + "learning_rate": 3.6554559216914474e-07, + "loss": 0.2073, + "step": 28372 + }, + { + "epoch": 1.5572996706915476, + "grad_norm": 1.355025053024292, + "learning_rate": 3.646540171124674e-07, + "loss": 0.2146, + "step": 28374 + }, + { + "epoch": 1.5574094401756313, + "grad_norm": 0.9175900816917419, + "learning_rate": 3.637635226829833e-07, + "loss": 0.194, + "step": 28376 + }, + { + "epoch": 1.5575192096597146, + "grad_norm": 1.514100432395935, + "learning_rate": 3.6287410891975014e-07, + "loss": 0.2005, + "step": 28378 + }, + { + "epoch": 1.557628979143798, + "grad_norm": 1.3865598440170288, + "learning_rate": 3.6198577586178673e-07, + "loss": 0.2498, + "step": 28380 + }, + { + "epoch": 1.5577387486278815, + "grad_norm": 2.8058273792266846, + "learning_rate": 3.6109852354805627e-07, + "loss": 0.1796, + "step": 28382 + }, + { + "epoch": 1.5578485181119648, + "grad_norm": 1.2096154689788818, + "learning_rate": 3.6021235201748327e-07, + "loss": 0.2106, + "step": 28384 + }, + { + "epoch": 1.5579582875960483, + "grad_norm": 1.274609923362732, + "learning_rate": 3.59327261308931e-07, + "loss": 0.1902, + "step": 28386 + }, + { + "epoch": 1.5580680570801317, + "grad_norm": 1.24869704246521, + "learning_rate": 3.584432514612324e-07, + "loss": 0.226, + "step": 28388 + }, + { + "epoch": 1.558177826564215, + "grad_norm": 1.4749336242675781, + "learning_rate": 3.575603225131563e-07, + "loss": 0.2051, + "step": 28390 + }, + { + "epoch": 1.5582875960482987, + "grad_norm": 1.0770410299301147, + "learning_rate": 3.566784745034385e-07, + "loss": 0.1844, + "step": 28392 + }, + { + "epoch": 1.558397365532382, + "grad_norm": 0.7929877042770386, + "learning_rate": 3.5579770747075914e-07, + "loss": 0.1387, + "step": 28394 + }, + { + "epoch": 1.5585071350164654, + "grad_norm": 1.3809599876403809, + "learning_rate": 3.54918021453754e-07, + "loss": 0.1743, + "step": 28396 + }, + { + "epoch": 1.558616904500549, + "grad_norm": 1.5070968866348267, + "learning_rate": 3.5403941649101156e-07, + "loss": 0.2525, + "step": 28398 + }, + { + "epoch": 1.5587266739846322, + "grad_norm": 1.1306782960891724, + "learning_rate": 3.531618926210678e-07, + "loss": 0.2281, + "step": 28400 + }, + { + "epoch": 1.5588364434687159, + "grad_norm": 1.5487936735153198, + "learning_rate": 3.5228544988241684e-07, + "loss": 0.2526, + "step": 28402 + }, + { + "epoch": 1.5589462129527991, + "grad_norm": 0.7808268070220947, + "learning_rate": 3.51410088313503e-07, + "loss": 0.1537, + "step": 28404 + }, + { + "epoch": 1.5590559824368826, + "grad_norm": 1.3184309005737305, + "learning_rate": 3.505358079527288e-07, + "loss": 0.2028, + "step": 28406 + }, + { + "epoch": 1.559165751920966, + "grad_norm": 1.588986873626709, + "learning_rate": 3.496626088384386e-07, + "loss": 0.2371, + "step": 28408 + }, + { + "epoch": 1.5592755214050493, + "grad_norm": 0.8819313049316406, + "learning_rate": 3.487904910089379e-07, + "loss": 0.1729, + "step": 28410 + }, + { + "epoch": 1.5593852908891328, + "grad_norm": 1.2371762990951538, + "learning_rate": 3.479194545024822e-07, + "loss": 0.1359, + "step": 28412 + }, + { + "epoch": 1.5594950603732163, + "grad_norm": 1.231401801109314, + "learning_rate": 3.4704949935727427e-07, + "loss": 0.185, + "step": 28414 + }, + { + "epoch": 1.5596048298572995, + "grad_norm": 0.9147043824195862, + "learning_rate": 3.4618062561148355e-07, + "loss": 0.1307, + "step": 28416 + }, + { + "epoch": 1.5597145993413832, + "grad_norm": 1.3229907751083374, + "learning_rate": 3.453128333032185e-07, + "loss": 0.2714, + "step": 28418 + }, + { + "epoch": 1.5598243688254665, + "grad_norm": 0.8766411542892456, + "learning_rate": 3.444461224705431e-07, + "loss": 0.1963, + "step": 28420 + }, + { + "epoch": 1.55993413830955, + "grad_norm": 2.2958884239196777, + "learning_rate": 3.435804931514769e-07, + "loss": 0.2687, + "step": 28422 + }, + { + "epoch": 1.5600439077936334, + "grad_norm": 1.0526643991470337, + "learning_rate": 3.427159453839923e-07, + "loss": 0.2176, + "step": 28424 + }, + { + "epoch": 1.5601536772777167, + "grad_norm": 2.124851703643799, + "learning_rate": 3.418524792060118e-07, + "loss": 0.1151, + "step": 28426 + }, + { + "epoch": 1.5602634467618002, + "grad_norm": 0.9402503967285156, + "learning_rate": 3.4099009465541065e-07, + "loss": 0.1481, + "step": 28428 + }, + { + "epoch": 1.5603732162458837, + "grad_norm": 0.9995006322860718, + "learning_rate": 3.4012879177001687e-07, + "loss": 0.1567, + "step": 28430 + }, + { + "epoch": 1.560482985729967, + "grad_norm": 0.9241123199462891, + "learning_rate": 3.3926857058761417e-07, + "loss": 0.1316, + "step": 28432 + }, + { + "epoch": 1.5605927552140506, + "grad_norm": 1.2321945428848267, + "learning_rate": 3.384094311459307e-07, + "loss": 0.1711, + "step": 28434 + }, + { + "epoch": 1.5607025246981339, + "grad_norm": 1.817847490310669, + "learning_rate": 3.3755137348265854e-07, + "loss": 0.1448, + "step": 28436 + }, + { + "epoch": 1.5608122941822173, + "grad_norm": 1.3603589534759521, + "learning_rate": 3.366943976354342e-07, + "loss": 0.1534, + "step": 28438 + }, + { + "epoch": 1.5609220636663008, + "grad_norm": 0.8598580956459045, + "learning_rate": 3.3583850364184434e-07, + "loss": 0.1385, + "step": 28440 + }, + { + "epoch": 1.561031833150384, + "grad_norm": 0.9714413285255432, + "learning_rate": 3.3498369153943944e-07, + "loss": 0.2786, + "step": 28442 + }, + { + "epoch": 1.5611416026344678, + "grad_norm": 0.7640584111213684, + "learning_rate": 3.341299613657117e-07, + "loss": 0.1277, + "step": 28444 + }, + { + "epoch": 1.561251372118551, + "grad_norm": 1.029769778251648, + "learning_rate": 3.3327731315811185e-07, + "loss": 0.2045, + "step": 28446 + }, + { + "epoch": 1.5613611416026345, + "grad_norm": 1.0988703966140747, + "learning_rate": 3.3242574695404036e-07, + "loss": 0.1738, + "step": 28448 + }, + { + "epoch": 1.561470911086718, + "grad_norm": 1.2807139158248901, + "learning_rate": 3.315752627908508e-07, + "loss": 0.2076, + "step": 28450 + }, + { + "epoch": 1.5615806805708012, + "grad_norm": 1.2425220012664795, + "learning_rate": 3.3072586070584665e-07, + "loss": 0.1842, + "step": 28452 + }, + { + "epoch": 1.5616904500548847, + "grad_norm": 1.293442964553833, + "learning_rate": 3.298775407362953e-07, + "loss": 0.1225, + "step": 28454 + }, + { + "epoch": 1.5618002195389682, + "grad_norm": 2.215155601501465, + "learning_rate": 3.290303029194031e-07, + "loss": 0.2025, + "step": 28456 + }, + { + "epoch": 1.5619099890230514, + "grad_norm": 1.0822001695632935, + "learning_rate": 3.2818414729233195e-07, + "loss": 0.2118, + "step": 28458 + }, + { + "epoch": 1.5620197585071351, + "grad_norm": 1.3302714824676514, + "learning_rate": 3.273390738922022e-07, + "loss": 0.3674, + "step": 28460 + }, + { + "epoch": 1.5621295279912184, + "grad_norm": 0.8802470564842224, + "learning_rate": 3.264950827560787e-07, + "loss": 0.1242, + "step": 28462 + }, + { + "epoch": 1.5622392974753019, + "grad_norm": 1.4434325695037842, + "learning_rate": 3.256521739209845e-07, + "loss": 0.3214, + "step": 28464 + }, + { + "epoch": 1.5623490669593854, + "grad_norm": 0.8625388145446777, + "learning_rate": 3.248103474238984e-07, + "loss": 0.1366, + "step": 28466 + }, + { + "epoch": 1.5624588364434686, + "grad_norm": 1.42332124710083, + "learning_rate": 3.2396960330174363e-07, + "loss": 0.2227, + "step": 28468 + }, + { + "epoch": 1.562568605927552, + "grad_norm": 1.1492226123809814, + "learning_rate": 3.231299415913963e-07, + "loss": 0.2285, + "step": 28470 + }, + { + "epoch": 1.5626783754116356, + "grad_norm": 1.101173758506775, + "learning_rate": 3.2229136232969357e-07, + "loss": 0.2021, + "step": 28472 + }, + { + "epoch": 1.562788144895719, + "grad_norm": 0.9955928921699524, + "learning_rate": 3.214538655534172e-07, + "loss": 0.2203, + "step": 28474 + }, + { + "epoch": 1.5628979143798025, + "grad_norm": 0.9465897679328918, + "learning_rate": 3.2061745129929876e-07, + "loss": 0.1237, + "step": 28476 + }, + { + "epoch": 1.5630076838638858, + "grad_norm": 1.1128355264663696, + "learning_rate": 3.197821196040368e-07, + "loss": 0.1572, + "step": 28478 + }, + { + "epoch": 1.5631174533479693, + "grad_norm": 1.261155128479004, + "learning_rate": 3.189478705042659e-07, + "loss": 0.1612, + "step": 28480 + }, + { + "epoch": 1.5632272228320527, + "grad_norm": 1.3438189029693604, + "learning_rate": 3.1811470403658727e-07, + "loss": 0.2703, + "step": 28482 + }, + { + "epoch": 1.563336992316136, + "grad_norm": 0.7897114157676697, + "learning_rate": 3.172826202375412e-07, + "loss": 0.1515, + "step": 28484 + }, + { + "epoch": 1.5634467618002197, + "grad_norm": 1.258642554283142, + "learning_rate": 3.16451619143629e-07, + "loss": 0.1525, + "step": 28486 + }, + { + "epoch": 1.563556531284303, + "grad_norm": 0.8490167856216431, + "learning_rate": 3.1562170079130203e-07, + "loss": 0.0953, + "step": 28488 + }, + { + "epoch": 1.5636663007683864, + "grad_norm": 2.6089789867401123, + "learning_rate": 3.147928652169674e-07, + "loss": 0.1587, + "step": 28490 + }, + { + "epoch": 1.56377607025247, + "grad_norm": 0.8327347636222839, + "learning_rate": 3.1396511245697925e-07, + "loss": 0.1949, + "step": 28492 + }, + { + "epoch": 1.5638858397365532, + "grad_norm": 1.0675767660140991, + "learning_rate": 3.131384425476475e-07, + "loss": 0.1281, + "step": 28494 + }, + { + "epoch": 1.5639956092206366, + "grad_norm": 1.1184889078140259, + "learning_rate": 3.123128555252347e-07, + "loss": 0.178, + "step": 28496 + }, + { + "epoch": 1.56410537870472, + "grad_norm": 1.0212185382843018, + "learning_rate": 3.114883514259537e-07, + "loss": 0.1672, + "step": 28498 + }, + { + "epoch": 1.5642151481888034, + "grad_norm": 0.8516296744346619, + "learning_rate": 3.1066493028596986e-07, + "loss": 0.1257, + "step": 28500 + }, + { + "epoch": 1.564324917672887, + "grad_norm": 1.729966402053833, + "learning_rate": 3.098425921414072e-07, + "loss": 0.148, + "step": 28502 + }, + { + "epoch": 1.5644346871569703, + "grad_norm": 0.7330749034881592, + "learning_rate": 3.09021337028334e-07, + "loss": 0.1519, + "step": 28504 + }, + { + "epoch": 1.5645444566410538, + "grad_norm": 1.6670067310333252, + "learning_rate": 3.0820116498277427e-07, + "loss": 0.149, + "step": 28506 + }, + { + "epoch": 1.5646542261251373, + "grad_norm": 1.9469289779663086, + "learning_rate": 3.0738207604070756e-07, + "loss": 0.1419, + "step": 28508 + }, + { + "epoch": 1.5647639956092205, + "grad_norm": 1.5189110040664673, + "learning_rate": 3.065640702380607e-07, + "loss": 0.145, + "step": 28510 + }, + { + "epoch": 1.5648737650933042, + "grad_norm": 0.9661934971809387, + "learning_rate": 3.0574714761071596e-07, + "loss": 0.1469, + "step": 28512 + }, + { + "epoch": 1.5649835345773875, + "grad_norm": 1.3011196851730347, + "learning_rate": 3.04931308194506e-07, + "loss": 0.2036, + "step": 28514 + }, + { + "epoch": 1.565093304061471, + "grad_norm": 1.1296584606170654, + "learning_rate": 3.0411655202522426e-07, + "loss": 0.2228, + "step": 28516 + }, + { + "epoch": 1.5652030735455544, + "grad_norm": 1.1658474206924438, + "learning_rate": 3.0330287913860057e-07, + "loss": 0.1325, + "step": 28518 + }, + { + "epoch": 1.5653128430296377, + "grad_norm": 1.181252360343933, + "learning_rate": 3.024902895703341e-07, + "loss": 0.1912, + "step": 28520 + }, + { + "epoch": 1.5654226125137212, + "grad_norm": 1.4146974086761475, + "learning_rate": 3.0167878335606583e-07, + "loss": 0.2369, + "step": 28522 + }, + { + "epoch": 1.5655323819978046, + "grad_norm": 1.3116185665130615, + "learning_rate": 3.008683605313922e-07, + "loss": 0.1846, + "step": 28524 + }, + { + "epoch": 1.565642151481888, + "grad_norm": 1.0790079832077026, + "learning_rate": 3.000590211318599e-07, + "loss": 0.2034, + "step": 28526 + }, + { + "epoch": 1.5657519209659716, + "grad_norm": 1.2226643562316895, + "learning_rate": 2.9925076519297656e-07, + "loss": 0.1898, + "step": 28528 + }, + { + "epoch": 1.5658616904500549, + "grad_norm": 4.078540802001953, + "learning_rate": 2.984435927501944e-07, + "loss": 0.22, + "step": 28530 + }, + { + "epoch": 1.5659714599341383, + "grad_norm": 1.0479429960250854, + "learning_rate": 2.976375038389156e-07, + "loss": 0.1086, + "step": 28532 + }, + { + "epoch": 1.5660812294182218, + "grad_norm": 1.2852797508239746, + "learning_rate": 2.9683249849450366e-07, + "loss": 0.2522, + "step": 28534 + }, + { + "epoch": 1.566190998902305, + "grad_norm": 1.0496045351028442, + "learning_rate": 2.960285767522664e-07, + "loss": 0.2756, + "step": 28536 + }, + { + "epoch": 1.5663007683863885, + "grad_norm": 1.111838459968567, + "learning_rate": 2.952257386474727e-07, + "loss": 0.1872, + "step": 28538 + }, + { + "epoch": 1.566410537870472, + "grad_norm": 0.9437655210494995, + "learning_rate": 2.944239842153362e-07, + "loss": 0.1137, + "step": 28540 + }, + { + "epoch": 1.5665203073545553, + "grad_norm": 1.7103227376937866, + "learning_rate": 2.9362331349102326e-07, + "loss": 0.2487, + "step": 28542 + }, + { + "epoch": 1.566630076838639, + "grad_norm": 1.654073715209961, + "learning_rate": 2.9282372650966117e-07, + "loss": 0.1868, + "step": 28544 + }, + { + "epoch": 1.5667398463227222, + "grad_norm": 1.4646714925765991, + "learning_rate": 2.9202522330631923e-07, + "loss": 0.2213, + "step": 28546 + }, + { + "epoch": 1.5668496158068057, + "grad_norm": 1.3730313777923584, + "learning_rate": 2.912278039160249e-07, + "loss": 0.1303, + "step": 28548 + }, + { + "epoch": 1.5669593852908892, + "grad_norm": 1.576600193977356, + "learning_rate": 2.90431468373753e-07, + "loss": 0.2532, + "step": 28550 + }, + { + "epoch": 1.5670691547749724, + "grad_norm": 2.365499496459961, + "learning_rate": 2.896362167144423e-07, + "loss": 0.2115, + "step": 28552 + }, + { + "epoch": 1.5671789242590561, + "grad_norm": 0.8007627725601196, + "learning_rate": 2.888420489729732e-07, + "loss": 0.1867, + "step": 28554 + }, + { + "epoch": 1.5672886937431394, + "grad_norm": 1.3279296159744263, + "learning_rate": 2.880489651841817e-07, + "loss": 0.2206, + "step": 28556 + }, + { + "epoch": 1.5673984632272229, + "grad_norm": 1.3423285484313965, + "learning_rate": 2.8725696538285386e-07, + "loss": 0.3039, + "step": 28558 + }, + { + "epoch": 1.5675082327113063, + "grad_norm": 1.192285180091858, + "learning_rate": 2.8646604960373413e-07, + "loss": 0.2433, + "step": 28560 + }, + { + "epoch": 1.5676180021953896, + "grad_norm": 1.0839829444885254, + "learning_rate": 2.856762178815142e-07, + "loss": 0.1304, + "step": 28562 + }, + { + "epoch": 1.567727771679473, + "grad_norm": 1.1912025213241577, + "learning_rate": 2.848874702508414e-07, + "loss": 0.2465, + "step": 28564 + }, + { + "epoch": 1.5678375411635566, + "grad_norm": 1.1442933082580566, + "learning_rate": 2.84099806746313e-07, + "loss": 0.171, + "step": 28566 + }, + { + "epoch": 1.5679473106476398, + "grad_norm": 1.0880656242370605, + "learning_rate": 2.833132274024791e-07, + "loss": 0.1705, + "step": 28568 + }, + { + "epoch": 1.5680570801317235, + "grad_norm": 0.6832332015037537, + "learning_rate": 2.8252773225384276e-07, + "loss": 0.0963, + "step": 28570 + }, + { + "epoch": 1.5681668496158068, + "grad_norm": 1.42799973487854, + "learning_rate": 2.817433213348597e-07, + "loss": 0.2567, + "step": 28572 + }, + { + "epoch": 1.5682766190998902, + "grad_norm": 1.1219974756240845, + "learning_rate": 2.8095999467994127e-07, + "loss": 0.3559, + "step": 28574 + }, + { + "epoch": 1.5683863885839737, + "grad_norm": 1.3768634796142578, + "learning_rate": 2.8017775232344335e-07, + "loss": 0.2008, + "step": 28576 + }, + { + "epoch": 1.568496158068057, + "grad_norm": 0.9345909357070923, + "learning_rate": 2.7939659429968566e-07, + "loss": 0.248, + "step": 28578 + }, + { + "epoch": 1.5686059275521405, + "grad_norm": 0.9212531447410583, + "learning_rate": 2.78616520642927e-07, + "loss": 0.0994, + "step": 28580 + }, + { + "epoch": 1.568715697036224, + "grad_norm": 1.248511552810669, + "learning_rate": 2.778375313873871e-07, + "loss": 0.1719, + "step": 28582 + }, + { + "epoch": 1.5688254665203072, + "grad_norm": 1.4031637907028198, + "learning_rate": 2.7705962656723593e-07, + "loss": 0.1709, + "step": 28584 + }, + { + "epoch": 1.5689352360043909, + "grad_norm": 0.8888716697692871, + "learning_rate": 2.7628280621659617e-07, + "loss": 0.1213, + "step": 28586 + }, + { + "epoch": 1.5690450054884741, + "grad_norm": 1.1457927227020264, + "learning_rate": 2.7550707036954615e-07, + "loss": 0.1233, + "step": 28588 + }, + { + "epoch": 1.5691547749725576, + "grad_norm": 1.4928011894226074, + "learning_rate": 2.747324190601086e-07, + "loss": 0.1685, + "step": 28590 + }, + { + "epoch": 1.569264544456641, + "grad_norm": 1.5727200508117676, + "learning_rate": 2.7395885232226747e-07, + "loss": 0.2519, + "step": 28592 + }, + { + "epoch": 1.5693743139407244, + "grad_norm": 0.9738843441009521, + "learning_rate": 2.7318637018995396e-07, + "loss": 0.2434, + "step": 28594 + }, + { + "epoch": 1.569484083424808, + "grad_norm": 0.9170289039611816, + "learning_rate": 2.724149726970521e-07, + "loss": 0.2076, + "step": 28596 + }, + { + "epoch": 1.5695938529088913, + "grad_norm": 1.900278091430664, + "learning_rate": 2.7164465987739863e-07, + "loss": 0.1444, + "step": 28598 + }, + { + "epoch": 1.5697036223929748, + "grad_norm": 0.6773984432220459, + "learning_rate": 2.7087543176478324e-07, + "loss": 0.1517, + "step": 28600 + }, + { + "epoch": 1.5698133918770583, + "grad_norm": 1.3787264823913574, + "learning_rate": 2.7010728839295117e-07, + "loss": 0.2451, + "step": 28602 + }, + { + "epoch": 1.5699231613611415, + "grad_norm": 1.2008939981460571, + "learning_rate": 2.693402297955949e-07, + "loss": 0.1328, + "step": 28604 + }, + { + "epoch": 1.570032930845225, + "grad_norm": 1.2060803174972534, + "learning_rate": 2.6857425600636255e-07, + "loss": 0.1987, + "step": 28606 + }, + { + "epoch": 1.5701427003293085, + "grad_norm": 1.3029892444610596, + "learning_rate": 2.678093670588494e-07, + "loss": 0.1937, + "step": 28608 + }, + { + "epoch": 1.5702524698133917, + "grad_norm": 0.9463843107223511, + "learning_rate": 2.67045562986612e-07, + "loss": 0.1268, + "step": 28610 + }, + { + "epoch": 1.5703622392974754, + "grad_norm": 0.894360363483429, + "learning_rate": 2.662828438231513e-07, + "loss": 0.1539, + "step": 28612 + }, + { + "epoch": 1.5704720087815587, + "grad_norm": 1.1243093013763428, + "learning_rate": 2.655212096019266e-07, + "loss": 0.2573, + "step": 28614 + }, + { + "epoch": 1.5705817782656422, + "grad_norm": 0.9366344213485718, + "learning_rate": 2.647606603563474e-07, + "loss": 0.1659, + "step": 28616 + }, + { + "epoch": 1.5706915477497256, + "grad_norm": 1.0509496927261353, + "learning_rate": 2.640011961197703e-07, + "loss": 0.107, + "step": 28618 + }, + { + "epoch": 1.570801317233809, + "grad_norm": 1.3249025344848633, + "learning_rate": 2.63242816925513e-07, + "loss": 0.225, + "step": 28620 + }, + { + "epoch": 1.5709110867178926, + "grad_norm": 0.791178286075592, + "learning_rate": 2.624855228068407e-07, + "loss": 0.1244, + "step": 28622 + }, + { + "epoch": 1.5710208562019758, + "grad_norm": 1.822917103767395, + "learning_rate": 2.617293137969712e-07, + "loss": 0.1594, + "step": 28624 + }, + { + "epoch": 1.5711306256860593, + "grad_norm": 0.9530569314956665, + "learning_rate": 2.6097418992908073e-07, + "loss": 0.1905, + "step": 28626 + }, + { + "epoch": 1.5712403951701428, + "grad_norm": 1.0178253650665283, + "learning_rate": 2.602201512362845e-07, + "loss": 0.193, + "step": 28628 + }, + { + "epoch": 1.571350164654226, + "grad_norm": 1.3053110837936401, + "learning_rate": 2.5946719775166437e-07, + "loss": 0.3151, + "step": 28630 + }, + { + "epoch": 1.5714599341383095, + "grad_norm": 1.4526869058609009, + "learning_rate": 2.5871532950824394e-07, + "loss": 0.2649, + "step": 28632 + }, + { + "epoch": 1.571569703622393, + "grad_norm": 1.9915030002593994, + "learning_rate": 2.579645465390107e-07, + "loss": 0.161, + "step": 28634 + }, + { + "epoch": 1.5716794731064763, + "grad_norm": 1.0477285385131836, + "learning_rate": 2.572148488768883e-07, + "loss": 0.2395, + "step": 28636 + }, + { + "epoch": 1.57178924259056, + "grad_norm": 1.6793915033340454, + "learning_rate": 2.564662365547726e-07, + "loss": 0.273, + "step": 28638 + }, + { + "epoch": 1.5718990120746432, + "grad_norm": 1.1797189712524414, + "learning_rate": 2.55718709605493e-07, + "loss": 0.237, + "step": 28640 + }, + { + "epoch": 1.5720087815587267, + "grad_norm": 1.5734057426452637, + "learning_rate": 2.549722680618455e-07, + "loss": 0.1237, + "step": 28642 + }, + { + "epoch": 1.5721185510428102, + "grad_norm": 1.1921114921569824, + "learning_rate": 2.542269119565677e-07, + "loss": 0.1546, + "step": 28644 + }, + { + "epoch": 1.5722283205268934, + "grad_norm": 1.3407824039459229, + "learning_rate": 2.534826413223584e-07, + "loss": 0.3756, + "step": 28646 + }, + { + "epoch": 1.572338090010977, + "grad_norm": 0.9943207502365112, + "learning_rate": 2.527394561918611e-07, + "loss": 0.2082, + "step": 28648 + }, + { + "epoch": 1.5724478594950604, + "grad_norm": 0.7892484068870544, + "learning_rate": 2.519973565976802e-07, + "loss": 0.1491, + "step": 28650 + }, + { + "epoch": 1.5725576289791436, + "grad_norm": 1.1823039054870605, + "learning_rate": 2.5125634257236463e-07, + "loss": 0.1978, + "step": 28652 + }, + { + "epoch": 1.5726673984632273, + "grad_norm": 1.0184659957885742, + "learning_rate": 2.5051641414842173e-07, + "loss": 0.1598, + "step": 28654 + }, + { + "epoch": 1.5727771679473106, + "grad_norm": 1.2510044574737549, + "learning_rate": 2.4977757135830614e-07, + "loss": 0.2293, + "step": 28656 + }, + { + "epoch": 1.572886937431394, + "grad_norm": 0.9050290584564209, + "learning_rate": 2.4903981423442523e-07, + "loss": 0.1715, + "step": 28658 + }, + { + "epoch": 1.5729967069154775, + "grad_norm": 0.7164849638938904, + "learning_rate": 2.483031428091448e-07, + "loss": 0.2737, + "step": 28660 + }, + { + "epoch": 1.5731064763995608, + "grad_norm": 0.7463716864585876, + "learning_rate": 2.475675571147751e-07, + "loss": 0.1832, + "step": 28662 + }, + { + "epoch": 1.5732162458836445, + "grad_norm": 1.1316101551055908, + "learning_rate": 2.468330571835847e-07, + "loss": 0.213, + "step": 28664 + }, + { + "epoch": 1.5733260153677278, + "grad_norm": 1.469401478767395, + "learning_rate": 2.4609964304779245e-07, + "loss": 0.2168, + "step": 28666 + }, + { + "epoch": 1.5734357848518112, + "grad_norm": 1.2557953596115112, + "learning_rate": 2.453673147395724e-07, + "loss": 0.1834, + "step": 28668 + }, + { + "epoch": 1.5735455543358947, + "grad_norm": 0.8911282420158386, + "learning_rate": 2.4463607229104067e-07, + "loss": 0.1425, + "step": 28670 + }, + { + "epoch": 1.573655323819978, + "grad_norm": 1.0019609928131104, + "learning_rate": 2.439059157342799e-07, + "loss": 0.1283, + "step": 28672 + }, + { + "epoch": 1.5737650933040614, + "grad_norm": 1.0939234495162964, + "learning_rate": 2.4317684510131443e-07, + "loss": 0.1114, + "step": 28674 + }, + { + "epoch": 1.573874862788145, + "grad_norm": 1.868268370628357, + "learning_rate": 2.424488604241271e-07, + "loss": 0.1983, + "step": 28676 + }, + { + "epoch": 1.5739846322722282, + "grad_norm": 1.1417112350463867, + "learning_rate": 2.4172196173464776e-07, + "loss": 0.1594, + "step": 28678 + }, + { + "epoch": 1.5740944017563119, + "grad_norm": 1.0925137996673584, + "learning_rate": 2.4099614906476775e-07, + "loss": 0.1538, + "step": 28680 + }, + { + "epoch": 1.5742041712403951, + "grad_norm": 1.5471856594085693, + "learning_rate": 2.4027142244631706e-07, + "loss": 0.2156, + "step": 28682 + }, + { + "epoch": 1.5743139407244786, + "grad_norm": 1.7512141466140747, + "learning_rate": 2.3954778191108976e-07, + "loss": 0.2304, + "step": 28684 + }, + { + "epoch": 1.574423710208562, + "grad_norm": 0.913716197013855, + "learning_rate": 2.388252274908326e-07, + "loss": 0.1439, + "step": 28686 + }, + { + "epoch": 1.5745334796926453, + "grad_norm": 1.097751498222351, + "learning_rate": 2.381037592172314e-07, + "loss": 0.1864, + "step": 28688 + }, + { + "epoch": 1.5746432491767288, + "grad_norm": 1.2906556129455566, + "learning_rate": 2.3738337712194137e-07, + "loss": 0.1458, + "step": 28690 + }, + { + "epoch": 1.5747530186608123, + "grad_norm": 0.7128433585166931, + "learning_rate": 2.3666408123655392e-07, + "loss": 0.226, + "step": 28692 + }, + { + "epoch": 1.5748627881448956, + "grad_norm": 1.05134916305542, + "learning_rate": 2.3594587159262993e-07, + "loss": 0.3259, + "step": 28694 + }, + { + "epoch": 1.5749725576289793, + "grad_norm": 1.2056992053985596, + "learning_rate": 2.352287482216664e-07, + "loss": 0.2478, + "step": 28696 + }, + { + "epoch": 1.5750823271130625, + "grad_norm": 1.1493127346038818, + "learning_rate": 2.345127111551243e-07, + "loss": 0.2183, + "step": 28698 + }, + { + "epoch": 1.575192096597146, + "grad_norm": 1.399832010269165, + "learning_rate": 2.3379776042440905e-07, + "loss": 0.1905, + "step": 28700 + }, + { + "epoch": 1.5753018660812295, + "grad_norm": 1.5419504642486572, + "learning_rate": 2.330838960608872e-07, + "loss": 0.2684, + "step": 28702 + }, + { + "epoch": 1.5754116355653127, + "grad_norm": 1.4229545593261719, + "learning_rate": 2.323711180958671e-07, + "loss": 0.1521, + "step": 28704 + }, + { + "epoch": 1.5755214050493964, + "grad_norm": 1.2220550775527954, + "learning_rate": 2.3165942656061535e-07, + "loss": 0.1719, + "step": 28706 + }, + { + "epoch": 1.5756311745334797, + "grad_norm": 0.9514754414558411, + "learning_rate": 2.3094882148635144e-07, + "loss": 0.1418, + "step": 28708 + }, + { + "epoch": 1.5757409440175631, + "grad_norm": 1.3339937925338745, + "learning_rate": 2.302393029042449e-07, + "loss": 0.1583, + "step": 28710 + }, + { + "epoch": 1.5758507135016466, + "grad_norm": 1.2131320238113403, + "learning_rate": 2.2953087084542358e-07, + "loss": 0.2551, + "step": 28712 + }, + { + "epoch": 1.5759604829857299, + "grad_norm": 1.2111319303512573, + "learning_rate": 2.288235253409543e-07, + "loss": 0.1426, + "step": 28714 + }, + { + "epoch": 1.5760702524698134, + "grad_norm": 2.2118136882781982, + "learning_rate": 2.2811726642187338e-07, + "loss": 0.1805, + "step": 28716 + }, + { + "epoch": 1.5761800219538968, + "grad_norm": 0.8678296208381653, + "learning_rate": 2.274120941191532e-07, + "loss": 0.1295, + "step": 28718 + }, + { + "epoch": 1.57628979143798, + "grad_norm": 1.0639883279800415, + "learning_rate": 2.2670800846373018e-07, + "loss": 0.1364, + "step": 28720 + }, + { + "epoch": 1.5763995609220638, + "grad_norm": 1.289959192276001, + "learning_rate": 2.2600500948648794e-07, + "loss": 0.1935, + "step": 28722 + }, + { + "epoch": 1.576509330406147, + "grad_norm": 1.2818142175674438, + "learning_rate": 2.253030972182657e-07, + "loss": 0.1879, + "step": 28724 + }, + { + "epoch": 1.5766190998902305, + "grad_norm": 1.273716926574707, + "learning_rate": 2.2460227168985271e-07, + "loss": 0.1994, + "step": 28726 + }, + { + "epoch": 1.576728869374314, + "grad_norm": 1.1785407066345215, + "learning_rate": 2.239025329319855e-07, + "loss": 0.1572, + "step": 28728 + }, + { + "epoch": 1.5768386388583973, + "grad_norm": 1.0237128734588623, + "learning_rate": 2.2320388097536448e-07, + "loss": 0.192, + "step": 28730 + }, + { + "epoch": 1.576948408342481, + "grad_norm": 0.7812864780426025, + "learning_rate": 2.2250631585063186e-07, + "loss": 0.1468, + "step": 28732 + }, + { + "epoch": 1.5770581778265642, + "grad_norm": 0.9354710578918457, + "learning_rate": 2.2180983758838537e-07, + "loss": 0.2424, + "step": 28734 + }, + { + "epoch": 1.5771679473106477, + "grad_norm": 1.2503294944763184, + "learning_rate": 2.211144462191783e-07, + "loss": 0.148, + "step": 28736 + }, + { + "epoch": 1.5772777167947312, + "grad_norm": 0.922004759311676, + "learning_rate": 2.204201417735141e-07, + "loss": 0.1545, + "step": 28738 + }, + { + "epoch": 1.5773874862788144, + "grad_norm": 1.1359943151474, + "learning_rate": 2.1972692428184892e-07, + "loss": 0.2345, + "step": 28740 + }, + { + "epoch": 1.577497255762898, + "grad_norm": 0.9555188417434692, + "learning_rate": 2.1903479377458902e-07, + "loss": 0.1806, + "step": 28742 + }, + { + "epoch": 1.5776070252469814, + "grad_norm": 0.9959547519683838, + "learning_rate": 2.183437502820962e-07, + "loss": 0.2919, + "step": 28744 + }, + { + "epoch": 1.5777167947310646, + "grad_norm": 1.2281373739242554, + "learning_rate": 2.1765379383467954e-07, + "loss": 0.1632, + "step": 28746 + }, + { + "epoch": 1.5778265642151483, + "grad_norm": 1.6505637168884277, + "learning_rate": 2.169649244626093e-07, + "loss": 0.1788, + "step": 28748 + }, + { + "epoch": 1.5779363336992316, + "grad_norm": 0.8427567481994629, + "learning_rate": 2.162771421960974e-07, + "loss": 0.1268, + "step": 28750 + }, + { + "epoch": 1.578046103183315, + "grad_norm": 0.970941424369812, + "learning_rate": 2.1559044706531417e-07, + "loss": 0.1531, + "step": 28752 + }, + { + "epoch": 1.5781558726673985, + "grad_norm": 1.1805576086044312, + "learning_rate": 2.149048391003855e-07, + "loss": 0.1346, + "step": 28754 + }, + { + "epoch": 1.5782656421514818, + "grad_norm": 1.5247673988342285, + "learning_rate": 2.1422031833138455e-07, + "loss": 0.2684, + "step": 28756 + }, + { + "epoch": 1.5783754116355653, + "grad_norm": 1.0757654905319214, + "learning_rate": 2.1353688478833178e-07, + "loss": 0.1424, + "step": 28758 + }, + { + "epoch": 1.5784851811196488, + "grad_norm": 1.5513666868209839, + "learning_rate": 2.1285453850121428e-07, + "loss": 0.2068, + "step": 28760 + }, + { + "epoch": 1.578594950603732, + "grad_norm": 1.4729397296905518, + "learning_rate": 2.1217327949995814e-07, + "loss": 0.223, + "step": 28762 + }, + { + "epoch": 1.5787047200878157, + "grad_norm": 1.1853084564208984, + "learning_rate": 2.1149310781444777e-07, + "loss": 0.1437, + "step": 28764 + }, + { + "epoch": 1.578814489571899, + "grad_norm": 0.6694956421852112, + "learning_rate": 2.108140234745204e-07, + "loss": 0.2129, + "step": 28766 + }, + { + "epoch": 1.5789242590559824, + "grad_norm": 0.7345982193946838, + "learning_rate": 2.1013602650996056e-07, + "loss": 0.1335, + "step": 28768 + }, + { + "epoch": 1.579034028540066, + "grad_norm": 1.1005595922470093, + "learning_rate": 2.0945911695050834e-07, + "loss": 0.1721, + "step": 28770 + }, + { + "epoch": 1.5791437980241492, + "grad_norm": 1.7116130590438843, + "learning_rate": 2.0878329482586224e-07, + "loss": 0.149, + "step": 28772 + }, + { + "epoch": 1.5792535675082329, + "grad_norm": 1.1446282863616943, + "learning_rate": 2.081085601656596e-07, + "loss": 0.2001, + "step": 28774 + }, + { + "epoch": 1.5793633369923161, + "grad_norm": 2.1444053649902344, + "learning_rate": 2.0743491299950457e-07, + "loss": 0.2611, + "step": 28776 + }, + { + "epoch": 1.5794731064763996, + "grad_norm": 1.033903956413269, + "learning_rate": 2.0676235335694293e-07, + "loss": 0.1863, + "step": 28778 + }, + { + "epoch": 1.579582875960483, + "grad_norm": 1.0426371097564697, + "learning_rate": 2.060908812674761e-07, + "loss": 0.1432, + "step": 28780 + }, + { + "epoch": 1.5796926454445663, + "grad_norm": 0.9050781726837158, + "learning_rate": 2.054204967605583e-07, + "loss": 0.1848, + "step": 28782 + }, + { + "epoch": 1.5798024149286498, + "grad_norm": 1.3613009452819824, + "learning_rate": 2.0475119986559378e-07, + "loss": 0.2127, + "step": 28784 + }, + { + "epoch": 1.5799121844127333, + "grad_norm": 1.0430384874343872, + "learning_rate": 2.040829906119479e-07, + "loss": 0.166, + "step": 28786 + }, + { + "epoch": 1.5800219538968165, + "grad_norm": 1.2755063772201538, + "learning_rate": 2.0341586902892784e-07, + "loss": 0.1838, + "step": 28788 + }, + { + "epoch": 1.5801317233809002, + "grad_norm": 4.350892066955566, + "learning_rate": 2.0274983514579626e-07, + "loss": 0.2556, + "step": 28790 + }, + { + "epoch": 1.5802414928649835, + "grad_norm": 1.4985696077346802, + "learning_rate": 2.0208488899176869e-07, + "loss": 0.1776, + "step": 28792 + }, + { + "epoch": 1.580351262349067, + "grad_norm": 1.1632684469223022, + "learning_rate": 2.0142103059601348e-07, + "loss": 0.1303, + "step": 28794 + }, + { + "epoch": 1.5804610318331505, + "grad_norm": 1.5131025314331055, + "learning_rate": 2.007582599876462e-07, + "loss": 0.2236, + "step": 28796 + }, + { + "epoch": 1.5805708013172337, + "grad_norm": 0.8355984091758728, + "learning_rate": 2.0009657719574915e-07, + "loss": 0.138, + "step": 28798 + }, + { + "epoch": 1.5806805708013172, + "grad_norm": 0.9869404435157776, + "learning_rate": 1.9943598224933802e-07, + "loss": 0.1443, + "step": 28800 + }, + { + "epoch": 1.5807903402854007, + "grad_norm": 1.2904233932495117, + "learning_rate": 1.9877647517739518e-07, + "loss": 0.1845, + "step": 28802 + }, + { + "epoch": 1.580900109769484, + "grad_norm": 0.9692951440811157, + "learning_rate": 1.9811805600884746e-07, + "loss": 0.1748, + "step": 28804 + }, + { + "epoch": 1.5810098792535676, + "grad_norm": 1.0888901948928833, + "learning_rate": 1.974607247725746e-07, + "loss": 0.1391, + "step": 28806 + }, + { + "epoch": 1.5811196487376509, + "grad_norm": 1.116054892539978, + "learning_rate": 1.9680448149741183e-07, + "loss": 0.1776, + "step": 28808 + }, + { + "epoch": 1.5812294182217344, + "grad_norm": 1.1121803522109985, + "learning_rate": 1.9614932621215e-07, + "loss": 0.1494, + "step": 28810 + }, + { + "epoch": 1.5813391877058178, + "grad_norm": 1.0508270263671875, + "learning_rate": 1.954952589455189e-07, + "loss": 0.2192, + "step": 28812 + }, + { + "epoch": 1.581448957189901, + "grad_norm": 1.3763298988342285, + "learning_rate": 1.9484227972621505e-07, + "loss": 0.1508, + "step": 28814 + }, + { + "epoch": 1.5815587266739848, + "grad_norm": 1.6443885564804077, + "learning_rate": 1.9419038858287942e-07, + "loss": 0.2441, + "step": 28816 + }, + { + "epoch": 1.581668496158068, + "grad_norm": 1.024987816810608, + "learning_rate": 1.9353958554410856e-07, + "loss": 0.2222, + "step": 28818 + }, + { + "epoch": 1.5817782656421515, + "grad_norm": 1.557211995124817, + "learning_rate": 1.9288987063844633e-07, + "loss": 0.2011, + "step": 28820 + }, + { + "epoch": 1.581888035126235, + "grad_norm": 1.1720213890075684, + "learning_rate": 1.922412438943977e-07, + "loss": 0.146, + "step": 28822 + }, + { + "epoch": 1.5819978046103182, + "grad_norm": 0.9526304602622986, + "learning_rate": 1.915937053404121e-07, + "loss": 0.1239, + "step": 28824 + }, + { + "epoch": 1.5821075740944017, + "grad_norm": 1.094130039215088, + "learning_rate": 1.9094725500489185e-07, + "loss": 0.1976, + "step": 28826 + }, + { + "epoch": 1.5822173435784852, + "grad_norm": 2.084970712661743, + "learning_rate": 1.903018929161948e-07, + "loss": 0.234, + "step": 28828 + }, + { + "epoch": 1.5823271130625685, + "grad_norm": 1.3499584197998047, + "learning_rate": 1.8965761910263168e-07, + "loss": 0.2114, + "step": 28830 + }, + { + "epoch": 1.5824368825466522, + "grad_norm": 0.6837357878684998, + "learning_rate": 1.8901443359245763e-07, + "loss": 0.1207, + "step": 28832 + }, + { + "epoch": 1.5825466520307354, + "grad_norm": 0.8097141981124878, + "learning_rate": 1.883723364138945e-07, + "loss": 0.1049, + "step": 28834 + }, + { + "epoch": 1.582656421514819, + "grad_norm": 0.8436933755874634, + "learning_rate": 1.8773132759510036e-07, + "loss": 0.1469, + "step": 28836 + }, + { + "epoch": 1.5827661909989024, + "grad_norm": 1.4933021068572998, + "learning_rate": 1.8709140716419716e-07, + "loss": 0.2271, + "step": 28838 + }, + { + "epoch": 1.5828759604829856, + "grad_norm": 1.0537304878234863, + "learning_rate": 1.8645257514925406e-07, + "loss": 0.1625, + "step": 28840 + }, + { + "epoch": 1.582985729967069, + "grad_norm": 1.0751527547836304, + "learning_rate": 1.8581483157829316e-07, + "loss": 0.266, + "step": 28842 + }, + { + "epoch": 1.5830954994511526, + "grad_norm": 0.7756512761116028, + "learning_rate": 1.8517817647928647e-07, + "loss": 0.1778, + "step": 28844 + }, + { + "epoch": 1.583205268935236, + "grad_norm": 0.9728425145149231, + "learning_rate": 1.8454260988016448e-07, + "loss": 0.1679, + "step": 28846 + }, + { + "epoch": 1.5833150384193195, + "grad_norm": 1.101798176765442, + "learning_rate": 1.8390813180880763e-07, + "loss": 0.2319, + "step": 28848 + }, + { + "epoch": 1.5834248079034028, + "grad_norm": 1.3598761558532715, + "learning_rate": 1.8327474229304364e-07, + "loss": 0.3105, + "step": 28850 + }, + { + "epoch": 1.5835345773874863, + "grad_norm": 1.1509543657302856, + "learning_rate": 1.826424413606559e-07, + "loss": 0.1948, + "step": 28852 + }, + { + "epoch": 1.5836443468715697, + "grad_norm": 1.4546308517456055, + "learning_rate": 1.820112290393805e-07, + "loss": 0.2352, + "step": 28854 + }, + { + "epoch": 1.583754116355653, + "grad_norm": 1.0531243085861206, + "learning_rate": 1.8138110535690923e-07, + "loss": 0.1586, + "step": 28856 + }, + { + "epoch": 1.5838638858397367, + "grad_norm": 1.0650209188461304, + "learning_rate": 1.8075207034087826e-07, + "loss": 0.204, + "step": 28858 + }, + { + "epoch": 1.58397365532382, + "grad_norm": 1.3894808292388916, + "learning_rate": 1.8012412401887947e-07, + "loss": 0.1291, + "step": 28860 + }, + { + "epoch": 1.5840834248079034, + "grad_norm": 1.0633518695831299, + "learning_rate": 1.79497266418463e-07, + "loss": 0.2141, + "step": 28862 + }, + { + "epoch": 1.584193194291987, + "grad_norm": 1.529128909111023, + "learning_rate": 1.7887149756712074e-07, + "loss": 0.254, + "step": 28864 + }, + { + "epoch": 1.5843029637760702, + "grad_norm": 1.214064121246338, + "learning_rate": 1.7824681749230576e-07, + "loss": 0.1433, + "step": 28866 + }, + { + "epoch": 1.5844127332601536, + "grad_norm": 1.575717806816101, + "learning_rate": 1.7762322622141558e-07, + "loss": 0.1541, + "step": 28868 + }, + { + "epoch": 1.5845225027442371, + "grad_norm": 2.1374714374542236, + "learning_rate": 1.770007237818061e-07, + "loss": 0.3291, + "step": 28870 + }, + { + "epoch": 1.5846322722283204, + "grad_norm": 0.9510864615440369, + "learning_rate": 1.7637931020078602e-07, + "loss": 0.1407, + "step": 28872 + }, + { + "epoch": 1.584742041712404, + "grad_norm": 1.157809853553772, + "learning_rate": 1.7575898550560856e-07, + "loss": 0.15, + "step": 28874 + }, + { + "epoch": 1.5848518111964873, + "grad_norm": 1.2558339834213257, + "learning_rate": 1.7513974972348523e-07, + "loss": 0.1631, + "step": 28876 + }, + { + "epoch": 1.5849615806805708, + "grad_norm": 1.0469927787780762, + "learning_rate": 1.7452160288158325e-07, + "loss": 0.2559, + "step": 28878 + }, + { + "epoch": 1.5850713501646543, + "grad_norm": 0.947482168674469, + "learning_rate": 1.7390454500701426e-07, + "loss": 0.1862, + "step": 28880 + }, + { + "epoch": 1.5851811196487375, + "grad_norm": 1.1462351083755493, + "learning_rate": 1.732885761268427e-07, + "loss": 0.2101, + "step": 28882 + }, + { + "epoch": 1.5852908891328212, + "grad_norm": 0.7961814999580383, + "learning_rate": 1.726736962680914e-07, + "loss": 0.113, + "step": 28884 + }, + { + "epoch": 1.5854006586169045, + "grad_norm": 1.0731453895568848, + "learning_rate": 1.720599054577332e-07, + "loss": 0.1414, + "step": 28886 + }, + { + "epoch": 1.585510428100988, + "grad_norm": 1.083648681640625, + "learning_rate": 1.7144720372269109e-07, + "loss": 0.1675, + "step": 28888 + }, + { + "epoch": 1.5856201975850714, + "grad_norm": 1.085037350654602, + "learning_rate": 1.7083559108983794e-07, + "loss": 0.1907, + "step": 28890 + }, + { + "epoch": 1.5857299670691547, + "grad_norm": 0.7796459794044495, + "learning_rate": 1.702250675860051e-07, + "loss": 0.1133, + "step": 28892 + }, + { + "epoch": 1.5858397365532382, + "grad_norm": 1.0911381244659424, + "learning_rate": 1.6961563323797393e-07, + "loss": 0.2248, + "step": 28894 + }, + { + "epoch": 1.5859495060373217, + "grad_norm": 0.8413218855857849, + "learning_rate": 1.6900728807247302e-07, + "loss": 0.1576, + "step": 28896 + }, + { + "epoch": 1.586059275521405, + "grad_norm": 1.1602154970169067, + "learning_rate": 1.6840003211619215e-07, + "loss": 0.1405, + "step": 28898 + }, + { + "epoch": 1.5861690450054886, + "grad_norm": 1.2548725605010986, + "learning_rate": 1.6779386539576835e-07, + "loss": 0.1967, + "step": 28900 + }, + { + "epoch": 1.5862788144895719, + "grad_norm": 1.3645623922348022, + "learning_rate": 1.671887879377859e-07, + "loss": 0.2718, + "step": 28902 + }, + { + "epoch": 1.5863885839736553, + "grad_norm": 3.276977777481079, + "learning_rate": 1.6658479976879026e-07, + "loss": 0.1492, + "step": 28904 + }, + { + "epoch": 1.5864983534577388, + "grad_norm": 0.9499015808105469, + "learning_rate": 1.659819009152741e-07, + "loss": 0.2003, + "step": 28906 + }, + { + "epoch": 1.586608122941822, + "grad_norm": 1.7661290168762207, + "learning_rate": 1.6538009140368571e-07, + "loss": 0.2351, + "step": 28908 + }, + { + "epoch": 1.5867178924259056, + "grad_norm": 0.8266311883926392, + "learning_rate": 1.6477937126042342e-07, + "loss": 0.2273, + "step": 28910 + }, + { + "epoch": 1.586827661909989, + "grad_norm": 1.3940701484680176, + "learning_rate": 1.641797405118356e-07, + "loss": 0.2766, + "step": 28912 + }, + { + "epoch": 1.5869374313940723, + "grad_norm": 1.2690274715423584, + "learning_rate": 1.6358119918422622e-07, + "loss": 0.1528, + "step": 28914 + }, + { + "epoch": 1.587047200878156, + "grad_norm": 1.2101161479949951, + "learning_rate": 1.6298374730384925e-07, + "loss": 0.2627, + "step": 28916 + }, + { + "epoch": 1.5871569703622392, + "grad_norm": 1.122997760772705, + "learning_rate": 1.6238738489690874e-07, + "loss": 0.2066, + "step": 28918 + }, + { + "epoch": 1.5872667398463227, + "grad_norm": 1.04216730594635, + "learning_rate": 1.6179211198957266e-07, + "loss": 0.1241, + "step": 28920 + }, + { + "epoch": 1.5873765093304062, + "grad_norm": 1.515215516090393, + "learning_rate": 1.6119792860794513e-07, + "loss": 0.2255, + "step": 28922 + }, + { + "epoch": 1.5874862788144894, + "grad_norm": 1.16465425491333, + "learning_rate": 1.606048347780942e-07, + "loss": 0.159, + "step": 28924 + }, + { + "epoch": 1.5875960482985731, + "grad_norm": 1.1581978797912598, + "learning_rate": 1.6001283052603233e-07, + "loss": 0.1987, + "step": 28926 + }, + { + "epoch": 1.5877058177826564, + "grad_norm": 2.187157392501831, + "learning_rate": 1.5942191587773047e-07, + "loss": 0.19, + "step": 28928 + }, + { + "epoch": 1.5878155872667399, + "grad_norm": 1.348484754562378, + "learning_rate": 1.5883209085910678e-07, + "loss": 0.2018, + "step": 28930 + }, + { + "epoch": 1.5879253567508234, + "grad_norm": 0.9150080680847168, + "learning_rate": 1.5824335549603775e-07, + "loss": 0.1541, + "step": 28932 + }, + { + "epoch": 1.5880351262349066, + "grad_norm": 1.027785301208496, + "learning_rate": 1.576557098143444e-07, + "loss": 0.1571, + "step": 28934 + }, + { + "epoch": 1.58814489571899, + "grad_norm": 1.259670615196228, + "learning_rate": 1.5706915383980335e-07, + "loss": 0.2679, + "step": 28936 + }, + { + "epoch": 1.5882546652030736, + "grad_norm": 2.003995895385742, + "learning_rate": 1.564836875981468e-07, + "loss": 0.2766, + "step": 28938 + }, + { + "epoch": 1.5883644346871568, + "grad_norm": 1.389793872833252, + "learning_rate": 1.5589931111505696e-07, + "loss": 0.242, + "step": 28940 + }, + { + "epoch": 1.5884742041712405, + "grad_norm": 1.1787519454956055, + "learning_rate": 1.5531602441616332e-07, + "loss": 0.2227, + "step": 28942 + }, + { + "epoch": 1.5885839736553238, + "grad_norm": 1.863377332687378, + "learning_rate": 1.5473382752705378e-07, + "loss": 0.1688, + "step": 28944 + }, + { + "epoch": 1.5886937431394073, + "grad_norm": 2.0327720642089844, + "learning_rate": 1.5415272047326622e-07, + "loss": 0.2034, + "step": 28946 + }, + { + "epoch": 1.5888035126234907, + "grad_norm": 1.2158472537994385, + "learning_rate": 1.5357270328029138e-07, + "loss": 0.1846, + "step": 28948 + }, + { + "epoch": 1.588913282107574, + "grad_norm": 0.6744571328163147, + "learning_rate": 1.5299377597357e-07, + "loss": 0.1473, + "step": 28950 + }, + { + "epoch": 1.5890230515916575, + "grad_norm": 1.128176212310791, + "learning_rate": 1.5241593857850124e-07, + "loss": 0.2088, + "step": 28952 + }, + { + "epoch": 1.589132821075741, + "grad_norm": 1.1974782943725586, + "learning_rate": 1.5183919112042312e-07, + "loss": 0.1653, + "step": 28954 + }, + { + "epoch": 1.5892425905598244, + "grad_norm": 1.1996314525604248, + "learning_rate": 1.5126353362464595e-07, + "loss": 0.203, + "step": 28956 + }, + { + "epoch": 1.589352360043908, + "grad_norm": 0.8890063762664795, + "learning_rate": 1.506889661164107e-07, + "loss": 0.2014, + "step": 28958 + }, + { + "epoch": 1.5894621295279912, + "grad_norm": 0.6480370759963989, + "learning_rate": 1.5011548862092773e-07, + "loss": 0.1395, + "step": 28960 + }, + { + "epoch": 1.5895718990120746, + "grad_norm": 1.1202194690704346, + "learning_rate": 1.4954310116334914e-07, + "loss": 0.1504, + "step": 28962 + }, + { + "epoch": 1.589681668496158, + "grad_norm": 1.516942024230957, + "learning_rate": 1.4897180376877983e-07, + "loss": 0.319, + "step": 28964 + }, + { + "epoch": 1.5897914379802414, + "grad_norm": 1.5276541709899902, + "learning_rate": 1.484015964622887e-07, + "loss": 0.2483, + "step": 28966 + }, + { + "epoch": 1.589901207464325, + "grad_norm": 1.1463474035263062, + "learning_rate": 1.478324792688779e-07, + "loss": 0.1639, + "step": 28968 + }, + { + "epoch": 1.5900109769484083, + "grad_norm": 1.6756480932235718, + "learning_rate": 1.4726445221351915e-07, + "loss": 0.2312, + "step": 28970 + }, + { + "epoch": 1.5901207464324918, + "grad_norm": 0.8085098266601562, + "learning_rate": 1.4669751532112308e-07, + "loss": 0.076, + "step": 28972 + }, + { + "epoch": 1.5902305159165753, + "grad_norm": 1.358960747718811, + "learning_rate": 1.4613166861656424e-07, + "loss": 0.1978, + "step": 28974 + }, + { + "epoch": 1.5903402854006585, + "grad_norm": 0.8397647738456726, + "learning_rate": 1.455669121246589e-07, + "loss": 0.1094, + "step": 28976 + }, + { + "epoch": 1.590450054884742, + "grad_norm": 0.9760645627975464, + "learning_rate": 1.4500324587018165e-07, + "loss": 0.1681, + "step": 28978 + }, + { + "epoch": 1.5905598243688255, + "grad_norm": 0.8620776534080505, + "learning_rate": 1.444406698778572e-07, + "loss": 0.147, + "step": 28980 + }, + { + "epoch": 1.5906695938529087, + "grad_norm": 1.5589128732681274, + "learning_rate": 1.438791841723658e-07, + "loss": 0.0868, + "step": 28982 + }, + { + "epoch": 1.5907793633369924, + "grad_norm": 1.0193829536437988, + "learning_rate": 1.4331878877833216e-07, + "loss": 0.1913, + "step": 28984 + }, + { + "epoch": 1.5908891328210757, + "grad_norm": 1.1686547994613647, + "learning_rate": 1.4275948372033942e-07, + "loss": 0.2075, + "step": 28986 + }, + { + "epoch": 1.5909989023051592, + "grad_norm": 1.1884609460830688, + "learning_rate": 1.4220126902292353e-07, + "loss": 0.1236, + "step": 28988 + }, + { + "epoch": 1.5911086717892426, + "grad_norm": 1.1052577495574951, + "learning_rate": 1.4164414471056764e-07, + "loss": 0.1393, + "step": 28990 + }, + { + "epoch": 1.591218441273326, + "grad_norm": 1.1463881731033325, + "learning_rate": 1.4108811080771333e-07, + "loss": 0.2277, + "step": 28992 + }, + { + "epoch": 1.5913282107574096, + "grad_norm": 0.8241549134254456, + "learning_rate": 1.4053316733874944e-07, + "loss": 0.0948, + "step": 28994 + }, + { + "epoch": 1.5914379802414929, + "grad_norm": 1.4738107919692993, + "learning_rate": 1.399793143280176e-07, + "loss": 0.2255, + "step": 28996 + }, + { + "epoch": 1.5915477497255763, + "grad_norm": 1.6219062805175781, + "learning_rate": 1.3942655179981224e-07, + "loss": 0.3175, + "step": 28998 + }, + { + "epoch": 1.5916575192096598, + "grad_norm": 1.0452568531036377, + "learning_rate": 1.3887487977838343e-07, + "loss": 0.1715, + "step": 29000 + }, + { + "epoch": 1.591767288693743, + "grad_norm": 1.0153480768203735, + "learning_rate": 1.383242982879257e-07, + "loss": 0.1625, + "step": 29002 + }, + { + "epoch": 1.5918770581778265, + "grad_norm": 1.0921895503997803, + "learning_rate": 1.3777480735259196e-07, + "loss": 0.1425, + "step": 29004 + }, + { + "epoch": 1.59198682766191, + "grad_norm": 1.700053334236145, + "learning_rate": 1.3722640699648792e-07, + "loss": 0.3802, + "step": 29006 + }, + { + "epoch": 1.5920965971459933, + "grad_norm": 1.2112224102020264, + "learning_rate": 1.3667909724366934e-07, + "loss": 0.2379, + "step": 29008 + }, + { + "epoch": 1.592206366630077, + "grad_norm": 1.2522715330123901, + "learning_rate": 1.3613287811813923e-07, + "loss": 0.1522, + "step": 29010 + }, + { + "epoch": 1.5923161361141602, + "grad_norm": 1.0602061748504639, + "learning_rate": 1.3558774964386177e-07, + "loss": 0.2397, + "step": 29012 + }, + { + "epoch": 1.5924259055982437, + "grad_norm": 1.2421317100524902, + "learning_rate": 1.3504371184474285e-07, + "loss": 0.1894, + "step": 29014 + }, + { + "epoch": 1.5925356750823272, + "grad_norm": 0.8737250566482544, + "learning_rate": 1.34500764744655e-07, + "loss": 0.1636, + "step": 29016 + }, + { + "epoch": 1.5926454445664104, + "grad_norm": Infinity, + "learning_rate": 1.342297002141918e-07, + "loss": 0.1384, + "step": 29018 + }, + { + "epoch": 1.592755214050494, + "grad_norm": 1.511995553970337, + "learning_rate": 1.3368838920728432e-07, + "loss": 0.2554, + "step": 29020 + }, + { + "epoch": 1.5928649835345774, + "grad_norm": 0.6937594413757324, + "learning_rate": 1.3314816895885485e-07, + "loss": 0.1236, + "step": 29022 + }, + { + "epoch": 1.5929747530186606, + "grad_norm": 1.4161083698272705, + "learning_rate": 1.3260903949260107e-07, + "loss": 0.241, + "step": 29024 + }, + { + "epoch": 1.5930845225027443, + "grad_norm": 0.7082158327102661, + "learning_rate": 1.3207100083217071e-07, + "loss": 0.1236, + "step": 29026 + }, + { + "epoch": 1.5931942919868276, + "grad_norm": 1.5163425207138062, + "learning_rate": 1.3153405300116716e-07, + "loss": 0.2179, + "step": 29028 + }, + { + "epoch": 1.593304061470911, + "grad_norm": 1.120238184928894, + "learning_rate": 1.30998196023141e-07, + "loss": 0.168, + "step": 29030 + }, + { + "epoch": 1.5934138309549946, + "grad_norm": 1.1559141874313354, + "learning_rate": 1.3046342992159566e-07, + "loss": 0.1773, + "step": 29032 + }, + { + "epoch": 1.5935236004390778, + "grad_norm": 0.7440048456192017, + "learning_rate": 1.299297547199957e-07, + "loss": 0.2246, + "step": 29034 + }, + { + "epoch": 1.5936333699231615, + "grad_norm": 1.0015923976898193, + "learning_rate": 1.2939717044174183e-07, + "loss": 0.2149, + "step": 29036 + }, + { + "epoch": 1.5937431394072448, + "grad_norm": 1.5430018901824951, + "learning_rate": 1.2886567711020148e-07, + "loss": 0.2105, + "step": 29038 + }, + { + "epoch": 1.5938529088913282, + "grad_norm": 1.148008108139038, + "learning_rate": 1.2833527474868655e-07, + "loss": 0.174, + "step": 29040 + }, + { + "epoch": 1.5939626783754117, + "grad_norm": 1.0070205926895142, + "learning_rate": 1.2780596338046735e-07, + "loss": 0.1282, + "step": 29042 + }, + { + "epoch": 1.594072447859495, + "grad_norm": 1.0002707242965698, + "learning_rate": 1.2727774302875585e-07, + "loss": 0.2345, + "step": 29044 + }, + { + "epoch": 1.5941822173435785, + "grad_norm": 1.2138508558273315, + "learning_rate": 1.2675061371672237e-07, + "loss": 0.2953, + "step": 29046 + }, + { + "epoch": 1.594291986827662, + "grad_norm": 1.0720798969268799, + "learning_rate": 1.2622457546749567e-07, + "loss": 0.2652, + "step": 29048 + }, + { + "epoch": 1.5944017563117452, + "grad_norm": 1.1642191410064697, + "learning_rate": 1.2569962830414618e-07, + "loss": 0.1954, + "step": 29050 + }, + { + "epoch": 1.5945115257958289, + "grad_norm": 1.1249606609344482, + "learning_rate": 1.2517577224970267e-07, + "loss": 0.1494, + "step": 29052 + }, + { + "epoch": 1.5946212952799121, + "grad_norm": 1.3410499095916748, + "learning_rate": 1.2465300732714125e-07, + "loss": 0.2075, + "step": 29054 + }, + { + "epoch": 1.5947310647639956, + "grad_norm": 1.2849171161651611, + "learning_rate": 1.2413133355939356e-07, + "loss": 0.2373, + "step": 29056 + }, + { + "epoch": 1.594840834248079, + "grad_norm": 0.8124650120735168, + "learning_rate": 1.2361075096934406e-07, + "loss": 0.1011, + "step": 29058 + }, + { + "epoch": 1.5949506037321624, + "grad_norm": 1.034619927406311, + "learning_rate": 1.2309125957982448e-07, + "loss": 0.2045, + "step": 29060 + }, + { + "epoch": 1.5950603732162458, + "grad_norm": 0.9701600670814514, + "learning_rate": 1.225728594136305e-07, + "loss": 0.1431, + "step": 29062 + }, + { + "epoch": 1.5951701427003293, + "grad_norm": 0.9573978185653687, + "learning_rate": 1.2205555049349394e-07, + "loss": 0.1191, + "step": 29064 + }, + { + "epoch": 1.5952799121844128, + "grad_norm": 1.4730333089828491, + "learning_rate": 1.215393328421105e-07, + "loss": 0.1163, + "step": 29066 + }, + { + "epoch": 1.5953896816684963, + "grad_norm": 0.773430347442627, + "learning_rate": 1.2102420648212043e-07, + "loss": 0.1406, + "step": 29068 + }, + { + "epoch": 1.5954994511525795, + "grad_norm": 1.4388905763626099, + "learning_rate": 1.205101714361223e-07, + "loss": 0.2449, + "step": 29070 + }, + { + "epoch": 1.595609220636663, + "grad_norm": 0.9954429268836975, + "learning_rate": 1.1999722772666476e-07, + "loss": 0.1222, + "step": 29072 + }, + { + "epoch": 1.5957189901207465, + "grad_norm": 1.4397674798965454, + "learning_rate": 1.194853753762465e-07, + "loss": 0.224, + "step": 29074 + }, + { + "epoch": 1.5958287596048297, + "grad_norm": 1.0186445713043213, + "learning_rate": 1.1897461440732171e-07, + "loss": 0.1604, + "step": 29076 + }, + { + "epoch": 1.5959385290889134, + "grad_norm": 1.193008542060852, + "learning_rate": 1.1846494484229198e-07, + "loss": 0.1738, + "step": 29078 + }, + { + "epoch": 1.5960482985729967, + "grad_norm": 1.033040165901184, + "learning_rate": 1.1795636670351717e-07, + "loss": 0.2152, + "step": 29080 + }, + { + "epoch": 1.5961580680570802, + "grad_norm": 1.239843726158142, + "learning_rate": 1.1744888001330168e-07, + "loss": 0.1401, + "step": 29082 + }, + { + "epoch": 1.5962678375411636, + "grad_norm": 0.5665268898010254, + "learning_rate": 1.1694248479391101e-07, + "loss": 0.1501, + "step": 29084 + }, + { + "epoch": 1.596377607025247, + "grad_norm": 1.2578692436218262, + "learning_rate": 1.1643718106755519e-07, + "loss": 0.2268, + "step": 29086 + }, + { + "epoch": 1.5964873765093304, + "grad_norm": 1.3123027086257935, + "learning_rate": 1.1593296885640259e-07, + "loss": 0.2027, + "step": 29088 + }, + { + "epoch": 1.5965971459934138, + "grad_norm": 0.7842592000961304, + "learning_rate": 1.1542984818256608e-07, + "loss": 0.1193, + "step": 29090 + }, + { + "epoch": 1.596706915477497, + "grad_norm": 1.3903172016143799, + "learning_rate": 1.149278190681169e-07, + "loss": 0.1589, + "step": 29092 + }, + { + "epoch": 1.5968166849615808, + "grad_norm": 1.1456996202468872, + "learning_rate": 1.1442688153507908e-07, + "loss": 0.2095, + "step": 29094 + }, + { + "epoch": 1.596926454445664, + "grad_norm": 1.0488725900650024, + "learning_rate": 1.1392703560542117e-07, + "loss": 0.1669, + "step": 29096 + }, + { + "epoch": 1.5970362239297475, + "grad_norm": 0.9062263369560242, + "learning_rate": 1.1342828130107286e-07, + "loss": 0.2043, + "step": 29098 + }, + { + "epoch": 1.597145993413831, + "grad_norm": 0.9132539629936218, + "learning_rate": 1.129306186439083e-07, + "loss": 0.1129, + "step": 29100 + }, + { + "epoch": 1.5972557628979143, + "grad_norm": 1.039937973022461, + "learning_rate": 1.1243404765576282e-07, + "loss": 0.1678, + "step": 29102 + }, + { + "epoch": 1.597365532381998, + "grad_norm": 1.1273233890533447, + "learning_rate": 1.1193856835841343e-07, + "loss": 0.1258, + "step": 29104 + }, + { + "epoch": 1.5974753018660812, + "grad_norm": 0.8404263854026794, + "learning_rate": 1.1144418077359831e-07, + "loss": 0.152, + "step": 29106 + }, + { + "epoch": 1.5975850713501647, + "grad_norm": 1.4330412149429321, + "learning_rate": 1.109508849230001e-07, + "loss": 0.2336, + "step": 29108 + }, + { + "epoch": 1.5976948408342482, + "grad_norm": 1.1904252767562866, + "learning_rate": 1.1045868082825983e-07, + "loss": 0.2459, + "step": 29110 + }, + { + "epoch": 1.5978046103183314, + "grad_norm": 2.4051289558410645, + "learning_rate": 1.0996756851096579e-07, + "loss": 0.1689, + "step": 29112 + }, + { + "epoch": 1.597914379802415, + "grad_norm": 0.7977201342582703, + "learning_rate": 1.0947754799266186e-07, + "loss": 0.097, + "step": 29114 + }, + { + "epoch": 1.5980241492864984, + "grad_norm": 0.9087691903114319, + "learning_rate": 1.0898861929484194e-07, + "loss": 0.1504, + "step": 29116 + }, + { + "epoch": 1.5981339187705816, + "grad_norm": 1.516135573387146, + "learning_rate": 1.0850078243895279e-07, + "loss": 0.2428, + "step": 29118 + }, + { + "epoch": 1.5982436882546653, + "grad_norm": 1.0990906953811646, + "learning_rate": 1.080140374463967e-07, + "loss": 0.2419, + "step": 29120 + }, + { + "epoch": 1.5983534577387486, + "grad_norm": 1.3121769428253174, + "learning_rate": 1.0752838433852053e-07, + "loss": 0.1848, + "step": 29122 + }, + { + "epoch": 1.598463227222832, + "grad_norm": 1.4717687368392944, + "learning_rate": 1.0704382313662941e-07, + "loss": 0.2978, + "step": 29124 + }, + { + "epoch": 1.5985729967069155, + "grad_norm": 1.2732090950012207, + "learning_rate": 1.0656035386197583e-07, + "loss": 0.271, + "step": 29126 + }, + { + "epoch": 1.5986827661909988, + "grad_norm": 0.8188881874084473, + "learning_rate": 1.0607797653577334e-07, + "loss": 0.1318, + "step": 29128 + }, + { + "epoch": 1.5987925356750823, + "grad_norm": 0.8661463260650635, + "learning_rate": 1.0559669117917447e-07, + "loss": 0.2539, + "step": 29130 + }, + { + "epoch": 1.5989023051591658, + "grad_norm": 1.5395392179489136, + "learning_rate": 1.0511649781329291e-07, + "loss": 0.3093, + "step": 29132 + }, + { + "epoch": 1.599012074643249, + "grad_norm": 1.7248550653457642, + "learning_rate": 1.0463739645919512e-07, + "loss": 0.276, + "step": 29134 + }, + { + "epoch": 1.5991218441273327, + "grad_norm": 1.3700584173202515, + "learning_rate": 1.0415938713789486e-07, + "loss": 0.1693, + "step": 29136 + }, + { + "epoch": 1.599231613611416, + "grad_norm": 1.0688079595565796, + "learning_rate": 1.0368246987035868e-07, + "loss": 0.1378, + "step": 29138 + }, + { + "epoch": 1.5993413830954994, + "grad_norm": 1.3613759279251099, + "learning_rate": 1.0320664467750874e-07, + "loss": 0.2165, + "step": 29140 + }, + { + "epoch": 1.599451152579583, + "grad_norm": 1.0659496784210205, + "learning_rate": 1.0273191158021445e-07, + "loss": 0.2842, + "step": 29142 + }, + { + "epoch": 1.5995609220636662, + "grad_norm": 1.5557259321212769, + "learning_rate": 1.0225827059930083e-07, + "loss": 0.194, + "step": 29144 + }, + { + "epoch": 1.5996706915477499, + "grad_norm": 1.3035979270935059, + "learning_rate": 1.0178572175554846e-07, + "loss": 0.1231, + "step": 29146 + }, + { + "epoch": 1.5997804610318331, + "grad_norm": 1.3240607976913452, + "learning_rate": 1.0131426506967689e-07, + "loss": 0.2533, + "step": 29148 + }, + { + "epoch": 1.5998902305159166, + "grad_norm": 0.7395314574241638, + "learning_rate": 1.0084390056237513e-07, + "loss": 0.1052, + "step": 29150 + }, + { + "epoch": 1.6, + "grad_norm": 0.9474108219146729, + "learning_rate": 1.0037462825427114e-07, + "loss": 0.167, + "step": 29152 + }, + { + "epoch": 1.6001097694840833, + "grad_norm": 0.937870442867279, + "learning_rate": 9.990644816595118e-08, + "loss": 0.1192, + "step": 29154 + }, + { + "epoch": 1.6002195389681668, + "grad_norm": 1.1334218978881836, + "learning_rate": 9.943936031795165e-08, + "loss": 0.2059, + "step": 29156 + }, + { + "epoch": 1.6003293084522503, + "grad_norm": 1.2930004596710205, + "learning_rate": 9.897336473076168e-08, + "loss": 0.2089, + "step": 29158 + }, + { + "epoch": 1.6004390779363336, + "grad_norm": 1.0196129083633423, + "learning_rate": 9.850846142481773e-08, + "loss": 0.1362, + "step": 29160 + }, + { + "epoch": 1.6005488474204173, + "grad_norm": 1.1788069009780884, + "learning_rate": 9.804465042052014e-08, + "loss": 0.1125, + "step": 29162 + }, + { + "epoch": 1.6006586169045005, + "grad_norm": 0.8942195177078247, + "learning_rate": 9.758193173820817e-08, + "loss": 0.1895, + "step": 29164 + }, + { + "epoch": 1.600768386388584, + "grad_norm": 1.0538872480392456, + "learning_rate": 9.71203053981823e-08, + "loss": 0.1941, + "step": 29166 + }, + { + "epoch": 1.6008781558726675, + "grad_norm": 0.9369055032730103, + "learning_rate": 9.665977142068738e-08, + "loss": 0.1503, + "step": 29168 + }, + { + "epoch": 1.6009879253567507, + "grad_norm": 1.2046488523483276, + "learning_rate": 9.620032982592952e-08, + "loss": 0.179, + "step": 29170 + }, + { + "epoch": 1.6010976948408342, + "grad_norm": 1.270339012145996, + "learning_rate": 9.574198063406203e-08, + "loss": 0.2057, + "step": 29172 + }, + { + "epoch": 1.6012074643249177, + "grad_norm": 1.1264677047729492, + "learning_rate": 9.528472386518827e-08, + "loss": 0.18, + "step": 29174 + }, + { + "epoch": 1.6013172338090012, + "grad_norm": 1.5252622365951538, + "learning_rate": 9.482855953936443e-08, + "loss": 0.1858, + "step": 29176 + }, + { + "epoch": 1.6014270032930846, + "grad_norm": 1.1827512979507446, + "learning_rate": 9.437348767659948e-08, + "loss": 0.1789, + "step": 29178 + }, + { + "epoch": 1.6015367727771679, + "grad_norm": 1.2086600065231323, + "learning_rate": 9.39195082968608e-08, + "loss": 0.1973, + "step": 29180 + }, + { + "epoch": 1.6016465422612514, + "grad_norm": 0.948870062828064, + "learning_rate": 9.346662142005747e-08, + "loss": 0.1999, + "step": 29182 + }, + { + "epoch": 1.6017563117453348, + "grad_norm": 1.281783103942871, + "learning_rate": 9.301482706605691e-08, + "loss": 0.2507, + "step": 29184 + }, + { + "epoch": 1.601866081229418, + "grad_norm": 1.2070096731185913, + "learning_rate": 9.256412525467661e-08, + "loss": 0.1706, + "step": 29186 + }, + { + "epoch": 1.6019758507135018, + "grad_norm": 0.9578901529312134, + "learning_rate": 9.211451600568966e-08, + "loss": 0.1429, + "step": 29188 + }, + { + "epoch": 1.602085620197585, + "grad_norm": 0.6979537010192871, + "learning_rate": 9.166599933881081e-08, + "loss": 0.1107, + "step": 29190 + }, + { + "epoch": 1.6021953896816685, + "grad_norm": 1.4112074375152588, + "learning_rate": 9.121857527372158e-08, + "loss": 0.2046, + "step": 29192 + }, + { + "epoch": 1.602305159165752, + "grad_norm": 1.4822046756744385, + "learning_rate": 9.077224383004235e-08, + "loss": 0.2725, + "step": 29194 + }, + { + "epoch": 1.6024149286498353, + "grad_norm": 0.9127855896949768, + "learning_rate": 9.03270050273547e-08, + "loss": 0.1264, + "step": 29196 + }, + { + "epoch": 1.6025246981339187, + "grad_norm": 1.776152491569519, + "learning_rate": 8.988285888519022e-08, + "loss": 0.3397, + "step": 29198 + }, + { + "epoch": 1.6026344676180022, + "grad_norm": 1.8602931499481201, + "learning_rate": 8.943980542302777e-08, + "loss": 0.1633, + "step": 29200 + }, + { + "epoch": 1.6027442371020855, + "grad_norm": 0.9727676510810852, + "learning_rate": 8.899784466030459e-08, + "loss": 0.2091, + "step": 29202 + }, + { + "epoch": 1.6028540065861692, + "grad_norm": 0.6820509433746338, + "learning_rate": 8.855697661640793e-08, + "loss": 0.1483, + "step": 29204 + }, + { + "epoch": 1.6029637760702524, + "grad_norm": 1.1121290922164917, + "learning_rate": 8.811720131067236e-08, + "loss": 0.1714, + "step": 29206 + }, + { + "epoch": 1.603073545554336, + "grad_norm": 1.0541462898254395, + "learning_rate": 8.767851876239074e-08, + "loss": 0.1338, + "step": 29208 + }, + { + "epoch": 1.6031833150384194, + "grad_norm": 1.0018469095230103, + "learning_rate": 8.724092899080882e-08, + "loss": 0.1583, + "step": 29210 + }, + { + "epoch": 1.6032930845225026, + "grad_norm": 1.3193063735961914, + "learning_rate": 8.680443201511679e-08, + "loss": 0.1481, + "step": 29212 + }, + { + "epoch": 1.6034028540065863, + "grad_norm": 1.1398463249206543, + "learning_rate": 8.636902785446322e-08, + "loss": 0.2334, + "step": 29214 + }, + { + "epoch": 1.6035126234906696, + "grad_norm": 1.2406085729599, + "learning_rate": 8.593471652794949e-08, + "loss": 0.243, + "step": 29216 + }, + { + "epoch": 1.603622392974753, + "grad_norm": 0.7810348868370056, + "learning_rate": 8.550149805462149e-08, + "loss": 0.1355, + "step": 29218 + }, + { + "epoch": 1.6037321624588365, + "grad_norm": 0.8885647654533386, + "learning_rate": 8.5069372453489e-08, + "loss": 0.1898, + "step": 29220 + }, + { + "epoch": 1.6038419319429198, + "grad_norm": 1.4413139820098877, + "learning_rate": 8.463833974350078e-08, + "loss": 0.2786, + "step": 29222 + }, + { + "epoch": 1.6039517014270033, + "grad_norm": 1.6370328664779663, + "learning_rate": 8.420839994356666e-08, + "loss": 0.2472, + "step": 29224 + }, + { + "epoch": 1.6040614709110868, + "grad_norm": 0.7212934494018555, + "learning_rate": 8.377955307254937e-08, + "loss": 0.1789, + "step": 29226 + }, + { + "epoch": 1.60417124039517, + "grad_norm": 0.9167606830596924, + "learning_rate": 8.335179914925328e-08, + "loss": 0.2326, + "step": 29228 + }, + { + "epoch": 1.6042810098792537, + "grad_norm": 1.3447834253311157, + "learning_rate": 8.292513819244674e-08, + "loss": 0.2538, + "step": 29230 + }, + { + "epoch": 1.604390779363337, + "grad_norm": 4.95709753036499, + "learning_rate": 8.249957022084254e-08, + "loss": 0.273, + "step": 29232 + }, + { + "epoch": 1.6045005488474204, + "grad_norm": 1.6376181840896606, + "learning_rate": 8.207509525311185e-08, + "loss": 0.2409, + "step": 29234 + }, + { + "epoch": 1.604610318331504, + "grad_norm": 0.831266462802887, + "learning_rate": 8.165171330787036e-08, + "loss": 0.112, + "step": 29236 + }, + { + "epoch": 1.6047200878155872, + "grad_norm": 1.0045970678329468, + "learning_rate": 8.122942440369207e-08, + "loss": 0.2065, + "step": 29238 + }, + { + "epoch": 1.6048298572996706, + "grad_norm": 1.0784149169921875, + "learning_rate": 8.080822855909831e-08, + "loss": 0.2123, + "step": 29240 + }, + { + "epoch": 1.6049396267837541, + "grad_norm": 1.3225395679473877, + "learning_rate": 8.038812579256594e-08, + "loss": 0.2165, + "step": 29242 + }, + { + "epoch": 1.6050493962678374, + "grad_norm": 1.1685281991958618, + "learning_rate": 7.996911612252467e-08, + "loss": 0.2057, + "step": 29244 + }, + { + "epoch": 1.605159165751921, + "grad_norm": 1.2080947160720825, + "learning_rate": 7.955119956735146e-08, + "loss": 0.1061, + "step": 29246 + }, + { + "epoch": 1.6052689352360043, + "grad_norm": 1.2494484186172485, + "learning_rate": 7.913437614538166e-08, + "loss": 0.2081, + "step": 29248 + }, + { + "epoch": 1.6053787047200878, + "grad_norm": 1.2093912363052368, + "learning_rate": 7.871864587489508e-08, + "loss": 0.1827, + "step": 29250 + }, + { + "epoch": 1.6054884742041713, + "grad_norm": 1.5066297054290771, + "learning_rate": 7.830400877412991e-08, + "loss": 0.4665, + "step": 29252 + }, + { + "epoch": 1.6055982436882545, + "grad_norm": 1.3032466173171997, + "learning_rate": 7.789046486127438e-08, + "loss": 0.2038, + "step": 29254 + }, + { + "epoch": 1.6057080131723382, + "grad_norm": 0.735100269317627, + "learning_rate": 7.747801415446677e-08, + "loss": 0.1369, + "step": 29256 + }, + { + "epoch": 1.6058177826564215, + "grad_norm": 1.2707679271697998, + "learning_rate": 7.706665667180091e-08, + "loss": 0.1944, + "step": 29258 + }, + { + "epoch": 1.605927552140505, + "grad_norm": 0.8265319466590881, + "learning_rate": 7.665639243132072e-08, + "loss": 0.1436, + "step": 29260 + }, + { + "epoch": 1.6060373216245885, + "grad_norm": 1.3351401090621948, + "learning_rate": 7.624722145102292e-08, + "loss": 0.2552, + "step": 29262 + }, + { + "epoch": 1.6061470911086717, + "grad_norm": 1.164015293121338, + "learning_rate": 7.583914374885426e-08, + "loss": 0.1839, + "step": 29264 + }, + { + "epoch": 1.6062568605927552, + "grad_norm": 0.6302862763404846, + "learning_rate": 7.54321593427143e-08, + "loss": 0.1475, + "step": 29266 + }, + { + "epoch": 1.6063666300768387, + "grad_norm": 1.1099635362625122, + "learning_rate": 7.502626825045545e-08, + "loss": 0.1403, + "step": 29268 + }, + { + "epoch": 1.606476399560922, + "grad_norm": 1.0783116817474365, + "learning_rate": 7.462147048988843e-08, + "loss": 0.1127, + "step": 29270 + }, + { + "epoch": 1.6065861690450056, + "grad_norm": 0.6752908825874329, + "learning_rate": 7.421776607876019e-08, + "loss": 0.13, + "step": 29272 + }, + { + "epoch": 1.6066959385290889, + "grad_norm": 0.7317644953727722, + "learning_rate": 7.381515503478708e-08, + "loss": 0.2448, + "step": 29274 + }, + { + "epoch": 1.6068057080131724, + "grad_norm": 1.2299559116363525, + "learning_rate": 7.341363737562445e-08, + "loss": 0.1275, + "step": 29276 + }, + { + "epoch": 1.6069154774972558, + "grad_norm": 1.173476219177246, + "learning_rate": 7.301321311888876e-08, + "loss": 0.2871, + "step": 29278 + }, + { + "epoch": 1.607025246981339, + "grad_norm": 0.8827772736549377, + "learning_rate": 7.261388228213816e-08, + "loss": 0.1503, + "step": 29280 + }, + { + "epoch": 1.6071350164654226, + "grad_norm": 0.9488176107406616, + "learning_rate": 7.221564488289756e-08, + "loss": 0.1674, + "step": 29282 + }, + { + "epoch": 1.607244785949506, + "grad_norm": 1.0617505311965942, + "learning_rate": 7.181850093862797e-08, + "loss": 0.1538, + "step": 29284 + }, + { + "epoch": 1.6073545554335895, + "grad_norm": 1.129859447479248, + "learning_rate": 7.14224504667571e-08, + "loss": 0.1926, + "step": 29286 + }, + { + "epoch": 1.607464324917673, + "grad_norm": 0.9921932816505432, + "learning_rate": 7.102749348465165e-08, + "loss": 0.1263, + "step": 29288 + }, + { + "epoch": 1.6075740944017562, + "grad_norm": 0.7193262577056885, + "learning_rate": 7.063363000963941e-08, + "loss": 0.1464, + "step": 29290 + }, + { + "epoch": 1.6076838638858397, + "grad_norm": 0.835702121257782, + "learning_rate": 7.024086005899821e-08, + "loss": 0.1604, + "step": 29292 + }, + { + "epoch": 1.6077936333699232, + "grad_norm": 0.8188895583152771, + "learning_rate": 6.984918364995319e-08, + "loss": 0.0874, + "step": 29294 + }, + { + "epoch": 1.6079034028540065, + "grad_norm": 1.3880102634429932, + "learning_rate": 6.945860079969057e-08, + "loss": 0.2832, + "step": 29296 + }, + { + "epoch": 1.6080131723380902, + "grad_norm": 0.9544348120689392, + "learning_rate": 6.906911152533558e-08, + "loss": 0.2905, + "step": 29298 + }, + { + "epoch": 1.6081229418221734, + "grad_norm": 1.0866734981536865, + "learning_rate": 6.868071584398006e-08, + "loss": 0.121, + "step": 29300 + }, + { + "epoch": 1.608232711306257, + "grad_norm": 1.2121020555496216, + "learning_rate": 6.829341377266041e-08, + "loss": 0.1613, + "step": 29302 + }, + { + "epoch": 1.6083424807903404, + "grad_norm": 1.1025727987289429, + "learning_rate": 6.790720532836025e-08, + "loss": 0.2512, + "step": 29304 + }, + { + "epoch": 1.6084522502744236, + "grad_norm": 0.8304786086082458, + "learning_rate": 6.752209052802439e-08, + "loss": 0.1743, + "step": 29306 + }, + { + "epoch": 1.608562019758507, + "grad_norm": 1.928006887435913, + "learning_rate": 6.71380693885476e-08, + "loss": 0.1792, + "step": 29308 + }, + { + "epoch": 1.6086717892425906, + "grad_norm": 0.8119585514068604, + "learning_rate": 6.675514192677202e-08, + "loss": 0.1365, + "step": 29310 + }, + { + "epoch": 1.6087815587266738, + "grad_norm": 1.188492774963379, + "learning_rate": 6.637330815949527e-08, + "loss": 0.1953, + "step": 29312 + }, + { + "epoch": 1.6088913282107575, + "grad_norm": 1.2583588361740112, + "learning_rate": 6.59925681034651e-08, + "loss": 0.178, + "step": 29314 + }, + { + "epoch": 1.6090010976948408, + "grad_norm": 1.5779341459274292, + "learning_rate": 6.561292177538481e-08, + "loss": 0.1974, + "step": 29316 + }, + { + "epoch": 1.6091108671789243, + "grad_norm": 1.3752493858337402, + "learning_rate": 6.523436919190773e-08, + "loss": 0.2205, + "step": 29318 + }, + { + "epoch": 1.6092206366630077, + "grad_norm": 0.9482775926589966, + "learning_rate": 6.485691036964003e-08, + "loss": 0.146, + "step": 29320 + }, + { + "epoch": 1.609330406147091, + "grad_norm": 1.200391173362732, + "learning_rate": 6.448054532513514e-08, + "loss": 0.2065, + "step": 29322 + }, + { + "epoch": 1.6094401756311747, + "grad_norm": 1.450644850730896, + "learning_rate": 6.410527407490485e-08, + "loss": 0.1559, + "step": 29324 + }, + { + "epoch": 1.609549945115258, + "grad_norm": 1.0776416063308716, + "learning_rate": 6.373109663540822e-08, + "loss": 0.2204, + "step": 29326 + }, + { + "epoch": 1.6096597145993414, + "grad_norm": 1.18860924243927, + "learning_rate": 6.335801302306265e-08, + "loss": 0.1119, + "step": 29328 + }, + { + "epoch": 1.609769484083425, + "grad_norm": 1.405019998550415, + "learning_rate": 6.298602325422731e-08, + "loss": 0.1496, + "step": 29330 + }, + { + "epoch": 1.6098792535675082, + "grad_norm": 1.000659704208374, + "learning_rate": 6.261512734522524e-08, + "loss": 0.1059, + "step": 29332 + }, + { + "epoch": 1.6099890230515916, + "grad_norm": 0.9969455599784851, + "learning_rate": 6.224532531232397e-08, + "loss": 0.1127, + "step": 29334 + }, + { + "epoch": 1.6100987925356751, + "grad_norm": 1.3942983150482178, + "learning_rate": 6.187661717174386e-08, + "loss": 0.1564, + "step": 29336 + }, + { + "epoch": 1.6102085620197584, + "grad_norm": 1.1304486989974976, + "learning_rate": 6.150900293966089e-08, + "loss": 0.1762, + "step": 29338 + }, + { + "epoch": 1.610318331503842, + "grad_norm": 0.9426811337471008, + "learning_rate": 6.114248263219546e-08, + "loss": 0.1236, + "step": 29340 + }, + { + "epoch": 1.6104281009879253, + "grad_norm": 0.8973815441131592, + "learning_rate": 6.077705626542918e-08, + "loss": 0.1603, + "step": 29342 + }, + { + "epoch": 1.6105378704720088, + "grad_norm": 1.7917474508285522, + "learning_rate": 6.04127238553881e-08, + "loss": 0.158, + "step": 29344 + }, + { + "epoch": 1.6106476399560923, + "grad_norm": 1.0510283708572388, + "learning_rate": 6.004948541805943e-08, + "loss": 0.2011, + "step": 29346 + }, + { + "epoch": 1.6107574094401755, + "grad_norm": 1.5638220310211182, + "learning_rate": 5.968734096936935e-08, + "loss": 0.2175, + "step": 29348 + }, + { + "epoch": 1.610867178924259, + "grad_norm": 1.7095197439193726, + "learning_rate": 5.9326290525207885e-08, + "loss": 0.2525, + "step": 29350 + }, + { + "epoch": 1.6109769484083425, + "grad_norm": 1.2719436883926392, + "learning_rate": 5.896633410141239e-08, + "loss": 0.2003, + "step": 29352 + }, + { + "epoch": 1.6110867178924257, + "grad_norm": 1.7148799896240234, + "learning_rate": 5.860747171377024e-08, + "loss": 0.1929, + "step": 29354 + }, + { + "epoch": 1.6111964873765094, + "grad_norm": 0.9551694393157959, + "learning_rate": 5.8249703378024376e-08, + "loss": 0.2204, + "step": 29356 + }, + { + "epoch": 1.6113062568605927, + "grad_norm": 1.1353020668029785, + "learning_rate": 5.7893029109867823e-08, + "loss": 0.2501, + "step": 29358 + }, + { + "epoch": 1.6114160263446762, + "grad_norm": 0.9897851943969727, + "learning_rate": 5.753744892494639e-08, + "loss": 0.1791, + "step": 29360 + }, + { + "epoch": 1.6115257958287597, + "grad_norm": 1.7150416374206543, + "learning_rate": 5.7182962838855934e-08, + "loss": 0.2076, + "step": 29362 + }, + { + "epoch": 1.611635565312843, + "grad_norm": 1.4404141902923584, + "learning_rate": 5.682957086714791e-08, + "loss": 0.1988, + "step": 29364 + }, + { + "epoch": 1.6117453347969266, + "grad_norm": 0.7290984392166138, + "learning_rate": 5.64772730253238e-08, + "loss": 0.1184, + "step": 29366 + }, + { + "epoch": 1.6118551042810099, + "grad_norm": 1.3075817823410034, + "learning_rate": 5.612606932883513e-08, + "loss": 0.1591, + "step": 29368 + }, + { + "epoch": 1.6119648737650933, + "grad_norm": 1.4910664558410645, + "learning_rate": 5.5775959793091804e-08, + "loss": 0.0746, + "step": 29370 + }, + { + "epoch": 1.6120746432491768, + "grad_norm": 1.1657097339630127, + "learning_rate": 5.5426944433445426e-08, + "loss": 0.2441, + "step": 29372 + }, + { + "epoch": 1.61218441273326, + "grad_norm": 1.2185498476028442, + "learning_rate": 5.507902326520875e-08, + "loss": 0.1422, + "step": 29374 + }, + { + "epoch": 1.6122941822173436, + "grad_norm": 1.1119306087493896, + "learning_rate": 5.473219630364457e-08, + "loss": 0.2714, + "step": 29376 + }, + { + "epoch": 1.612403951701427, + "grad_norm": 1.012871503829956, + "learning_rate": 5.438646356396293e-08, + "loss": 0.1625, + "step": 29378 + }, + { + "epoch": 1.6125137211855103, + "grad_norm": 1.1204299926757812, + "learning_rate": 5.404182506133226e-08, + "loss": 0.1711, + "step": 29380 + }, + { + "epoch": 1.612623490669594, + "grad_norm": 1.2142595052719116, + "learning_rate": 5.3698280810871025e-08, + "loss": 0.1636, + "step": 29382 + }, + { + "epoch": 1.6127332601536772, + "grad_norm": 1.3390142917633057, + "learning_rate": 5.335583082764495e-08, + "loss": 0.2043, + "step": 29384 + }, + { + "epoch": 1.6128430296377607, + "grad_norm": 1.715437650680542, + "learning_rate": 5.301447512667812e-08, + "loss": 0.2435, + "step": 29386 + }, + { + "epoch": 1.6129527991218442, + "grad_norm": 1.6079078912734985, + "learning_rate": 5.26742137229419e-08, + "loss": 0.2063, + "step": 29388 + }, + { + "epoch": 1.6130625686059274, + "grad_norm": 1.0094435214996338, + "learning_rate": 5.233504663136324e-08, + "loss": 0.114, + "step": 29390 + }, + { + "epoch": 1.613172338090011, + "grad_norm": 1.4247552156448364, + "learning_rate": 5.1996973866821894e-08, + "loss": 0.1925, + "step": 29392 + }, + { + "epoch": 1.6132821075740944, + "grad_norm": 1.0373398065567017, + "learning_rate": 5.1659995444142125e-08, + "loss": 0.1548, + "step": 29394 + }, + { + "epoch": 1.6133918770581779, + "grad_norm": 1.0358266830444336, + "learning_rate": 5.13241113781121e-08, + "loss": 0.1661, + "step": 29396 + }, + { + "epoch": 1.6135016465422614, + "grad_norm": 1.2947111129760742, + "learning_rate": 5.0989321683458935e-08, + "loss": 0.1857, + "step": 29398 + }, + { + "epoch": 1.6136114160263446, + "grad_norm": 1.3054050207138062, + "learning_rate": 5.065562637487364e-08, + "loss": 0.1283, + "step": 29400 + }, + { + "epoch": 1.613721185510428, + "grad_norm": 1.2376654148101807, + "learning_rate": 5.032302546698897e-08, + "loss": 0.1698, + "step": 29402 + }, + { + "epoch": 1.6138309549945116, + "grad_norm": 1.2750352621078491, + "learning_rate": 4.99915189743988e-08, + "loss": 0.1931, + "step": 29404 + }, + { + "epoch": 1.6139407244785948, + "grad_norm": 1.260367512702942, + "learning_rate": 4.9661106911641495e-08, + "loss": 0.1454, + "step": 29406 + }, + { + "epoch": 1.6140504939626785, + "grad_norm": 1.0019737482070923, + "learning_rate": 4.9331789293211026e-08, + "loss": 0.108, + "step": 29408 + }, + { + "epoch": 1.6141602634467618, + "grad_norm": 1.1385926008224487, + "learning_rate": 4.900356613355417e-08, + "loss": 0.248, + "step": 29410 + }, + { + "epoch": 1.6142700329308453, + "grad_norm": 0.933291494846344, + "learning_rate": 4.867643744706774e-08, + "loss": 0.1718, + "step": 29412 + }, + { + "epoch": 1.6143798024149287, + "grad_norm": 0.9816680550575256, + "learning_rate": 4.835040324809858e-08, + "loss": 0.1332, + "step": 29414 + }, + { + "epoch": 1.614489571899012, + "grad_norm": 1.895599126815796, + "learning_rate": 4.802546355095472e-08, + "loss": 0.1824, + "step": 29416 + }, + { + "epoch": 1.6145993413830955, + "grad_norm": 1.2280726432800293, + "learning_rate": 4.770161836988307e-08, + "loss": 0.1554, + "step": 29418 + }, + { + "epoch": 1.614709110867179, + "grad_norm": 1.085747241973877, + "learning_rate": 4.737886771909172e-08, + "loss": 0.1777, + "step": 29420 + }, + { + "epoch": 1.6148188803512622, + "grad_norm": 1.2496004104614258, + "learning_rate": 4.705721161273602e-08, + "loss": 0.1788, + "step": 29422 + }, + { + "epoch": 1.614928649835346, + "grad_norm": 1.5889400243759155, + "learning_rate": 4.6736650064929666e-08, + "loss": 0.1777, + "step": 29424 + }, + { + "epoch": 1.6150384193194292, + "grad_norm": 0.9209848642349243, + "learning_rate": 4.6417183089730866e-08, + "loss": 0.2564, + "step": 29426 + }, + { + "epoch": 1.6151481888035126, + "grad_norm": 0.9471473097801208, + "learning_rate": 4.609881070115618e-08, + "loss": 0.2317, + "step": 29428 + }, + { + "epoch": 1.615257958287596, + "grad_norm": 1.5057287216186523, + "learning_rate": 4.578153291316667e-08, + "loss": 0.1776, + "step": 29430 + }, + { + "epoch": 1.6153677277716794, + "grad_norm": 1.0324440002441406, + "learning_rate": 4.546534973968175e-08, + "loss": 0.1606, + "step": 29432 + }, + { + "epoch": 1.615477497255763, + "grad_norm": 1.408877968788147, + "learning_rate": 4.515026119456811e-08, + "loss": 0.258, + "step": 29434 + }, + { + "epoch": 1.6155872667398463, + "grad_norm": 1.1265052556991577, + "learning_rate": 4.4836267291653575e-08, + "loss": 0.1825, + "step": 29436 + }, + { + "epoch": 1.6156970362239298, + "grad_norm": 1.2479749917984009, + "learning_rate": 4.4523368044704916e-08, + "loss": 0.1492, + "step": 29438 + }, + { + "epoch": 1.6158068057080133, + "grad_norm": 1.483195424079895, + "learning_rate": 4.421156346745281e-08, + "loss": 0.2021, + "step": 29440 + }, + { + "epoch": 1.6159165751920965, + "grad_norm": 0.9881606101989746, + "learning_rate": 4.390085357356966e-08, + "loss": 0.2101, + "step": 29442 + }, + { + "epoch": 1.61602634467618, + "grad_norm": 1.1049318313598633, + "learning_rate": 4.359123837668622e-08, + "loss": 0.1825, + "step": 29444 + }, + { + "epoch": 1.6161361141602635, + "grad_norm": 1.3122570514678955, + "learning_rate": 4.328271789038607e-08, + "loss": 0.1958, + "step": 29446 + }, + { + "epoch": 1.6162458836443467, + "grad_norm": 1.4831756353378296, + "learning_rate": 4.2975292128200064e-08, + "loss": 0.193, + "step": 29448 + }, + { + "epoch": 1.6163556531284304, + "grad_norm": 1.7231403589248657, + "learning_rate": 4.266896110361185e-08, + "loss": 0.1906, + "step": 29450 + }, + { + "epoch": 1.6164654226125137, + "grad_norm": 0.9171730875968933, + "learning_rate": 4.2363724830063456e-08, + "loss": 0.1539, + "step": 29452 + }, + { + "epoch": 1.6165751920965972, + "grad_norm": 2.0593743324279785, + "learning_rate": 4.2059583320941396e-08, + "loss": 0.1791, + "step": 29454 + }, + { + "epoch": 1.6166849615806806, + "grad_norm": 1.3020176887512207, + "learning_rate": 4.1756536589585004e-08, + "loss": 0.1517, + "step": 29456 + }, + { + "epoch": 1.616794731064764, + "grad_norm": 1.0171526670455933, + "learning_rate": 4.145458464929197e-08, + "loss": 0.1448, + "step": 29458 + }, + { + "epoch": 1.6169045005488474, + "grad_norm": 0.9759056568145752, + "learning_rate": 4.115372751330171e-08, + "loss": 0.1406, + "step": 29460 + }, + { + "epoch": 1.6170142700329309, + "grad_norm": 0.8903030157089233, + "learning_rate": 4.0853965194817547e-08, + "loss": 0.1938, + "step": 29462 + }, + { + "epoch": 1.6171240395170141, + "grad_norm": 1.3095186948776245, + "learning_rate": 4.055529770698174e-08, + "loss": 0.2095, + "step": 29464 + }, + { + "epoch": 1.6172338090010978, + "grad_norm": 0.8077903985977173, + "learning_rate": 4.0257725062900485e-08, + "loss": 0.2094, + "step": 29466 + }, + { + "epoch": 1.617343578485181, + "grad_norm": 1.638623595237732, + "learning_rate": 3.996124727562445e-08, + "loss": 0.2074, + "step": 29468 + }, + { + "epoch": 1.6174533479692645, + "grad_norm": 1.8297480344772339, + "learning_rate": 3.966586435815989e-08, + "loss": 0.2184, + "step": 29470 + }, + { + "epoch": 1.617563117453348, + "grad_norm": 0.6737520694732666, + "learning_rate": 3.937157632346311e-08, + "loss": 0.1673, + "step": 29472 + }, + { + "epoch": 1.6176728869374313, + "grad_norm": 1.1153099536895752, + "learning_rate": 3.907838318444046e-08, + "loss": 0.1941, + "step": 29474 + }, + { + "epoch": 1.617782656421515, + "grad_norm": 1.1895301342010498, + "learning_rate": 3.878628495395664e-08, + "loss": 0.1546, + "step": 29476 + }, + { + "epoch": 1.6178924259055982, + "grad_norm": 0.9299042224884033, + "learning_rate": 3.849528164482363e-08, + "loss": 0.1571, + "step": 29478 + }, + { + "epoch": 1.6180021953896817, + "grad_norm": 1.5054278373718262, + "learning_rate": 3.820537326980622e-08, + "loss": 0.1524, + "step": 29480 + }, + { + "epoch": 1.6181119648737652, + "grad_norm": 1.2107890844345093, + "learning_rate": 3.791655984162201e-08, + "loss": 0.1139, + "step": 29482 + }, + { + "epoch": 1.6182217343578484, + "grad_norm": 0.8580663204193115, + "learning_rate": 3.762884137293587e-08, + "loss": 0.1961, + "step": 29484 + }, + { + "epoch": 1.618331503841932, + "grad_norm": 1.2480738162994385, + "learning_rate": 3.734221787637382e-08, + "loss": 0.1967, + "step": 29486 + }, + { + "epoch": 1.6184412733260154, + "grad_norm": 1.1009520292282104, + "learning_rate": 3.705668936450357e-08, + "loss": 0.1322, + "step": 29488 + }, + { + "epoch": 1.6185510428100987, + "grad_norm": 1.2009975910186768, + "learning_rate": 3.6772255849853996e-08, + "loss": 0.1361, + "step": 29490 + }, + { + "epoch": 1.6186608122941823, + "grad_norm": 1.1292070150375366, + "learning_rate": 3.648891734490123e-08, + "loss": 0.1464, + "step": 29492 + }, + { + "epoch": 1.6187705817782656, + "grad_norm": 0.9447659850120544, + "learning_rate": 3.620667386207144e-08, + "loss": 0.2366, + "step": 29494 + }, + { + "epoch": 1.618880351262349, + "grad_norm": 1.2623512744903564, + "learning_rate": 3.592552541374361e-08, + "loss": 0.127, + "step": 29496 + }, + { + "epoch": 1.6189901207464326, + "grad_norm": 1.2337360382080078, + "learning_rate": 3.5645472012257874e-08, + "loss": 0.1719, + "step": 29498 + }, + { + "epoch": 1.6190998902305158, + "grad_norm": 1.3007798194885254, + "learning_rate": 3.536651366989052e-08, + "loss": 0.2132, + "step": 29500 + }, + { + "epoch": 1.6192096597145993, + "grad_norm": 0.9208580255508423, + "learning_rate": 3.508865039888176e-08, + "loss": 0.1722, + "step": 29502 + }, + { + "epoch": 1.6193194291986828, + "grad_norm": 1.207822561264038, + "learning_rate": 3.481188221142184e-08, + "loss": 0.1339, + "step": 29504 + }, + { + "epoch": 1.619429198682766, + "grad_norm": 1.464762806892395, + "learning_rate": 3.453620911964828e-08, + "loss": 0.2967, + "step": 29506 + }, + { + "epoch": 1.6195389681668497, + "grad_norm": 0.8531780242919922, + "learning_rate": 3.426163113565417e-08, + "loss": 0.2038, + "step": 29508 + }, + { + "epoch": 1.619648737650933, + "grad_norm": 1.0527937412261963, + "learning_rate": 3.398814827148267e-08, + "loss": 0.1432, + "step": 29510 + }, + { + "epoch": 1.6197585071350165, + "grad_norm": 1.5486663579940796, + "learning_rate": 3.37157605391325e-08, + "loss": 0.2592, + "step": 29512 + }, + { + "epoch": 1.6198682766191, + "grad_norm": 0.9336459040641785, + "learning_rate": 3.3444467950549676e-08, + "loss": 0.1082, + "step": 29514 + }, + { + "epoch": 1.6199780461031832, + "grad_norm": 0.8143896460533142, + "learning_rate": 3.317427051763855e-08, + "loss": 0.1111, + "step": 29516 + }, + { + "epoch": 1.6200878155872669, + "grad_norm": 1.4046810865402222, + "learning_rate": 3.290516825224521e-08, + "loss": 0.1641, + "step": 29518 + }, + { + "epoch": 1.6201975850713501, + "grad_norm": 0.9319739937782288, + "learning_rate": 3.263716116617965e-08, + "loss": 0.1016, + "step": 29520 + }, + { + "epoch": 1.6203073545554336, + "grad_norm": 1.2015315294265747, + "learning_rate": 3.237024927119359e-08, + "loss": 0.196, + "step": 29522 + }, + { + "epoch": 1.620417124039517, + "grad_norm": 0.7921619415283203, + "learning_rate": 3.210443257899709e-08, + "loss": 0.1833, + "step": 29524 + }, + { + "epoch": 1.6205268935236004, + "grad_norm": 0.7913414239883423, + "learning_rate": 3.1839711101247506e-08, + "loss": 0.1111, + "step": 29526 + }, + { + "epoch": 1.6206366630076838, + "grad_norm": 0.8257393836975098, + "learning_rate": 3.157608484956332e-08, + "loss": 0.0871, + "step": 29528 + }, + { + "epoch": 1.6207464324917673, + "grad_norm": 1.103928565979004, + "learning_rate": 3.131355383550194e-08, + "loss": 0.1777, + "step": 29530 + }, + { + "epoch": 1.6208562019758506, + "grad_norm": 1.3228057622909546, + "learning_rate": 3.105211807058195e-08, + "loss": 0.1598, + "step": 29532 + }, + { + "epoch": 1.6209659714599343, + "grad_norm": 0.8902404308319092, + "learning_rate": 3.079177756627194e-08, + "loss": 0.2588, + "step": 29534 + }, + { + "epoch": 1.6210757409440175, + "grad_norm": 2.2598090171813965, + "learning_rate": 3.053253233398779e-08, + "loss": 0.2636, + "step": 29536 + }, + { + "epoch": 1.621185510428101, + "grad_norm": 1.0685268640518188, + "learning_rate": 3.0274382385106494e-08, + "loss": 0.1602, + "step": 29538 + }, + { + "epoch": 1.6212952799121845, + "grad_norm": 1.4462857246398926, + "learning_rate": 3.001732773094679e-08, + "loss": 0.1342, + "step": 29540 + }, + { + "epoch": 1.6214050493962677, + "grad_norm": 1.2464667558670044, + "learning_rate": 2.976136838279131e-08, + "loss": 0.1309, + "step": 29542 + }, + { + "epoch": 1.6215148188803514, + "grad_norm": 1.1142559051513672, + "learning_rate": 2.950650435186164e-08, + "loss": 0.1696, + "step": 29544 + }, + { + "epoch": 1.6216245883644347, + "grad_norm": 0.8316273093223572, + "learning_rate": 2.9252735649337726e-08, + "loss": 0.211, + "step": 29546 + }, + { + "epoch": 1.6217343578485182, + "grad_norm": 1.1920337677001953, + "learning_rate": 2.9000062286352324e-08, + "loss": 0.256, + "step": 29548 + }, + { + "epoch": 1.6218441273326016, + "grad_norm": 0.883554995059967, + "learning_rate": 2.8748484273991017e-08, + "loss": 0.1704, + "step": 29550 + }, + { + "epoch": 1.621953896816685, + "grad_norm": 0.9455184936523438, + "learning_rate": 2.8498001623286642e-08, + "loss": 0.1008, + "step": 29552 + }, + { + "epoch": 1.6220636663007684, + "grad_norm": 0.9884763956069946, + "learning_rate": 2.8248614345224854e-08, + "loss": 0.2553, + "step": 29554 + }, + { + "epoch": 1.6221734357848518, + "grad_norm": 0.9210255742073059, + "learning_rate": 2.8000322450749684e-08, + "loss": 0.1118, + "step": 29556 + }, + { + "epoch": 1.622283205268935, + "grad_norm": 0.9909477233886719, + "learning_rate": 2.7753125950752413e-08, + "loss": 0.1899, + "step": 29558 + }, + { + "epoch": 1.6223929747530188, + "grad_norm": 1.1354327201843262, + "learning_rate": 2.7507024856071593e-08, + "loss": 0.1615, + "step": 29560 + }, + { + "epoch": 1.622502744237102, + "grad_norm": 1.125803828239441, + "learning_rate": 2.726201917750415e-08, + "loss": 0.1533, + "step": 29562 + }, + { + "epoch": 1.6226125137211855, + "grad_norm": 0.9938721060752869, + "learning_rate": 2.7018108925797038e-08, + "loss": 0.1325, + "step": 29564 + }, + { + "epoch": 1.622722283205269, + "grad_norm": 0.8249375224113464, + "learning_rate": 2.6775294111652804e-08, + "loss": 0.1317, + "step": 29566 + }, + { + "epoch": 1.6228320526893523, + "grad_norm": 1.23310387134552, + "learning_rate": 2.6533574745718494e-08, + "loss": 0.1442, + "step": 29568 + }, + { + "epoch": 1.6229418221734357, + "grad_norm": 0.8344631791114807, + "learning_rate": 2.6292950838599507e-08, + "loss": 0.1189, + "step": 29570 + }, + { + "epoch": 1.6230515916575192, + "grad_norm": 1.172223687171936, + "learning_rate": 2.6053422400848516e-08, + "loss": 0.1844, + "step": 29572 + }, + { + "epoch": 1.6231613611416025, + "grad_norm": 1.6362998485565186, + "learning_rate": 2.5814989442976556e-08, + "loss": 0.173, + "step": 29574 + }, + { + "epoch": 1.6232711306256862, + "grad_norm": 9.911587715148926, + "learning_rate": 2.557765197543638e-08, + "loss": 0.2045, + "step": 29576 + }, + { + "epoch": 1.6233809001097694, + "grad_norm": 1.2352392673492432, + "learning_rate": 2.534141000864465e-08, + "loss": 0.1229, + "step": 29578 + }, + { + "epoch": 1.623490669593853, + "grad_norm": 0.9182162880897522, + "learning_rate": 2.5106263552959752e-08, + "loss": 0.2199, + "step": 29580 + }, + { + "epoch": 1.6236004390779364, + "grad_norm": 1.1451672315597534, + "learning_rate": 2.4872212618698432e-08, + "loss": 0.2389, + "step": 29582 + }, + { + "epoch": 1.6237102085620196, + "grad_norm": 0.6725055575370789, + "learning_rate": 2.4639257216127477e-08, + "loss": 0.1573, + "step": 29584 + }, + { + "epoch": 1.6238199780461033, + "grad_norm": 1.0498212575912476, + "learning_rate": 2.4407397355466487e-08, + "loss": 0.2178, + "step": 29586 + }, + { + "epoch": 1.6239297475301866, + "grad_norm": 0.7951468229293823, + "learning_rate": 2.4176633046882335e-08, + "loss": 0.1895, + "step": 29588 + }, + { + "epoch": 1.62403951701427, + "grad_norm": 1.1599266529083252, + "learning_rate": 2.3946964300500252e-08, + "loss": 0.1355, + "step": 29590 + }, + { + "epoch": 1.6241492864983536, + "grad_norm": 1.1614048480987549, + "learning_rate": 2.3718391126392738e-08, + "loss": 0.1626, + "step": 29592 + }, + { + "epoch": 1.6242590559824368, + "grad_norm": 1.0304710865020752, + "learning_rate": 2.3490913534590654e-08, + "loss": 0.1339, + "step": 29594 + }, + { + "epoch": 1.6243688254665203, + "grad_norm": 1.218307375907898, + "learning_rate": 2.326453153506658e-08, + "loss": 0.1365, + "step": 29596 + }, + { + "epoch": 1.6244785949506038, + "grad_norm": 0.930066704750061, + "learning_rate": 2.3039245137751463e-08, + "loss": 0.1222, + "step": 29598 + }, + { + "epoch": 1.624588364434687, + "grad_norm": 1.889181137084961, + "learning_rate": 2.281505435253184e-08, + "loss": 0.258, + "step": 29600 + }, + { + "epoch": 1.6246981339187707, + "grad_norm": 0.9920958280563354, + "learning_rate": 2.2591959189238733e-08, + "loss": 0.1756, + "step": 29602 + }, + { + "epoch": 1.624807903402854, + "grad_norm": 1.0970691442489624, + "learning_rate": 2.2369959657658755e-08, + "loss": 0.1757, + "step": 29604 + }, + { + "epoch": 1.6249176728869374, + "grad_norm": 1.3350337743759155, + "learning_rate": 2.2149055767528572e-08, + "loss": 0.1487, + "step": 29606 + }, + { + "epoch": 1.625027442371021, + "grad_norm": 0.9048160910606384, + "learning_rate": 2.192924752854042e-08, + "loss": 0.1225, + "step": 29608 + }, + { + "epoch": 1.6251372118551042, + "grad_norm": 1.073733925819397, + "learning_rate": 2.171053495033659e-08, + "loss": 0.1133, + "step": 29610 + }, + { + "epoch": 1.6252469813391877, + "grad_norm": 1.3256202936172485, + "learning_rate": 2.1492918042506637e-08, + "loss": 0.2196, + "step": 29612 + }, + { + "epoch": 1.6253567508232711, + "grad_norm": 0.9836252927780151, + "learning_rate": 2.1276396814601252e-08, + "loss": 0.1323, + "step": 29614 + }, + { + "epoch": 1.6254665203073544, + "grad_norm": 1.265288233757019, + "learning_rate": 2.106097127611284e-08, + "loss": 0.1775, + "step": 29616 + }, + { + "epoch": 1.625576289791438, + "grad_norm": 0.8030617833137512, + "learning_rate": 2.0846641436497726e-08, + "loss": 0.1009, + "step": 29618 + }, + { + "epoch": 1.6256860592755213, + "grad_norm": 1.0393444299697876, + "learning_rate": 2.0633407305151174e-08, + "loss": 0.1684, + "step": 29620 + }, + { + "epoch": 1.6257958287596048, + "grad_norm": 0.9940599203109741, + "learning_rate": 2.042126889142959e-08, + "loss": 0.1436, + "step": 29622 + }, + { + "epoch": 1.6259055982436883, + "grad_norm": 1.0990005731582642, + "learning_rate": 2.0210226204639414e-08, + "loss": 0.2027, + "step": 29624 + }, + { + "epoch": 1.6260153677277716, + "grad_norm": 1.614382266998291, + "learning_rate": 2.0000279254037135e-08, + "loss": 0.2702, + "step": 29626 + }, + { + "epoch": 1.6261251372118553, + "grad_norm": 0.7891789674758911, + "learning_rate": 1.9791428048829275e-08, + "loss": 0.1701, + "step": 29628 + }, + { + "epoch": 1.6262349066959385, + "grad_norm": 1.2623695135116577, + "learning_rate": 1.9583672598180725e-08, + "loss": 0.1358, + "step": 29630 + }, + { + "epoch": 1.626344676180022, + "grad_norm": 1.9334652423858643, + "learning_rate": 1.937701291120364e-08, + "loss": 0.2127, + "step": 29632 + }, + { + "epoch": 1.6264544456641055, + "grad_norm": 0.975222647190094, + "learning_rate": 1.9171448996962994e-08, + "loss": 0.1403, + "step": 29634 + }, + { + "epoch": 1.6265642151481887, + "grad_norm": 0.9629133343696594, + "learning_rate": 1.896698086447657e-08, + "loss": 0.1347, + "step": 29636 + }, + { + "epoch": 1.6266739846322722, + "grad_norm": 2.1230270862579346, + "learning_rate": 1.876360852270942e-08, + "loss": 0.1198, + "step": 29638 + }, + { + "epoch": 1.6267837541163557, + "grad_norm": 1.2168362140655518, + "learning_rate": 1.856133198058774e-08, + "loss": 0.1578, + "step": 29640 + }, + { + "epoch": 1.626893523600439, + "grad_norm": 0.8383287787437439, + "learning_rate": 1.8360151246982203e-08, + "loss": 0.1659, + "step": 29642 + }, + { + "epoch": 1.6270032930845226, + "grad_norm": 0.8203924298286438, + "learning_rate": 1.8160066330716318e-08, + "loss": 0.2103, + "step": 29644 + }, + { + "epoch": 1.6271130625686059, + "grad_norm": 3.3696675300598145, + "learning_rate": 1.796107724056917e-08, + "loss": 0.2653, + "step": 29646 + }, + { + "epoch": 1.6272228320526894, + "grad_norm": 0.8176916241645813, + "learning_rate": 1.7763183985269883e-08, + "loss": 0.1682, + "step": 29648 + }, + { + "epoch": 1.6273326015367728, + "grad_norm": 2.617493152618408, + "learning_rate": 1.7566386573494854e-08, + "loss": 0.2243, + "step": 29650 + }, + { + "epoch": 1.627442371020856, + "grad_norm": 1.2433372735977173, + "learning_rate": 1.737068501387884e-08, + "loss": 0.321, + "step": 29652 + }, + { + "epoch": 1.6275521405049398, + "grad_norm": 1.5854899883270264, + "learning_rate": 1.717607931500942e-08, + "loss": 0.2476, + "step": 29654 + }, + { + "epoch": 1.627661909989023, + "grad_norm": 0.9742785692214966, + "learning_rate": 1.6982569485415877e-08, + "loss": 0.1734, + "step": 29656 + }, + { + "epoch": 1.6277716794731065, + "grad_norm": 1.4771876335144043, + "learning_rate": 1.67901555335942e-08, + "loss": 0.1911, + "step": 29658 + }, + { + "epoch": 1.62788144895719, + "grad_norm": 1.7023870944976807, + "learning_rate": 1.65988374679793e-08, + "loss": 0.2041, + "step": 29660 + }, + { + "epoch": 1.6279912184412733, + "grad_norm": 1.124038577079773, + "learning_rate": 1.640861529696447e-08, + "loss": 0.1607, + "step": 29662 + }, + { + "epoch": 1.6281009879253567, + "grad_norm": 1.232292652130127, + "learning_rate": 1.6219489028895806e-08, + "loss": 0.134, + "step": 29664 + }, + { + "epoch": 1.6282107574094402, + "grad_norm": 0.9595617055892944, + "learning_rate": 1.6031458672069455e-08, + "loss": 0.1259, + "step": 29666 + }, + { + "epoch": 1.6283205268935235, + "grad_norm": 1.2665282487869263, + "learning_rate": 1.584452423472882e-08, + "loss": 0.1694, + "step": 29668 + }, + { + "epoch": 1.6284302963776072, + "grad_norm": 0.8302558064460754, + "learning_rate": 1.5658685725078447e-08, + "loss": 0.1325, + "step": 29670 + }, + { + "epoch": 1.6285400658616904, + "grad_norm": 1.1405640840530396, + "learning_rate": 1.5473943151270153e-08, + "loss": 0.1802, + "step": 29672 + }, + { + "epoch": 1.628649835345774, + "grad_norm": 1.677743911743164, + "learning_rate": 1.5290296521403014e-08, + "loss": 0.2367, + "step": 29674 + }, + { + "epoch": 1.6287596048298574, + "grad_norm": 1.0244404077529907, + "learning_rate": 1.5107745843537246e-08, + "loss": 0.1459, + "step": 29676 + }, + { + "epoch": 1.6288693743139406, + "grad_norm": 0.8903450965881348, + "learning_rate": 1.492629112567756e-08, + "loss": 0.1783, + "step": 29678 + }, + { + "epoch": 1.628979143798024, + "grad_norm": 1.355812907218933, + "learning_rate": 1.474593237578703e-08, + "loss": 0.1675, + "step": 29680 + }, + { + "epoch": 1.6290889132821076, + "grad_norm": 1.0905948877334595, + "learning_rate": 1.456666960177322e-08, + "loss": 0.1794, + "step": 29682 + }, + { + "epoch": 1.6291986827661908, + "grad_norm": 1.7749900817871094, + "learning_rate": 1.4388502811499282e-08, + "loss": 0.2554, + "step": 29684 + }, + { + "epoch": 1.6293084522502745, + "grad_norm": 1.2212576866149902, + "learning_rate": 1.4211432012783965e-08, + "loss": 0.2274, + "step": 29686 + }, + { + "epoch": 1.6294182217343578, + "grad_norm": 1.1243259906768799, + "learning_rate": 1.4035457213393276e-08, + "loss": 0.1431, + "step": 29688 + }, + { + "epoch": 1.6295279912184413, + "grad_norm": 0.6286256313323975, + "learning_rate": 1.3860578421046044e-08, + "loss": 0.1061, + "step": 29690 + }, + { + "epoch": 1.6296377607025248, + "grad_norm": 1.1873599290847778, + "learning_rate": 1.3686795643411131e-08, + "loss": 0.2464, + "step": 29692 + }, + { + "epoch": 1.629747530186608, + "grad_norm": 0.8018577694892883, + "learning_rate": 1.3514108888112997e-08, + "loss": 0.1368, + "step": 29694 + }, + { + "epoch": 1.6298572996706917, + "grad_norm": 1.1057064533233643, + "learning_rate": 1.3342518162728912e-08, + "loss": 0.1717, + "step": 29696 + }, + { + "epoch": 1.629967069154775, + "grad_norm": 1.8595366477966309, + "learning_rate": 1.3172023474783412e-08, + "loss": 0.3204, + "step": 29698 + }, + { + "epoch": 1.6300768386388584, + "grad_norm": 1.2892128229141235, + "learning_rate": 1.3002624831756627e-08, + "loss": 0.1246, + "step": 29700 + }, + { + "epoch": 1.630186608122942, + "grad_norm": 1.949392557144165, + "learning_rate": 1.2834322241075947e-08, + "loss": 0.2229, + "step": 29702 + }, + { + "epoch": 1.6302963776070252, + "grad_norm": 1.2614085674285889, + "learning_rate": 1.2667115710127132e-08, + "loss": 0.2468, + "step": 29704 + }, + { + "epoch": 1.6304061470911086, + "grad_norm": 1.5061177015304565, + "learning_rate": 1.2501005246243202e-08, + "loss": 0.1629, + "step": 29706 + }, + { + "epoch": 1.6305159165751921, + "grad_norm": 0.7445420622825623, + "learning_rate": 1.233599085671e-08, + "loss": 0.1217, + "step": 29708 + }, + { + "epoch": 1.6306256860592754, + "grad_norm": 1.6593496799468994, + "learning_rate": 1.2172072548768954e-08, + "loss": 0.1971, + "step": 29710 + }, + { + "epoch": 1.630735455543359, + "grad_norm": 1.2722007036209106, + "learning_rate": 1.2009250329608757e-08, + "loss": 0.2319, + "step": 29712 + }, + { + "epoch": 1.6308452250274423, + "grad_norm": 1.0342371463775635, + "learning_rate": 1.1847524206368143e-08, + "loss": 0.1233, + "step": 29714 + }, + { + "epoch": 1.6309549945115258, + "grad_norm": 1.3148070573806763, + "learning_rate": 1.1686894186146991e-08, + "loss": 0.1364, + "step": 29716 + }, + { + "epoch": 1.6310647639956093, + "grad_norm": 1.2918777465820312, + "learning_rate": 1.1527360275986888e-08, + "loss": 0.2578, + "step": 29718 + }, + { + "epoch": 1.6311745334796925, + "grad_norm": 0.952185869216919, + "learning_rate": 1.136892248288779e-08, + "loss": 0.1748, + "step": 29720 + }, + { + "epoch": 1.631284302963776, + "grad_norm": 1.134081244468689, + "learning_rate": 1.1211580813799694e-08, + "loss": 0.1908, + "step": 29722 + }, + { + "epoch": 1.6313940724478595, + "grad_norm": 2.3327183723449707, + "learning_rate": 1.1055335275622636e-08, + "loss": 0.1786, + "step": 29724 + }, + { + "epoch": 1.6315038419319428, + "grad_norm": 1.1660603284835815, + "learning_rate": 1.0900185875215018e-08, + "loss": 0.2671, + "step": 29726 + }, + { + "epoch": 1.6316136114160265, + "grad_norm": 1.095460057258606, + "learning_rate": 1.0746132619374183e-08, + "loss": 0.2144, + "step": 29728 + }, + { + "epoch": 1.6317233809001097, + "grad_norm": 1.340430736541748, + "learning_rate": 1.0593175514866938e-08, + "loss": 0.1591, + "step": 29730 + }, + { + "epoch": 1.6318331503841932, + "grad_norm": 1.3406926393508911, + "learning_rate": 1.0441314568396254e-08, + "loss": 0.2245, + "step": 29732 + }, + { + "epoch": 1.6319429198682767, + "grad_norm": 0.975890040397644, + "learning_rate": 1.0290549786623472e-08, + "loss": 0.138, + "step": 29734 + }, + { + "epoch": 1.63205268935236, + "grad_norm": 0.9312527775764465, + "learning_rate": 1.0140881176165518e-08, + "loss": 0.2321, + "step": 29736 + }, + { + "epoch": 1.6321624588364436, + "grad_norm": 0.8352346420288086, + "learning_rate": 9.992308743586587e-09, + "loss": 0.2179, + "step": 29738 + }, + { + "epoch": 1.6322722283205269, + "grad_norm": 1.6623784303665161, + "learning_rate": 9.84483249540369e-09, + "loss": 0.1591, + "step": 29740 + }, + { + "epoch": 1.6323819978046104, + "grad_norm": 2.2222156524658203, + "learning_rate": 9.69845243808387e-09, + "loss": 0.1924, + "step": 29742 + }, + { + "epoch": 1.6324917672886938, + "grad_norm": 0.9877468943595886, + "learning_rate": 9.553168578049775e-09, + "loss": 0.1235, + "step": 29744 + }, + { + "epoch": 1.632601536772777, + "grad_norm": 1.4690104722976685, + "learning_rate": 9.408980921674081e-09, + "loss": 0.2143, + "step": 29746 + }, + { + "epoch": 1.6327113062568606, + "grad_norm": 1.3728915452957153, + "learning_rate": 9.265889475282286e-09, + "loss": 0.1372, + "step": 29748 + }, + { + "epoch": 1.632821075740944, + "grad_norm": 0.9471210241317749, + "learning_rate": 9.123894245149922e-09, + "loss": 0.1275, + "step": 29750 + }, + { + "epoch": 1.6329308452250273, + "grad_norm": 1.0719975233078003, + "learning_rate": 8.982995237505342e-09, + "loss": 0.1709, + "step": 29752 + }, + { + "epoch": 1.633040614709111, + "grad_norm": 1.8769299983978271, + "learning_rate": 8.843192458529714e-09, + "loss": 0.2537, + "step": 29754 + }, + { + "epoch": 1.6331503841931942, + "grad_norm": 1.0207494497299194, + "learning_rate": 8.704485914357019e-09, + "loss": 0.2135, + "step": 29756 + }, + { + "epoch": 1.6332601536772777, + "grad_norm": 0.5711489319801331, + "learning_rate": 8.566875611068504e-09, + "loss": 0.1032, + "step": 29758 + }, + { + "epoch": 1.6333699231613612, + "grad_norm": 1.0773746967315674, + "learning_rate": 8.430361554701005e-09, + "loss": 0.1939, + "step": 29760 + }, + { + "epoch": 1.6334796926454445, + "grad_norm": 1.0915566682815552, + "learning_rate": 8.294943751246954e-09, + "loss": 0.2115, + "step": 29762 + }, + { + "epoch": 1.633589462129528, + "grad_norm": 0.7660297751426697, + "learning_rate": 8.16062220664049e-09, + "loss": 0.1027, + "step": 29764 + }, + { + "epoch": 1.6336992316136114, + "grad_norm": 0.900168240070343, + "learning_rate": 8.027396926776897e-09, + "loss": 0.1458, + "step": 29766 + }, + { + "epoch": 1.633809001097695, + "grad_norm": 1.3411484956741333, + "learning_rate": 7.895267917501504e-09, + "loss": 0.2338, + "step": 29768 + }, + { + "epoch": 1.6339187705817784, + "grad_norm": 0.8563511371612549, + "learning_rate": 7.764235184604119e-09, + "loss": 0.2517, + "step": 29770 + }, + { + "epoch": 1.6340285400658616, + "grad_norm": 1.7050020694732666, + "learning_rate": 7.63429873384125e-09, + "loss": 0.2234, + "step": 29772 + }, + { + "epoch": 1.634138309549945, + "grad_norm": 1.0423216819763184, + "learning_rate": 7.505458570905565e-09, + "loss": 0.1822, + "step": 29774 + }, + { + "epoch": 1.6342480790340286, + "grad_norm": 0.8667453527450562, + "learning_rate": 7.377714701450877e-09, + "loss": 0.1234, + "step": 29776 + }, + { + "epoch": 1.6343578485181118, + "grad_norm": 1.3354032039642334, + "learning_rate": 7.25106713107826e-09, + "loss": 0.2707, + "step": 29778 + }, + { + "epoch": 1.6344676180021955, + "grad_norm": 1.3049856424331665, + "learning_rate": 7.12551586534993e-09, + "loss": 0.2598, + "step": 29780 + }, + { + "epoch": 1.6345773874862788, + "grad_norm": 1.6031303405761719, + "learning_rate": 7.001060909764268e-09, + "loss": 0.247, + "step": 29782 + }, + { + "epoch": 1.6346871569703623, + "grad_norm": 1.2382930517196655, + "learning_rate": 6.877702269786346e-09, + "loss": 0.1326, + "step": 29784 + }, + { + "epoch": 1.6347969264544457, + "grad_norm": 1.7664870023727417, + "learning_rate": 6.755439950828501e-09, + "loss": 0.2375, + "step": 29786 + }, + { + "epoch": 1.634906695938529, + "grad_norm": 1.166272521018982, + "learning_rate": 6.634273958247561e-09, + "loss": 0.2331, + "step": 29788 + }, + { + "epoch": 1.6350164654226125, + "grad_norm": 1.575636386871338, + "learning_rate": 6.514204297364268e-09, + "loss": 0.241, + "step": 29790 + }, + { + "epoch": 1.635126234906696, + "grad_norm": 1.3869775533676147, + "learning_rate": 6.395230973443856e-09, + "loss": 0.1677, + "step": 29792 + }, + { + "epoch": 1.6352360043907792, + "grad_norm": 1.5483490228652954, + "learning_rate": 6.277353991701596e-09, + "loss": 0.1562, + "step": 29794 + }, + { + "epoch": 1.635345773874863, + "grad_norm": 1.3384476900100708, + "learning_rate": 6.160573357313904e-09, + "loss": 0.1926, + "step": 29796 + }, + { + "epoch": 1.6354555433589462, + "grad_norm": 1.2540291547775269, + "learning_rate": 6.044889075398907e-09, + "loss": 0.208, + "step": 29798 + }, + { + "epoch": 1.6355653128430296, + "grad_norm": 1.3624006509780884, + "learning_rate": 5.930301151033102e-09, + "loss": 0.1983, + "step": 29800 + }, + { + "epoch": 1.6356750823271131, + "grad_norm": 1.4536970853805542, + "learning_rate": 5.816809589243022e-09, + "loss": 0.2381, + "step": 29802 + }, + { + "epoch": 1.6357848518111964, + "grad_norm": 1.1684616804122925, + "learning_rate": 5.704414395005242e-09, + "loss": 0.2201, + "step": 29804 + }, + { + "epoch": 1.63589462129528, + "grad_norm": 1.0546493530273438, + "learning_rate": 5.593115573251928e-09, + "loss": 0.247, + "step": 29806 + }, + { + "epoch": 1.6360043907793633, + "grad_norm": 1.2121927738189697, + "learning_rate": 5.48291312886251e-09, + "loss": 0.2769, + "step": 29808 + }, + { + "epoch": 1.6361141602634468, + "grad_norm": 1.1459075212478638, + "learning_rate": 5.373807066674785e-09, + "loss": 0.2384, + "step": 29810 + }, + { + "epoch": 1.6362239297475303, + "grad_norm": 1.2533907890319824, + "learning_rate": 5.265797391471039e-09, + "loss": 0.1678, + "step": 29812 + }, + { + "epoch": 1.6363336992316135, + "grad_norm": 1.499334692955017, + "learning_rate": 5.158884107991923e-09, + "loss": 0.2112, + "step": 29814 + }, + { + "epoch": 1.636443468715697, + "grad_norm": 1.2050641775131226, + "learning_rate": 5.053067220925356e-09, + "loss": 0.2482, + "step": 29816 + }, + { + "epoch": 1.6365532381997805, + "grad_norm": 1.4410178661346436, + "learning_rate": 4.948346734914844e-09, + "loss": 0.2021, + "step": 29818 + }, + { + "epoch": 1.6366630076838637, + "grad_norm": 1.145318627357483, + "learning_rate": 4.844722654553935e-09, + "loss": 0.1244, + "step": 29820 + }, + { + "epoch": 1.6367727771679474, + "grad_norm": 1.6579456329345703, + "learning_rate": 4.742194984383441e-09, + "loss": 0.2663, + "step": 29822 + }, + { + "epoch": 1.6368825466520307, + "grad_norm": 0.694979727268219, + "learning_rate": 4.640763728908093e-09, + "loss": 0.2101, + "step": 29824 + }, + { + "epoch": 1.6369923161361142, + "grad_norm": 1.2544704675674438, + "learning_rate": 4.540428892568782e-09, + "loss": 0.1711, + "step": 29826 + }, + { + "epoch": 1.6371020856201977, + "grad_norm": 1.411633849143982, + "learning_rate": 4.441190479775869e-09, + "loss": 0.1314, + "step": 29828 + }, + { + "epoch": 1.637211855104281, + "grad_norm": 0.8766047358512878, + "learning_rate": 4.3430484948758785e-09, + "loss": 0.1158, + "step": 29830 + }, + { + "epoch": 1.6373216245883644, + "grad_norm": 1.1825512647628784, + "learning_rate": 4.246002942173699e-09, + "loss": 0.2059, + "step": 29832 + }, + { + "epoch": 1.6374313940724479, + "grad_norm": 0.8201817870140076, + "learning_rate": 4.1500538259298115e-09, + "loss": 0.1673, + "step": 29834 + }, + { + "epoch": 1.6375411635565311, + "grad_norm": 1.4353708028793335, + "learning_rate": 4.055201150351961e-09, + "loss": 0.168, + "step": 29836 + }, + { + "epoch": 1.6376509330406148, + "grad_norm": 1.149905800819397, + "learning_rate": 3.96144491960071e-09, + "loss": 0.1842, + "step": 29838 + }, + { + "epoch": 1.637760702524698, + "grad_norm": 0.795581042766571, + "learning_rate": 3.868785137786657e-09, + "loss": 0.183, + "step": 29840 + }, + { + "epoch": 1.6378704720087816, + "grad_norm": 1.1209733486175537, + "learning_rate": 3.777221808975995e-09, + "loss": 0.1727, + "step": 29842 + }, + { + "epoch": 1.637980241492865, + "grad_norm": 0.590652585029602, + "learning_rate": 3.686754937184955e-09, + "loss": 0.163, + "step": 29844 + }, + { + "epoch": 1.6380900109769483, + "grad_norm": 1.084768295288086, + "learning_rate": 3.5973845263825857e-09, + "loss": 0.2398, + "step": 29846 + }, + { + "epoch": 1.638199780461032, + "grad_norm": 2.172666549682617, + "learning_rate": 3.5091105804907487e-09, + "loss": 0.248, + "step": 29848 + }, + { + "epoch": 1.6383095499451152, + "grad_norm": 0.9926042556762695, + "learning_rate": 3.421933103375796e-09, + "loss": 0.3632, + "step": 29850 + }, + { + "epoch": 1.6384193194291987, + "grad_norm": 1.4242222309112549, + "learning_rate": 3.3358520988679976e-09, + "loss": 0.1531, + "step": 29852 + }, + { + "epoch": 1.6385290889132822, + "grad_norm": 0.7837753891944885, + "learning_rate": 3.2508675707393354e-09, + "loss": 0.1336, + "step": 29854 + }, + { + "epoch": 1.6386388583973654, + "grad_norm": 1.012492060661316, + "learning_rate": 3.166979522717384e-09, + "loss": 0.1658, + "step": 29856 + }, + { + "epoch": 1.638748627881449, + "grad_norm": 1.3461847305297852, + "learning_rate": 3.0841879584853073e-09, + "loss": 0.25, + "step": 29858 + }, + { + "epoch": 1.6388583973655324, + "grad_norm": 2.1498420238494873, + "learning_rate": 3.0024928816707597e-09, + "loss": 0.3322, + "step": 29860 + }, + { + "epoch": 1.6389681668496157, + "grad_norm": 1.3794604539871216, + "learning_rate": 2.921894295862537e-09, + "loss": 0.1816, + "step": 29862 + }, + { + "epoch": 1.6390779363336994, + "grad_norm": 1.061031460762024, + "learning_rate": 2.842392204591149e-09, + "loss": 0.1995, + "step": 29864 + }, + { + "epoch": 1.6391877058177826, + "grad_norm": 1.483151912689209, + "learning_rate": 2.7639866113454706e-09, + "loss": 0.2359, + "step": 29866 + }, + { + "epoch": 1.639297475301866, + "grad_norm": 0.9464792013168335, + "learning_rate": 2.6866775195644176e-09, + "loss": 0.1624, + "step": 29868 + }, + { + "epoch": 1.6394072447859496, + "grad_norm": 1.4337605237960815, + "learning_rate": 2.610464932642498e-09, + "loss": 0.2129, + "step": 29870 + }, + { + "epoch": 1.6395170142700328, + "grad_norm": 1.0193934440612793, + "learning_rate": 2.5353488539187065e-09, + "loss": 0.1324, + "step": 29872 + }, + { + "epoch": 1.6396267837541163, + "grad_norm": 1.6457843780517578, + "learning_rate": 2.4613292866876303e-09, + "loss": 0.2467, + "step": 29874 + }, + { + "epoch": 1.6397365532381998, + "grad_norm": 1.2436647415161133, + "learning_rate": 2.3884062341994475e-09, + "loss": 0.2583, + "step": 29876 + }, + { + "epoch": 1.6398463227222833, + "grad_norm": 1.4373371601104736, + "learning_rate": 2.3165796996516e-09, + "loss": 0.2684, + "step": 29878 + }, + { + "epoch": 1.6399560922063667, + "grad_norm": 0.8073328733444214, + "learning_rate": 2.24584968619157e-09, + "loss": 0.1989, + "step": 29880 + }, + { + "epoch": 1.64006586169045, + "grad_norm": 0.9186545610427856, + "learning_rate": 2.1762161969279825e-09, + "loss": 0.2564, + "step": 29882 + }, + { + "epoch": 1.6401756311745335, + "grad_norm": 1.0944249629974365, + "learning_rate": 2.107679234911175e-09, + "loss": 0.2504, + "step": 29884 + }, + { + "epoch": 1.640285400658617, + "grad_norm": 0.9593408107757568, + "learning_rate": 2.0402388031470765e-09, + "loss": 0.2662, + "step": 29886 + }, + { + "epoch": 1.6403951701427002, + "grad_norm": 0.9865410327911377, + "learning_rate": 1.973894904597207e-09, + "loss": 0.1418, + "step": 29888 + }, + { + "epoch": 1.640504939626784, + "grad_norm": 0.9382830858230591, + "learning_rate": 1.908647542167574e-09, + "loss": 0.1253, + "step": 29890 + }, + { + "epoch": 1.6406147091108672, + "grad_norm": 0.7992318272590637, + "learning_rate": 1.8444967187253305e-09, + "loss": 0.1397, + "step": 29892 + }, + { + "epoch": 1.6407244785949506, + "grad_norm": 1.147316575050354, + "learning_rate": 1.7814424370793393e-09, + "loss": 0.1853, + "step": 29894 + }, + { + "epoch": 1.640834248079034, + "grad_norm": 0.9724656939506531, + "learning_rate": 1.7194846999996073e-09, + "loss": 0.1999, + "step": 29896 + }, + { + "epoch": 1.6409440175631174, + "grad_norm": 0.7066361308097839, + "learning_rate": 1.6586235102006297e-09, + "loss": 0.1709, + "step": 29898 + }, + { + "epoch": 1.6410537870472008, + "grad_norm": 1.4880198240280151, + "learning_rate": 1.5988588703552686e-09, + "loss": 0.179, + "step": 29900 + }, + { + "epoch": 1.6411635565312843, + "grad_norm": 1.3592129945755005, + "learning_rate": 1.540190783080875e-09, + "loss": 0.1699, + "step": 29902 + }, + { + "epoch": 1.6412733260153676, + "grad_norm": 1.0683602094650269, + "learning_rate": 1.4826192509559412e-09, + "loss": 0.2312, + "step": 29904 + }, + { + "epoch": 1.6413830954994513, + "grad_norm": 0.9584289193153381, + "learning_rate": 1.4261442765006739e-09, + "loss": 0.2117, + "step": 29906 + }, + { + "epoch": 1.6414928649835345, + "grad_norm": 1.2581422328948975, + "learning_rate": 1.3707658621964215e-09, + "loss": 0.1878, + "step": 29908 + }, + { + "epoch": 1.641602634467618, + "grad_norm": 0.8349116444587708, + "learning_rate": 1.3164840104717968e-09, + "loss": 0.1506, + "step": 29910 + }, + { + "epoch": 1.6417124039517015, + "grad_norm": 1.4347881078720093, + "learning_rate": 1.2632987237054528e-09, + "loss": 0.1499, + "step": 29912 + }, + { + "epoch": 1.6418221734357847, + "grad_norm": 0.9160740375518799, + "learning_rate": 1.2112100042316333e-09, + "loss": 0.1412, + "step": 29914 + }, + { + "epoch": 1.6419319429198684, + "grad_norm": 0.7936190366744995, + "learning_rate": 1.1602178543373975e-09, + "loss": 0.1705, + "step": 29916 + }, + { + "epoch": 1.6420417124039517, + "grad_norm": 1.1193958520889282, + "learning_rate": 1.1103222762542941e-09, + "loss": 0.2489, + "step": 29918 + }, + { + "epoch": 1.6421514818880352, + "grad_norm": 1.0619480609893799, + "learning_rate": 1.061523272177789e-09, + "loss": 0.1978, + "step": 29920 + }, + { + "epoch": 1.6422612513721186, + "grad_norm": 1.2368193864822388, + "learning_rate": 1.0138208442422859e-09, + "loss": 0.1806, + "step": 29922 + }, + { + "epoch": 1.642371020856202, + "grad_norm": 1.1387369632720947, + "learning_rate": 9.672149945433306e-10, + "loss": 0.2454, + "step": 29924 + }, + { + "epoch": 1.6424807903402854, + "grad_norm": 1.257914423942566, + "learning_rate": 9.217057251265093e-10, + "loss": 0.2047, + "step": 29926 + }, + { + "epoch": 1.6425905598243689, + "grad_norm": 1.1920608282089233, + "learning_rate": 8.772930379846722e-10, + "loss": 0.183, + "step": 29928 + }, + { + "epoch": 1.6427003293084521, + "grad_norm": 1.362051248550415, + "learning_rate": 8.339769350662607e-10, + "loss": 0.2745, + "step": 29930 + }, + { + "epoch": 1.6428100987925358, + "grad_norm": 0.7170136570930481, + "learning_rate": 7.917574182753074e-10, + "loss": 0.1092, + "step": 29932 + }, + { + "epoch": 1.642919868276619, + "grad_norm": 1.0778502225875854, + "learning_rate": 7.506344894603335e-10, + "loss": 0.104, + "step": 29934 + }, + { + "epoch": 1.6430296377607025, + "grad_norm": 1.487939715385437, + "learning_rate": 7.106081504254514e-10, + "loss": 0.2903, + "step": 29936 + }, + { + "epoch": 1.643139407244786, + "grad_norm": 0.7612434029579163, + "learning_rate": 6.716784029303647e-10, + "loss": 0.1323, + "step": 29938 + }, + { + "epoch": 1.6432491767288693, + "grad_norm": 1.3607312440872192, + "learning_rate": 6.338452486764901e-10, + "loss": 0.2053, + "step": 29940 + }, + { + "epoch": 1.6433589462129528, + "grad_norm": 1.688311219215393, + "learning_rate": 5.971086893263867e-10, + "loss": 0.1934, + "step": 29942 + }, + { + "epoch": 1.6434687156970362, + "grad_norm": 0.8936418890953064, + "learning_rate": 5.614687264898777e-10, + "loss": 0.2376, + "step": 29944 + }, + { + "epoch": 1.6435784851811195, + "grad_norm": 1.1485540866851807, + "learning_rate": 5.269253617351533e-10, + "loss": 0.1753, + "step": 29946 + }, + { + "epoch": 1.6436882546652032, + "grad_norm": 1.1621415615081787, + "learning_rate": 4.934785965721167e-10, + "loss": 0.1981, + "step": 29948 + }, + { + "epoch": 1.6437980241492864, + "grad_norm": 1.6899462938308716, + "learning_rate": 4.6112843246903794e-10, + "loss": 0.2556, + "step": 29950 + }, + { + "epoch": 1.64390779363337, + "grad_norm": 2.3713884353637695, + "learning_rate": 4.298748708470024e-10, + "loss": 0.1066, + "step": 29952 + }, + { + "epoch": 1.6440175631174534, + "grad_norm": 1.1065007448196411, + "learning_rate": 3.997179130771356e-10, + "loss": 0.1771, + "step": 29954 + }, + { + "epoch": 1.6441273326015367, + "grad_norm": 0.8111040592193604, + "learning_rate": 3.7065756048060287e-10, + "loss": 0.1368, + "step": 29956 + }, + { + "epoch": 1.6442371020856204, + "grad_norm": 0.7422636151313782, + "learning_rate": 3.426938143313851e-10, + "loss": 0.1861, + "step": 29958 + }, + { + "epoch": 1.6443468715697036, + "grad_norm": 0.9427836537361145, + "learning_rate": 3.158266758562789e-10, + "loss": 0.1064, + "step": 29960 + }, + { + "epoch": 1.644456641053787, + "grad_norm": 1.5767019987106323, + "learning_rate": 2.900561462348961e-10, + "loss": 0.3102, + "step": 29962 + }, + { + "epoch": 1.6445664105378706, + "grad_norm": 1.30277681350708, + "learning_rate": 2.6538222659966417e-10, + "loss": 0.2006, + "step": 29964 + }, + { + "epoch": 1.6446761800219538, + "grad_norm": 1.076054334640503, + "learning_rate": 2.418049180274995e-10, + "loss": 0.2842, + "step": 29966 + }, + { + "epoch": 1.6447859495060373, + "grad_norm": 1.2014013528823853, + "learning_rate": 2.1932422155923616e-10, + "loss": 0.1724, + "step": 29968 + }, + { + "epoch": 1.6448957189901208, + "grad_norm": 1.0241725444793701, + "learning_rate": 1.9794013817464596e-10, + "loss": 0.1586, + "step": 29970 + }, + { + "epoch": 1.645005488474204, + "grad_norm": 1.0352197885513306, + "learning_rate": 1.7765266881741848e-10, + "loss": 0.1553, + "step": 29972 + }, + { + "epoch": 1.6451152579582877, + "grad_norm": 1.0628383159637451, + "learning_rate": 1.584618143729566e-10, + "loss": 0.127, + "step": 29974 + }, + { + "epoch": 1.645225027442371, + "grad_norm": 1.1831018924713135, + "learning_rate": 1.4036757568502978e-10, + "loss": 0.1231, + "step": 29976 + }, + { + "epoch": 1.6453347969264545, + "grad_norm": 1.2989540100097656, + "learning_rate": 1.2336995354467197e-10, + "loss": 0.1813, + "step": 29978 + }, + { + "epoch": 1.645444566410538, + "grad_norm": 1.3935492038726807, + "learning_rate": 1.0746894870405922e-10, + "loss": 0.3583, + "step": 29980 + }, + { + "epoch": 1.6455543358946212, + "grad_norm": 1.0777441263198853, + "learning_rate": 9.266456185430539e-11, + "loss": 0.1378, + "step": 29982 + }, + { + "epoch": 1.6456641053787047, + "grad_norm": 2.083275318145752, + "learning_rate": 7.89567936476665e-11, + "loss": 0.2231, + "step": 29984 + }, + { + "epoch": 1.6457738748627881, + "grad_norm": 1.3656355142593384, + "learning_rate": 6.6345644683663e-11, + "loss": 0.3001, + "step": 29986 + }, + { + "epoch": 1.6458836443468716, + "grad_norm": 1.6670531034469604, + "learning_rate": 5.483111551740638e-11, + "loss": 0.2496, + "step": 29988 + }, + { + "epoch": 1.645993413830955, + "grad_norm": 1.6403172016143799, + "learning_rate": 4.441320665404813e-11, + "loss": 0.2526, + "step": 29990 + }, + { + "epoch": 1.6461031833150384, + "grad_norm": 1.0918723344802856, + "learning_rate": 3.509191854877969e-11, + "loss": 0.1357, + "step": 29992 + }, + { + "epoch": 1.6462129527991218, + "grad_norm": 1.1011109352111816, + "learning_rate": 2.686725161238357e-11, + "loss": 0.1039, + "step": 29994 + }, + { + "epoch": 1.6463227222832053, + "grad_norm": 0.901684582233429, + "learning_rate": 1.9739206205682258e-11, + "loss": 0.2243, + "step": 29996 + }, + { + "epoch": 1.6464324917672886, + "grad_norm": 1.2222239971160889, + "learning_rate": 1.3707782636762645e-11, + "loss": 0.2901, + "step": 29998 + }, + { + "epoch": 1.6465422612513723, + "grad_norm": 0.9421258568763733, + "learning_rate": 8.772981177629368e-12, + "loss": 0.135, + "step": 30000 + } + ], + "logging_steps": 2, + "max_steps": 30000, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.7376727527062405e+20, + "train_batch_size": 32, + "trial_name": null, + "trial_params": null +}