{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.999855103962907, "eval_steps": 500, "global_step": 5176, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001159168296747084, "grad_norm": 2.7580204010009766, "learning_rate": 4.999425287356322e-05, "loss": 0.9725, "step": 1 }, { "epoch": 0.002318336593494168, "grad_norm": 1.794647216796875, "learning_rate": 4.998850574712644e-05, "loss": 0.8067, "step": 2 }, { "epoch": 0.003477504890241252, "grad_norm": 1.9236769676208496, "learning_rate": 4.9982758620689654e-05, "loss": 0.7883, "step": 3 }, { "epoch": 0.004636673186988336, "grad_norm": 1.9243451356887817, "learning_rate": 4.9977011494252876e-05, "loss": 0.694, "step": 4 }, { "epoch": 0.00579584148373542, "grad_norm": 2.325658082962036, "learning_rate": 4.99712643678161e-05, "loss": 0.658, "step": 5 }, { "epoch": 0.006955009780482504, "grad_norm": 1.9371262788772583, "learning_rate": 4.996551724137931e-05, "loss": 0.6053, "step": 6 }, { "epoch": 0.008114178077229587, "grad_norm": 2.50772762298584, "learning_rate": 4.9959770114942534e-05, "loss": 0.5673, "step": 7 }, { "epoch": 0.009273346373976673, "grad_norm": 1.7069337368011475, "learning_rate": 4.995402298850575e-05, "loss": 0.5236, "step": 8 }, { "epoch": 0.010432514670723756, "grad_norm": 2.124687433242798, "learning_rate": 4.9948275862068964e-05, "loss": 0.4998, "step": 9 }, { "epoch": 0.01159168296747084, "grad_norm": 0.8905035257339478, "learning_rate": 4.9942528735632185e-05, "loss": 0.4632, "step": 10 }, { "epoch": 0.012750851264217924, "grad_norm": 4.519129276275635, "learning_rate": 4.993678160919541e-05, "loss": 0.3914, "step": 11 }, { "epoch": 0.013910019560965008, "grad_norm": 0.8016653060913086, "learning_rate": 4.993103448275862e-05, "loss": 0.3829, "step": 12 }, { "epoch": 0.015069187857712092, "grad_norm": 0.8521615862846375, "learning_rate": 4.9925287356321844e-05, "loss": 0.3719, "step": 13 }, { "epoch": 0.016228356154459174, "grad_norm": 0.664720356464386, "learning_rate": 4.991954022988506e-05, "loss": 0.3444, "step": 14 }, { "epoch": 0.017387524451206258, "grad_norm": 1.236467719078064, "learning_rate": 4.991379310344828e-05, "loss": 0.3283, "step": 15 }, { "epoch": 0.018546692747953345, "grad_norm": 1.1017507314682007, "learning_rate": 4.9908045977011495e-05, "loss": 0.3348, "step": 16 }, { "epoch": 0.01970586104470043, "grad_norm": 0.70470130443573, "learning_rate": 4.990229885057472e-05, "loss": 0.318, "step": 17 }, { "epoch": 0.020865029341447513, "grad_norm": 0.6223506927490234, "learning_rate": 4.989655172413794e-05, "loss": 0.3032, "step": 18 }, { "epoch": 0.022024197638194597, "grad_norm": 0.7839768528938293, "learning_rate": 4.989080459770115e-05, "loss": 0.2854, "step": 19 }, { "epoch": 0.02318336593494168, "grad_norm": 1.5129585266113281, "learning_rate": 4.988505747126437e-05, "loss": 0.2912, "step": 20 }, { "epoch": 0.024342534231688764, "grad_norm": 0.6539391279220581, "learning_rate": 4.987931034482759e-05, "loss": 0.27, "step": 21 }, { "epoch": 0.025501702528435848, "grad_norm": 0.693023145198822, "learning_rate": 4.9873563218390805e-05, "loss": 0.2558, "step": 22 }, { "epoch": 0.026660870825182932, "grad_norm": 0.5335514545440674, "learning_rate": 4.986781609195402e-05, "loss": 0.2276, "step": 23 }, { "epoch": 0.027820039121930016, "grad_norm": 0.6515309810638428, "learning_rate": 4.986206896551724e-05, "loss": 0.2594, "step": 24 }, { "epoch": 0.0289792074186771, "grad_norm": 0.6215941905975342, "learning_rate": 4.985632183908046e-05, "loss": 0.255, "step": 25 }, { "epoch": 0.030138375715424184, "grad_norm": 0.6825059056282043, "learning_rate": 4.9850574712643685e-05, "loss": 0.2719, "step": 26 }, { "epoch": 0.03129754401217127, "grad_norm": 0.5802633166313171, "learning_rate": 4.98448275862069e-05, "loss": 0.2256, "step": 27 }, { "epoch": 0.03245671230891835, "grad_norm": 0.5147871971130371, "learning_rate": 4.9839080459770115e-05, "loss": 0.214, "step": 28 }, { "epoch": 0.033615880605665435, "grad_norm": 0.6146939396858215, "learning_rate": 4.9833333333333336e-05, "loss": 0.2401, "step": 29 }, { "epoch": 0.034775048902412516, "grad_norm": 0.62993323802948, "learning_rate": 4.982758620689655e-05, "loss": 0.246, "step": 30 }, { "epoch": 0.0359342171991596, "grad_norm": 0.6217732429504395, "learning_rate": 4.982183908045977e-05, "loss": 0.2162, "step": 31 }, { "epoch": 0.03709338549590669, "grad_norm": 0.5851067304611206, "learning_rate": 4.9816091954022994e-05, "loss": 0.2226, "step": 32 }, { "epoch": 0.03825255379265377, "grad_norm": 0.6002720594406128, "learning_rate": 4.981034482758621e-05, "loss": 0.226, "step": 33 }, { "epoch": 0.03941172208940086, "grad_norm": 0.6244901418685913, "learning_rate": 4.980459770114943e-05, "loss": 0.2536, "step": 34 }, { "epoch": 0.04057089038614794, "grad_norm": 0.6513347625732422, "learning_rate": 4.9798850574712646e-05, "loss": 0.2388, "step": 35 }, { "epoch": 0.041730058682895026, "grad_norm": 0.45215773582458496, "learning_rate": 4.979310344827586e-05, "loss": 0.197, "step": 36 }, { "epoch": 0.042889226979642106, "grad_norm": 0.45586729049682617, "learning_rate": 4.978735632183908e-05, "loss": 0.1992, "step": 37 }, { "epoch": 0.04404839527638919, "grad_norm": 0.49368321895599365, "learning_rate": 4.9781609195402304e-05, "loss": 0.2319, "step": 38 }, { "epoch": 0.045207563573136274, "grad_norm": 0.4354698956012726, "learning_rate": 4.977586206896552e-05, "loss": 0.2072, "step": 39 }, { "epoch": 0.04636673186988336, "grad_norm": 0.40046998858451843, "learning_rate": 4.977011494252874e-05, "loss": 0.183, "step": 40 }, { "epoch": 0.04752590016663044, "grad_norm": 0.41561418771743774, "learning_rate": 4.9764367816091956e-05, "loss": 0.196, "step": 41 }, { "epoch": 0.04868506846337753, "grad_norm": 0.4249739646911621, "learning_rate": 4.975862068965517e-05, "loss": 0.1903, "step": 42 }, { "epoch": 0.04984423676012461, "grad_norm": 0.5787274837493896, "learning_rate": 4.975287356321839e-05, "loss": 0.2098, "step": 43 }, { "epoch": 0.051003405056871697, "grad_norm": 0.4138984978199005, "learning_rate": 4.974712643678161e-05, "loss": 0.1942, "step": 44 }, { "epoch": 0.05216257335361878, "grad_norm": 0.4079653322696686, "learning_rate": 4.9741379310344836e-05, "loss": 0.2141, "step": 45 }, { "epoch": 0.053321741650365864, "grad_norm": 0.4127311408519745, "learning_rate": 4.973563218390805e-05, "loss": 0.1795, "step": 46 }, { "epoch": 0.054480909947112945, "grad_norm": 0.48196831345558167, "learning_rate": 4.9729885057471265e-05, "loss": 0.2008, "step": 47 }, { "epoch": 0.05564007824386003, "grad_norm": 0.43922609090805054, "learning_rate": 4.972413793103449e-05, "loss": 0.1897, "step": 48 }, { "epoch": 0.05679924654060711, "grad_norm": 0.455390065908432, "learning_rate": 4.97183908045977e-05, "loss": 0.1964, "step": 49 }, { "epoch": 0.0579584148373542, "grad_norm": 0.4199633300304413, "learning_rate": 4.971264367816092e-05, "loss": 0.1984, "step": 50 }, { "epoch": 0.05911758313410128, "grad_norm": 0.4313494861125946, "learning_rate": 4.970689655172414e-05, "loss": 0.2044, "step": 51 }, { "epoch": 0.06027675143084837, "grad_norm": 0.431208997964859, "learning_rate": 4.970114942528736e-05, "loss": 0.2028, "step": 52 }, { "epoch": 0.06143591972759545, "grad_norm": 0.4196656048297882, "learning_rate": 4.969540229885058e-05, "loss": 0.1892, "step": 53 }, { "epoch": 0.06259508802434254, "grad_norm": 0.43570151925086975, "learning_rate": 4.96896551724138e-05, "loss": 0.2068, "step": 54 }, { "epoch": 0.06375425632108962, "grad_norm": 0.3962918519973755, "learning_rate": 4.968390804597701e-05, "loss": 0.1936, "step": 55 }, { "epoch": 0.0649134246178367, "grad_norm": 0.6284250617027283, "learning_rate": 4.9678160919540233e-05, "loss": 0.1999, "step": 56 }, { "epoch": 0.06607259291458378, "grad_norm": 0.45828792452812195, "learning_rate": 4.967241379310345e-05, "loss": 0.1757, "step": 57 }, { "epoch": 0.06723176121133087, "grad_norm": 0.43321043252944946, "learning_rate": 4.966666666666667e-05, "loss": 0.1908, "step": 58 }, { "epoch": 0.06839092950807796, "grad_norm": 0.38716545701026917, "learning_rate": 4.966091954022989e-05, "loss": 0.1838, "step": 59 }, { "epoch": 0.06955009780482503, "grad_norm": 0.5458523035049438, "learning_rate": 4.9655172413793107e-05, "loss": 0.1996, "step": 60 }, { "epoch": 0.07070926610157212, "grad_norm": 0.5392327904701233, "learning_rate": 4.964942528735632e-05, "loss": 0.1895, "step": 61 }, { "epoch": 0.0718684343983192, "grad_norm": 0.4704841077327728, "learning_rate": 4.964367816091954e-05, "loss": 0.1984, "step": 62 }, { "epoch": 0.0730276026950663, "grad_norm": 0.3919082283973694, "learning_rate": 4.963793103448276e-05, "loss": 0.1642, "step": 63 }, { "epoch": 0.07418677099181338, "grad_norm": 0.46172910928726196, "learning_rate": 4.963218390804598e-05, "loss": 0.1665, "step": 64 }, { "epoch": 0.07534593928856045, "grad_norm": 0.42953500151634216, "learning_rate": 4.9626436781609195e-05, "loss": 0.1807, "step": 65 }, { "epoch": 0.07650510758530754, "grad_norm": 0.4844302535057068, "learning_rate": 4.9620689655172416e-05, "loss": 0.1978, "step": 66 }, { "epoch": 0.07766427588205463, "grad_norm": 0.5020731091499329, "learning_rate": 4.961494252873564e-05, "loss": 0.1943, "step": 67 }, { "epoch": 0.07882344417880172, "grad_norm": 0.38344666361808777, "learning_rate": 4.960919540229885e-05, "loss": 0.1893, "step": 68 }, { "epoch": 0.07998261247554879, "grad_norm": 0.46443063020706177, "learning_rate": 4.960344827586207e-05, "loss": 0.1896, "step": 69 }, { "epoch": 0.08114178077229588, "grad_norm": 0.49002230167388916, "learning_rate": 4.959770114942529e-05, "loss": 0.1754, "step": 70 }, { "epoch": 0.08230094906904296, "grad_norm": 0.40396377444267273, "learning_rate": 4.9591954022988504e-05, "loss": 0.1712, "step": 71 }, { "epoch": 0.08346011736579005, "grad_norm": 0.44212767481803894, "learning_rate": 4.9586206896551726e-05, "loss": 0.1745, "step": 72 }, { "epoch": 0.08461928566253712, "grad_norm": 0.40350252389907837, "learning_rate": 4.958045977011495e-05, "loss": 0.1664, "step": 73 }, { "epoch": 0.08577845395928421, "grad_norm": 0.45109373331069946, "learning_rate": 4.957471264367816e-05, "loss": 0.1794, "step": 74 }, { "epoch": 0.0869376222560313, "grad_norm": 0.4555060863494873, "learning_rate": 4.9568965517241384e-05, "loss": 0.1976, "step": 75 }, { "epoch": 0.08809679055277839, "grad_norm": 0.39049214124679565, "learning_rate": 4.95632183908046e-05, "loss": 0.1642, "step": 76 }, { "epoch": 0.08925595884952546, "grad_norm": 0.4325527250766754, "learning_rate": 4.9557471264367814e-05, "loss": 0.1735, "step": 77 }, { "epoch": 0.09041512714627255, "grad_norm": 0.38487985730171204, "learning_rate": 4.9551724137931036e-05, "loss": 0.1727, "step": 78 }, { "epoch": 0.09157429544301963, "grad_norm": 0.38345035910606384, "learning_rate": 4.954597701149426e-05, "loss": 0.153, "step": 79 }, { "epoch": 0.09273346373976672, "grad_norm": 0.504005491733551, "learning_rate": 4.954022988505747e-05, "loss": 0.1852, "step": 80 }, { "epoch": 0.0938926320365138, "grad_norm": 0.40111425518989563, "learning_rate": 4.9534482758620694e-05, "loss": 0.1834, "step": 81 }, { "epoch": 0.09505180033326088, "grad_norm": 0.3611973226070404, "learning_rate": 4.952873563218391e-05, "loss": 0.1714, "step": 82 }, { "epoch": 0.09621096863000797, "grad_norm": 0.37155240774154663, "learning_rate": 4.952298850574713e-05, "loss": 0.1589, "step": 83 }, { "epoch": 0.09737013692675506, "grad_norm": 0.4023563861846924, "learning_rate": 4.9517241379310346e-05, "loss": 0.1751, "step": 84 }, { "epoch": 0.09852930522350213, "grad_norm": 0.3763880431652069, "learning_rate": 4.951149425287356e-05, "loss": 0.1624, "step": 85 }, { "epoch": 0.09968847352024922, "grad_norm": 0.38636812567710876, "learning_rate": 4.950574712643679e-05, "loss": 0.1609, "step": 86 }, { "epoch": 0.1008476418169963, "grad_norm": 0.4320867657661438, "learning_rate": 4.9500000000000004e-05, "loss": 0.1945, "step": 87 }, { "epoch": 0.10200681011374339, "grad_norm": 0.3686438798904419, "learning_rate": 4.949425287356322e-05, "loss": 0.167, "step": 88 }, { "epoch": 0.10316597841049047, "grad_norm": 0.4408303201198578, "learning_rate": 4.948850574712644e-05, "loss": 0.1628, "step": 89 }, { "epoch": 0.10432514670723755, "grad_norm": 0.5211153030395508, "learning_rate": 4.9482758620689655e-05, "loss": 0.1902, "step": 90 }, { "epoch": 0.10548431500398464, "grad_norm": 0.3960166275501251, "learning_rate": 4.947701149425288e-05, "loss": 0.1615, "step": 91 }, { "epoch": 0.10664348330073173, "grad_norm": 0.3887445628643036, "learning_rate": 4.947126436781609e-05, "loss": 0.1663, "step": 92 }, { "epoch": 0.10780265159747882, "grad_norm": 0.4395574927330017, "learning_rate": 4.9465517241379314e-05, "loss": 0.1825, "step": 93 }, { "epoch": 0.10896181989422589, "grad_norm": 0.4707461893558502, "learning_rate": 4.9459770114942535e-05, "loss": 0.1955, "step": 94 }, { "epoch": 0.11012098819097298, "grad_norm": 0.4846711754798889, "learning_rate": 4.945402298850575e-05, "loss": 0.2082, "step": 95 }, { "epoch": 0.11128015648772006, "grad_norm": 0.45903241634368896, "learning_rate": 4.9448275862068965e-05, "loss": 0.1856, "step": 96 }, { "epoch": 0.11243932478446715, "grad_norm": 0.4076662063598633, "learning_rate": 4.944252873563219e-05, "loss": 0.1644, "step": 97 }, { "epoch": 0.11359849308121422, "grad_norm": 0.4092468321323395, "learning_rate": 4.94367816091954e-05, "loss": 0.1709, "step": 98 }, { "epoch": 0.11475766137796131, "grad_norm": 0.3405769169330597, "learning_rate": 4.943103448275862e-05, "loss": 0.1428, "step": 99 }, { "epoch": 0.1159168296747084, "grad_norm": 0.38396161794662476, "learning_rate": 4.9425287356321845e-05, "loss": 0.1582, "step": 100 }, { "epoch": 0.11707599797145549, "grad_norm": 0.38220831751823425, "learning_rate": 4.941954022988506e-05, "loss": 0.1645, "step": 101 }, { "epoch": 0.11823516626820256, "grad_norm": 0.3981216549873352, "learning_rate": 4.941379310344828e-05, "loss": 0.1587, "step": 102 }, { "epoch": 0.11939433456494965, "grad_norm": 0.3643917739391327, "learning_rate": 4.9408045977011496e-05, "loss": 0.1534, "step": 103 }, { "epoch": 0.12055350286169673, "grad_norm": 0.42615723609924316, "learning_rate": 4.940229885057471e-05, "loss": 0.169, "step": 104 }, { "epoch": 0.12171267115844382, "grad_norm": 0.4193412959575653, "learning_rate": 4.939655172413793e-05, "loss": 0.1586, "step": 105 }, { "epoch": 0.1228718394551909, "grad_norm": 0.4014485478401184, "learning_rate": 4.9390804597701155e-05, "loss": 0.1523, "step": 106 }, { "epoch": 0.12403100775193798, "grad_norm": 0.41445374488830566, "learning_rate": 4.938505747126437e-05, "loss": 0.1555, "step": 107 }, { "epoch": 0.12519017604868507, "grad_norm": 0.39025428891181946, "learning_rate": 4.937931034482759e-05, "loss": 0.1719, "step": 108 }, { "epoch": 0.12634934434543216, "grad_norm": 0.36380714178085327, "learning_rate": 4.9373563218390806e-05, "loss": 0.1539, "step": 109 }, { "epoch": 0.12750851264217924, "grad_norm": 0.4117802679538727, "learning_rate": 4.936781609195403e-05, "loss": 0.162, "step": 110 }, { "epoch": 0.12866768093892633, "grad_norm": 0.3868742883205414, "learning_rate": 4.936206896551724e-05, "loss": 0.1589, "step": 111 }, { "epoch": 0.1298268492356734, "grad_norm": 0.34659501910209656, "learning_rate": 4.935632183908046e-05, "loss": 0.1594, "step": 112 }, { "epoch": 0.13098601753242048, "grad_norm": 0.38265547156333923, "learning_rate": 4.935057471264368e-05, "loss": 0.1459, "step": 113 }, { "epoch": 0.13214518582916757, "grad_norm": 0.39254799485206604, "learning_rate": 4.93448275862069e-05, "loss": 0.1608, "step": 114 }, { "epoch": 0.13330435412591465, "grad_norm": 0.3889453411102295, "learning_rate": 4.9339080459770116e-05, "loss": 0.1637, "step": 115 }, { "epoch": 0.13446352242266174, "grad_norm": 0.41454294323921204, "learning_rate": 4.933333333333334e-05, "loss": 0.1532, "step": 116 }, { "epoch": 0.13562269071940883, "grad_norm": 0.4028087854385376, "learning_rate": 4.932758620689655e-05, "loss": 0.1569, "step": 117 }, { "epoch": 0.13678185901615592, "grad_norm": 0.4449205994606018, "learning_rate": 4.9321839080459774e-05, "loss": 0.1745, "step": 118 }, { "epoch": 0.137941027312903, "grad_norm": 0.3685709536075592, "learning_rate": 4.931609195402299e-05, "loss": 0.1476, "step": 119 }, { "epoch": 0.13910019560965006, "grad_norm": 0.44437339901924133, "learning_rate": 4.931034482758621e-05, "loss": 0.1607, "step": 120 }, { "epoch": 0.14025936390639715, "grad_norm": 0.5566555261611938, "learning_rate": 4.930459770114943e-05, "loss": 0.1943, "step": 121 }, { "epoch": 0.14141853220314424, "grad_norm": 0.46472108364105225, "learning_rate": 4.929885057471265e-05, "loss": 0.1634, "step": 122 }, { "epoch": 0.14257770049989132, "grad_norm": 0.38040968775749207, "learning_rate": 4.929310344827586e-05, "loss": 0.1411, "step": 123 }, { "epoch": 0.1437368687966384, "grad_norm": 0.3785533607006073, "learning_rate": 4.9287356321839084e-05, "loss": 0.1515, "step": 124 }, { "epoch": 0.1448960370933855, "grad_norm": 0.402432918548584, "learning_rate": 4.92816091954023e-05, "loss": 0.1529, "step": 125 }, { "epoch": 0.1460552053901326, "grad_norm": 0.3900763690471649, "learning_rate": 4.9275862068965514e-05, "loss": 0.1716, "step": 126 }, { "epoch": 0.14721437368687967, "grad_norm": 0.39079996943473816, "learning_rate": 4.927011494252874e-05, "loss": 0.1563, "step": 127 }, { "epoch": 0.14837354198362676, "grad_norm": 0.38309377431869507, "learning_rate": 4.926436781609196e-05, "loss": 0.1602, "step": 128 }, { "epoch": 0.14953271028037382, "grad_norm": 0.37955620884895325, "learning_rate": 4.925862068965518e-05, "loss": 0.1703, "step": 129 }, { "epoch": 0.1506918785771209, "grad_norm": 0.37275955080986023, "learning_rate": 4.9252873563218394e-05, "loss": 0.1599, "step": 130 }, { "epoch": 0.151851046873868, "grad_norm": 0.48944053053855896, "learning_rate": 4.924712643678161e-05, "loss": 0.1932, "step": 131 }, { "epoch": 0.15301021517061508, "grad_norm": 0.4082753360271454, "learning_rate": 4.924137931034483e-05, "loss": 0.1674, "step": 132 }, { "epoch": 0.15416938346736217, "grad_norm": 0.5443015098571777, "learning_rate": 4.9235632183908045e-05, "loss": 0.1679, "step": 133 }, { "epoch": 0.15532855176410926, "grad_norm": 0.3231983184814453, "learning_rate": 4.922988505747127e-05, "loss": 0.1397, "step": 134 }, { "epoch": 0.15648772006085634, "grad_norm": 0.384142130613327, "learning_rate": 4.922413793103449e-05, "loss": 0.1601, "step": 135 }, { "epoch": 0.15764688835760343, "grad_norm": 0.4432188868522644, "learning_rate": 4.9218390804597703e-05, "loss": 0.1746, "step": 136 }, { "epoch": 0.1588060566543505, "grad_norm": 0.3848440647125244, "learning_rate": 4.9212643678160925e-05, "loss": 0.1514, "step": 137 }, { "epoch": 0.15996522495109758, "grad_norm": 0.40900081396102905, "learning_rate": 4.920689655172414e-05, "loss": 0.1728, "step": 138 }, { "epoch": 0.16112439324784467, "grad_norm": 0.35513079166412354, "learning_rate": 4.9201149425287355e-05, "loss": 0.141, "step": 139 }, { "epoch": 0.16228356154459175, "grad_norm": 0.4035043716430664, "learning_rate": 4.9195402298850577e-05, "loss": 0.1507, "step": 140 }, { "epoch": 0.16344272984133884, "grad_norm": 0.4026646614074707, "learning_rate": 4.91896551724138e-05, "loss": 0.1629, "step": 141 }, { "epoch": 0.16460189813808593, "grad_norm": 0.35455062985420227, "learning_rate": 4.918390804597701e-05, "loss": 0.1671, "step": 142 }, { "epoch": 0.16576106643483302, "grad_norm": 0.3873222768306732, "learning_rate": 4.9178160919540235e-05, "loss": 0.1606, "step": 143 }, { "epoch": 0.1669202347315801, "grad_norm": 0.34480100870132446, "learning_rate": 4.917241379310345e-05, "loss": 0.1446, "step": 144 }, { "epoch": 0.16807940302832716, "grad_norm": 0.3599451184272766, "learning_rate": 4.9166666666666665e-05, "loss": 0.1432, "step": 145 }, { "epoch": 0.16923857132507425, "grad_norm": 0.38096269965171814, "learning_rate": 4.9160919540229886e-05, "loss": 0.1726, "step": 146 }, { "epoch": 0.17039773962182134, "grad_norm": 0.384888619184494, "learning_rate": 4.915517241379311e-05, "loss": 0.1579, "step": 147 }, { "epoch": 0.17155690791856842, "grad_norm": 0.3715001344680786, "learning_rate": 4.914942528735633e-05, "loss": 0.1734, "step": 148 }, { "epoch": 0.1727160762153155, "grad_norm": 0.4220834970474243, "learning_rate": 4.9143678160919545e-05, "loss": 0.1752, "step": 149 }, { "epoch": 0.1738752445120626, "grad_norm": 0.3289235830307007, "learning_rate": 4.913793103448276e-05, "loss": 0.1441, "step": 150 }, { "epoch": 0.1750344128088097, "grad_norm": 0.3469524085521698, "learning_rate": 4.913218390804598e-05, "loss": 0.1416, "step": 151 }, { "epoch": 0.17619358110555677, "grad_norm": 0.38370615243911743, "learning_rate": 4.9126436781609196e-05, "loss": 0.1604, "step": 152 }, { "epoch": 0.17735274940230383, "grad_norm": 0.33676958084106445, "learning_rate": 4.912068965517241e-05, "loss": 0.1585, "step": 153 }, { "epoch": 0.17851191769905092, "grad_norm": 0.3484872281551361, "learning_rate": 4.911494252873563e-05, "loss": 0.1516, "step": 154 }, { "epoch": 0.179671085995798, "grad_norm": 0.3425162434577942, "learning_rate": 4.9109195402298854e-05, "loss": 0.1468, "step": 155 }, { "epoch": 0.1808302542925451, "grad_norm": 0.3834597170352936, "learning_rate": 4.9103448275862076e-05, "loss": 0.1902, "step": 156 }, { "epoch": 0.18198942258929218, "grad_norm": 0.4861818253993988, "learning_rate": 4.909770114942529e-05, "loss": 0.1666, "step": 157 }, { "epoch": 0.18314859088603927, "grad_norm": 0.4293661415576935, "learning_rate": 4.9091954022988506e-05, "loss": 0.1476, "step": 158 }, { "epoch": 0.18430775918278636, "grad_norm": 0.4156044125556946, "learning_rate": 4.908620689655173e-05, "loss": 0.1427, "step": 159 }, { "epoch": 0.18546692747953344, "grad_norm": 0.3734503388404846, "learning_rate": 4.908045977011494e-05, "loss": 0.1461, "step": 160 }, { "epoch": 0.18662609577628053, "grad_norm": 0.4116658568382263, "learning_rate": 4.9074712643678164e-05, "loss": 0.1538, "step": 161 }, { "epoch": 0.1877852640730276, "grad_norm": 0.40862905979156494, "learning_rate": 4.9068965517241386e-05, "loss": 0.1567, "step": 162 }, { "epoch": 0.18894443236977468, "grad_norm": 0.3499276041984558, "learning_rate": 4.90632183908046e-05, "loss": 0.1413, "step": 163 }, { "epoch": 0.19010360066652177, "grad_norm": 0.40991586446762085, "learning_rate": 4.9057471264367816e-05, "loss": 0.1686, "step": 164 }, { "epoch": 0.19126276896326885, "grad_norm": 0.3891771733760834, "learning_rate": 4.905172413793104e-05, "loss": 0.157, "step": 165 }, { "epoch": 0.19242193726001594, "grad_norm": 0.3548631966114044, "learning_rate": 4.904597701149425e-05, "loss": 0.1384, "step": 166 }, { "epoch": 0.19358110555676303, "grad_norm": 0.36914005875587463, "learning_rate": 4.9040229885057474e-05, "loss": 0.1409, "step": 167 }, { "epoch": 0.19474027385351012, "grad_norm": 0.36561641097068787, "learning_rate": 4.9034482758620695e-05, "loss": 0.155, "step": 168 }, { "epoch": 0.1958994421502572, "grad_norm": 0.38047492504119873, "learning_rate": 4.902873563218391e-05, "loss": 0.153, "step": 169 }, { "epoch": 0.19705861044700426, "grad_norm": 0.3930132985115051, "learning_rate": 4.902298850574713e-05, "loss": 0.168, "step": 170 }, { "epoch": 0.19821777874375135, "grad_norm": 0.4097788333892822, "learning_rate": 4.901724137931035e-05, "loss": 0.1684, "step": 171 }, { "epoch": 0.19937694704049844, "grad_norm": 0.3460334837436676, "learning_rate": 4.901149425287356e-05, "loss": 0.1477, "step": 172 }, { "epoch": 0.20053611533724552, "grad_norm": 0.31552115082740784, "learning_rate": 4.9005747126436784e-05, "loss": 0.1382, "step": 173 }, { "epoch": 0.2016952836339926, "grad_norm": 0.36335986852645874, "learning_rate": 4.9e-05, "loss": 0.1429, "step": 174 }, { "epoch": 0.2028544519307397, "grad_norm": 0.32661738991737366, "learning_rate": 4.899425287356322e-05, "loss": 0.1515, "step": 175 }, { "epoch": 0.20401362022748679, "grad_norm": 0.386250764131546, "learning_rate": 4.898850574712644e-05, "loss": 0.1558, "step": 176 }, { "epoch": 0.20517278852423387, "grad_norm": 0.4087134599685669, "learning_rate": 4.898275862068966e-05, "loss": 0.1639, "step": 177 }, { "epoch": 0.20633195682098093, "grad_norm": 0.39643535017967224, "learning_rate": 4.897701149425288e-05, "loss": 0.1487, "step": 178 }, { "epoch": 0.20749112511772802, "grad_norm": 0.36634111404418945, "learning_rate": 4.897126436781609e-05, "loss": 0.1383, "step": 179 }, { "epoch": 0.2086502934144751, "grad_norm": 0.32414987683296204, "learning_rate": 4.896551724137931e-05, "loss": 0.1345, "step": 180 }, { "epoch": 0.2098094617112222, "grad_norm": 0.3729398548603058, "learning_rate": 4.895977011494253e-05, "loss": 0.14, "step": 181 }, { "epoch": 0.21096863000796928, "grad_norm": 0.32442769408226013, "learning_rate": 4.895402298850575e-05, "loss": 0.1573, "step": 182 }, { "epoch": 0.21212779830471637, "grad_norm": 0.33038130402565, "learning_rate": 4.8948275862068966e-05, "loss": 0.1539, "step": 183 }, { "epoch": 0.21328696660146346, "grad_norm": 0.3526158630847931, "learning_rate": 4.894252873563219e-05, "loss": 0.1471, "step": 184 }, { "epoch": 0.21444613489821054, "grad_norm": 0.3698391020298004, "learning_rate": 4.89367816091954e-05, "loss": 0.155, "step": 185 }, { "epoch": 0.21560530319495763, "grad_norm": 0.35763153433799744, "learning_rate": 4.8931034482758625e-05, "loss": 0.145, "step": 186 }, { "epoch": 0.2167644714917047, "grad_norm": 0.396771639585495, "learning_rate": 4.892528735632184e-05, "loss": 0.1616, "step": 187 }, { "epoch": 0.21792363978845178, "grad_norm": 0.3490588963031769, "learning_rate": 4.891954022988506e-05, "loss": 0.1491, "step": 188 }, { "epoch": 0.21908280808519887, "grad_norm": 0.3624797761440277, "learning_rate": 4.891379310344828e-05, "loss": 0.1731, "step": 189 }, { "epoch": 0.22024197638194595, "grad_norm": 0.45168307423591614, "learning_rate": 4.89080459770115e-05, "loss": 0.1598, "step": 190 }, { "epoch": 0.22140114467869304, "grad_norm": 0.35063374042510986, "learning_rate": 4.890229885057471e-05, "loss": 0.1479, "step": 191 }, { "epoch": 0.22256031297544013, "grad_norm": 0.317920982837677, "learning_rate": 4.8896551724137934e-05, "loss": 0.1432, "step": 192 }, { "epoch": 0.22371948127218722, "grad_norm": 0.31059324741363525, "learning_rate": 4.889080459770115e-05, "loss": 0.1419, "step": 193 }, { "epoch": 0.2248786495689343, "grad_norm": 0.3705626428127289, "learning_rate": 4.888505747126437e-05, "loss": 0.1583, "step": 194 }, { "epoch": 0.22603781786568136, "grad_norm": 0.3402438759803772, "learning_rate": 4.8879310344827586e-05, "loss": 0.138, "step": 195 }, { "epoch": 0.22719698616242845, "grad_norm": 0.3639499247074127, "learning_rate": 4.887356321839081e-05, "loss": 0.154, "step": 196 }, { "epoch": 0.22835615445917554, "grad_norm": 0.33627521991729736, "learning_rate": 4.886781609195403e-05, "loss": 0.1435, "step": 197 }, { "epoch": 0.22951532275592262, "grad_norm": 0.35936155915260315, "learning_rate": 4.8862068965517244e-05, "loss": 0.1567, "step": 198 }, { "epoch": 0.2306744910526697, "grad_norm": 0.4516816735267639, "learning_rate": 4.885632183908046e-05, "loss": 0.14, "step": 199 }, { "epoch": 0.2318336593494168, "grad_norm": 0.3906489610671997, "learning_rate": 4.885057471264368e-05, "loss": 0.1405, "step": 200 }, { "epoch": 0.23299282764616389, "grad_norm": 0.34470584988594055, "learning_rate": 4.8844827586206896e-05, "loss": 0.1519, "step": 201 }, { "epoch": 0.23415199594291097, "grad_norm": 0.3606179654598236, "learning_rate": 4.883908045977012e-05, "loss": 0.1514, "step": 202 }, { "epoch": 0.23531116423965803, "grad_norm": 0.31473562121391296, "learning_rate": 4.883333333333334e-05, "loss": 0.1472, "step": 203 }, { "epoch": 0.23647033253640512, "grad_norm": 0.3649226725101471, "learning_rate": 4.8827586206896554e-05, "loss": 0.1565, "step": 204 }, { "epoch": 0.2376295008331522, "grad_norm": 0.3161579370498657, "learning_rate": 4.8821839080459776e-05, "loss": 0.1357, "step": 205 }, { "epoch": 0.2387886691298993, "grad_norm": 0.3359610140323639, "learning_rate": 4.881609195402299e-05, "loss": 0.1486, "step": 206 }, { "epoch": 0.23994783742664638, "grad_norm": 0.38040071725845337, "learning_rate": 4.8810344827586205e-05, "loss": 0.1607, "step": 207 }, { "epoch": 0.24110700572339347, "grad_norm": 0.37179407477378845, "learning_rate": 4.880459770114943e-05, "loss": 0.1472, "step": 208 }, { "epoch": 0.24226617402014056, "grad_norm": 0.3661664128303528, "learning_rate": 4.879885057471265e-05, "loss": 0.1528, "step": 209 }, { "epoch": 0.24342534231688764, "grad_norm": 0.39830392599105835, "learning_rate": 4.8793103448275864e-05, "loss": 0.1465, "step": 210 }, { "epoch": 0.2445845106136347, "grad_norm": 0.4218502938747406, "learning_rate": 4.8787356321839085e-05, "loss": 0.1552, "step": 211 }, { "epoch": 0.2457436789103818, "grad_norm": 0.45591163635253906, "learning_rate": 4.87816091954023e-05, "loss": 0.1648, "step": 212 }, { "epoch": 0.24690284720712888, "grad_norm": 0.33883577585220337, "learning_rate": 4.877586206896552e-05, "loss": 0.1471, "step": 213 }, { "epoch": 0.24806201550387597, "grad_norm": 0.4139156937599182, "learning_rate": 4.877011494252874e-05, "loss": 0.1603, "step": 214 }, { "epoch": 0.24922118380062305, "grad_norm": 0.4006992280483246, "learning_rate": 4.876436781609195e-05, "loss": 0.1698, "step": 215 }, { "epoch": 0.25038035209737014, "grad_norm": 0.3272416889667511, "learning_rate": 4.875862068965517e-05, "loss": 0.1271, "step": 216 }, { "epoch": 0.2515395203941172, "grad_norm": 0.3487521708011627, "learning_rate": 4.8752873563218395e-05, "loss": 0.1528, "step": 217 }, { "epoch": 0.2526986886908643, "grad_norm": 0.39113104343414307, "learning_rate": 4.874712643678161e-05, "loss": 0.1763, "step": 218 }, { "epoch": 0.2538578569876114, "grad_norm": 0.32122161984443665, "learning_rate": 4.874137931034483e-05, "loss": 0.1324, "step": 219 }, { "epoch": 0.2550170252843585, "grad_norm": 0.32524266839027405, "learning_rate": 4.8735632183908047e-05, "loss": 0.1483, "step": 220 }, { "epoch": 0.25617619358110555, "grad_norm": 0.3443804085254669, "learning_rate": 4.872988505747126e-05, "loss": 0.1656, "step": 221 }, { "epoch": 0.25733536187785266, "grad_norm": 0.33968833088874817, "learning_rate": 4.872413793103448e-05, "loss": 0.1425, "step": 222 }, { "epoch": 0.2584945301745997, "grad_norm": 0.29967644810676575, "learning_rate": 4.8718390804597705e-05, "loss": 0.1458, "step": 223 }, { "epoch": 0.2596536984713468, "grad_norm": 0.36407482624053955, "learning_rate": 4.8712643678160926e-05, "loss": 0.1467, "step": 224 }, { "epoch": 0.2608128667680939, "grad_norm": 0.42614683508872986, "learning_rate": 4.870689655172414e-05, "loss": 0.1665, "step": 225 }, { "epoch": 0.26197203506484096, "grad_norm": 0.3307810425758362, "learning_rate": 4.8701149425287356e-05, "loss": 0.1389, "step": 226 }, { "epoch": 0.2631312033615881, "grad_norm": 0.3416149616241455, "learning_rate": 4.869540229885058e-05, "loss": 0.1616, "step": 227 }, { "epoch": 0.26429037165833513, "grad_norm": 0.3303369879722595, "learning_rate": 4.868965517241379e-05, "loss": 0.1472, "step": 228 }, { "epoch": 0.26544953995508225, "grad_norm": 0.31898778676986694, "learning_rate": 4.8683908045977015e-05, "loss": 0.1374, "step": 229 }, { "epoch": 0.2666087082518293, "grad_norm": 0.33968180418014526, "learning_rate": 4.8678160919540236e-05, "loss": 0.1549, "step": 230 }, { "epoch": 0.2677678765485764, "grad_norm": 0.3462338149547577, "learning_rate": 4.867241379310345e-05, "loss": 0.1548, "step": 231 }, { "epoch": 0.2689270448453235, "grad_norm": 0.27784019708633423, "learning_rate": 4.866666666666667e-05, "loss": 0.1323, "step": 232 }, { "epoch": 0.27008621314207054, "grad_norm": 0.3605106770992279, "learning_rate": 4.866091954022989e-05, "loss": 0.149, "step": 233 }, { "epoch": 0.27124538143881766, "grad_norm": 0.39858150482177734, "learning_rate": 4.86551724137931e-05, "loss": 0.1625, "step": 234 }, { "epoch": 0.2724045497355647, "grad_norm": 0.34399569034576416, "learning_rate": 4.8649425287356324e-05, "loss": 0.1536, "step": 235 }, { "epoch": 0.27356371803231183, "grad_norm": 0.33953142166137695, "learning_rate": 4.864367816091954e-05, "loss": 0.1471, "step": 236 }, { "epoch": 0.2747228863290589, "grad_norm": 0.3053218722343445, "learning_rate": 4.863793103448276e-05, "loss": 0.1288, "step": 237 }, { "epoch": 0.275882054625806, "grad_norm": 0.40773364901542664, "learning_rate": 4.863218390804598e-05, "loss": 0.1814, "step": 238 }, { "epoch": 0.27704122292255307, "grad_norm": 0.35415759682655334, "learning_rate": 4.86264367816092e-05, "loss": 0.1579, "step": 239 }, { "epoch": 0.2782003912193001, "grad_norm": 0.39274096488952637, "learning_rate": 4.862068965517241e-05, "loss": 0.1678, "step": 240 }, { "epoch": 0.27935955951604724, "grad_norm": 0.3034732937812805, "learning_rate": 4.8614942528735634e-05, "loss": 0.1498, "step": 241 }, { "epoch": 0.2805187278127943, "grad_norm": 0.3109908998012543, "learning_rate": 4.860919540229885e-05, "loss": 0.149, "step": 242 }, { "epoch": 0.2816778961095414, "grad_norm": 0.31770509481430054, "learning_rate": 4.860344827586207e-05, "loss": 0.1479, "step": 243 }, { "epoch": 0.2828370644062885, "grad_norm": 0.3324630856513977, "learning_rate": 4.859770114942529e-05, "loss": 0.15, "step": 244 }, { "epoch": 0.2839962327030356, "grad_norm": 0.30650344491004944, "learning_rate": 4.859195402298851e-05, "loss": 0.1328, "step": 245 }, { "epoch": 0.28515540099978265, "grad_norm": 0.3292284309864044, "learning_rate": 4.858620689655173e-05, "loss": 0.1432, "step": 246 }, { "epoch": 0.28631456929652976, "grad_norm": 0.3721246123313904, "learning_rate": 4.8580459770114944e-05, "loss": 0.1695, "step": 247 }, { "epoch": 0.2874737375932768, "grad_norm": 0.39138564467430115, "learning_rate": 4.857471264367816e-05, "loss": 0.1595, "step": 248 }, { "epoch": 0.2886329058900239, "grad_norm": 0.3113502860069275, "learning_rate": 4.856896551724138e-05, "loss": 0.1474, "step": 249 }, { "epoch": 0.289792074186771, "grad_norm": 0.3272615373134613, "learning_rate": 4.85632183908046e-05, "loss": 0.1557, "step": 250 }, { "epoch": 0.29095124248351806, "grad_norm": 0.32985273003578186, "learning_rate": 4.8557471264367824e-05, "loss": 0.1543, "step": 251 }, { "epoch": 0.2921104107802652, "grad_norm": 0.4038926362991333, "learning_rate": 4.855172413793104e-05, "loss": 0.1762, "step": 252 }, { "epoch": 0.29326957907701223, "grad_norm": 0.5201258063316345, "learning_rate": 4.8545977011494253e-05, "loss": 0.1435, "step": 253 }, { "epoch": 0.29442874737375935, "grad_norm": 0.43428415060043335, "learning_rate": 4.8540229885057475e-05, "loss": 0.1461, "step": 254 }, { "epoch": 0.2955879156705064, "grad_norm": 0.29131442308425903, "learning_rate": 4.853448275862069e-05, "loss": 0.1384, "step": 255 }, { "epoch": 0.2967470839672535, "grad_norm": 0.40190252661705017, "learning_rate": 4.8528735632183905e-05, "loss": 0.1433, "step": 256 }, { "epoch": 0.2979062522640006, "grad_norm": 0.34535473585128784, "learning_rate": 4.8522988505747133e-05, "loss": 0.147, "step": 257 }, { "epoch": 0.29906542056074764, "grad_norm": 0.3541038930416107, "learning_rate": 4.851724137931035e-05, "loss": 0.1658, "step": 258 }, { "epoch": 0.30022458885749476, "grad_norm": 0.39234408736228943, "learning_rate": 4.851149425287357e-05, "loss": 0.1711, "step": 259 }, { "epoch": 0.3013837571542418, "grad_norm": 0.37314096093177795, "learning_rate": 4.8505747126436785e-05, "loss": 0.1475, "step": 260 }, { "epoch": 0.30254292545098893, "grad_norm": 0.3385733366012573, "learning_rate": 4.85e-05, "loss": 0.1406, "step": 261 }, { "epoch": 0.303702093747736, "grad_norm": 0.6898816823959351, "learning_rate": 4.849425287356322e-05, "loss": 0.1584, "step": 262 }, { "epoch": 0.3048612620444831, "grad_norm": 0.30045434832572937, "learning_rate": 4.8488505747126436e-05, "loss": 0.1329, "step": 263 }, { "epoch": 0.30602043034123017, "grad_norm": 0.40219223499298096, "learning_rate": 4.848275862068966e-05, "loss": 0.1529, "step": 264 }, { "epoch": 0.3071795986379772, "grad_norm": 0.30509233474731445, "learning_rate": 4.847701149425288e-05, "loss": 0.1591, "step": 265 }, { "epoch": 0.30833876693472434, "grad_norm": 0.3983840048313141, "learning_rate": 4.8471264367816095e-05, "loss": 0.1598, "step": 266 }, { "epoch": 0.3094979352314714, "grad_norm": 0.39158061146736145, "learning_rate": 4.846551724137931e-05, "loss": 0.164, "step": 267 }, { "epoch": 0.3106571035282185, "grad_norm": 0.3619145452976227, "learning_rate": 4.845977011494253e-05, "loss": 0.152, "step": 268 }, { "epoch": 0.3118162718249656, "grad_norm": 0.34043577313423157, "learning_rate": 4.8454022988505746e-05, "loss": 0.1465, "step": 269 }, { "epoch": 0.3129754401217127, "grad_norm": 0.38417112827301025, "learning_rate": 4.844827586206897e-05, "loss": 0.1398, "step": 270 }, { "epoch": 0.31413460841845975, "grad_norm": 0.33110880851745605, "learning_rate": 4.844252873563219e-05, "loss": 0.1435, "step": 271 }, { "epoch": 0.31529377671520686, "grad_norm": 0.3095364272594452, "learning_rate": 4.8436781609195404e-05, "loss": 0.1474, "step": 272 }, { "epoch": 0.3164529450119539, "grad_norm": 0.34774062037467957, "learning_rate": 4.8431034482758626e-05, "loss": 0.156, "step": 273 }, { "epoch": 0.317612113308701, "grad_norm": 0.3605648875236511, "learning_rate": 4.842528735632184e-05, "loss": 0.1545, "step": 274 }, { "epoch": 0.3187712816054481, "grad_norm": 0.32547488808631897, "learning_rate": 4.8419540229885056e-05, "loss": 0.1448, "step": 275 }, { "epoch": 0.31993044990219516, "grad_norm": 0.4434783458709717, "learning_rate": 4.841379310344828e-05, "loss": 0.1837, "step": 276 }, { "epoch": 0.3210896181989423, "grad_norm": 0.3950771391391754, "learning_rate": 4.840804597701149e-05, "loss": 0.1667, "step": 277 }, { "epoch": 0.32224878649568933, "grad_norm": 0.27604764699935913, "learning_rate": 4.840229885057472e-05, "loss": 0.1311, "step": 278 }, { "epoch": 0.32340795479243645, "grad_norm": 0.29781222343444824, "learning_rate": 4.8396551724137936e-05, "loss": 0.1507, "step": 279 }, { "epoch": 0.3245671230891835, "grad_norm": 0.300820529460907, "learning_rate": 4.839080459770115e-05, "loss": 0.1395, "step": 280 }, { "epoch": 0.3257262913859306, "grad_norm": 0.3563086688518524, "learning_rate": 4.838505747126437e-05, "loss": 0.1542, "step": 281 }, { "epoch": 0.3268854596826777, "grad_norm": 0.3500363826751709, "learning_rate": 4.837931034482759e-05, "loss": 0.1531, "step": 282 }, { "epoch": 0.32804462797942474, "grad_norm": 0.340751975774765, "learning_rate": 4.83735632183908e-05, "loss": 0.154, "step": 283 }, { "epoch": 0.32920379627617186, "grad_norm": 0.3770763874053955, "learning_rate": 4.8367816091954024e-05, "loss": 0.1632, "step": 284 }, { "epoch": 0.3303629645729189, "grad_norm": 0.3394136130809784, "learning_rate": 4.8362068965517246e-05, "loss": 0.1575, "step": 285 }, { "epoch": 0.33152213286966603, "grad_norm": 0.3633858263492584, "learning_rate": 4.835632183908046e-05, "loss": 0.1511, "step": 286 }, { "epoch": 0.3326813011664131, "grad_norm": 0.27468162775039673, "learning_rate": 4.835057471264368e-05, "loss": 0.1445, "step": 287 }, { "epoch": 0.3338404694631602, "grad_norm": 0.29763033986091614, "learning_rate": 4.83448275862069e-05, "loss": 0.1462, "step": 288 }, { "epoch": 0.33499963775990726, "grad_norm": 0.2883772850036621, "learning_rate": 4.833908045977012e-05, "loss": 0.135, "step": 289 }, { "epoch": 0.3361588060566543, "grad_norm": 0.3238728642463684, "learning_rate": 4.8333333333333334e-05, "loss": 0.1576, "step": 290 }, { "epoch": 0.33731797435340144, "grad_norm": 0.4045778214931488, "learning_rate": 4.8327586206896555e-05, "loss": 0.1555, "step": 291 }, { "epoch": 0.3384771426501485, "grad_norm": 0.3956005871295929, "learning_rate": 4.832183908045978e-05, "loss": 0.17, "step": 292 }, { "epoch": 0.3396363109468956, "grad_norm": 0.2896331548690796, "learning_rate": 4.831609195402299e-05, "loss": 0.1436, "step": 293 }, { "epoch": 0.3407954792436427, "grad_norm": 0.29543447494506836, "learning_rate": 4.831034482758621e-05, "loss": 0.1499, "step": 294 }, { "epoch": 0.3419546475403898, "grad_norm": 0.32724907994270325, "learning_rate": 4.830459770114943e-05, "loss": 0.1568, "step": 295 }, { "epoch": 0.34311381583713685, "grad_norm": 0.376208633184433, "learning_rate": 4.829885057471264e-05, "loss": 0.1449, "step": 296 }, { "epoch": 0.34427298413388396, "grad_norm": 0.29767367243766785, "learning_rate": 4.8293103448275865e-05, "loss": 0.1444, "step": 297 }, { "epoch": 0.345432152430631, "grad_norm": 0.30211395025253296, "learning_rate": 4.828735632183909e-05, "loss": 0.1375, "step": 298 }, { "epoch": 0.3465913207273781, "grad_norm": 0.29970985651016235, "learning_rate": 4.82816091954023e-05, "loss": 0.1319, "step": 299 }, { "epoch": 0.3477504890241252, "grad_norm": 0.30623510479927063, "learning_rate": 4.827586206896552e-05, "loss": 0.1489, "step": 300 }, { "epoch": 0.34890965732087226, "grad_norm": 0.31533321738243103, "learning_rate": 4.827011494252874e-05, "loss": 0.1525, "step": 301 }, { "epoch": 0.3500688256176194, "grad_norm": 0.35718074440956116, "learning_rate": 4.826436781609195e-05, "loss": 0.1465, "step": 302 }, { "epoch": 0.35122799391436643, "grad_norm": 0.3315868675708771, "learning_rate": 4.8258620689655175e-05, "loss": 0.1438, "step": 303 }, { "epoch": 0.35238716221111355, "grad_norm": 0.3789491355419159, "learning_rate": 4.825287356321839e-05, "loss": 0.1624, "step": 304 }, { "epoch": 0.3535463305078606, "grad_norm": 0.3216198980808258, "learning_rate": 4.824712643678161e-05, "loss": 0.1495, "step": 305 }, { "epoch": 0.35470549880460767, "grad_norm": 0.24692386388778687, "learning_rate": 4.824137931034483e-05, "loss": 0.1333, "step": 306 }, { "epoch": 0.3558646671013548, "grad_norm": 0.26879554986953735, "learning_rate": 4.823563218390805e-05, "loss": 0.1328, "step": 307 }, { "epoch": 0.35702383539810184, "grad_norm": 0.3616696894168854, "learning_rate": 4.822988505747127e-05, "loss": 0.1642, "step": 308 }, { "epoch": 0.35818300369484896, "grad_norm": 0.4611773192882538, "learning_rate": 4.8224137931034485e-05, "loss": 0.1342, "step": 309 }, { "epoch": 0.359342171991596, "grad_norm": 0.26914867758750916, "learning_rate": 4.82183908045977e-05, "loss": 0.1393, "step": 310 }, { "epoch": 0.36050134028834313, "grad_norm": 0.3391886353492737, "learning_rate": 4.821264367816092e-05, "loss": 0.1502, "step": 311 }, { "epoch": 0.3616605085850902, "grad_norm": 0.34171319007873535, "learning_rate": 4.820689655172414e-05, "loss": 0.1655, "step": 312 }, { "epoch": 0.3628196768818373, "grad_norm": 0.2735805809497833, "learning_rate": 4.820114942528736e-05, "loss": 0.1211, "step": 313 }, { "epoch": 0.36397884517858436, "grad_norm": 0.25337541103363037, "learning_rate": 4.819540229885058e-05, "loss": 0.1343, "step": 314 }, { "epoch": 0.3651380134753314, "grad_norm": 0.3965268135070801, "learning_rate": 4.8189655172413794e-05, "loss": 0.1507, "step": 315 }, { "epoch": 0.36629718177207854, "grad_norm": 0.26941707730293274, "learning_rate": 4.8183908045977016e-05, "loss": 0.1258, "step": 316 }, { "epoch": 0.3674563500688256, "grad_norm": 0.31832900643348694, "learning_rate": 4.817816091954023e-05, "loss": 0.1693, "step": 317 }, { "epoch": 0.3686155183655727, "grad_norm": 0.3194613456726074, "learning_rate": 4.817241379310345e-05, "loss": 0.1574, "step": 318 }, { "epoch": 0.3697746866623198, "grad_norm": 0.3136231601238251, "learning_rate": 4.8166666666666674e-05, "loss": 0.1301, "step": 319 }, { "epoch": 0.3709338549590669, "grad_norm": 0.302774578332901, "learning_rate": 4.816091954022989e-05, "loss": 0.1329, "step": 320 }, { "epoch": 0.37209302325581395, "grad_norm": 0.3740023672580719, "learning_rate": 4.8155172413793104e-05, "loss": 0.1398, "step": 321 }, { "epoch": 0.37325219155256106, "grad_norm": 0.35367128252983093, "learning_rate": 4.8149425287356326e-05, "loss": 0.1461, "step": 322 }, { "epoch": 0.3744113598493081, "grad_norm": 0.35140734910964966, "learning_rate": 4.814367816091954e-05, "loss": 0.1545, "step": 323 }, { "epoch": 0.3755705281460552, "grad_norm": 0.3153414726257324, "learning_rate": 4.8137931034482755e-05, "loss": 0.1283, "step": 324 }, { "epoch": 0.3767296964428023, "grad_norm": 0.30042028427124023, "learning_rate": 4.813218390804598e-05, "loss": 0.151, "step": 325 }, { "epoch": 0.37788886473954936, "grad_norm": 0.29036179184913635, "learning_rate": 4.81264367816092e-05, "loss": 0.1486, "step": 326 }, { "epoch": 0.3790480330362965, "grad_norm": 0.2988058924674988, "learning_rate": 4.812068965517242e-05, "loss": 0.1545, "step": 327 }, { "epoch": 0.38020720133304353, "grad_norm": 0.3292902410030365, "learning_rate": 4.8114942528735635e-05, "loss": 0.1629, "step": 328 }, { "epoch": 0.38136636962979065, "grad_norm": 0.3768965005874634, "learning_rate": 4.810919540229885e-05, "loss": 0.1692, "step": 329 }, { "epoch": 0.3825255379265377, "grad_norm": 0.28651896119117737, "learning_rate": 4.810344827586207e-05, "loss": 0.1534, "step": 330 }, { "epoch": 0.38368470622328477, "grad_norm": 0.2976914942264557, "learning_rate": 4.809770114942529e-05, "loss": 0.1451, "step": 331 }, { "epoch": 0.3848438745200319, "grad_norm": 0.3275349736213684, "learning_rate": 4.809195402298851e-05, "loss": 0.1632, "step": 332 }, { "epoch": 0.38600304281677894, "grad_norm": 0.26491785049438477, "learning_rate": 4.808620689655173e-05, "loss": 0.1334, "step": 333 }, { "epoch": 0.38716221111352606, "grad_norm": 0.28282400965690613, "learning_rate": 4.8080459770114945e-05, "loss": 0.1349, "step": 334 }, { "epoch": 0.3883213794102731, "grad_norm": 0.31815335154533386, "learning_rate": 4.807471264367817e-05, "loss": 0.1514, "step": 335 }, { "epoch": 0.38948054770702023, "grad_norm": 0.29894590377807617, "learning_rate": 4.806896551724138e-05, "loss": 0.1563, "step": 336 }, { "epoch": 0.3906397160037673, "grad_norm": 0.31386902928352356, "learning_rate": 4.80632183908046e-05, "loss": 0.1553, "step": 337 }, { "epoch": 0.3917988843005144, "grad_norm": 0.2859923541545868, "learning_rate": 4.805747126436782e-05, "loss": 0.1531, "step": 338 }, { "epoch": 0.39295805259726146, "grad_norm": 0.3352300524711609, "learning_rate": 4.805172413793104e-05, "loss": 0.1412, "step": 339 }, { "epoch": 0.3941172208940085, "grad_norm": 0.3148883283138275, "learning_rate": 4.8045977011494255e-05, "loss": 0.1297, "step": 340 }, { "epoch": 0.39527638919075564, "grad_norm": 0.27482402324676514, "learning_rate": 4.8040229885057477e-05, "loss": 0.1333, "step": 341 }, { "epoch": 0.3964355574875027, "grad_norm": 0.3114891052246094, "learning_rate": 4.803448275862069e-05, "loss": 0.1494, "step": 342 }, { "epoch": 0.3975947257842498, "grad_norm": 0.3976518213748932, "learning_rate": 4.8028735632183906e-05, "loss": 0.1575, "step": 343 }, { "epoch": 0.3987538940809969, "grad_norm": 0.4233146607875824, "learning_rate": 4.802298850574713e-05, "loss": 0.1459, "step": 344 }, { "epoch": 0.399913062377744, "grad_norm": 0.30527806282043457, "learning_rate": 4.801724137931034e-05, "loss": 0.1369, "step": 345 }, { "epoch": 0.40107223067449105, "grad_norm": 0.28466179966926575, "learning_rate": 4.8011494252873565e-05, "loss": 0.1283, "step": 346 }, { "epoch": 0.40223139897123816, "grad_norm": 0.3994863033294678, "learning_rate": 4.8005747126436786e-05, "loss": 0.1623, "step": 347 }, { "epoch": 0.4033905672679852, "grad_norm": 0.32415613532066345, "learning_rate": 4.8e-05, "loss": 0.155, "step": 348 }, { "epoch": 0.4045497355647323, "grad_norm": 0.37542182207107544, "learning_rate": 4.799425287356322e-05, "loss": 0.1594, "step": 349 }, { "epoch": 0.4057089038614794, "grad_norm": 0.33119359612464905, "learning_rate": 4.798850574712644e-05, "loss": 0.1555, "step": 350 }, { "epoch": 0.40686807215822646, "grad_norm": 0.2560279369354248, "learning_rate": 4.798275862068965e-05, "loss": 0.1241, "step": 351 }, { "epoch": 0.40802724045497357, "grad_norm": 0.2843601405620575, "learning_rate": 4.7977011494252874e-05, "loss": 0.1289, "step": 352 }, { "epoch": 0.40918640875172063, "grad_norm": 0.29628679156303406, "learning_rate": 4.7971264367816096e-05, "loss": 0.1476, "step": 353 }, { "epoch": 0.41034557704846775, "grad_norm": 0.36033913493156433, "learning_rate": 4.796551724137932e-05, "loss": 0.1579, "step": 354 }, { "epoch": 0.4115047453452148, "grad_norm": 0.2819597125053406, "learning_rate": 4.795977011494253e-05, "loss": 0.1389, "step": 355 }, { "epoch": 0.41266391364196187, "grad_norm": 0.2536991834640503, "learning_rate": 4.795402298850575e-05, "loss": 0.1342, "step": 356 }, { "epoch": 0.413823081938709, "grad_norm": 0.339702844619751, "learning_rate": 4.794827586206897e-05, "loss": 0.1452, "step": 357 }, { "epoch": 0.41498225023545604, "grad_norm": 0.29577532410621643, "learning_rate": 4.7942528735632184e-05, "loss": 0.1516, "step": 358 }, { "epoch": 0.41614141853220316, "grad_norm": 0.30197975039482117, "learning_rate": 4.7936781609195406e-05, "loss": 0.1341, "step": 359 }, { "epoch": 0.4173005868289502, "grad_norm": 0.2787219285964966, "learning_rate": 4.793103448275863e-05, "loss": 0.1426, "step": 360 }, { "epoch": 0.41845975512569733, "grad_norm": 0.3281581997871399, "learning_rate": 4.792528735632184e-05, "loss": 0.1708, "step": 361 }, { "epoch": 0.4196189234224444, "grad_norm": 0.2598758041858673, "learning_rate": 4.791954022988506e-05, "loss": 0.1323, "step": 362 }, { "epoch": 0.4207780917191915, "grad_norm": 0.26790300011634827, "learning_rate": 4.791379310344828e-05, "loss": 0.1202, "step": 363 }, { "epoch": 0.42193726001593856, "grad_norm": 0.2569654583930969, "learning_rate": 4.7908045977011494e-05, "loss": 0.1312, "step": 364 }, { "epoch": 0.4230964283126856, "grad_norm": 0.3506622314453125, "learning_rate": 4.7902298850574716e-05, "loss": 0.1634, "step": 365 }, { "epoch": 0.42425559660943274, "grad_norm": 0.3123525083065033, "learning_rate": 4.789655172413793e-05, "loss": 0.1588, "step": 366 }, { "epoch": 0.4254147649061798, "grad_norm": 0.26198455691337585, "learning_rate": 4.789080459770115e-05, "loss": 0.1387, "step": 367 }, { "epoch": 0.4265739332029269, "grad_norm": 0.3061137795448303, "learning_rate": 4.7885057471264374e-05, "loss": 0.1444, "step": 368 }, { "epoch": 0.427733101499674, "grad_norm": 0.48860934376716614, "learning_rate": 4.787931034482759e-05, "loss": 0.151, "step": 369 }, { "epoch": 0.4288922697964211, "grad_norm": 0.3341166377067566, "learning_rate": 4.7873563218390804e-05, "loss": 0.1377, "step": 370 }, { "epoch": 0.43005143809316815, "grad_norm": 0.38944321870803833, "learning_rate": 4.7867816091954025e-05, "loss": 0.1635, "step": 371 }, { "epoch": 0.43121060638991526, "grad_norm": 0.3289624750614166, "learning_rate": 4.786206896551724e-05, "loss": 0.1492, "step": 372 }, { "epoch": 0.4323697746866623, "grad_norm": 0.2940187454223633, "learning_rate": 4.785632183908046e-05, "loss": 0.1386, "step": 373 }, { "epoch": 0.4335289429834094, "grad_norm": 0.3036423921585083, "learning_rate": 4.7850574712643684e-05, "loss": 0.1383, "step": 374 }, { "epoch": 0.4346881112801565, "grad_norm": 0.343669593334198, "learning_rate": 4.78448275862069e-05, "loss": 0.1605, "step": 375 }, { "epoch": 0.43584727957690356, "grad_norm": 0.3061246871948242, "learning_rate": 4.783908045977012e-05, "loss": 0.1286, "step": 376 }, { "epoch": 0.43700644787365067, "grad_norm": 0.33070918917655945, "learning_rate": 4.7833333333333335e-05, "loss": 0.1458, "step": 377 }, { "epoch": 0.43816561617039773, "grad_norm": 0.9597157835960388, "learning_rate": 4.782758620689655e-05, "loss": 0.1461, "step": 378 }, { "epoch": 0.43932478446714485, "grad_norm": 0.28434523940086365, "learning_rate": 4.782183908045977e-05, "loss": 0.1383, "step": 379 }, { "epoch": 0.4404839527638919, "grad_norm": 0.2993975579738617, "learning_rate": 4.781609195402299e-05, "loss": 0.1398, "step": 380 }, { "epoch": 0.44164312106063897, "grad_norm": 0.2816145420074463, "learning_rate": 4.781034482758621e-05, "loss": 0.1337, "step": 381 }, { "epoch": 0.4428022893573861, "grad_norm": 0.906739354133606, "learning_rate": 4.780459770114943e-05, "loss": 0.1285, "step": 382 }, { "epoch": 0.44396145765413314, "grad_norm": 0.25160372257232666, "learning_rate": 4.7798850574712645e-05, "loss": 0.1219, "step": 383 }, { "epoch": 0.44512062595088026, "grad_norm": 0.3275229036808014, "learning_rate": 4.7793103448275866e-05, "loss": 0.1546, "step": 384 }, { "epoch": 0.4462797942476273, "grad_norm": 0.2726188600063324, "learning_rate": 4.778735632183908e-05, "loss": 0.1479, "step": 385 }, { "epoch": 0.44743896254437443, "grad_norm": 0.2804552912712097, "learning_rate": 4.7781609195402296e-05, "loss": 0.1386, "step": 386 }, { "epoch": 0.4485981308411215, "grad_norm": 0.3268015682697296, "learning_rate": 4.777586206896552e-05, "loss": 0.1549, "step": 387 }, { "epoch": 0.4497572991378686, "grad_norm": 0.263644814491272, "learning_rate": 4.777011494252874e-05, "loss": 0.1401, "step": 388 }, { "epoch": 0.45091646743461566, "grad_norm": 0.28181982040405273, "learning_rate": 4.7764367816091954e-05, "loss": 0.1351, "step": 389 }, { "epoch": 0.4520756357313627, "grad_norm": 0.2432672381401062, "learning_rate": 4.7758620689655176e-05, "loss": 0.1279, "step": 390 }, { "epoch": 0.45323480402810984, "grad_norm": 0.24589940905570984, "learning_rate": 4.775287356321839e-05, "loss": 0.133, "step": 391 }, { "epoch": 0.4543939723248569, "grad_norm": 0.29978325963020325, "learning_rate": 4.774712643678161e-05, "loss": 0.1392, "step": 392 }, { "epoch": 0.455553140621604, "grad_norm": 0.32140663266181946, "learning_rate": 4.774137931034483e-05, "loss": 0.1376, "step": 393 }, { "epoch": 0.4567123089183511, "grad_norm": 0.3055313527584076, "learning_rate": 4.773563218390805e-05, "loss": 0.148, "step": 394 }, { "epoch": 0.4578714772150982, "grad_norm": 0.31229299306869507, "learning_rate": 4.772988505747127e-05, "loss": 0.1446, "step": 395 }, { "epoch": 0.45903064551184525, "grad_norm": 0.3736218214035034, "learning_rate": 4.7724137931034486e-05, "loss": 0.1704, "step": 396 }, { "epoch": 0.4601898138085923, "grad_norm": 0.2979601323604584, "learning_rate": 4.77183908045977e-05, "loss": 0.1347, "step": 397 }, { "epoch": 0.4613489821053394, "grad_norm": 0.288699746131897, "learning_rate": 4.771264367816092e-05, "loss": 0.1335, "step": 398 }, { "epoch": 0.4625081504020865, "grad_norm": 0.30045926570892334, "learning_rate": 4.770689655172414e-05, "loss": 0.1466, "step": 399 }, { "epoch": 0.4636673186988336, "grad_norm": 0.41966867446899414, "learning_rate": 4.770114942528736e-05, "loss": 0.1628, "step": 400 }, { "epoch": 0.46482648699558066, "grad_norm": 0.3311939537525177, "learning_rate": 4.769540229885058e-05, "loss": 0.1448, "step": 401 }, { "epoch": 0.46598565529232777, "grad_norm": 0.2968727648258209, "learning_rate": 4.7689655172413796e-05, "loss": 0.1471, "step": 402 }, { "epoch": 0.46714482358907483, "grad_norm": 0.3325969874858856, "learning_rate": 4.768390804597702e-05, "loss": 0.1527, "step": 403 }, { "epoch": 0.46830399188582195, "grad_norm": 0.32422512769699097, "learning_rate": 4.767816091954023e-05, "loss": 0.1516, "step": 404 }, { "epoch": 0.469463160182569, "grad_norm": 0.28346890211105347, "learning_rate": 4.767241379310345e-05, "loss": 0.1359, "step": 405 }, { "epoch": 0.47062232847931607, "grad_norm": 0.27612075209617615, "learning_rate": 4.766666666666667e-05, "loss": 0.1373, "step": 406 }, { "epoch": 0.4717814967760632, "grad_norm": 0.2808459401130676, "learning_rate": 4.7660919540229884e-05, "loss": 0.1408, "step": 407 }, { "epoch": 0.47294066507281024, "grad_norm": 0.2832328677177429, "learning_rate": 4.7655172413793105e-05, "loss": 0.139, "step": 408 }, { "epoch": 0.47409983336955736, "grad_norm": 0.27688664197921753, "learning_rate": 4.764942528735633e-05, "loss": 0.1392, "step": 409 }, { "epoch": 0.4752590016663044, "grad_norm": 0.3172287344932556, "learning_rate": 4.764367816091954e-05, "loss": 0.1512, "step": 410 }, { "epoch": 0.47641816996305153, "grad_norm": 0.2498215287923813, "learning_rate": 4.7637931034482764e-05, "loss": 0.12, "step": 411 }, { "epoch": 0.4775773382597986, "grad_norm": 0.3091871440410614, "learning_rate": 4.763218390804598e-05, "loss": 0.1403, "step": 412 }, { "epoch": 0.4787365065565457, "grad_norm": 0.29513412714004517, "learning_rate": 4.7626436781609193e-05, "loss": 0.1553, "step": 413 }, { "epoch": 0.47989567485329276, "grad_norm": 0.31259381771087646, "learning_rate": 4.7620689655172415e-05, "loss": 0.138, "step": 414 }, { "epoch": 0.4810548431500398, "grad_norm": 0.3411064147949219, "learning_rate": 4.761494252873564e-05, "loss": 0.1385, "step": 415 }, { "epoch": 0.48221401144678694, "grad_norm": 0.5548298358917236, "learning_rate": 4.760919540229885e-05, "loss": 0.1357, "step": 416 }, { "epoch": 0.483373179743534, "grad_norm": 0.3198058605194092, "learning_rate": 4.7603448275862073e-05, "loss": 0.1578, "step": 417 }, { "epoch": 0.4845323480402811, "grad_norm": 0.2812342643737793, "learning_rate": 4.759770114942529e-05, "loss": 0.1559, "step": 418 }, { "epoch": 0.4856915163370282, "grad_norm": 0.36544597148895264, "learning_rate": 4.759195402298851e-05, "loss": 0.1513, "step": 419 }, { "epoch": 0.4868506846337753, "grad_norm": 0.29660576581954956, "learning_rate": 4.7586206896551725e-05, "loss": 0.1506, "step": 420 }, { "epoch": 0.48800985293052235, "grad_norm": 0.2850414216518402, "learning_rate": 4.7580459770114947e-05, "loss": 0.1371, "step": 421 }, { "epoch": 0.4891690212272694, "grad_norm": 0.3292869031429291, "learning_rate": 4.757471264367817e-05, "loss": 0.1758, "step": 422 }, { "epoch": 0.4903281895240165, "grad_norm": 0.2677929997444153, "learning_rate": 4.756896551724138e-05, "loss": 0.1345, "step": 423 }, { "epoch": 0.4914873578207636, "grad_norm": 0.32202059030532837, "learning_rate": 4.75632183908046e-05, "loss": 0.1465, "step": 424 }, { "epoch": 0.4926465261175107, "grad_norm": 0.2961694300174713, "learning_rate": 4.755747126436782e-05, "loss": 0.1461, "step": 425 }, { "epoch": 0.49380569441425776, "grad_norm": 0.27797091007232666, "learning_rate": 4.7551724137931035e-05, "loss": 0.1186, "step": 426 }, { "epoch": 0.49496486271100487, "grad_norm": 0.42147669196128845, "learning_rate": 4.754597701149425e-05, "loss": 0.1351, "step": 427 }, { "epoch": 0.49612403100775193, "grad_norm": 0.41197213530540466, "learning_rate": 4.754022988505747e-05, "loss": 0.1242, "step": 428 }, { "epoch": 0.49728319930449905, "grad_norm": 0.352716326713562, "learning_rate": 4.753448275862069e-05, "loss": 0.1541, "step": 429 }, { "epoch": 0.4984423676012461, "grad_norm": 0.29174548387527466, "learning_rate": 4.7528735632183915e-05, "loss": 0.1466, "step": 430 }, { "epoch": 0.49960153589799317, "grad_norm": 0.32256200909614563, "learning_rate": 4.752298850574713e-05, "loss": 0.1365, "step": 431 }, { "epoch": 0.5007607041947403, "grad_norm": 0.35533028841018677, "learning_rate": 4.7517241379310344e-05, "loss": 0.1489, "step": 432 }, { "epoch": 0.5019198724914874, "grad_norm": 0.3250570297241211, "learning_rate": 4.7511494252873566e-05, "loss": 0.157, "step": 433 }, { "epoch": 0.5030790407882344, "grad_norm": 0.36209189891815186, "learning_rate": 4.750574712643678e-05, "loss": 0.1579, "step": 434 }, { "epoch": 0.5042382090849815, "grad_norm": 0.33881399035453796, "learning_rate": 4.75e-05, "loss": 0.1516, "step": 435 }, { "epoch": 0.5053973773817286, "grad_norm": 0.2697497010231018, "learning_rate": 4.7494252873563224e-05, "loss": 0.1347, "step": 436 }, { "epoch": 0.5065565456784757, "grad_norm": 0.3097374439239502, "learning_rate": 4.748850574712644e-05, "loss": 0.1506, "step": 437 }, { "epoch": 0.5077157139752227, "grad_norm": 0.28055351972579956, "learning_rate": 4.748275862068966e-05, "loss": 0.1418, "step": 438 }, { "epoch": 0.5088748822719699, "grad_norm": 0.31767186522483826, "learning_rate": 4.7477011494252876e-05, "loss": 0.1502, "step": 439 }, { "epoch": 0.510034050568717, "grad_norm": 0.3027343451976776, "learning_rate": 4.747126436781609e-05, "loss": 0.1437, "step": 440 }, { "epoch": 0.511193218865464, "grad_norm": 0.2773297131061554, "learning_rate": 4.746551724137931e-05, "loss": 0.139, "step": 441 }, { "epoch": 0.5123523871622111, "grad_norm": 0.2976417541503906, "learning_rate": 4.7459770114942534e-05, "loss": 0.147, "step": 442 }, { "epoch": 0.5135115554589582, "grad_norm": 0.30484509468078613, "learning_rate": 4.745402298850575e-05, "loss": 0.1371, "step": 443 }, { "epoch": 0.5146707237557053, "grad_norm": 0.3369823694229126, "learning_rate": 4.744827586206897e-05, "loss": 0.1674, "step": 444 }, { "epoch": 0.5158298920524523, "grad_norm": 0.27961060404777527, "learning_rate": 4.7442528735632186e-05, "loss": 0.141, "step": 445 }, { "epoch": 0.5169890603491994, "grad_norm": 0.29113152623176575, "learning_rate": 4.74367816091954e-05, "loss": 0.1432, "step": 446 }, { "epoch": 0.5181482286459466, "grad_norm": 0.363936185836792, "learning_rate": 4.743103448275862e-05, "loss": 0.1637, "step": 447 }, { "epoch": 0.5193073969426936, "grad_norm": 0.3023361563682556, "learning_rate": 4.742528735632184e-05, "loss": 0.1368, "step": 448 }, { "epoch": 0.5204665652394407, "grad_norm": 0.33404213190078735, "learning_rate": 4.7419540229885065e-05, "loss": 0.1555, "step": 449 }, { "epoch": 0.5216257335361878, "grad_norm": 0.302898645401001, "learning_rate": 4.741379310344828e-05, "loss": 0.1354, "step": 450 }, { "epoch": 0.5227849018329349, "grad_norm": 0.35367482900619507, "learning_rate": 4.7408045977011495e-05, "loss": 0.1583, "step": 451 }, { "epoch": 0.5239440701296819, "grad_norm": 0.31517934799194336, "learning_rate": 4.740229885057472e-05, "loss": 0.1457, "step": 452 }, { "epoch": 0.525103238426429, "grad_norm": 0.2723785936832428, "learning_rate": 4.739655172413793e-05, "loss": 0.1381, "step": 453 }, { "epoch": 0.5262624067231761, "grad_norm": 0.2963480055332184, "learning_rate": 4.739080459770115e-05, "loss": 0.1408, "step": 454 }, { "epoch": 0.5274215750199232, "grad_norm": 0.31834137439727783, "learning_rate": 4.738505747126437e-05, "loss": 0.1411, "step": 455 }, { "epoch": 0.5285807433166703, "grad_norm": 0.31244561076164246, "learning_rate": 4.737931034482759e-05, "loss": 0.1368, "step": 456 }, { "epoch": 0.5297399116134174, "grad_norm": 0.29169803857803345, "learning_rate": 4.737356321839081e-05, "loss": 0.1367, "step": 457 }, { "epoch": 0.5308990799101645, "grad_norm": 0.2684137523174286, "learning_rate": 4.736781609195403e-05, "loss": 0.1482, "step": 458 }, { "epoch": 0.5320582482069115, "grad_norm": 0.26147085428237915, "learning_rate": 4.736206896551724e-05, "loss": 0.1406, "step": 459 }, { "epoch": 0.5332174165036586, "grad_norm": 0.2554711401462555, "learning_rate": 4.735632183908046e-05, "loss": 0.131, "step": 460 }, { "epoch": 0.5343765848004057, "grad_norm": 0.2564930319786072, "learning_rate": 4.735057471264368e-05, "loss": 0.126, "step": 461 }, { "epoch": 0.5355357530971528, "grad_norm": 0.2714848220348358, "learning_rate": 4.73448275862069e-05, "loss": 0.1308, "step": 462 }, { "epoch": 0.5366949213938998, "grad_norm": 0.2883203327655792, "learning_rate": 4.733908045977012e-05, "loss": 0.1286, "step": 463 }, { "epoch": 0.537854089690647, "grad_norm": 0.29676780104637146, "learning_rate": 4.7333333333333336e-05, "loss": 0.1499, "step": 464 }, { "epoch": 0.5390132579873941, "grad_norm": 0.2832816541194916, "learning_rate": 4.732758620689655e-05, "loss": 0.1452, "step": 465 }, { "epoch": 0.5401724262841411, "grad_norm": 0.28993991017341614, "learning_rate": 4.732183908045977e-05, "loss": 0.1501, "step": 466 }, { "epoch": 0.5413315945808882, "grad_norm": 0.3065154254436493, "learning_rate": 4.731609195402299e-05, "loss": 0.1545, "step": 467 }, { "epoch": 0.5424907628776353, "grad_norm": 0.28335148096084595, "learning_rate": 4.731034482758621e-05, "loss": 0.1305, "step": 468 }, { "epoch": 0.5436499311743824, "grad_norm": 0.29181987047195435, "learning_rate": 4.730459770114943e-05, "loss": 0.1379, "step": 469 }, { "epoch": 0.5448090994711294, "grad_norm": 0.2934599220752716, "learning_rate": 4.7298850574712646e-05, "loss": 0.1435, "step": 470 }, { "epoch": 0.5459682677678765, "grad_norm": 0.2953728437423706, "learning_rate": 4.729310344827587e-05, "loss": 0.1453, "step": 471 }, { "epoch": 0.5471274360646237, "grad_norm": 0.2823188006877899, "learning_rate": 4.728735632183908e-05, "loss": 0.1312, "step": 472 }, { "epoch": 0.5482866043613707, "grad_norm": 0.2421528846025467, "learning_rate": 4.72816091954023e-05, "loss": 0.1207, "step": 473 }, { "epoch": 0.5494457726581178, "grad_norm": 0.3098517954349518, "learning_rate": 4.727586206896552e-05, "loss": 0.1492, "step": 474 }, { "epoch": 0.5506049409548649, "grad_norm": 0.2447943240404129, "learning_rate": 4.7270114942528734e-05, "loss": 0.128, "step": 475 }, { "epoch": 0.551764109251612, "grad_norm": 0.37644991278648376, "learning_rate": 4.7264367816091956e-05, "loss": 0.1414, "step": 476 }, { "epoch": 0.552923277548359, "grad_norm": 0.2898421883583069, "learning_rate": 4.725862068965518e-05, "loss": 0.1382, "step": 477 }, { "epoch": 0.5540824458451061, "grad_norm": 0.3101235032081604, "learning_rate": 4.725287356321839e-05, "loss": 0.1507, "step": 478 }, { "epoch": 0.5552416141418532, "grad_norm": 0.31535589694976807, "learning_rate": 4.7247126436781614e-05, "loss": 0.1479, "step": 479 }, { "epoch": 0.5564007824386002, "grad_norm": 0.3017010986804962, "learning_rate": 4.724137931034483e-05, "loss": 0.1491, "step": 480 }, { "epoch": 0.5575599507353474, "grad_norm": 0.27675431966781616, "learning_rate": 4.7235632183908044e-05, "loss": 0.1343, "step": 481 }, { "epoch": 0.5587191190320945, "grad_norm": 0.28481778502464294, "learning_rate": 4.7229885057471266e-05, "loss": 0.1598, "step": 482 }, { "epoch": 0.5598782873288416, "grad_norm": 0.2603989541530609, "learning_rate": 4.722413793103449e-05, "loss": 0.1365, "step": 483 }, { "epoch": 0.5610374556255886, "grad_norm": 0.2759985625743866, "learning_rate": 4.72183908045977e-05, "loss": 0.1291, "step": 484 }, { "epoch": 0.5621966239223357, "grad_norm": 0.27497798204421997, "learning_rate": 4.7212643678160924e-05, "loss": 0.1302, "step": 485 }, { "epoch": 0.5633557922190828, "grad_norm": 0.4083801209926605, "learning_rate": 4.720689655172414e-05, "loss": 0.1512, "step": 486 }, { "epoch": 0.5645149605158299, "grad_norm": 0.25597989559173584, "learning_rate": 4.720114942528736e-05, "loss": 0.125, "step": 487 }, { "epoch": 0.565674128812577, "grad_norm": 0.34843572974205017, "learning_rate": 4.7195402298850575e-05, "loss": 0.1537, "step": 488 }, { "epoch": 0.5668332971093241, "grad_norm": 0.34892401099205017, "learning_rate": 4.718965517241379e-05, "loss": 0.1342, "step": 489 }, { "epoch": 0.5679924654060712, "grad_norm": 0.28030136227607727, "learning_rate": 4.718390804597702e-05, "loss": 0.1477, "step": 490 }, { "epoch": 0.5691516337028182, "grad_norm": 0.27783283591270447, "learning_rate": 4.7178160919540234e-05, "loss": 0.1431, "step": 491 }, { "epoch": 0.5703108019995653, "grad_norm": 0.23067143559455872, "learning_rate": 4.717241379310345e-05, "loss": 0.1242, "step": 492 }, { "epoch": 0.5714699702963124, "grad_norm": 0.25984445214271545, "learning_rate": 4.716666666666667e-05, "loss": 0.1298, "step": 493 }, { "epoch": 0.5726291385930595, "grad_norm": 0.255754292011261, "learning_rate": 4.7160919540229885e-05, "loss": 0.1398, "step": 494 }, { "epoch": 0.5737883068898065, "grad_norm": 0.28669071197509766, "learning_rate": 4.715517241379311e-05, "loss": 0.1428, "step": 495 }, { "epoch": 0.5749474751865536, "grad_norm": 0.29052338004112244, "learning_rate": 4.714942528735632e-05, "loss": 0.1399, "step": 496 }, { "epoch": 0.5761066434833008, "grad_norm": 0.24293456971645355, "learning_rate": 4.714367816091954e-05, "loss": 0.122, "step": 497 }, { "epoch": 0.5772658117800478, "grad_norm": 0.278358519077301, "learning_rate": 4.7137931034482765e-05, "loss": 0.146, "step": 498 }, { "epoch": 0.5784249800767949, "grad_norm": 0.25874063372612, "learning_rate": 4.713218390804598e-05, "loss": 0.1298, "step": 499 }, { "epoch": 0.579584148373542, "grad_norm": 0.4070255756378174, "learning_rate": 4.7126436781609195e-05, "loss": 0.1511, "step": 500 }, { "epoch": 0.5807433166702891, "grad_norm": 0.4098628759384155, "learning_rate": 4.7120689655172417e-05, "loss": 0.1399, "step": 501 }, { "epoch": 0.5819024849670361, "grad_norm": 0.33255958557128906, "learning_rate": 4.711494252873563e-05, "loss": 0.1489, "step": 502 }, { "epoch": 0.5830616532637832, "grad_norm": 0.2659994959831238, "learning_rate": 4.710919540229885e-05, "loss": 0.1258, "step": 503 }, { "epoch": 0.5842208215605303, "grad_norm": 0.30784937739372253, "learning_rate": 4.7103448275862075e-05, "loss": 0.1468, "step": 504 }, { "epoch": 0.5853799898572773, "grad_norm": 0.30625712871551514, "learning_rate": 4.709770114942529e-05, "loss": 0.1355, "step": 505 }, { "epoch": 0.5865391581540245, "grad_norm": 0.32840919494628906, "learning_rate": 4.709195402298851e-05, "loss": 0.1346, "step": 506 }, { "epoch": 0.5876983264507716, "grad_norm": 0.2901023030281067, "learning_rate": 4.7086206896551726e-05, "loss": 0.1621, "step": 507 }, { "epoch": 0.5888574947475187, "grad_norm": 0.29218408465385437, "learning_rate": 4.708045977011494e-05, "loss": 0.1415, "step": 508 }, { "epoch": 0.5900166630442657, "grad_norm": 0.2910397946834564, "learning_rate": 4.707471264367816e-05, "loss": 0.1536, "step": 509 }, { "epoch": 0.5911758313410128, "grad_norm": 0.23362290859222412, "learning_rate": 4.7068965517241385e-05, "loss": 0.1204, "step": 510 }, { "epoch": 0.5923349996377599, "grad_norm": 0.29354870319366455, "learning_rate": 4.70632183908046e-05, "loss": 0.1392, "step": 511 }, { "epoch": 0.593494167934507, "grad_norm": 0.26072168350219727, "learning_rate": 4.705747126436782e-05, "loss": 0.1376, "step": 512 }, { "epoch": 0.594653336231254, "grad_norm": 0.32599276304244995, "learning_rate": 4.7051724137931036e-05, "loss": 0.1371, "step": 513 }, { "epoch": 0.5958125045280012, "grad_norm": 0.2970302402973175, "learning_rate": 4.704597701149426e-05, "loss": 0.1376, "step": 514 }, { "epoch": 0.5969716728247483, "grad_norm": 0.2292068749666214, "learning_rate": 4.704022988505747e-05, "loss": 0.1264, "step": 515 }, { "epoch": 0.5981308411214953, "grad_norm": 0.2724842131137848, "learning_rate": 4.703448275862069e-05, "loss": 0.136, "step": 516 }, { "epoch": 0.5992900094182424, "grad_norm": 0.2579404413700104, "learning_rate": 4.702873563218391e-05, "loss": 0.1378, "step": 517 }, { "epoch": 0.6004491777149895, "grad_norm": 0.28821900486946106, "learning_rate": 4.702298850574713e-05, "loss": 0.1481, "step": 518 }, { "epoch": 0.6016083460117366, "grad_norm": 0.31739768385887146, "learning_rate": 4.7017241379310346e-05, "loss": 0.1536, "step": 519 }, { "epoch": 0.6027675143084836, "grad_norm": 0.2741948366165161, "learning_rate": 4.701149425287357e-05, "loss": 0.1293, "step": 520 }, { "epoch": 0.6039266826052307, "grad_norm": 0.2746334969997406, "learning_rate": 4.700574712643678e-05, "loss": 0.1309, "step": 521 }, { "epoch": 0.6050858509019779, "grad_norm": 0.2569972276687622, "learning_rate": 4.7e-05, "loss": 0.1192, "step": 522 }, { "epoch": 0.6062450191987249, "grad_norm": 0.3422163128852844, "learning_rate": 4.699425287356322e-05, "loss": 0.1518, "step": 523 }, { "epoch": 0.607404187495472, "grad_norm": 0.30587247014045715, "learning_rate": 4.698850574712644e-05, "loss": 0.1599, "step": 524 }, { "epoch": 0.6085633557922191, "grad_norm": 0.27892470359802246, "learning_rate": 4.698275862068966e-05, "loss": 0.1373, "step": 525 }, { "epoch": 0.6097225240889662, "grad_norm": 0.3484020233154297, "learning_rate": 4.697701149425288e-05, "loss": 0.1368, "step": 526 }, { "epoch": 0.6108816923857132, "grad_norm": 0.2913890779018402, "learning_rate": 4.697126436781609e-05, "loss": 0.1351, "step": 527 }, { "epoch": 0.6120408606824603, "grad_norm": 0.3555639982223511, "learning_rate": 4.6965517241379314e-05, "loss": 0.1608, "step": 528 }, { "epoch": 0.6132000289792074, "grad_norm": 0.2778078317642212, "learning_rate": 4.695977011494253e-05, "loss": 0.1446, "step": 529 }, { "epoch": 0.6143591972759544, "grad_norm": 0.24142858386039734, "learning_rate": 4.695402298850575e-05, "loss": 0.1161, "step": 530 }, { "epoch": 0.6155183655727016, "grad_norm": 0.2555319368839264, "learning_rate": 4.694827586206897e-05, "loss": 0.1229, "step": 531 }, { "epoch": 0.6166775338694487, "grad_norm": 0.2874903678894043, "learning_rate": 4.694252873563219e-05, "loss": 0.1487, "step": 532 }, { "epoch": 0.6178367021661958, "grad_norm": 0.27659285068511963, "learning_rate": 4.693678160919541e-05, "loss": 0.1458, "step": 533 }, { "epoch": 0.6189958704629428, "grad_norm": 0.3088036775588989, "learning_rate": 4.6931034482758623e-05, "loss": 0.1448, "step": 534 }, { "epoch": 0.6201550387596899, "grad_norm": 0.338383287191391, "learning_rate": 4.692528735632184e-05, "loss": 0.124, "step": 535 }, { "epoch": 0.621314207056437, "grad_norm": 0.27692559361457825, "learning_rate": 4.691954022988506e-05, "loss": 0.1417, "step": 536 }, { "epoch": 0.6224733753531841, "grad_norm": 0.2864556610584259, "learning_rate": 4.6913793103448275e-05, "loss": 0.1406, "step": 537 }, { "epoch": 0.6236325436499311, "grad_norm": 0.3573722243309021, "learning_rate": 4.69080459770115e-05, "loss": 0.1445, "step": 538 }, { "epoch": 0.6247917119466783, "grad_norm": 0.27817103266716003, "learning_rate": 4.690229885057472e-05, "loss": 0.1359, "step": 539 }, { "epoch": 0.6259508802434254, "grad_norm": 0.3294277787208557, "learning_rate": 4.689655172413793e-05, "loss": 0.1455, "step": 540 }, { "epoch": 0.6271100485401724, "grad_norm": 0.2555405795574188, "learning_rate": 4.689080459770115e-05, "loss": 0.1352, "step": 541 }, { "epoch": 0.6282692168369195, "grad_norm": 0.27222466468811035, "learning_rate": 4.688505747126437e-05, "loss": 0.1391, "step": 542 }, { "epoch": 0.6294283851336666, "grad_norm": 0.26646101474761963, "learning_rate": 4.6879310344827585e-05, "loss": 0.1405, "step": 543 }, { "epoch": 0.6305875534304137, "grad_norm": 0.295244961977005, "learning_rate": 4.6873563218390806e-05, "loss": 0.1401, "step": 544 }, { "epoch": 0.6317467217271607, "grad_norm": 0.26528140902519226, "learning_rate": 4.686781609195403e-05, "loss": 0.1342, "step": 545 }, { "epoch": 0.6329058900239078, "grad_norm": 0.2771555781364441, "learning_rate": 4.686206896551724e-05, "loss": 0.1525, "step": 546 }, { "epoch": 0.634065058320655, "grad_norm": 0.3281809091567993, "learning_rate": 4.6856321839080465e-05, "loss": 0.1653, "step": 547 }, { "epoch": 0.635224226617402, "grad_norm": 0.3082864284515381, "learning_rate": 4.685057471264368e-05, "loss": 0.125, "step": 548 }, { "epoch": 0.6363833949141491, "grad_norm": 0.30822286009788513, "learning_rate": 4.6844827586206894e-05, "loss": 0.1455, "step": 549 }, { "epoch": 0.6375425632108962, "grad_norm": 0.3183049261569977, "learning_rate": 4.6839080459770116e-05, "loss": 0.1363, "step": 550 }, { "epoch": 0.6387017315076433, "grad_norm": 0.2250736802816391, "learning_rate": 4.683333333333334e-05, "loss": 0.1147, "step": 551 }, { "epoch": 0.6398608998043903, "grad_norm": 0.2534612715244293, "learning_rate": 4.682758620689656e-05, "loss": 0.1447, "step": 552 }, { "epoch": 0.6410200681011374, "grad_norm": 0.2627923786640167, "learning_rate": 4.6821839080459774e-05, "loss": 0.1404, "step": 553 }, { "epoch": 0.6421792363978845, "grad_norm": 0.25790974497795105, "learning_rate": 4.681609195402299e-05, "loss": 0.1267, "step": 554 }, { "epoch": 0.6433384046946315, "grad_norm": 0.2535199224948883, "learning_rate": 4.681034482758621e-05, "loss": 0.1377, "step": 555 }, { "epoch": 0.6444975729913787, "grad_norm": 0.27798426151275635, "learning_rate": 4.6804597701149426e-05, "loss": 0.1422, "step": 556 }, { "epoch": 0.6456567412881258, "grad_norm": 0.26984500885009766, "learning_rate": 4.679885057471264e-05, "loss": 0.1383, "step": 557 }, { "epoch": 0.6468159095848729, "grad_norm": 0.28631749749183655, "learning_rate": 4.679310344827586e-05, "loss": 0.1402, "step": 558 }, { "epoch": 0.6479750778816199, "grad_norm": 0.29483139514923096, "learning_rate": 4.6787356321839084e-05, "loss": 0.1384, "step": 559 }, { "epoch": 0.649134246178367, "grad_norm": 0.23459696769714355, "learning_rate": 4.67816091954023e-05, "loss": 0.1242, "step": 560 }, { "epoch": 0.6502934144751141, "grad_norm": 0.28944167494773865, "learning_rate": 4.677586206896552e-05, "loss": 0.1505, "step": 561 }, { "epoch": 0.6514525827718612, "grad_norm": 0.3058931529521942, "learning_rate": 4.6770114942528736e-05, "loss": 0.1369, "step": 562 }, { "epoch": 0.6526117510686082, "grad_norm": 0.28087151050567627, "learning_rate": 4.676436781609196e-05, "loss": 0.1372, "step": 563 }, { "epoch": 0.6537709193653554, "grad_norm": 0.3356817364692688, "learning_rate": 4.675862068965517e-05, "loss": 0.1603, "step": 564 }, { "epoch": 0.6549300876621025, "grad_norm": 0.24305835366249084, "learning_rate": 4.6752873563218394e-05, "loss": 0.1336, "step": 565 }, { "epoch": 0.6560892559588495, "grad_norm": 0.26663926243782043, "learning_rate": 4.6747126436781616e-05, "loss": 0.1326, "step": 566 }, { "epoch": 0.6572484242555966, "grad_norm": 0.3176979124546051, "learning_rate": 4.674137931034483e-05, "loss": 0.1586, "step": 567 }, { "epoch": 0.6584075925523437, "grad_norm": 0.2597542405128479, "learning_rate": 4.6735632183908045e-05, "loss": 0.1325, "step": 568 }, { "epoch": 0.6595667608490908, "grad_norm": 0.2565508782863617, "learning_rate": 4.672988505747127e-05, "loss": 0.1403, "step": 569 }, { "epoch": 0.6607259291458378, "grad_norm": 0.22864797711372375, "learning_rate": 4.672413793103448e-05, "loss": 0.13, "step": 570 }, { "epoch": 0.661885097442585, "grad_norm": 0.27961140871047974, "learning_rate": 4.6718390804597704e-05, "loss": 0.145, "step": 571 }, { "epoch": 0.6630442657393321, "grad_norm": 0.23423443734645844, "learning_rate": 4.6712643678160925e-05, "loss": 0.1325, "step": 572 }, { "epoch": 0.6642034340360791, "grad_norm": 0.26428887248039246, "learning_rate": 4.670689655172414e-05, "loss": 0.1299, "step": 573 }, { "epoch": 0.6653626023328262, "grad_norm": 0.2903366684913635, "learning_rate": 4.670114942528736e-05, "loss": 0.1445, "step": 574 }, { "epoch": 0.6665217706295733, "grad_norm": 0.3106566369533539, "learning_rate": 4.669540229885058e-05, "loss": 0.1396, "step": 575 }, { "epoch": 0.6676809389263204, "grad_norm": 0.2907349169254303, "learning_rate": 4.668965517241379e-05, "loss": 0.1224, "step": 576 }, { "epoch": 0.6688401072230674, "grad_norm": 0.2840725779533386, "learning_rate": 4.668390804597701e-05, "loss": 0.1356, "step": 577 }, { "epoch": 0.6699992755198145, "grad_norm": 0.3185088634490967, "learning_rate": 4.667816091954023e-05, "loss": 0.1499, "step": 578 }, { "epoch": 0.6711584438165616, "grad_norm": 0.2769455909729004, "learning_rate": 4.667241379310345e-05, "loss": 0.1354, "step": 579 }, { "epoch": 0.6723176121133086, "grad_norm": 0.3882596492767334, "learning_rate": 4.666666666666667e-05, "loss": 0.1551, "step": 580 }, { "epoch": 0.6734767804100558, "grad_norm": 0.29669293761253357, "learning_rate": 4.6660919540229887e-05, "loss": 0.1437, "step": 581 }, { "epoch": 0.6746359487068029, "grad_norm": 0.24709312617778778, "learning_rate": 4.665517241379311e-05, "loss": 0.1349, "step": 582 }, { "epoch": 0.67579511700355, "grad_norm": 0.29217076301574707, "learning_rate": 4.664942528735632e-05, "loss": 0.1327, "step": 583 }, { "epoch": 0.676954285300297, "grad_norm": 0.2767103612422943, "learning_rate": 4.664367816091954e-05, "loss": 0.1341, "step": 584 }, { "epoch": 0.6781134535970441, "grad_norm": 0.3487448990345001, "learning_rate": 4.663793103448276e-05, "loss": 0.1567, "step": 585 }, { "epoch": 0.6792726218937912, "grad_norm": 0.30940744280815125, "learning_rate": 4.663218390804598e-05, "loss": 0.1486, "step": 586 }, { "epoch": 0.6804317901905383, "grad_norm": 0.2427985668182373, "learning_rate": 4.6626436781609196e-05, "loss": 0.1298, "step": 587 }, { "epoch": 0.6815909584872853, "grad_norm": 0.2587006092071533, "learning_rate": 4.662068965517242e-05, "loss": 0.1557, "step": 588 }, { "epoch": 0.6827501267840325, "grad_norm": 0.2949361205101013, "learning_rate": 4.661494252873563e-05, "loss": 0.1513, "step": 589 }, { "epoch": 0.6839092950807796, "grad_norm": 0.2784793972969055, "learning_rate": 4.6609195402298855e-05, "loss": 0.1407, "step": 590 }, { "epoch": 0.6850684633775266, "grad_norm": 0.28113847970962524, "learning_rate": 4.660344827586207e-05, "loss": 0.1346, "step": 591 }, { "epoch": 0.6862276316742737, "grad_norm": 0.30229294300079346, "learning_rate": 4.659770114942529e-05, "loss": 0.1533, "step": 592 }, { "epoch": 0.6873867999710208, "grad_norm": 0.30034735798835754, "learning_rate": 4.659195402298851e-05, "loss": 0.1382, "step": 593 }, { "epoch": 0.6885459682677679, "grad_norm": 0.23939953744411469, "learning_rate": 4.658620689655173e-05, "loss": 0.1248, "step": 594 }, { "epoch": 0.6897051365645149, "grad_norm": 0.2364337295293808, "learning_rate": 4.658045977011494e-05, "loss": 0.1335, "step": 595 }, { "epoch": 0.690864304861262, "grad_norm": 0.2641420364379883, "learning_rate": 4.6574712643678164e-05, "loss": 0.1492, "step": 596 }, { "epoch": 0.6920234731580092, "grad_norm": 0.29211926460266113, "learning_rate": 4.656896551724138e-05, "loss": 0.1397, "step": 597 }, { "epoch": 0.6931826414547562, "grad_norm": 0.2939442992210388, "learning_rate": 4.65632183908046e-05, "loss": 0.1345, "step": 598 }, { "epoch": 0.6943418097515033, "grad_norm": 0.2861159145832062, "learning_rate": 4.6557471264367816e-05, "loss": 0.1359, "step": 599 }, { "epoch": 0.6955009780482504, "grad_norm": 0.3505733907222748, "learning_rate": 4.655172413793104e-05, "loss": 0.1464, "step": 600 }, { "epoch": 0.6966601463449975, "grad_norm": 0.26291075348854065, "learning_rate": 4.654597701149426e-05, "loss": 0.1354, "step": 601 }, { "epoch": 0.6978193146417445, "grad_norm": 0.27308157086372375, "learning_rate": 4.6540229885057474e-05, "loss": 0.1374, "step": 602 }, { "epoch": 0.6989784829384916, "grad_norm": 0.3093547224998474, "learning_rate": 4.653448275862069e-05, "loss": 0.1366, "step": 603 }, { "epoch": 0.7001376512352387, "grad_norm": 0.29862943291664124, "learning_rate": 4.652873563218391e-05, "loss": 0.1417, "step": 604 }, { "epoch": 0.7012968195319857, "grad_norm": 0.35645461082458496, "learning_rate": 4.6522988505747125e-05, "loss": 0.1348, "step": 605 }, { "epoch": 0.7024559878287329, "grad_norm": 0.3341381847858429, "learning_rate": 4.651724137931035e-05, "loss": 0.1426, "step": 606 }, { "epoch": 0.70361515612548, "grad_norm": 0.22590871155261993, "learning_rate": 4.651149425287357e-05, "loss": 0.1192, "step": 607 }, { "epoch": 0.7047743244222271, "grad_norm": 0.22133195400238037, "learning_rate": 4.6505747126436784e-05, "loss": 0.1176, "step": 608 }, { "epoch": 0.7059334927189741, "grad_norm": 0.2593124806880951, "learning_rate": 4.6500000000000005e-05, "loss": 0.1356, "step": 609 }, { "epoch": 0.7070926610157212, "grad_norm": 0.28317561745643616, "learning_rate": 4.649425287356322e-05, "loss": 0.144, "step": 610 }, { "epoch": 0.7082518293124683, "grad_norm": 0.23904190957546234, "learning_rate": 4.6488505747126435e-05, "loss": 0.1371, "step": 611 }, { "epoch": 0.7094109976092153, "grad_norm": 0.23972614109516144, "learning_rate": 4.648275862068966e-05, "loss": 0.1268, "step": 612 }, { "epoch": 0.7105701659059624, "grad_norm": 0.27289271354675293, "learning_rate": 4.647701149425288e-05, "loss": 0.1257, "step": 613 }, { "epoch": 0.7117293342027096, "grad_norm": 0.2559848427772522, "learning_rate": 4.6471264367816093e-05, "loss": 0.139, "step": 614 }, { "epoch": 0.7128885024994567, "grad_norm": 0.28095269203186035, "learning_rate": 4.6465517241379315e-05, "loss": 0.1508, "step": 615 }, { "epoch": 0.7140476707962037, "grad_norm": 0.25798696279525757, "learning_rate": 4.645977011494253e-05, "loss": 0.1302, "step": 616 }, { "epoch": 0.7152068390929508, "grad_norm": 0.28479090332984924, "learning_rate": 4.645402298850575e-05, "loss": 0.1462, "step": 617 }, { "epoch": 0.7163660073896979, "grad_norm": 0.3032209575176239, "learning_rate": 4.644827586206897e-05, "loss": 0.1581, "step": 618 }, { "epoch": 0.717525175686445, "grad_norm": 0.26038414239883423, "learning_rate": 4.644252873563218e-05, "loss": 0.133, "step": 619 }, { "epoch": 0.718684343983192, "grad_norm": 0.2712029218673706, "learning_rate": 4.643678160919541e-05, "loss": 0.1456, "step": 620 }, { "epoch": 0.7198435122799391, "grad_norm": 0.2711297571659088, "learning_rate": 4.6431034482758625e-05, "loss": 0.1485, "step": 621 }, { "epoch": 0.7210026805766863, "grad_norm": 0.26714852452278137, "learning_rate": 4.642528735632184e-05, "loss": 0.1409, "step": 622 }, { "epoch": 0.7221618488734333, "grad_norm": 0.2638694941997528, "learning_rate": 4.641954022988506e-05, "loss": 0.1415, "step": 623 }, { "epoch": 0.7233210171701804, "grad_norm": 0.24792924523353577, "learning_rate": 4.6413793103448276e-05, "loss": 0.144, "step": 624 }, { "epoch": 0.7244801854669275, "grad_norm": 0.33576005697250366, "learning_rate": 4.640804597701149e-05, "loss": 0.1677, "step": 625 }, { "epoch": 0.7256393537636746, "grad_norm": 0.26434826850891113, "learning_rate": 4.640229885057471e-05, "loss": 0.1413, "step": 626 }, { "epoch": 0.7267985220604216, "grad_norm": 0.22275975346565247, "learning_rate": 4.6396551724137935e-05, "loss": 0.1265, "step": 627 }, { "epoch": 0.7279576903571687, "grad_norm": 0.25366446375846863, "learning_rate": 4.6390804597701156e-05, "loss": 0.1353, "step": 628 }, { "epoch": 0.7291168586539158, "grad_norm": 0.24983662366867065, "learning_rate": 4.638505747126437e-05, "loss": 0.1338, "step": 629 }, { "epoch": 0.7302760269506628, "grad_norm": 0.24769724905490875, "learning_rate": 4.6379310344827586e-05, "loss": 0.1242, "step": 630 }, { "epoch": 0.73143519524741, "grad_norm": 0.2604047358036041, "learning_rate": 4.637356321839081e-05, "loss": 0.126, "step": 631 }, { "epoch": 0.7325943635441571, "grad_norm": 0.2878481447696686, "learning_rate": 4.636781609195402e-05, "loss": 0.1573, "step": 632 }, { "epoch": 0.7337535318409042, "grad_norm": 0.25279513001441956, "learning_rate": 4.6362068965517244e-05, "loss": 0.1361, "step": 633 }, { "epoch": 0.7349127001376512, "grad_norm": 0.28761938214302063, "learning_rate": 4.6356321839080466e-05, "loss": 0.1437, "step": 634 }, { "epoch": 0.7360718684343983, "grad_norm": 0.30495256185531616, "learning_rate": 4.635057471264368e-05, "loss": 0.1549, "step": 635 }, { "epoch": 0.7372310367311454, "grad_norm": 0.2510074973106384, "learning_rate": 4.63448275862069e-05, "loss": 0.1286, "step": 636 }, { "epoch": 0.7383902050278924, "grad_norm": 0.2822047472000122, "learning_rate": 4.633908045977012e-05, "loss": 0.1279, "step": 637 }, { "epoch": 0.7395493733246395, "grad_norm": 0.23230963945388794, "learning_rate": 4.633333333333333e-05, "loss": 0.1406, "step": 638 }, { "epoch": 0.7407085416213867, "grad_norm": 0.2472311556339264, "learning_rate": 4.6327586206896554e-05, "loss": 0.1244, "step": 639 }, { "epoch": 0.7418677099181338, "grad_norm": 0.2912672460079193, "learning_rate": 4.632183908045977e-05, "loss": 0.136, "step": 640 }, { "epoch": 0.7430268782148808, "grad_norm": 0.35469523072242737, "learning_rate": 4.631609195402299e-05, "loss": 0.1484, "step": 641 }, { "epoch": 0.7441860465116279, "grad_norm": 0.26528200507164, "learning_rate": 4.631034482758621e-05, "loss": 0.1252, "step": 642 }, { "epoch": 0.745345214808375, "grad_norm": 0.2616005539894104, "learning_rate": 4.630459770114943e-05, "loss": 0.1222, "step": 643 }, { "epoch": 0.7465043831051221, "grad_norm": 0.26047012209892273, "learning_rate": 4.629885057471264e-05, "loss": 0.1401, "step": 644 }, { "epoch": 0.7476635514018691, "grad_norm": 0.27605798840522766, "learning_rate": 4.6293103448275864e-05, "loss": 0.13, "step": 645 }, { "epoch": 0.7488227196986162, "grad_norm": 0.30614152550697327, "learning_rate": 4.628735632183908e-05, "loss": 0.1486, "step": 646 }, { "epoch": 0.7499818879953634, "grad_norm": 0.34578898549079895, "learning_rate": 4.62816091954023e-05, "loss": 0.1515, "step": 647 }, { "epoch": 0.7511410562921104, "grad_norm": 0.24491111934185028, "learning_rate": 4.627586206896552e-05, "loss": 0.1234, "step": 648 }, { "epoch": 0.7523002245888575, "grad_norm": 0.2561955153942108, "learning_rate": 4.627011494252874e-05, "loss": 0.1255, "step": 649 }, { "epoch": 0.7534593928856046, "grad_norm": 0.24703799188137054, "learning_rate": 4.626436781609196e-05, "loss": 0.139, "step": 650 }, { "epoch": 0.7546185611823517, "grad_norm": 0.26947158575057983, "learning_rate": 4.6258620689655174e-05, "loss": 0.1435, "step": 651 }, { "epoch": 0.7557777294790987, "grad_norm": 0.2430969476699829, "learning_rate": 4.625287356321839e-05, "loss": 0.1316, "step": 652 }, { "epoch": 0.7569368977758458, "grad_norm": 0.40103209018707275, "learning_rate": 4.624712643678161e-05, "loss": 0.1605, "step": 653 }, { "epoch": 0.758096066072593, "grad_norm": 0.25342556834220886, "learning_rate": 4.624137931034483e-05, "loss": 0.1357, "step": 654 }, { "epoch": 0.75925523436934, "grad_norm": 0.27793052792549133, "learning_rate": 4.6235632183908054e-05, "loss": 0.1392, "step": 655 }, { "epoch": 0.7604144026660871, "grad_norm": 0.27625927329063416, "learning_rate": 4.622988505747127e-05, "loss": 0.1312, "step": 656 }, { "epoch": 0.7615735709628342, "grad_norm": 0.2726586163043976, "learning_rate": 4.622413793103448e-05, "loss": 0.1439, "step": 657 }, { "epoch": 0.7627327392595813, "grad_norm": 0.25028151273727417, "learning_rate": 4.6218390804597705e-05, "loss": 0.14, "step": 658 }, { "epoch": 0.7638919075563283, "grad_norm": 0.2604377865791321, "learning_rate": 4.621264367816092e-05, "loss": 0.1318, "step": 659 }, { "epoch": 0.7650510758530754, "grad_norm": 0.35959815979003906, "learning_rate": 4.6206896551724135e-05, "loss": 0.149, "step": 660 }, { "epoch": 0.7662102441498225, "grad_norm": 0.21311073005199432, "learning_rate": 4.620114942528736e-05, "loss": 0.1123, "step": 661 }, { "epoch": 0.7673694124465695, "grad_norm": 0.24613121151924133, "learning_rate": 4.619540229885058e-05, "loss": 0.1293, "step": 662 }, { "epoch": 0.7685285807433166, "grad_norm": 0.27359437942504883, "learning_rate": 4.618965517241379e-05, "loss": 0.1359, "step": 663 }, { "epoch": 0.7696877490400638, "grad_norm": 0.24835547804832458, "learning_rate": 4.6183908045977015e-05, "loss": 0.1335, "step": 664 }, { "epoch": 0.7708469173368109, "grad_norm": 0.3183259665966034, "learning_rate": 4.617816091954023e-05, "loss": 0.1455, "step": 665 }, { "epoch": 0.7720060856335579, "grad_norm": 0.23825454711914062, "learning_rate": 4.617241379310345e-05, "loss": 0.1327, "step": 666 }, { "epoch": 0.773165253930305, "grad_norm": 0.24269577860832214, "learning_rate": 4.6166666666666666e-05, "loss": 0.1321, "step": 667 }, { "epoch": 0.7743244222270521, "grad_norm": 0.26813822984695435, "learning_rate": 4.616091954022989e-05, "loss": 0.1385, "step": 668 }, { "epoch": 0.7754835905237992, "grad_norm": 0.2735162377357483, "learning_rate": 4.615517241379311e-05, "loss": 0.1382, "step": 669 }, { "epoch": 0.7766427588205462, "grad_norm": 0.26851484179496765, "learning_rate": 4.6149425287356324e-05, "loss": 0.1323, "step": 670 }, { "epoch": 0.7778019271172933, "grad_norm": 0.3302837312221527, "learning_rate": 4.614367816091954e-05, "loss": 0.1644, "step": 671 }, { "epoch": 0.7789610954140405, "grad_norm": 0.28512895107269287, "learning_rate": 4.613793103448276e-05, "loss": 0.1541, "step": 672 }, { "epoch": 0.7801202637107875, "grad_norm": 0.26242977380752563, "learning_rate": 4.6132183908045976e-05, "loss": 0.1327, "step": 673 }, { "epoch": 0.7812794320075346, "grad_norm": 0.286178320646286, "learning_rate": 4.61264367816092e-05, "loss": 0.1301, "step": 674 }, { "epoch": 0.7824386003042817, "grad_norm": 0.22549080848693848, "learning_rate": 4.612068965517242e-05, "loss": 0.1237, "step": 675 }, { "epoch": 0.7835977686010288, "grad_norm": 0.24352504312992096, "learning_rate": 4.6114942528735634e-05, "loss": 0.1298, "step": 676 }, { "epoch": 0.7847569368977758, "grad_norm": 0.23466962575912476, "learning_rate": 4.6109195402298856e-05, "loss": 0.133, "step": 677 }, { "epoch": 0.7859161051945229, "grad_norm": 0.2620813250541687, "learning_rate": 4.610344827586207e-05, "loss": 0.1311, "step": 678 }, { "epoch": 0.78707527349127, "grad_norm": 0.2720955014228821, "learning_rate": 4.6097701149425286e-05, "loss": 0.1207, "step": 679 }, { "epoch": 0.788234441788017, "grad_norm": 0.2539190649986267, "learning_rate": 4.609195402298851e-05, "loss": 0.1287, "step": 680 }, { "epoch": 0.7893936100847642, "grad_norm": 0.2616521716117859, "learning_rate": 4.608620689655173e-05, "loss": 0.144, "step": 681 }, { "epoch": 0.7905527783815113, "grad_norm": 0.2984738349914551, "learning_rate": 4.6080459770114944e-05, "loss": 0.1408, "step": 682 }, { "epoch": 0.7917119466782584, "grad_norm": 0.25363418459892273, "learning_rate": 4.6074712643678166e-05, "loss": 0.121, "step": 683 }, { "epoch": 0.7928711149750054, "grad_norm": 0.22894874215126038, "learning_rate": 4.606896551724138e-05, "loss": 0.1137, "step": 684 }, { "epoch": 0.7940302832717525, "grad_norm": 0.25778335332870483, "learning_rate": 4.60632183908046e-05, "loss": 0.1338, "step": 685 }, { "epoch": 0.7951894515684996, "grad_norm": 0.2576935887336731, "learning_rate": 4.605747126436782e-05, "loss": 0.1416, "step": 686 }, { "epoch": 0.7963486198652466, "grad_norm": 0.3121855556964874, "learning_rate": 4.605172413793103e-05, "loss": 0.1284, "step": 687 }, { "epoch": 0.7975077881619937, "grad_norm": 0.24522997438907623, "learning_rate": 4.6045977011494254e-05, "loss": 0.1367, "step": 688 }, { "epoch": 0.7986669564587409, "grad_norm": 0.3264451324939728, "learning_rate": 4.6040229885057475e-05, "loss": 0.1574, "step": 689 }, { "epoch": 0.799826124755488, "grad_norm": 0.29765743017196655, "learning_rate": 4.603448275862069e-05, "loss": 0.1336, "step": 690 }, { "epoch": 0.800985293052235, "grad_norm": 0.302561491727829, "learning_rate": 4.602873563218391e-05, "loss": 0.1449, "step": 691 }, { "epoch": 0.8021444613489821, "grad_norm": 0.2951429486274719, "learning_rate": 4.602298850574713e-05, "loss": 0.1373, "step": 692 }, { "epoch": 0.8033036296457292, "grad_norm": 0.26344770193099976, "learning_rate": 4.601724137931035e-05, "loss": 0.1355, "step": 693 }, { "epoch": 0.8044627979424763, "grad_norm": 0.23573601245880127, "learning_rate": 4.6011494252873563e-05, "loss": 0.1264, "step": 694 }, { "epoch": 0.8056219662392233, "grad_norm": 0.2914588153362274, "learning_rate": 4.6005747126436785e-05, "loss": 0.1496, "step": 695 }, { "epoch": 0.8067811345359704, "grad_norm": 0.23518706858158112, "learning_rate": 4.600000000000001e-05, "loss": 0.125, "step": 696 }, { "epoch": 0.8079403028327176, "grad_norm": 0.2899051606655121, "learning_rate": 4.599425287356322e-05, "loss": 0.1379, "step": 697 }, { "epoch": 0.8090994711294646, "grad_norm": 0.25654926896095276, "learning_rate": 4.598850574712644e-05, "loss": 0.1469, "step": 698 }, { "epoch": 0.8102586394262117, "grad_norm": 0.36468809843063354, "learning_rate": 4.598275862068966e-05, "loss": 0.1424, "step": 699 }, { "epoch": 0.8114178077229588, "grad_norm": 0.2551107108592987, "learning_rate": 4.597701149425287e-05, "loss": 0.1441, "step": 700 }, { "epoch": 0.8125769760197059, "grad_norm": 0.21090874075889587, "learning_rate": 4.597126436781609e-05, "loss": 0.1253, "step": 701 }, { "epoch": 0.8137361443164529, "grad_norm": 0.2588905096054077, "learning_rate": 4.5965517241379317e-05, "loss": 0.1327, "step": 702 }, { "epoch": 0.8148953126132, "grad_norm": 0.24421721696853638, "learning_rate": 4.595977011494253e-05, "loss": 0.1294, "step": 703 }, { "epoch": 0.8160544809099471, "grad_norm": 0.25245368480682373, "learning_rate": 4.595402298850575e-05, "loss": 0.1247, "step": 704 }, { "epoch": 0.8172136492066941, "grad_norm": 0.3936619758605957, "learning_rate": 4.594827586206897e-05, "loss": 0.1321, "step": 705 }, { "epoch": 0.8183728175034413, "grad_norm": 0.25811803340911865, "learning_rate": 4.594252873563218e-05, "loss": 0.1337, "step": 706 }, { "epoch": 0.8195319858001884, "grad_norm": 0.23244990408420563, "learning_rate": 4.5936781609195405e-05, "loss": 0.1219, "step": 707 }, { "epoch": 0.8206911540969355, "grad_norm": 0.2513156235218048, "learning_rate": 4.593103448275862e-05, "loss": 0.1422, "step": 708 }, { "epoch": 0.8218503223936825, "grad_norm": 0.26262450218200684, "learning_rate": 4.592528735632184e-05, "loss": 0.1419, "step": 709 }, { "epoch": 0.8230094906904296, "grad_norm": 0.268996924161911, "learning_rate": 4.591954022988506e-05, "loss": 0.1427, "step": 710 }, { "epoch": 0.8241686589871767, "grad_norm": 0.2628903090953827, "learning_rate": 4.591379310344828e-05, "loss": 0.1406, "step": 711 }, { "epoch": 0.8253278272839237, "grad_norm": 0.24836388230323792, "learning_rate": 4.59080459770115e-05, "loss": 0.1344, "step": 712 }, { "epoch": 0.8264869955806708, "grad_norm": 0.29568374156951904, "learning_rate": 4.5902298850574714e-05, "loss": 0.1353, "step": 713 }, { "epoch": 0.827646163877418, "grad_norm": 0.30942583084106445, "learning_rate": 4.589655172413793e-05, "loss": 0.1381, "step": 714 }, { "epoch": 0.8288053321741651, "grad_norm": 0.3070472478866577, "learning_rate": 4.589080459770115e-05, "loss": 0.142, "step": 715 }, { "epoch": 0.8299645004709121, "grad_norm": 0.27771466970443726, "learning_rate": 4.588505747126437e-05, "loss": 0.1397, "step": 716 }, { "epoch": 0.8311236687676592, "grad_norm": 0.2712878882884979, "learning_rate": 4.587931034482759e-05, "loss": 0.1252, "step": 717 }, { "epoch": 0.8322828370644063, "grad_norm": 0.24696029722690582, "learning_rate": 4.587356321839081e-05, "loss": 0.1327, "step": 718 }, { "epoch": 0.8334420053611534, "grad_norm": 0.26035380363464355, "learning_rate": 4.5867816091954024e-05, "loss": 0.1276, "step": 719 }, { "epoch": 0.8346011736579004, "grad_norm": 0.23014302551746368, "learning_rate": 4.586206896551724e-05, "loss": 0.1294, "step": 720 }, { "epoch": 0.8357603419546475, "grad_norm": 0.22767721116542816, "learning_rate": 4.585632183908046e-05, "loss": 0.1204, "step": 721 }, { "epoch": 0.8369195102513947, "grad_norm": 0.24614818394184113, "learning_rate": 4.585057471264368e-05, "loss": 0.1412, "step": 722 }, { "epoch": 0.8380786785481417, "grad_norm": 0.2815050184726715, "learning_rate": 4.5844827586206904e-05, "loss": 0.142, "step": 723 }, { "epoch": 0.8392378468448888, "grad_norm": 0.24825121462345123, "learning_rate": 4.583908045977012e-05, "loss": 0.1477, "step": 724 }, { "epoch": 0.8403970151416359, "grad_norm": 0.2300599366426468, "learning_rate": 4.5833333333333334e-05, "loss": 0.1413, "step": 725 }, { "epoch": 0.841556183438383, "grad_norm": 0.27150389552116394, "learning_rate": 4.5827586206896556e-05, "loss": 0.1553, "step": 726 }, { "epoch": 0.84271535173513, "grad_norm": 0.33325132727622986, "learning_rate": 4.582183908045977e-05, "loss": 0.1699, "step": 727 }, { "epoch": 0.8438745200318771, "grad_norm": 0.21544939279556274, "learning_rate": 4.5816091954022985e-05, "loss": 0.1279, "step": 728 }, { "epoch": 0.8450336883286242, "grad_norm": 0.23038536310195923, "learning_rate": 4.581034482758621e-05, "loss": 0.1227, "step": 729 }, { "epoch": 0.8461928566253712, "grad_norm": 0.25293296575546265, "learning_rate": 4.580459770114943e-05, "loss": 0.1497, "step": 730 }, { "epoch": 0.8473520249221184, "grad_norm": 0.2448996752500534, "learning_rate": 4.579885057471265e-05, "loss": 0.1475, "step": 731 }, { "epoch": 0.8485111932188655, "grad_norm": 0.2752504050731659, "learning_rate": 4.5793103448275865e-05, "loss": 0.1453, "step": 732 }, { "epoch": 0.8496703615156126, "grad_norm": 0.2612292170524597, "learning_rate": 4.578735632183908e-05, "loss": 0.1306, "step": 733 }, { "epoch": 0.8508295298123596, "grad_norm": 0.2557094395160675, "learning_rate": 4.57816091954023e-05, "loss": 0.1288, "step": 734 }, { "epoch": 0.8519886981091067, "grad_norm": 0.24499356746673584, "learning_rate": 4.577586206896552e-05, "loss": 0.1391, "step": 735 }, { "epoch": 0.8531478664058538, "grad_norm": 0.23510660231113434, "learning_rate": 4.577011494252874e-05, "loss": 0.1277, "step": 736 }, { "epoch": 0.8543070347026008, "grad_norm": 0.25060582160949707, "learning_rate": 4.576436781609196e-05, "loss": 0.1301, "step": 737 }, { "epoch": 0.855466202999348, "grad_norm": 0.2850019931793213, "learning_rate": 4.5758620689655175e-05, "loss": 0.1406, "step": 738 }, { "epoch": 0.8566253712960951, "grad_norm": 0.30304649472236633, "learning_rate": 4.575287356321839e-05, "loss": 0.1455, "step": 739 }, { "epoch": 0.8577845395928422, "grad_norm": 0.27735939621925354, "learning_rate": 4.574712643678161e-05, "loss": 0.133, "step": 740 }, { "epoch": 0.8589437078895892, "grad_norm": 0.258037805557251, "learning_rate": 4.5741379310344826e-05, "loss": 0.1359, "step": 741 }, { "epoch": 0.8601028761863363, "grad_norm": 0.2623947858810425, "learning_rate": 4.573563218390805e-05, "loss": 0.141, "step": 742 }, { "epoch": 0.8612620444830834, "grad_norm": 0.2677944004535675, "learning_rate": 4.572988505747127e-05, "loss": 0.1398, "step": 743 }, { "epoch": 0.8624212127798305, "grad_norm": 0.28370678424835205, "learning_rate": 4.5724137931034485e-05, "loss": 0.1551, "step": 744 }, { "epoch": 0.8635803810765775, "grad_norm": 0.259971022605896, "learning_rate": 4.5718390804597706e-05, "loss": 0.1488, "step": 745 }, { "epoch": 0.8647395493733246, "grad_norm": 0.22784096002578735, "learning_rate": 4.571264367816092e-05, "loss": 0.123, "step": 746 }, { "epoch": 0.8658987176700718, "grad_norm": 0.25658029317855835, "learning_rate": 4.5706896551724136e-05, "loss": 0.1364, "step": 747 }, { "epoch": 0.8670578859668188, "grad_norm": 0.24773739278316498, "learning_rate": 4.570114942528736e-05, "loss": 0.128, "step": 748 }, { "epoch": 0.8682170542635659, "grad_norm": 0.2921466827392578, "learning_rate": 4.569540229885057e-05, "loss": 0.1432, "step": 749 }, { "epoch": 0.869376222560313, "grad_norm": 0.26510924100875854, "learning_rate": 4.5689655172413794e-05, "loss": 0.1334, "step": 750 }, { "epoch": 0.8705353908570601, "grad_norm": 0.2811342179775238, "learning_rate": 4.5683908045977016e-05, "loss": 0.1446, "step": 751 }, { "epoch": 0.8716945591538071, "grad_norm": 0.24528606235980988, "learning_rate": 4.567816091954023e-05, "loss": 0.1254, "step": 752 }, { "epoch": 0.8728537274505542, "grad_norm": 0.31388193368911743, "learning_rate": 4.567241379310345e-05, "loss": 0.148, "step": 753 }, { "epoch": 0.8740128957473013, "grad_norm": 0.26188236474990845, "learning_rate": 4.566666666666667e-05, "loss": 0.1347, "step": 754 }, { "epoch": 0.8751720640440483, "grad_norm": 0.2681477665901184, "learning_rate": 4.566091954022988e-05, "loss": 0.1251, "step": 755 }, { "epoch": 0.8763312323407955, "grad_norm": 0.24694494903087616, "learning_rate": 4.5655172413793104e-05, "loss": 0.1341, "step": 756 }, { "epoch": 0.8774904006375426, "grad_norm": 0.27526146173477173, "learning_rate": 4.5649425287356326e-05, "loss": 0.1425, "step": 757 }, { "epoch": 0.8786495689342897, "grad_norm": 0.28800222277641296, "learning_rate": 4.564367816091955e-05, "loss": 0.1396, "step": 758 }, { "epoch": 0.8798087372310367, "grad_norm": 0.29053691029548645, "learning_rate": 4.563793103448276e-05, "loss": 0.1427, "step": 759 }, { "epoch": 0.8809679055277838, "grad_norm": 0.29743799567222595, "learning_rate": 4.563218390804598e-05, "loss": 0.1513, "step": 760 }, { "epoch": 0.8821270738245309, "grad_norm": 0.27560529112815857, "learning_rate": 4.56264367816092e-05, "loss": 0.1356, "step": 761 }, { "epoch": 0.8832862421212779, "grad_norm": 0.23586055636405945, "learning_rate": 4.5620689655172414e-05, "loss": 0.132, "step": 762 }, { "epoch": 0.884445410418025, "grad_norm": 0.26681259274482727, "learning_rate": 4.5614942528735636e-05, "loss": 0.1584, "step": 763 }, { "epoch": 0.8856045787147722, "grad_norm": 0.3042534589767456, "learning_rate": 4.560919540229886e-05, "loss": 0.149, "step": 764 }, { "epoch": 0.8867637470115193, "grad_norm": 0.24351336061954498, "learning_rate": 4.560344827586207e-05, "loss": 0.1311, "step": 765 }, { "epoch": 0.8879229153082663, "grad_norm": 0.2620246708393097, "learning_rate": 4.559770114942529e-05, "loss": 0.1448, "step": 766 }, { "epoch": 0.8890820836050134, "grad_norm": 0.2437165081501007, "learning_rate": 4.559195402298851e-05, "loss": 0.1435, "step": 767 }, { "epoch": 0.8902412519017605, "grad_norm": 0.231397345662117, "learning_rate": 4.5586206896551724e-05, "loss": 0.1378, "step": 768 }, { "epoch": 0.8914004201985076, "grad_norm": 0.22732099890708923, "learning_rate": 4.5580459770114945e-05, "loss": 0.1399, "step": 769 }, { "epoch": 0.8925595884952546, "grad_norm": 0.2291109710931778, "learning_rate": 4.557471264367816e-05, "loss": 0.1207, "step": 770 }, { "epoch": 0.8937187567920017, "grad_norm": 0.23525553941726685, "learning_rate": 4.556896551724138e-05, "loss": 0.1269, "step": 771 }, { "epoch": 0.8948779250887489, "grad_norm": 0.2587391436100006, "learning_rate": 4.5563218390804604e-05, "loss": 0.1208, "step": 772 }, { "epoch": 0.8960370933854959, "grad_norm": 0.24662849307060242, "learning_rate": 4.555747126436782e-05, "loss": 0.1413, "step": 773 }, { "epoch": 0.897196261682243, "grad_norm": 0.2599044144153595, "learning_rate": 4.5551724137931033e-05, "loss": 0.1322, "step": 774 }, { "epoch": 0.8983554299789901, "grad_norm": 0.2424292415380478, "learning_rate": 4.5545977011494255e-05, "loss": 0.1359, "step": 775 }, { "epoch": 0.8995145982757372, "grad_norm": 0.25603169202804565, "learning_rate": 4.554022988505747e-05, "loss": 0.1324, "step": 776 }, { "epoch": 0.9006737665724842, "grad_norm": 0.2570304870605469, "learning_rate": 4.553448275862069e-05, "loss": 0.122, "step": 777 }, { "epoch": 0.9018329348692313, "grad_norm": 0.2565818130970001, "learning_rate": 4.552873563218391e-05, "loss": 0.1246, "step": 778 }, { "epoch": 0.9029921031659784, "grad_norm": 0.2947520613670349, "learning_rate": 4.552298850574713e-05, "loss": 0.1376, "step": 779 }, { "epoch": 0.9041512714627254, "grad_norm": 0.31323108077049255, "learning_rate": 4.551724137931035e-05, "loss": 0.148, "step": 780 }, { "epoch": 0.9053104397594726, "grad_norm": 0.3410814702510834, "learning_rate": 4.5511494252873565e-05, "loss": 0.1493, "step": 781 }, { "epoch": 0.9064696080562197, "grad_norm": 0.2659667134284973, "learning_rate": 4.550574712643678e-05, "loss": 0.1303, "step": 782 }, { "epoch": 0.9076287763529668, "grad_norm": 0.2651742696762085, "learning_rate": 4.55e-05, "loss": 0.1327, "step": 783 }, { "epoch": 0.9087879446497138, "grad_norm": 0.24942578375339508, "learning_rate": 4.549425287356322e-05, "loss": 0.1342, "step": 784 }, { "epoch": 0.9099471129464609, "grad_norm": 0.2682301104068756, "learning_rate": 4.548850574712644e-05, "loss": 0.1449, "step": 785 }, { "epoch": 0.911106281243208, "grad_norm": 0.2612859606742859, "learning_rate": 4.548275862068966e-05, "loss": 0.1371, "step": 786 }, { "epoch": 0.912265449539955, "grad_norm": 0.30771300196647644, "learning_rate": 4.5477011494252875e-05, "loss": 0.1446, "step": 787 }, { "epoch": 0.9134246178367021, "grad_norm": 0.33343127369880676, "learning_rate": 4.5471264367816096e-05, "loss": 0.1375, "step": 788 }, { "epoch": 0.9145837861334493, "grad_norm": 0.2678926885128021, "learning_rate": 4.546551724137931e-05, "loss": 0.1347, "step": 789 }, { "epoch": 0.9157429544301964, "grad_norm": 0.2823614478111267, "learning_rate": 4.5459770114942526e-05, "loss": 0.1461, "step": 790 }, { "epoch": 0.9169021227269434, "grad_norm": 0.27924710512161255, "learning_rate": 4.545402298850575e-05, "loss": 0.1343, "step": 791 }, { "epoch": 0.9180612910236905, "grad_norm": 0.27781474590301514, "learning_rate": 4.544827586206897e-05, "loss": 0.1433, "step": 792 }, { "epoch": 0.9192204593204376, "grad_norm": 0.23757725954055786, "learning_rate": 4.5442528735632184e-05, "loss": 0.1214, "step": 793 }, { "epoch": 0.9203796276171846, "grad_norm": 0.305899053812027, "learning_rate": 4.5436781609195406e-05, "loss": 0.136, "step": 794 }, { "epoch": 0.9215387959139317, "grad_norm": 0.29127955436706543, "learning_rate": 4.543103448275862e-05, "loss": 0.1551, "step": 795 }, { "epoch": 0.9226979642106788, "grad_norm": 0.29604771733283997, "learning_rate": 4.542528735632184e-05, "loss": 0.1378, "step": 796 }, { "epoch": 0.923857132507426, "grad_norm": 0.23887404799461365, "learning_rate": 4.541954022988506e-05, "loss": 0.1268, "step": 797 }, { "epoch": 0.925016300804173, "grad_norm": 0.2829357981681824, "learning_rate": 4.541379310344828e-05, "loss": 0.1432, "step": 798 }, { "epoch": 0.9261754691009201, "grad_norm": 0.2251252681016922, "learning_rate": 4.54080459770115e-05, "loss": 0.1298, "step": 799 }, { "epoch": 0.9273346373976672, "grad_norm": 0.26304125785827637, "learning_rate": 4.5402298850574716e-05, "loss": 0.1376, "step": 800 }, { "epoch": 0.9284938056944143, "grad_norm": 0.22513070702552795, "learning_rate": 4.539655172413793e-05, "loss": 0.127, "step": 801 }, { "epoch": 0.9296529739911613, "grad_norm": 0.25344815850257874, "learning_rate": 4.539080459770115e-05, "loss": 0.1395, "step": 802 }, { "epoch": 0.9308121422879084, "grad_norm": 0.23576590418815613, "learning_rate": 4.538505747126437e-05, "loss": 0.1269, "step": 803 }, { "epoch": 0.9319713105846555, "grad_norm": 0.24553732573986053, "learning_rate": 4.537931034482759e-05, "loss": 0.1408, "step": 804 }, { "epoch": 0.9331304788814025, "grad_norm": 0.24802154302597046, "learning_rate": 4.537356321839081e-05, "loss": 0.1307, "step": 805 }, { "epoch": 0.9342896471781497, "grad_norm": 0.255938321352005, "learning_rate": 4.5367816091954025e-05, "loss": 0.1446, "step": 806 }, { "epoch": 0.9354488154748968, "grad_norm": 0.23458512127399445, "learning_rate": 4.536206896551725e-05, "loss": 0.1371, "step": 807 }, { "epoch": 0.9366079837716439, "grad_norm": 0.2571949064731598, "learning_rate": 4.535632183908046e-05, "loss": 0.1319, "step": 808 }, { "epoch": 0.9377671520683909, "grad_norm": 0.2464578002691269, "learning_rate": 4.535057471264368e-05, "loss": 0.1425, "step": 809 }, { "epoch": 0.938926320365138, "grad_norm": 0.23102332651615143, "learning_rate": 4.53448275862069e-05, "loss": 0.1351, "step": 810 }, { "epoch": 0.9400854886618851, "grad_norm": 0.27257415652275085, "learning_rate": 4.5339080459770114e-05, "loss": 0.1359, "step": 811 }, { "epoch": 0.9412446569586321, "grad_norm": 0.24599824845790863, "learning_rate": 4.5333333333333335e-05, "loss": 0.1356, "step": 812 }, { "epoch": 0.9424038252553792, "grad_norm": 0.23127569258213043, "learning_rate": 4.532758620689656e-05, "loss": 0.1412, "step": 813 }, { "epoch": 0.9435629935521264, "grad_norm": 0.2365388125181198, "learning_rate": 4.532183908045977e-05, "loss": 0.1118, "step": 814 }, { "epoch": 0.9447221618488735, "grad_norm": 0.2809307873249054, "learning_rate": 4.5316091954022993e-05, "loss": 0.1363, "step": 815 }, { "epoch": 0.9458813301456205, "grad_norm": 0.258364737033844, "learning_rate": 4.531034482758621e-05, "loss": 0.1378, "step": 816 }, { "epoch": 0.9470404984423676, "grad_norm": 0.27401670813560486, "learning_rate": 4.530459770114942e-05, "loss": 0.1465, "step": 817 }, { "epoch": 0.9481996667391147, "grad_norm": 0.22026842832565308, "learning_rate": 4.5298850574712645e-05, "loss": 0.1283, "step": 818 }, { "epoch": 0.9493588350358617, "grad_norm": 0.2709653675556183, "learning_rate": 4.529310344827587e-05, "loss": 0.1284, "step": 819 }, { "epoch": 0.9505180033326088, "grad_norm": 0.2417244166135788, "learning_rate": 4.528735632183908e-05, "loss": 0.1363, "step": 820 }, { "epoch": 0.951677171629356, "grad_norm": 0.2279566526412964, "learning_rate": 4.52816091954023e-05, "loss": 0.1285, "step": 821 }, { "epoch": 0.9528363399261031, "grad_norm": 0.32052162289619446, "learning_rate": 4.527586206896552e-05, "loss": 0.1433, "step": 822 }, { "epoch": 0.9539955082228501, "grad_norm": 0.3256911635398865, "learning_rate": 4.527011494252873e-05, "loss": 0.145, "step": 823 }, { "epoch": 0.9551546765195972, "grad_norm": 0.30633530020713806, "learning_rate": 4.5264367816091955e-05, "loss": 0.1458, "step": 824 }, { "epoch": 0.9563138448163443, "grad_norm": 0.24086791276931763, "learning_rate": 4.5258620689655176e-05, "loss": 0.1213, "step": 825 }, { "epoch": 0.9574730131130914, "grad_norm": 0.3224896788597107, "learning_rate": 4.52528735632184e-05, "loss": 0.1543, "step": 826 }, { "epoch": 0.9586321814098384, "grad_norm": 0.24555163085460663, "learning_rate": 4.524712643678161e-05, "loss": 0.1295, "step": 827 }, { "epoch": 0.9597913497065855, "grad_norm": 0.2528969645500183, "learning_rate": 4.524137931034483e-05, "loss": 0.1389, "step": 828 }, { "epoch": 0.9609505180033326, "grad_norm": 0.2498130351305008, "learning_rate": 4.523563218390805e-05, "loss": 0.1529, "step": 829 }, { "epoch": 0.9621096863000796, "grad_norm": 0.26962774991989136, "learning_rate": 4.5229885057471264e-05, "loss": 0.1346, "step": 830 }, { "epoch": 0.9632688545968268, "grad_norm": 0.2615809142589569, "learning_rate": 4.522413793103448e-05, "loss": 0.1549, "step": 831 }, { "epoch": 0.9644280228935739, "grad_norm": 0.25109943747520447, "learning_rate": 4.521839080459771e-05, "loss": 0.1402, "step": 832 }, { "epoch": 0.965587191190321, "grad_norm": 0.2414674311876297, "learning_rate": 4.521264367816092e-05, "loss": 0.1329, "step": 833 }, { "epoch": 0.966746359487068, "grad_norm": 0.2999972701072693, "learning_rate": 4.5206896551724144e-05, "loss": 0.1524, "step": 834 }, { "epoch": 0.9679055277838151, "grad_norm": 0.22573649883270264, "learning_rate": 4.520114942528736e-05, "loss": 0.1248, "step": 835 }, { "epoch": 0.9690646960805622, "grad_norm": 0.2691646218299866, "learning_rate": 4.5195402298850574e-05, "loss": 0.1259, "step": 836 }, { "epoch": 0.9702238643773092, "grad_norm": 0.2566002607345581, "learning_rate": 4.5189655172413796e-05, "loss": 0.1338, "step": 837 }, { "epoch": 0.9713830326740563, "grad_norm": 0.30500656366348267, "learning_rate": 4.518390804597701e-05, "loss": 0.1393, "step": 838 }, { "epoch": 0.9725422009708035, "grad_norm": 0.2786789834499359, "learning_rate": 4.517816091954023e-05, "loss": 0.1418, "step": 839 }, { "epoch": 0.9737013692675506, "grad_norm": 0.2511579096317291, "learning_rate": 4.5172413793103454e-05, "loss": 0.1261, "step": 840 }, { "epoch": 0.9748605375642976, "grad_norm": 0.2628697454929352, "learning_rate": 4.516666666666667e-05, "loss": 0.1255, "step": 841 }, { "epoch": 0.9760197058610447, "grad_norm": 0.2691211700439453, "learning_rate": 4.5160919540229884e-05, "loss": 0.1469, "step": 842 }, { "epoch": 0.9771788741577918, "grad_norm": 0.4240913689136505, "learning_rate": 4.5155172413793106e-05, "loss": 0.1345, "step": 843 }, { "epoch": 0.9783380424545388, "grad_norm": 0.26439380645751953, "learning_rate": 4.514942528735632e-05, "loss": 0.1389, "step": 844 }, { "epoch": 0.9794972107512859, "grad_norm": 0.2187066376209259, "learning_rate": 4.514367816091954e-05, "loss": 0.1191, "step": 845 }, { "epoch": 0.980656379048033, "grad_norm": 0.2630019187927246, "learning_rate": 4.5137931034482764e-05, "loss": 0.1255, "step": 846 }, { "epoch": 0.9818155473447802, "grad_norm": 0.2619337737560272, "learning_rate": 4.513218390804598e-05, "loss": 0.146, "step": 847 }, { "epoch": 0.9829747156415272, "grad_norm": 0.24650199711322784, "learning_rate": 4.51264367816092e-05, "loss": 0.1246, "step": 848 }, { "epoch": 0.9841338839382743, "grad_norm": 0.33000877499580383, "learning_rate": 4.5120689655172415e-05, "loss": 0.1448, "step": 849 }, { "epoch": 0.9852930522350214, "grad_norm": 0.39497750997543335, "learning_rate": 4.511494252873563e-05, "loss": 0.1332, "step": 850 }, { "epoch": 0.9864522205317685, "grad_norm": 0.27251842617988586, "learning_rate": 4.510919540229885e-05, "loss": 0.127, "step": 851 }, { "epoch": 0.9876113888285155, "grad_norm": 0.4220561981201172, "learning_rate": 4.510344827586207e-05, "loss": 0.1353, "step": 852 }, { "epoch": 0.9887705571252626, "grad_norm": 0.24371755123138428, "learning_rate": 4.5097701149425295e-05, "loss": 0.1436, "step": 853 }, { "epoch": 0.9899297254220097, "grad_norm": 0.2443200647830963, "learning_rate": 4.509195402298851e-05, "loss": 0.1405, "step": 854 }, { "epoch": 0.9910888937187567, "grad_norm": 0.22042927145957947, "learning_rate": 4.5086206896551725e-05, "loss": 0.1294, "step": 855 }, { "epoch": 0.9922480620155039, "grad_norm": 0.26495447754859924, "learning_rate": 4.508045977011495e-05, "loss": 0.1442, "step": 856 }, { "epoch": 0.993407230312251, "grad_norm": 0.28576844930648804, "learning_rate": 4.507471264367816e-05, "loss": 0.1422, "step": 857 }, { "epoch": 0.9945663986089981, "grad_norm": 0.24880261719226837, "learning_rate": 4.5068965517241377e-05, "loss": 0.1404, "step": 858 }, { "epoch": 0.9957255669057451, "grad_norm": 0.2431262731552124, "learning_rate": 4.50632183908046e-05, "loss": 0.1376, "step": 859 }, { "epoch": 0.9968847352024922, "grad_norm": 0.2703196704387665, "learning_rate": 4.505747126436782e-05, "loss": 0.1407, "step": 860 }, { "epoch": 0.9980439034992393, "grad_norm": 0.3191419839859009, "learning_rate": 4.5051724137931035e-05, "loss": 0.1558, "step": 861 }, { "epoch": 0.9992030717959863, "grad_norm": 0.25417354702949524, "learning_rate": 4.5045977011494257e-05, "loss": 0.1386, "step": 862 }, { "epoch": 0.9992030717959863, "eval_loss": 0.13573063910007477, "eval_runtime": 265.6894, "eval_samples_per_second": 5.774, "eval_steps_per_second": 5.774, "step": 862 }, { "epoch": 1.0003622400927334, "grad_norm": 0.21144159138202667, "learning_rate": 4.504022988505747e-05, "loss": 0.119, "step": 863 }, { "epoch": 1.0015214083894806, "grad_norm": 0.27048224210739136, "learning_rate": 4.503448275862069e-05, "loss": 0.1282, "step": 864 }, { "epoch": 1.0026805766862277, "grad_norm": 0.2059887945652008, "learning_rate": 4.502873563218391e-05, "loss": 0.1389, "step": 865 }, { "epoch": 1.0038397449829748, "grad_norm": 0.25673580169677734, "learning_rate": 4.502298850574713e-05, "loss": 0.134, "step": 866 }, { "epoch": 1.004998913279722, "grad_norm": 0.2381593883037567, "learning_rate": 4.501724137931035e-05, "loss": 0.1271, "step": 867 }, { "epoch": 1.0061580815764688, "grad_norm": 0.2609866261482239, "learning_rate": 4.5011494252873566e-05, "loss": 0.1349, "step": 868 }, { "epoch": 1.007317249873216, "grad_norm": 0.24942117929458618, "learning_rate": 4.500574712643678e-05, "loss": 0.1251, "step": 869 }, { "epoch": 1.008476418169963, "grad_norm": 0.2552180886268616, "learning_rate": 4.5e-05, "loss": 0.1333, "step": 870 }, { "epoch": 1.0096355864667101, "grad_norm": 0.2939591407775879, "learning_rate": 4.499425287356322e-05, "loss": 0.1433, "step": 871 }, { "epoch": 1.0107947547634573, "grad_norm": 0.2345464825630188, "learning_rate": 4.498850574712644e-05, "loss": 0.1273, "step": 872 }, { "epoch": 1.0119539230602044, "grad_norm": 0.3161166310310364, "learning_rate": 4.498275862068966e-05, "loss": 0.1301, "step": 873 }, { "epoch": 1.0131130913569515, "grad_norm": 0.2409006655216217, "learning_rate": 4.4977011494252876e-05, "loss": 0.1145, "step": 874 }, { "epoch": 1.0142722596536984, "grad_norm": 0.26523053646087646, "learning_rate": 4.49712643678161e-05, "loss": 0.1386, "step": 875 }, { "epoch": 1.0154314279504455, "grad_norm": 0.2150416225194931, "learning_rate": 4.496551724137931e-05, "loss": 0.1285, "step": 876 }, { "epoch": 1.0165905962471926, "grad_norm": 0.23828035593032837, "learning_rate": 4.495977011494253e-05, "loss": 0.1161, "step": 877 }, { "epoch": 1.0177497645439397, "grad_norm": 0.319975882768631, "learning_rate": 4.495402298850575e-05, "loss": 0.1444, "step": 878 }, { "epoch": 1.0189089328406868, "grad_norm": 0.22034992277622223, "learning_rate": 4.4948275862068964e-05, "loss": 0.1195, "step": 879 }, { "epoch": 1.020068101137434, "grad_norm": 0.2223641574382782, "learning_rate": 4.4942528735632186e-05, "loss": 0.1181, "step": 880 }, { "epoch": 1.021227269434181, "grad_norm": 0.21697430312633514, "learning_rate": 4.493678160919541e-05, "loss": 0.1131, "step": 881 }, { "epoch": 1.022386437730928, "grad_norm": 0.24257703125476837, "learning_rate": 4.493103448275862e-05, "loss": 0.123, "step": 882 }, { "epoch": 1.023545606027675, "grad_norm": 0.2622239291667938, "learning_rate": 4.4925287356321844e-05, "loss": 0.1325, "step": 883 }, { "epoch": 1.0247047743244222, "grad_norm": 0.2764127254486084, "learning_rate": 4.491954022988506e-05, "loss": 0.1228, "step": 884 }, { "epoch": 1.0258639426211693, "grad_norm": 0.25700658559799194, "learning_rate": 4.4913793103448274e-05, "loss": 0.1212, "step": 885 }, { "epoch": 1.0270231109179164, "grad_norm": 0.27795660495758057, "learning_rate": 4.4908045977011495e-05, "loss": 0.1328, "step": 886 }, { "epoch": 1.0281822792146635, "grad_norm": 0.28292378783226013, "learning_rate": 4.490229885057472e-05, "loss": 0.1197, "step": 887 }, { "epoch": 1.0293414475114107, "grad_norm": 0.27904993295669556, "learning_rate": 4.489655172413793e-05, "loss": 0.1208, "step": 888 }, { "epoch": 1.0305006158081575, "grad_norm": 0.23822936415672302, "learning_rate": 4.4890804597701154e-05, "loss": 0.1187, "step": 889 }, { "epoch": 1.0316597841049047, "grad_norm": 0.23615434765815735, "learning_rate": 4.488505747126437e-05, "loss": 0.119, "step": 890 }, { "epoch": 1.0328189524016518, "grad_norm": 0.2193615883588791, "learning_rate": 4.487931034482759e-05, "loss": 0.1251, "step": 891 }, { "epoch": 1.033978120698399, "grad_norm": 0.2385019063949585, "learning_rate": 4.4873563218390805e-05, "loss": 0.1261, "step": 892 }, { "epoch": 1.035137288995146, "grad_norm": 0.262351393699646, "learning_rate": 4.486781609195403e-05, "loss": 0.1285, "step": 893 }, { "epoch": 1.0362964572918931, "grad_norm": 0.2790246307849884, "learning_rate": 4.486206896551725e-05, "loss": 0.13, "step": 894 }, { "epoch": 1.0374556255886402, "grad_norm": 0.28902125358581543, "learning_rate": 4.4856321839080463e-05, "loss": 0.1266, "step": 895 }, { "epoch": 1.0386147938853871, "grad_norm": 0.2775196433067322, "learning_rate": 4.485057471264368e-05, "loss": 0.1453, "step": 896 }, { "epoch": 1.0397739621821342, "grad_norm": 0.3169780671596527, "learning_rate": 4.48448275862069e-05, "loss": 0.1372, "step": 897 }, { "epoch": 1.0409331304788814, "grad_norm": 0.24293041229248047, "learning_rate": 4.4839080459770115e-05, "loss": 0.1267, "step": 898 }, { "epoch": 1.0420922987756285, "grad_norm": 0.3456263244152069, "learning_rate": 4.483333333333333e-05, "loss": 0.1566, "step": 899 }, { "epoch": 1.0432514670723756, "grad_norm": 0.2542770504951477, "learning_rate": 4.482758620689655e-05, "loss": 0.1292, "step": 900 }, { "epoch": 1.0444106353691227, "grad_norm": 0.308881551027298, "learning_rate": 4.482183908045977e-05, "loss": 0.1492, "step": 901 }, { "epoch": 1.0455698036658698, "grad_norm": 0.26184189319610596, "learning_rate": 4.4816091954022995e-05, "loss": 0.1225, "step": 902 }, { "epoch": 1.0467289719626167, "grad_norm": 0.22493098676204681, "learning_rate": 4.481034482758621e-05, "loss": 0.1239, "step": 903 }, { "epoch": 1.0478881402593638, "grad_norm": 0.23027804493904114, "learning_rate": 4.4804597701149425e-05, "loss": 0.1253, "step": 904 }, { "epoch": 1.049047308556111, "grad_norm": 0.2515313923358917, "learning_rate": 4.4798850574712646e-05, "loss": 0.1268, "step": 905 }, { "epoch": 1.050206476852858, "grad_norm": 0.30108925700187683, "learning_rate": 4.479310344827586e-05, "loss": 0.1306, "step": 906 }, { "epoch": 1.0513656451496052, "grad_norm": 0.25616785883903503, "learning_rate": 4.478735632183908e-05, "loss": 0.1292, "step": 907 }, { "epoch": 1.0525248134463523, "grad_norm": 0.27651333808898926, "learning_rate": 4.4781609195402305e-05, "loss": 0.1293, "step": 908 }, { "epoch": 1.0536839817430994, "grad_norm": 0.26399362087249756, "learning_rate": 4.477586206896552e-05, "loss": 0.1379, "step": 909 }, { "epoch": 1.0548431500398463, "grad_norm": 0.27588364481925964, "learning_rate": 4.477011494252874e-05, "loss": 0.1369, "step": 910 }, { "epoch": 1.0560023183365934, "grad_norm": 0.22311675548553467, "learning_rate": 4.4764367816091956e-05, "loss": 0.119, "step": 911 }, { "epoch": 1.0571614866333405, "grad_norm": 0.25451788306236267, "learning_rate": 4.475862068965517e-05, "loss": 0.1204, "step": 912 }, { "epoch": 1.0583206549300876, "grad_norm": 0.25941815972328186, "learning_rate": 4.475287356321839e-05, "loss": 0.1291, "step": 913 }, { "epoch": 1.0594798232268348, "grad_norm": 0.2863259017467499, "learning_rate": 4.4747126436781614e-05, "loss": 0.1269, "step": 914 }, { "epoch": 1.0606389915235819, "grad_norm": 0.26459577679634094, "learning_rate": 4.474137931034483e-05, "loss": 0.1291, "step": 915 }, { "epoch": 1.061798159820329, "grad_norm": 0.2957836985588074, "learning_rate": 4.473563218390805e-05, "loss": 0.1313, "step": 916 }, { "epoch": 1.062957328117076, "grad_norm": 0.33022555708885193, "learning_rate": 4.4729885057471266e-05, "loss": 0.1444, "step": 917 }, { "epoch": 1.064116496413823, "grad_norm": 0.26044270396232605, "learning_rate": 4.472413793103448e-05, "loss": 0.1297, "step": 918 }, { "epoch": 1.0652756647105701, "grad_norm": 0.25342807173728943, "learning_rate": 4.47183908045977e-05, "loss": 0.1134, "step": 919 }, { "epoch": 1.0664348330073172, "grad_norm": 0.27540135383605957, "learning_rate": 4.471264367816092e-05, "loss": 0.1218, "step": 920 }, { "epoch": 1.0675940013040643, "grad_norm": 0.24183742702007294, "learning_rate": 4.470689655172414e-05, "loss": 0.1227, "step": 921 }, { "epoch": 1.0687531696008115, "grad_norm": 0.493832528591156, "learning_rate": 4.470114942528736e-05, "loss": 0.132, "step": 922 }, { "epoch": 1.0699123378975586, "grad_norm": 0.31524115800857544, "learning_rate": 4.4695402298850576e-05, "loss": 0.135, "step": 923 }, { "epoch": 1.0710715061943057, "grad_norm": 0.272127240896225, "learning_rate": 4.46896551724138e-05, "loss": 0.13, "step": 924 }, { "epoch": 1.0722306744910526, "grad_norm": 0.24013520777225494, "learning_rate": 4.468390804597701e-05, "loss": 0.1231, "step": 925 }, { "epoch": 1.0733898427877997, "grad_norm": 0.44068050384521484, "learning_rate": 4.467816091954023e-05, "loss": 0.1219, "step": 926 }, { "epoch": 1.0745490110845468, "grad_norm": 0.2556682825088501, "learning_rate": 4.467241379310345e-05, "loss": 0.132, "step": 927 }, { "epoch": 1.075708179381294, "grad_norm": 0.22146081924438477, "learning_rate": 4.466666666666667e-05, "loss": 0.1174, "step": 928 }, { "epoch": 1.076867347678041, "grad_norm": 0.2589811086654663, "learning_rate": 4.466091954022989e-05, "loss": 0.1235, "step": 929 }, { "epoch": 1.0780265159747882, "grad_norm": 0.26327693462371826, "learning_rate": 4.465517241379311e-05, "loss": 0.1303, "step": 930 }, { "epoch": 1.0791856842715353, "grad_norm": 0.25532066822052, "learning_rate": 4.464942528735632e-05, "loss": 0.1442, "step": 931 }, { "epoch": 1.0803448525682822, "grad_norm": 0.31189385056495667, "learning_rate": 4.4643678160919544e-05, "loss": 0.1448, "step": 932 }, { "epoch": 1.0815040208650293, "grad_norm": 0.26370447874069214, "learning_rate": 4.463793103448276e-05, "loss": 0.1257, "step": 933 }, { "epoch": 1.0826631891617764, "grad_norm": 0.2603430151939392, "learning_rate": 4.463218390804598e-05, "loss": 0.1248, "step": 934 }, { "epoch": 1.0838223574585235, "grad_norm": 0.2369643896818161, "learning_rate": 4.46264367816092e-05, "loss": 0.1211, "step": 935 }, { "epoch": 1.0849815257552706, "grad_norm": 0.2757248878479004, "learning_rate": 4.462068965517242e-05, "loss": 0.1319, "step": 936 }, { "epoch": 1.0861406940520177, "grad_norm": 0.249775692820549, "learning_rate": 4.461494252873564e-05, "loss": 0.1349, "step": 937 }, { "epoch": 1.0872998623487649, "grad_norm": 0.25326481461524963, "learning_rate": 4.460919540229885e-05, "loss": 0.1313, "step": 938 }, { "epoch": 1.0884590306455117, "grad_norm": 0.20008519291877747, "learning_rate": 4.460344827586207e-05, "loss": 0.1142, "step": 939 }, { "epoch": 1.0896181989422589, "grad_norm": 0.29447588324546814, "learning_rate": 4.459770114942529e-05, "loss": 0.1376, "step": 940 }, { "epoch": 1.090777367239006, "grad_norm": 0.22719112038612366, "learning_rate": 4.4591954022988505e-05, "loss": 0.1122, "step": 941 }, { "epoch": 1.091936535535753, "grad_norm": 0.2523985207080841, "learning_rate": 4.4586206896551726e-05, "loss": 0.1289, "step": 942 }, { "epoch": 1.0930957038325002, "grad_norm": 0.2906104028224945, "learning_rate": 4.458045977011495e-05, "loss": 0.1285, "step": 943 }, { "epoch": 1.0942548721292473, "grad_norm": 0.344566285610199, "learning_rate": 4.457471264367816e-05, "loss": 0.1237, "step": 944 }, { "epoch": 1.0954140404259944, "grad_norm": 0.2776789963245392, "learning_rate": 4.456896551724138e-05, "loss": 0.1273, "step": 945 }, { "epoch": 1.0965732087227413, "grad_norm": 0.2559017241001129, "learning_rate": 4.45632183908046e-05, "loss": 0.1239, "step": 946 }, { "epoch": 1.0977323770194884, "grad_norm": 0.2877829670906067, "learning_rate": 4.4557471264367815e-05, "loss": 0.1337, "step": 947 }, { "epoch": 1.0988915453162356, "grad_norm": 0.3006168007850647, "learning_rate": 4.4551724137931036e-05, "loss": 0.1387, "step": 948 }, { "epoch": 1.1000507136129827, "grad_norm": 0.2508884370326996, "learning_rate": 4.454597701149426e-05, "loss": 0.1225, "step": 949 }, { "epoch": 1.1012098819097298, "grad_norm": 0.2920970618724823, "learning_rate": 4.454022988505747e-05, "loss": 0.1384, "step": 950 }, { "epoch": 1.102369050206477, "grad_norm": 0.28268909454345703, "learning_rate": 4.4534482758620694e-05, "loss": 0.1344, "step": 951 }, { "epoch": 1.103528218503224, "grad_norm": 0.41611185669898987, "learning_rate": 4.452873563218391e-05, "loss": 0.1416, "step": 952 }, { "epoch": 1.104687386799971, "grad_norm": 0.27131763100624084, "learning_rate": 4.4522988505747124e-05, "loss": 0.1279, "step": 953 }, { "epoch": 1.105846555096718, "grad_norm": 0.250305712223053, "learning_rate": 4.4517241379310346e-05, "loss": 0.1136, "step": 954 }, { "epoch": 1.1070057233934651, "grad_norm": 0.35008758306503296, "learning_rate": 4.451149425287357e-05, "loss": 0.1449, "step": 955 }, { "epoch": 1.1081648916902123, "grad_norm": 0.23740458488464355, "learning_rate": 4.450574712643679e-05, "loss": 0.1094, "step": 956 }, { "epoch": 1.1093240599869594, "grad_norm": 0.25829577445983887, "learning_rate": 4.4500000000000004e-05, "loss": 0.1097, "step": 957 }, { "epoch": 1.1104832282837065, "grad_norm": 0.24298064410686493, "learning_rate": 4.449425287356322e-05, "loss": 0.1226, "step": 958 }, { "epoch": 1.1116423965804536, "grad_norm": 0.27012696862220764, "learning_rate": 4.448850574712644e-05, "loss": 0.1289, "step": 959 }, { "epoch": 1.1128015648772007, "grad_norm": 0.289583683013916, "learning_rate": 4.4482758620689656e-05, "loss": 0.138, "step": 960 }, { "epoch": 1.1139607331739476, "grad_norm": 0.23818224668502808, "learning_rate": 4.447701149425287e-05, "loss": 0.1302, "step": 961 }, { "epoch": 1.1151199014706947, "grad_norm": 0.2659807801246643, "learning_rate": 4.447126436781609e-05, "loss": 0.1248, "step": 962 }, { "epoch": 1.1162790697674418, "grad_norm": 0.27049756050109863, "learning_rate": 4.4465517241379314e-05, "loss": 0.1355, "step": 963 }, { "epoch": 1.117438238064189, "grad_norm": 0.2728050947189331, "learning_rate": 4.445977011494253e-05, "loss": 0.1264, "step": 964 }, { "epoch": 1.118597406360936, "grad_norm": 0.2654384672641754, "learning_rate": 4.445402298850575e-05, "loss": 0.1303, "step": 965 }, { "epoch": 1.1197565746576832, "grad_norm": 0.25535136461257935, "learning_rate": 4.4448275862068965e-05, "loss": 0.1189, "step": 966 }, { "epoch": 1.12091574295443, "grad_norm": 0.2649828791618347, "learning_rate": 4.444252873563219e-05, "loss": 0.1289, "step": 967 }, { "epoch": 1.1220749112511772, "grad_norm": 0.26751139760017395, "learning_rate": 4.44367816091954e-05, "loss": 0.1306, "step": 968 }, { "epoch": 1.1232340795479243, "grad_norm": 0.2819879353046417, "learning_rate": 4.4431034482758624e-05, "loss": 0.132, "step": 969 }, { "epoch": 1.1243932478446714, "grad_norm": 0.24419303238391876, "learning_rate": 4.4425287356321845e-05, "loss": 0.1302, "step": 970 }, { "epoch": 1.1255524161414185, "grad_norm": 0.23565685749053955, "learning_rate": 4.441954022988506e-05, "loss": 0.1215, "step": 971 }, { "epoch": 1.1267115844381657, "grad_norm": 0.24753518402576447, "learning_rate": 4.4413793103448275e-05, "loss": 0.1195, "step": 972 }, { "epoch": 1.1278707527349128, "grad_norm": 0.26664602756500244, "learning_rate": 4.44080459770115e-05, "loss": 0.1289, "step": 973 }, { "epoch": 1.1290299210316599, "grad_norm": 0.2846687436103821, "learning_rate": 4.440229885057471e-05, "loss": 0.1374, "step": 974 }, { "epoch": 1.1301890893284068, "grad_norm": 0.2790849506855011, "learning_rate": 4.4396551724137933e-05, "loss": 0.1427, "step": 975 }, { "epoch": 1.131348257625154, "grad_norm": 0.27603012323379517, "learning_rate": 4.4390804597701155e-05, "loss": 0.1335, "step": 976 }, { "epoch": 1.132507425921901, "grad_norm": 0.2694251835346222, "learning_rate": 4.438505747126437e-05, "loss": 0.1278, "step": 977 }, { "epoch": 1.1336665942186481, "grad_norm": 0.25316059589385986, "learning_rate": 4.437931034482759e-05, "loss": 0.1387, "step": 978 }, { "epoch": 1.1348257625153952, "grad_norm": 0.2663499116897583, "learning_rate": 4.437356321839081e-05, "loss": 0.1206, "step": 979 }, { "epoch": 1.1359849308121424, "grad_norm": 0.2719017267227173, "learning_rate": 4.436781609195402e-05, "loss": 0.1401, "step": 980 }, { "epoch": 1.1371440991088893, "grad_norm": 0.4000224769115448, "learning_rate": 4.436206896551724e-05, "loss": 0.1259, "step": 981 }, { "epoch": 1.1383032674056364, "grad_norm": 0.2817740738391876, "learning_rate": 4.435632183908046e-05, "loss": 0.1307, "step": 982 }, { "epoch": 1.1394624357023835, "grad_norm": 0.28263017535209656, "learning_rate": 4.435057471264368e-05, "loss": 0.1328, "step": 983 }, { "epoch": 1.1406216039991306, "grad_norm": 0.2730136513710022, "learning_rate": 4.43448275862069e-05, "loss": 0.125, "step": 984 }, { "epoch": 1.1417807722958777, "grad_norm": 0.24145883321762085, "learning_rate": 4.4339080459770116e-05, "loss": 0.1206, "step": 985 }, { "epoch": 1.1429399405926248, "grad_norm": 0.29111185669898987, "learning_rate": 4.433333333333334e-05, "loss": 0.1469, "step": 986 }, { "epoch": 1.144099108889372, "grad_norm": 0.28760775923728943, "learning_rate": 4.432758620689655e-05, "loss": 0.1338, "step": 987 }, { "epoch": 1.145258277186119, "grad_norm": 0.2725701630115509, "learning_rate": 4.432183908045977e-05, "loss": 0.1314, "step": 988 }, { "epoch": 1.146417445482866, "grad_norm": 0.2519955635070801, "learning_rate": 4.431609195402299e-05, "loss": 0.1277, "step": 989 }, { "epoch": 1.147576613779613, "grad_norm": 0.28241047263145447, "learning_rate": 4.431034482758621e-05, "loss": 0.1309, "step": 990 }, { "epoch": 1.1487357820763602, "grad_norm": 0.284697562456131, "learning_rate": 4.4304597701149426e-05, "loss": 0.1396, "step": 991 }, { "epoch": 1.1498949503731073, "grad_norm": 0.2153967171907425, "learning_rate": 4.429885057471265e-05, "loss": 0.1074, "step": 992 }, { "epoch": 1.1510541186698544, "grad_norm": 0.28190892934799194, "learning_rate": 4.429310344827586e-05, "loss": 0.1408, "step": 993 }, { "epoch": 1.1522132869666015, "grad_norm": 0.26553207635879517, "learning_rate": 4.4287356321839084e-05, "loss": 0.1191, "step": 994 }, { "epoch": 1.1533724552633486, "grad_norm": 0.3092179596424103, "learning_rate": 4.42816091954023e-05, "loss": 0.1408, "step": 995 }, { "epoch": 1.1545316235600955, "grad_norm": 0.2503480315208435, "learning_rate": 4.427586206896552e-05, "loss": 0.1152, "step": 996 }, { "epoch": 1.1556907918568426, "grad_norm": 0.279032826423645, "learning_rate": 4.427011494252874e-05, "loss": 0.118, "step": 997 }, { "epoch": 1.1568499601535898, "grad_norm": 0.283666729927063, "learning_rate": 4.426436781609196e-05, "loss": 0.143, "step": 998 }, { "epoch": 1.1580091284503369, "grad_norm": 0.3145494759082794, "learning_rate": 4.425862068965517e-05, "loss": 0.1467, "step": 999 }, { "epoch": 1.159168296747084, "grad_norm": 0.24866163730621338, "learning_rate": 4.4252873563218394e-05, "loss": 0.1217, "step": 1000 }, { "epoch": 1.160327465043831, "grad_norm": 0.29811036586761475, "learning_rate": 4.424712643678161e-05, "loss": 0.1343, "step": 1001 }, { "epoch": 1.1614866333405782, "grad_norm": 0.2376304566860199, "learning_rate": 4.4241379310344824e-05, "loss": 0.1111, "step": 1002 }, { "epoch": 1.1626458016373253, "grad_norm": 0.2960997223854065, "learning_rate": 4.423563218390805e-05, "loss": 0.1537, "step": 1003 }, { "epoch": 1.1638049699340722, "grad_norm": 0.2618663012981415, "learning_rate": 4.422988505747127e-05, "loss": 0.1244, "step": 1004 }, { "epoch": 1.1649641382308193, "grad_norm": 0.2316623479127884, "learning_rate": 4.422413793103449e-05, "loss": 0.1276, "step": 1005 }, { "epoch": 1.1661233065275665, "grad_norm": 0.225045844912529, "learning_rate": 4.4218390804597704e-05, "loss": 0.12, "step": 1006 }, { "epoch": 1.1672824748243136, "grad_norm": 0.2224287986755371, "learning_rate": 4.421264367816092e-05, "loss": 0.1281, "step": 1007 }, { "epoch": 1.1684416431210607, "grad_norm": 0.23176699876785278, "learning_rate": 4.420689655172414e-05, "loss": 0.1115, "step": 1008 }, { "epoch": 1.1696008114178078, "grad_norm": 0.2502862215042114, "learning_rate": 4.4201149425287355e-05, "loss": 0.1183, "step": 1009 }, { "epoch": 1.1707599797145547, "grad_norm": 0.24579674005508423, "learning_rate": 4.419540229885058e-05, "loss": 0.1302, "step": 1010 }, { "epoch": 1.1719191480113018, "grad_norm": 0.26011398434638977, "learning_rate": 4.41896551724138e-05, "loss": 0.1386, "step": 1011 }, { "epoch": 1.173078316308049, "grad_norm": 0.26081839203834534, "learning_rate": 4.4183908045977014e-05, "loss": 0.1511, "step": 1012 }, { "epoch": 1.174237484604796, "grad_norm": 0.22634555399417877, "learning_rate": 4.4178160919540235e-05, "loss": 0.1304, "step": 1013 }, { "epoch": 1.1753966529015432, "grad_norm": 0.23882032930850983, "learning_rate": 4.417241379310345e-05, "loss": 0.1289, "step": 1014 }, { "epoch": 1.1765558211982903, "grad_norm": 0.26218292117118835, "learning_rate": 4.4166666666666665e-05, "loss": 0.1373, "step": 1015 }, { "epoch": 1.1777149894950374, "grad_norm": 0.3141017556190491, "learning_rate": 4.416091954022989e-05, "loss": 0.1389, "step": 1016 }, { "epoch": 1.1788741577917845, "grad_norm": 0.2978722155094147, "learning_rate": 4.415517241379311e-05, "loss": 0.14, "step": 1017 }, { "epoch": 1.1800333260885314, "grad_norm": 0.30704426765441895, "learning_rate": 4.414942528735632e-05, "loss": 0.1286, "step": 1018 }, { "epoch": 1.1811924943852785, "grad_norm": 0.3598279058933258, "learning_rate": 4.4143678160919545e-05, "loss": 0.1354, "step": 1019 }, { "epoch": 1.1823516626820256, "grad_norm": 0.2837865650653839, "learning_rate": 4.413793103448276e-05, "loss": 0.1311, "step": 1020 }, { "epoch": 1.1835108309787727, "grad_norm": 0.2859646677970886, "learning_rate": 4.4132183908045975e-05, "loss": 0.1153, "step": 1021 }, { "epoch": 1.1846699992755199, "grad_norm": 0.2848678529262543, "learning_rate": 4.4126436781609196e-05, "loss": 0.1344, "step": 1022 }, { "epoch": 1.185829167572267, "grad_norm": 0.27502962946891785, "learning_rate": 4.412068965517241e-05, "loss": 0.1336, "step": 1023 }, { "epoch": 1.1869883358690139, "grad_norm": 0.23905035853385925, "learning_rate": 4.411494252873564e-05, "loss": 0.1415, "step": 1024 }, { "epoch": 1.188147504165761, "grad_norm": 0.3138931691646576, "learning_rate": 4.4109195402298855e-05, "loss": 0.1388, "step": 1025 }, { "epoch": 1.189306672462508, "grad_norm": 0.2796839773654938, "learning_rate": 4.410344827586207e-05, "loss": 0.126, "step": 1026 }, { "epoch": 1.1904658407592552, "grad_norm": 0.24468503892421722, "learning_rate": 4.409770114942529e-05, "loss": 0.1356, "step": 1027 }, { "epoch": 1.1916250090560023, "grad_norm": 0.2509666383266449, "learning_rate": 4.4091954022988506e-05, "loss": 0.1223, "step": 1028 }, { "epoch": 1.1927841773527494, "grad_norm": 0.2291174829006195, "learning_rate": 4.408620689655172e-05, "loss": 0.1114, "step": 1029 }, { "epoch": 1.1939433456494966, "grad_norm": 0.21360966563224792, "learning_rate": 4.408045977011494e-05, "loss": 0.1149, "step": 1030 }, { "epoch": 1.1951025139462437, "grad_norm": 0.273369163274765, "learning_rate": 4.4074712643678164e-05, "loss": 0.14, "step": 1031 }, { "epoch": 1.1962616822429906, "grad_norm": 0.24241317808628082, "learning_rate": 4.4068965517241386e-05, "loss": 0.1242, "step": 1032 }, { "epoch": 1.1974208505397377, "grad_norm": 0.29952454566955566, "learning_rate": 4.40632183908046e-05, "loss": 0.1363, "step": 1033 }, { "epoch": 1.1985800188364848, "grad_norm": 0.2337886095046997, "learning_rate": 4.4057471264367816e-05, "loss": 0.1229, "step": 1034 }, { "epoch": 1.199739187133232, "grad_norm": 0.21567285060882568, "learning_rate": 4.405172413793104e-05, "loss": 0.1208, "step": 1035 }, { "epoch": 1.200898355429979, "grad_norm": 0.2876308262348175, "learning_rate": 4.404597701149425e-05, "loss": 0.1376, "step": 1036 }, { "epoch": 1.2020575237267261, "grad_norm": 0.30270469188690186, "learning_rate": 4.4040229885057474e-05, "loss": 0.1444, "step": 1037 }, { "epoch": 1.203216692023473, "grad_norm": 0.2885245680809021, "learning_rate": 4.4034482758620696e-05, "loss": 0.1327, "step": 1038 }, { "epoch": 1.2043758603202201, "grad_norm": 0.3696737289428711, "learning_rate": 4.402873563218391e-05, "loss": 0.1444, "step": 1039 }, { "epoch": 1.2055350286169673, "grad_norm": 0.26480332016944885, "learning_rate": 4.4022988505747126e-05, "loss": 0.1332, "step": 1040 }, { "epoch": 1.2066941969137144, "grad_norm": 0.25360003113746643, "learning_rate": 4.401724137931035e-05, "loss": 0.1315, "step": 1041 }, { "epoch": 1.2078533652104615, "grad_norm": 0.22617582976818085, "learning_rate": 4.401149425287356e-05, "loss": 0.1289, "step": 1042 }, { "epoch": 1.2090125335072086, "grad_norm": 0.22125768661499023, "learning_rate": 4.4005747126436784e-05, "loss": 0.125, "step": 1043 }, { "epoch": 1.2101717018039557, "grad_norm": 0.23760303854942322, "learning_rate": 4.4000000000000006e-05, "loss": 0.1201, "step": 1044 }, { "epoch": 1.2113308701007028, "grad_norm": 0.28465020656585693, "learning_rate": 4.399425287356322e-05, "loss": 0.1306, "step": 1045 }, { "epoch": 1.21249003839745, "grad_norm": 0.2427520602941513, "learning_rate": 4.398850574712644e-05, "loss": 0.1252, "step": 1046 }, { "epoch": 1.2136492066941968, "grad_norm": 0.29493942856788635, "learning_rate": 4.398275862068966e-05, "loss": 0.1131, "step": 1047 }, { "epoch": 1.214808374990944, "grad_norm": 0.24606949090957642, "learning_rate": 4.397701149425287e-05, "loss": 0.1254, "step": 1048 }, { "epoch": 1.215967543287691, "grad_norm": 0.26658526062965393, "learning_rate": 4.3971264367816094e-05, "loss": 0.1299, "step": 1049 }, { "epoch": 1.2171267115844382, "grad_norm": 0.22725453972816467, "learning_rate": 4.396551724137931e-05, "loss": 0.1209, "step": 1050 }, { "epoch": 1.2182858798811853, "grad_norm": 0.26789578795433044, "learning_rate": 4.395977011494253e-05, "loss": 0.131, "step": 1051 }, { "epoch": 1.2194450481779324, "grad_norm": 0.2898029685020447, "learning_rate": 4.395402298850575e-05, "loss": 0.1321, "step": 1052 }, { "epoch": 1.2206042164746793, "grad_norm": 0.2893812656402588, "learning_rate": 4.394827586206897e-05, "loss": 0.151, "step": 1053 }, { "epoch": 1.2217633847714264, "grad_norm": 0.2428007870912552, "learning_rate": 4.394252873563219e-05, "loss": 0.1221, "step": 1054 }, { "epoch": 1.2229225530681735, "grad_norm": 0.31453728675842285, "learning_rate": 4.3936781609195403e-05, "loss": 0.127, "step": 1055 }, { "epoch": 1.2240817213649207, "grad_norm": 0.22855356335639954, "learning_rate": 4.393103448275862e-05, "loss": 0.1232, "step": 1056 }, { "epoch": 1.2252408896616678, "grad_norm": 0.21666891872882843, "learning_rate": 4.392528735632184e-05, "loss": 0.1174, "step": 1057 }, { "epoch": 1.226400057958415, "grad_norm": 0.2279151976108551, "learning_rate": 4.391954022988506e-05, "loss": 0.1307, "step": 1058 }, { "epoch": 1.227559226255162, "grad_norm": 0.2704702615737915, "learning_rate": 4.3913793103448277e-05, "loss": 0.1328, "step": 1059 }, { "epoch": 1.2287183945519091, "grad_norm": 0.2585502564907074, "learning_rate": 4.39080459770115e-05, "loss": 0.1359, "step": 1060 }, { "epoch": 1.229877562848656, "grad_norm": 0.29662445187568665, "learning_rate": 4.390229885057471e-05, "loss": 0.1302, "step": 1061 }, { "epoch": 1.2310367311454031, "grad_norm": 0.24643655121326447, "learning_rate": 4.3896551724137935e-05, "loss": 0.1233, "step": 1062 }, { "epoch": 1.2321958994421502, "grad_norm": 0.238943949341774, "learning_rate": 4.389080459770115e-05, "loss": 0.1271, "step": 1063 }, { "epoch": 1.2333550677388974, "grad_norm": 0.27087101340293884, "learning_rate": 4.388505747126437e-05, "loss": 0.125, "step": 1064 }, { "epoch": 1.2345142360356445, "grad_norm": 0.29983842372894287, "learning_rate": 4.387931034482759e-05, "loss": 0.135, "step": 1065 }, { "epoch": 1.2356734043323916, "grad_norm": 0.24061062932014465, "learning_rate": 4.387356321839081e-05, "loss": 0.1365, "step": 1066 }, { "epoch": 1.2368325726291385, "grad_norm": 0.24052466452121735, "learning_rate": 4.386781609195402e-05, "loss": 0.1175, "step": 1067 }, { "epoch": 1.2379917409258856, "grad_norm": 0.2357567995786667, "learning_rate": 4.3862068965517245e-05, "loss": 0.1337, "step": 1068 }, { "epoch": 1.2391509092226327, "grad_norm": 0.3319186270236969, "learning_rate": 4.385632183908046e-05, "loss": 0.1434, "step": 1069 }, { "epoch": 1.2403100775193798, "grad_norm": 0.34508636593818665, "learning_rate": 4.385057471264368e-05, "loss": 0.1511, "step": 1070 }, { "epoch": 1.241469245816127, "grad_norm": 0.22875401377677917, "learning_rate": 4.3844827586206896e-05, "loss": 0.1278, "step": 1071 }, { "epoch": 1.242628414112874, "grad_norm": 0.2499420940876007, "learning_rate": 4.383908045977012e-05, "loss": 0.1204, "step": 1072 }, { "epoch": 1.2437875824096212, "grad_norm": 0.24523814022541046, "learning_rate": 4.383333333333334e-05, "loss": 0.1351, "step": 1073 }, { "epoch": 1.2449467507063683, "grad_norm": 0.28577345609664917, "learning_rate": 4.3827586206896554e-05, "loss": 0.121, "step": 1074 }, { "epoch": 1.2461059190031152, "grad_norm": 0.21815964579582214, "learning_rate": 4.382183908045977e-05, "loss": 0.113, "step": 1075 }, { "epoch": 1.2472650872998623, "grad_norm": 0.24391821026802063, "learning_rate": 4.381609195402299e-05, "loss": 0.1196, "step": 1076 }, { "epoch": 1.2484242555966094, "grad_norm": 0.2738696336746216, "learning_rate": 4.3810344827586206e-05, "loss": 0.1436, "step": 1077 }, { "epoch": 1.2495834238933565, "grad_norm": 0.24671784043312073, "learning_rate": 4.380459770114943e-05, "loss": 0.1221, "step": 1078 }, { "epoch": 1.2507425921901036, "grad_norm": 0.249884694814682, "learning_rate": 4.379885057471265e-05, "loss": 0.1199, "step": 1079 }, { "epoch": 1.2519017604868508, "grad_norm": 0.25286155939102173, "learning_rate": 4.3793103448275864e-05, "loss": 0.1381, "step": 1080 }, { "epoch": 1.2530609287835976, "grad_norm": 0.36500653624534607, "learning_rate": 4.3787356321839086e-05, "loss": 0.13, "step": 1081 }, { "epoch": 1.2542200970803448, "grad_norm": 0.2865230441093445, "learning_rate": 4.37816091954023e-05, "loss": 0.1223, "step": 1082 }, { "epoch": 1.2553792653770919, "grad_norm": 0.2717812657356262, "learning_rate": 4.3775862068965516e-05, "loss": 0.118, "step": 1083 }, { "epoch": 1.256538433673839, "grad_norm": 0.23932139575481415, "learning_rate": 4.377011494252874e-05, "loss": 0.1171, "step": 1084 }, { "epoch": 1.257697601970586, "grad_norm": 0.25386103987693787, "learning_rate": 4.376436781609196e-05, "loss": 0.127, "step": 1085 }, { "epoch": 1.2588567702673332, "grad_norm": 0.25412800908088684, "learning_rate": 4.3758620689655174e-05, "loss": 0.1394, "step": 1086 }, { "epoch": 1.2600159385640803, "grad_norm": 0.30397337675094604, "learning_rate": 4.3752873563218395e-05, "loss": 0.1087, "step": 1087 }, { "epoch": 1.2611751068608275, "grad_norm": 0.2501704692840576, "learning_rate": 4.374712643678161e-05, "loss": 0.1342, "step": 1088 }, { "epoch": 1.2623342751575746, "grad_norm": 0.2684697210788727, "learning_rate": 4.374137931034483e-05, "loss": 0.1225, "step": 1089 }, { "epoch": 1.2634934434543215, "grad_norm": 0.2351657599210739, "learning_rate": 4.373563218390805e-05, "loss": 0.1106, "step": 1090 }, { "epoch": 1.2646526117510686, "grad_norm": 0.3316408097743988, "learning_rate": 4.372988505747126e-05, "loss": 0.1367, "step": 1091 }, { "epoch": 1.2658117800478157, "grad_norm": 0.2677469849586487, "learning_rate": 4.3724137931034484e-05, "loss": 0.1221, "step": 1092 }, { "epoch": 1.2669709483445628, "grad_norm": 0.5877587199211121, "learning_rate": 4.3718390804597705e-05, "loss": 0.1538, "step": 1093 }, { "epoch": 1.26813011664131, "grad_norm": 0.2598472833633423, "learning_rate": 4.371264367816092e-05, "loss": 0.1325, "step": 1094 }, { "epoch": 1.2692892849380568, "grad_norm": 0.3260670602321625, "learning_rate": 4.370689655172414e-05, "loss": 0.1372, "step": 1095 }, { "epoch": 1.270448453234804, "grad_norm": 0.24232131242752075, "learning_rate": 4.370114942528736e-05, "loss": 0.1255, "step": 1096 }, { "epoch": 1.271607621531551, "grad_norm": 0.3213041126728058, "learning_rate": 4.369540229885058e-05, "loss": 0.1325, "step": 1097 }, { "epoch": 1.2727667898282982, "grad_norm": 0.2717669904232025, "learning_rate": 4.368965517241379e-05, "loss": 0.1447, "step": 1098 }, { "epoch": 1.2739259581250453, "grad_norm": 0.2611459791660309, "learning_rate": 4.3683908045977015e-05, "loss": 0.1349, "step": 1099 }, { "epoch": 1.2750851264217924, "grad_norm": 0.21271906793117523, "learning_rate": 4.367816091954024e-05, "loss": 0.1062, "step": 1100 }, { "epoch": 1.2762442947185395, "grad_norm": 0.28656983375549316, "learning_rate": 4.367241379310345e-05, "loss": 0.1535, "step": 1101 }, { "epoch": 1.2774034630152866, "grad_norm": 0.24881134927272797, "learning_rate": 4.3666666666666666e-05, "loss": 0.132, "step": 1102 }, { "epoch": 1.2785626313120337, "grad_norm": 0.3379434645175934, "learning_rate": 4.366091954022989e-05, "loss": 0.1276, "step": 1103 }, { "epoch": 1.2797217996087806, "grad_norm": 0.32010799646377563, "learning_rate": 4.36551724137931e-05, "loss": 0.133, "step": 1104 }, { "epoch": 1.2808809679055277, "grad_norm": 0.22083823382854462, "learning_rate": 4.3649425287356325e-05, "loss": 0.1258, "step": 1105 }, { "epoch": 1.2820401362022749, "grad_norm": 0.27177348732948303, "learning_rate": 4.3643678160919546e-05, "loss": 0.1299, "step": 1106 }, { "epoch": 1.283199304499022, "grad_norm": 0.2737819254398346, "learning_rate": 4.363793103448276e-05, "loss": 0.1322, "step": 1107 }, { "epoch": 1.284358472795769, "grad_norm": 0.2882055640220642, "learning_rate": 4.363218390804598e-05, "loss": 0.1317, "step": 1108 }, { "epoch": 1.285517641092516, "grad_norm": 0.24766948819160461, "learning_rate": 4.36264367816092e-05, "loss": 0.1247, "step": 1109 }, { "epoch": 1.286676809389263, "grad_norm": 0.27174562215805054, "learning_rate": 4.362068965517241e-05, "loss": 0.1404, "step": 1110 }, { "epoch": 1.2878359776860102, "grad_norm": 0.25926515460014343, "learning_rate": 4.3614942528735634e-05, "loss": 0.1275, "step": 1111 }, { "epoch": 1.2889951459827573, "grad_norm": 0.2558022439479828, "learning_rate": 4.360919540229885e-05, "loss": 0.1224, "step": 1112 }, { "epoch": 1.2901543142795044, "grad_norm": 0.24418483674526215, "learning_rate": 4.360344827586207e-05, "loss": 0.1274, "step": 1113 }, { "epoch": 1.2913134825762516, "grad_norm": 0.2943531274795532, "learning_rate": 4.359770114942529e-05, "loss": 0.1242, "step": 1114 }, { "epoch": 1.2924726508729987, "grad_norm": 0.24752426147460938, "learning_rate": 4.359195402298851e-05, "loss": 0.1336, "step": 1115 }, { "epoch": 1.2936318191697458, "grad_norm": 0.24908795952796936, "learning_rate": 4.358620689655173e-05, "loss": 0.1176, "step": 1116 }, { "epoch": 1.294790987466493, "grad_norm": 0.290752649307251, "learning_rate": 4.3580459770114944e-05, "loss": 0.1532, "step": 1117 }, { "epoch": 1.2959501557632398, "grad_norm": 0.2473299503326416, "learning_rate": 4.357471264367816e-05, "loss": 0.1166, "step": 1118 }, { "epoch": 1.297109324059987, "grad_norm": 0.24173453450202942, "learning_rate": 4.356896551724138e-05, "loss": 0.1069, "step": 1119 }, { "epoch": 1.298268492356734, "grad_norm": 0.2775953412055969, "learning_rate": 4.35632183908046e-05, "loss": 0.1163, "step": 1120 }, { "epoch": 1.2994276606534811, "grad_norm": 0.2543400824069977, "learning_rate": 4.355747126436782e-05, "loss": 0.1359, "step": 1121 }, { "epoch": 1.3005868289502283, "grad_norm": 0.2600906193256378, "learning_rate": 4.355172413793104e-05, "loss": 0.1297, "step": 1122 }, { "epoch": 1.3017459972469754, "grad_norm": 0.21949701011180878, "learning_rate": 4.3545977011494254e-05, "loss": 0.1223, "step": 1123 }, { "epoch": 1.3029051655437223, "grad_norm": 0.2765035629272461, "learning_rate": 4.354022988505747e-05, "loss": 0.1302, "step": 1124 }, { "epoch": 1.3040643338404694, "grad_norm": 0.2629549205303192, "learning_rate": 4.353448275862069e-05, "loss": 0.1321, "step": 1125 }, { "epoch": 1.3052235021372165, "grad_norm": 0.278767853975296, "learning_rate": 4.352873563218391e-05, "loss": 0.1252, "step": 1126 }, { "epoch": 1.3063826704339636, "grad_norm": 0.3192477226257324, "learning_rate": 4.3522988505747134e-05, "loss": 0.1559, "step": 1127 }, { "epoch": 1.3075418387307107, "grad_norm": 0.24097472429275513, "learning_rate": 4.351724137931035e-05, "loss": 0.1239, "step": 1128 }, { "epoch": 1.3087010070274578, "grad_norm": 0.27867624163627625, "learning_rate": 4.3511494252873564e-05, "loss": 0.142, "step": 1129 }, { "epoch": 1.309860175324205, "grad_norm": 0.2419125735759735, "learning_rate": 4.3505747126436785e-05, "loss": 0.1126, "step": 1130 }, { "epoch": 1.311019343620952, "grad_norm": 0.24018104374408722, "learning_rate": 4.35e-05, "loss": 0.1324, "step": 1131 }, { "epoch": 1.3121785119176992, "grad_norm": 0.2754385769367218, "learning_rate": 4.3494252873563215e-05, "loss": 0.1422, "step": 1132 }, { "epoch": 1.313337680214446, "grad_norm": 0.24360622465610504, "learning_rate": 4.348850574712644e-05, "loss": 0.1409, "step": 1133 }, { "epoch": 1.3144968485111932, "grad_norm": 0.2462378591299057, "learning_rate": 4.348275862068966e-05, "loss": 0.1299, "step": 1134 }, { "epoch": 1.3156560168079403, "grad_norm": 0.22275815904140472, "learning_rate": 4.347701149425288e-05, "loss": 0.1283, "step": 1135 }, { "epoch": 1.3168151851046874, "grad_norm": 0.388450026512146, "learning_rate": 4.3471264367816095e-05, "loss": 0.152, "step": 1136 }, { "epoch": 1.3179743534014345, "grad_norm": 0.24839439988136292, "learning_rate": 4.346551724137931e-05, "loss": 0.1333, "step": 1137 }, { "epoch": 1.3191335216981814, "grad_norm": 0.24341842532157898, "learning_rate": 4.345977011494253e-05, "loss": 0.1176, "step": 1138 }, { "epoch": 1.3202926899949285, "grad_norm": 0.22959062457084656, "learning_rate": 4.3454022988505747e-05, "loss": 0.1337, "step": 1139 }, { "epoch": 1.3214518582916757, "grad_norm": 0.2183791995048523, "learning_rate": 4.344827586206897e-05, "loss": 0.1157, "step": 1140 }, { "epoch": 1.3226110265884228, "grad_norm": 0.3063965141773224, "learning_rate": 4.344252873563219e-05, "loss": 0.1365, "step": 1141 }, { "epoch": 1.32377019488517, "grad_norm": 0.24136781692504883, "learning_rate": 4.3436781609195405e-05, "loss": 0.1255, "step": 1142 }, { "epoch": 1.324929363181917, "grad_norm": 0.21908266842365265, "learning_rate": 4.343103448275862e-05, "loss": 0.1144, "step": 1143 }, { "epoch": 1.3260885314786641, "grad_norm": 0.2468622326850891, "learning_rate": 4.342528735632184e-05, "loss": 0.1298, "step": 1144 }, { "epoch": 1.3272476997754112, "grad_norm": 0.28959596157073975, "learning_rate": 4.3419540229885056e-05, "loss": 0.1318, "step": 1145 }, { "epoch": 1.3284068680721584, "grad_norm": 0.2605699300765991, "learning_rate": 4.341379310344828e-05, "loss": 0.137, "step": 1146 }, { "epoch": 1.3295660363689052, "grad_norm": 0.3156377971172333, "learning_rate": 4.34080459770115e-05, "loss": 0.1417, "step": 1147 }, { "epoch": 1.3307252046656524, "grad_norm": 0.24471645057201385, "learning_rate": 4.3402298850574715e-05, "loss": 0.1268, "step": 1148 }, { "epoch": 1.3318843729623995, "grad_norm": 0.24388937652111053, "learning_rate": 4.3396551724137936e-05, "loss": 0.13, "step": 1149 }, { "epoch": 1.3330435412591466, "grad_norm": 0.21283157169818878, "learning_rate": 4.339080459770115e-05, "loss": 0.1213, "step": 1150 }, { "epoch": 1.3342027095558937, "grad_norm": 0.232645183801651, "learning_rate": 4.3385057471264366e-05, "loss": 0.1245, "step": 1151 }, { "epoch": 1.3353618778526406, "grad_norm": 0.25331243872642517, "learning_rate": 4.337931034482759e-05, "loss": 0.1232, "step": 1152 }, { "epoch": 1.3365210461493877, "grad_norm": 0.24029718339443207, "learning_rate": 4.33735632183908e-05, "loss": 0.1161, "step": 1153 }, { "epoch": 1.3376802144461348, "grad_norm": 0.2674920856952667, "learning_rate": 4.336781609195403e-05, "loss": 0.1361, "step": 1154 }, { "epoch": 1.338839382742882, "grad_norm": 0.19390049576759338, "learning_rate": 4.3362068965517246e-05, "loss": 0.1138, "step": 1155 }, { "epoch": 1.339998551039629, "grad_norm": 0.2703855633735657, "learning_rate": 4.335632183908046e-05, "loss": 0.1334, "step": 1156 }, { "epoch": 1.3411577193363762, "grad_norm": 0.23119378089904785, "learning_rate": 4.335057471264368e-05, "loss": 0.1278, "step": 1157 }, { "epoch": 1.3423168876331233, "grad_norm": 0.22249457240104675, "learning_rate": 4.33448275862069e-05, "loss": 0.1122, "step": 1158 }, { "epoch": 1.3434760559298704, "grad_norm": 0.22288426756858826, "learning_rate": 4.333908045977011e-05, "loss": 0.1251, "step": 1159 }, { "epoch": 1.3446352242266175, "grad_norm": 0.22571322321891785, "learning_rate": 4.3333333333333334e-05, "loss": 0.1167, "step": 1160 }, { "epoch": 1.3457943925233644, "grad_norm": 0.25278669595718384, "learning_rate": 4.3327586206896556e-05, "loss": 0.1247, "step": 1161 }, { "epoch": 1.3469535608201115, "grad_norm": 0.24313043057918549, "learning_rate": 4.332183908045977e-05, "loss": 0.132, "step": 1162 }, { "epoch": 1.3481127291168586, "grad_norm": 0.26298508048057556, "learning_rate": 4.331609195402299e-05, "loss": 0.1262, "step": 1163 }, { "epoch": 1.3492718974136058, "grad_norm": 0.24116156995296478, "learning_rate": 4.331034482758621e-05, "loss": 0.1313, "step": 1164 }, { "epoch": 1.3504310657103529, "grad_norm": 0.24901200830936432, "learning_rate": 4.330459770114943e-05, "loss": 0.1097, "step": 1165 }, { "epoch": 1.3515902340071, "grad_norm": 0.30485498905181885, "learning_rate": 4.3298850574712644e-05, "loss": 0.1178, "step": 1166 }, { "epoch": 1.3527494023038469, "grad_norm": 0.3070286810398102, "learning_rate": 4.3293103448275865e-05, "loss": 0.1386, "step": 1167 }, { "epoch": 1.353908570600594, "grad_norm": 0.27904579043388367, "learning_rate": 4.328735632183909e-05, "loss": 0.1504, "step": 1168 }, { "epoch": 1.3550677388973411, "grad_norm": 0.28574293851852417, "learning_rate": 4.32816091954023e-05, "loss": 0.1273, "step": 1169 }, { "epoch": 1.3562269071940882, "grad_norm": 0.27783069014549255, "learning_rate": 4.327586206896552e-05, "loss": 0.13, "step": 1170 }, { "epoch": 1.3573860754908353, "grad_norm": 0.21755783259868622, "learning_rate": 4.327011494252874e-05, "loss": 0.1189, "step": 1171 }, { "epoch": 1.3585452437875825, "grad_norm": 0.27805066108703613, "learning_rate": 4.3264367816091954e-05, "loss": 0.1295, "step": 1172 }, { "epoch": 1.3597044120843296, "grad_norm": 0.26228439807891846, "learning_rate": 4.3258620689655175e-05, "loss": 0.1312, "step": 1173 }, { "epoch": 1.3608635803810767, "grad_norm": 0.31947600841522217, "learning_rate": 4.325287356321839e-05, "loss": 0.1407, "step": 1174 }, { "epoch": 1.3620227486778236, "grad_norm": 0.2713332176208496, "learning_rate": 4.324712643678161e-05, "loss": 0.1229, "step": 1175 }, { "epoch": 1.3631819169745707, "grad_norm": 0.3652999699115753, "learning_rate": 4.3241379310344833e-05, "loss": 0.133, "step": 1176 }, { "epoch": 1.3643410852713178, "grad_norm": 0.26640525460243225, "learning_rate": 4.323563218390805e-05, "loss": 0.1254, "step": 1177 }, { "epoch": 1.365500253568065, "grad_norm": 0.24308764934539795, "learning_rate": 4.322988505747126e-05, "loss": 0.1311, "step": 1178 }, { "epoch": 1.366659421864812, "grad_norm": 0.2529830038547516, "learning_rate": 4.3224137931034485e-05, "loss": 0.1257, "step": 1179 }, { "epoch": 1.3678185901615592, "grad_norm": 0.2467876523733139, "learning_rate": 4.32183908045977e-05, "loss": 0.1387, "step": 1180 }, { "epoch": 1.368977758458306, "grad_norm": 0.27905914187431335, "learning_rate": 4.321264367816092e-05, "loss": 0.119, "step": 1181 }, { "epoch": 1.3701369267550532, "grad_norm": 0.2613096535205841, "learning_rate": 4.320689655172414e-05, "loss": 0.1392, "step": 1182 }, { "epoch": 1.3712960950518003, "grad_norm": 0.3503858745098114, "learning_rate": 4.320114942528736e-05, "loss": 0.1457, "step": 1183 }, { "epoch": 1.3724552633485474, "grad_norm": 0.26688286662101746, "learning_rate": 4.319540229885058e-05, "loss": 0.141, "step": 1184 }, { "epoch": 1.3736144316452945, "grad_norm": 0.2871832251548767, "learning_rate": 4.3189655172413795e-05, "loss": 0.1323, "step": 1185 }, { "epoch": 1.3747735999420416, "grad_norm": 0.27119460701942444, "learning_rate": 4.318390804597701e-05, "loss": 0.1207, "step": 1186 }, { "epoch": 1.3759327682387887, "grad_norm": 0.23951269686222076, "learning_rate": 4.317816091954023e-05, "loss": 0.1135, "step": 1187 }, { "epoch": 1.3770919365355359, "grad_norm": 0.221263587474823, "learning_rate": 4.317241379310345e-05, "loss": 0.1169, "step": 1188 }, { "epoch": 1.378251104832283, "grad_norm": 0.2704179883003235, "learning_rate": 4.316666666666667e-05, "loss": 0.1341, "step": 1189 }, { "epoch": 1.3794102731290299, "grad_norm": 0.2637876272201538, "learning_rate": 4.316091954022989e-05, "loss": 0.13, "step": 1190 }, { "epoch": 1.380569441425777, "grad_norm": 0.2561126947402954, "learning_rate": 4.3155172413793104e-05, "loss": 0.128, "step": 1191 }, { "epoch": 1.381728609722524, "grad_norm": 0.2545618712902069, "learning_rate": 4.3149425287356326e-05, "loss": 0.1245, "step": 1192 }, { "epoch": 1.3828877780192712, "grad_norm": 0.20275923609733582, "learning_rate": 4.314367816091954e-05, "loss": 0.1168, "step": 1193 }, { "epoch": 1.3840469463160183, "grad_norm": 0.2271161824464798, "learning_rate": 4.3137931034482756e-05, "loss": 0.1326, "step": 1194 }, { "epoch": 1.3852061146127652, "grad_norm": 0.21496017277240753, "learning_rate": 4.3132183908045984e-05, "loss": 0.1248, "step": 1195 }, { "epoch": 1.3863652829095123, "grad_norm": 0.2341107428073883, "learning_rate": 4.31264367816092e-05, "loss": 0.1268, "step": 1196 }, { "epoch": 1.3875244512062594, "grad_norm": 0.2202032208442688, "learning_rate": 4.3120689655172414e-05, "loss": 0.1246, "step": 1197 }, { "epoch": 1.3886836195030066, "grad_norm": 0.2526317834854126, "learning_rate": 4.3114942528735636e-05, "loss": 0.1276, "step": 1198 }, { "epoch": 1.3898427877997537, "grad_norm": 0.29777175188064575, "learning_rate": 4.310919540229885e-05, "loss": 0.119, "step": 1199 }, { "epoch": 1.3910019560965008, "grad_norm": 0.28763195872306824, "learning_rate": 4.3103448275862066e-05, "loss": 0.1363, "step": 1200 }, { "epoch": 1.392161124393248, "grad_norm": 0.22191934287548065, "learning_rate": 4.309770114942529e-05, "loss": 0.1113, "step": 1201 }, { "epoch": 1.393320292689995, "grad_norm": 0.36686575412750244, "learning_rate": 4.309195402298851e-05, "loss": 0.1451, "step": 1202 }, { "epoch": 1.3944794609867421, "grad_norm": 0.2628101110458374, "learning_rate": 4.308620689655173e-05, "loss": 0.1255, "step": 1203 }, { "epoch": 1.395638629283489, "grad_norm": 0.2633892893791199, "learning_rate": 4.3080459770114946e-05, "loss": 0.1391, "step": 1204 }, { "epoch": 1.3967977975802361, "grad_norm": 0.28212958574295044, "learning_rate": 4.307471264367816e-05, "loss": 0.1383, "step": 1205 }, { "epoch": 1.3979569658769833, "grad_norm": 0.24369271099567413, "learning_rate": 4.306896551724138e-05, "loss": 0.1227, "step": 1206 }, { "epoch": 1.3991161341737304, "grad_norm": 0.23191668093204498, "learning_rate": 4.30632183908046e-05, "loss": 0.1255, "step": 1207 }, { "epoch": 1.4002753024704775, "grad_norm": 0.2150292694568634, "learning_rate": 4.305747126436782e-05, "loss": 0.1172, "step": 1208 }, { "epoch": 1.4014344707672244, "grad_norm": 0.2306421399116516, "learning_rate": 4.305172413793104e-05, "loss": 0.1213, "step": 1209 }, { "epoch": 1.4025936390639715, "grad_norm": 0.22703979909420013, "learning_rate": 4.3045977011494255e-05, "loss": 0.1059, "step": 1210 }, { "epoch": 1.4037528073607186, "grad_norm": 0.2609024941921234, "learning_rate": 4.304022988505748e-05, "loss": 0.1393, "step": 1211 }, { "epoch": 1.4049119756574657, "grad_norm": 0.23271793127059937, "learning_rate": 4.303448275862069e-05, "loss": 0.1306, "step": 1212 }, { "epoch": 1.4060711439542128, "grad_norm": 0.25955161452293396, "learning_rate": 4.302873563218391e-05, "loss": 0.1345, "step": 1213 }, { "epoch": 1.40723031225096, "grad_norm": 0.20886264741420746, "learning_rate": 4.302298850574713e-05, "loss": 0.1255, "step": 1214 }, { "epoch": 1.408389480547707, "grad_norm": 0.2347596287727356, "learning_rate": 4.301724137931035e-05, "loss": 0.1219, "step": 1215 }, { "epoch": 1.4095486488444542, "grad_norm": 0.25307637453079224, "learning_rate": 4.3011494252873565e-05, "loss": 0.1122, "step": 1216 }, { "epoch": 1.4107078171412013, "grad_norm": 0.28534573316574097, "learning_rate": 4.300574712643679e-05, "loss": 0.1356, "step": 1217 }, { "epoch": 1.4118669854379482, "grad_norm": 0.2502347528934479, "learning_rate": 4.3e-05, "loss": 0.1256, "step": 1218 }, { "epoch": 1.4130261537346953, "grad_norm": 0.31428301334381104, "learning_rate": 4.2994252873563217e-05, "loss": 0.1375, "step": 1219 }, { "epoch": 1.4141853220314424, "grad_norm": 0.23887260258197784, "learning_rate": 4.298850574712644e-05, "loss": 0.127, "step": 1220 }, { "epoch": 1.4153444903281895, "grad_norm": 0.2826530933380127, "learning_rate": 4.298275862068965e-05, "loss": 0.1431, "step": 1221 }, { "epoch": 1.4165036586249367, "grad_norm": 0.22156329452991486, "learning_rate": 4.2977011494252875e-05, "loss": 0.1132, "step": 1222 }, { "epoch": 1.4176628269216838, "grad_norm": 0.23711679875850677, "learning_rate": 4.2971264367816096e-05, "loss": 0.1234, "step": 1223 }, { "epoch": 1.4188219952184307, "grad_norm": 0.21596437692642212, "learning_rate": 4.296551724137931e-05, "loss": 0.1195, "step": 1224 }, { "epoch": 1.4199811635151778, "grad_norm": 0.26402637362480164, "learning_rate": 4.295977011494253e-05, "loss": 0.1268, "step": 1225 }, { "epoch": 1.421140331811925, "grad_norm": 0.20864778757095337, "learning_rate": 4.295402298850575e-05, "loss": 0.1287, "step": 1226 }, { "epoch": 1.422299500108672, "grad_norm": 0.23605617880821228, "learning_rate": 4.294827586206896e-05, "loss": 0.1432, "step": 1227 }, { "epoch": 1.4234586684054191, "grad_norm": 0.2601798176765442, "learning_rate": 4.2942528735632185e-05, "loss": 0.1317, "step": 1228 }, { "epoch": 1.4246178367021662, "grad_norm": 0.2347661405801773, "learning_rate": 4.2936781609195406e-05, "loss": 0.1233, "step": 1229 }, { "epoch": 1.4257770049989134, "grad_norm": 0.22487115859985352, "learning_rate": 4.293103448275863e-05, "loss": 0.1314, "step": 1230 }, { "epoch": 1.4269361732956605, "grad_norm": 0.2523593306541443, "learning_rate": 4.292528735632184e-05, "loss": 0.127, "step": 1231 }, { "epoch": 1.4280953415924076, "grad_norm": 0.25972914695739746, "learning_rate": 4.291954022988506e-05, "loss": 0.1404, "step": 1232 }, { "epoch": 1.4292545098891545, "grad_norm": 0.2787970006465912, "learning_rate": 4.291379310344828e-05, "loss": 0.1309, "step": 1233 }, { "epoch": 1.4304136781859016, "grad_norm": 0.3272540271282196, "learning_rate": 4.2908045977011494e-05, "loss": 0.1271, "step": 1234 }, { "epoch": 1.4315728464826487, "grad_norm": 0.302356094121933, "learning_rate": 4.290229885057471e-05, "loss": 0.1341, "step": 1235 }, { "epoch": 1.4327320147793958, "grad_norm": 0.19032296538352966, "learning_rate": 4.289655172413794e-05, "loss": 0.1179, "step": 1236 }, { "epoch": 1.433891183076143, "grad_norm": 0.28379547595977783, "learning_rate": 4.289080459770115e-05, "loss": 0.127, "step": 1237 }, { "epoch": 1.4350503513728898, "grad_norm": 0.25689971446990967, "learning_rate": 4.288505747126437e-05, "loss": 0.1253, "step": 1238 }, { "epoch": 1.436209519669637, "grad_norm": 0.235792875289917, "learning_rate": 4.287931034482759e-05, "loss": 0.1287, "step": 1239 }, { "epoch": 1.437368687966384, "grad_norm": 0.24741043150424957, "learning_rate": 4.2873563218390804e-05, "loss": 0.1317, "step": 1240 }, { "epoch": 1.4385278562631312, "grad_norm": 0.25098156929016113, "learning_rate": 4.2867816091954026e-05, "loss": 0.1291, "step": 1241 }, { "epoch": 1.4396870245598783, "grad_norm": 0.26148924231529236, "learning_rate": 4.286206896551724e-05, "loss": 0.1356, "step": 1242 }, { "epoch": 1.4408461928566254, "grad_norm": 0.27373388409614563, "learning_rate": 4.285632183908046e-05, "loss": 0.1457, "step": 1243 }, { "epoch": 1.4420053611533725, "grad_norm": 0.23837333917617798, "learning_rate": 4.2850574712643684e-05, "loss": 0.132, "step": 1244 }, { "epoch": 1.4431645294501196, "grad_norm": 0.2650034725666046, "learning_rate": 4.28448275862069e-05, "loss": 0.1317, "step": 1245 }, { "epoch": 1.4443236977468668, "grad_norm": 0.2445426732301712, "learning_rate": 4.2839080459770114e-05, "loss": 0.1183, "step": 1246 }, { "epoch": 1.4454828660436136, "grad_norm": 0.2151249349117279, "learning_rate": 4.2833333333333335e-05, "loss": 0.1185, "step": 1247 }, { "epoch": 1.4466420343403608, "grad_norm": 0.2612224817276001, "learning_rate": 4.282758620689655e-05, "loss": 0.1225, "step": 1248 }, { "epoch": 1.4478012026371079, "grad_norm": 0.2132655531167984, "learning_rate": 4.282183908045977e-05, "loss": 0.1083, "step": 1249 }, { "epoch": 1.448960370933855, "grad_norm": 0.28365272283554077, "learning_rate": 4.2816091954022994e-05, "loss": 0.1269, "step": 1250 }, { "epoch": 1.450119539230602, "grad_norm": 0.24323928356170654, "learning_rate": 4.281034482758621e-05, "loss": 0.138, "step": 1251 }, { "epoch": 1.451278707527349, "grad_norm": 0.2997300624847412, "learning_rate": 4.280459770114943e-05, "loss": 0.1181, "step": 1252 }, { "epoch": 1.4524378758240961, "grad_norm": 0.28692933917045593, "learning_rate": 4.2798850574712645e-05, "loss": 0.136, "step": 1253 }, { "epoch": 1.4535970441208432, "grad_norm": 0.3969144821166992, "learning_rate": 4.279310344827586e-05, "loss": 0.1519, "step": 1254 }, { "epoch": 1.4547562124175903, "grad_norm": 0.29358184337615967, "learning_rate": 4.278735632183908e-05, "loss": 0.1243, "step": 1255 }, { "epoch": 1.4559153807143375, "grad_norm": 0.2625267803668976, "learning_rate": 4.2781609195402303e-05, "loss": 0.1228, "step": 1256 }, { "epoch": 1.4570745490110846, "grad_norm": 0.24338027834892273, "learning_rate": 4.2775862068965525e-05, "loss": 0.1182, "step": 1257 }, { "epoch": 1.4582337173078317, "grad_norm": 0.25080496072769165, "learning_rate": 4.277011494252874e-05, "loss": 0.1171, "step": 1258 }, { "epoch": 1.4593928856045788, "grad_norm": 0.2200382798910141, "learning_rate": 4.2764367816091955e-05, "loss": 0.1089, "step": 1259 }, { "epoch": 1.460552053901326, "grad_norm": 0.5557222962379456, "learning_rate": 4.275862068965518e-05, "loss": 0.1408, "step": 1260 }, { "epoch": 1.4617112221980728, "grad_norm": 0.21948528289794922, "learning_rate": 4.275287356321839e-05, "loss": 0.1188, "step": 1261 }, { "epoch": 1.46287039049482, "grad_norm": 0.2826273441314697, "learning_rate": 4.2747126436781606e-05, "loss": 0.1208, "step": 1262 }, { "epoch": 1.464029558791567, "grad_norm": 0.3197879195213318, "learning_rate": 4.274137931034483e-05, "loss": 0.1428, "step": 1263 }, { "epoch": 1.4651887270883142, "grad_norm": 0.26854708790779114, "learning_rate": 4.273563218390805e-05, "loss": 0.1354, "step": 1264 }, { "epoch": 1.4663478953850613, "grad_norm": 0.20866204798221588, "learning_rate": 4.2729885057471265e-05, "loss": 0.1205, "step": 1265 }, { "epoch": 1.4675070636818082, "grad_norm": 0.22703410685062408, "learning_rate": 4.2724137931034486e-05, "loss": 0.1165, "step": 1266 }, { "epoch": 1.4686662319785553, "grad_norm": 0.2805153727531433, "learning_rate": 4.27183908045977e-05, "loss": 0.129, "step": 1267 }, { "epoch": 1.4698254002753024, "grad_norm": 0.25746089220046997, "learning_rate": 4.271264367816092e-05, "loss": 0.113, "step": 1268 }, { "epoch": 1.4709845685720495, "grad_norm": 0.24136294424533844, "learning_rate": 4.270689655172414e-05, "loss": 0.1215, "step": 1269 }, { "epoch": 1.4721437368687966, "grad_norm": 0.21415215730667114, "learning_rate": 4.270114942528736e-05, "loss": 0.113, "step": 1270 }, { "epoch": 1.4733029051655437, "grad_norm": 0.23764324188232422, "learning_rate": 4.269540229885058e-05, "loss": 0.1283, "step": 1271 }, { "epoch": 1.4744620734622909, "grad_norm": 0.3138484060764313, "learning_rate": 4.2689655172413796e-05, "loss": 0.1294, "step": 1272 }, { "epoch": 1.475621241759038, "grad_norm": 0.29593080282211304, "learning_rate": 4.268390804597701e-05, "loss": 0.1109, "step": 1273 }, { "epoch": 1.476780410055785, "grad_norm": 0.19137899577617645, "learning_rate": 4.267816091954023e-05, "loss": 0.1062, "step": 1274 }, { "epoch": 1.477939578352532, "grad_norm": 0.2660485506057739, "learning_rate": 4.267241379310345e-05, "loss": 0.1331, "step": 1275 }, { "epoch": 1.479098746649279, "grad_norm": 0.2462538778781891, "learning_rate": 4.266666666666667e-05, "loss": 0.1241, "step": 1276 }, { "epoch": 1.4802579149460262, "grad_norm": 0.28121456503868103, "learning_rate": 4.266091954022989e-05, "loss": 0.1301, "step": 1277 }, { "epoch": 1.4814170832427733, "grad_norm": 0.2737043797969818, "learning_rate": 4.2655172413793106e-05, "loss": 0.139, "step": 1278 }, { "epoch": 1.4825762515395204, "grad_norm": 0.271465539932251, "learning_rate": 4.264942528735633e-05, "loss": 0.1272, "step": 1279 }, { "epoch": 1.4837354198362676, "grad_norm": 0.35305821895599365, "learning_rate": 4.264367816091954e-05, "loss": 0.13, "step": 1280 }, { "epoch": 1.4848945881330144, "grad_norm": 0.26508283615112305, "learning_rate": 4.263793103448276e-05, "loss": 0.1277, "step": 1281 }, { "epoch": 1.4860537564297616, "grad_norm": 0.2650332450866699, "learning_rate": 4.263218390804598e-05, "loss": 0.1276, "step": 1282 }, { "epoch": 1.4872129247265087, "grad_norm": 0.234884575009346, "learning_rate": 4.2626436781609194e-05, "loss": 0.1385, "step": 1283 }, { "epoch": 1.4883720930232558, "grad_norm": 0.26508384943008423, "learning_rate": 4.2620689655172416e-05, "loss": 0.1308, "step": 1284 }, { "epoch": 1.489531261320003, "grad_norm": 0.2862217426300049, "learning_rate": 4.261494252873564e-05, "loss": 0.1252, "step": 1285 }, { "epoch": 1.49069042961675, "grad_norm": 0.2019922137260437, "learning_rate": 4.260919540229885e-05, "loss": 0.1138, "step": 1286 }, { "epoch": 1.4918495979134971, "grad_norm": 0.278028279542923, "learning_rate": 4.2603448275862074e-05, "loss": 0.1363, "step": 1287 }, { "epoch": 1.4930087662102443, "grad_norm": 0.29411014914512634, "learning_rate": 4.259770114942529e-05, "loss": 0.1316, "step": 1288 }, { "epoch": 1.4941679345069914, "grad_norm": 0.22040338814258575, "learning_rate": 4.2591954022988504e-05, "loss": 0.1376, "step": 1289 }, { "epoch": 1.4953271028037383, "grad_norm": 0.2261519730091095, "learning_rate": 4.2586206896551725e-05, "loss": 0.1237, "step": 1290 }, { "epoch": 1.4964862711004854, "grad_norm": 0.3036960959434509, "learning_rate": 4.258045977011495e-05, "loss": 0.126, "step": 1291 }, { "epoch": 1.4976454393972325, "grad_norm": 0.2828342020511627, "learning_rate": 4.257471264367816e-05, "loss": 0.1441, "step": 1292 }, { "epoch": 1.4988046076939796, "grad_norm": 0.2660335302352905, "learning_rate": 4.2568965517241384e-05, "loss": 0.1235, "step": 1293 }, { "epoch": 1.4999637759907267, "grad_norm": 0.22510606050491333, "learning_rate": 4.25632183908046e-05, "loss": 0.1087, "step": 1294 }, { "epoch": 1.5011229442874736, "grad_norm": 0.2649793028831482, "learning_rate": 4.255747126436782e-05, "loss": 0.1258, "step": 1295 }, { "epoch": 1.5022821125842207, "grad_norm": 0.2566182315349579, "learning_rate": 4.2551724137931035e-05, "loss": 0.1272, "step": 1296 }, { "epoch": 1.5034412808809678, "grad_norm": 0.2644808888435364, "learning_rate": 4.254597701149426e-05, "loss": 0.1268, "step": 1297 }, { "epoch": 1.504600449177715, "grad_norm": 0.32291561365127563, "learning_rate": 4.254022988505748e-05, "loss": 0.1329, "step": 1298 }, { "epoch": 1.505759617474462, "grad_norm": 0.34708988666534424, "learning_rate": 4.253448275862069e-05, "loss": 0.1569, "step": 1299 }, { "epoch": 1.5069187857712092, "grad_norm": 0.23418490588665009, "learning_rate": 4.252873563218391e-05, "loss": 0.1303, "step": 1300 }, { "epoch": 1.5080779540679563, "grad_norm": 0.35759875178337097, "learning_rate": 4.252298850574713e-05, "loss": 0.1226, "step": 1301 }, { "epoch": 1.5092371223647034, "grad_norm": 0.255636066198349, "learning_rate": 4.2517241379310345e-05, "loss": 0.12, "step": 1302 }, { "epoch": 1.5103962906614505, "grad_norm": 0.2545788586139679, "learning_rate": 4.251149425287356e-05, "loss": 0.1326, "step": 1303 }, { "epoch": 1.5115554589581977, "grad_norm": 0.2811221480369568, "learning_rate": 4.250574712643678e-05, "loss": 0.1441, "step": 1304 }, { "epoch": 1.5127146272549445, "grad_norm": 0.2685137987136841, "learning_rate": 4.25e-05, "loss": 0.1367, "step": 1305 }, { "epoch": 1.5138737955516917, "grad_norm": 0.26587024331092834, "learning_rate": 4.2494252873563225e-05, "loss": 0.1196, "step": 1306 }, { "epoch": 1.5150329638484388, "grad_norm": 0.296543151140213, "learning_rate": 4.248850574712644e-05, "loss": 0.1502, "step": 1307 }, { "epoch": 1.5161921321451857, "grad_norm": 0.2668646275997162, "learning_rate": 4.2482758620689655e-05, "loss": 0.1351, "step": 1308 }, { "epoch": 1.5173513004419328, "grad_norm": 0.2467673420906067, "learning_rate": 4.2477011494252876e-05, "loss": 0.1281, "step": 1309 }, { "epoch": 1.51851046873868, "grad_norm": 0.25315842032432556, "learning_rate": 4.247126436781609e-05, "loss": 0.1286, "step": 1310 }, { "epoch": 1.519669637035427, "grad_norm": 0.25049182772636414, "learning_rate": 4.246551724137931e-05, "loss": 0.1356, "step": 1311 }, { "epoch": 1.5208288053321741, "grad_norm": 0.23967556655406952, "learning_rate": 4.2459770114942534e-05, "loss": 0.1135, "step": 1312 }, { "epoch": 1.5219879736289212, "grad_norm": 0.2868068218231201, "learning_rate": 4.245402298850575e-05, "loss": 0.1309, "step": 1313 }, { "epoch": 1.5231471419256684, "grad_norm": 0.21902504563331604, "learning_rate": 4.244827586206897e-05, "loss": 0.1156, "step": 1314 }, { "epoch": 1.5243063102224155, "grad_norm": 0.24290820956230164, "learning_rate": 4.2442528735632186e-05, "loss": 0.1296, "step": 1315 }, { "epoch": 1.5254654785191626, "grad_norm": 0.21558572351932526, "learning_rate": 4.24367816091954e-05, "loss": 0.1213, "step": 1316 }, { "epoch": 1.5266246468159097, "grad_norm": 0.28430676460266113, "learning_rate": 4.243103448275862e-05, "loss": 0.1275, "step": 1317 }, { "epoch": 1.5277838151126568, "grad_norm": 0.28674018383026123, "learning_rate": 4.2425287356321844e-05, "loss": 0.1315, "step": 1318 }, { "epoch": 1.5289429834094037, "grad_norm": 0.31648436188697815, "learning_rate": 4.241954022988506e-05, "loss": 0.1457, "step": 1319 }, { "epoch": 1.5301021517061508, "grad_norm": 0.2692152261734009, "learning_rate": 4.241379310344828e-05, "loss": 0.1261, "step": 1320 }, { "epoch": 1.531261320002898, "grad_norm": 0.21987690031528473, "learning_rate": 4.2408045977011496e-05, "loss": 0.1224, "step": 1321 }, { "epoch": 1.532420488299645, "grad_norm": 0.32818254828453064, "learning_rate": 4.240229885057471e-05, "loss": 0.128, "step": 1322 }, { "epoch": 1.533579656596392, "grad_norm": 0.2926555275917053, "learning_rate": 4.239655172413793e-05, "loss": 0.1425, "step": 1323 }, { "epoch": 1.534738824893139, "grad_norm": 0.2183428853750229, "learning_rate": 4.239080459770115e-05, "loss": 0.121, "step": 1324 }, { "epoch": 1.5358979931898862, "grad_norm": 0.26924946904182434, "learning_rate": 4.238505747126437e-05, "loss": 0.1474, "step": 1325 }, { "epoch": 1.5370571614866333, "grad_norm": 0.1868240386247635, "learning_rate": 4.237931034482759e-05, "loss": 0.1293, "step": 1326 }, { "epoch": 1.5382163297833804, "grad_norm": 0.20834980905056, "learning_rate": 4.2373563218390805e-05, "loss": 0.1159, "step": 1327 }, { "epoch": 1.5393754980801275, "grad_norm": 0.2959633767604828, "learning_rate": 4.236781609195403e-05, "loss": 0.1421, "step": 1328 }, { "epoch": 1.5405346663768746, "grad_norm": 0.222950279712677, "learning_rate": 4.236206896551724e-05, "loss": 0.1279, "step": 1329 }, { "epoch": 1.5416938346736218, "grad_norm": 0.2185351401567459, "learning_rate": 4.235632183908046e-05, "loss": 0.1247, "step": 1330 }, { "epoch": 1.5428530029703689, "grad_norm": 0.16755683720111847, "learning_rate": 4.235057471264368e-05, "loss": 0.101, "step": 1331 }, { "epoch": 1.544012171267116, "grad_norm": 0.2677669823169708, "learning_rate": 4.23448275862069e-05, "loss": 0.1339, "step": 1332 }, { "epoch": 1.5451713395638629, "grad_norm": 0.252986341714859, "learning_rate": 4.233908045977012e-05, "loss": 0.1202, "step": 1333 }, { "epoch": 1.54633050786061, "grad_norm": 0.2726978659629822, "learning_rate": 4.233333333333334e-05, "loss": 0.1274, "step": 1334 }, { "epoch": 1.547489676157357, "grad_norm": 0.24727889895439148, "learning_rate": 4.232758620689655e-05, "loss": 0.1292, "step": 1335 }, { "epoch": 1.5486488444541042, "grad_norm": 0.24011863768100739, "learning_rate": 4.2321839080459773e-05, "loss": 0.1145, "step": 1336 }, { "epoch": 1.5498080127508511, "grad_norm": 0.26071444153785706, "learning_rate": 4.231609195402299e-05, "loss": 0.1208, "step": 1337 }, { "epoch": 1.5509671810475982, "grad_norm": 0.2095252424478531, "learning_rate": 4.231034482758621e-05, "loss": 0.1164, "step": 1338 }, { "epoch": 1.5521263493443453, "grad_norm": 0.3264429569244385, "learning_rate": 4.230459770114943e-05, "loss": 0.1344, "step": 1339 }, { "epoch": 1.5532855176410925, "grad_norm": 0.24655984342098236, "learning_rate": 4.2298850574712647e-05, "loss": 0.124, "step": 1340 }, { "epoch": 1.5544446859378396, "grad_norm": 0.28563857078552246, "learning_rate": 4.229310344827586e-05, "loss": 0.1312, "step": 1341 }, { "epoch": 1.5556038542345867, "grad_norm": 0.2881647050380707, "learning_rate": 4.228735632183908e-05, "loss": 0.1418, "step": 1342 }, { "epoch": 1.5567630225313338, "grad_norm": 0.2571274936199188, "learning_rate": 4.22816091954023e-05, "loss": 0.1213, "step": 1343 }, { "epoch": 1.557922190828081, "grad_norm": 0.26905322074890137, "learning_rate": 4.227586206896552e-05, "loss": 0.1294, "step": 1344 }, { "epoch": 1.559081359124828, "grad_norm": 0.3038524389266968, "learning_rate": 4.2270114942528735e-05, "loss": 0.1398, "step": 1345 }, { "epoch": 1.5602405274215752, "grad_norm": 0.31705963611602783, "learning_rate": 4.2264367816091956e-05, "loss": 0.1284, "step": 1346 }, { "epoch": 1.5613996957183223, "grad_norm": 0.24810083210468292, "learning_rate": 4.225862068965518e-05, "loss": 0.126, "step": 1347 }, { "epoch": 1.5625588640150692, "grad_norm": 0.2836195230484009, "learning_rate": 4.225287356321839e-05, "loss": 0.1318, "step": 1348 }, { "epoch": 1.5637180323118163, "grad_norm": 0.26509571075439453, "learning_rate": 4.224712643678161e-05, "loss": 0.1279, "step": 1349 }, { "epoch": 1.5648772006085634, "grad_norm": 0.24432741105556488, "learning_rate": 4.224137931034483e-05, "loss": 0.1238, "step": 1350 }, { "epoch": 1.5660363689053103, "grad_norm": 0.25199276208877563, "learning_rate": 4.2235632183908044e-05, "loss": 0.1153, "step": 1351 }, { "epoch": 1.5671955372020574, "grad_norm": 0.2422485649585724, "learning_rate": 4.2229885057471266e-05, "loss": 0.1222, "step": 1352 }, { "epoch": 1.5683547054988045, "grad_norm": 0.28373682498931885, "learning_rate": 4.222413793103449e-05, "loss": 0.143, "step": 1353 }, { "epoch": 1.5695138737955516, "grad_norm": 0.323743999004364, "learning_rate": 4.22183908045977e-05, "loss": 0.1185, "step": 1354 }, { "epoch": 1.5706730420922987, "grad_norm": 0.27437567710876465, "learning_rate": 4.2212643678160924e-05, "loss": 0.1209, "step": 1355 }, { "epoch": 1.5718322103890459, "grad_norm": 0.20446433126926422, "learning_rate": 4.220689655172414e-05, "loss": 0.1164, "step": 1356 }, { "epoch": 1.572991378685793, "grad_norm": 0.21873438358306885, "learning_rate": 4.2201149425287354e-05, "loss": 0.1209, "step": 1357 }, { "epoch": 1.57415054698254, "grad_norm": 0.2323230654001236, "learning_rate": 4.2195402298850576e-05, "loss": 0.1233, "step": 1358 }, { "epoch": 1.5753097152792872, "grad_norm": 0.2679866552352905, "learning_rate": 4.21896551724138e-05, "loss": 0.1219, "step": 1359 }, { "epoch": 1.5764688835760343, "grad_norm": 0.29342472553253174, "learning_rate": 4.218390804597701e-05, "loss": 0.1449, "step": 1360 }, { "epoch": 1.5776280518727814, "grad_norm": 0.29322659969329834, "learning_rate": 4.2178160919540234e-05, "loss": 0.1494, "step": 1361 }, { "epoch": 1.5787872201695283, "grad_norm": 0.23010887205600739, "learning_rate": 4.217241379310345e-05, "loss": 0.1304, "step": 1362 }, { "epoch": 1.5799463884662754, "grad_norm": 0.20732039213180542, "learning_rate": 4.216666666666667e-05, "loss": 0.113, "step": 1363 }, { "epoch": 1.5811055567630226, "grad_norm": 0.21723432838916779, "learning_rate": 4.2160919540229886e-05, "loss": 0.1193, "step": 1364 }, { "epoch": 1.5822647250597695, "grad_norm": 0.268611878156662, "learning_rate": 4.21551724137931e-05, "loss": 0.1447, "step": 1365 }, { "epoch": 1.5834238933565166, "grad_norm": 0.2262781858444214, "learning_rate": 4.214942528735633e-05, "loss": 0.1188, "step": 1366 }, { "epoch": 1.5845830616532637, "grad_norm": 0.21195755898952484, "learning_rate": 4.2143678160919544e-05, "loss": 0.1156, "step": 1367 }, { "epoch": 1.5857422299500108, "grad_norm": 0.26163250207901, "learning_rate": 4.213793103448276e-05, "loss": 0.1348, "step": 1368 }, { "epoch": 1.586901398246758, "grad_norm": 0.3384718894958496, "learning_rate": 4.213218390804598e-05, "loss": 0.1473, "step": 1369 }, { "epoch": 1.588060566543505, "grad_norm": 0.26248493790626526, "learning_rate": 4.2126436781609195e-05, "loss": 0.1212, "step": 1370 }, { "epoch": 1.5892197348402521, "grad_norm": 0.21204310655593872, "learning_rate": 4.212068965517242e-05, "loss": 0.1131, "step": 1371 }, { "epoch": 1.5903789031369993, "grad_norm": 0.2920737862586975, "learning_rate": 4.211494252873563e-05, "loss": 0.134, "step": 1372 }, { "epoch": 1.5915380714337464, "grad_norm": 0.2683486044406891, "learning_rate": 4.2109195402298854e-05, "loss": 0.1297, "step": 1373 }, { "epoch": 1.5926972397304935, "grad_norm": 0.25629928708076477, "learning_rate": 4.2103448275862075e-05, "loss": 0.1364, "step": 1374 }, { "epoch": 1.5938564080272406, "grad_norm": 0.22611750662326813, "learning_rate": 4.209770114942529e-05, "loss": 0.1189, "step": 1375 }, { "epoch": 1.5950155763239875, "grad_norm": 0.2171134501695633, "learning_rate": 4.2091954022988505e-05, "loss": 0.1224, "step": 1376 }, { "epoch": 1.5961747446207346, "grad_norm": 0.3188816010951996, "learning_rate": 4.208620689655173e-05, "loss": 0.1325, "step": 1377 }, { "epoch": 1.5973339129174817, "grad_norm": 0.25183406472206116, "learning_rate": 4.208045977011494e-05, "loss": 0.116, "step": 1378 }, { "epoch": 1.5984930812142288, "grad_norm": 0.274368017911911, "learning_rate": 4.207471264367816e-05, "loss": 0.1388, "step": 1379 }, { "epoch": 1.5996522495109757, "grad_norm": 0.2406664341688156, "learning_rate": 4.2068965517241385e-05, "loss": 0.1225, "step": 1380 }, { "epoch": 1.6008114178077228, "grad_norm": 0.22044144570827484, "learning_rate": 4.20632183908046e-05, "loss": 0.1212, "step": 1381 }, { "epoch": 1.60197058610447, "grad_norm": 0.2523006200790405, "learning_rate": 4.205747126436782e-05, "loss": 0.1142, "step": 1382 }, { "epoch": 1.603129754401217, "grad_norm": 0.26407095789909363, "learning_rate": 4.2051724137931036e-05, "loss": 0.1252, "step": 1383 }, { "epoch": 1.6042889226979642, "grad_norm": 0.22272075712680817, "learning_rate": 4.204597701149425e-05, "loss": 0.1177, "step": 1384 }, { "epoch": 1.6054480909947113, "grad_norm": 0.2437649518251419, "learning_rate": 4.204022988505747e-05, "loss": 0.1238, "step": 1385 }, { "epoch": 1.6066072592914584, "grad_norm": 0.29245513677597046, "learning_rate": 4.203448275862069e-05, "loss": 0.1389, "step": 1386 }, { "epoch": 1.6077664275882055, "grad_norm": 0.25167253613471985, "learning_rate": 4.202873563218391e-05, "loss": 0.131, "step": 1387 }, { "epoch": 1.6089255958849527, "grad_norm": 0.2610798478126526, "learning_rate": 4.202298850574713e-05, "loss": 0.1415, "step": 1388 }, { "epoch": 1.6100847641816998, "grad_norm": 0.21162807941436768, "learning_rate": 4.2017241379310346e-05, "loss": 0.1296, "step": 1389 }, { "epoch": 1.6112439324784467, "grad_norm": 0.26718345284461975, "learning_rate": 4.201149425287357e-05, "loss": 0.1346, "step": 1390 }, { "epoch": 1.6124031007751938, "grad_norm": 0.2872393727302551, "learning_rate": 4.200574712643678e-05, "loss": 0.1452, "step": 1391 }, { "epoch": 1.613562269071941, "grad_norm": 0.2846381366252899, "learning_rate": 4.2e-05, "loss": 0.1281, "step": 1392 }, { "epoch": 1.614721437368688, "grad_norm": 0.2739090919494629, "learning_rate": 4.199425287356322e-05, "loss": 0.1248, "step": 1393 }, { "epoch": 1.615880605665435, "grad_norm": 0.24870949983596802, "learning_rate": 4.198850574712644e-05, "loss": 0.1377, "step": 1394 }, { "epoch": 1.617039773962182, "grad_norm": 0.26134321093559265, "learning_rate": 4.1982758620689656e-05, "loss": 0.1348, "step": 1395 }, { "epoch": 1.6181989422589291, "grad_norm": 0.2896818220615387, "learning_rate": 4.197701149425288e-05, "loss": 0.1376, "step": 1396 }, { "epoch": 1.6193581105556762, "grad_norm": 0.24523970484733582, "learning_rate": 4.197126436781609e-05, "loss": 0.1289, "step": 1397 }, { "epoch": 1.6205172788524234, "grad_norm": 0.2446785867214203, "learning_rate": 4.196551724137931e-05, "loss": 0.1219, "step": 1398 }, { "epoch": 1.6216764471491705, "grad_norm": 0.2920502722263336, "learning_rate": 4.195977011494253e-05, "loss": 0.1609, "step": 1399 }, { "epoch": 1.6228356154459176, "grad_norm": 0.24321280419826508, "learning_rate": 4.195402298850575e-05, "loss": 0.1217, "step": 1400 }, { "epoch": 1.6239947837426647, "grad_norm": 0.20927608013153076, "learning_rate": 4.194827586206897e-05, "loss": 0.1071, "step": 1401 }, { "epoch": 1.6251539520394118, "grad_norm": 0.2517341077327728, "learning_rate": 4.194252873563219e-05, "loss": 0.1359, "step": 1402 }, { "epoch": 1.626313120336159, "grad_norm": 0.2402840554714203, "learning_rate": 4.19367816091954e-05, "loss": 0.1352, "step": 1403 }, { "epoch": 1.627472288632906, "grad_norm": 0.22628480195999146, "learning_rate": 4.1931034482758624e-05, "loss": 0.1271, "step": 1404 }, { "epoch": 1.628631456929653, "grad_norm": 0.24545376002788544, "learning_rate": 4.192528735632184e-05, "loss": 0.1338, "step": 1405 }, { "epoch": 1.6297906252264, "grad_norm": 0.2462320774793625, "learning_rate": 4.1919540229885054e-05, "loss": 0.1375, "step": 1406 }, { "epoch": 1.6309497935231472, "grad_norm": 0.2645837068557739, "learning_rate": 4.191379310344828e-05, "loss": 0.1327, "step": 1407 }, { "epoch": 1.632108961819894, "grad_norm": 0.2443280965089798, "learning_rate": 4.19080459770115e-05, "loss": 0.1211, "step": 1408 }, { "epoch": 1.6332681301166412, "grad_norm": 0.260450154542923, "learning_rate": 4.190229885057472e-05, "loss": 0.1318, "step": 1409 }, { "epoch": 1.6344272984133883, "grad_norm": 0.22310788929462433, "learning_rate": 4.1896551724137934e-05, "loss": 0.123, "step": 1410 }, { "epoch": 1.6355864667101354, "grad_norm": 0.24180607497692108, "learning_rate": 4.189080459770115e-05, "loss": 0.1214, "step": 1411 }, { "epoch": 1.6367456350068825, "grad_norm": 0.33246803283691406, "learning_rate": 4.188505747126437e-05, "loss": 0.159, "step": 1412 }, { "epoch": 1.6379048033036296, "grad_norm": 0.2154608517885208, "learning_rate": 4.1879310344827585e-05, "loss": 0.1242, "step": 1413 }, { "epoch": 1.6390639716003768, "grad_norm": 0.24036885797977448, "learning_rate": 4.187356321839081e-05, "loss": 0.1097, "step": 1414 }, { "epoch": 1.6402231398971239, "grad_norm": 0.21203725039958954, "learning_rate": 4.186781609195403e-05, "loss": 0.1232, "step": 1415 }, { "epoch": 1.641382308193871, "grad_norm": 0.24248747527599335, "learning_rate": 4.1862068965517243e-05, "loss": 0.1223, "step": 1416 }, { "epoch": 1.642541476490618, "grad_norm": 0.2965581715106964, "learning_rate": 4.185632183908046e-05, "loss": 0.1277, "step": 1417 }, { "epoch": 1.6437006447873652, "grad_norm": 0.24066859483718872, "learning_rate": 4.185057471264368e-05, "loss": 0.1388, "step": 1418 }, { "epoch": 1.644859813084112, "grad_norm": 0.23044906556606293, "learning_rate": 4.1844827586206895e-05, "loss": 0.1173, "step": 1419 }, { "epoch": 1.6460189813808592, "grad_norm": 0.2829865515232086, "learning_rate": 4.1839080459770117e-05, "loss": 0.1264, "step": 1420 }, { "epoch": 1.6471781496776063, "grad_norm": 0.37790974974632263, "learning_rate": 4.183333333333334e-05, "loss": 0.1158, "step": 1421 }, { "epoch": 1.6483373179743535, "grad_norm": 0.23219698667526245, "learning_rate": 4.182758620689655e-05, "loss": 0.1293, "step": 1422 }, { "epoch": 1.6494964862711003, "grad_norm": 0.26675254106521606, "learning_rate": 4.1821839080459775e-05, "loss": 0.1245, "step": 1423 }, { "epoch": 1.6506556545678475, "grad_norm": 0.2584593594074249, "learning_rate": 4.181609195402299e-05, "loss": 0.1132, "step": 1424 }, { "epoch": 1.6518148228645946, "grad_norm": 0.2532110810279846, "learning_rate": 4.1810344827586205e-05, "loss": 0.1265, "step": 1425 }, { "epoch": 1.6529739911613417, "grad_norm": 0.27470529079437256, "learning_rate": 4.1804597701149426e-05, "loss": 0.1422, "step": 1426 }, { "epoch": 1.6541331594580888, "grad_norm": 0.24903489649295807, "learning_rate": 4.179885057471265e-05, "loss": 0.1065, "step": 1427 }, { "epoch": 1.655292327754836, "grad_norm": 0.29945898056030273, "learning_rate": 4.179310344827587e-05, "loss": 0.1418, "step": 1428 }, { "epoch": 1.656451496051583, "grad_norm": 0.19737482070922852, "learning_rate": 4.1787356321839085e-05, "loss": 0.1162, "step": 1429 }, { "epoch": 1.6576106643483302, "grad_norm": 0.31681379675865173, "learning_rate": 4.17816091954023e-05, "loss": 0.1521, "step": 1430 }, { "epoch": 1.6587698326450773, "grad_norm": 0.23246780037879944, "learning_rate": 4.177586206896552e-05, "loss": 0.1156, "step": 1431 }, { "epoch": 1.6599290009418244, "grad_norm": 0.2456778883934021, "learning_rate": 4.1770114942528736e-05, "loss": 0.1194, "step": 1432 }, { "epoch": 1.6610881692385713, "grad_norm": 0.23620393872261047, "learning_rate": 4.176436781609195e-05, "loss": 0.1231, "step": 1433 }, { "epoch": 1.6622473375353184, "grad_norm": 0.20471753180027008, "learning_rate": 4.175862068965517e-05, "loss": 0.109, "step": 1434 }, { "epoch": 1.6634065058320655, "grad_norm": 0.2827279269695282, "learning_rate": 4.1752873563218394e-05, "loss": 0.1467, "step": 1435 }, { "epoch": 1.6645656741288126, "grad_norm": 0.21254539489746094, "learning_rate": 4.1747126436781616e-05, "loss": 0.1182, "step": 1436 }, { "epoch": 1.6657248424255595, "grad_norm": 0.24518734216690063, "learning_rate": 4.174137931034483e-05, "loss": 0.1326, "step": 1437 }, { "epoch": 1.6668840107223066, "grad_norm": 0.2632474899291992, "learning_rate": 4.1735632183908046e-05, "loss": 0.1357, "step": 1438 }, { "epoch": 1.6680431790190537, "grad_norm": 0.23681853711605072, "learning_rate": 4.172988505747127e-05, "loss": 0.1272, "step": 1439 }, { "epoch": 1.6692023473158009, "grad_norm": 0.25653204321861267, "learning_rate": 4.172413793103448e-05, "loss": 0.1305, "step": 1440 }, { "epoch": 1.670361515612548, "grad_norm": 0.22474606335163116, "learning_rate": 4.1718390804597704e-05, "loss": 0.1192, "step": 1441 }, { "epoch": 1.671520683909295, "grad_norm": 0.2875736951828003, "learning_rate": 4.1712643678160926e-05, "loss": 0.1271, "step": 1442 }, { "epoch": 1.6726798522060422, "grad_norm": 0.27800050377845764, "learning_rate": 4.170689655172414e-05, "loss": 0.1348, "step": 1443 }, { "epoch": 1.6738390205027893, "grad_norm": 0.2516680955886841, "learning_rate": 4.1701149425287356e-05, "loss": 0.1128, "step": 1444 }, { "epoch": 1.6749981887995364, "grad_norm": 0.3367115259170532, "learning_rate": 4.169540229885058e-05, "loss": 0.1362, "step": 1445 }, { "epoch": 1.6761573570962836, "grad_norm": 0.3319508731365204, "learning_rate": 4.168965517241379e-05, "loss": 0.1358, "step": 1446 }, { "epoch": 1.6773165253930307, "grad_norm": 0.2614979147911072, "learning_rate": 4.1683908045977014e-05, "loss": 0.1244, "step": 1447 }, { "epoch": 1.6784756936897776, "grad_norm": 0.24296019971370697, "learning_rate": 4.1678160919540235e-05, "loss": 0.1223, "step": 1448 }, { "epoch": 1.6796348619865247, "grad_norm": 0.26165443658828735, "learning_rate": 4.167241379310345e-05, "loss": 0.1326, "step": 1449 }, { "epoch": 1.6807940302832718, "grad_norm": 0.24626079201698303, "learning_rate": 4.166666666666667e-05, "loss": 0.1189, "step": 1450 }, { "epoch": 1.6819531985800187, "grad_norm": 0.24544042348861694, "learning_rate": 4.166091954022989e-05, "loss": 0.1343, "step": 1451 }, { "epoch": 1.6831123668767658, "grad_norm": 0.23547019064426422, "learning_rate": 4.16551724137931e-05, "loss": 0.1176, "step": 1452 }, { "epoch": 1.684271535173513, "grad_norm": 0.2702592611312866, "learning_rate": 4.1649425287356324e-05, "loss": 0.1488, "step": 1453 }, { "epoch": 1.68543070347026, "grad_norm": 0.24035340547561646, "learning_rate": 4.164367816091954e-05, "loss": 0.1348, "step": 1454 }, { "epoch": 1.6865898717670071, "grad_norm": 0.22424210608005524, "learning_rate": 4.163793103448276e-05, "loss": 0.13, "step": 1455 }, { "epoch": 1.6877490400637543, "grad_norm": 0.2859560549259186, "learning_rate": 4.163218390804598e-05, "loss": 0.1486, "step": 1456 }, { "epoch": 1.6889082083605014, "grad_norm": 0.21824337542057037, "learning_rate": 4.16264367816092e-05, "loss": 0.1174, "step": 1457 }, { "epoch": 1.6900673766572485, "grad_norm": 0.24200040102005005, "learning_rate": 4.162068965517242e-05, "loss": 0.1324, "step": 1458 }, { "epoch": 1.6912265449539956, "grad_norm": 0.2552363872528076, "learning_rate": 4.161494252873563e-05, "loss": 0.1313, "step": 1459 }, { "epoch": 1.6923857132507427, "grad_norm": 0.19345875084400177, "learning_rate": 4.160919540229885e-05, "loss": 0.1174, "step": 1460 }, { "epoch": 1.6935448815474898, "grad_norm": 0.25094732642173767, "learning_rate": 4.160344827586207e-05, "loss": 0.1326, "step": 1461 }, { "epoch": 1.6947040498442367, "grad_norm": 0.24605903029441833, "learning_rate": 4.159770114942529e-05, "loss": 0.1155, "step": 1462 }, { "epoch": 1.6958632181409838, "grad_norm": 0.3044376075267792, "learning_rate": 4.1591954022988506e-05, "loss": 0.1273, "step": 1463 }, { "epoch": 1.697022386437731, "grad_norm": 0.25969693064689636, "learning_rate": 4.158620689655173e-05, "loss": 0.1261, "step": 1464 }, { "epoch": 1.6981815547344778, "grad_norm": 0.2134069949388504, "learning_rate": 4.158045977011494e-05, "loss": 0.129, "step": 1465 }, { "epoch": 1.699340723031225, "grad_norm": 0.24624426662921906, "learning_rate": 4.1574712643678165e-05, "loss": 0.1254, "step": 1466 }, { "epoch": 1.700499891327972, "grad_norm": 0.27227187156677246, "learning_rate": 4.156896551724138e-05, "loss": 0.1241, "step": 1467 }, { "epoch": 1.7016590596247192, "grad_norm": 0.2617661952972412, "learning_rate": 4.15632183908046e-05, "loss": 0.1241, "step": 1468 }, { "epoch": 1.7028182279214663, "grad_norm": 0.2572365701198578, "learning_rate": 4.155747126436782e-05, "loss": 0.1308, "step": 1469 }, { "epoch": 1.7039773962182134, "grad_norm": 0.24519048631191254, "learning_rate": 4.155172413793104e-05, "loss": 0.1333, "step": 1470 }, { "epoch": 1.7051365645149605, "grad_norm": 0.23390959203243256, "learning_rate": 4.154597701149425e-05, "loss": 0.1335, "step": 1471 }, { "epoch": 1.7062957328117077, "grad_norm": 0.19565661251544952, "learning_rate": 4.1540229885057474e-05, "loss": 0.1161, "step": 1472 }, { "epoch": 1.7074549011084548, "grad_norm": 0.19742251932621002, "learning_rate": 4.153448275862069e-05, "loss": 0.1153, "step": 1473 }, { "epoch": 1.7086140694052019, "grad_norm": 0.2252473384141922, "learning_rate": 4.152873563218391e-05, "loss": 0.1223, "step": 1474 }, { "epoch": 1.709773237701949, "grad_norm": 0.25856614112854004, "learning_rate": 4.1522988505747126e-05, "loss": 0.1225, "step": 1475 }, { "epoch": 1.710932405998696, "grad_norm": 0.20960158109664917, "learning_rate": 4.151724137931035e-05, "loss": 0.1257, "step": 1476 }, { "epoch": 1.712091574295443, "grad_norm": 0.26653629541397095, "learning_rate": 4.151149425287357e-05, "loss": 0.123, "step": 1477 }, { "epoch": 1.7132507425921901, "grad_norm": 0.19926515221595764, "learning_rate": 4.1505747126436784e-05, "loss": 0.1199, "step": 1478 }, { "epoch": 1.7144099108889372, "grad_norm": 0.2077583223581314, "learning_rate": 4.15e-05, "loss": 0.1212, "step": 1479 }, { "epoch": 1.7155690791856841, "grad_norm": 0.22840292751789093, "learning_rate": 4.149425287356322e-05, "loss": 0.1244, "step": 1480 }, { "epoch": 1.7167282474824312, "grad_norm": 0.26933014392852783, "learning_rate": 4.1488505747126436e-05, "loss": 0.1388, "step": 1481 }, { "epoch": 1.7178874157791784, "grad_norm": 0.2531909942626953, "learning_rate": 4.148275862068966e-05, "loss": 0.128, "step": 1482 }, { "epoch": 1.7190465840759255, "grad_norm": 0.3011757731437683, "learning_rate": 4.147701149425288e-05, "loss": 0.1574, "step": 1483 }, { "epoch": 1.7202057523726726, "grad_norm": 0.27856916189193726, "learning_rate": 4.1471264367816094e-05, "loss": 0.1275, "step": 1484 }, { "epoch": 1.7213649206694197, "grad_norm": 0.23599520325660706, "learning_rate": 4.1465517241379316e-05, "loss": 0.1206, "step": 1485 }, { "epoch": 1.7225240889661668, "grad_norm": 0.2702353000640869, "learning_rate": 4.145977011494253e-05, "loss": 0.1296, "step": 1486 }, { "epoch": 1.723683257262914, "grad_norm": 0.21505098044872284, "learning_rate": 4.1454022988505745e-05, "loss": 0.1131, "step": 1487 }, { "epoch": 1.724842425559661, "grad_norm": 0.27001845836639404, "learning_rate": 4.144827586206897e-05, "loss": 0.1265, "step": 1488 }, { "epoch": 1.7260015938564082, "grad_norm": 0.2392754852771759, "learning_rate": 4.144252873563219e-05, "loss": 0.1159, "step": 1489 }, { "epoch": 1.727160762153155, "grad_norm": 0.24627234041690826, "learning_rate": 4.1436781609195404e-05, "loss": 0.1284, "step": 1490 }, { "epoch": 1.7283199304499022, "grad_norm": 0.23222601413726807, "learning_rate": 4.1431034482758625e-05, "loss": 0.1288, "step": 1491 }, { "epoch": 1.7294790987466493, "grad_norm": 0.2943344712257385, "learning_rate": 4.142528735632184e-05, "loss": 0.1299, "step": 1492 }, { "epoch": 1.7306382670433964, "grad_norm": 0.2248132824897766, "learning_rate": 4.141954022988506e-05, "loss": 0.1191, "step": 1493 }, { "epoch": 1.7317974353401433, "grad_norm": 0.24535109102725983, "learning_rate": 4.141379310344828e-05, "loss": 0.129, "step": 1494 }, { "epoch": 1.7329566036368904, "grad_norm": 0.2560446262359619, "learning_rate": 4.140804597701149e-05, "loss": 0.1343, "step": 1495 }, { "epoch": 1.7341157719336375, "grad_norm": 0.23877164721488953, "learning_rate": 4.1402298850574713e-05, "loss": 0.1026, "step": 1496 }, { "epoch": 1.7352749402303846, "grad_norm": 0.2307940572500229, "learning_rate": 4.1396551724137935e-05, "loss": 0.1158, "step": 1497 }, { "epoch": 1.7364341085271318, "grad_norm": 0.23847486078739166, "learning_rate": 4.139080459770115e-05, "loss": 0.1177, "step": 1498 }, { "epoch": 1.7375932768238789, "grad_norm": 0.2667318880558014, "learning_rate": 4.138505747126437e-05, "loss": 0.1483, "step": 1499 }, { "epoch": 1.738752445120626, "grad_norm": 0.2531675100326538, "learning_rate": 4.1379310344827587e-05, "loss": 0.1209, "step": 1500 }, { "epoch": 1.739911613417373, "grad_norm": 0.4089076817035675, "learning_rate": 4.13735632183908e-05, "loss": 0.1485, "step": 1501 }, { "epoch": 1.7410707817141202, "grad_norm": 0.2458110749721527, "learning_rate": 4.136781609195402e-05, "loss": 0.1252, "step": 1502 }, { "epoch": 1.7422299500108673, "grad_norm": 0.23186376690864563, "learning_rate": 4.1362068965517245e-05, "loss": 0.1292, "step": 1503 }, { "epoch": 1.7433891183076144, "grad_norm": 0.24878862500190735, "learning_rate": 4.1356321839080466e-05, "loss": 0.1307, "step": 1504 }, { "epoch": 1.7445482866043613, "grad_norm": 0.2361961454153061, "learning_rate": 4.135057471264368e-05, "loss": 0.118, "step": 1505 }, { "epoch": 1.7457074549011085, "grad_norm": 0.24874532222747803, "learning_rate": 4.1344827586206896e-05, "loss": 0.1122, "step": 1506 }, { "epoch": 1.7468666231978556, "grad_norm": 0.2600504159927368, "learning_rate": 4.133908045977012e-05, "loss": 0.1229, "step": 1507 }, { "epoch": 1.7480257914946025, "grad_norm": 0.28750351071357727, "learning_rate": 4.133333333333333e-05, "loss": 0.1385, "step": 1508 }, { "epoch": 1.7491849597913496, "grad_norm": 0.26855355501174927, "learning_rate": 4.1327586206896555e-05, "loss": 0.1394, "step": 1509 }, { "epoch": 1.7503441280880967, "grad_norm": 0.28046318888664246, "learning_rate": 4.1321839080459776e-05, "loss": 0.1362, "step": 1510 }, { "epoch": 1.7515032963848438, "grad_norm": 0.27256345748901367, "learning_rate": 4.131609195402299e-05, "loss": 0.1283, "step": 1511 }, { "epoch": 1.752662464681591, "grad_norm": 0.22228793799877167, "learning_rate": 4.131034482758621e-05, "loss": 0.1104, "step": 1512 }, { "epoch": 1.753821632978338, "grad_norm": 0.24911850690841675, "learning_rate": 4.130459770114943e-05, "loss": 0.1212, "step": 1513 }, { "epoch": 1.7549808012750852, "grad_norm": 0.21466509997844696, "learning_rate": 4.129885057471264e-05, "loss": 0.1141, "step": 1514 }, { "epoch": 1.7561399695718323, "grad_norm": 0.2404317706823349, "learning_rate": 4.1293103448275864e-05, "loss": 0.119, "step": 1515 }, { "epoch": 1.7572991378685794, "grad_norm": 0.25505301356315613, "learning_rate": 4.128735632183908e-05, "loss": 0.115, "step": 1516 }, { "epoch": 1.7584583061653265, "grad_norm": 0.2898204028606415, "learning_rate": 4.12816091954023e-05, "loss": 0.1508, "step": 1517 }, { "epoch": 1.7596174744620736, "grad_norm": 0.24544641375541687, "learning_rate": 4.127586206896552e-05, "loss": 0.1278, "step": 1518 }, { "epoch": 1.7607766427588205, "grad_norm": 0.21617454290390015, "learning_rate": 4.127011494252874e-05, "loss": 0.1206, "step": 1519 }, { "epoch": 1.7619358110555676, "grad_norm": 0.22762687504291534, "learning_rate": 4.126436781609195e-05, "loss": 0.1354, "step": 1520 }, { "epoch": 1.7630949793523147, "grad_norm": 0.22020873427391052, "learning_rate": 4.1258620689655174e-05, "loss": 0.1361, "step": 1521 }, { "epoch": 1.7642541476490616, "grad_norm": 0.21312019228935242, "learning_rate": 4.125287356321839e-05, "loss": 0.1258, "step": 1522 }, { "epoch": 1.7654133159458087, "grad_norm": 0.22142820060253143, "learning_rate": 4.124712643678161e-05, "loss": 0.1167, "step": 1523 }, { "epoch": 1.7665724842425559, "grad_norm": 0.23192018270492554, "learning_rate": 4.124137931034483e-05, "loss": 0.1149, "step": 1524 }, { "epoch": 1.767731652539303, "grad_norm": 0.23699016869068146, "learning_rate": 4.123563218390805e-05, "loss": 0.1356, "step": 1525 }, { "epoch": 1.76889082083605, "grad_norm": 0.25063955783843994, "learning_rate": 4.122988505747127e-05, "loss": 0.1232, "step": 1526 }, { "epoch": 1.7700499891327972, "grad_norm": 0.2514595687389374, "learning_rate": 4.1224137931034484e-05, "loss": 0.1332, "step": 1527 }, { "epoch": 1.7712091574295443, "grad_norm": 0.24346187710762024, "learning_rate": 4.12183908045977e-05, "loss": 0.1239, "step": 1528 }, { "epoch": 1.7723683257262914, "grad_norm": 0.22228626906871796, "learning_rate": 4.121264367816092e-05, "loss": 0.1061, "step": 1529 }, { "epoch": 1.7735274940230386, "grad_norm": 0.2860787510871887, "learning_rate": 4.120689655172414e-05, "loss": 0.1254, "step": 1530 }, { "epoch": 1.7746866623197857, "grad_norm": 0.2642548680305481, "learning_rate": 4.1201149425287364e-05, "loss": 0.126, "step": 1531 }, { "epoch": 1.7758458306165328, "grad_norm": 0.2488432377576828, "learning_rate": 4.119540229885058e-05, "loss": 0.1175, "step": 1532 }, { "epoch": 1.7770049989132797, "grad_norm": 0.23602858185768127, "learning_rate": 4.1189655172413794e-05, "loss": 0.1244, "step": 1533 }, { "epoch": 1.7781641672100268, "grad_norm": 0.23683659732341766, "learning_rate": 4.1183908045977015e-05, "loss": 0.1186, "step": 1534 }, { "epoch": 1.779323335506774, "grad_norm": 0.2419913113117218, "learning_rate": 4.117816091954023e-05, "loss": 0.1364, "step": 1535 }, { "epoch": 1.780482503803521, "grad_norm": 0.26852262020111084, "learning_rate": 4.1172413793103445e-05, "loss": 0.1287, "step": 1536 }, { "epoch": 1.781641672100268, "grad_norm": 0.2520958185195923, "learning_rate": 4.116666666666667e-05, "loss": 0.1381, "step": 1537 }, { "epoch": 1.782800840397015, "grad_norm": 0.2294151484966278, "learning_rate": 4.116091954022989e-05, "loss": 0.1458, "step": 1538 }, { "epoch": 1.7839600086937621, "grad_norm": 0.20725025236606598, "learning_rate": 4.11551724137931e-05, "loss": 0.1358, "step": 1539 }, { "epoch": 1.7851191769905093, "grad_norm": 0.22536805272102356, "learning_rate": 4.1149425287356325e-05, "loss": 0.1273, "step": 1540 }, { "epoch": 1.7862783452872564, "grad_norm": 0.2271406203508377, "learning_rate": 4.114367816091954e-05, "loss": 0.1254, "step": 1541 }, { "epoch": 1.7874375135840035, "grad_norm": 0.25207868218421936, "learning_rate": 4.113793103448276e-05, "loss": 0.1325, "step": 1542 }, { "epoch": 1.7885966818807506, "grad_norm": 0.22057871520519257, "learning_rate": 4.1132183908045976e-05, "loss": 0.1287, "step": 1543 }, { "epoch": 1.7897558501774977, "grad_norm": 0.21857698261737823, "learning_rate": 4.11264367816092e-05, "loss": 0.118, "step": 1544 }, { "epoch": 1.7909150184742448, "grad_norm": 0.26904162764549255, "learning_rate": 4.112068965517242e-05, "loss": 0.137, "step": 1545 }, { "epoch": 1.792074186770992, "grad_norm": 0.20811136066913605, "learning_rate": 4.1114942528735635e-05, "loss": 0.1148, "step": 1546 }, { "epoch": 1.7932333550677388, "grad_norm": 0.2898993194103241, "learning_rate": 4.110919540229885e-05, "loss": 0.1396, "step": 1547 }, { "epoch": 1.794392523364486, "grad_norm": 0.2406226396560669, "learning_rate": 4.110344827586207e-05, "loss": 0.1177, "step": 1548 }, { "epoch": 1.795551691661233, "grad_norm": 0.21909652650356293, "learning_rate": 4.1097701149425286e-05, "loss": 0.1221, "step": 1549 }, { "epoch": 1.7967108599579802, "grad_norm": 0.24359643459320068, "learning_rate": 4.109195402298851e-05, "loss": 0.1249, "step": 1550 }, { "epoch": 1.797870028254727, "grad_norm": 0.20829685032367706, "learning_rate": 4.108620689655173e-05, "loss": 0.1155, "step": 1551 }, { "epoch": 1.7990291965514742, "grad_norm": 0.34604066610336304, "learning_rate": 4.1080459770114944e-05, "loss": 0.1366, "step": 1552 }, { "epoch": 1.8001883648482213, "grad_norm": 0.2357710599899292, "learning_rate": 4.1074712643678166e-05, "loss": 0.1238, "step": 1553 }, { "epoch": 1.8013475331449684, "grad_norm": 0.2505914866924286, "learning_rate": 4.106896551724138e-05, "loss": 0.1381, "step": 1554 }, { "epoch": 1.8025067014417155, "grad_norm": 0.3045206665992737, "learning_rate": 4.1063218390804596e-05, "loss": 0.125, "step": 1555 }, { "epoch": 1.8036658697384627, "grad_norm": 0.22790023684501648, "learning_rate": 4.105747126436782e-05, "loss": 0.1189, "step": 1556 }, { "epoch": 1.8048250380352098, "grad_norm": 0.267311155796051, "learning_rate": 4.105172413793103e-05, "loss": 0.133, "step": 1557 }, { "epoch": 1.8059842063319569, "grad_norm": 0.2867993712425232, "learning_rate": 4.1045977011494254e-05, "loss": 0.1197, "step": 1558 }, { "epoch": 1.807143374628704, "grad_norm": 0.2740671634674072, "learning_rate": 4.1040229885057476e-05, "loss": 0.1247, "step": 1559 }, { "epoch": 1.8083025429254511, "grad_norm": 0.23246349394321442, "learning_rate": 4.103448275862069e-05, "loss": 0.1204, "step": 1560 }, { "epoch": 1.8094617112221982, "grad_norm": 0.2556561231613159, "learning_rate": 4.102873563218391e-05, "loss": 0.1381, "step": 1561 }, { "epoch": 1.8106208795189451, "grad_norm": 0.22063712775707245, "learning_rate": 4.102298850574713e-05, "loss": 0.1288, "step": 1562 }, { "epoch": 1.8117800478156922, "grad_norm": 0.26971542835235596, "learning_rate": 4.101724137931034e-05, "loss": 0.1348, "step": 1563 }, { "epoch": 1.8129392161124394, "grad_norm": 0.28952643275260925, "learning_rate": 4.1011494252873564e-05, "loss": 0.1408, "step": 1564 }, { "epoch": 1.8140983844091862, "grad_norm": 0.24396105110645294, "learning_rate": 4.1005747126436786e-05, "loss": 0.1284, "step": 1565 }, { "epoch": 1.8152575527059334, "grad_norm": 0.2277979552745819, "learning_rate": 4.1e-05, "loss": 0.1234, "step": 1566 }, { "epoch": 1.8164167210026805, "grad_norm": 0.1920943409204483, "learning_rate": 4.099425287356322e-05, "loss": 0.1183, "step": 1567 }, { "epoch": 1.8175758892994276, "grad_norm": 0.25110727548599243, "learning_rate": 4.098850574712644e-05, "loss": 0.1338, "step": 1568 }, { "epoch": 1.8187350575961747, "grad_norm": 0.2476564645767212, "learning_rate": 4.098275862068966e-05, "loss": 0.1311, "step": 1569 }, { "epoch": 1.8198942258929218, "grad_norm": 0.2378225028514862, "learning_rate": 4.0977011494252874e-05, "loss": 0.131, "step": 1570 }, { "epoch": 1.821053394189669, "grad_norm": 0.2154211848974228, "learning_rate": 4.0971264367816095e-05, "loss": 0.1192, "step": 1571 }, { "epoch": 1.822212562486416, "grad_norm": 0.23698244988918304, "learning_rate": 4.096551724137932e-05, "loss": 0.1302, "step": 1572 }, { "epoch": 1.8233717307831632, "grad_norm": 0.22462879121303558, "learning_rate": 4.095977011494253e-05, "loss": 0.1225, "step": 1573 }, { "epoch": 1.8245308990799103, "grad_norm": 0.21854135394096375, "learning_rate": 4.095402298850575e-05, "loss": 0.1185, "step": 1574 }, { "epoch": 1.8256900673766574, "grad_norm": 0.24488233029842377, "learning_rate": 4.094827586206897e-05, "loss": 0.1434, "step": 1575 }, { "epoch": 1.8268492356734043, "grad_norm": 0.22526147961616516, "learning_rate": 4.094252873563218e-05, "loss": 0.1272, "step": 1576 }, { "epoch": 1.8280084039701514, "grad_norm": 0.206687793135643, "learning_rate": 4.09367816091954e-05, "loss": 0.1109, "step": 1577 }, { "epoch": 1.8291675722668985, "grad_norm": 0.26105666160583496, "learning_rate": 4.093103448275863e-05, "loss": 0.1389, "step": 1578 }, { "epoch": 1.8303267405636456, "grad_norm": 0.3510555326938629, "learning_rate": 4.092528735632184e-05, "loss": 0.1294, "step": 1579 }, { "epoch": 1.8314859088603925, "grad_norm": 0.238253653049469, "learning_rate": 4.091954022988506e-05, "loss": 0.123, "step": 1580 }, { "epoch": 1.8326450771571396, "grad_norm": 0.2679709792137146, "learning_rate": 4.091379310344828e-05, "loss": 0.1326, "step": 1581 }, { "epoch": 1.8338042454538868, "grad_norm": 0.22641777992248535, "learning_rate": 4.090804597701149e-05, "loss": 0.1299, "step": 1582 }, { "epoch": 1.8349634137506339, "grad_norm": 0.25320807099342346, "learning_rate": 4.0902298850574715e-05, "loss": 0.1223, "step": 1583 }, { "epoch": 1.836122582047381, "grad_norm": 0.24081723392009735, "learning_rate": 4.089655172413793e-05, "loss": 0.1327, "step": 1584 }, { "epoch": 1.837281750344128, "grad_norm": 0.2431461215019226, "learning_rate": 4.089080459770115e-05, "loss": 0.1347, "step": 1585 }, { "epoch": 1.8384409186408752, "grad_norm": 0.2519475817680359, "learning_rate": 4.088505747126437e-05, "loss": 0.1438, "step": 1586 }, { "epoch": 1.8396000869376223, "grad_norm": 0.252816766500473, "learning_rate": 4.087931034482759e-05, "loss": 0.138, "step": 1587 }, { "epoch": 1.8407592552343695, "grad_norm": 0.22462011873722076, "learning_rate": 4.087356321839081e-05, "loss": 0.1192, "step": 1588 }, { "epoch": 1.8419184235311166, "grad_norm": 0.28114578127861023, "learning_rate": 4.0867816091954025e-05, "loss": 0.1384, "step": 1589 }, { "epoch": 1.8430775918278635, "grad_norm": 0.24173279106616974, "learning_rate": 4.086206896551724e-05, "loss": 0.1389, "step": 1590 }, { "epoch": 1.8442367601246106, "grad_norm": 0.19824741780757904, "learning_rate": 4.085632183908046e-05, "loss": 0.1117, "step": 1591 }, { "epoch": 1.8453959284213577, "grad_norm": 0.2025434970855713, "learning_rate": 4.085057471264368e-05, "loss": 0.1103, "step": 1592 }, { "epoch": 1.8465550967181048, "grad_norm": 0.2460830956697464, "learning_rate": 4.08448275862069e-05, "loss": 0.1327, "step": 1593 }, { "epoch": 1.8477142650148517, "grad_norm": 0.2609751224517822, "learning_rate": 4.083908045977012e-05, "loss": 0.1315, "step": 1594 }, { "epoch": 1.8488734333115988, "grad_norm": 0.20882025361061096, "learning_rate": 4.0833333333333334e-05, "loss": 0.1076, "step": 1595 }, { "epoch": 1.850032601608346, "grad_norm": 0.20100267231464386, "learning_rate": 4.0827586206896556e-05, "loss": 0.1116, "step": 1596 }, { "epoch": 1.851191769905093, "grad_norm": 0.2182759940624237, "learning_rate": 4.082183908045977e-05, "loss": 0.135, "step": 1597 }, { "epoch": 1.8523509382018402, "grad_norm": 0.21987977623939514, "learning_rate": 4.0816091954022986e-05, "loss": 0.1287, "step": 1598 }, { "epoch": 1.8535101064985873, "grad_norm": 0.2634388506412506, "learning_rate": 4.0810344827586214e-05, "loss": 0.1267, "step": 1599 }, { "epoch": 1.8546692747953344, "grad_norm": 0.2175815850496292, "learning_rate": 4.080459770114943e-05, "loss": 0.1175, "step": 1600 }, { "epoch": 1.8558284430920815, "grad_norm": 0.24021421372890472, "learning_rate": 4.0798850574712644e-05, "loss": 0.1298, "step": 1601 }, { "epoch": 1.8569876113888286, "grad_norm": 0.2455061972141266, "learning_rate": 4.0793103448275866e-05, "loss": 0.1318, "step": 1602 }, { "epoch": 1.8581467796855757, "grad_norm": 0.5049688220024109, "learning_rate": 4.078735632183908e-05, "loss": 0.1285, "step": 1603 }, { "epoch": 1.8593059479823228, "grad_norm": 0.3722595274448395, "learning_rate": 4.0781609195402295e-05, "loss": 0.1359, "step": 1604 }, { "epoch": 1.8604651162790697, "grad_norm": 0.3047929108142853, "learning_rate": 4.077586206896552e-05, "loss": 0.1284, "step": 1605 }, { "epoch": 1.8616242845758169, "grad_norm": 0.48975053429603577, "learning_rate": 4.077011494252874e-05, "loss": 0.1326, "step": 1606 }, { "epoch": 1.862783452872564, "grad_norm": 0.27146080136299133, "learning_rate": 4.076436781609196e-05, "loss": 0.1237, "step": 1607 }, { "epoch": 1.8639426211693109, "grad_norm": 0.26589420437812805, "learning_rate": 4.0758620689655175e-05, "loss": 0.1225, "step": 1608 }, { "epoch": 1.865101789466058, "grad_norm": 0.2550943195819855, "learning_rate": 4.075287356321839e-05, "loss": 0.1348, "step": 1609 }, { "epoch": 1.866260957762805, "grad_norm": 0.22643890976905823, "learning_rate": 4.074712643678161e-05, "loss": 0.1199, "step": 1610 }, { "epoch": 1.8674201260595522, "grad_norm": 0.28529438376426697, "learning_rate": 4.074137931034483e-05, "loss": 0.1186, "step": 1611 }, { "epoch": 1.8685792943562993, "grad_norm": 0.24935007095336914, "learning_rate": 4.073563218390805e-05, "loss": 0.1218, "step": 1612 }, { "epoch": 1.8697384626530464, "grad_norm": 0.23699697852134705, "learning_rate": 4.072988505747127e-05, "loss": 0.136, "step": 1613 }, { "epoch": 1.8708976309497936, "grad_norm": 0.4210551381111145, "learning_rate": 4.0724137931034485e-05, "loss": 0.1095, "step": 1614 }, { "epoch": 1.8720567992465407, "grad_norm": 0.2276671975851059, "learning_rate": 4.071839080459771e-05, "loss": 0.1346, "step": 1615 }, { "epoch": 1.8732159675432878, "grad_norm": 0.22973059117794037, "learning_rate": 4.071264367816092e-05, "loss": 0.122, "step": 1616 }, { "epoch": 1.874375135840035, "grad_norm": 0.37576723098754883, "learning_rate": 4.070689655172414e-05, "loss": 0.1173, "step": 1617 }, { "epoch": 1.875534304136782, "grad_norm": 0.23602044582366943, "learning_rate": 4.070114942528736e-05, "loss": 0.1426, "step": 1618 }, { "epoch": 1.876693472433529, "grad_norm": 0.22846823930740356, "learning_rate": 4.069540229885058e-05, "loss": 0.1252, "step": 1619 }, { "epoch": 1.877852640730276, "grad_norm": 0.21576084196567535, "learning_rate": 4.0689655172413795e-05, "loss": 0.1176, "step": 1620 }, { "epoch": 1.8790118090270231, "grad_norm": 0.21635417640209198, "learning_rate": 4.0683908045977017e-05, "loss": 0.1256, "step": 1621 }, { "epoch": 1.88017097732377, "grad_norm": 0.20854774117469788, "learning_rate": 4.067816091954023e-05, "loss": 0.1167, "step": 1622 }, { "epoch": 1.8813301456205171, "grad_norm": 0.25999489426612854, "learning_rate": 4.0672413793103446e-05, "loss": 0.1225, "step": 1623 }, { "epoch": 1.8824893139172643, "grad_norm": 0.2581290602684021, "learning_rate": 4.066666666666667e-05, "loss": 0.1254, "step": 1624 }, { "epoch": 1.8836484822140114, "grad_norm": 0.2655247449874878, "learning_rate": 4.066091954022988e-05, "loss": 0.1326, "step": 1625 }, { "epoch": 1.8848076505107585, "grad_norm": 0.2115846574306488, "learning_rate": 4.0655172413793105e-05, "loss": 0.1275, "step": 1626 }, { "epoch": 1.8859668188075056, "grad_norm": 0.23177579045295715, "learning_rate": 4.0649425287356326e-05, "loss": 0.1169, "step": 1627 }, { "epoch": 1.8871259871042527, "grad_norm": 0.45042136311531067, "learning_rate": 4.064367816091954e-05, "loss": 0.1309, "step": 1628 }, { "epoch": 1.8882851554009998, "grad_norm": 0.26189303398132324, "learning_rate": 4.063793103448276e-05, "loss": 0.1309, "step": 1629 }, { "epoch": 1.889444323697747, "grad_norm": 0.2205989956855774, "learning_rate": 4.063218390804598e-05, "loss": 0.1242, "step": 1630 }, { "epoch": 1.890603491994494, "grad_norm": 0.3108113706111908, "learning_rate": 4.062643678160919e-05, "loss": 0.1392, "step": 1631 }, { "epoch": 1.8917626602912412, "grad_norm": 0.2217811793088913, "learning_rate": 4.0620689655172414e-05, "loss": 0.1243, "step": 1632 }, { "epoch": 1.892921828587988, "grad_norm": 0.21977943181991577, "learning_rate": 4.0614942528735636e-05, "loss": 0.1202, "step": 1633 }, { "epoch": 1.8940809968847352, "grad_norm": 0.20839521288871765, "learning_rate": 4.060919540229886e-05, "loss": 0.1205, "step": 1634 }, { "epoch": 1.8952401651814823, "grad_norm": 0.2656031548976898, "learning_rate": 4.060344827586207e-05, "loss": 0.1355, "step": 1635 }, { "epoch": 1.8963993334782294, "grad_norm": 0.23771442472934723, "learning_rate": 4.059770114942529e-05, "loss": 0.1294, "step": 1636 }, { "epoch": 1.8975585017749763, "grad_norm": 0.27091866731643677, "learning_rate": 4.059195402298851e-05, "loss": 0.1364, "step": 1637 }, { "epoch": 1.8987176700717234, "grad_norm": 0.250888854265213, "learning_rate": 4.0586206896551724e-05, "loss": 0.1362, "step": 1638 }, { "epoch": 1.8998768383684705, "grad_norm": 0.26287785172462463, "learning_rate": 4.0580459770114946e-05, "loss": 0.1268, "step": 1639 }, { "epoch": 1.9010360066652177, "grad_norm": 0.2068604677915573, "learning_rate": 4.057471264367817e-05, "loss": 0.1186, "step": 1640 }, { "epoch": 1.9021951749619648, "grad_norm": 0.27759426832199097, "learning_rate": 4.056896551724138e-05, "loss": 0.1424, "step": 1641 }, { "epoch": 1.903354343258712, "grad_norm": 0.2885616719722748, "learning_rate": 4.05632183908046e-05, "loss": 0.1431, "step": 1642 }, { "epoch": 1.904513511555459, "grad_norm": 0.2596352994441986, "learning_rate": 4.055747126436782e-05, "loss": 0.1301, "step": 1643 }, { "epoch": 1.9056726798522061, "grad_norm": 0.22041890025138855, "learning_rate": 4.0551724137931034e-05, "loss": 0.1252, "step": 1644 }, { "epoch": 1.9068318481489532, "grad_norm": 0.23935045301914215, "learning_rate": 4.0545977011494256e-05, "loss": 0.1234, "step": 1645 }, { "epoch": 1.9079910164457003, "grad_norm": 0.23104766011238098, "learning_rate": 4.054022988505747e-05, "loss": 0.1247, "step": 1646 }, { "epoch": 1.9091501847424472, "grad_norm": 0.25777825713157654, "learning_rate": 4.053448275862069e-05, "loss": 0.1221, "step": 1647 }, { "epoch": 1.9103093530391944, "grad_norm": 0.21101577579975128, "learning_rate": 4.0528735632183914e-05, "loss": 0.1213, "step": 1648 }, { "epoch": 1.9114685213359415, "grad_norm": 0.27736735343933105, "learning_rate": 4.052298850574713e-05, "loss": 0.1315, "step": 1649 }, { "epoch": 1.9126276896326886, "grad_norm": 0.2874712646007538, "learning_rate": 4.0517241379310344e-05, "loss": 0.1267, "step": 1650 }, { "epoch": 1.9137868579294355, "grad_norm": 0.27713993191719055, "learning_rate": 4.0511494252873565e-05, "loss": 0.1236, "step": 1651 }, { "epoch": 1.9149460262261826, "grad_norm": 0.22386078536510468, "learning_rate": 4.050574712643678e-05, "loss": 0.1147, "step": 1652 }, { "epoch": 1.9161051945229297, "grad_norm": 0.21959929168224335, "learning_rate": 4.05e-05, "loss": 0.1079, "step": 1653 }, { "epoch": 1.9172643628196768, "grad_norm": 0.2387046068906784, "learning_rate": 4.0494252873563224e-05, "loss": 0.1267, "step": 1654 }, { "epoch": 1.918423531116424, "grad_norm": 0.22587932646274567, "learning_rate": 4.048850574712644e-05, "loss": 0.124, "step": 1655 }, { "epoch": 1.919582699413171, "grad_norm": 0.2760310471057892, "learning_rate": 4.048275862068966e-05, "loss": 0.1384, "step": 1656 }, { "epoch": 1.9207418677099182, "grad_norm": 0.24281680583953857, "learning_rate": 4.0477011494252875e-05, "loss": 0.1264, "step": 1657 }, { "epoch": 1.9219010360066653, "grad_norm": 0.23182785511016846, "learning_rate": 4.047126436781609e-05, "loss": 0.1184, "step": 1658 }, { "epoch": 1.9230602043034124, "grad_norm": 0.2864990532398224, "learning_rate": 4.046551724137931e-05, "loss": 0.1398, "step": 1659 }, { "epoch": 1.9242193726001595, "grad_norm": 0.267501562833786, "learning_rate": 4.045977011494253e-05, "loss": 0.135, "step": 1660 }, { "epoch": 1.9253785408969066, "grad_norm": 0.6067526936531067, "learning_rate": 4.045402298850575e-05, "loss": 0.1321, "step": 1661 }, { "epoch": 1.9265377091936535, "grad_norm": 0.2144288271665573, "learning_rate": 4.044827586206897e-05, "loss": 0.1195, "step": 1662 }, { "epoch": 1.9276968774904006, "grad_norm": 0.2767769396305084, "learning_rate": 4.0442528735632185e-05, "loss": 0.121, "step": 1663 }, { "epoch": 1.9288560457871478, "grad_norm": 0.258683443069458, "learning_rate": 4.0436781609195406e-05, "loss": 0.1358, "step": 1664 }, { "epoch": 1.9300152140838946, "grad_norm": 0.3497917056083679, "learning_rate": 4.043103448275862e-05, "loss": 0.1271, "step": 1665 }, { "epoch": 1.9311743823806418, "grad_norm": 0.2050793319940567, "learning_rate": 4.0425287356321836e-05, "loss": 0.0987, "step": 1666 }, { "epoch": 1.9323335506773889, "grad_norm": 0.25068920850753784, "learning_rate": 4.041954022988506e-05, "loss": 0.1228, "step": 1667 }, { "epoch": 1.933492718974136, "grad_norm": 0.31791725754737854, "learning_rate": 4.041379310344828e-05, "loss": 0.1245, "step": 1668 }, { "epoch": 1.934651887270883, "grad_norm": 0.38801631331443787, "learning_rate": 4.0408045977011495e-05, "loss": 0.1264, "step": 1669 }, { "epoch": 1.9358110555676302, "grad_norm": 0.26199209690093994, "learning_rate": 4.0402298850574716e-05, "loss": 0.1332, "step": 1670 }, { "epoch": 1.9369702238643773, "grad_norm": 0.2558247148990631, "learning_rate": 4.039655172413793e-05, "loss": 0.126, "step": 1671 }, { "epoch": 1.9381293921611245, "grad_norm": 0.2440565675497055, "learning_rate": 4.039080459770115e-05, "loss": 0.1265, "step": 1672 }, { "epoch": 1.9392885604578716, "grad_norm": 0.35634905099868774, "learning_rate": 4.038505747126437e-05, "loss": 0.1492, "step": 1673 }, { "epoch": 1.9404477287546187, "grad_norm": 0.2178521454334259, "learning_rate": 4.037931034482759e-05, "loss": 0.1166, "step": 1674 }, { "epoch": 1.9416068970513658, "grad_norm": 0.25191420316696167, "learning_rate": 4.037356321839081e-05, "loss": 0.1329, "step": 1675 }, { "epoch": 1.9427660653481127, "grad_norm": 0.25486209988594055, "learning_rate": 4.0367816091954026e-05, "loss": 0.1294, "step": 1676 }, { "epoch": 1.9439252336448598, "grad_norm": 0.21692296862602234, "learning_rate": 4.036206896551724e-05, "loss": 0.1204, "step": 1677 }, { "epoch": 1.945084401941607, "grad_norm": 0.24165906012058258, "learning_rate": 4.035632183908046e-05, "loss": 0.1244, "step": 1678 }, { "epoch": 1.9462435702383538, "grad_norm": 0.2571072578430176, "learning_rate": 4.035057471264368e-05, "loss": 0.1387, "step": 1679 }, { "epoch": 1.947402738535101, "grad_norm": 0.22820697724819183, "learning_rate": 4.03448275862069e-05, "loss": 0.1331, "step": 1680 }, { "epoch": 1.948561906831848, "grad_norm": 0.2674961984157562, "learning_rate": 4.033908045977012e-05, "loss": 0.128, "step": 1681 }, { "epoch": 1.9497210751285952, "grad_norm": 0.23374570906162262, "learning_rate": 4.0333333333333336e-05, "loss": 0.1256, "step": 1682 }, { "epoch": 1.9508802434253423, "grad_norm": 0.23953098058700562, "learning_rate": 4.032758620689656e-05, "loss": 0.1354, "step": 1683 }, { "epoch": 1.9520394117220894, "grad_norm": 0.21969419717788696, "learning_rate": 4.032183908045977e-05, "loss": 0.1273, "step": 1684 }, { "epoch": 1.9531985800188365, "grad_norm": 0.2675417959690094, "learning_rate": 4.031609195402299e-05, "loss": 0.145, "step": 1685 }, { "epoch": 1.9543577483155836, "grad_norm": 0.22775276005268097, "learning_rate": 4.031034482758621e-05, "loss": 0.1368, "step": 1686 }, { "epoch": 1.9555169166123307, "grad_norm": 0.25551965832710266, "learning_rate": 4.0304597701149424e-05, "loss": 0.1395, "step": 1687 }, { "epoch": 1.9566760849090779, "grad_norm": 0.20460672676563263, "learning_rate": 4.0298850574712645e-05, "loss": 0.1111, "step": 1688 }, { "epoch": 1.957835253205825, "grad_norm": 0.42244213819503784, "learning_rate": 4.029310344827587e-05, "loss": 0.1504, "step": 1689 }, { "epoch": 1.9589944215025719, "grad_norm": 0.2375163733959198, "learning_rate": 4.028735632183908e-05, "loss": 0.1243, "step": 1690 }, { "epoch": 1.960153589799319, "grad_norm": 0.23860155045986176, "learning_rate": 4.0281609195402304e-05, "loss": 0.1317, "step": 1691 }, { "epoch": 1.961312758096066, "grad_norm": 0.20756982266902924, "learning_rate": 4.027586206896552e-05, "loss": 0.1252, "step": 1692 }, { "epoch": 1.9624719263928132, "grad_norm": 0.25197523832321167, "learning_rate": 4.0270114942528733e-05, "loss": 0.1406, "step": 1693 }, { "epoch": 1.96363109468956, "grad_norm": 0.2613564133644104, "learning_rate": 4.0264367816091955e-05, "loss": 0.1342, "step": 1694 }, { "epoch": 1.9647902629863072, "grad_norm": 0.21359771490097046, "learning_rate": 4.025862068965518e-05, "loss": 0.1171, "step": 1695 }, { "epoch": 1.9659494312830543, "grad_norm": 0.2883491516113281, "learning_rate": 4.025287356321839e-05, "loss": 0.1262, "step": 1696 }, { "epoch": 1.9671085995798014, "grad_norm": 0.22171908617019653, "learning_rate": 4.0247126436781613e-05, "loss": 0.1259, "step": 1697 }, { "epoch": 1.9682677678765486, "grad_norm": 0.2951849699020386, "learning_rate": 4.024137931034483e-05, "loss": 0.1447, "step": 1698 }, { "epoch": 1.9694269361732957, "grad_norm": 0.2592881917953491, "learning_rate": 4.023563218390804e-05, "loss": 0.1294, "step": 1699 }, { "epoch": 1.9705861044700428, "grad_norm": 0.22371990978717804, "learning_rate": 4.0229885057471265e-05, "loss": 0.1238, "step": 1700 }, { "epoch": 1.97174527276679, "grad_norm": 0.23679739236831665, "learning_rate": 4.0224137931034487e-05, "loss": 0.128, "step": 1701 }, { "epoch": 1.972904441063537, "grad_norm": 0.208289235830307, "learning_rate": 4.021839080459771e-05, "loss": 0.1221, "step": 1702 }, { "epoch": 1.9740636093602841, "grad_norm": 0.3069906532764435, "learning_rate": 4.021264367816092e-05, "loss": 0.1264, "step": 1703 }, { "epoch": 1.975222777657031, "grad_norm": 0.23526400327682495, "learning_rate": 4.020689655172414e-05, "loss": 0.1314, "step": 1704 }, { "epoch": 1.9763819459537781, "grad_norm": 0.36365655064582825, "learning_rate": 4.020114942528736e-05, "loss": 0.135, "step": 1705 }, { "epoch": 1.9775411142505253, "grad_norm": 0.21487149596214294, "learning_rate": 4.0195402298850575e-05, "loss": 0.1166, "step": 1706 }, { "epoch": 1.9787002825472724, "grad_norm": 0.1872834861278534, "learning_rate": 4.018965517241379e-05, "loss": 0.114, "step": 1707 }, { "epoch": 1.9798594508440193, "grad_norm": 0.2666516602039337, "learning_rate": 4.018390804597701e-05, "loss": 0.1396, "step": 1708 }, { "epoch": 1.9810186191407664, "grad_norm": 0.23788093030452728, "learning_rate": 4.017816091954023e-05, "loss": 0.1303, "step": 1709 }, { "epoch": 1.9821777874375135, "grad_norm": 0.23278290033340454, "learning_rate": 4.0172413793103455e-05, "loss": 0.127, "step": 1710 }, { "epoch": 1.9833369557342606, "grad_norm": 0.2589450180530548, "learning_rate": 4.016666666666667e-05, "loss": 0.1311, "step": 1711 }, { "epoch": 1.9844961240310077, "grad_norm": 0.20115303993225098, "learning_rate": 4.0160919540229884e-05, "loss": 0.1255, "step": 1712 }, { "epoch": 1.9856552923277548, "grad_norm": 0.24653758108615875, "learning_rate": 4.0155172413793106e-05, "loss": 0.1346, "step": 1713 }, { "epoch": 1.986814460624502, "grad_norm": 0.25631844997406006, "learning_rate": 4.014942528735632e-05, "loss": 0.1385, "step": 1714 }, { "epoch": 1.987973628921249, "grad_norm": 0.21562731266021729, "learning_rate": 4.014367816091954e-05, "loss": 0.118, "step": 1715 }, { "epoch": 1.9891327972179962, "grad_norm": 0.20288534462451935, "learning_rate": 4.0137931034482764e-05, "loss": 0.1183, "step": 1716 }, { "epoch": 1.9902919655147433, "grad_norm": 0.2522091865539551, "learning_rate": 4.013218390804598e-05, "loss": 0.1337, "step": 1717 }, { "epoch": 1.9914511338114904, "grad_norm": 0.2482483983039856, "learning_rate": 4.0126436781609194e-05, "loss": 0.1347, "step": 1718 }, { "epoch": 1.9926103021082373, "grad_norm": 0.26897165179252625, "learning_rate": 4.0120689655172416e-05, "loss": 0.1463, "step": 1719 }, { "epoch": 1.9937694704049844, "grad_norm": 0.2053130716085434, "learning_rate": 4.011494252873563e-05, "loss": 0.1138, "step": 1720 }, { "epoch": 1.9949286387017315, "grad_norm": 0.22863611578941345, "learning_rate": 4.010919540229885e-05, "loss": 0.1255, "step": 1721 }, { "epoch": 1.9960878069984784, "grad_norm": 0.25083255767822266, "learning_rate": 4.0103448275862074e-05, "loss": 0.137, "step": 1722 }, { "epoch": 1.9972469752952255, "grad_norm": 0.2726476788520813, "learning_rate": 4.009770114942529e-05, "loss": 0.1459, "step": 1723 }, { "epoch": 1.9984061435919727, "grad_norm": 0.18482446670532227, "learning_rate": 4.009195402298851e-05, "loss": 0.1116, "step": 1724 }, { "epoch": 1.9995653118887198, "grad_norm": 0.22042682766914368, "learning_rate": 4.0086206896551726e-05, "loss": 0.1282, "step": 1725 }, { "epoch": 1.9995653118887198, "eval_loss": 0.13071221113204956, "eval_runtime": 265.8343, "eval_samples_per_second": 5.771, "eval_steps_per_second": 5.771, "step": 1725 }, { "epoch": 2.000724480185467, "grad_norm": 0.20493850111961365, "learning_rate": 4.008045977011494e-05, "loss": 0.115, "step": 1726 }, { "epoch": 2.001883648482214, "grad_norm": 0.21877332031726837, "learning_rate": 4.007471264367816e-05, "loss": 0.1288, "step": 1727 }, { "epoch": 2.003042816778961, "grad_norm": 0.17066803574562073, "learning_rate": 4.006896551724138e-05, "loss": 0.103, "step": 1728 }, { "epoch": 2.0042019850757082, "grad_norm": 0.2311452031135559, "learning_rate": 4.0063218390804605e-05, "loss": 0.123, "step": 1729 }, { "epoch": 2.0053611533724554, "grad_norm": 0.18617844581604004, "learning_rate": 4.005747126436782e-05, "loss": 0.1075, "step": 1730 }, { "epoch": 2.0065203216692025, "grad_norm": 0.17932945489883423, "learning_rate": 4.0051724137931035e-05, "loss": 0.1124, "step": 1731 }, { "epoch": 2.0076794899659496, "grad_norm": 0.2430291473865509, "learning_rate": 4.004597701149426e-05, "loss": 0.112, "step": 1732 }, { "epoch": 2.0088386582626967, "grad_norm": 0.1980556845664978, "learning_rate": 4.004022988505747e-05, "loss": 0.1257, "step": 1733 }, { "epoch": 2.009997826559444, "grad_norm": 0.2553955614566803, "learning_rate": 4.003448275862069e-05, "loss": 0.1168, "step": 1734 }, { "epoch": 2.0111569948561905, "grad_norm": 0.5730363130569458, "learning_rate": 4.002873563218391e-05, "loss": 0.1191, "step": 1735 }, { "epoch": 2.0123161631529376, "grad_norm": 0.27074134349823, "learning_rate": 4.002298850574713e-05, "loss": 0.1228, "step": 1736 }, { "epoch": 2.0134753314496847, "grad_norm": 0.2566532492637634, "learning_rate": 4.0017241379310345e-05, "loss": 0.1185, "step": 1737 }, { "epoch": 2.014634499746432, "grad_norm": 0.2553810179233551, "learning_rate": 4.001149425287357e-05, "loss": 0.1207, "step": 1738 }, { "epoch": 2.015793668043179, "grad_norm": 0.314544677734375, "learning_rate": 4.000574712643678e-05, "loss": 0.1338, "step": 1739 }, { "epoch": 2.016952836339926, "grad_norm": 0.26933231949806213, "learning_rate": 4e-05, "loss": 0.1175, "step": 1740 }, { "epoch": 2.018112004636673, "grad_norm": 0.4147557020187378, "learning_rate": 3.999425287356322e-05, "loss": 0.1191, "step": 1741 }, { "epoch": 2.0192711729334203, "grad_norm": 0.4365951716899872, "learning_rate": 3.998850574712644e-05, "loss": 0.1246, "step": 1742 }, { "epoch": 2.0204303412301674, "grad_norm": 0.22776885330677032, "learning_rate": 3.998275862068966e-05, "loss": 0.1277, "step": 1743 }, { "epoch": 2.0215895095269145, "grad_norm": 0.5201489925384521, "learning_rate": 3.9977011494252876e-05, "loss": 0.108, "step": 1744 }, { "epoch": 2.0227486778236616, "grad_norm": 0.2805781960487366, "learning_rate": 3.997126436781609e-05, "loss": 0.1168, "step": 1745 }, { "epoch": 2.0239078461204087, "grad_norm": 0.25759008526802063, "learning_rate": 3.996551724137931e-05, "loss": 0.1312, "step": 1746 }, { "epoch": 2.025067014417156, "grad_norm": 0.2724314332008362, "learning_rate": 3.995977011494253e-05, "loss": 0.1102, "step": 1747 }, { "epoch": 2.026226182713903, "grad_norm": 0.2714131772518158, "learning_rate": 3.995402298850575e-05, "loss": 0.1136, "step": 1748 }, { "epoch": 2.0273853510106497, "grad_norm": 0.5604748725891113, "learning_rate": 3.994827586206897e-05, "loss": 0.1387, "step": 1749 }, { "epoch": 2.0285445193073968, "grad_norm": 0.20117241144180298, "learning_rate": 3.9942528735632186e-05, "loss": 0.1045, "step": 1750 }, { "epoch": 2.029703687604144, "grad_norm": 0.2628777027130127, "learning_rate": 3.993678160919541e-05, "loss": 0.1219, "step": 1751 }, { "epoch": 2.030862855900891, "grad_norm": 0.29843881726264954, "learning_rate": 3.993103448275862e-05, "loss": 0.1282, "step": 1752 }, { "epoch": 2.032022024197638, "grad_norm": 0.4298798441886902, "learning_rate": 3.992528735632184e-05, "loss": 0.1108, "step": 1753 }, { "epoch": 2.0331811924943852, "grad_norm": 0.24070972204208374, "learning_rate": 3.991954022988506e-05, "loss": 0.1171, "step": 1754 }, { "epoch": 2.0343403607911323, "grad_norm": 0.2516588568687439, "learning_rate": 3.9913793103448274e-05, "loss": 0.1178, "step": 1755 }, { "epoch": 2.0354995290878795, "grad_norm": 0.3218182623386383, "learning_rate": 3.9908045977011496e-05, "loss": 0.1193, "step": 1756 }, { "epoch": 2.0366586973846266, "grad_norm": 0.2040579468011856, "learning_rate": 3.990229885057472e-05, "loss": 0.1125, "step": 1757 }, { "epoch": 2.0378178656813737, "grad_norm": 0.2935560643672943, "learning_rate": 3.989655172413793e-05, "loss": 0.1269, "step": 1758 }, { "epoch": 2.038977033978121, "grad_norm": 0.22673268616199493, "learning_rate": 3.9890804597701154e-05, "loss": 0.1145, "step": 1759 }, { "epoch": 2.040136202274868, "grad_norm": 0.37315088510513306, "learning_rate": 3.988505747126437e-05, "loss": 0.1226, "step": 1760 }, { "epoch": 2.041295370571615, "grad_norm": 0.21398569643497467, "learning_rate": 3.9879310344827584e-05, "loss": 0.1196, "step": 1761 }, { "epoch": 2.042454538868362, "grad_norm": 0.3148459494113922, "learning_rate": 3.9873563218390806e-05, "loss": 0.1321, "step": 1762 }, { "epoch": 2.043613707165109, "grad_norm": 0.5063501596450806, "learning_rate": 3.986781609195403e-05, "loss": 0.1237, "step": 1763 }, { "epoch": 2.044772875461856, "grad_norm": 0.25269943475723267, "learning_rate": 3.986206896551724e-05, "loss": 0.1149, "step": 1764 }, { "epoch": 2.045932043758603, "grad_norm": 0.32295992970466614, "learning_rate": 3.9856321839080464e-05, "loss": 0.1087, "step": 1765 }, { "epoch": 2.04709121205535, "grad_norm": 0.23823142051696777, "learning_rate": 3.985057471264368e-05, "loss": 0.1246, "step": 1766 }, { "epoch": 2.0482503803520973, "grad_norm": 0.3159748315811157, "learning_rate": 3.98448275862069e-05, "loss": 0.1114, "step": 1767 }, { "epoch": 2.0494095486488444, "grad_norm": 0.21690689027309418, "learning_rate": 3.9839080459770115e-05, "loss": 0.1105, "step": 1768 }, { "epoch": 2.0505687169455915, "grad_norm": 0.26003125309944153, "learning_rate": 3.983333333333333e-05, "loss": 0.1257, "step": 1769 }, { "epoch": 2.0517278852423386, "grad_norm": 0.27152520418167114, "learning_rate": 3.982758620689656e-05, "loss": 0.1207, "step": 1770 }, { "epoch": 2.0528870535390857, "grad_norm": 0.2450665384531021, "learning_rate": 3.9821839080459774e-05, "loss": 0.1198, "step": 1771 }, { "epoch": 2.054046221835833, "grad_norm": 0.20077472925186157, "learning_rate": 3.981609195402299e-05, "loss": 0.107, "step": 1772 }, { "epoch": 2.05520539013258, "grad_norm": 0.4394693374633789, "learning_rate": 3.981034482758621e-05, "loss": 0.1103, "step": 1773 }, { "epoch": 2.056364558429327, "grad_norm": 0.26243650913238525, "learning_rate": 3.9804597701149425e-05, "loss": 0.1128, "step": 1774 }, { "epoch": 2.057523726726074, "grad_norm": 0.2425994873046875, "learning_rate": 3.979885057471265e-05, "loss": 0.123, "step": 1775 }, { "epoch": 2.0586828950228213, "grad_norm": 0.23687340319156647, "learning_rate": 3.979310344827586e-05, "loss": 0.122, "step": 1776 }, { "epoch": 2.0598420633195684, "grad_norm": 0.2466685026884079, "learning_rate": 3.9787356321839083e-05, "loss": 0.1201, "step": 1777 }, { "epoch": 2.061001231616315, "grad_norm": 0.26380082964897156, "learning_rate": 3.9781609195402305e-05, "loss": 0.1216, "step": 1778 }, { "epoch": 2.062160399913062, "grad_norm": 0.2581118643283844, "learning_rate": 3.977586206896552e-05, "loss": 0.1261, "step": 1779 }, { "epoch": 2.0633195682098093, "grad_norm": 0.24851976335048676, "learning_rate": 3.9770114942528735e-05, "loss": 0.1212, "step": 1780 }, { "epoch": 2.0644787365065564, "grad_norm": 0.3059890568256378, "learning_rate": 3.9764367816091957e-05, "loss": 0.1233, "step": 1781 }, { "epoch": 2.0656379048033036, "grad_norm": 0.2686820924282074, "learning_rate": 3.975862068965517e-05, "loss": 0.1291, "step": 1782 }, { "epoch": 2.0667970731000507, "grad_norm": 0.24418821930885315, "learning_rate": 3.975287356321839e-05, "loss": 0.115, "step": 1783 }, { "epoch": 2.067956241396798, "grad_norm": 0.27120018005371094, "learning_rate": 3.9747126436781615e-05, "loss": 0.1192, "step": 1784 }, { "epoch": 2.069115409693545, "grad_norm": 0.22046048939228058, "learning_rate": 3.974137931034483e-05, "loss": 0.1033, "step": 1785 }, { "epoch": 2.070274577990292, "grad_norm": 0.24162054061889648, "learning_rate": 3.973563218390805e-05, "loss": 0.116, "step": 1786 }, { "epoch": 2.071433746287039, "grad_norm": 0.26987624168395996, "learning_rate": 3.9729885057471266e-05, "loss": 0.127, "step": 1787 }, { "epoch": 2.0725929145837862, "grad_norm": 0.2681949734687805, "learning_rate": 3.972413793103448e-05, "loss": 0.1266, "step": 1788 }, { "epoch": 2.0737520828805334, "grad_norm": 0.26860347390174866, "learning_rate": 3.97183908045977e-05, "loss": 0.1186, "step": 1789 }, { "epoch": 2.0749112511772805, "grad_norm": 0.2559060752391815, "learning_rate": 3.9712643678160925e-05, "loss": 0.108, "step": 1790 }, { "epoch": 2.0760704194740276, "grad_norm": 0.1907990276813507, "learning_rate": 3.970689655172414e-05, "loss": 0.0989, "step": 1791 }, { "epoch": 2.0772295877707743, "grad_norm": 0.2304263859987259, "learning_rate": 3.970114942528736e-05, "loss": 0.1186, "step": 1792 }, { "epoch": 2.0783887560675214, "grad_norm": 0.21806973218917847, "learning_rate": 3.9695402298850576e-05, "loss": 0.1055, "step": 1793 }, { "epoch": 2.0795479243642685, "grad_norm": 0.27678120136260986, "learning_rate": 3.96896551724138e-05, "loss": 0.1309, "step": 1794 }, { "epoch": 2.0807070926610156, "grad_norm": 0.25782880187034607, "learning_rate": 3.968390804597701e-05, "loss": 0.1066, "step": 1795 }, { "epoch": 2.0818662609577627, "grad_norm": 0.2833830416202545, "learning_rate": 3.967816091954023e-05, "loss": 0.1258, "step": 1796 }, { "epoch": 2.08302542925451, "grad_norm": 0.231010302901268, "learning_rate": 3.967241379310345e-05, "loss": 0.1035, "step": 1797 }, { "epoch": 2.084184597551257, "grad_norm": 0.3302299678325653, "learning_rate": 3.966666666666667e-05, "loss": 0.1137, "step": 1798 }, { "epoch": 2.085343765848004, "grad_norm": 0.270266056060791, "learning_rate": 3.9660919540229886e-05, "loss": 0.1117, "step": 1799 }, { "epoch": 2.086502934144751, "grad_norm": 0.23533837497234344, "learning_rate": 3.965517241379311e-05, "loss": 0.1112, "step": 1800 }, { "epoch": 2.0876621024414983, "grad_norm": 0.316643089056015, "learning_rate": 3.964942528735632e-05, "loss": 0.1243, "step": 1801 }, { "epoch": 2.0888212707382454, "grad_norm": 0.2741736173629761, "learning_rate": 3.964367816091954e-05, "loss": 0.1095, "step": 1802 }, { "epoch": 2.0899804390349925, "grad_norm": 0.2951594889163971, "learning_rate": 3.963793103448276e-05, "loss": 0.1246, "step": 1803 }, { "epoch": 2.0911396073317396, "grad_norm": 0.26919370889663696, "learning_rate": 3.963218390804598e-05, "loss": 0.1181, "step": 1804 }, { "epoch": 2.0922987756284868, "grad_norm": 0.31926748156547546, "learning_rate": 3.96264367816092e-05, "loss": 0.1216, "step": 1805 }, { "epoch": 2.0934579439252334, "grad_norm": 0.30833154916763306, "learning_rate": 3.962068965517242e-05, "loss": 0.1245, "step": 1806 }, { "epoch": 2.0946171122219805, "grad_norm": 0.2637370824813843, "learning_rate": 3.961494252873563e-05, "loss": 0.1168, "step": 1807 }, { "epoch": 2.0957762805187277, "grad_norm": 0.2496730089187622, "learning_rate": 3.9609195402298854e-05, "loss": 0.1152, "step": 1808 }, { "epoch": 2.096935448815475, "grad_norm": 0.241145521402359, "learning_rate": 3.960344827586207e-05, "loss": 0.1257, "step": 1809 }, { "epoch": 2.098094617112222, "grad_norm": 0.2335578054189682, "learning_rate": 3.959770114942529e-05, "loss": 0.1138, "step": 1810 }, { "epoch": 2.099253785408969, "grad_norm": 0.23772041499614716, "learning_rate": 3.959195402298851e-05, "loss": 0.1115, "step": 1811 }, { "epoch": 2.100412953705716, "grad_norm": 0.25079551339149475, "learning_rate": 3.958620689655173e-05, "loss": 0.1185, "step": 1812 }, { "epoch": 2.1015721220024632, "grad_norm": 0.26366278529167175, "learning_rate": 3.958045977011495e-05, "loss": 0.1177, "step": 1813 }, { "epoch": 2.1027312902992104, "grad_norm": 0.2486356645822525, "learning_rate": 3.9574712643678164e-05, "loss": 0.1204, "step": 1814 }, { "epoch": 2.1038904585959575, "grad_norm": 0.24018944799900055, "learning_rate": 3.956896551724138e-05, "loss": 0.1333, "step": 1815 }, { "epoch": 2.1050496268927046, "grad_norm": 0.25310537219047546, "learning_rate": 3.95632183908046e-05, "loss": 0.1243, "step": 1816 }, { "epoch": 2.1062087951894517, "grad_norm": 0.26761263608932495, "learning_rate": 3.9557471264367815e-05, "loss": 0.1184, "step": 1817 }, { "epoch": 2.107367963486199, "grad_norm": 0.32471829652786255, "learning_rate": 3.955172413793104e-05, "loss": 0.1234, "step": 1818 }, { "epoch": 2.108527131782946, "grad_norm": 0.25897079706192017, "learning_rate": 3.954597701149426e-05, "loss": 0.1142, "step": 1819 }, { "epoch": 2.1096863000796926, "grad_norm": 0.2632877826690674, "learning_rate": 3.954022988505747e-05, "loss": 0.1189, "step": 1820 }, { "epoch": 2.1108454683764397, "grad_norm": 0.3021185100078583, "learning_rate": 3.953448275862069e-05, "loss": 0.1283, "step": 1821 }, { "epoch": 2.112004636673187, "grad_norm": 0.2785482704639435, "learning_rate": 3.952873563218391e-05, "loss": 0.1302, "step": 1822 }, { "epoch": 2.113163804969934, "grad_norm": 0.33356723189353943, "learning_rate": 3.9522988505747125e-05, "loss": 0.1194, "step": 1823 }, { "epoch": 2.114322973266681, "grad_norm": 0.29633092880249023, "learning_rate": 3.9517241379310346e-05, "loss": 0.1243, "step": 1824 }, { "epoch": 2.115482141563428, "grad_norm": 0.2905955910682678, "learning_rate": 3.951149425287357e-05, "loss": 0.1128, "step": 1825 }, { "epoch": 2.1166413098601753, "grad_norm": 0.24889424443244934, "learning_rate": 3.950574712643678e-05, "loss": 0.1029, "step": 1826 }, { "epoch": 2.1178004781569224, "grad_norm": 0.24264870584011078, "learning_rate": 3.9500000000000005e-05, "loss": 0.1125, "step": 1827 }, { "epoch": 2.1189596464536695, "grad_norm": 0.2387196123600006, "learning_rate": 3.949425287356322e-05, "loss": 0.1123, "step": 1828 }, { "epoch": 2.1201188147504166, "grad_norm": 0.23533257842063904, "learning_rate": 3.9488505747126434e-05, "loss": 0.118, "step": 1829 }, { "epoch": 2.1212779830471638, "grad_norm": 0.24829024076461792, "learning_rate": 3.9482758620689656e-05, "loss": 0.1194, "step": 1830 }, { "epoch": 2.122437151343911, "grad_norm": 0.26718825101852417, "learning_rate": 3.947701149425288e-05, "loss": 0.1139, "step": 1831 }, { "epoch": 2.123596319640658, "grad_norm": 0.24464312195777893, "learning_rate": 3.94712643678161e-05, "loss": 0.1101, "step": 1832 }, { "epoch": 2.124755487937405, "grad_norm": 0.22388160228729248, "learning_rate": 3.9465517241379314e-05, "loss": 0.1088, "step": 1833 }, { "epoch": 2.125914656234152, "grad_norm": 0.25908219814300537, "learning_rate": 3.945977011494253e-05, "loss": 0.1202, "step": 1834 }, { "epoch": 2.127073824530899, "grad_norm": 0.2020428478717804, "learning_rate": 3.945402298850575e-05, "loss": 0.108, "step": 1835 }, { "epoch": 2.128232992827646, "grad_norm": 0.26430872082710266, "learning_rate": 3.9448275862068966e-05, "loss": 0.1235, "step": 1836 }, { "epoch": 2.129392161124393, "grad_norm": 0.3125210702419281, "learning_rate": 3.944252873563218e-05, "loss": 0.1134, "step": 1837 }, { "epoch": 2.1305513294211402, "grad_norm": 0.24071168899536133, "learning_rate": 3.94367816091954e-05, "loss": 0.1153, "step": 1838 }, { "epoch": 2.1317104977178873, "grad_norm": 0.22722084820270538, "learning_rate": 3.9431034482758624e-05, "loss": 0.1135, "step": 1839 }, { "epoch": 2.1328696660146345, "grad_norm": 0.2722453474998474, "learning_rate": 3.942528735632184e-05, "loss": 0.1159, "step": 1840 }, { "epoch": 2.1340288343113816, "grad_norm": 0.26020506024360657, "learning_rate": 3.941954022988506e-05, "loss": 0.1083, "step": 1841 }, { "epoch": 2.1351880026081287, "grad_norm": 0.38387203216552734, "learning_rate": 3.9413793103448276e-05, "loss": 0.1304, "step": 1842 }, { "epoch": 2.136347170904876, "grad_norm": 0.28882285952568054, "learning_rate": 3.94080459770115e-05, "loss": 0.1188, "step": 1843 }, { "epoch": 2.137506339201623, "grad_norm": 0.2755765914916992, "learning_rate": 3.940229885057471e-05, "loss": 0.1346, "step": 1844 }, { "epoch": 2.13866550749837, "grad_norm": 0.2223798632621765, "learning_rate": 3.9396551724137934e-05, "loss": 0.1112, "step": 1845 }, { "epoch": 2.139824675795117, "grad_norm": 0.30250582098960876, "learning_rate": 3.9390804597701156e-05, "loss": 0.1267, "step": 1846 }, { "epoch": 2.1409838440918643, "grad_norm": 0.2759857177734375, "learning_rate": 3.938505747126437e-05, "loss": 0.1198, "step": 1847 }, { "epoch": 2.1421430123886114, "grad_norm": 0.2635420560836792, "learning_rate": 3.9379310344827585e-05, "loss": 0.116, "step": 1848 }, { "epoch": 2.143302180685358, "grad_norm": 0.22998470067977905, "learning_rate": 3.937356321839081e-05, "loss": 0.1124, "step": 1849 }, { "epoch": 2.144461348982105, "grad_norm": 0.22204086184501648, "learning_rate": 3.936781609195402e-05, "loss": 0.1087, "step": 1850 }, { "epoch": 2.1456205172788523, "grad_norm": 0.2902623116970062, "learning_rate": 3.9362068965517244e-05, "loss": 0.1307, "step": 1851 }, { "epoch": 2.1467796855755994, "grad_norm": 0.2636187970638275, "learning_rate": 3.9356321839080465e-05, "loss": 0.1168, "step": 1852 }, { "epoch": 2.1479388538723465, "grad_norm": 0.29677683115005493, "learning_rate": 3.935057471264368e-05, "loss": 0.1328, "step": 1853 }, { "epoch": 2.1490980221690936, "grad_norm": 0.2324220985174179, "learning_rate": 3.93448275862069e-05, "loss": 0.1198, "step": 1854 }, { "epoch": 2.1502571904658407, "grad_norm": 0.2689639627933502, "learning_rate": 3.933908045977012e-05, "loss": 0.1285, "step": 1855 }, { "epoch": 2.151416358762588, "grad_norm": 0.264710932970047, "learning_rate": 3.933333333333333e-05, "loss": 0.1235, "step": 1856 }, { "epoch": 2.152575527059335, "grad_norm": 0.26649346947669983, "learning_rate": 3.932758620689655e-05, "loss": 0.1212, "step": 1857 }, { "epoch": 2.153734695356082, "grad_norm": 0.27043336629867554, "learning_rate": 3.932183908045977e-05, "loss": 0.1123, "step": 1858 }, { "epoch": 2.154893863652829, "grad_norm": 0.24677076935768127, "learning_rate": 3.931609195402299e-05, "loss": 0.1058, "step": 1859 }, { "epoch": 2.1560530319495763, "grad_norm": 0.3722219169139862, "learning_rate": 3.931034482758621e-05, "loss": 0.13, "step": 1860 }, { "epoch": 2.1572122002463234, "grad_norm": 0.3465796411037445, "learning_rate": 3.9304597701149427e-05, "loss": 0.1323, "step": 1861 }, { "epoch": 2.1583713685430705, "grad_norm": 0.24927546083927155, "learning_rate": 3.929885057471265e-05, "loss": 0.1133, "step": 1862 }, { "epoch": 2.159530536839817, "grad_norm": 0.2714444696903229, "learning_rate": 3.929310344827586e-05, "loss": 0.114, "step": 1863 }, { "epoch": 2.1606897051365643, "grad_norm": 0.2666366994380951, "learning_rate": 3.928735632183908e-05, "loss": 0.1182, "step": 1864 }, { "epoch": 2.1618488734333114, "grad_norm": 0.24803896248340607, "learning_rate": 3.92816091954023e-05, "loss": 0.1162, "step": 1865 }, { "epoch": 2.1630080417300586, "grad_norm": 0.23639126121997833, "learning_rate": 3.927586206896552e-05, "loss": 0.1154, "step": 1866 }, { "epoch": 2.1641672100268057, "grad_norm": 0.2096773087978363, "learning_rate": 3.9270114942528736e-05, "loss": 0.1079, "step": 1867 }, { "epoch": 2.165326378323553, "grad_norm": 0.3063262104988098, "learning_rate": 3.926436781609196e-05, "loss": 0.1197, "step": 1868 }, { "epoch": 2.1664855466203, "grad_norm": 0.24331098794937134, "learning_rate": 3.925862068965517e-05, "loss": 0.1114, "step": 1869 }, { "epoch": 2.167644714917047, "grad_norm": 0.25354528427124023, "learning_rate": 3.9252873563218395e-05, "loss": 0.1231, "step": 1870 }, { "epoch": 2.168803883213794, "grad_norm": 0.2932867109775543, "learning_rate": 3.924712643678161e-05, "loss": 0.131, "step": 1871 }, { "epoch": 2.1699630515105413, "grad_norm": 0.22387802600860596, "learning_rate": 3.924137931034483e-05, "loss": 0.1101, "step": 1872 }, { "epoch": 2.1711222198072884, "grad_norm": 0.250288724899292, "learning_rate": 3.923563218390805e-05, "loss": 0.1229, "step": 1873 }, { "epoch": 2.1722813881040355, "grad_norm": 0.39069998264312744, "learning_rate": 3.922988505747127e-05, "loss": 0.1259, "step": 1874 }, { "epoch": 2.1734405564007826, "grad_norm": 0.20509329438209534, "learning_rate": 3.922413793103448e-05, "loss": 0.1273, "step": 1875 }, { "epoch": 2.1745997246975297, "grad_norm": 0.27866530418395996, "learning_rate": 3.9218390804597704e-05, "loss": 0.1229, "step": 1876 }, { "epoch": 2.175758892994277, "grad_norm": 0.2512810528278351, "learning_rate": 3.921264367816092e-05, "loss": 0.1142, "step": 1877 }, { "epoch": 2.1769180612910235, "grad_norm": 0.2705281674861908, "learning_rate": 3.9206896551724134e-05, "loss": 0.1132, "step": 1878 }, { "epoch": 2.1780772295877706, "grad_norm": 0.22448644042015076, "learning_rate": 3.9201149425287356e-05, "loss": 0.1077, "step": 1879 }, { "epoch": 2.1792363978845177, "grad_norm": 0.30466312170028687, "learning_rate": 3.919540229885058e-05, "loss": 0.1246, "step": 1880 }, { "epoch": 2.180395566181265, "grad_norm": 0.2996283769607544, "learning_rate": 3.91896551724138e-05, "loss": 0.1262, "step": 1881 }, { "epoch": 2.181554734478012, "grad_norm": 0.3339924216270447, "learning_rate": 3.9183908045977014e-05, "loss": 0.1344, "step": 1882 }, { "epoch": 2.182713902774759, "grad_norm": 0.24632121622562408, "learning_rate": 3.917816091954023e-05, "loss": 0.1099, "step": 1883 }, { "epoch": 2.183873071071506, "grad_norm": 0.24614660441875458, "learning_rate": 3.917241379310345e-05, "loss": 0.1136, "step": 1884 }, { "epoch": 2.1850322393682533, "grad_norm": 0.31324732303619385, "learning_rate": 3.9166666666666665e-05, "loss": 0.1242, "step": 1885 }, { "epoch": 2.1861914076650004, "grad_norm": 0.3063158392906189, "learning_rate": 3.916091954022989e-05, "loss": 0.1313, "step": 1886 }, { "epoch": 2.1873505759617475, "grad_norm": 0.3112230598926544, "learning_rate": 3.915517241379311e-05, "loss": 0.1244, "step": 1887 }, { "epoch": 2.1885097442584946, "grad_norm": 0.2197721004486084, "learning_rate": 3.9149425287356324e-05, "loss": 0.1093, "step": 1888 }, { "epoch": 2.1896689125552418, "grad_norm": 0.2518065273761749, "learning_rate": 3.9143678160919545e-05, "loss": 0.1126, "step": 1889 }, { "epoch": 2.190828080851989, "grad_norm": 0.24468325078487396, "learning_rate": 3.913793103448276e-05, "loss": 0.1264, "step": 1890 }, { "epoch": 2.1919872491487356, "grad_norm": 0.2638467252254486, "learning_rate": 3.9132183908045975e-05, "loss": 0.1299, "step": 1891 }, { "epoch": 2.1931464174454827, "grad_norm": 0.29071396589279175, "learning_rate": 3.91264367816092e-05, "loss": 0.1406, "step": 1892 }, { "epoch": 2.19430558574223, "grad_norm": 0.26825428009033203, "learning_rate": 3.912068965517242e-05, "loss": 0.133, "step": 1893 }, { "epoch": 2.195464754038977, "grad_norm": 0.28922030329704285, "learning_rate": 3.9114942528735633e-05, "loss": 0.1303, "step": 1894 }, { "epoch": 2.196623922335724, "grad_norm": 0.22582325339317322, "learning_rate": 3.9109195402298855e-05, "loss": 0.1118, "step": 1895 }, { "epoch": 2.197783090632471, "grad_norm": 0.31042325496673584, "learning_rate": 3.910344827586207e-05, "loss": 0.1292, "step": 1896 }, { "epoch": 2.1989422589292182, "grad_norm": 0.207808256149292, "learning_rate": 3.9097701149425285e-05, "loss": 0.1151, "step": 1897 }, { "epoch": 2.2001014272259654, "grad_norm": 0.2200191169977188, "learning_rate": 3.909195402298851e-05, "loss": 0.1085, "step": 1898 }, { "epoch": 2.2012605955227125, "grad_norm": 0.2516978979110718, "learning_rate": 3.908620689655172e-05, "loss": 0.122, "step": 1899 }, { "epoch": 2.2024197638194596, "grad_norm": 0.2803818881511688, "learning_rate": 3.908045977011495e-05, "loss": 0.1214, "step": 1900 }, { "epoch": 2.2035789321162067, "grad_norm": 0.24263282120227814, "learning_rate": 3.9074712643678165e-05, "loss": 0.1072, "step": 1901 }, { "epoch": 2.204738100412954, "grad_norm": 0.24859733879566193, "learning_rate": 3.906896551724138e-05, "loss": 0.1286, "step": 1902 }, { "epoch": 2.205897268709701, "grad_norm": 0.2725147306919098, "learning_rate": 3.90632183908046e-05, "loss": 0.1282, "step": 1903 }, { "epoch": 2.207056437006448, "grad_norm": 0.42582276463508606, "learning_rate": 3.9057471264367816e-05, "loss": 0.1239, "step": 1904 }, { "epoch": 2.208215605303195, "grad_norm": 0.2799397110939026, "learning_rate": 3.905172413793103e-05, "loss": 0.126, "step": 1905 }, { "epoch": 2.209374773599942, "grad_norm": 0.23212167620658875, "learning_rate": 3.904597701149425e-05, "loss": 0.1143, "step": 1906 }, { "epoch": 2.210533941896689, "grad_norm": 0.26310837268829346, "learning_rate": 3.9040229885057475e-05, "loss": 0.1218, "step": 1907 }, { "epoch": 2.211693110193436, "grad_norm": 0.289028137922287, "learning_rate": 3.9034482758620696e-05, "loss": 0.1142, "step": 1908 }, { "epoch": 2.212852278490183, "grad_norm": 0.2884521484375, "learning_rate": 3.902873563218391e-05, "loss": 0.1215, "step": 1909 }, { "epoch": 2.2140114467869303, "grad_norm": 0.2507173717021942, "learning_rate": 3.9022988505747126e-05, "loss": 0.1184, "step": 1910 }, { "epoch": 2.2151706150836774, "grad_norm": 0.23451244831085205, "learning_rate": 3.901724137931035e-05, "loss": 0.1123, "step": 1911 }, { "epoch": 2.2163297833804245, "grad_norm": 0.2503047287464142, "learning_rate": 3.901149425287356e-05, "loss": 0.1204, "step": 1912 }, { "epoch": 2.2174889516771716, "grad_norm": 0.2952342927455902, "learning_rate": 3.9005747126436784e-05, "loss": 0.1154, "step": 1913 }, { "epoch": 2.2186481199739188, "grad_norm": 0.28569087386131287, "learning_rate": 3.9000000000000006e-05, "loss": 0.1348, "step": 1914 }, { "epoch": 2.219807288270666, "grad_norm": 0.3290340304374695, "learning_rate": 3.899425287356322e-05, "loss": 0.1197, "step": 1915 }, { "epoch": 2.220966456567413, "grad_norm": 0.20089855790138245, "learning_rate": 3.8988505747126436e-05, "loss": 0.1161, "step": 1916 }, { "epoch": 2.22212562486416, "grad_norm": 0.2501339912414551, "learning_rate": 3.898275862068966e-05, "loss": 0.1122, "step": 1917 }, { "epoch": 2.223284793160907, "grad_norm": 0.27215123176574707, "learning_rate": 3.897701149425287e-05, "loss": 0.1152, "step": 1918 }, { "epoch": 2.2244439614576543, "grad_norm": 0.2323666214942932, "learning_rate": 3.8971264367816094e-05, "loss": 0.1029, "step": 1919 }, { "epoch": 2.2256031297544014, "grad_norm": 0.3195997476577759, "learning_rate": 3.896551724137931e-05, "loss": 0.1307, "step": 1920 }, { "epoch": 2.226762298051148, "grad_norm": 0.34427744150161743, "learning_rate": 3.895977011494253e-05, "loss": 0.1279, "step": 1921 }, { "epoch": 2.2279214663478952, "grad_norm": 0.3101141154766083, "learning_rate": 3.895402298850575e-05, "loss": 0.1152, "step": 1922 }, { "epoch": 2.2290806346446423, "grad_norm": 0.3297821283340454, "learning_rate": 3.894827586206897e-05, "loss": 0.1249, "step": 1923 }, { "epoch": 2.2302398029413895, "grad_norm": 0.27412423491477966, "learning_rate": 3.894252873563218e-05, "loss": 0.1201, "step": 1924 }, { "epoch": 2.2313989712381366, "grad_norm": 0.2107117623090744, "learning_rate": 3.8936781609195404e-05, "loss": 0.1083, "step": 1925 }, { "epoch": 2.2325581395348837, "grad_norm": 0.3204614222049713, "learning_rate": 3.893103448275862e-05, "loss": 0.1319, "step": 1926 }, { "epoch": 2.233717307831631, "grad_norm": 0.32438015937805176, "learning_rate": 3.892528735632184e-05, "loss": 0.1343, "step": 1927 }, { "epoch": 2.234876476128378, "grad_norm": 0.22636090219020844, "learning_rate": 3.891954022988506e-05, "loss": 0.1092, "step": 1928 }, { "epoch": 2.236035644425125, "grad_norm": 0.27245032787323, "learning_rate": 3.891379310344828e-05, "loss": 0.1141, "step": 1929 }, { "epoch": 2.237194812721872, "grad_norm": 0.3505677580833435, "learning_rate": 3.89080459770115e-05, "loss": 0.1366, "step": 1930 }, { "epoch": 2.2383539810186193, "grad_norm": 0.2798290252685547, "learning_rate": 3.8902298850574714e-05, "loss": 0.1153, "step": 1931 }, { "epoch": 2.2395131493153664, "grad_norm": 0.20595116913318634, "learning_rate": 3.889655172413793e-05, "loss": 0.1109, "step": 1932 }, { "epoch": 2.2406723176121135, "grad_norm": 0.26562798023223877, "learning_rate": 3.889080459770115e-05, "loss": 0.1309, "step": 1933 }, { "epoch": 2.24183148590886, "grad_norm": 0.28645437955856323, "learning_rate": 3.888505747126437e-05, "loss": 0.124, "step": 1934 }, { "epoch": 2.2429906542056073, "grad_norm": 0.26411932706832886, "learning_rate": 3.8879310344827594e-05, "loss": 0.1231, "step": 1935 }, { "epoch": 2.2441498225023544, "grad_norm": 0.2324962168931961, "learning_rate": 3.887356321839081e-05, "loss": 0.1213, "step": 1936 }, { "epoch": 2.2453089907991015, "grad_norm": 0.2609705924987793, "learning_rate": 3.886781609195402e-05, "loss": 0.1246, "step": 1937 }, { "epoch": 2.2464681590958486, "grad_norm": 0.24605713784694672, "learning_rate": 3.8862068965517245e-05, "loss": 0.1193, "step": 1938 }, { "epoch": 2.2476273273925957, "grad_norm": 0.3854371905326843, "learning_rate": 3.885632183908046e-05, "loss": 0.1324, "step": 1939 }, { "epoch": 2.248786495689343, "grad_norm": 0.2347278594970703, "learning_rate": 3.8850574712643675e-05, "loss": 0.1185, "step": 1940 }, { "epoch": 2.24994566398609, "grad_norm": 0.25814783573150635, "learning_rate": 3.88448275862069e-05, "loss": 0.1163, "step": 1941 }, { "epoch": 2.251104832282837, "grad_norm": 0.2590005695819855, "learning_rate": 3.883908045977012e-05, "loss": 0.1159, "step": 1942 }, { "epoch": 2.252264000579584, "grad_norm": 0.2866111099720001, "learning_rate": 3.883333333333333e-05, "loss": 0.119, "step": 1943 }, { "epoch": 2.2534231688763313, "grad_norm": 0.20185330510139465, "learning_rate": 3.8827586206896555e-05, "loss": 0.1127, "step": 1944 }, { "epoch": 2.2545823371730784, "grad_norm": 0.2572873830795288, "learning_rate": 3.882183908045977e-05, "loss": 0.1235, "step": 1945 }, { "epoch": 2.2557415054698255, "grad_norm": 0.29143568873405457, "learning_rate": 3.881609195402299e-05, "loss": 0.1161, "step": 1946 }, { "epoch": 2.2569006737665727, "grad_norm": 0.25425946712493896, "learning_rate": 3.8810344827586206e-05, "loss": 0.1022, "step": 1947 }, { "epoch": 2.2580598420633198, "grad_norm": 0.25637203454971313, "learning_rate": 3.880459770114943e-05, "loss": 0.1139, "step": 1948 }, { "epoch": 2.2592190103600664, "grad_norm": 0.30507326126098633, "learning_rate": 3.879885057471265e-05, "loss": 0.1264, "step": 1949 }, { "epoch": 2.2603781786568136, "grad_norm": 0.28318384289741516, "learning_rate": 3.8793103448275865e-05, "loss": 0.1243, "step": 1950 }, { "epoch": 2.2615373469535607, "grad_norm": 0.3025624752044678, "learning_rate": 3.878735632183908e-05, "loss": 0.1223, "step": 1951 }, { "epoch": 2.262696515250308, "grad_norm": 0.3020472824573517, "learning_rate": 3.87816091954023e-05, "loss": 0.1205, "step": 1952 }, { "epoch": 2.263855683547055, "grad_norm": 0.3075084388256073, "learning_rate": 3.8775862068965516e-05, "loss": 0.1151, "step": 1953 }, { "epoch": 2.265014851843802, "grad_norm": 0.3189045786857605, "learning_rate": 3.877011494252874e-05, "loss": 0.1102, "step": 1954 }, { "epoch": 2.266174020140549, "grad_norm": 0.347697913646698, "learning_rate": 3.876436781609196e-05, "loss": 0.1333, "step": 1955 }, { "epoch": 2.2673331884372963, "grad_norm": 0.27193987369537354, "learning_rate": 3.8758620689655174e-05, "loss": 0.1237, "step": 1956 }, { "epoch": 2.2684923567340434, "grad_norm": 0.22569780051708221, "learning_rate": 3.8752873563218396e-05, "loss": 0.1018, "step": 1957 }, { "epoch": 2.2696515250307905, "grad_norm": 0.2848230302333832, "learning_rate": 3.874712643678161e-05, "loss": 0.1088, "step": 1958 }, { "epoch": 2.2708106933275376, "grad_norm": 0.20593014359474182, "learning_rate": 3.8741379310344826e-05, "loss": 0.1036, "step": 1959 }, { "epoch": 2.2719698616242847, "grad_norm": 0.2758769989013672, "learning_rate": 3.873563218390805e-05, "loss": 0.1227, "step": 1960 }, { "epoch": 2.273129029921032, "grad_norm": 0.2546898126602173, "learning_rate": 3.872988505747127e-05, "loss": 0.1075, "step": 1961 }, { "epoch": 2.2742881982177785, "grad_norm": 0.3119022250175476, "learning_rate": 3.8724137931034484e-05, "loss": 0.1343, "step": 1962 }, { "epoch": 2.275447366514526, "grad_norm": 0.2919405400753021, "learning_rate": 3.8718390804597706e-05, "loss": 0.1392, "step": 1963 }, { "epoch": 2.2766065348112727, "grad_norm": 0.29379087686538696, "learning_rate": 3.871264367816092e-05, "loss": 0.137, "step": 1964 }, { "epoch": 2.27776570310802, "grad_norm": 0.32056736946105957, "learning_rate": 3.870689655172414e-05, "loss": 0.1112, "step": 1965 }, { "epoch": 2.278924871404767, "grad_norm": 0.25349029898643494, "learning_rate": 3.870114942528736e-05, "loss": 0.1178, "step": 1966 }, { "epoch": 2.280084039701514, "grad_norm": 0.24394166469573975, "learning_rate": 3.869540229885057e-05, "loss": 0.1072, "step": 1967 }, { "epoch": 2.281243207998261, "grad_norm": 0.23791316151618958, "learning_rate": 3.8689655172413794e-05, "loss": 0.1076, "step": 1968 }, { "epoch": 2.2824023762950083, "grad_norm": 0.24641607701778412, "learning_rate": 3.8683908045977015e-05, "loss": 0.1218, "step": 1969 }, { "epoch": 2.2835615445917554, "grad_norm": 0.23730486631393433, "learning_rate": 3.867816091954023e-05, "loss": 0.1135, "step": 1970 }, { "epoch": 2.2847207128885025, "grad_norm": 0.2812163531780243, "learning_rate": 3.867241379310345e-05, "loss": 0.1196, "step": 1971 }, { "epoch": 2.2858798811852497, "grad_norm": 0.25188305974006653, "learning_rate": 3.866666666666667e-05, "loss": 0.1174, "step": 1972 }, { "epoch": 2.2870390494819968, "grad_norm": 0.2505231499671936, "learning_rate": 3.866091954022989e-05, "loss": 0.1175, "step": 1973 }, { "epoch": 2.288198217778744, "grad_norm": 0.36217159032821655, "learning_rate": 3.8655172413793103e-05, "loss": 0.1261, "step": 1974 }, { "epoch": 2.289357386075491, "grad_norm": 0.26633283495903015, "learning_rate": 3.8649425287356325e-05, "loss": 0.126, "step": 1975 }, { "epoch": 2.290516554372238, "grad_norm": 0.286703884601593, "learning_rate": 3.864367816091955e-05, "loss": 0.1083, "step": 1976 }, { "epoch": 2.291675722668985, "grad_norm": 0.2949945032596588, "learning_rate": 3.863793103448276e-05, "loss": 0.1201, "step": 1977 }, { "epoch": 2.292834890965732, "grad_norm": 0.22798071801662445, "learning_rate": 3.863218390804598e-05, "loss": 0.1158, "step": 1978 }, { "epoch": 2.293994059262479, "grad_norm": 0.2760827839374542, "learning_rate": 3.86264367816092e-05, "loss": 0.1322, "step": 1979 }, { "epoch": 2.295153227559226, "grad_norm": 0.25793665647506714, "learning_rate": 3.862068965517241e-05, "loss": 0.1113, "step": 1980 }, { "epoch": 2.2963123958559732, "grad_norm": 0.28689324855804443, "learning_rate": 3.861494252873563e-05, "loss": 0.1064, "step": 1981 }, { "epoch": 2.2974715641527204, "grad_norm": 0.2645778954029083, "learning_rate": 3.8609195402298857e-05, "loss": 0.1239, "step": 1982 }, { "epoch": 2.2986307324494675, "grad_norm": 0.26537734270095825, "learning_rate": 3.860344827586207e-05, "loss": 0.1201, "step": 1983 }, { "epoch": 2.2997899007462146, "grad_norm": 0.281444251537323, "learning_rate": 3.859770114942529e-05, "loss": 0.1199, "step": 1984 }, { "epoch": 2.3009490690429617, "grad_norm": 0.25172534584999084, "learning_rate": 3.859195402298851e-05, "loss": 0.1133, "step": 1985 }, { "epoch": 2.302108237339709, "grad_norm": 0.25721946358680725, "learning_rate": 3.858620689655172e-05, "loss": 0.1117, "step": 1986 }, { "epoch": 2.303267405636456, "grad_norm": 0.24373944103717804, "learning_rate": 3.8580459770114945e-05, "loss": 0.1043, "step": 1987 }, { "epoch": 2.304426573933203, "grad_norm": 0.2496458888053894, "learning_rate": 3.857471264367816e-05, "loss": 0.1147, "step": 1988 }, { "epoch": 2.30558574222995, "grad_norm": 0.21120178699493408, "learning_rate": 3.856896551724138e-05, "loss": 0.1069, "step": 1989 }, { "epoch": 2.3067449105266973, "grad_norm": 0.30783671140670776, "learning_rate": 3.85632183908046e-05, "loss": 0.125, "step": 1990 }, { "epoch": 2.3079040788234444, "grad_norm": 0.22991196811199188, "learning_rate": 3.855747126436782e-05, "loss": 0.1206, "step": 1991 }, { "epoch": 2.309063247120191, "grad_norm": 0.2827235758304596, "learning_rate": 3.855172413793104e-05, "loss": 0.1333, "step": 1992 }, { "epoch": 2.310222415416938, "grad_norm": 0.3185621500015259, "learning_rate": 3.8545977011494254e-05, "loss": 0.135, "step": 1993 }, { "epoch": 2.3113815837136853, "grad_norm": 0.25081896781921387, "learning_rate": 3.854022988505747e-05, "loss": 0.1197, "step": 1994 }, { "epoch": 2.3125407520104324, "grad_norm": 0.3022123873233795, "learning_rate": 3.853448275862069e-05, "loss": 0.1232, "step": 1995 }, { "epoch": 2.3136999203071795, "grad_norm": 0.3585253357887268, "learning_rate": 3.852873563218391e-05, "loss": 0.1371, "step": 1996 }, { "epoch": 2.3148590886039266, "grad_norm": 0.22381120920181274, "learning_rate": 3.852298850574713e-05, "loss": 0.1093, "step": 1997 }, { "epoch": 2.3160182569006738, "grad_norm": 0.3148757219314575, "learning_rate": 3.851724137931035e-05, "loss": 0.1487, "step": 1998 }, { "epoch": 2.317177425197421, "grad_norm": 0.298365980386734, "learning_rate": 3.8511494252873564e-05, "loss": 0.1225, "step": 1999 }, { "epoch": 2.318336593494168, "grad_norm": 0.27845144271850586, "learning_rate": 3.850574712643678e-05, "loss": 0.1202, "step": 2000 }, { "epoch": 2.319495761790915, "grad_norm": 0.2849871814250946, "learning_rate": 3.85e-05, "loss": 0.1209, "step": 2001 }, { "epoch": 2.320654930087662, "grad_norm": 0.30878233909606934, "learning_rate": 3.849425287356322e-05, "loss": 0.1256, "step": 2002 }, { "epoch": 2.3218140983844093, "grad_norm": 0.27193957567214966, "learning_rate": 3.8488505747126444e-05, "loss": 0.1206, "step": 2003 }, { "epoch": 2.3229732666811564, "grad_norm": 0.23535265028476715, "learning_rate": 3.848275862068966e-05, "loss": 0.1188, "step": 2004 }, { "epoch": 2.324132434977903, "grad_norm": 0.2573367953300476, "learning_rate": 3.8477011494252874e-05, "loss": 0.1165, "step": 2005 }, { "epoch": 2.3252916032746507, "grad_norm": 0.2889256179332733, "learning_rate": 3.8471264367816096e-05, "loss": 0.1293, "step": 2006 }, { "epoch": 2.3264507715713973, "grad_norm": 0.29052430391311646, "learning_rate": 3.846551724137931e-05, "loss": 0.1194, "step": 2007 }, { "epoch": 2.3276099398681445, "grad_norm": 0.2844145596027374, "learning_rate": 3.8459770114942525e-05, "loss": 0.1342, "step": 2008 }, { "epoch": 2.3287691081648916, "grad_norm": 0.2795712649822235, "learning_rate": 3.845402298850575e-05, "loss": 0.1216, "step": 2009 }, { "epoch": 2.3299282764616387, "grad_norm": 0.27312591671943665, "learning_rate": 3.844827586206897e-05, "loss": 0.1224, "step": 2010 }, { "epoch": 2.331087444758386, "grad_norm": 0.20868080854415894, "learning_rate": 3.844252873563219e-05, "loss": 0.1051, "step": 2011 }, { "epoch": 2.332246613055133, "grad_norm": 0.27325716614723206, "learning_rate": 3.8436781609195405e-05, "loss": 0.1154, "step": 2012 }, { "epoch": 2.33340578135188, "grad_norm": 0.2669987082481384, "learning_rate": 3.843103448275862e-05, "loss": 0.1131, "step": 2013 }, { "epoch": 2.334564949648627, "grad_norm": 0.25275173783302307, "learning_rate": 3.842528735632184e-05, "loss": 0.1209, "step": 2014 }, { "epoch": 2.3357241179453743, "grad_norm": 0.2534005045890808, "learning_rate": 3.841954022988506e-05, "loss": 0.1266, "step": 2015 }, { "epoch": 2.3368832862421214, "grad_norm": 0.26923778653144836, "learning_rate": 3.841379310344828e-05, "loss": 0.1289, "step": 2016 }, { "epoch": 2.3380424545388685, "grad_norm": 0.23470233380794525, "learning_rate": 3.84080459770115e-05, "loss": 0.1097, "step": 2017 }, { "epoch": 2.3392016228356156, "grad_norm": 0.29632678627967834, "learning_rate": 3.8402298850574715e-05, "loss": 0.1136, "step": 2018 }, { "epoch": 2.3403607911323627, "grad_norm": 0.22054457664489746, "learning_rate": 3.839655172413793e-05, "loss": 0.1087, "step": 2019 }, { "epoch": 2.3415199594291094, "grad_norm": 0.29391518235206604, "learning_rate": 3.839080459770115e-05, "loss": 0.1152, "step": 2020 }, { "epoch": 2.3426791277258565, "grad_norm": 0.2556963264942169, "learning_rate": 3.8385057471264366e-05, "loss": 0.1298, "step": 2021 }, { "epoch": 2.3438382960226036, "grad_norm": 0.20713378489017487, "learning_rate": 3.837931034482759e-05, "loss": 0.1121, "step": 2022 }, { "epoch": 2.3449974643193507, "grad_norm": 0.30257120728492737, "learning_rate": 3.837356321839081e-05, "loss": 0.1267, "step": 2023 }, { "epoch": 2.346156632616098, "grad_norm": 0.3050404489040375, "learning_rate": 3.8367816091954025e-05, "loss": 0.1361, "step": 2024 }, { "epoch": 2.347315800912845, "grad_norm": 0.1983451396226883, "learning_rate": 3.8362068965517246e-05, "loss": 0.1103, "step": 2025 }, { "epoch": 2.348474969209592, "grad_norm": 0.23524372279644012, "learning_rate": 3.835632183908046e-05, "loss": 0.1147, "step": 2026 }, { "epoch": 2.349634137506339, "grad_norm": 0.2787211537361145, "learning_rate": 3.8350574712643676e-05, "loss": 0.1175, "step": 2027 }, { "epoch": 2.3507933058030863, "grad_norm": 0.2609887421131134, "learning_rate": 3.83448275862069e-05, "loss": 0.118, "step": 2028 }, { "epoch": 2.3519524740998334, "grad_norm": 0.2685818076133728, "learning_rate": 3.833908045977011e-05, "loss": 0.1245, "step": 2029 }, { "epoch": 2.3531116423965805, "grad_norm": 0.25744444131851196, "learning_rate": 3.8333333333333334e-05, "loss": 0.1141, "step": 2030 }, { "epoch": 2.3542708106933277, "grad_norm": 0.22216452658176422, "learning_rate": 3.8327586206896556e-05, "loss": 0.1097, "step": 2031 }, { "epoch": 2.355429978990075, "grad_norm": 0.23163087666034698, "learning_rate": 3.832183908045977e-05, "loss": 0.1042, "step": 2032 }, { "epoch": 2.356589147286822, "grad_norm": 0.3125256896018982, "learning_rate": 3.831609195402299e-05, "loss": 0.1074, "step": 2033 }, { "epoch": 2.357748315583569, "grad_norm": 0.35050761699676514, "learning_rate": 3.831034482758621e-05, "loss": 0.1332, "step": 2034 }, { "epoch": 2.3589074838803157, "grad_norm": 0.2466825395822525, "learning_rate": 3.830459770114942e-05, "loss": 0.1192, "step": 2035 }, { "epoch": 2.360066652177063, "grad_norm": 0.26629340648651123, "learning_rate": 3.8298850574712644e-05, "loss": 0.1227, "step": 2036 }, { "epoch": 2.36122582047381, "grad_norm": 0.3505549728870392, "learning_rate": 3.8293103448275866e-05, "loss": 0.1138, "step": 2037 }, { "epoch": 2.362384988770557, "grad_norm": 0.22414235770702362, "learning_rate": 3.828735632183908e-05, "loss": 0.1087, "step": 2038 }, { "epoch": 2.363544157067304, "grad_norm": 0.23031394183635712, "learning_rate": 3.82816091954023e-05, "loss": 0.1178, "step": 2039 }, { "epoch": 2.3647033253640513, "grad_norm": 0.37229323387145996, "learning_rate": 3.827586206896552e-05, "loss": 0.1398, "step": 2040 }, { "epoch": 2.3658624936607984, "grad_norm": 0.2622588872909546, "learning_rate": 3.827011494252874e-05, "loss": 0.1189, "step": 2041 }, { "epoch": 2.3670216619575455, "grad_norm": 0.2855938673019409, "learning_rate": 3.8264367816091954e-05, "loss": 0.1256, "step": 2042 }, { "epoch": 2.3681808302542926, "grad_norm": 0.3094041049480438, "learning_rate": 3.8258620689655176e-05, "loss": 0.1268, "step": 2043 }, { "epoch": 2.3693399985510397, "grad_norm": 0.30594077706336975, "learning_rate": 3.82528735632184e-05, "loss": 0.1206, "step": 2044 }, { "epoch": 2.370499166847787, "grad_norm": 0.3137442469596863, "learning_rate": 3.824712643678161e-05, "loss": 0.1346, "step": 2045 }, { "epoch": 2.371658335144534, "grad_norm": 0.25371280312538147, "learning_rate": 3.824137931034483e-05, "loss": 0.121, "step": 2046 }, { "epoch": 2.372817503441281, "grad_norm": 0.24225755035877228, "learning_rate": 3.823563218390805e-05, "loss": 0.1124, "step": 2047 }, { "epoch": 2.3739766717380277, "grad_norm": 0.2342030107975006, "learning_rate": 3.8229885057471264e-05, "loss": 0.1014, "step": 2048 }, { "epoch": 2.3751358400347753, "grad_norm": 0.3285464942455292, "learning_rate": 3.8224137931034485e-05, "loss": 0.122, "step": 2049 }, { "epoch": 2.376295008331522, "grad_norm": 0.2837320864200592, "learning_rate": 3.82183908045977e-05, "loss": 0.1265, "step": 2050 }, { "epoch": 2.377454176628269, "grad_norm": 0.30253443121910095, "learning_rate": 3.821264367816092e-05, "loss": 0.1191, "step": 2051 }, { "epoch": 2.378613344925016, "grad_norm": 0.24019919335842133, "learning_rate": 3.8206896551724144e-05, "loss": 0.1035, "step": 2052 }, { "epoch": 2.3797725132217633, "grad_norm": 0.2411479502916336, "learning_rate": 3.820114942528736e-05, "loss": 0.1238, "step": 2053 }, { "epoch": 2.3809316815185104, "grad_norm": 0.2749464213848114, "learning_rate": 3.8195402298850573e-05, "loss": 0.1259, "step": 2054 }, { "epoch": 2.3820908498152575, "grad_norm": 0.2516106963157654, "learning_rate": 3.8189655172413795e-05, "loss": 0.1173, "step": 2055 }, { "epoch": 2.3832500181120047, "grad_norm": 0.2396615743637085, "learning_rate": 3.818390804597701e-05, "loss": 0.1057, "step": 2056 }, { "epoch": 2.3844091864087518, "grad_norm": 0.23807305097579956, "learning_rate": 3.817816091954023e-05, "loss": 0.1216, "step": 2057 }, { "epoch": 2.385568354705499, "grad_norm": 0.24773019552230835, "learning_rate": 3.8172413793103453e-05, "loss": 0.1243, "step": 2058 }, { "epoch": 2.386727523002246, "grad_norm": 0.21717150509357452, "learning_rate": 3.816666666666667e-05, "loss": 0.1101, "step": 2059 }, { "epoch": 2.387886691298993, "grad_norm": 0.28333330154418945, "learning_rate": 3.816091954022989e-05, "loss": 0.1183, "step": 2060 }, { "epoch": 2.3890458595957402, "grad_norm": 0.22554628551006317, "learning_rate": 3.8155172413793105e-05, "loss": 0.1276, "step": 2061 }, { "epoch": 2.3902050278924873, "grad_norm": 0.2866622507572174, "learning_rate": 3.814942528735632e-05, "loss": 0.1378, "step": 2062 }, { "epoch": 2.391364196189234, "grad_norm": 0.25763562321662903, "learning_rate": 3.814367816091954e-05, "loss": 0.1174, "step": 2063 }, { "epoch": 2.392523364485981, "grad_norm": 0.24066491425037384, "learning_rate": 3.813793103448276e-05, "loss": 0.1193, "step": 2064 }, { "epoch": 2.3936825327827282, "grad_norm": 0.27270743250846863, "learning_rate": 3.813218390804598e-05, "loss": 0.1219, "step": 2065 }, { "epoch": 2.3948417010794754, "grad_norm": 0.28559228777885437, "learning_rate": 3.81264367816092e-05, "loss": 0.1345, "step": 2066 }, { "epoch": 2.3960008693762225, "grad_norm": 0.27294817566871643, "learning_rate": 3.8120689655172415e-05, "loss": 0.1181, "step": 2067 }, { "epoch": 2.3971600376729696, "grad_norm": 0.2862413227558136, "learning_rate": 3.8114942528735636e-05, "loss": 0.1141, "step": 2068 }, { "epoch": 2.3983192059697167, "grad_norm": 0.26329490542411804, "learning_rate": 3.810919540229885e-05, "loss": 0.1233, "step": 2069 }, { "epoch": 2.399478374266464, "grad_norm": 0.25338056683540344, "learning_rate": 3.8103448275862066e-05, "loss": 0.1157, "step": 2070 }, { "epoch": 2.400637542563211, "grad_norm": 0.251355916261673, "learning_rate": 3.809770114942529e-05, "loss": 0.123, "step": 2071 }, { "epoch": 2.401796710859958, "grad_norm": 0.32914426922798157, "learning_rate": 3.809195402298851e-05, "loss": 0.1366, "step": 2072 }, { "epoch": 2.402955879156705, "grad_norm": 0.3044224977493286, "learning_rate": 3.8086206896551724e-05, "loss": 0.1273, "step": 2073 }, { "epoch": 2.4041150474534523, "grad_norm": 0.2599372863769531, "learning_rate": 3.8080459770114946e-05, "loss": 0.1143, "step": 2074 }, { "epoch": 2.4052742157501994, "grad_norm": 0.26074209809303284, "learning_rate": 3.807471264367816e-05, "loss": 0.121, "step": 2075 }, { "epoch": 2.406433384046946, "grad_norm": 0.2514488399028778, "learning_rate": 3.8068965517241376e-05, "loss": 0.123, "step": 2076 }, { "epoch": 2.4075925523436936, "grad_norm": 0.26624441146850586, "learning_rate": 3.80632183908046e-05, "loss": 0.1219, "step": 2077 }, { "epoch": 2.4087517206404403, "grad_norm": 0.2705528140068054, "learning_rate": 3.805747126436782e-05, "loss": 0.1155, "step": 2078 }, { "epoch": 2.4099108889371874, "grad_norm": 0.2879337668418884, "learning_rate": 3.805172413793104e-05, "loss": 0.1241, "step": 2079 }, { "epoch": 2.4110700572339345, "grad_norm": 0.2518971264362335, "learning_rate": 3.8045977011494256e-05, "loss": 0.1157, "step": 2080 }, { "epoch": 2.4122292255306816, "grad_norm": 0.3543863296508789, "learning_rate": 3.804022988505747e-05, "loss": 0.1295, "step": 2081 }, { "epoch": 2.4133883938274288, "grad_norm": 0.2539769113063812, "learning_rate": 3.803448275862069e-05, "loss": 0.1139, "step": 2082 }, { "epoch": 2.414547562124176, "grad_norm": 0.271513432264328, "learning_rate": 3.802873563218391e-05, "loss": 0.1255, "step": 2083 }, { "epoch": 2.415706730420923, "grad_norm": 0.3522006869316101, "learning_rate": 3.802298850574713e-05, "loss": 0.1294, "step": 2084 }, { "epoch": 2.41686589871767, "grad_norm": 0.27951931953430176, "learning_rate": 3.801724137931035e-05, "loss": 0.1331, "step": 2085 }, { "epoch": 2.418025067014417, "grad_norm": 0.2659072279930115, "learning_rate": 3.8011494252873566e-05, "loss": 0.1187, "step": 2086 }, { "epoch": 2.4191842353111643, "grad_norm": 0.23229147493839264, "learning_rate": 3.800574712643679e-05, "loss": 0.1146, "step": 2087 }, { "epoch": 2.4203434036079114, "grad_norm": 0.28749972581863403, "learning_rate": 3.8e-05, "loss": 0.123, "step": 2088 }, { "epoch": 2.4215025719046586, "grad_norm": 0.3175642788410187, "learning_rate": 3.799425287356322e-05, "loss": 0.1264, "step": 2089 }, { "epoch": 2.4226617402014057, "grad_norm": 0.25972867012023926, "learning_rate": 3.798850574712644e-05, "loss": 0.1193, "step": 2090 }, { "epoch": 2.4238209084981523, "grad_norm": 0.26062315702438354, "learning_rate": 3.7982758620689654e-05, "loss": 0.121, "step": 2091 }, { "epoch": 2.4249800767949, "grad_norm": 0.25162971019744873, "learning_rate": 3.7977011494252875e-05, "loss": 0.1115, "step": 2092 }, { "epoch": 2.4261392450916466, "grad_norm": 0.2526095509529114, "learning_rate": 3.79712643678161e-05, "loss": 0.1135, "step": 2093 }, { "epoch": 2.4272984133883937, "grad_norm": 0.25769245624542236, "learning_rate": 3.796551724137931e-05, "loss": 0.116, "step": 2094 }, { "epoch": 2.428457581685141, "grad_norm": 0.2587513029575348, "learning_rate": 3.7959770114942534e-05, "loss": 0.1046, "step": 2095 }, { "epoch": 2.429616749981888, "grad_norm": 0.2877984941005707, "learning_rate": 3.795402298850575e-05, "loss": 0.1147, "step": 2096 }, { "epoch": 2.430775918278635, "grad_norm": 0.28424739837646484, "learning_rate": 3.794827586206896e-05, "loss": 0.1188, "step": 2097 }, { "epoch": 2.431935086575382, "grad_norm": 0.38208964467048645, "learning_rate": 3.7942528735632185e-05, "loss": 0.1128, "step": 2098 }, { "epoch": 2.4330942548721293, "grad_norm": 0.27642908692359924, "learning_rate": 3.793678160919541e-05, "loss": 0.1336, "step": 2099 }, { "epoch": 2.4342534231688764, "grad_norm": 0.2712388336658478, "learning_rate": 3.793103448275862e-05, "loss": 0.1335, "step": 2100 }, { "epoch": 2.4354125914656235, "grad_norm": 0.3354480564594269, "learning_rate": 3.792528735632184e-05, "loss": 0.1292, "step": 2101 }, { "epoch": 2.4365717597623706, "grad_norm": 0.2718063294887543, "learning_rate": 3.791954022988506e-05, "loss": 0.1318, "step": 2102 }, { "epoch": 2.4377309280591177, "grad_norm": 0.2873290777206421, "learning_rate": 3.791379310344827e-05, "loss": 0.1184, "step": 2103 }, { "epoch": 2.438890096355865, "grad_norm": 0.2832820415496826, "learning_rate": 3.7908045977011495e-05, "loss": 0.1181, "step": 2104 }, { "epoch": 2.440049264652612, "grad_norm": 0.27394813299179077, "learning_rate": 3.7902298850574716e-05, "loss": 0.1117, "step": 2105 }, { "epoch": 2.4412084329493586, "grad_norm": 0.2313212752342224, "learning_rate": 3.789655172413794e-05, "loss": 0.1154, "step": 2106 }, { "epoch": 2.4423676012461057, "grad_norm": 0.2213948369026184, "learning_rate": 3.789080459770115e-05, "loss": 0.1214, "step": 2107 }, { "epoch": 2.443526769542853, "grad_norm": 0.2309620976448059, "learning_rate": 3.788505747126437e-05, "loss": 0.1104, "step": 2108 }, { "epoch": 2.4446859378396, "grad_norm": 0.24604758620262146, "learning_rate": 3.787931034482759e-05, "loss": 0.1153, "step": 2109 }, { "epoch": 2.445845106136347, "grad_norm": 0.21898497641086578, "learning_rate": 3.7873563218390804e-05, "loss": 0.1093, "step": 2110 }, { "epoch": 2.447004274433094, "grad_norm": 0.24178272485733032, "learning_rate": 3.786781609195402e-05, "loss": 0.1178, "step": 2111 }, { "epoch": 2.4481634427298413, "grad_norm": 0.28789666295051575, "learning_rate": 3.786206896551725e-05, "loss": 0.1074, "step": 2112 }, { "epoch": 2.4493226110265884, "grad_norm": 0.22450456023216248, "learning_rate": 3.785632183908046e-05, "loss": 0.1111, "step": 2113 }, { "epoch": 2.4504817793233356, "grad_norm": 0.23580452799797058, "learning_rate": 3.7850574712643684e-05, "loss": 0.1088, "step": 2114 }, { "epoch": 2.4516409476200827, "grad_norm": 0.26990747451782227, "learning_rate": 3.78448275862069e-05, "loss": 0.1343, "step": 2115 }, { "epoch": 2.45280011591683, "grad_norm": 0.3045923709869385, "learning_rate": 3.7839080459770114e-05, "loss": 0.1226, "step": 2116 }, { "epoch": 2.453959284213577, "grad_norm": 0.24597370624542236, "learning_rate": 3.7833333333333336e-05, "loss": 0.1329, "step": 2117 }, { "epoch": 2.455118452510324, "grad_norm": 0.32915717363357544, "learning_rate": 3.782758620689655e-05, "loss": 0.1257, "step": 2118 }, { "epoch": 2.4562776208070707, "grad_norm": 0.2542544901371002, "learning_rate": 3.782183908045977e-05, "loss": 0.1222, "step": 2119 }, { "epoch": 2.4574367891038182, "grad_norm": 0.28636133670806885, "learning_rate": 3.7816091954022994e-05, "loss": 0.1311, "step": 2120 }, { "epoch": 2.458595957400565, "grad_norm": 0.2496955841779709, "learning_rate": 3.781034482758621e-05, "loss": 0.1221, "step": 2121 }, { "epoch": 2.459755125697312, "grad_norm": 0.31861692667007446, "learning_rate": 3.7804597701149424e-05, "loss": 0.115, "step": 2122 }, { "epoch": 2.460914293994059, "grad_norm": 0.27043020725250244, "learning_rate": 3.7798850574712646e-05, "loss": 0.1132, "step": 2123 }, { "epoch": 2.4620734622908063, "grad_norm": 0.22890667617321014, "learning_rate": 3.779310344827586e-05, "loss": 0.1082, "step": 2124 }, { "epoch": 2.4632326305875534, "grad_norm": 0.27371424436569214, "learning_rate": 3.778735632183908e-05, "loss": 0.1231, "step": 2125 }, { "epoch": 2.4643917988843005, "grad_norm": 0.2712050974369049, "learning_rate": 3.7781609195402304e-05, "loss": 0.141, "step": 2126 }, { "epoch": 2.4655509671810476, "grad_norm": 0.26597630977630615, "learning_rate": 3.777586206896552e-05, "loss": 0.1193, "step": 2127 }, { "epoch": 2.4667101354777947, "grad_norm": 0.28506308794021606, "learning_rate": 3.777011494252874e-05, "loss": 0.1172, "step": 2128 }, { "epoch": 2.467869303774542, "grad_norm": 0.2775918245315552, "learning_rate": 3.7764367816091955e-05, "loss": 0.1131, "step": 2129 }, { "epoch": 2.469028472071289, "grad_norm": 0.2832791209220886, "learning_rate": 3.775862068965517e-05, "loss": 0.13, "step": 2130 }, { "epoch": 2.470187640368036, "grad_norm": 0.2610412836074829, "learning_rate": 3.775287356321839e-05, "loss": 0.1195, "step": 2131 }, { "epoch": 2.471346808664783, "grad_norm": 0.2954767346382141, "learning_rate": 3.774712643678161e-05, "loss": 0.1197, "step": 2132 }, { "epoch": 2.4725059769615303, "grad_norm": 0.30459877848625183, "learning_rate": 3.7741379310344835e-05, "loss": 0.1233, "step": 2133 }, { "epoch": 2.473665145258277, "grad_norm": 0.2544924318790436, "learning_rate": 3.773563218390805e-05, "loss": 0.1124, "step": 2134 }, { "epoch": 2.474824313555024, "grad_norm": 0.3522142469882965, "learning_rate": 3.7729885057471265e-05, "loss": 0.1125, "step": 2135 }, { "epoch": 2.475983481851771, "grad_norm": 0.2694511115550995, "learning_rate": 3.772413793103449e-05, "loss": 0.117, "step": 2136 }, { "epoch": 2.4771426501485183, "grad_norm": 0.261785626411438, "learning_rate": 3.77183908045977e-05, "loss": 0.112, "step": 2137 }, { "epoch": 2.4783018184452654, "grad_norm": 0.24989941716194153, "learning_rate": 3.7712643678160917e-05, "loss": 0.1075, "step": 2138 }, { "epoch": 2.4794609867420125, "grad_norm": 0.3909321427345276, "learning_rate": 3.770689655172414e-05, "loss": 0.1222, "step": 2139 }, { "epoch": 2.4806201550387597, "grad_norm": 0.2468433380126953, "learning_rate": 3.770114942528736e-05, "loss": 0.1193, "step": 2140 }, { "epoch": 2.4817793233355068, "grad_norm": 0.27097734808921814, "learning_rate": 3.7695402298850575e-05, "loss": 0.1379, "step": 2141 }, { "epoch": 2.482938491632254, "grad_norm": 0.26425981521606445, "learning_rate": 3.7689655172413797e-05, "loss": 0.1156, "step": 2142 }, { "epoch": 2.484097659929001, "grad_norm": 0.23942025005817413, "learning_rate": 3.768390804597701e-05, "loss": 0.1217, "step": 2143 }, { "epoch": 2.485256828225748, "grad_norm": 0.27993398904800415, "learning_rate": 3.767816091954023e-05, "loss": 0.1239, "step": 2144 }, { "epoch": 2.4864159965224952, "grad_norm": 0.2943548262119293, "learning_rate": 3.767241379310345e-05, "loss": 0.1265, "step": 2145 }, { "epoch": 2.4875751648192423, "grad_norm": 0.33951184153556824, "learning_rate": 3.766666666666667e-05, "loss": 0.1267, "step": 2146 }, { "epoch": 2.4887343331159895, "grad_norm": 0.2834022641181946, "learning_rate": 3.766091954022989e-05, "loss": 0.1261, "step": 2147 }, { "epoch": 2.4898935014127366, "grad_norm": 0.2666892111301422, "learning_rate": 3.7655172413793106e-05, "loss": 0.1148, "step": 2148 }, { "epoch": 2.4910526697094832, "grad_norm": 0.22419312596321106, "learning_rate": 3.764942528735632e-05, "loss": 0.1095, "step": 2149 }, { "epoch": 2.4922118380062304, "grad_norm": 0.2352280616760254, "learning_rate": 3.764367816091954e-05, "loss": 0.1183, "step": 2150 }, { "epoch": 2.4933710063029775, "grad_norm": 0.2600967288017273, "learning_rate": 3.763793103448276e-05, "loss": 0.1197, "step": 2151 }, { "epoch": 2.4945301745997246, "grad_norm": 0.25782260298728943, "learning_rate": 3.763218390804598e-05, "loss": 0.1201, "step": 2152 }, { "epoch": 2.4956893428964717, "grad_norm": 0.2279644012451172, "learning_rate": 3.76264367816092e-05, "loss": 0.1141, "step": 2153 }, { "epoch": 2.496848511193219, "grad_norm": 0.3026202917098999, "learning_rate": 3.7620689655172416e-05, "loss": 0.1371, "step": 2154 }, { "epoch": 2.498007679489966, "grad_norm": 0.18554648756980896, "learning_rate": 3.761494252873564e-05, "loss": 0.0954, "step": 2155 }, { "epoch": 2.499166847786713, "grad_norm": 0.22495919466018677, "learning_rate": 3.760919540229885e-05, "loss": 0.1215, "step": 2156 }, { "epoch": 2.50032601608346, "grad_norm": 0.26901552081108093, "learning_rate": 3.760344827586207e-05, "loss": 0.1227, "step": 2157 }, { "epoch": 2.5014851843802073, "grad_norm": 0.2286049723625183, "learning_rate": 3.759770114942529e-05, "loss": 0.1266, "step": 2158 }, { "epoch": 2.5026443526769544, "grad_norm": 0.2322368025779724, "learning_rate": 3.7591954022988504e-05, "loss": 0.1073, "step": 2159 }, { "epoch": 2.5038035209737015, "grad_norm": 0.24827679991722107, "learning_rate": 3.7586206896551726e-05, "loss": 0.1278, "step": 2160 }, { "epoch": 2.5049626892704486, "grad_norm": 0.24690257012844086, "learning_rate": 3.758045977011495e-05, "loss": 0.1126, "step": 2161 }, { "epoch": 2.5061218575671953, "grad_norm": 0.2480195313692093, "learning_rate": 3.757471264367816e-05, "loss": 0.119, "step": 2162 }, { "epoch": 2.507281025863943, "grad_norm": 0.21804195642471313, "learning_rate": 3.7568965517241384e-05, "loss": 0.1119, "step": 2163 }, { "epoch": 2.5084401941606895, "grad_norm": 0.25382915139198303, "learning_rate": 3.75632183908046e-05, "loss": 0.1198, "step": 2164 }, { "epoch": 2.5095993624574366, "grad_norm": 0.24101321399211884, "learning_rate": 3.7557471264367814e-05, "loss": 0.1187, "step": 2165 }, { "epoch": 2.5107585307541838, "grad_norm": 0.28679129481315613, "learning_rate": 3.7551724137931035e-05, "loss": 0.127, "step": 2166 }, { "epoch": 2.511917699050931, "grad_norm": 0.2896977365016937, "learning_rate": 3.754597701149426e-05, "loss": 0.1246, "step": 2167 }, { "epoch": 2.513076867347678, "grad_norm": 0.31614774465560913, "learning_rate": 3.754022988505747e-05, "loss": 0.1283, "step": 2168 }, { "epoch": 2.514236035644425, "grad_norm": 0.29304152727127075, "learning_rate": 3.7534482758620694e-05, "loss": 0.1263, "step": 2169 }, { "epoch": 2.515395203941172, "grad_norm": 0.29353946447372437, "learning_rate": 3.752873563218391e-05, "loss": 0.1148, "step": 2170 }, { "epoch": 2.5165543722379193, "grad_norm": 0.28586602210998535, "learning_rate": 3.752298850574713e-05, "loss": 0.1202, "step": 2171 }, { "epoch": 2.5177135405346664, "grad_norm": 0.29421526193618774, "learning_rate": 3.7517241379310345e-05, "loss": 0.1329, "step": 2172 }, { "epoch": 2.5188727088314136, "grad_norm": 0.23773986101150513, "learning_rate": 3.751149425287357e-05, "loss": 0.1212, "step": 2173 }, { "epoch": 2.5200318771281607, "grad_norm": 0.2569189965724945, "learning_rate": 3.750574712643679e-05, "loss": 0.1142, "step": 2174 }, { "epoch": 2.521191045424908, "grad_norm": 0.2963407337665558, "learning_rate": 3.7500000000000003e-05, "loss": 0.1409, "step": 2175 }, { "epoch": 2.522350213721655, "grad_norm": 0.24243849515914917, "learning_rate": 3.749425287356322e-05, "loss": 0.1304, "step": 2176 }, { "epoch": 2.5235093820184016, "grad_norm": 0.27366575598716736, "learning_rate": 3.748850574712644e-05, "loss": 0.1381, "step": 2177 }, { "epoch": 2.524668550315149, "grad_norm": 0.2454218864440918, "learning_rate": 3.7482758620689655e-05, "loss": 0.1145, "step": 2178 }, { "epoch": 2.525827718611896, "grad_norm": 0.23390009999275208, "learning_rate": 3.747701149425287e-05, "loss": 0.1131, "step": 2179 }, { "epoch": 2.526986886908643, "grad_norm": 0.24678659439086914, "learning_rate": 3.747126436781609e-05, "loss": 0.1215, "step": 2180 }, { "epoch": 2.52814605520539, "grad_norm": 0.2842147648334503, "learning_rate": 3.746551724137931e-05, "loss": 0.1298, "step": 2181 }, { "epoch": 2.529305223502137, "grad_norm": 0.3027459979057312, "learning_rate": 3.7459770114942535e-05, "loss": 0.1212, "step": 2182 }, { "epoch": 2.5304643917988843, "grad_norm": 0.31403622031211853, "learning_rate": 3.745402298850575e-05, "loss": 0.1231, "step": 2183 }, { "epoch": 2.5316235600956314, "grad_norm": 0.2930856943130493, "learning_rate": 3.7448275862068965e-05, "loss": 0.13, "step": 2184 }, { "epoch": 2.5327827283923785, "grad_norm": 0.2218947857618332, "learning_rate": 3.7442528735632186e-05, "loss": 0.1079, "step": 2185 }, { "epoch": 2.5339418966891256, "grad_norm": 0.2198045253753662, "learning_rate": 3.74367816091954e-05, "loss": 0.1077, "step": 2186 }, { "epoch": 2.5351010649858727, "grad_norm": 0.198323592543602, "learning_rate": 3.743103448275862e-05, "loss": 0.11, "step": 2187 }, { "epoch": 2.53626023328262, "grad_norm": 0.258024662733078, "learning_rate": 3.7425287356321845e-05, "loss": 0.1162, "step": 2188 }, { "epoch": 2.537419401579367, "grad_norm": 0.3027409315109253, "learning_rate": 3.741954022988506e-05, "loss": 0.1383, "step": 2189 }, { "epoch": 2.5385785698761136, "grad_norm": 0.27792859077453613, "learning_rate": 3.741379310344828e-05, "loss": 0.1138, "step": 2190 }, { "epoch": 2.539737738172861, "grad_norm": 0.2331332266330719, "learning_rate": 3.7408045977011496e-05, "loss": 0.1065, "step": 2191 }, { "epoch": 2.540896906469608, "grad_norm": 0.21128974854946136, "learning_rate": 3.740229885057471e-05, "loss": 0.1197, "step": 2192 }, { "epoch": 2.542056074766355, "grad_norm": 0.2796282172203064, "learning_rate": 3.739655172413793e-05, "loss": 0.1131, "step": 2193 }, { "epoch": 2.543215243063102, "grad_norm": 0.24139229953289032, "learning_rate": 3.7390804597701154e-05, "loss": 0.1165, "step": 2194 }, { "epoch": 2.544374411359849, "grad_norm": 0.22860391438007355, "learning_rate": 3.738505747126437e-05, "loss": 0.1176, "step": 2195 }, { "epoch": 2.5455335796565963, "grad_norm": 0.2661796510219574, "learning_rate": 3.737931034482759e-05, "loss": 0.1295, "step": 2196 }, { "epoch": 2.5466927479533434, "grad_norm": 0.28988024592399597, "learning_rate": 3.7373563218390806e-05, "loss": 0.1363, "step": 2197 }, { "epoch": 2.5478519162500906, "grad_norm": 0.26467469334602356, "learning_rate": 3.736781609195402e-05, "loss": 0.1078, "step": 2198 }, { "epoch": 2.5490110845468377, "grad_norm": 0.3318430483341217, "learning_rate": 3.736206896551724e-05, "loss": 0.1299, "step": 2199 }, { "epoch": 2.550170252843585, "grad_norm": 0.24909447133541107, "learning_rate": 3.735632183908046e-05, "loss": 0.1019, "step": 2200 }, { "epoch": 2.551329421140332, "grad_norm": 0.3391667604446411, "learning_rate": 3.735057471264368e-05, "loss": 0.1336, "step": 2201 }, { "epoch": 2.552488589437079, "grad_norm": 0.2599574327468872, "learning_rate": 3.73448275862069e-05, "loss": 0.1154, "step": 2202 }, { "epoch": 2.553647757733826, "grad_norm": 0.22761644423007965, "learning_rate": 3.7339080459770116e-05, "loss": 0.0995, "step": 2203 }, { "epoch": 2.5548069260305732, "grad_norm": 0.2874188721179962, "learning_rate": 3.733333333333334e-05, "loss": 0.1172, "step": 2204 }, { "epoch": 2.55596609432732, "grad_norm": 0.2598547041416168, "learning_rate": 3.732758620689655e-05, "loss": 0.1315, "step": 2205 }, { "epoch": 2.5571252626240675, "grad_norm": 0.2864150106906891, "learning_rate": 3.732183908045977e-05, "loss": 0.1163, "step": 2206 }, { "epoch": 2.558284430920814, "grad_norm": 0.3307492434978485, "learning_rate": 3.731609195402299e-05, "loss": 0.1267, "step": 2207 }, { "epoch": 2.5594435992175613, "grad_norm": 0.2648185193538666, "learning_rate": 3.731034482758621e-05, "loss": 0.1218, "step": 2208 }, { "epoch": 2.5606027675143084, "grad_norm": 0.21011538803577423, "learning_rate": 3.730459770114943e-05, "loss": 0.1069, "step": 2209 }, { "epoch": 2.5617619358110555, "grad_norm": 0.31918808817863464, "learning_rate": 3.729885057471265e-05, "loss": 0.1256, "step": 2210 }, { "epoch": 2.5629211041078026, "grad_norm": 0.28522172570228577, "learning_rate": 3.729310344827586e-05, "loss": 0.1294, "step": 2211 }, { "epoch": 2.5640802724045497, "grad_norm": 0.21739451587200165, "learning_rate": 3.7287356321839084e-05, "loss": 0.1001, "step": 2212 }, { "epoch": 2.565239440701297, "grad_norm": 0.23575380444526672, "learning_rate": 3.72816091954023e-05, "loss": 0.1192, "step": 2213 }, { "epoch": 2.566398608998044, "grad_norm": 0.20924952626228333, "learning_rate": 3.727586206896552e-05, "loss": 0.1079, "step": 2214 }, { "epoch": 2.567557777294791, "grad_norm": 0.31391969323158264, "learning_rate": 3.727011494252874e-05, "loss": 0.1236, "step": 2215 }, { "epoch": 2.568716945591538, "grad_norm": 0.2599903643131256, "learning_rate": 3.726436781609196e-05, "loss": 0.1175, "step": 2216 }, { "epoch": 2.5698761138882853, "grad_norm": 0.2094232141971588, "learning_rate": 3.725862068965517e-05, "loss": 0.1147, "step": 2217 }, { "epoch": 2.571035282185032, "grad_norm": 0.2692113518714905, "learning_rate": 3.725287356321839e-05, "loss": 0.125, "step": 2218 }, { "epoch": 2.5721944504817795, "grad_norm": 0.28311142325401306, "learning_rate": 3.724712643678161e-05, "loss": 0.1233, "step": 2219 }, { "epoch": 2.573353618778526, "grad_norm": 0.31285127997398376, "learning_rate": 3.724137931034483e-05, "loss": 0.1282, "step": 2220 }, { "epoch": 2.5745127870752738, "grad_norm": 0.2766589820384979, "learning_rate": 3.7235632183908045e-05, "loss": 0.1163, "step": 2221 }, { "epoch": 2.5756719553720204, "grad_norm": 0.26953697204589844, "learning_rate": 3.7229885057471267e-05, "loss": 0.1296, "step": 2222 }, { "epoch": 2.5768311236687675, "grad_norm": 0.23072504997253418, "learning_rate": 3.722413793103449e-05, "loss": 0.1202, "step": 2223 }, { "epoch": 2.5779902919655147, "grad_norm": 0.22109727561473846, "learning_rate": 3.72183908045977e-05, "loss": 0.1187, "step": 2224 }, { "epoch": 2.5791494602622618, "grad_norm": 0.28811052441596985, "learning_rate": 3.721264367816092e-05, "loss": 0.1201, "step": 2225 }, { "epoch": 2.580308628559009, "grad_norm": 0.31841617822647095, "learning_rate": 3.720689655172414e-05, "loss": 0.1092, "step": 2226 }, { "epoch": 2.581467796855756, "grad_norm": 0.27956047654151917, "learning_rate": 3.7201149425287355e-05, "loss": 0.1238, "step": 2227 }, { "epoch": 2.582626965152503, "grad_norm": 0.28410202264785767, "learning_rate": 3.7195402298850576e-05, "loss": 0.122, "step": 2228 }, { "epoch": 2.5837861334492502, "grad_norm": 0.23132778704166412, "learning_rate": 3.71896551724138e-05, "loss": 0.1221, "step": 2229 }, { "epoch": 2.5849453017459973, "grad_norm": 0.27684658765792847, "learning_rate": 3.718390804597701e-05, "loss": 0.1099, "step": 2230 }, { "epoch": 2.5861044700427445, "grad_norm": 0.241298645734787, "learning_rate": 3.7178160919540235e-05, "loss": 0.11, "step": 2231 }, { "epoch": 2.5872636383394916, "grad_norm": 0.23437266051769257, "learning_rate": 3.717241379310345e-05, "loss": 0.1234, "step": 2232 }, { "epoch": 2.5884228066362382, "grad_norm": 0.27701497077941895, "learning_rate": 3.7166666666666664e-05, "loss": 0.1105, "step": 2233 }, { "epoch": 2.589581974932986, "grad_norm": 0.24334724247455597, "learning_rate": 3.7160919540229886e-05, "loss": 0.1087, "step": 2234 }, { "epoch": 2.5907411432297325, "grad_norm": 0.25163525342941284, "learning_rate": 3.715517241379311e-05, "loss": 0.1084, "step": 2235 }, { "epoch": 2.5919003115264796, "grad_norm": 0.313559353351593, "learning_rate": 3.714942528735632e-05, "loss": 0.1397, "step": 2236 }, { "epoch": 2.5930594798232267, "grad_norm": 0.2449546456336975, "learning_rate": 3.7143678160919544e-05, "loss": 0.1127, "step": 2237 }, { "epoch": 2.594218648119974, "grad_norm": 0.2583812475204468, "learning_rate": 3.713793103448276e-05, "loss": 0.1089, "step": 2238 }, { "epoch": 2.595377816416721, "grad_norm": 0.2844179570674896, "learning_rate": 3.713218390804598e-05, "loss": 0.1166, "step": 2239 }, { "epoch": 2.596536984713468, "grad_norm": 0.2738819718360901, "learning_rate": 3.7126436781609196e-05, "loss": 0.1199, "step": 2240 }, { "epoch": 2.597696153010215, "grad_norm": 0.253930002450943, "learning_rate": 3.712068965517241e-05, "loss": 0.1134, "step": 2241 }, { "epoch": 2.5988553213069623, "grad_norm": 0.2768709659576416, "learning_rate": 3.711494252873563e-05, "loss": 0.1121, "step": 2242 }, { "epoch": 2.6000144896037094, "grad_norm": 0.2729703187942505, "learning_rate": 3.7109195402298854e-05, "loss": 0.1197, "step": 2243 }, { "epoch": 2.6011736579004565, "grad_norm": 0.3085278570652008, "learning_rate": 3.710344827586207e-05, "loss": 0.1363, "step": 2244 }, { "epoch": 2.6023328261972036, "grad_norm": 0.2976699471473694, "learning_rate": 3.709770114942529e-05, "loss": 0.1215, "step": 2245 }, { "epoch": 2.6034919944939507, "grad_norm": 0.24314960837364197, "learning_rate": 3.7091954022988505e-05, "loss": 0.1124, "step": 2246 }, { "epoch": 2.604651162790698, "grad_norm": 0.20558379590511322, "learning_rate": 3.708620689655173e-05, "loss": 0.1064, "step": 2247 }, { "epoch": 2.6058103310874445, "grad_norm": 0.2385900467634201, "learning_rate": 3.708045977011494e-05, "loss": 0.1125, "step": 2248 }, { "epoch": 2.606969499384192, "grad_norm": 0.3381904363632202, "learning_rate": 3.7074712643678164e-05, "loss": 0.1222, "step": 2249 }, { "epoch": 2.6081286676809388, "grad_norm": 0.24625416100025177, "learning_rate": 3.7068965517241385e-05, "loss": 0.1172, "step": 2250 }, { "epoch": 2.609287835977686, "grad_norm": 0.2692759931087494, "learning_rate": 3.70632183908046e-05, "loss": 0.1275, "step": 2251 }, { "epoch": 2.610447004274433, "grad_norm": 0.2261049598455429, "learning_rate": 3.7057471264367815e-05, "loss": 0.1132, "step": 2252 }, { "epoch": 2.61160617257118, "grad_norm": 0.2826228141784668, "learning_rate": 3.705172413793104e-05, "loss": 0.1276, "step": 2253 }, { "epoch": 2.612765340867927, "grad_norm": 0.24635249376296997, "learning_rate": 3.704597701149425e-05, "loss": 0.1231, "step": 2254 }, { "epoch": 2.6139245091646743, "grad_norm": 0.2234896570444107, "learning_rate": 3.7040229885057473e-05, "loss": 0.1048, "step": 2255 }, { "epoch": 2.6150836774614215, "grad_norm": 0.2619444727897644, "learning_rate": 3.7034482758620695e-05, "loss": 0.1306, "step": 2256 }, { "epoch": 2.6162428457581686, "grad_norm": 0.3317265808582306, "learning_rate": 3.702873563218391e-05, "loss": 0.1275, "step": 2257 }, { "epoch": 2.6174020140549157, "grad_norm": 0.2708195447921753, "learning_rate": 3.702298850574713e-05, "loss": 0.1133, "step": 2258 }, { "epoch": 2.618561182351663, "grad_norm": 0.2827620208263397, "learning_rate": 3.701724137931035e-05, "loss": 0.1307, "step": 2259 }, { "epoch": 2.61972035064841, "grad_norm": 0.24215327203273773, "learning_rate": 3.701149425287356e-05, "loss": 0.1187, "step": 2260 }, { "epoch": 2.6208795189451566, "grad_norm": 0.2541255056858063, "learning_rate": 3.700574712643678e-05, "loss": 0.1215, "step": 2261 }, { "epoch": 2.622038687241904, "grad_norm": 0.2253810614347458, "learning_rate": 3.7e-05, "loss": 0.1169, "step": 2262 }, { "epoch": 2.623197855538651, "grad_norm": 0.27330759167671204, "learning_rate": 3.699425287356322e-05, "loss": 0.1285, "step": 2263 }, { "epoch": 2.6243570238353984, "grad_norm": 0.2241210639476776, "learning_rate": 3.698850574712644e-05, "loss": 0.1143, "step": 2264 }, { "epoch": 2.625516192132145, "grad_norm": 0.28012943267822266, "learning_rate": 3.6982758620689656e-05, "loss": 0.1303, "step": 2265 }, { "epoch": 2.626675360428892, "grad_norm": 0.2770480215549469, "learning_rate": 3.697701149425288e-05, "loss": 0.1294, "step": 2266 }, { "epoch": 2.6278345287256393, "grad_norm": 0.22335301339626312, "learning_rate": 3.697126436781609e-05, "loss": 0.1223, "step": 2267 }, { "epoch": 2.6289936970223864, "grad_norm": 0.2822023332118988, "learning_rate": 3.696551724137931e-05, "loss": 0.123, "step": 2268 }, { "epoch": 2.6301528653191335, "grad_norm": 0.2653454840183258, "learning_rate": 3.695977011494253e-05, "loss": 0.1153, "step": 2269 }, { "epoch": 2.6313120336158806, "grad_norm": 0.2543680965900421, "learning_rate": 3.695402298850575e-05, "loss": 0.1211, "step": 2270 }, { "epoch": 2.6324712019126277, "grad_norm": 0.3247624635696411, "learning_rate": 3.6948275862068966e-05, "loss": 0.1232, "step": 2271 }, { "epoch": 2.633630370209375, "grad_norm": 0.30818063020706177, "learning_rate": 3.694252873563219e-05, "loss": 0.1255, "step": 2272 }, { "epoch": 2.634789538506122, "grad_norm": 0.2480584979057312, "learning_rate": 3.69367816091954e-05, "loss": 0.1266, "step": 2273 }, { "epoch": 2.635948706802869, "grad_norm": 0.2562277615070343, "learning_rate": 3.6931034482758624e-05, "loss": 0.1245, "step": 2274 }, { "epoch": 2.637107875099616, "grad_norm": 0.21452297270298004, "learning_rate": 3.692528735632184e-05, "loss": 0.112, "step": 2275 }, { "epoch": 2.638267043396363, "grad_norm": 0.20973417162895203, "learning_rate": 3.691954022988506e-05, "loss": 0.1148, "step": 2276 }, { "epoch": 2.6394262116931104, "grad_norm": 0.2719014286994934, "learning_rate": 3.691379310344828e-05, "loss": 0.1165, "step": 2277 }, { "epoch": 2.640585379989857, "grad_norm": 0.2603360414505005, "learning_rate": 3.69080459770115e-05, "loss": 0.1217, "step": 2278 }, { "epoch": 2.641744548286604, "grad_norm": 0.2589358985424042, "learning_rate": 3.690229885057471e-05, "loss": 0.1261, "step": 2279 }, { "epoch": 2.6429037165833513, "grad_norm": 0.2366466522216797, "learning_rate": 3.6896551724137934e-05, "loss": 0.112, "step": 2280 }, { "epoch": 2.6440628848800984, "grad_norm": 0.3076455891132355, "learning_rate": 3.689080459770115e-05, "loss": 0.1337, "step": 2281 }, { "epoch": 2.6452220531768456, "grad_norm": 0.24393005669116974, "learning_rate": 3.6885057471264364e-05, "loss": 0.1198, "step": 2282 }, { "epoch": 2.6463812214735927, "grad_norm": 0.23066796362400055, "learning_rate": 3.6879310344827586e-05, "loss": 0.1197, "step": 2283 }, { "epoch": 2.64754038977034, "grad_norm": 0.22284121811389923, "learning_rate": 3.687356321839081e-05, "loss": 0.1076, "step": 2284 }, { "epoch": 2.648699558067087, "grad_norm": 0.2574465870857239, "learning_rate": 3.686781609195403e-05, "loss": 0.1185, "step": 2285 }, { "epoch": 2.649858726363834, "grad_norm": 0.2645076811313629, "learning_rate": 3.6862068965517244e-05, "loss": 0.1175, "step": 2286 }, { "epoch": 2.651017894660581, "grad_norm": 0.25155895948410034, "learning_rate": 3.685632183908046e-05, "loss": 0.1228, "step": 2287 }, { "epoch": 2.6521770629573282, "grad_norm": 0.26526379585266113, "learning_rate": 3.685057471264368e-05, "loss": 0.1196, "step": 2288 }, { "epoch": 2.6533362312540754, "grad_norm": 0.25388583540916443, "learning_rate": 3.6844827586206895e-05, "loss": 0.1181, "step": 2289 }, { "epoch": 2.6544953995508225, "grad_norm": 0.31535226106643677, "learning_rate": 3.683908045977012e-05, "loss": 0.1282, "step": 2290 }, { "epoch": 2.655654567847569, "grad_norm": 0.2896163761615753, "learning_rate": 3.683333333333334e-05, "loss": 0.1271, "step": 2291 }, { "epoch": 2.6568137361443167, "grad_norm": 0.26265934109687805, "learning_rate": 3.6827586206896554e-05, "loss": 0.1289, "step": 2292 }, { "epoch": 2.6579729044410634, "grad_norm": 0.25363272428512573, "learning_rate": 3.6821839080459775e-05, "loss": 0.1212, "step": 2293 }, { "epoch": 2.6591320727378105, "grad_norm": 0.304082453250885, "learning_rate": 3.681609195402299e-05, "loss": 0.1121, "step": 2294 }, { "epoch": 2.6602912410345576, "grad_norm": 0.21810661256313324, "learning_rate": 3.6810344827586205e-05, "loss": 0.1177, "step": 2295 }, { "epoch": 2.6614504093313047, "grad_norm": 0.24036329984664917, "learning_rate": 3.680459770114943e-05, "loss": 0.109, "step": 2296 }, { "epoch": 2.662609577628052, "grad_norm": 0.2967372238636017, "learning_rate": 3.679885057471265e-05, "loss": 0.1218, "step": 2297 }, { "epoch": 2.663768745924799, "grad_norm": 0.3007439374923706, "learning_rate": 3.679310344827586e-05, "loss": 0.1286, "step": 2298 }, { "epoch": 2.664927914221546, "grad_norm": 0.26587292551994324, "learning_rate": 3.6787356321839085e-05, "loss": 0.1242, "step": 2299 }, { "epoch": 2.666087082518293, "grad_norm": 0.2876141369342804, "learning_rate": 3.67816091954023e-05, "loss": 0.1322, "step": 2300 }, { "epoch": 2.6672462508150403, "grad_norm": 0.27488911151885986, "learning_rate": 3.6775862068965515e-05, "loss": 0.1218, "step": 2301 }, { "epoch": 2.6684054191117874, "grad_norm": 0.2519826292991638, "learning_rate": 3.6770114942528736e-05, "loss": 0.1138, "step": 2302 }, { "epoch": 2.6695645874085345, "grad_norm": 0.26412853598594666, "learning_rate": 3.676436781609195e-05, "loss": 0.1184, "step": 2303 }, { "epoch": 2.670723755705281, "grad_norm": 0.24107958376407623, "learning_rate": 3.675862068965518e-05, "loss": 0.1159, "step": 2304 }, { "epoch": 2.6718829240020288, "grad_norm": 0.25105899572372437, "learning_rate": 3.6752873563218395e-05, "loss": 0.1075, "step": 2305 }, { "epoch": 2.6730420922987754, "grad_norm": 0.23559515178203583, "learning_rate": 3.674712643678161e-05, "loss": 0.1225, "step": 2306 }, { "epoch": 2.674201260595523, "grad_norm": 0.23702873289585114, "learning_rate": 3.674137931034483e-05, "loss": 0.1221, "step": 2307 }, { "epoch": 2.6753604288922697, "grad_norm": 0.21996693313121796, "learning_rate": 3.6735632183908046e-05, "loss": 0.1152, "step": 2308 }, { "epoch": 2.6765195971890168, "grad_norm": 0.2361491173505783, "learning_rate": 3.672988505747126e-05, "loss": 0.115, "step": 2309 }, { "epoch": 2.677678765485764, "grad_norm": 0.24125604331493378, "learning_rate": 3.672413793103448e-05, "loss": 0.1197, "step": 2310 }, { "epoch": 2.678837933782511, "grad_norm": 0.23669910430908203, "learning_rate": 3.6718390804597704e-05, "loss": 0.1091, "step": 2311 }, { "epoch": 2.679997102079258, "grad_norm": 0.26078295707702637, "learning_rate": 3.6712643678160926e-05, "loss": 0.1059, "step": 2312 }, { "epoch": 2.6811562703760052, "grad_norm": 0.30441415309906006, "learning_rate": 3.670689655172414e-05, "loss": 0.1154, "step": 2313 }, { "epoch": 2.6823154386727524, "grad_norm": 0.22218701243400574, "learning_rate": 3.6701149425287356e-05, "loss": 0.1235, "step": 2314 }, { "epoch": 2.6834746069694995, "grad_norm": 0.25640806555747986, "learning_rate": 3.669540229885058e-05, "loss": 0.1218, "step": 2315 }, { "epoch": 2.6846337752662466, "grad_norm": 0.2934137284755707, "learning_rate": 3.668965517241379e-05, "loss": 0.1164, "step": 2316 }, { "epoch": 2.6857929435629937, "grad_norm": 0.27650073170661926, "learning_rate": 3.6683908045977014e-05, "loss": 0.1183, "step": 2317 }, { "epoch": 2.686952111859741, "grad_norm": 0.24524487555027008, "learning_rate": 3.6678160919540236e-05, "loss": 0.1197, "step": 2318 }, { "epoch": 2.6881112801564875, "grad_norm": 0.3019874095916748, "learning_rate": 3.667241379310345e-05, "loss": 0.1294, "step": 2319 }, { "epoch": 2.689270448453235, "grad_norm": 0.321593314409256, "learning_rate": 3.6666666666666666e-05, "loss": 0.1279, "step": 2320 }, { "epoch": 2.6904296167499817, "grad_norm": 0.26887890696525574, "learning_rate": 3.666091954022989e-05, "loss": 0.1225, "step": 2321 }, { "epoch": 2.691588785046729, "grad_norm": 0.24992501735687256, "learning_rate": 3.66551724137931e-05, "loss": 0.1136, "step": 2322 }, { "epoch": 2.692747953343476, "grad_norm": 0.21639297902584076, "learning_rate": 3.6649425287356324e-05, "loss": 0.1074, "step": 2323 }, { "epoch": 2.693907121640223, "grad_norm": 0.3038938045501709, "learning_rate": 3.6643678160919546e-05, "loss": 0.1307, "step": 2324 }, { "epoch": 2.69506628993697, "grad_norm": 0.31261420249938965, "learning_rate": 3.663793103448276e-05, "loss": 0.1392, "step": 2325 }, { "epoch": 2.6962254582337173, "grad_norm": 0.22381141781806946, "learning_rate": 3.663218390804598e-05, "loss": 0.1043, "step": 2326 }, { "epoch": 2.6973846265304644, "grad_norm": 0.28085264563560486, "learning_rate": 3.66264367816092e-05, "loss": 0.1145, "step": 2327 }, { "epoch": 2.6985437948272115, "grad_norm": 0.2581823468208313, "learning_rate": 3.662068965517241e-05, "loss": 0.1333, "step": 2328 }, { "epoch": 2.6997029631239586, "grad_norm": 0.25244686007499695, "learning_rate": 3.6614942528735634e-05, "loss": 0.1249, "step": 2329 }, { "epoch": 2.7008621314207057, "grad_norm": 0.24646331369876862, "learning_rate": 3.660919540229885e-05, "loss": 0.1142, "step": 2330 }, { "epoch": 2.702021299717453, "grad_norm": 0.2165023535490036, "learning_rate": 3.660344827586207e-05, "loss": 0.1225, "step": 2331 }, { "epoch": 2.7031804680142, "grad_norm": 0.25640419125556946, "learning_rate": 3.659770114942529e-05, "loss": 0.1242, "step": 2332 }, { "epoch": 2.704339636310947, "grad_norm": 0.33699721097946167, "learning_rate": 3.659195402298851e-05, "loss": 0.1284, "step": 2333 }, { "epoch": 2.7054988046076938, "grad_norm": 0.2463027685880661, "learning_rate": 3.658620689655173e-05, "loss": 0.1252, "step": 2334 }, { "epoch": 2.7066579729044413, "grad_norm": 0.2819848954677582, "learning_rate": 3.6580459770114943e-05, "loss": 0.1097, "step": 2335 }, { "epoch": 2.707817141201188, "grad_norm": 0.2654036283493042, "learning_rate": 3.657471264367816e-05, "loss": 0.1187, "step": 2336 }, { "epoch": 2.708976309497935, "grad_norm": 0.2798532545566559, "learning_rate": 3.656896551724138e-05, "loss": 0.1169, "step": 2337 }, { "epoch": 2.7101354777946822, "grad_norm": 0.24664916098117828, "learning_rate": 3.65632183908046e-05, "loss": 0.1337, "step": 2338 }, { "epoch": 2.7112946460914293, "grad_norm": 0.2289317101240158, "learning_rate": 3.655747126436782e-05, "loss": 0.1255, "step": 2339 }, { "epoch": 2.7124538143881765, "grad_norm": 0.28109264373779297, "learning_rate": 3.655172413793104e-05, "loss": 0.1219, "step": 2340 }, { "epoch": 2.7136129826849236, "grad_norm": 0.2645872235298157, "learning_rate": 3.654597701149425e-05, "loss": 0.1365, "step": 2341 }, { "epoch": 2.7147721509816707, "grad_norm": 0.2895812690258026, "learning_rate": 3.6540229885057475e-05, "loss": 0.1297, "step": 2342 }, { "epoch": 2.715931319278418, "grad_norm": 0.2900197207927704, "learning_rate": 3.653448275862069e-05, "loss": 0.1319, "step": 2343 }, { "epoch": 2.717090487575165, "grad_norm": 0.22161021828651428, "learning_rate": 3.6528735632183905e-05, "loss": 0.1153, "step": 2344 }, { "epoch": 2.718249655871912, "grad_norm": 0.28364619612693787, "learning_rate": 3.652298850574713e-05, "loss": 0.1261, "step": 2345 }, { "epoch": 2.719408824168659, "grad_norm": 0.2544122636318207, "learning_rate": 3.651724137931035e-05, "loss": 0.1175, "step": 2346 }, { "epoch": 2.720567992465406, "grad_norm": 0.21084167063236237, "learning_rate": 3.651149425287356e-05, "loss": 0.1011, "step": 2347 }, { "epoch": 2.7217271607621534, "grad_norm": 0.24158698320388794, "learning_rate": 3.6505747126436785e-05, "loss": 0.1125, "step": 2348 }, { "epoch": 2.7228863290589, "grad_norm": 0.2733836770057678, "learning_rate": 3.65e-05, "loss": 0.1187, "step": 2349 }, { "epoch": 2.724045497355647, "grad_norm": 0.32197657227516174, "learning_rate": 3.649425287356322e-05, "loss": 0.1252, "step": 2350 }, { "epoch": 2.7252046656523943, "grad_norm": 0.2622753381729126, "learning_rate": 3.6488505747126436e-05, "loss": 0.1251, "step": 2351 }, { "epoch": 2.7263638339491414, "grad_norm": 0.2564488351345062, "learning_rate": 3.648275862068966e-05, "loss": 0.1175, "step": 2352 }, { "epoch": 2.7275230022458885, "grad_norm": 0.29768943786621094, "learning_rate": 3.647701149425288e-05, "loss": 0.1133, "step": 2353 }, { "epoch": 2.7286821705426356, "grad_norm": 0.297397643327713, "learning_rate": 3.6471264367816094e-05, "loss": 0.121, "step": 2354 }, { "epoch": 2.7298413388393827, "grad_norm": 0.29082873463630676, "learning_rate": 3.646551724137931e-05, "loss": 0.1182, "step": 2355 }, { "epoch": 2.73100050713613, "grad_norm": 0.22852347791194916, "learning_rate": 3.645977011494253e-05, "loss": 0.1106, "step": 2356 }, { "epoch": 2.732159675432877, "grad_norm": 0.24832133948802948, "learning_rate": 3.6454022988505746e-05, "loss": 0.1103, "step": 2357 }, { "epoch": 2.733318843729624, "grad_norm": 0.2785024046897888, "learning_rate": 3.644827586206897e-05, "loss": 0.1094, "step": 2358 }, { "epoch": 2.734478012026371, "grad_norm": 0.25696471333503723, "learning_rate": 3.644252873563219e-05, "loss": 0.1202, "step": 2359 }, { "epoch": 2.7356371803231183, "grad_norm": 0.264342337846756, "learning_rate": 3.6436781609195404e-05, "loss": 0.1151, "step": 2360 }, { "epoch": 2.7367963486198654, "grad_norm": 0.21554605662822723, "learning_rate": 3.6431034482758626e-05, "loss": 0.1109, "step": 2361 }, { "epoch": 2.737955516916612, "grad_norm": 0.26814398169517517, "learning_rate": 3.642528735632184e-05, "loss": 0.1184, "step": 2362 }, { "epoch": 2.7391146852133597, "grad_norm": 0.24080701172351837, "learning_rate": 3.6419540229885056e-05, "loss": 0.1151, "step": 2363 }, { "epoch": 2.7402738535101063, "grad_norm": 0.2593724727630615, "learning_rate": 3.641379310344828e-05, "loss": 0.1201, "step": 2364 }, { "epoch": 2.7414330218068534, "grad_norm": 0.21561947464942932, "learning_rate": 3.64080459770115e-05, "loss": 0.1213, "step": 2365 }, { "epoch": 2.7425921901036006, "grad_norm": 0.3276143968105316, "learning_rate": 3.6402298850574714e-05, "loss": 0.1139, "step": 2366 }, { "epoch": 2.7437513584003477, "grad_norm": 0.26314690709114075, "learning_rate": 3.6396551724137936e-05, "loss": 0.1104, "step": 2367 }, { "epoch": 2.744910526697095, "grad_norm": 0.2540189027786255, "learning_rate": 3.639080459770115e-05, "loss": 0.1218, "step": 2368 }, { "epoch": 2.746069694993842, "grad_norm": 0.23746971786022186, "learning_rate": 3.638505747126437e-05, "loss": 0.1109, "step": 2369 }, { "epoch": 2.747228863290589, "grad_norm": 0.25298699736595154, "learning_rate": 3.637931034482759e-05, "loss": 0.1159, "step": 2370 }, { "epoch": 2.748388031587336, "grad_norm": 0.25313448905944824, "learning_rate": 3.63735632183908e-05, "loss": 0.1115, "step": 2371 }, { "epoch": 2.7495471998840832, "grad_norm": 0.2032177448272705, "learning_rate": 3.6367816091954024e-05, "loss": 0.1203, "step": 2372 }, { "epoch": 2.7507063681808304, "grad_norm": 0.3549204468727112, "learning_rate": 3.6362068965517245e-05, "loss": 0.1234, "step": 2373 }, { "epoch": 2.7518655364775775, "grad_norm": 0.2937655746936798, "learning_rate": 3.635632183908046e-05, "loss": 0.1222, "step": 2374 }, { "epoch": 2.753024704774324, "grad_norm": 0.23844723403453827, "learning_rate": 3.635057471264368e-05, "loss": 0.1168, "step": 2375 }, { "epoch": 2.7541838730710717, "grad_norm": 0.3607582747936249, "learning_rate": 3.63448275862069e-05, "loss": 0.1224, "step": 2376 }, { "epoch": 2.7553430413678184, "grad_norm": 0.29361268877983093, "learning_rate": 3.633908045977011e-05, "loss": 0.1209, "step": 2377 }, { "epoch": 2.756502209664566, "grad_norm": 0.2490907460451126, "learning_rate": 3.633333333333333e-05, "loss": 0.1157, "step": 2378 }, { "epoch": 2.7576613779613126, "grad_norm": 0.24479351937770844, "learning_rate": 3.6327586206896555e-05, "loss": 0.1194, "step": 2379 }, { "epoch": 2.7588205462580597, "grad_norm": 0.33146244287490845, "learning_rate": 3.632183908045978e-05, "loss": 0.1374, "step": 2380 }, { "epoch": 2.759979714554807, "grad_norm": 0.2993434965610504, "learning_rate": 3.631609195402299e-05, "loss": 0.1125, "step": 2381 }, { "epoch": 2.761138882851554, "grad_norm": 0.25163406133651733, "learning_rate": 3.6310344827586206e-05, "loss": 0.1168, "step": 2382 }, { "epoch": 2.762298051148301, "grad_norm": 0.2083575427532196, "learning_rate": 3.630459770114943e-05, "loss": 0.1033, "step": 2383 }, { "epoch": 2.763457219445048, "grad_norm": 0.3714704215526581, "learning_rate": 3.629885057471264e-05, "loss": 0.1262, "step": 2384 }, { "epoch": 2.7646163877417953, "grad_norm": 0.3778088688850403, "learning_rate": 3.6293103448275865e-05, "loss": 0.1243, "step": 2385 }, { "epoch": 2.7657755560385424, "grad_norm": 0.28580304980278015, "learning_rate": 3.6287356321839086e-05, "loss": 0.1188, "step": 2386 }, { "epoch": 2.7669347243352895, "grad_norm": 0.3345694839954376, "learning_rate": 3.62816091954023e-05, "loss": 0.1078, "step": 2387 }, { "epoch": 2.7680938926320366, "grad_norm": 0.2846716642379761, "learning_rate": 3.627586206896552e-05, "loss": 0.1142, "step": 2388 }, { "epoch": 2.7692530609287838, "grad_norm": 0.30462977290153503, "learning_rate": 3.627011494252874e-05, "loss": 0.1327, "step": 2389 }, { "epoch": 2.7704122292255304, "grad_norm": 0.2605188190937042, "learning_rate": 3.626436781609195e-05, "loss": 0.1099, "step": 2390 }, { "epoch": 2.771571397522278, "grad_norm": 0.2525080740451813, "learning_rate": 3.6258620689655174e-05, "loss": 0.1149, "step": 2391 }, { "epoch": 2.7727305658190247, "grad_norm": 0.2309272438287735, "learning_rate": 3.625287356321839e-05, "loss": 0.1232, "step": 2392 }, { "epoch": 2.7738897341157718, "grad_norm": 0.25411146879196167, "learning_rate": 3.624712643678161e-05, "loss": 0.138, "step": 2393 }, { "epoch": 2.775048902412519, "grad_norm": 0.2844621241092682, "learning_rate": 3.624137931034483e-05, "loss": 0.1282, "step": 2394 }, { "epoch": 2.776208070709266, "grad_norm": 0.29295527935028076, "learning_rate": 3.623563218390805e-05, "loss": 0.1213, "step": 2395 }, { "epoch": 2.777367239006013, "grad_norm": 0.3641514778137207, "learning_rate": 3.622988505747126e-05, "loss": 0.1313, "step": 2396 }, { "epoch": 2.7785264073027602, "grad_norm": 0.24175724387168884, "learning_rate": 3.6224137931034484e-05, "loss": 0.1199, "step": 2397 }, { "epoch": 2.7796855755995074, "grad_norm": 0.3517894744873047, "learning_rate": 3.62183908045977e-05, "loss": 0.1308, "step": 2398 }, { "epoch": 2.7808447438962545, "grad_norm": 0.24343693256378174, "learning_rate": 3.621264367816092e-05, "loss": 0.1162, "step": 2399 }, { "epoch": 2.7820039121930016, "grad_norm": 0.2250448763370514, "learning_rate": 3.620689655172414e-05, "loss": 0.112, "step": 2400 }, { "epoch": 2.7831630804897487, "grad_norm": 0.23824208974838257, "learning_rate": 3.620114942528736e-05, "loss": 0.1222, "step": 2401 }, { "epoch": 2.784322248786496, "grad_norm": 0.22008417546749115, "learning_rate": 3.619540229885058e-05, "loss": 0.1213, "step": 2402 }, { "epoch": 2.785481417083243, "grad_norm": 0.3020162582397461, "learning_rate": 3.6189655172413794e-05, "loss": 0.1329, "step": 2403 }, { "epoch": 2.78664058537999, "grad_norm": 0.21677182614803314, "learning_rate": 3.618390804597701e-05, "loss": 0.1186, "step": 2404 }, { "epoch": 2.7877997536767367, "grad_norm": 0.25215545296669006, "learning_rate": 3.617816091954023e-05, "loss": 0.117, "step": 2405 }, { "epoch": 2.7889589219734843, "grad_norm": 0.24186648428440094, "learning_rate": 3.617241379310345e-05, "loss": 0.1181, "step": 2406 }, { "epoch": 2.790118090270231, "grad_norm": 0.2735014259815216, "learning_rate": 3.6166666666666674e-05, "loss": 0.1276, "step": 2407 }, { "epoch": 2.791277258566978, "grad_norm": 0.20679566264152527, "learning_rate": 3.616091954022989e-05, "loss": 0.1146, "step": 2408 }, { "epoch": 2.792436426863725, "grad_norm": 0.25650107860565186, "learning_rate": 3.6155172413793104e-05, "loss": 0.1223, "step": 2409 }, { "epoch": 2.7935955951604723, "grad_norm": 0.22591929137706757, "learning_rate": 3.6149425287356325e-05, "loss": 0.1167, "step": 2410 }, { "epoch": 2.7947547634572194, "grad_norm": 0.2470989227294922, "learning_rate": 3.614367816091954e-05, "loss": 0.1116, "step": 2411 }, { "epoch": 2.7959139317539665, "grad_norm": 0.24131318926811218, "learning_rate": 3.6137931034482755e-05, "loss": 0.117, "step": 2412 }, { "epoch": 2.7970731000507136, "grad_norm": 0.2714328467845917, "learning_rate": 3.613218390804598e-05, "loss": 0.1246, "step": 2413 }, { "epoch": 2.7982322683474607, "grad_norm": 0.27446985244750977, "learning_rate": 3.61264367816092e-05, "loss": 0.1134, "step": 2414 }, { "epoch": 2.799391436644208, "grad_norm": 0.23050987720489502, "learning_rate": 3.6120689655172413e-05, "loss": 0.1099, "step": 2415 }, { "epoch": 2.800550604940955, "grad_norm": 0.27243009209632874, "learning_rate": 3.6114942528735635e-05, "loss": 0.1351, "step": 2416 }, { "epoch": 2.801709773237702, "grad_norm": 0.2836304306983948, "learning_rate": 3.610919540229885e-05, "loss": 0.1248, "step": 2417 }, { "epoch": 2.8028689415344488, "grad_norm": 0.27356666326522827, "learning_rate": 3.610344827586207e-05, "loss": 0.1193, "step": 2418 }, { "epoch": 2.8040281098311963, "grad_norm": 0.2828332185745239, "learning_rate": 3.6097701149425287e-05, "loss": 0.1253, "step": 2419 }, { "epoch": 2.805187278127943, "grad_norm": 0.2918018102645874, "learning_rate": 3.609195402298851e-05, "loss": 0.1309, "step": 2420 }, { "epoch": 2.8063464464246906, "grad_norm": 0.22664931416511536, "learning_rate": 3.608620689655173e-05, "loss": 0.1153, "step": 2421 }, { "epoch": 2.8075056147214372, "grad_norm": 0.2359132617712021, "learning_rate": 3.6080459770114945e-05, "loss": 0.1218, "step": 2422 }, { "epoch": 2.8086647830181843, "grad_norm": 0.26460394263267517, "learning_rate": 3.607471264367816e-05, "loss": 0.1309, "step": 2423 }, { "epoch": 2.8098239513149315, "grad_norm": 0.25006482005119324, "learning_rate": 3.606896551724138e-05, "loss": 0.1078, "step": 2424 }, { "epoch": 2.8109831196116786, "grad_norm": 0.25657373666763306, "learning_rate": 3.6063218390804596e-05, "loss": 0.1084, "step": 2425 }, { "epoch": 2.8121422879084257, "grad_norm": 0.3210379183292389, "learning_rate": 3.605747126436782e-05, "loss": 0.1272, "step": 2426 }, { "epoch": 2.813301456205173, "grad_norm": 0.23152370750904083, "learning_rate": 3.605172413793104e-05, "loss": 0.1124, "step": 2427 }, { "epoch": 2.81446062450192, "grad_norm": 0.24926093220710754, "learning_rate": 3.6045977011494255e-05, "loss": 0.1181, "step": 2428 }, { "epoch": 2.815619792798667, "grad_norm": 0.25474682450294495, "learning_rate": 3.6040229885057476e-05, "loss": 0.1287, "step": 2429 }, { "epoch": 2.816778961095414, "grad_norm": 0.27318352460861206, "learning_rate": 3.603448275862069e-05, "loss": 0.1203, "step": 2430 }, { "epoch": 2.8179381293921613, "grad_norm": 0.2800019383430481, "learning_rate": 3.6028735632183906e-05, "loss": 0.1196, "step": 2431 }, { "epoch": 2.8190972976889084, "grad_norm": 0.20935480296611786, "learning_rate": 3.602298850574713e-05, "loss": 0.1105, "step": 2432 }, { "epoch": 2.820256465985655, "grad_norm": 0.24666768312454224, "learning_rate": 3.601724137931034e-05, "loss": 0.1182, "step": 2433 }, { "epoch": 2.8214156342824026, "grad_norm": 0.2466108649969101, "learning_rate": 3.6011494252873564e-05, "loss": 0.1183, "step": 2434 }, { "epoch": 2.8225748025791493, "grad_norm": 0.2528558671474457, "learning_rate": 3.6005747126436786e-05, "loss": 0.1188, "step": 2435 }, { "epoch": 2.8237339708758964, "grad_norm": 0.29567837715148926, "learning_rate": 3.6e-05, "loss": 0.1443, "step": 2436 }, { "epoch": 2.8248931391726435, "grad_norm": 0.31635522842407227, "learning_rate": 3.599425287356322e-05, "loss": 0.107, "step": 2437 }, { "epoch": 2.8260523074693906, "grad_norm": 0.23318928480148315, "learning_rate": 3.598850574712644e-05, "loss": 0.1231, "step": 2438 }, { "epoch": 2.8272114757661377, "grad_norm": 0.29607391357421875, "learning_rate": 3.598275862068965e-05, "loss": 0.1257, "step": 2439 }, { "epoch": 2.828370644062885, "grad_norm": 0.2570621371269226, "learning_rate": 3.5977011494252874e-05, "loss": 0.1128, "step": 2440 }, { "epoch": 2.829529812359632, "grad_norm": 0.24534910917282104, "learning_rate": 3.5971264367816096e-05, "loss": 0.111, "step": 2441 }, { "epoch": 2.830688980656379, "grad_norm": 0.26997894048690796, "learning_rate": 3.596551724137931e-05, "loss": 0.1114, "step": 2442 }, { "epoch": 2.831848148953126, "grad_norm": 0.29635146260261536, "learning_rate": 3.595977011494253e-05, "loss": 0.1228, "step": 2443 }, { "epoch": 2.8330073172498733, "grad_norm": 0.2879303991794586, "learning_rate": 3.595402298850575e-05, "loss": 0.1248, "step": 2444 }, { "epoch": 2.8341664855466204, "grad_norm": 0.2170044481754303, "learning_rate": 3.594827586206897e-05, "loss": 0.11, "step": 2445 }, { "epoch": 2.8353256538433675, "grad_norm": 0.2590021789073944, "learning_rate": 3.5942528735632184e-05, "loss": 0.1257, "step": 2446 }, { "epoch": 2.8364848221401147, "grad_norm": 0.25677573680877686, "learning_rate": 3.5936781609195405e-05, "loss": 0.1266, "step": 2447 }, { "epoch": 2.8376439904368613, "grad_norm": 0.2618117928504944, "learning_rate": 3.593103448275863e-05, "loss": 0.1143, "step": 2448 }, { "epoch": 2.838803158733609, "grad_norm": 0.23307304084300995, "learning_rate": 3.592528735632184e-05, "loss": 0.1126, "step": 2449 }, { "epoch": 2.8399623270303556, "grad_norm": 0.2070130705833435, "learning_rate": 3.591954022988506e-05, "loss": 0.1141, "step": 2450 }, { "epoch": 2.8411214953271027, "grad_norm": 0.25860536098480225, "learning_rate": 3.591379310344828e-05, "loss": 0.1102, "step": 2451 }, { "epoch": 2.84228066362385, "grad_norm": 0.22545821964740753, "learning_rate": 3.5908045977011494e-05, "loss": 0.1037, "step": 2452 }, { "epoch": 2.843439831920597, "grad_norm": 0.27270445227622986, "learning_rate": 3.5902298850574715e-05, "loss": 0.1171, "step": 2453 }, { "epoch": 2.844599000217344, "grad_norm": 0.23583996295928955, "learning_rate": 3.589655172413793e-05, "loss": 0.099, "step": 2454 }, { "epoch": 2.845758168514091, "grad_norm": 0.27526620030403137, "learning_rate": 3.589080459770115e-05, "loss": 0.1124, "step": 2455 }, { "epoch": 2.8469173368108383, "grad_norm": 0.23859523236751556, "learning_rate": 3.5885057471264373e-05, "loss": 0.1142, "step": 2456 }, { "epoch": 2.8480765051075854, "grad_norm": 0.3173846900463104, "learning_rate": 3.587931034482759e-05, "loss": 0.1249, "step": 2457 }, { "epoch": 2.8492356734043325, "grad_norm": 0.2413249909877777, "learning_rate": 3.58735632183908e-05, "loss": 0.1165, "step": 2458 }, { "epoch": 2.8503948417010796, "grad_norm": 0.2488858848810196, "learning_rate": 3.5867816091954025e-05, "loss": 0.1176, "step": 2459 }, { "epoch": 2.8515540099978267, "grad_norm": 0.272587388753891, "learning_rate": 3.586206896551724e-05, "loss": 0.1212, "step": 2460 }, { "epoch": 2.8527131782945734, "grad_norm": 0.29471564292907715, "learning_rate": 3.585632183908046e-05, "loss": 0.1298, "step": 2461 }, { "epoch": 2.853872346591321, "grad_norm": 0.25901687145233154, "learning_rate": 3.585057471264368e-05, "loss": 0.1231, "step": 2462 }, { "epoch": 2.8550315148880676, "grad_norm": 0.2517445981502533, "learning_rate": 3.58448275862069e-05, "loss": 0.1169, "step": 2463 }, { "epoch": 2.856190683184815, "grad_norm": 0.23075416684150696, "learning_rate": 3.583908045977012e-05, "loss": 0.1025, "step": 2464 }, { "epoch": 2.857349851481562, "grad_norm": 0.2388884723186493, "learning_rate": 3.5833333333333335e-05, "loss": 0.1107, "step": 2465 }, { "epoch": 2.858509019778309, "grad_norm": 0.3148244619369507, "learning_rate": 3.582758620689655e-05, "loss": 0.1179, "step": 2466 }, { "epoch": 2.859668188075056, "grad_norm": 0.28545230627059937, "learning_rate": 3.582183908045977e-05, "loss": 0.1237, "step": 2467 }, { "epoch": 2.860827356371803, "grad_norm": 0.2898615002632141, "learning_rate": 3.581609195402299e-05, "loss": 0.1331, "step": 2468 }, { "epoch": 2.8619865246685503, "grad_norm": 0.2582353353500366, "learning_rate": 3.581034482758621e-05, "loss": 0.1204, "step": 2469 }, { "epoch": 2.8631456929652974, "grad_norm": 0.2813541293144226, "learning_rate": 3.580459770114943e-05, "loss": 0.1232, "step": 2470 }, { "epoch": 2.8643048612620445, "grad_norm": 0.2520640194416046, "learning_rate": 3.5798850574712644e-05, "loss": 0.1137, "step": 2471 }, { "epoch": 2.8654640295587916, "grad_norm": 0.29542282223701477, "learning_rate": 3.5793103448275866e-05, "loss": 0.121, "step": 2472 }, { "epoch": 2.8666231978555388, "grad_norm": 0.21729327738285065, "learning_rate": 3.578735632183908e-05, "loss": 0.1108, "step": 2473 }, { "epoch": 2.867782366152286, "grad_norm": 0.285705029964447, "learning_rate": 3.5781609195402296e-05, "loss": 0.1234, "step": 2474 }, { "epoch": 2.868941534449033, "grad_norm": 0.2638833224773407, "learning_rate": 3.5775862068965524e-05, "loss": 0.1361, "step": 2475 }, { "epoch": 2.8701007027457797, "grad_norm": 0.25807875394821167, "learning_rate": 3.577011494252874e-05, "loss": 0.1067, "step": 2476 }, { "epoch": 2.8712598710425272, "grad_norm": 0.20497660338878632, "learning_rate": 3.5764367816091954e-05, "loss": 0.1072, "step": 2477 }, { "epoch": 2.872419039339274, "grad_norm": 0.23752939701080322, "learning_rate": 3.5758620689655176e-05, "loss": 0.1231, "step": 2478 }, { "epoch": 2.873578207636021, "grad_norm": 0.2198176383972168, "learning_rate": 3.575287356321839e-05, "loss": 0.1212, "step": 2479 }, { "epoch": 2.874737375932768, "grad_norm": 0.28689807653427124, "learning_rate": 3.5747126436781606e-05, "loss": 0.1292, "step": 2480 }, { "epoch": 2.8758965442295152, "grad_norm": 0.21287685632705688, "learning_rate": 3.574137931034483e-05, "loss": 0.106, "step": 2481 }, { "epoch": 2.8770557125262624, "grad_norm": 0.3098215162754059, "learning_rate": 3.573563218390805e-05, "loss": 0.1212, "step": 2482 }, { "epoch": 2.8782148808230095, "grad_norm": 0.1953476518392563, "learning_rate": 3.572988505747127e-05, "loss": 0.1001, "step": 2483 }, { "epoch": 2.8793740491197566, "grad_norm": 0.22873975336551666, "learning_rate": 3.5724137931034486e-05, "loss": 0.1115, "step": 2484 }, { "epoch": 2.8805332174165037, "grad_norm": 0.21951954066753387, "learning_rate": 3.57183908045977e-05, "loss": 0.1197, "step": 2485 }, { "epoch": 2.881692385713251, "grad_norm": 0.25903722643852234, "learning_rate": 3.571264367816092e-05, "loss": 0.1082, "step": 2486 }, { "epoch": 2.882851554009998, "grad_norm": 0.2247413992881775, "learning_rate": 3.570689655172414e-05, "loss": 0.1138, "step": 2487 }, { "epoch": 2.884010722306745, "grad_norm": 0.2930486798286438, "learning_rate": 3.570114942528736e-05, "loss": 0.1327, "step": 2488 }, { "epoch": 2.885169890603492, "grad_norm": 0.32218900322914124, "learning_rate": 3.569540229885058e-05, "loss": 0.1261, "step": 2489 }, { "epoch": 2.8863290589002393, "grad_norm": 0.2783204913139343, "learning_rate": 3.5689655172413795e-05, "loss": 0.1132, "step": 2490 }, { "epoch": 2.887488227196986, "grad_norm": 0.214439257979393, "learning_rate": 3.568390804597702e-05, "loss": 0.1179, "step": 2491 }, { "epoch": 2.8886473954937335, "grad_norm": 0.25692418217658997, "learning_rate": 3.567816091954023e-05, "loss": 0.1147, "step": 2492 }, { "epoch": 2.88980656379048, "grad_norm": 0.27473074197769165, "learning_rate": 3.567241379310345e-05, "loss": 0.1351, "step": 2493 }, { "epoch": 2.8909657320872273, "grad_norm": 0.27782875299453735, "learning_rate": 3.566666666666667e-05, "loss": 0.1246, "step": 2494 }, { "epoch": 2.8921249003839744, "grad_norm": 0.22847121953964233, "learning_rate": 3.5660919540229883e-05, "loss": 0.1118, "step": 2495 }, { "epoch": 2.8932840686807215, "grad_norm": 0.22717076539993286, "learning_rate": 3.5655172413793105e-05, "loss": 0.1233, "step": 2496 }, { "epoch": 2.8944432369774686, "grad_norm": 0.2825745642185211, "learning_rate": 3.564942528735633e-05, "loss": 0.1282, "step": 2497 }, { "epoch": 2.8956024052742158, "grad_norm": 0.25072160363197327, "learning_rate": 3.564367816091954e-05, "loss": 0.1239, "step": 2498 }, { "epoch": 2.896761573570963, "grad_norm": 0.2573794424533844, "learning_rate": 3.5637931034482757e-05, "loss": 0.1083, "step": 2499 }, { "epoch": 2.89792074186771, "grad_norm": 0.29869526624679565, "learning_rate": 3.563218390804598e-05, "loss": 0.132, "step": 2500 }, { "epoch": 2.899079910164457, "grad_norm": 0.24335411190986633, "learning_rate": 3.562643678160919e-05, "loss": 0.1187, "step": 2501 }, { "epoch": 2.900239078461204, "grad_norm": 0.25249624252319336, "learning_rate": 3.5620689655172415e-05, "loss": 0.1157, "step": 2502 }, { "epoch": 2.9013982467579513, "grad_norm": 0.2802266776561737, "learning_rate": 3.5614942528735637e-05, "loss": 0.1267, "step": 2503 }, { "epoch": 2.902557415054698, "grad_norm": 0.3055930435657501, "learning_rate": 3.560919540229885e-05, "loss": 0.1287, "step": 2504 }, { "epoch": 2.9037165833514456, "grad_norm": 0.2905328571796417, "learning_rate": 3.560344827586207e-05, "loss": 0.1317, "step": 2505 }, { "epoch": 2.9048757516481922, "grad_norm": 0.21209411323070526, "learning_rate": 3.559770114942529e-05, "loss": 0.1131, "step": 2506 }, { "epoch": 2.9060349199449393, "grad_norm": 0.2763073444366455, "learning_rate": 3.55919540229885e-05, "loss": 0.1205, "step": 2507 }, { "epoch": 2.9071940882416865, "grad_norm": 0.23870214819908142, "learning_rate": 3.5586206896551725e-05, "loss": 0.1237, "step": 2508 }, { "epoch": 2.9083532565384336, "grad_norm": 0.2398124486207962, "learning_rate": 3.5580459770114946e-05, "loss": 0.1161, "step": 2509 }, { "epoch": 2.9095124248351807, "grad_norm": 0.23274962604045868, "learning_rate": 3.557471264367817e-05, "loss": 0.1094, "step": 2510 }, { "epoch": 2.910671593131928, "grad_norm": 0.2622717022895813, "learning_rate": 3.556896551724138e-05, "loss": 0.1189, "step": 2511 }, { "epoch": 2.911830761428675, "grad_norm": 0.335947185754776, "learning_rate": 3.55632183908046e-05, "loss": 0.1237, "step": 2512 }, { "epoch": 2.912989929725422, "grad_norm": 0.2339511215686798, "learning_rate": 3.555747126436782e-05, "loss": 0.1195, "step": 2513 }, { "epoch": 2.914149098022169, "grad_norm": 0.2983540892601013, "learning_rate": 3.5551724137931034e-05, "loss": 0.1184, "step": 2514 }, { "epoch": 2.9153082663189163, "grad_norm": 0.29137369990348816, "learning_rate": 3.554597701149425e-05, "loss": 0.1074, "step": 2515 }, { "epoch": 2.9164674346156634, "grad_norm": 0.3541145622730255, "learning_rate": 3.554022988505748e-05, "loss": 0.1187, "step": 2516 }, { "epoch": 2.9176266029124105, "grad_norm": 0.2662143111228943, "learning_rate": 3.553448275862069e-05, "loss": 0.1227, "step": 2517 }, { "epoch": 2.9187857712091576, "grad_norm": 0.2711283266544342, "learning_rate": 3.552873563218391e-05, "loss": 0.1304, "step": 2518 }, { "epoch": 2.9199449395059043, "grad_norm": 0.2492939978837967, "learning_rate": 3.552298850574713e-05, "loss": 0.1191, "step": 2519 }, { "epoch": 2.921104107802652, "grad_norm": 0.25983232259750366, "learning_rate": 3.5517241379310344e-05, "loss": 0.1129, "step": 2520 }, { "epoch": 2.9222632760993985, "grad_norm": 0.24803534150123596, "learning_rate": 3.5511494252873566e-05, "loss": 0.116, "step": 2521 }, { "epoch": 2.9234224443961456, "grad_norm": 0.2651211619377136, "learning_rate": 3.550574712643678e-05, "loss": 0.1361, "step": 2522 }, { "epoch": 2.9245816126928927, "grad_norm": 0.2923348844051361, "learning_rate": 3.55e-05, "loss": 0.135, "step": 2523 }, { "epoch": 2.92574078098964, "grad_norm": 0.26953768730163574, "learning_rate": 3.5494252873563224e-05, "loss": 0.1183, "step": 2524 }, { "epoch": 2.926899949286387, "grad_norm": 0.235351100564003, "learning_rate": 3.548850574712644e-05, "loss": 0.1216, "step": 2525 }, { "epoch": 2.928059117583134, "grad_norm": 0.2557763457298279, "learning_rate": 3.5482758620689654e-05, "loss": 0.1138, "step": 2526 }, { "epoch": 2.929218285879881, "grad_norm": 0.2516639232635498, "learning_rate": 3.5477011494252875e-05, "loss": 0.144, "step": 2527 }, { "epoch": 2.9303774541766283, "grad_norm": 0.19669529795646667, "learning_rate": 3.547126436781609e-05, "loss": 0.1112, "step": 2528 }, { "epoch": 2.9315366224733754, "grad_norm": 0.21730567514896393, "learning_rate": 3.546551724137931e-05, "loss": 0.1113, "step": 2529 }, { "epoch": 2.9326957907701225, "grad_norm": 0.20566688477993011, "learning_rate": 3.5459770114942534e-05, "loss": 0.1146, "step": 2530 }, { "epoch": 2.9338549590668697, "grad_norm": 0.26135411858558655, "learning_rate": 3.545402298850575e-05, "loss": 0.1083, "step": 2531 }, { "epoch": 2.9350141273636163, "grad_norm": 0.23038803040981293, "learning_rate": 3.544827586206897e-05, "loss": 0.1212, "step": 2532 }, { "epoch": 2.936173295660364, "grad_norm": 0.2405983805656433, "learning_rate": 3.5442528735632185e-05, "loss": 0.1085, "step": 2533 }, { "epoch": 2.9373324639571106, "grad_norm": 0.2417573630809784, "learning_rate": 3.54367816091954e-05, "loss": 0.1173, "step": 2534 }, { "epoch": 2.938491632253858, "grad_norm": 0.21391847729682922, "learning_rate": 3.543103448275862e-05, "loss": 0.1161, "step": 2535 }, { "epoch": 2.939650800550605, "grad_norm": 0.2825719714164734, "learning_rate": 3.5425287356321843e-05, "loss": 0.1332, "step": 2536 }, { "epoch": 2.940809968847352, "grad_norm": 0.23542939126491547, "learning_rate": 3.541954022988506e-05, "loss": 0.1184, "step": 2537 }, { "epoch": 2.941969137144099, "grad_norm": 0.2510528862476349, "learning_rate": 3.541379310344828e-05, "loss": 0.1238, "step": 2538 }, { "epoch": 2.943128305440846, "grad_norm": 0.2628154456615448, "learning_rate": 3.5408045977011495e-05, "loss": 0.1224, "step": 2539 }, { "epoch": 2.9442874737375933, "grad_norm": 0.18918563425540924, "learning_rate": 3.540229885057472e-05, "loss": 0.1026, "step": 2540 }, { "epoch": 2.9454466420343404, "grad_norm": 0.299092561006546, "learning_rate": 3.539655172413793e-05, "loss": 0.1332, "step": 2541 }, { "epoch": 2.9466058103310875, "grad_norm": 0.22174027562141418, "learning_rate": 3.5390804597701146e-05, "loss": 0.1071, "step": 2542 }, { "epoch": 2.9477649786278346, "grad_norm": 0.30895039439201355, "learning_rate": 3.538505747126437e-05, "loss": 0.1243, "step": 2543 }, { "epoch": 2.9489241469245817, "grad_norm": 0.30053114891052246, "learning_rate": 3.537931034482759e-05, "loss": 0.1197, "step": 2544 }, { "epoch": 2.950083315221329, "grad_norm": 0.276027113199234, "learning_rate": 3.5373563218390805e-05, "loss": 0.1166, "step": 2545 }, { "epoch": 2.951242483518076, "grad_norm": 0.2735234797000885, "learning_rate": 3.5367816091954026e-05, "loss": 0.1244, "step": 2546 }, { "epoch": 2.9524016518148226, "grad_norm": 0.3124696612358093, "learning_rate": 3.536206896551724e-05, "loss": 0.1115, "step": 2547 }, { "epoch": 2.95356082011157, "grad_norm": 0.21694187819957733, "learning_rate": 3.535632183908046e-05, "loss": 0.1146, "step": 2548 }, { "epoch": 2.954719988408317, "grad_norm": 0.28036484122276306, "learning_rate": 3.535057471264368e-05, "loss": 0.1027, "step": 2549 }, { "epoch": 2.955879156705064, "grad_norm": 0.31232985854148865, "learning_rate": 3.53448275862069e-05, "loss": 0.1165, "step": 2550 }, { "epoch": 2.957038325001811, "grad_norm": 0.28552737832069397, "learning_rate": 3.533908045977012e-05, "loss": 0.1225, "step": 2551 }, { "epoch": 2.958197493298558, "grad_norm": 0.27900782227516174, "learning_rate": 3.5333333333333336e-05, "loss": 0.1191, "step": 2552 }, { "epoch": 2.9593566615953053, "grad_norm": 0.25439637899398804, "learning_rate": 3.532758620689655e-05, "loss": 0.1324, "step": 2553 }, { "epoch": 2.9605158298920524, "grad_norm": 0.24893079698085785, "learning_rate": 3.532183908045977e-05, "loss": 0.1161, "step": 2554 }, { "epoch": 2.9616749981887995, "grad_norm": 0.2921384274959564, "learning_rate": 3.531609195402299e-05, "loss": 0.1209, "step": 2555 }, { "epoch": 2.9628341664855466, "grad_norm": 0.21801045536994934, "learning_rate": 3.53103448275862e-05, "loss": 0.118, "step": 2556 }, { "epoch": 2.9639933347822938, "grad_norm": 0.26922228932380676, "learning_rate": 3.530459770114943e-05, "loss": 0.1315, "step": 2557 }, { "epoch": 2.965152503079041, "grad_norm": 0.2805965840816498, "learning_rate": 3.5298850574712646e-05, "loss": 0.1291, "step": 2558 }, { "epoch": 2.966311671375788, "grad_norm": 0.3655463457107544, "learning_rate": 3.529310344827587e-05, "loss": 0.1303, "step": 2559 }, { "epoch": 2.967470839672535, "grad_norm": 0.2937909960746765, "learning_rate": 3.528735632183908e-05, "loss": 0.1259, "step": 2560 }, { "epoch": 2.9686300079692822, "grad_norm": 0.3068729043006897, "learning_rate": 3.52816091954023e-05, "loss": 0.1348, "step": 2561 }, { "epoch": 2.969789176266029, "grad_norm": 0.2570815980434418, "learning_rate": 3.527586206896552e-05, "loss": 0.1261, "step": 2562 }, { "epoch": 2.9709483445627765, "grad_norm": 0.22781704366207123, "learning_rate": 3.5270114942528734e-05, "loss": 0.1177, "step": 2563 }, { "epoch": 2.972107512859523, "grad_norm": 0.2892128527164459, "learning_rate": 3.5264367816091956e-05, "loss": 0.1181, "step": 2564 }, { "epoch": 2.9732666811562702, "grad_norm": 0.24574293196201324, "learning_rate": 3.525862068965518e-05, "loss": 0.1309, "step": 2565 }, { "epoch": 2.9744258494530174, "grad_norm": 0.2564846873283386, "learning_rate": 3.525287356321839e-05, "loss": 0.1201, "step": 2566 }, { "epoch": 2.9755850177497645, "grad_norm": 0.3438556492328644, "learning_rate": 3.5247126436781614e-05, "loss": 0.1292, "step": 2567 }, { "epoch": 2.9767441860465116, "grad_norm": 0.25281572341918945, "learning_rate": 3.524137931034483e-05, "loss": 0.1202, "step": 2568 }, { "epoch": 2.9779033543432587, "grad_norm": 0.20820648968219757, "learning_rate": 3.5235632183908044e-05, "loss": 0.1221, "step": 2569 }, { "epoch": 2.979062522640006, "grad_norm": 0.23150451481342316, "learning_rate": 3.5229885057471265e-05, "loss": 0.1269, "step": 2570 }, { "epoch": 2.980221690936753, "grad_norm": 0.25065869092941284, "learning_rate": 3.522413793103449e-05, "loss": 0.1268, "step": 2571 }, { "epoch": 2.9813808592335, "grad_norm": 0.25164109468460083, "learning_rate": 3.52183908045977e-05, "loss": 0.1156, "step": 2572 }, { "epoch": 2.982540027530247, "grad_norm": 0.2479361742734909, "learning_rate": 3.5212643678160924e-05, "loss": 0.1204, "step": 2573 }, { "epoch": 2.9836991958269943, "grad_norm": 0.21308045089244843, "learning_rate": 3.520689655172414e-05, "loss": 0.1161, "step": 2574 }, { "epoch": 2.984858364123741, "grad_norm": 0.2212274670600891, "learning_rate": 3.5201149425287353e-05, "loss": 0.1138, "step": 2575 }, { "epoch": 2.9860175324204885, "grad_norm": 0.26298844814300537, "learning_rate": 3.5195402298850575e-05, "loss": 0.1218, "step": 2576 }, { "epoch": 2.987176700717235, "grad_norm": 0.20742398500442505, "learning_rate": 3.51896551724138e-05, "loss": 0.1156, "step": 2577 }, { "epoch": 2.9883358690139827, "grad_norm": 0.23608221113681793, "learning_rate": 3.518390804597702e-05, "loss": 0.1058, "step": 2578 }, { "epoch": 2.9894950373107294, "grad_norm": 0.25990620255470276, "learning_rate": 3.517816091954023e-05, "loss": 0.1249, "step": 2579 }, { "epoch": 2.9906542056074765, "grad_norm": 0.2265261709690094, "learning_rate": 3.517241379310345e-05, "loss": 0.1177, "step": 2580 }, { "epoch": 2.9918133739042236, "grad_norm": 0.21493545174598694, "learning_rate": 3.516666666666667e-05, "loss": 0.1042, "step": 2581 }, { "epoch": 2.9929725422009708, "grad_norm": 0.2955082058906555, "learning_rate": 3.5160919540229885e-05, "loss": 0.1213, "step": 2582 }, { "epoch": 2.994131710497718, "grad_norm": 0.25260546803474426, "learning_rate": 3.51551724137931e-05, "loss": 0.1231, "step": 2583 }, { "epoch": 2.995290878794465, "grad_norm": 0.22875334322452545, "learning_rate": 3.514942528735632e-05, "loss": 0.1313, "step": 2584 }, { "epoch": 2.996450047091212, "grad_norm": 0.24003781378269196, "learning_rate": 3.514367816091954e-05, "loss": 0.1311, "step": 2585 }, { "epoch": 2.997609215387959, "grad_norm": 0.23002997040748596, "learning_rate": 3.5137931034482765e-05, "loss": 0.1153, "step": 2586 }, { "epoch": 2.9987683836847063, "grad_norm": 0.28721874952316284, "learning_rate": 3.513218390804598e-05, "loss": 0.1178, "step": 2587 }, { "epoch": 2.9999275519814534, "grad_norm": 0.2430833876132965, "learning_rate": 3.5126436781609195e-05, "loss": 0.1155, "step": 2588 }, { "epoch": 2.9999275519814534, "eval_loss": 0.13075338304042816, "eval_runtime": 279.8691, "eval_samples_per_second": 5.481, "eval_steps_per_second": 5.481, "step": 2588 }, { "epoch": 3.0010867202782006, "grad_norm": 0.23414799571037292, "learning_rate": 3.5120689655172416e-05, "loss": 0.1143, "step": 2589 }, { "epoch": 3.0022458885749477, "grad_norm": 0.25894778966903687, "learning_rate": 3.511494252873563e-05, "loss": 0.1089, "step": 2590 }, { "epoch": 3.0034050568716943, "grad_norm": 0.23415379226207733, "learning_rate": 3.510919540229885e-05, "loss": 0.1097, "step": 2591 }, { "epoch": 3.0045642251684415, "grad_norm": 0.21930260956287384, "learning_rate": 3.5103448275862074e-05, "loss": 0.1111, "step": 2592 }, { "epoch": 3.0057233934651886, "grad_norm": 0.21448297798633575, "learning_rate": 3.509770114942529e-05, "loss": 0.1116, "step": 2593 }, { "epoch": 3.0068825617619357, "grad_norm": 0.23125120997428894, "learning_rate": 3.509195402298851e-05, "loss": 0.116, "step": 2594 }, { "epoch": 3.008041730058683, "grad_norm": 0.25538870692253113, "learning_rate": 3.5086206896551726e-05, "loss": 0.1146, "step": 2595 }, { "epoch": 3.00920089835543, "grad_norm": 0.2790210247039795, "learning_rate": 3.508045977011494e-05, "loss": 0.1138, "step": 2596 }, { "epoch": 3.010360066652177, "grad_norm": 0.22470445930957794, "learning_rate": 3.507471264367816e-05, "loss": 0.119, "step": 2597 }, { "epoch": 3.011519234948924, "grad_norm": 0.2440544217824936, "learning_rate": 3.5068965517241384e-05, "loss": 0.1027, "step": 2598 }, { "epoch": 3.0126784032456713, "grad_norm": 0.3405747413635254, "learning_rate": 3.50632183908046e-05, "loss": 0.113, "step": 2599 }, { "epoch": 3.0138375715424184, "grad_norm": 0.22075963020324707, "learning_rate": 3.505747126436782e-05, "loss": 0.1043, "step": 2600 }, { "epoch": 3.0149967398391655, "grad_norm": 0.28365811705589294, "learning_rate": 3.5051724137931036e-05, "loss": 0.1105, "step": 2601 }, { "epoch": 3.0161559081359126, "grad_norm": 0.38459697365760803, "learning_rate": 3.504597701149425e-05, "loss": 0.1043, "step": 2602 }, { "epoch": 3.0173150764326597, "grad_norm": 0.3330821394920349, "learning_rate": 3.504022988505747e-05, "loss": 0.1052, "step": 2603 }, { "epoch": 3.018474244729407, "grad_norm": 0.25718963146209717, "learning_rate": 3.503448275862069e-05, "loss": 0.1091, "step": 2604 }, { "epoch": 3.0196334130261535, "grad_norm": 0.2592739462852478, "learning_rate": 3.502873563218391e-05, "loss": 0.0919, "step": 2605 }, { "epoch": 3.0207925813229006, "grad_norm": 0.24682362377643585, "learning_rate": 3.502298850574713e-05, "loss": 0.1094, "step": 2606 }, { "epoch": 3.0219517496196477, "grad_norm": 0.4260392487049103, "learning_rate": 3.5017241379310345e-05, "loss": 0.1122, "step": 2607 }, { "epoch": 3.023110917916395, "grad_norm": 0.35808852314949036, "learning_rate": 3.501149425287357e-05, "loss": 0.1215, "step": 2608 }, { "epoch": 3.024270086213142, "grad_norm": 0.31842851638793945, "learning_rate": 3.500574712643678e-05, "loss": 0.1083, "step": 2609 }, { "epoch": 3.025429254509889, "grad_norm": 0.29208144545555115, "learning_rate": 3.5e-05, "loss": 0.1146, "step": 2610 }, { "epoch": 3.026588422806636, "grad_norm": 0.2514886260032654, "learning_rate": 3.499425287356322e-05, "loss": 0.0972, "step": 2611 }, { "epoch": 3.0277475911033833, "grad_norm": 0.3120005428791046, "learning_rate": 3.498850574712644e-05, "loss": 0.1072, "step": 2612 }, { "epoch": 3.0289067594001304, "grad_norm": 0.30786648392677307, "learning_rate": 3.498275862068966e-05, "loss": 0.1142, "step": 2613 }, { "epoch": 3.0300659276968775, "grad_norm": 0.3202109634876251, "learning_rate": 3.497701149425288e-05, "loss": 0.1153, "step": 2614 }, { "epoch": 3.0312250959936247, "grad_norm": 0.3748902976512909, "learning_rate": 3.497126436781609e-05, "loss": 0.1064, "step": 2615 }, { "epoch": 3.032384264290372, "grad_norm": 0.3179676830768585, "learning_rate": 3.4965517241379313e-05, "loss": 0.1193, "step": 2616 }, { "epoch": 3.033543432587119, "grad_norm": 0.31394878029823303, "learning_rate": 3.495977011494253e-05, "loss": 0.1158, "step": 2617 }, { "epoch": 3.034702600883866, "grad_norm": 0.2936527132987976, "learning_rate": 3.495402298850575e-05, "loss": 0.1094, "step": 2618 }, { "epoch": 3.035861769180613, "grad_norm": 0.252445787191391, "learning_rate": 3.494827586206897e-05, "loss": 0.1038, "step": 2619 }, { "epoch": 3.03702093747736, "grad_norm": 0.29907506704330444, "learning_rate": 3.4942528735632187e-05, "loss": 0.112, "step": 2620 }, { "epoch": 3.038180105774107, "grad_norm": 0.24381442368030548, "learning_rate": 3.49367816091954e-05, "loss": 0.1026, "step": 2621 }, { "epoch": 3.039339274070854, "grad_norm": 0.3066432476043701, "learning_rate": 3.493103448275862e-05, "loss": 0.1018, "step": 2622 }, { "epoch": 3.040498442367601, "grad_norm": 0.2872762680053711, "learning_rate": 3.492528735632184e-05, "loss": 0.1105, "step": 2623 }, { "epoch": 3.0416576106643483, "grad_norm": 0.2535305917263031, "learning_rate": 3.491954022988506e-05, "loss": 0.1013, "step": 2624 }, { "epoch": 3.0428167789610954, "grad_norm": 0.22548238933086395, "learning_rate": 3.4913793103448275e-05, "loss": 0.1034, "step": 2625 }, { "epoch": 3.0439759472578425, "grad_norm": 0.3591626286506653, "learning_rate": 3.4908045977011496e-05, "loss": 0.1124, "step": 2626 }, { "epoch": 3.0451351155545896, "grad_norm": 0.3023275136947632, "learning_rate": 3.490229885057472e-05, "loss": 0.1029, "step": 2627 }, { "epoch": 3.0462942838513367, "grad_norm": 0.25562551617622375, "learning_rate": 3.489655172413793e-05, "loss": 0.1056, "step": 2628 }, { "epoch": 3.047453452148084, "grad_norm": 0.32490625977516174, "learning_rate": 3.489080459770115e-05, "loss": 0.1061, "step": 2629 }, { "epoch": 3.048612620444831, "grad_norm": 0.30418357253074646, "learning_rate": 3.488505747126437e-05, "loss": 0.1139, "step": 2630 }, { "epoch": 3.049771788741578, "grad_norm": 0.3882814645767212, "learning_rate": 3.4879310344827584e-05, "loss": 0.1037, "step": 2631 }, { "epoch": 3.050930957038325, "grad_norm": 0.4713079631328583, "learning_rate": 3.4873563218390806e-05, "loss": 0.1157, "step": 2632 }, { "epoch": 3.0520901253350723, "grad_norm": 0.32217907905578613, "learning_rate": 3.486781609195403e-05, "loss": 0.1135, "step": 2633 }, { "epoch": 3.053249293631819, "grad_norm": 0.24278047680854797, "learning_rate": 3.486206896551724e-05, "loss": 0.1121, "step": 2634 }, { "epoch": 3.054408461928566, "grad_norm": 0.32192277908325195, "learning_rate": 3.4856321839080464e-05, "loss": 0.1019, "step": 2635 }, { "epoch": 3.055567630225313, "grad_norm": 0.317410945892334, "learning_rate": 3.485057471264368e-05, "loss": 0.1081, "step": 2636 }, { "epoch": 3.0567267985220603, "grad_norm": 0.30316248536109924, "learning_rate": 3.4844827586206894e-05, "loss": 0.0999, "step": 2637 }, { "epoch": 3.0578859668188074, "grad_norm": 0.4302930235862732, "learning_rate": 3.4839080459770116e-05, "loss": 0.113, "step": 2638 }, { "epoch": 3.0590451351155545, "grad_norm": 0.29626715183258057, "learning_rate": 3.483333333333334e-05, "loss": 0.1057, "step": 2639 }, { "epoch": 3.0602043034123017, "grad_norm": 0.38166263699531555, "learning_rate": 3.482758620689655e-05, "loss": 0.1216, "step": 2640 }, { "epoch": 3.0613634717090488, "grad_norm": 0.3923417031764984, "learning_rate": 3.4821839080459774e-05, "loss": 0.1105, "step": 2641 }, { "epoch": 3.062522640005796, "grad_norm": 0.27287542819976807, "learning_rate": 3.481609195402299e-05, "loss": 0.1096, "step": 2642 }, { "epoch": 3.063681808302543, "grad_norm": 0.2712195813655853, "learning_rate": 3.481034482758621e-05, "loss": 0.1141, "step": 2643 }, { "epoch": 3.06484097659929, "grad_norm": 0.2444847673177719, "learning_rate": 3.4804597701149426e-05, "loss": 0.1045, "step": 2644 }, { "epoch": 3.0660001448960372, "grad_norm": 0.26770317554473877, "learning_rate": 3.479885057471264e-05, "loss": 0.1044, "step": 2645 }, { "epoch": 3.0671593131927843, "grad_norm": 0.27558213472366333, "learning_rate": 3.479310344827587e-05, "loss": 0.1089, "step": 2646 }, { "epoch": 3.0683184814895315, "grad_norm": 0.26137158274650574, "learning_rate": 3.4787356321839084e-05, "loss": 0.1059, "step": 2647 }, { "epoch": 3.069477649786278, "grad_norm": 0.3349991738796234, "learning_rate": 3.47816091954023e-05, "loss": 0.1185, "step": 2648 }, { "epoch": 3.0706368180830252, "grad_norm": 0.32622119784355164, "learning_rate": 3.477586206896552e-05, "loss": 0.1101, "step": 2649 }, { "epoch": 3.0717959863797724, "grad_norm": 0.3372982442378998, "learning_rate": 3.4770114942528735e-05, "loss": 0.113, "step": 2650 }, { "epoch": 3.0729551546765195, "grad_norm": 0.31593719124794006, "learning_rate": 3.476436781609196e-05, "loss": 0.1188, "step": 2651 }, { "epoch": 3.0741143229732666, "grad_norm": 0.26254966855049133, "learning_rate": 3.475862068965517e-05, "loss": 0.1185, "step": 2652 }, { "epoch": 3.0752734912700137, "grad_norm": 0.22806406021118164, "learning_rate": 3.4752873563218394e-05, "loss": 0.0998, "step": 2653 }, { "epoch": 3.076432659566761, "grad_norm": 0.26122182607650757, "learning_rate": 3.4747126436781615e-05, "loss": 0.1111, "step": 2654 }, { "epoch": 3.077591827863508, "grad_norm": 0.2905748188495636, "learning_rate": 3.474137931034483e-05, "loss": 0.1127, "step": 2655 }, { "epoch": 3.078750996160255, "grad_norm": 0.32908645272254944, "learning_rate": 3.4735632183908045e-05, "loss": 0.1165, "step": 2656 }, { "epoch": 3.079910164457002, "grad_norm": 0.32499000430107117, "learning_rate": 3.472988505747127e-05, "loss": 0.1151, "step": 2657 }, { "epoch": 3.0810693327537493, "grad_norm": 0.30107995867729187, "learning_rate": 3.472413793103448e-05, "loss": 0.1142, "step": 2658 }, { "epoch": 3.0822285010504964, "grad_norm": 0.29092562198638916, "learning_rate": 3.47183908045977e-05, "loss": 0.1015, "step": 2659 }, { "epoch": 3.0833876693472435, "grad_norm": 0.23830758035182953, "learning_rate": 3.4712643678160925e-05, "loss": 0.1084, "step": 2660 }, { "epoch": 3.0845468376439906, "grad_norm": 0.254711776971817, "learning_rate": 3.470689655172414e-05, "loss": 0.106, "step": 2661 }, { "epoch": 3.0857060059407377, "grad_norm": 0.24054522812366486, "learning_rate": 3.470114942528736e-05, "loss": 0.0995, "step": 2662 }, { "epoch": 3.0868651742374844, "grad_norm": 0.2799350917339325, "learning_rate": 3.4695402298850576e-05, "loss": 0.1114, "step": 2663 }, { "epoch": 3.0880243425342315, "grad_norm": 0.3153596818447113, "learning_rate": 3.468965517241379e-05, "loss": 0.1107, "step": 2664 }, { "epoch": 3.0891835108309786, "grad_norm": 0.2980448603630066, "learning_rate": 3.468390804597701e-05, "loss": 0.1169, "step": 2665 }, { "epoch": 3.0903426791277258, "grad_norm": 0.26500749588012695, "learning_rate": 3.467816091954023e-05, "loss": 0.1071, "step": 2666 }, { "epoch": 3.091501847424473, "grad_norm": 0.308677077293396, "learning_rate": 3.467241379310345e-05, "loss": 0.112, "step": 2667 }, { "epoch": 3.09266101572122, "grad_norm": 0.3179532289505005, "learning_rate": 3.466666666666667e-05, "loss": 0.1219, "step": 2668 }, { "epoch": 3.093820184017967, "grad_norm": 0.28046876192092896, "learning_rate": 3.4660919540229886e-05, "loss": 0.1197, "step": 2669 }, { "epoch": 3.094979352314714, "grad_norm": 0.3597003221511841, "learning_rate": 3.465517241379311e-05, "loss": 0.0995, "step": 2670 }, { "epoch": 3.0961385206114613, "grad_norm": 0.2430323362350464, "learning_rate": 3.464942528735632e-05, "loss": 0.1078, "step": 2671 }, { "epoch": 3.0972976889082084, "grad_norm": 0.3916395902633667, "learning_rate": 3.464367816091954e-05, "loss": 0.1289, "step": 2672 }, { "epoch": 3.0984568572049556, "grad_norm": 0.28007376194000244, "learning_rate": 3.463793103448276e-05, "loss": 0.1082, "step": 2673 }, { "epoch": 3.0996160255017027, "grad_norm": 0.370697945356369, "learning_rate": 3.463218390804598e-05, "loss": 0.1157, "step": 2674 }, { "epoch": 3.10077519379845, "grad_norm": 0.27986010909080505, "learning_rate": 3.4626436781609196e-05, "loss": 0.1067, "step": 2675 }, { "epoch": 3.101934362095197, "grad_norm": 0.34155765175819397, "learning_rate": 3.462068965517242e-05, "loss": 0.1124, "step": 2676 }, { "epoch": 3.1030935303919436, "grad_norm": 0.27783599495887756, "learning_rate": 3.461494252873563e-05, "loss": 0.1084, "step": 2677 }, { "epoch": 3.1042526986886907, "grad_norm": 0.32850217819213867, "learning_rate": 3.460919540229885e-05, "loss": 0.1029, "step": 2678 }, { "epoch": 3.105411866985438, "grad_norm": 0.21651075780391693, "learning_rate": 3.460344827586207e-05, "loss": 0.0985, "step": 2679 }, { "epoch": 3.106571035282185, "grad_norm": 0.29915618896484375, "learning_rate": 3.459770114942529e-05, "loss": 0.1114, "step": 2680 }, { "epoch": 3.107730203578932, "grad_norm": 0.2699195146560669, "learning_rate": 3.459195402298851e-05, "loss": 0.108, "step": 2681 }, { "epoch": 3.108889371875679, "grad_norm": 0.33292192220687866, "learning_rate": 3.458620689655173e-05, "loss": 0.121, "step": 2682 }, { "epoch": 3.1100485401724263, "grad_norm": 0.48338431119918823, "learning_rate": 3.458045977011494e-05, "loss": 0.1148, "step": 2683 }, { "epoch": 3.1112077084691734, "grad_norm": 0.5552323460578918, "learning_rate": 3.4574712643678164e-05, "loss": 0.1119, "step": 2684 }, { "epoch": 3.1123668767659205, "grad_norm": 0.26004117727279663, "learning_rate": 3.456896551724138e-05, "loss": 0.1115, "step": 2685 }, { "epoch": 3.1135260450626676, "grad_norm": 0.35093238949775696, "learning_rate": 3.4563218390804594e-05, "loss": 0.1196, "step": 2686 }, { "epoch": 3.1146852133594147, "grad_norm": 0.26992592215538025, "learning_rate": 3.455747126436782e-05, "loss": 0.1114, "step": 2687 }, { "epoch": 3.115844381656162, "grad_norm": 0.23782843351364136, "learning_rate": 3.455172413793104e-05, "loss": 0.112, "step": 2688 }, { "epoch": 3.117003549952909, "grad_norm": 0.37826716899871826, "learning_rate": 3.454597701149426e-05, "loss": 0.1154, "step": 2689 }, { "epoch": 3.118162718249656, "grad_norm": 0.4076231122016907, "learning_rate": 3.4540229885057474e-05, "loss": 0.1181, "step": 2690 }, { "epoch": 3.1193218865464027, "grad_norm": 0.27670371532440186, "learning_rate": 3.453448275862069e-05, "loss": 0.1192, "step": 2691 }, { "epoch": 3.12048105484315, "grad_norm": 0.3018391728401184, "learning_rate": 3.452873563218391e-05, "loss": 0.112, "step": 2692 }, { "epoch": 3.121640223139897, "grad_norm": 0.3222883641719818, "learning_rate": 3.4522988505747125e-05, "loss": 0.1108, "step": 2693 }, { "epoch": 3.122799391436644, "grad_norm": 0.32334110140800476, "learning_rate": 3.451724137931035e-05, "loss": 0.1059, "step": 2694 }, { "epoch": 3.123958559733391, "grad_norm": 0.3832739293575287, "learning_rate": 3.451149425287357e-05, "loss": 0.1176, "step": 2695 }, { "epoch": 3.1251177280301383, "grad_norm": 0.30942925810813904, "learning_rate": 3.4505747126436783e-05, "loss": 0.1249, "step": 2696 }, { "epoch": 3.1262768963268854, "grad_norm": 0.25625792145729065, "learning_rate": 3.45e-05, "loss": 0.1054, "step": 2697 }, { "epoch": 3.1274360646236326, "grad_norm": 0.28217336535453796, "learning_rate": 3.449425287356322e-05, "loss": 0.1095, "step": 2698 }, { "epoch": 3.1285952329203797, "grad_norm": 0.2541414499282837, "learning_rate": 3.4488505747126435e-05, "loss": 0.108, "step": 2699 }, { "epoch": 3.129754401217127, "grad_norm": 0.25402045249938965, "learning_rate": 3.4482758620689657e-05, "loss": 0.1133, "step": 2700 }, { "epoch": 3.130913569513874, "grad_norm": 0.23192809522151947, "learning_rate": 3.447701149425288e-05, "loss": 0.1102, "step": 2701 }, { "epoch": 3.132072737810621, "grad_norm": 0.3315061926841736, "learning_rate": 3.447126436781609e-05, "loss": 0.1214, "step": 2702 }, { "epoch": 3.133231906107368, "grad_norm": 0.2749609053134918, "learning_rate": 3.4465517241379315e-05, "loss": 0.1005, "step": 2703 }, { "epoch": 3.1343910744041152, "grad_norm": 0.36684563755989075, "learning_rate": 3.445977011494253e-05, "loss": 0.1055, "step": 2704 }, { "epoch": 3.1355502427008624, "grad_norm": 0.30152618885040283, "learning_rate": 3.4454022988505745e-05, "loss": 0.1053, "step": 2705 }, { "epoch": 3.136709410997609, "grad_norm": 0.24934984743595123, "learning_rate": 3.4448275862068966e-05, "loss": 0.1231, "step": 2706 }, { "epoch": 3.137868579294356, "grad_norm": 0.3130822777748108, "learning_rate": 3.444252873563219e-05, "loss": 0.1075, "step": 2707 }, { "epoch": 3.1390277475911033, "grad_norm": 0.28072500228881836, "learning_rate": 3.443678160919541e-05, "loss": 0.1016, "step": 2708 }, { "epoch": 3.1401869158878504, "grad_norm": 0.36836108565330505, "learning_rate": 3.4431034482758625e-05, "loss": 0.1132, "step": 2709 }, { "epoch": 3.1413460841845975, "grad_norm": 0.40246373414993286, "learning_rate": 3.442528735632184e-05, "loss": 0.1265, "step": 2710 }, { "epoch": 3.1425052524813446, "grad_norm": 0.3051183223724365, "learning_rate": 3.441954022988506e-05, "loss": 0.0992, "step": 2711 }, { "epoch": 3.1436644207780917, "grad_norm": 0.2942332625389099, "learning_rate": 3.4413793103448276e-05, "loss": 0.1132, "step": 2712 }, { "epoch": 3.144823589074839, "grad_norm": 0.34289440512657166, "learning_rate": 3.440804597701149e-05, "loss": 0.112, "step": 2713 }, { "epoch": 3.145982757371586, "grad_norm": 0.273400217294693, "learning_rate": 3.440229885057471e-05, "loss": 0.0977, "step": 2714 }, { "epoch": 3.147141925668333, "grad_norm": 0.3162487745285034, "learning_rate": 3.4396551724137934e-05, "loss": 0.1127, "step": 2715 }, { "epoch": 3.14830109396508, "grad_norm": 0.33499714732170105, "learning_rate": 3.439080459770115e-05, "loss": 0.112, "step": 2716 }, { "epoch": 3.1494602622618273, "grad_norm": 0.28966301679611206, "learning_rate": 3.438505747126437e-05, "loss": 0.1062, "step": 2717 }, { "epoch": 3.1506194305585744, "grad_norm": 0.2868296205997467, "learning_rate": 3.4379310344827586e-05, "loss": 0.1149, "step": 2718 }, { "epoch": 3.151778598855321, "grad_norm": 0.4284828007221222, "learning_rate": 3.437356321839081e-05, "loss": 0.1244, "step": 2719 }, { "epoch": 3.152937767152068, "grad_norm": 0.3205833435058594, "learning_rate": 3.436781609195402e-05, "loss": 0.0985, "step": 2720 }, { "epoch": 3.1540969354488153, "grad_norm": 0.29444050788879395, "learning_rate": 3.4362068965517244e-05, "loss": 0.1103, "step": 2721 }, { "epoch": 3.1552561037455624, "grad_norm": 0.2642725706100464, "learning_rate": 3.4356321839080466e-05, "loss": 0.1047, "step": 2722 }, { "epoch": 3.1564152720423095, "grad_norm": 0.34533828496932983, "learning_rate": 3.435057471264368e-05, "loss": 0.1131, "step": 2723 }, { "epoch": 3.1575744403390567, "grad_norm": 0.31518617272377014, "learning_rate": 3.4344827586206896e-05, "loss": 0.1065, "step": 2724 }, { "epoch": 3.1587336086358038, "grad_norm": 0.3125956058502197, "learning_rate": 3.433908045977012e-05, "loss": 0.1039, "step": 2725 }, { "epoch": 3.159892776932551, "grad_norm": 0.24645163118839264, "learning_rate": 3.433333333333333e-05, "loss": 0.1016, "step": 2726 }, { "epoch": 3.161051945229298, "grad_norm": 0.42728784680366516, "learning_rate": 3.4327586206896554e-05, "loss": 0.101, "step": 2727 }, { "epoch": 3.162211113526045, "grad_norm": 0.5982040166854858, "learning_rate": 3.4321839080459775e-05, "loss": 0.1286, "step": 2728 }, { "epoch": 3.1633702818227922, "grad_norm": 0.3353968560695648, "learning_rate": 3.431609195402299e-05, "loss": 0.1232, "step": 2729 }, { "epoch": 3.1645294501195393, "grad_norm": 0.3435153663158417, "learning_rate": 3.431034482758621e-05, "loss": 0.1188, "step": 2730 }, { "epoch": 3.1656886184162865, "grad_norm": 0.39836829900741577, "learning_rate": 3.430459770114943e-05, "loss": 0.121, "step": 2731 }, { "epoch": 3.1668477867130336, "grad_norm": 0.30429473519325256, "learning_rate": 3.429885057471264e-05, "loss": 0.1089, "step": 2732 }, { "epoch": 3.1680069550097807, "grad_norm": 0.3069549798965454, "learning_rate": 3.4293103448275864e-05, "loss": 0.1196, "step": 2733 }, { "epoch": 3.1691661233065274, "grad_norm": 0.2685747444629669, "learning_rate": 3.428735632183908e-05, "loss": 0.1202, "step": 2734 }, { "epoch": 3.1703252916032745, "grad_norm": 0.2461479902267456, "learning_rate": 3.42816091954023e-05, "loss": 0.1145, "step": 2735 }, { "epoch": 3.1714844599000216, "grad_norm": 0.29422634840011597, "learning_rate": 3.427586206896552e-05, "loss": 0.118, "step": 2736 }, { "epoch": 3.1726436281967687, "grad_norm": 0.3010058104991913, "learning_rate": 3.427011494252874e-05, "loss": 0.1168, "step": 2737 }, { "epoch": 3.173802796493516, "grad_norm": 0.2945718765258789, "learning_rate": 3.426436781609196e-05, "loss": 0.109, "step": 2738 }, { "epoch": 3.174961964790263, "grad_norm": 0.2734999358654022, "learning_rate": 3.425862068965517e-05, "loss": 0.1117, "step": 2739 }, { "epoch": 3.17612113308701, "grad_norm": 0.25881072878837585, "learning_rate": 3.425287356321839e-05, "loss": 0.1062, "step": 2740 }, { "epoch": 3.177280301383757, "grad_norm": 0.2748000919818878, "learning_rate": 3.424712643678161e-05, "loss": 0.1093, "step": 2741 }, { "epoch": 3.1784394696805043, "grad_norm": 0.37148886919021606, "learning_rate": 3.424137931034483e-05, "loss": 0.1227, "step": 2742 }, { "epoch": 3.1795986379772514, "grad_norm": 0.32973724603652954, "learning_rate": 3.4235632183908046e-05, "loss": 0.1057, "step": 2743 }, { "epoch": 3.1807578062739985, "grad_norm": 0.27131950855255127, "learning_rate": 3.422988505747127e-05, "loss": 0.106, "step": 2744 }, { "epoch": 3.1819169745707456, "grad_norm": 0.25479647517204285, "learning_rate": 3.422413793103448e-05, "loss": 0.1154, "step": 2745 }, { "epoch": 3.1830761428674927, "grad_norm": 0.331696480512619, "learning_rate": 3.4218390804597705e-05, "loss": 0.1166, "step": 2746 }, { "epoch": 3.18423531116424, "grad_norm": 0.30211329460144043, "learning_rate": 3.421264367816092e-05, "loss": 0.1158, "step": 2747 }, { "epoch": 3.185394479460987, "grad_norm": 0.265440434217453, "learning_rate": 3.420689655172414e-05, "loss": 0.0981, "step": 2748 }, { "epoch": 3.1865536477577336, "grad_norm": 0.34413856267929077, "learning_rate": 3.420114942528736e-05, "loss": 0.1058, "step": 2749 }, { "epoch": 3.1877128160544808, "grad_norm": 0.3049376308917999, "learning_rate": 3.419540229885058e-05, "loss": 0.107, "step": 2750 }, { "epoch": 3.188871984351228, "grad_norm": 0.24063755571842194, "learning_rate": 3.418965517241379e-05, "loss": 0.1063, "step": 2751 }, { "epoch": 3.190031152647975, "grad_norm": 0.40382859110832214, "learning_rate": 3.4183908045977014e-05, "loss": 0.1112, "step": 2752 }, { "epoch": 3.191190320944722, "grad_norm": 0.3165836036205292, "learning_rate": 3.417816091954023e-05, "loss": 0.1113, "step": 2753 }, { "epoch": 3.192349489241469, "grad_norm": 0.3423636555671692, "learning_rate": 3.4172413793103444e-05, "loss": 0.1036, "step": 2754 }, { "epoch": 3.1935086575382163, "grad_norm": 0.3611285388469696, "learning_rate": 3.4166666666666666e-05, "loss": 0.1057, "step": 2755 }, { "epoch": 3.1946678258349634, "grad_norm": 0.2877699136734009, "learning_rate": 3.416091954022989e-05, "loss": 0.1124, "step": 2756 }, { "epoch": 3.1958269941317106, "grad_norm": 0.32133352756500244, "learning_rate": 3.415517241379311e-05, "loss": 0.1136, "step": 2757 }, { "epoch": 3.1969861624284577, "grad_norm": 0.28176063299179077, "learning_rate": 3.4149425287356324e-05, "loss": 0.1036, "step": 2758 }, { "epoch": 3.198145330725205, "grad_norm": 0.35929253697395325, "learning_rate": 3.414367816091954e-05, "loss": 0.1226, "step": 2759 }, { "epoch": 3.199304499021952, "grad_norm": 0.37938886880874634, "learning_rate": 3.413793103448276e-05, "loss": 0.1133, "step": 2760 }, { "epoch": 3.200463667318699, "grad_norm": 0.27585509419441223, "learning_rate": 3.4132183908045976e-05, "loss": 0.0993, "step": 2761 }, { "epoch": 3.2016228356154457, "grad_norm": 0.33102548122406006, "learning_rate": 3.41264367816092e-05, "loss": 0.1055, "step": 2762 }, { "epoch": 3.202782003912193, "grad_norm": 0.4190862774848938, "learning_rate": 3.412068965517242e-05, "loss": 0.1232, "step": 2763 }, { "epoch": 3.20394117220894, "grad_norm": 0.3287636935710907, "learning_rate": 3.4114942528735634e-05, "loss": 0.1189, "step": 2764 }, { "epoch": 3.205100340505687, "grad_norm": 0.27761226892471313, "learning_rate": 3.4109195402298856e-05, "loss": 0.1102, "step": 2765 }, { "epoch": 3.206259508802434, "grad_norm": 0.271768718957901, "learning_rate": 3.410344827586207e-05, "loss": 0.1042, "step": 2766 }, { "epoch": 3.2074186770991813, "grad_norm": 0.44091716408729553, "learning_rate": 3.4097701149425285e-05, "loss": 0.1098, "step": 2767 }, { "epoch": 3.2085778453959284, "grad_norm": 0.32181066274642944, "learning_rate": 3.409195402298851e-05, "loss": 0.1077, "step": 2768 }, { "epoch": 3.2097370136926755, "grad_norm": 0.33620011806488037, "learning_rate": 3.408620689655173e-05, "loss": 0.1113, "step": 2769 }, { "epoch": 3.2108961819894226, "grad_norm": 0.3269021511077881, "learning_rate": 3.4080459770114944e-05, "loss": 0.1266, "step": 2770 }, { "epoch": 3.2120553502861697, "grad_norm": 0.3237850069999695, "learning_rate": 3.4074712643678165e-05, "loss": 0.1132, "step": 2771 }, { "epoch": 3.213214518582917, "grad_norm": 0.37990424036979675, "learning_rate": 3.406896551724138e-05, "loss": 0.1028, "step": 2772 }, { "epoch": 3.214373686879664, "grad_norm": 0.28017759323120117, "learning_rate": 3.40632183908046e-05, "loss": 0.108, "step": 2773 }, { "epoch": 3.215532855176411, "grad_norm": 0.3556600511074066, "learning_rate": 3.405747126436782e-05, "loss": 0.121, "step": 2774 }, { "epoch": 3.216692023473158, "grad_norm": 0.26597216725349426, "learning_rate": 3.405172413793103e-05, "loss": 0.1024, "step": 2775 }, { "epoch": 3.2178511917699053, "grad_norm": 0.35249412059783936, "learning_rate": 3.4045977011494253e-05, "loss": 0.1202, "step": 2776 }, { "epoch": 3.219010360066652, "grad_norm": 0.35307571291923523, "learning_rate": 3.4040229885057475e-05, "loss": 0.1139, "step": 2777 }, { "epoch": 3.220169528363399, "grad_norm": 0.25899243354797363, "learning_rate": 3.403448275862069e-05, "loss": 0.1079, "step": 2778 }, { "epoch": 3.221328696660146, "grad_norm": 0.3125319480895996, "learning_rate": 3.402873563218391e-05, "loss": 0.1137, "step": 2779 }, { "epoch": 3.2224878649568933, "grad_norm": 0.3008607029914856, "learning_rate": 3.4022988505747127e-05, "loss": 0.1013, "step": 2780 }, { "epoch": 3.2236470332536404, "grad_norm": 0.2770404517650604, "learning_rate": 3.401724137931034e-05, "loss": 0.1137, "step": 2781 }, { "epoch": 3.2248062015503876, "grad_norm": 0.35955601930618286, "learning_rate": 3.401149425287356e-05, "loss": 0.1108, "step": 2782 }, { "epoch": 3.2259653698471347, "grad_norm": 0.2946624159812927, "learning_rate": 3.4005747126436785e-05, "loss": 0.1023, "step": 2783 }, { "epoch": 3.227124538143882, "grad_norm": 0.34001263976097107, "learning_rate": 3.4000000000000007e-05, "loss": 0.1049, "step": 2784 }, { "epoch": 3.228283706440629, "grad_norm": 0.2924330234527588, "learning_rate": 3.399425287356322e-05, "loss": 0.1097, "step": 2785 }, { "epoch": 3.229442874737376, "grad_norm": 0.3034241199493408, "learning_rate": 3.3988505747126436e-05, "loss": 0.1148, "step": 2786 }, { "epoch": 3.230602043034123, "grad_norm": 0.3855332136154175, "learning_rate": 3.398275862068966e-05, "loss": 0.0945, "step": 2787 }, { "epoch": 3.2317612113308702, "grad_norm": 0.24068623781204224, "learning_rate": 3.397701149425287e-05, "loss": 0.1092, "step": 2788 }, { "epoch": 3.2329203796276174, "grad_norm": 0.2632269859313965, "learning_rate": 3.3971264367816095e-05, "loss": 0.1123, "step": 2789 }, { "epoch": 3.2340795479243645, "grad_norm": 0.2876719832420349, "learning_rate": 3.3965517241379316e-05, "loss": 0.1158, "step": 2790 }, { "epoch": 3.2352387162211116, "grad_norm": 0.3035225570201874, "learning_rate": 3.395977011494253e-05, "loss": 0.1187, "step": 2791 }, { "epoch": 3.2363978845178583, "grad_norm": 0.27335041761398315, "learning_rate": 3.395402298850575e-05, "loss": 0.1005, "step": 2792 }, { "epoch": 3.2375570528146054, "grad_norm": 0.2715912461280823, "learning_rate": 3.394827586206897e-05, "loss": 0.1102, "step": 2793 }, { "epoch": 3.2387162211113525, "grad_norm": 0.2653818726539612, "learning_rate": 3.394252873563218e-05, "loss": 0.1119, "step": 2794 }, { "epoch": 3.2398753894080996, "grad_norm": 0.3413951098918915, "learning_rate": 3.3936781609195404e-05, "loss": 0.1209, "step": 2795 }, { "epoch": 3.2410345577048467, "grad_norm": 0.3429463505744934, "learning_rate": 3.393103448275862e-05, "loss": 0.1285, "step": 2796 }, { "epoch": 3.242193726001594, "grad_norm": 0.3493320345878601, "learning_rate": 3.392528735632184e-05, "loss": 0.1169, "step": 2797 }, { "epoch": 3.243352894298341, "grad_norm": 0.31924867630004883, "learning_rate": 3.391954022988506e-05, "loss": 0.1119, "step": 2798 }, { "epoch": 3.244512062595088, "grad_norm": 0.42040061950683594, "learning_rate": 3.391379310344828e-05, "loss": 0.1106, "step": 2799 }, { "epoch": 3.245671230891835, "grad_norm": 0.3470522463321686, "learning_rate": 3.390804597701149e-05, "loss": 0.1122, "step": 2800 }, { "epoch": 3.2468303991885823, "grad_norm": 0.356770783662796, "learning_rate": 3.3902298850574714e-05, "loss": 0.1191, "step": 2801 }, { "epoch": 3.2479895674853294, "grad_norm": 0.40925726294517517, "learning_rate": 3.389655172413793e-05, "loss": 0.1073, "step": 2802 }, { "epoch": 3.2491487357820765, "grad_norm": 0.34631669521331787, "learning_rate": 3.389080459770115e-05, "loss": 0.1188, "step": 2803 }, { "epoch": 3.2503079040788236, "grad_norm": 0.3311026394367218, "learning_rate": 3.388505747126437e-05, "loss": 0.1099, "step": 2804 }, { "epoch": 3.2514670723755703, "grad_norm": 0.34615299105644226, "learning_rate": 3.387931034482759e-05, "loss": 0.1197, "step": 2805 }, { "epoch": 3.2526262406723174, "grad_norm": 0.33618980646133423, "learning_rate": 3.387356321839081e-05, "loss": 0.1158, "step": 2806 }, { "epoch": 3.2537854089690645, "grad_norm": 0.30485883355140686, "learning_rate": 3.3867816091954024e-05, "loss": 0.1141, "step": 2807 }, { "epoch": 3.2549445772658117, "grad_norm": 0.35726383328437805, "learning_rate": 3.386206896551724e-05, "loss": 0.1059, "step": 2808 }, { "epoch": 3.2561037455625588, "grad_norm": 0.45298418402671814, "learning_rate": 3.385632183908046e-05, "loss": 0.1142, "step": 2809 }, { "epoch": 3.257262913859306, "grad_norm": 0.2797136902809143, "learning_rate": 3.385057471264368e-05, "loss": 0.104, "step": 2810 }, { "epoch": 3.258422082156053, "grad_norm": 0.2695922553539276, "learning_rate": 3.3844827586206904e-05, "loss": 0.1047, "step": 2811 }, { "epoch": 3.2595812504528, "grad_norm": 0.2859964668750763, "learning_rate": 3.383908045977012e-05, "loss": 0.1057, "step": 2812 }, { "epoch": 3.2607404187495472, "grad_norm": 0.26754042506217957, "learning_rate": 3.3833333333333334e-05, "loss": 0.1138, "step": 2813 }, { "epoch": 3.2618995870462943, "grad_norm": 0.3192819058895111, "learning_rate": 3.3827586206896555e-05, "loss": 0.1159, "step": 2814 }, { "epoch": 3.2630587553430415, "grad_norm": 0.33752667903900146, "learning_rate": 3.382183908045977e-05, "loss": 0.1124, "step": 2815 }, { "epoch": 3.2642179236397886, "grad_norm": 0.2708832025527954, "learning_rate": 3.3816091954022985e-05, "loss": 0.1181, "step": 2816 }, { "epoch": 3.2653770919365357, "grad_norm": 0.3345796465873718, "learning_rate": 3.381034482758621e-05, "loss": 0.1123, "step": 2817 }, { "epoch": 3.266536260233283, "grad_norm": 0.33695465326309204, "learning_rate": 3.380459770114943e-05, "loss": 0.0986, "step": 2818 }, { "epoch": 3.26769542853003, "grad_norm": 0.2809793949127197, "learning_rate": 3.379885057471264e-05, "loss": 0.1151, "step": 2819 }, { "epoch": 3.2688545968267766, "grad_norm": 0.35505884885787964, "learning_rate": 3.3793103448275865e-05, "loss": 0.1203, "step": 2820 }, { "epoch": 3.2700137651235237, "grad_norm": 0.2393438071012497, "learning_rate": 3.378735632183908e-05, "loss": 0.1109, "step": 2821 }, { "epoch": 3.271172933420271, "grad_norm": 0.35280153155326843, "learning_rate": 3.37816091954023e-05, "loss": 0.1217, "step": 2822 }, { "epoch": 3.272332101717018, "grad_norm": 0.3676847815513611, "learning_rate": 3.3775862068965516e-05, "loss": 0.1182, "step": 2823 }, { "epoch": 3.273491270013765, "grad_norm": 0.26780447363853455, "learning_rate": 3.377011494252874e-05, "loss": 0.1093, "step": 2824 }, { "epoch": 3.274650438310512, "grad_norm": 0.3136596381664276, "learning_rate": 3.376436781609196e-05, "loss": 0.1126, "step": 2825 }, { "epoch": 3.2758096066072593, "grad_norm": 0.26697683334350586, "learning_rate": 3.3758620689655175e-05, "loss": 0.1163, "step": 2826 }, { "epoch": 3.2769687749040064, "grad_norm": 0.32463008165359497, "learning_rate": 3.375287356321839e-05, "loss": 0.1113, "step": 2827 }, { "epoch": 3.2781279432007535, "grad_norm": 0.3022761344909668, "learning_rate": 3.374712643678161e-05, "loss": 0.113, "step": 2828 }, { "epoch": 3.2792871114975006, "grad_norm": 0.2690412700176239, "learning_rate": 3.3741379310344826e-05, "loss": 0.1018, "step": 2829 }, { "epoch": 3.2804462797942477, "grad_norm": 0.2957005798816681, "learning_rate": 3.373563218390805e-05, "loss": 0.1163, "step": 2830 }, { "epoch": 3.281605448090995, "grad_norm": 0.3116534352302551, "learning_rate": 3.372988505747127e-05, "loss": 0.1263, "step": 2831 }, { "epoch": 3.282764616387742, "grad_norm": 0.2315097600221634, "learning_rate": 3.3724137931034484e-05, "loss": 0.1036, "step": 2832 }, { "epoch": 3.2839237846844886, "grad_norm": 0.5004782676696777, "learning_rate": 3.3718390804597706e-05, "loss": 0.1086, "step": 2833 }, { "epoch": 3.285082952981236, "grad_norm": 0.3481523096561432, "learning_rate": 3.371264367816092e-05, "loss": 0.1112, "step": 2834 }, { "epoch": 3.286242121277983, "grad_norm": 0.27010461688041687, "learning_rate": 3.3706896551724136e-05, "loss": 0.1145, "step": 2835 }, { "epoch": 3.28740128957473, "grad_norm": 0.30692821741104126, "learning_rate": 3.370114942528736e-05, "loss": 0.1043, "step": 2836 }, { "epoch": 3.288560457871477, "grad_norm": 0.33265191316604614, "learning_rate": 3.369540229885057e-05, "loss": 0.1151, "step": 2837 }, { "epoch": 3.289719626168224, "grad_norm": 0.328311562538147, "learning_rate": 3.3689655172413794e-05, "loss": 0.1136, "step": 2838 }, { "epoch": 3.2908787944649713, "grad_norm": 0.2806066870689392, "learning_rate": 3.3683908045977016e-05, "loss": 0.114, "step": 2839 }, { "epoch": 3.2920379627617185, "grad_norm": 0.30125707387924194, "learning_rate": 3.367816091954023e-05, "loss": 0.128, "step": 2840 }, { "epoch": 3.2931971310584656, "grad_norm": 0.31410592794418335, "learning_rate": 3.367241379310345e-05, "loss": 0.1133, "step": 2841 }, { "epoch": 3.2943562993552127, "grad_norm": 0.24413692951202393, "learning_rate": 3.366666666666667e-05, "loss": 0.1056, "step": 2842 }, { "epoch": 3.29551546765196, "grad_norm": 0.3842843472957611, "learning_rate": 3.366091954022988e-05, "loss": 0.1107, "step": 2843 }, { "epoch": 3.296674635948707, "grad_norm": 0.2507694661617279, "learning_rate": 3.3655172413793104e-05, "loss": 0.108, "step": 2844 }, { "epoch": 3.297833804245454, "grad_norm": 0.28499695658683777, "learning_rate": 3.3649425287356326e-05, "loss": 0.1169, "step": 2845 }, { "epoch": 3.298992972542201, "grad_norm": 0.2902618646621704, "learning_rate": 3.364367816091954e-05, "loss": 0.118, "step": 2846 }, { "epoch": 3.3001521408389483, "grad_norm": 0.26456037163734436, "learning_rate": 3.363793103448276e-05, "loss": 0.1105, "step": 2847 }, { "epoch": 3.301311309135695, "grad_norm": 0.3139413595199585, "learning_rate": 3.363218390804598e-05, "loss": 0.1272, "step": 2848 }, { "epoch": 3.302470477432442, "grad_norm": 0.282294899225235, "learning_rate": 3.36264367816092e-05, "loss": 0.1098, "step": 2849 }, { "epoch": 3.303629645729189, "grad_norm": 0.2812632918357849, "learning_rate": 3.3620689655172414e-05, "loss": 0.1124, "step": 2850 }, { "epoch": 3.3047888140259363, "grad_norm": 0.27748167514801025, "learning_rate": 3.3614942528735635e-05, "loss": 0.1028, "step": 2851 }, { "epoch": 3.3059479823226834, "grad_norm": 0.39685526490211487, "learning_rate": 3.360919540229886e-05, "loss": 0.1191, "step": 2852 }, { "epoch": 3.3071071506194305, "grad_norm": 0.33615151047706604, "learning_rate": 3.360344827586207e-05, "loss": 0.1085, "step": 2853 }, { "epoch": 3.3082663189161776, "grad_norm": 0.2919673025608063, "learning_rate": 3.359770114942529e-05, "loss": 0.1045, "step": 2854 }, { "epoch": 3.3094254872129247, "grad_norm": 0.2599186301231384, "learning_rate": 3.359195402298851e-05, "loss": 0.1106, "step": 2855 }, { "epoch": 3.310584655509672, "grad_norm": 0.3396240174770355, "learning_rate": 3.3586206896551723e-05, "loss": 0.1063, "step": 2856 }, { "epoch": 3.311743823806419, "grad_norm": 0.2739236354827881, "learning_rate": 3.358045977011494e-05, "loss": 0.1033, "step": 2857 }, { "epoch": 3.312902992103166, "grad_norm": 0.5088075399398804, "learning_rate": 3.357471264367817e-05, "loss": 0.1295, "step": 2858 }, { "epoch": 3.314062160399913, "grad_norm": 0.3902330994606018, "learning_rate": 3.356896551724138e-05, "loss": 0.1233, "step": 2859 }, { "epoch": 3.3152213286966603, "grad_norm": 0.33790311217308044, "learning_rate": 3.35632183908046e-05, "loss": 0.1204, "step": 2860 }, { "epoch": 3.3163804969934074, "grad_norm": 0.2987493872642517, "learning_rate": 3.355747126436782e-05, "loss": 0.1068, "step": 2861 }, { "epoch": 3.3175396652901545, "grad_norm": 0.3023374080657959, "learning_rate": 3.355172413793103e-05, "loss": 0.1109, "step": 2862 }, { "epoch": 3.318698833586901, "grad_norm": 0.26257169246673584, "learning_rate": 3.3545977011494255e-05, "loss": 0.1092, "step": 2863 }, { "epoch": 3.3198580018836483, "grad_norm": 0.22975148260593414, "learning_rate": 3.354022988505747e-05, "loss": 0.1036, "step": 2864 }, { "epoch": 3.3210171701803954, "grad_norm": 0.29056644439697266, "learning_rate": 3.353448275862069e-05, "loss": 0.1094, "step": 2865 }, { "epoch": 3.3221763384771426, "grad_norm": 0.3132513761520386, "learning_rate": 3.352873563218391e-05, "loss": 0.1257, "step": 2866 }, { "epoch": 3.3233355067738897, "grad_norm": 0.23093485832214355, "learning_rate": 3.352298850574713e-05, "loss": 0.1044, "step": 2867 }, { "epoch": 3.324494675070637, "grad_norm": 0.3373148441314697, "learning_rate": 3.351724137931035e-05, "loss": 0.1174, "step": 2868 }, { "epoch": 3.325653843367384, "grad_norm": 0.2774425148963928, "learning_rate": 3.3511494252873565e-05, "loss": 0.1137, "step": 2869 }, { "epoch": 3.326813011664131, "grad_norm": 0.2680792808532715, "learning_rate": 3.350574712643678e-05, "loss": 0.1045, "step": 2870 }, { "epoch": 3.327972179960878, "grad_norm": 0.39170289039611816, "learning_rate": 3.35e-05, "loss": 0.1121, "step": 2871 }, { "epoch": 3.3291313482576252, "grad_norm": 0.2982361614704132, "learning_rate": 3.349425287356322e-05, "loss": 0.1169, "step": 2872 }, { "epoch": 3.3302905165543724, "grad_norm": 0.2685178816318512, "learning_rate": 3.348850574712644e-05, "loss": 0.1001, "step": 2873 }, { "epoch": 3.3314496848511195, "grad_norm": 0.35294216871261597, "learning_rate": 3.348275862068966e-05, "loss": 0.1087, "step": 2874 }, { "epoch": 3.3326088531478666, "grad_norm": 0.3147522509098053, "learning_rate": 3.3477011494252874e-05, "loss": 0.1244, "step": 2875 }, { "epoch": 3.3337680214446133, "grad_norm": 0.2905179560184479, "learning_rate": 3.347126436781609e-05, "loss": 0.1017, "step": 2876 }, { "epoch": 3.334927189741361, "grad_norm": 0.28454288840293884, "learning_rate": 3.346551724137931e-05, "loss": 0.117, "step": 2877 }, { "epoch": 3.3360863580381075, "grad_norm": 0.24592730402946472, "learning_rate": 3.3459770114942526e-05, "loss": 0.105, "step": 2878 }, { "epoch": 3.3372455263348546, "grad_norm": 0.2803703248500824, "learning_rate": 3.3454022988505754e-05, "loss": 0.1251, "step": 2879 }, { "epoch": 3.3384046946316017, "grad_norm": 0.2251860499382019, "learning_rate": 3.344827586206897e-05, "loss": 0.0974, "step": 2880 }, { "epoch": 3.339563862928349, "grad_norm": 0.30766502022743225, "learning_rate": 3.3442528735632184e-05, "loss": 0.1055, "step": 2881 }, { "epoch": 3.340723031225096, "grad_norm": 0.31499797105789185, "learning_rate": 3.3436781609195406e-05, "loss": 0.1112, "step": 2882 }, { "epoch": 3.341882199521843, "grad_norm": 0.3749004900455475, "learning_rate": 3.343103448275862e-05, "loss": 0.1056, "step": 2883 }, { "epoch": 3.34304136781859, "grad_norm": 0.3857807517051697, "learning_rate": 3.3425287356321836e-05, "loss": 0.1111, "step": 2884 }, { "epoch": 3.3442005361153373, "grad_norm": 0.31204038858413696, "learning_rate": 3.341954022988506e-05, "loss": 0.1077, "step": 2885 }, { "epoch": 3.3453597044120844, "grad_norm": 0.3444509506225586, "learning_rate": 3.341379310344828e-05, "loss": 0.1068, "step": 2886 }, { "epoch": 3.3465188727088315, "grad_norm": 0.34033796191215515, "learning_rate": 3.34080459770115e-05, "loss": 0.1187, "step": 2887 }, { "epoch": 3.3476780410055786, "grad_norm": 0.3054271936416626, "learning_rate": 3.3402298850574715e-05, "loss": 0.118, "step": 2888 }, { "epoch": 3.3488372093023258, "grad_norm": 0.2640574276447296, "learning_rate": 3.339655172413793e-05, "loss": 0.1113, "step": 2889 }, { "epoch": 3.349996377599073, "grad_norm": 0.9027268886566162, "learning_rate": 3.339080459770115e-05, "loss": 0.1232, "step": 2890 }, { "epoch": 3.3511555458958195, "grad_norm": 0.2914552390575409, "learning_rate": 3.338505747126437e-05, "loss": 0.1053, "step": 2891 }, { "epoch": 3.3523147141925667, "grad_norm": 0.2577556371688843, "learning_rate": 3.337931034482759e-05, "loss": 0.1057, "step": 2892 }, { "epoch": 3.3534738824893138, "grad_norm": 0.2784647047519684, "learning_rate": 3.337356321839081e-05, "loss": 0.1012, "step": 2893 }, { "epoch": 3.354633050786061, "grad_norm": 0.2601286470890045, "learning_rate": 3.3367816091954025e-05, "loss": 0.1066, "step": 2894 }, { "epoch": 3.355792219082808, "grad_norm": 0.25884032249450684, "learning_rate": 3.336206896551724e-05, "loss": 0.1121, "step": 2895 }, { "epoch": 3.356951387379555, "grad_norm": 0.3100210428237915, "learning_rate": 3.335632183908046e-05, "loss": 0.1129, "step": 2896 }, { "epoch": 3.3581105556763022, "grad_norm": 0.3463558256626129, "learning_rate": 3.335057471264368e-05, "loss": 0.1178, "step": 2897 }, { "epoch": 3.3592697239730493, "grad_norm": 0.3152965009212494, "learning_rate": 3.33448275862069e-05, "loss": 0.1214, "step": 2898 }, { "epoch": 3.3604288922697965, "grad_norm": 0.32749250531196594, "learning_rate": 3.333908045977012e-05, "loss": 0.1165, "step": 2899 }, { "epoch": 3.3615880605665436, "grad_norm": 0.3287135660648346, "learning_rate": 3.3333333333333335e-05, "loss": 0.124, "step": 2900 }, { "epoch": 3.3627472288632907, "grad_norm": 0.2673903703689575, "learning_rate": 3.3327586206896557e-05, "loss": 0.1162, "step": 2901 }, { "epoch": 3.363906397160038, "grad_norm": 0.29948368668556213, "learning_rate": 3.332183908045977e-05, "loss": 0.1116, "step": 2902 }, { "epoch": 3.365065565456785, "grad_norm": 0.4442507028579712, "learning_rate": 3.3316091954022986e-05, "loss": 0.1093, "step": 2903 }, { "epoch": 3.3662247337535316, "grad_norm": 0.2337656021118164, "learning_rate": 3.331034482758621e-05, "loss": 0.0952, "step": 2904 }, { "epoch": 3.367383902050279, "grad_norm": 0.2944662272930145, "learning_rate": 3.330459770114942e-05, "loss": 0.1011, "step": 2905 }, { "epoch": 3.368543070347026, "grad_norm": 0.24339404702186584, "learning_rate": 3.3298850574712645e-05, "loss": 0.098, "step": 2906 }, { "epoch": 3.369702238643773, "grad_norm": 0.33986908197402954, "learning_rate": 3.3293103448275866e-05, "loss": 0.1111, "step": 2907 }, { "epoch": 3.37086140694052, "grad_norm": 0.333913117647171, "learning_rate": 3.328735632183908e-05, "loss": 0.1163, "step": 2908 }, { "epoch": 3.372020575237267, "grad_norm": 0.2759074568748474, "learning_rate": 3.32816091954023e-05, "loss": 0.1083, "step": 2909 }, { "epoch": 3.3731797435340143, "grad_norm": 0.264443039894104, "learning_rate": 3.327586206896552e-05, "loss": 0.1069, "step": 2910 }, { "epoch": 3.3743389118307614, "grad_norm": 0.33492133021354675, "learning_rate": 3.327011494252873e-05, "loss": 0.1081, "step": 2911 }, { "epoch": 3.3754980801275085, "grad_norm": 0.37320229411125183, "learning_rate": 3.3264367816091954e-05, "loss": 0.1177, "step": 2912 }, { "epoch": 3.3766572484242556, "grad_norm": 0.3110753297805786, "learning_rate": 3.3258620689655176e-05, "loss": 0.1062, "step": 2913 }, { "epoch": 3.3778164167210027, "grad_norm": 0.35673680901527405, "learning_rate": 3.325287356321839e-05, "loss": 0.1095, "step": 2914 }, { "epoch": 3.37897558501775, "grad_norm": 0.3656083643436432, "learning_rate": 3.324712643678161e-05, "loss": 0.1075, "step": 2915 }, { "epoch": 3.380134753314497, "grad_norm": 0.29794302582740784, "learning_rate": 3.324137931034483e-05, "loss": 0.1062, "step": 2916 }, { "epoch": 3.381293921611244, "grad_norm": 0.3165527284145355, "learning_rate": 3.323563218390805e-05, "loss": 0.1035, "step": 2917 }, { "epoch": 3.382453089907991, "grad_norm": 0.44483765959739685, "learning_rate": 3.3229885057471264e-05, "loss": 0.1039, "step": 2918 }, { "epoch": 3.383612258204738, "grad_norm": 0.3703659474849701, "learning_rate": 3.3224137931034486e-05, "loss": 0.1176, "step": 2919 }, { "epoch": 3.384771426501485, "grad_norm": 0.2532442510128021, "learning_rate": 3.321839080459771e-05, "loss": 0.1028, "step": 2920 }, { "epoch": 3.385930594798232, "grad_norm": 0.2925904393196106, "learning_rate": 3.321264367816092e-05, "loss": 0.1037, "step": 2921 }, { "epoch": 3.3870897630949792, "grad_norm": 0.45509782433509827, "learning_rate": 3.320689655172414e-05, "loss": 0.1159, "step": 2922 }, { "epoch": 3.3882489313917263, "grad_norm": 0.36248907446861267, "learning_rate": 3.320114942528736e-05, "loss": 0.114, "step": 2923 }, { "epoch": 3.3894080996884735, "grad_norm": 0.3373396098613739, "learning_rate": 3.3195402298850574e-05, "loss": 0.1131, "step": 2924 }, { "epoch": 3.3905672679852206, "grad_norm": 0.44164901971817017, "learning_rate": 3.3189655172413796e-05, "loss": 0.125, "step": 2925 }, { "epoch": 3.3917264362819677, "grad_norm": 0.32218971848487854, "learning_rate": 3.318390804597701e-05, "loss": 0.1103, "step": 2926 }, { "epoch": 3.392885604578715, "grad_norm": 0.25145384669303894, "learning_rate": 3.317816091954023e-05, "loss": 0.1172, "step": 2927 }, { "epoch": 3.394044772875462, "grad_norm": 0.3067503571510315, "learning_rate": 3.3172413793103454e-05, "loss": 0.1042, "step": 2928 }, { "epoch": 3.395203941172209, "grad_norm": 0.31007614731788635, "learning_rate": 3.316666666666667e-05, "loss": 0.1022, "step": 2929 }, { "epoch": 3.396363109468956, "grad_norm": 0.398503839969635, "learning_rate": 3.3160919540229884e-05, "loss": 0.1158, "step": 2930 }, { "epoch": 3.3975222777657033, "grad_norm": 0.32212406396865845, "learning_rate": 3.3155172413793105e-05, "loss": 0.1178, "step": 2931 }, { "epoch": 3.3986814460624504, "grad_norm": 0.40230342745780945, "learning_rate": 3.314942528735632e-05, "loss": 0.1165, "step": 2932 }, { "epoch": 3.3998406143591975, "grad_norm": 0.46653199195861816, "learning_rate": 3.314367816091954e-05, "loss": 0.1123, "step": 2933 }, { "epoch": 3.400999782655944, "grad_norm": 0.217983677983284, "learning_rate": 3.3137931034482764e-05, "loss": 0.0943, "step": 2934 }, { "epoch": 3.4021589509526913, "grad_norm": 0.3591748774051666, "learning_rate": 3.313218390804598e-05, "loss": 0.1121, "step": 2935 }, { "epoch": 3.4033181192494384, "grad_norm": 0.28786206245422363, "learning_rate": 3.31264367816092e-05, "loss": 0.104, "step": 2936 }, { "epoch": 3.4044772875461855, "grad_norm": 0.23771996796131134, "learning_rate": 3.3120689655172415e-05, "loss": 0.1036, "step": 2937 }, { "epoch": 3.4056364558429326, "grad_norm": 0.2682191729545593, "learning_rate": 3.311494252873563e-05, "loss": 0.1108, "step": 2938 }, { "epoch": 3.4067956241396797, "grad_norm": 0.2601390480995178, "learning_rate": 3.310919540229885e-05, "loss": 0.1215, "step": 2939 }, { "epoch": 3.407954792436427, "grad_norm": 0.3103736639022827, "learning_rate": 3.310344827586207e-05, "loss": 0.1083, "step": 2940 }, { "epoch": 3.409113960733174, "grad_norm": 0.2568131983280182, "learning_rate": 3.309770114942529e-05, "loss": 0.107, "step": 2941 }, { "epoch": 3.410273129029921, "grad_norm": 0.29044461250305176, "learning_rate": 3.309195402298851e-05, "loss": 0.1074, "step": 2942 }, { "epoch": 3.411432297326668, "grad_norm": 0.30530881881713867, "learning_rate": 3.3086206896551725e-05, "loss": 0.1095, "step": 2943 }, { "epoch": 3.4125914656234153, "grad_norm": 0.30023837089538574, "learning_rate": 3.3080459770114946e-05, "loss": 0.1057, "step": 2944 }, { "epoch": 3.4137506339201624, "grad_norm": 0.3304160237312317, "learning_rate": 3.307471264367816e-05, "loss": 0.1015, "step": 2945 }, { "epoch": 3.4149098022169095, "grad_norm": 0.3712501525878906, "learning_rate": 3.3068965517241376e-05, "loss": 0.1215, "step": 2946 }, { "epoch": 3.416068970513656, "grad_norm": 0.36417120695114136, "learning_rate": 3.30632183908046e-05, "loss": 0.1185, "step": 2947 }, { "epoch": 3.4172281388104038, "grad_norm": 0.401915580034256, "learning_rate": 3.305747126436782e-05, "loss": 0.1118, "step": 2948 }, { "epoch": 3.4183873071071504, "grad_norm": 0.27606403827667236, "learning_rate": 3.3051724137931035e-05, "loss": 0.1035, "step": 2949 }, { "epoch": 3.4195464754038976, "grad_norm": 0.28905513882637024, "learning_rate": 3.3045977011494256e-05, "loss": 0.1059, "step": 2950 }, { "epoch": 3.4207056437006447, "grad_norm": 0.30320367217063904, "learning_rate": 3.304022988505747e-05, "loss": 0.1162, "step": 2951 }, { "epoch": 3.421864811997392, "grad_norm": 0.36751824617385864, "learning_rate": 3.303448275862069e-05, "loss": 0.1222, "step": 2952 }, { "epoch": 3.423023980294139, "grad_norm": 0.5594837665557861, "learning_rate": 3.302873563218391e-05, "loss": 0.0988, "step": 2953 }, { "epoch": 3.424183148590886, "grad_norm": 0.2814250588417053, "learning_rate": 3.302298850574713e-05, "loss": 0.1089, "step": 2954 }, { "epoch": 3.425342316887633, "grad_norm": 0.30620330572128296, "learning_rate": 3.301724137931035e-05, "loss": 0.1238, "step": 2955 }, { "epoch": 3.4265014851843802, "grad_norm": 0.3259787857532501, "learning_rate": 3.3011494252873566e-05, "loss": 0.1061, "step": 2956 }, { "epoch": 3.4276606534811274, "grad_norm": 0.3204762041568756, "learning_rate": 3.300574712643678e-05, "loss": 0.1081, "step": 2957 }, { "epoch": 3.4288198217778745, "grad_norm": 0.32642972469329834, "learning_rate": 3.3e-05, "loss": 0.1097, "step": 2958 }, { "epoch": 3.4299789900746216, "grad_norm": 0.3611741065979004, "learning_rate": 3.299425287356322e-05, "loss": 0.1153, "step": 2959 }, { "epoch": 3.4311381583713687, "grad_norm": 0.24560561776161194, "learning_rate": 3.298850574712644e-05, "loss": 0.1122, "step": 2960 }, { "epoch": 3.432297326668116, "grad_norm": 0.2226334512233734, "learning_rate": 3.298275862068966e-05, "loss": 0.1046, "step": 2961 }, { "epoch": 3.4334564949648625, "grad_norm": 0.3255598545074463, "learning_rate": 3.2977011494252876e-05, "loss": 0.1003, "step": 2962 }, { "epoch": 3.4346156632616096, "grad_norm": 0.2790594696998596, "learning_rate": 3.29712643678161e-05, "loss": 0.1162, "step": 2963 }, { "epoch": 3.4357748315583567, "grad_norm": 0.3118756413459778, "learning_rate": 3.296551724137931e-05, "loss": 0.1025, "step": 2964 }, { "epoch": 3.436933999855104, "grad_norm": 0.3635786473751068, "learning_rate": 3.295977011494253e-05, "loss": 0.1311, "step": 2965 }, { "epoch": 3.438093168151851, "grad_norm": 0.2933517098426819, "learning_rate": 3.295402298850575e-05, "loss": 0.1107, "step": 2966 }, { "epoch": 3.439252336448598, "grad_norm": 0.37896665930747986, "learning_rate": 3.2948275862068964e-05, "loss": 0.1058, "step": 2967 }, { "epoch": 3.440411504745345, "grad_norm": 0.23080338537693024, "learning_rate": 3.2942528735632185e-05, "loss": 0.0987, "step": 2968 }, { "epoch": 3.4415706730420923, "grad_norm": 0.262114554643631, "learning_rate": 3.293678160919541e-05, "loss": 0.1152, "step": 2969 }, { "epoch": 3.4427298413388394, "grad_norm": 0.30394595861434937, "learning_rate": 3.293103448275862e-05, "loss": 0.1076, "step": 2970 }, { "epoch": 3.4438890096355865, "grad_norm": 0.30665647983551025, "learning_rate": 3.2925287356321844e-05, "loss": 0.1169, "step": 2971 }, { "epoch": 3.4450481779323336, "grad_norm": 0.35701873898506165, "learning_rate": 3.291954022988506e-05, "loss": 0.1186, "step": 2972 }, { "epoch": 3.4462073462290808, "grad_norm": 0.3494682013988495, "learning_rate": 3.2913793103448273e-05, "loss": 0.1188, "step": 2973 }, { "epoch": 3.447366514525828, "grad_norm": 0.28734609484672546, "learning_rate": 3.2908045977011495e-05, "loss": 0.1164, "step": 2974 }, { "epoch": 3.448525682822575, "grad_norm": 0.25234439969062805, "learning_rate": 3.290229885057472e-05, "loss": 0.107, "step": 2975 }, { "epoch": 3.449684851119322, "grad_norm": 0.3193553388118744, "learning_rate": 3.289655172413793e-05, "loss": 0.1101, "step": 2976 }, { "epoch": 3.4508440194160688, "grad_norm": 0.32543477416038513, "learning_rate": 3.2890804597701153e-05, "loss": 0.1204, "step": 2977 }, { "epoch": 3.452003187712816, "grad_norm": 0.34769558906555176, "learning_rate": 3.288505747126437e-05, "loss": 0.1161, "step": 2978 }, { "epoch": 3.453162356009563, "grad_norm": 0.315849244594574, "learning_rate": 3.287931034482758e-05, "loss": 0.1074, "step": 2979 }, { "epoch": 3.45432152430631, "grad_norm": 0.3821331262588501, "learning_rate": 3.2873563218390805e-05, "loss": 0.1193, "step": 2980 }, { "epoch": 3.4554806926030572, "grad_norm": 0.2808358669281006, "learning_rate": 3.2867816091954027e-05, "loss": 0.1031, "step": 2981 }, { "epoch": 3.4566398608998044, "grad_norm": 0.5840047001838684, "learning_rate": 3.286206896551725e-05, "loss": 0.1185, "step": 2982 }, { "epoch": 3.4577990291965515, "grad_norm": 0.3200035095214844, "learning_rate": 3.285632183908046e-05, "loss": 0.1226, "step": 2983 }, { "epoch": 3.4589581974932986, "grad_norm": 0.3244805335998535, "learning_rate": 3.285057471264368e-05, "loss": 0.1222, "step": 2984 }, { "epoch": 3.4601173657900457, "grad_norm": 0.3481799364089966, "learning_rate": 3.28448275862069e-05, "loss": 0.1145, "step": 2985 }, { "epoch": 3.461276534086793, "grad_norm": 0.40199366211891174, "learning_rate": 3.2839080459770115e-05, "loss": 0.1198, "step": 2986 }, { "epoch": 3.46243570238354, "grad_norm": 0.297984778881073, "learning_rate": 3.283333333333333e-05, "loss": 0.1072, "step": 2987 }, { "epoch": 3.463594870680287, "grad_norm": 0.272242933511734, "learning_rate": 3.282758620689655e-05, "loss": 0.1262, "step": 2988 }, { "epoch": 3.464754038977034, "grad_norm": 0.5346304178237915, "learning_rate": 3.282183908045977e-05, "loss": 0.1111, "step": 2989 }, { "epoch": 3.465913207273781, "grad_norm": 0.25313276052474976, "learning_rate": 3.2816091954022995e-05, "loss": 0.0966, "step": 2990 }, { "epoch": 3.4670723755705284, "grad_norm": 0.2953258156776428, "learning_rate": 3.281034482758621e-05, "loss": 0.11, "step": 2991 }, { "epoch": 3.468231543867275, "grad_norm": 0.2898904085159302, "learning_rate": 3.2804597701149424e-05, "loss": 0.1163, "step": 2992 }, { "epoch": 3.469390712164022, "grad_norm": 0.2814153730869293, "learning_rate": 3.2798850574712646e-05, "loss": 0.1005, "step": 2993 }, { "epoch": 3.4705498804607693, "grad_norm": 0.25759944319725037, "learning_rate": 3.279310344827586e-05, "loss": 0.1051, "step": 2994 }, { "epoch": 3.4717090487575164, "grad_norm": 0.2664312720298767, "learning_rate": 3.278735632183908e-05, "loss": 0.1084, "step": 2995 }, { "epoch": 3.4728682170542635, "grad_norm": 0.26857104897499084, "learning_rate": 3.2781609195402304e-05, "loss": 0.1059, "step": 2996 }, { "epoch": 3.4740273853510106, "grad_norm": 0.6132874488830566, "learning_rate": 3.277586206896552e-05, "loss": 0.1036, "step": 2997 }, { "epoch": 3.4751865536477577, "grad_norm": 0.2552327513694763, "learning_rate": 3.2770114942528734e-05, "loss": 0.0961, "step": 2998 }, { "epoch": 3.476345721944505, "grad_norm": 0.3144066035747528, "learning_rate": 3.2764367816091956e-05, "loss": 0.1203, "step": 2999 }, { "epoch": 3.477504890241252, "grad_norm": 0.2916877567768097, "learning_rate": 3.275862068965517e-05, "loss": 0.1169, "step": 3000 }, { "epoch": 3.478664058537999, "grad_norm": 0.2755250036716461, "learning_rate": 3.275287356321839e-05, "loss": 0.1092, "step": 3001 }, { "epoch": 3.479823226834746, "grad_norm": 0.30885928869247437, "learning_rate": 3.2747126436781614e-05, "loss": 0.1072, "step": 3002 }, { "epoch": 3.4809823951314933, "grad_norm": 0.26637914776802063, "learning_rate": 3.274137931034483e-05, "loss": 0.1115, "step": 3003 }, { "epoch": 3.4821415634282404, "grad_norm": 0.25272154808044434, "learning_rate": 3.273563218390805e-05, "loss": 0.1062, "step": 3004 }, { "epoch": 3.483300731724987, "grad_norm": 0.24959425628185272, "learning_rate": 3.2729885057471266e-05, "loss": 0.1034, "step": 3005 }, { "epoch": 3.4844599000217342, "grad_norm": 0.29696011543273926, "learning_rate": 3.272413793103448e-05, "loss": 0.129, "step": 3006 }, { "epoch": 3.4856190683184813, "grad_norm": 0.24814733862876892, "learning_rate": 3.27183908045977e-05, "loss": 0.1057, "step": 3007 }, { "epoch": 3.4867782366152285, "grad_norm": 0.2551887035369873, "learning_rate": 3.271264367816092e-05, "loss": 0.1072, "step": 3008 }, { "epoch": 3.4879374049119756, "grad_norm": 0.2791741192340851, "learning_rate": 3.2706896551724145e-05, "loss": 0.1059, "step": 3009 }, { "epoch": 3.4890965732087227, "grad_norm": 0.3654879033565521, "learning_rate": 3.270114942528736e-05, "loss": 0.1077, "step": 3010 }, { "epoch": 3.49025574150547, "grad_norm": 0.21658289432525635, "learning_rate": 3.2695402298850575e-05, "loss": 0.0997, "step": 3011 }, { "epoch": 3.491414909802217, "grad_norm": 0.2808610796928406, "learning_rate": 3.26896551724138e-05, "loss": 0.1127, "step": 3012 }, { "epoch": 3.492574078098964, "grad_norm": 0.327570378780365, "learning_rate": 3.268390804597701e-05, "loss": 0.1062, "step": 3013 }, { "epoch": 3.493733246395711, "grad_norm": 0.24436767399311066, "learning_rate": 3.267816091954023e-05, "loss": 0.1024, "step": 3014 }, { "epoch": 3.4948924146924583, "grad_norm": 0.33249321579933167, "learning_rate": 3.267241379310345e-05, "loss": 0.1109, "step": 3015 }, { "epoch": 3.4960515829892054, "grad_norm": 0.31921130418777466, "learning_rate": 3.266666666666667e-05, "loss": 0.1264, "step": 3016 }, { "epoch": 3.4972107512859525, "grad_norm": 0.2958783805370331, "learning_rate": 3.2660919540229885e-05, "loss": 0.1125, "step": 3017 }, { "epoch": 3.4983699195826996, "grad_norm": 0.2842378318309784, "learning_rate": 3.265517241379311e-05, "loss": 0.1187, "step": 3018 }, { "epoch": 3.4995290878794467, "grad_norm": 0.37194153666496277, "learning_rate": 3.264942528735632e-05, "loss": 0.1131, "step": 3019 }, { "epoch": 3.5006882561761934, "grad_norm": 0.342540979385376, "learning_rate": 3.264367816091954e-05, "loss": 0.1059, "step": 3020 }, { "epoch": 3.5018474244729405, "grad_norm": 0.517421305179596, "learning_rate": 3.263793103448276e-05, "loss": 0.1226, "step": 3021 }, { "epoch": 3.5030065927696876, "grad_norm": 0.2997574508190155, "learning_rate": 3.263218390804598e-05, "loss": 0.0962, "step": 3022 }, { "epoch": 3.5041657610664347, "grad_norm": 0.3273774981498718, "learning_rate": 3.26264367816092e-05, "loss": 0.1205, "step": 3023 }, { "epoch": 3.505324929363182, "grad_norm": 0.25956934690475464, "learning_rate": 3.2620689655172416e-05, "loss": 0.1186, "step": 3024 }, { "epoch": 3.506484097659929, "grad_norm": 0.306949645280838, "learning_rate": 3.261494252873563e-05, "loss": 0.1115, "step": 3025 }, { "epoch": 3.507643265956676, "grad_norm": 0.3232230842113495, "learning_rate": 3.260919540229885e-05, "loss": 0.1155, "step": 3026 }, { "epoch": 3.508802434253423, "grad_norm": 0.2607809007167816, "learning_rate": 3.260344827586207e-05, "loss": 0.1142, "step": 3027 }, { "epoch": 3.5099616025501703, "grad_norm": 0.31110328435897827, "learning_rate": 3.259770114942529e-05, "loss": 0.1102, "step": 3028 }, { "epoch": 3.5111207708469174, "grad_norm": 0.24277092516422272, "learning_rate": 3.2591954022988505e-05, "loss": 0.1149, "step": 3029 }, { "epoch": 3.5122799391436645, "grad_norm": 0.29219481348991394, "learning_rate": 3.2586206896551726e-05, "loss": 0.1141, "step": 3030 }, { "epoch": 3.5134391074404117, "grad_norm": 0.24297522008419037, "learning_rate": 3.258045977011495e-05, "loss": 0.1015, "step": 3031 }, { "epoch": 3.5145982757371588, "grad_norm": 0.2821162939071655, "learning_rate": 3.257471264367816e-05, "loss": 0.1153, "step": 3032 }, { "epoch": 3.5157574440339054, "grad_norm": 0.5800468325614929, "learning_rate": 3.256896551724138e-05, "loss": 0.1193, "step": 3033 }, { "epoch": 3.516916612330653, "grad_norm": 0.3560374677181244, "learning_rate": 3.25632183908046e-05, "loss": 0.1076, "step": 3034 }, { "epoch": 3.5180757806273997, "grad_norm": 0.4053660035133362, "learning_rate": 3.2557471264367814e-05, "loss": 0.1098, "step": 3035 }, { "epoch": 3.519234948924147, "grad_norm": 0.36452099680900574, "learning_rate": 3.2551724137931036e-05, "loss": 0.1074, "step": 3036 }, { "epoch": 3.520394117220894, "grad_norm": 0.2924714982509613, "learning_rate": 3.254597701149426e-05, "loss": 0.1131, "step": 3037 }, { "epoch": 3.521553285517641, "grad_norm": 0.30950403213500977, "learning_rate": 3.254022988505747e-05, "loss": 0.1121, "step": 3038 }, { "epoch": 3.522712453814388, "grad_norm": 0.3289961814880371, "learning_rate": 3.2534482758620694e-05, "loss": 0.1153, "step": 3039 }, { "epoch": 3.5238716221111352, "grad_norm": 0.2730542719364166, "learning_rate": 3.252873563218391e-05, "loss": 0.1181, "step": 3040 }, { "epoch": 3.5250307904078824, "grad_norm": 0.4356801211833954, "learning_rate": 3.2522988505747124e-05, "loss": 0.1049, "step": 3041 }, { "epoch": 3.5261899587046295, "grad_norm": 0.2651293873786926, "learning_rate": 3.2517241379310346e-05, "loss": 0.1194, "step": 3042 }, { "epoch": 3.5273491270013766, "grad_norm": 0.3155745267868042, "learning_rate": 3.251149425287357e-05, "loss": 0.1106, "step": 3043 }, { "epoch": 3.5285082952981237, "grad_norm": 0.2664744257926941, "learning_rate": 3.250574712643678e-05, "loss": 0.1166, "step": 3044 }, { "epoch": 3.529667463594871, "grad_norm": 0.3852986693382263, "learning_rate": 3.2500000000000004e-05, "loss": 0.1129, "step": 3045 }, { "epoch": 3.5308266318916175, "grad_norm": 0.31656765937805176, "learning_rate": 3.249425287356322e-05, "loss": 0.1202, "step": 3046 }, { "epoch": 3.531985800188365, "grad_norm": 0.2873196303844452, "learning_rate": 3.248850574712644e-05, "loss": 0.1228, "step": 3047 }, { "epoch": 3.5331449684851117, "grad_norm": 0.29903656244277954, "learning_rate": 3.2482758620689655e-05, "loss": 0.1052, "step": 3048 }, { "epoch": 3.5343041367818593, "grad_norm": 0.2554726302623749, "learning_rate": 3.247701149425287e-05, "loss": 0.1036, "step": 3049 }, { "epoch": 3.535463305078606, "grad_norm": 0.33585837483406067, "learning_rate": 3.24712643678161e-05, "loss": 0.117, "step": 3050 }, { "epoch": 3.536622473375353, "grad_norm": 0.28238433599472046, "learning_rate": 3.2465517241379314e-05, "loss": 0.1106, "step": 3051 }, { "epoch": 3.5377816416721, "grad_norm": 0.33771008253097534, "learning_rate": 3.245977011494253e-05, "loss": 0.1117, "step": 3052 }, { "epoch": 3.5389408099688473, "grad_norm": 0.34234920144081116, "learning_rate": 3.245402298850575e-05, "loss": 0.1119, "step": 3053 }, { "epoch": 3.5400999782655944, "grad_norm": 0.29577329754829407, "learning_rate": 3.2448275862068965e-05, "loss": 0.1119, "step": 3054 }, { "epoch": 3.5412591465623415, "grad_norm": 0.5385765433311462, "learning_rate": 3.244252873563218e-05, "loss": 0.1013, "step": 3055 }, { "epoch": 3.5424183148590886, "grad_norm": 0.31815600395202637, "learning_rate": 3.24367816091954e-05, "loss": 0.1136, "step": 3056 }, { "epoch": 3.5435774831558358, "grad_norm": 0.3204317092895508, "learning_rate": 3.2431034482758623e-05, "loss": 0.1209, "step": 3057 }, { "epoch": 3.544736651452583, "grad_norm": 0.34729644656181335, "learning_rate": 3.2425287356321845e-05, "loss": 0.1186, "step": 3058 }, { "epoch": 3.54589581974933, "grad_norm": 0.35338544845581055, "learning_rate": 3.241954022988506e-05, "loss": 0.1165, "step": 3059 }, { "epoch": 3.547054988046077, "grad_norm": 0.36400482058525085, "learning_rate": 3.2413793103448275e-05, "loss": 0.1285, "step": 3060 }, { "epoch": 3.5482141563428238, "grad_norm": 0.2854449450969696, "learning_rate": 3.2408045977011497e-05, "loss": 0.1064, "step": 3061 }, { "epoch": 3.5493733246395713, "grad_norm": 0.2662312090396881, "learning_rate": 3.240229885057471e-05, "loss": 0.105, "step": 3062 }, { "epoch": 3.550532492936318, "grad_norm": 0.30098652839660645, "learning_rate": 3.239655172413793e-05, "loss": 0.1025, "step": 3063 }, { "epoch": 3.551691661233065, "grad_norm": 0.35262545943260193, "learning_rate": 3.2390804597701155e-05, "loss": 0.1148, "step": 3064 }, { "epoch": 3.5528508295298122, "grad_norm": 0.2994409501552582, "learning_rate": 3.238505747126437e-05, "loss": 0.1072, "step": 3065 }, { "epoch": 3.5540099978265594, "grad_norm": 0.3354470133781433, "learning_rate": 3.237931034482759e-05, "loss": 0.1112, "step": 3066 }, { "epoch": 3.5551691661233065, "grad_norm": 0.30493679642677307, "learning_rate": 3.2373563218390806e-05, "loss": 0.1071, "step": 3067 }, { "epoch": 3.5563283344200536, "grad_norm": 0.24388879537582397, "learning_rate": 3.236781609195402e-05, "loss": 0.1135, "step": 3068 }, { "epoch": 3.5574875027168007, "grad_norm": 0.37016189098358154, "learning_rate": 3.236206896551724e-05, "loss": 0.1184, "step": 3069 }, { "epoch": 3.558646671013548, "grad_norm": 0.3760676681995392, "learning_rate": 3.2356321839080465e-05, "loss": 0.1099, "step": 3070 }, { "epoch": 3.559805839310295, "grad_norm": 0.25357261300086975, "learning_rate": 3.235057471264368e-05, "loss": 0.1016, "step": 3071 }, { "epoch": 3.560965007607042, "grad_norm": 0.3364717960357666, "learning_rate": 3.23448275862069e-05, "loss": 0.1095, "step": 3072 }, { "epoch": 3.562124175903789, "grad_norm": 0.316201776266098, "learning_rate": 3.2339080459770116e-05, "loss": 0.1217, "step": 3073 }, { "epoch": 3.5632833442005363, "grad_norm": 0.2470405250787735, "learning_rate": 3.233333333333333e-05, "loss": 0.1078, "step": 3074 }, { "epoch": 3.5644425124972834, "grad_norm": 0.25586774945259094, "learning_rate": 3.232758620689655e-05, "loss": 0.1138, "step": 3075 }, { "epoch": 3.56560168079403, "grad_norm": 0.37907102704048157, "learning_rate": 3.232183908045977e-05, "loss": 0.1062, "step": 3076 }, { "epoch": 3.5667608490907776, "grad_norm": 0.36059078574180603, "learning_rate": 3.231609195402299e-05, "loss": 0.1178, "step": 3077 }, { "epoch": 3.5679200173875243, "grad_norm": 0.24348652362823486, "learning_rate": 3.231034482758621e-05, "loss": 0.1101, "step": 3078 }, { "epoch": 3.5690791856842714, "grad_norm": 0.31023815274238586, "learning_rate": 3.2304597701149426e-05, "loss": 0.1072, "step": 3079 }, { "epoch": 3.5702383539810185, "grad_norm": 0.2670823931694031, "learning_rate": 3.229885057471265e-05, "loss": 0.1137, "step": 3080 }, { "epoch": 3.5713975222777656, "grad_norm": 0.4615989029407501, "learning_rate": 3.229310344827586e-05, "loss": 0.1186, "step": 3081 }, { "epoch": 3.5725566905745127, "grad_norm": 0.2701643407344818, "learning_rate": 3.228735632183908e-05, "loss": 0.1039, "step": 3082 }, { "epoch": 3.57371585887126, "grad_norm": 0.28233602643013, "learning_rate": 3.22816091954023e-05, "loss": 0.1227, "step": 3083 }, { "epoch": 3.574875027168007, "grad_norm": 0.3075142204761505, "learning_rate": 3.227586206896552e-05, "loss": 0.115, "step": 3084 }, { "epoch": 3.576034195464754, "grad_norm": 0.31267890334129333, "learning_rate": 3.227011494252874e-05, "loss": 0.1172, "step": 3085 }, { "epoch": 3.577193363761501, "grad_norm": 0.27376192808151245, "learning_rate": 3.226436781609196e-05, "loss": 0.1111, "step": 3086 }, { "epoch": 3.5783525320582483, "grad_norm": 0.2703624963760376, "learning_rate": 3.225862068965517e-05, "loss": 0.1122, "step": 3087 }, { "epoch": 3.5795117003549954, "grad_norm": 0.32042473554611206, "learning_rate": 3.2252873563218394e-05, "loss": 0.1001, "step": 3088 }, { "epoch": 3.580670868651742, "grad_norm": 0.28498318791389465, "learning_rate": 3.224712643678161e-05, "loss": 0.1022, "step": 3089 }, { "epoch": 3.5818300369484897, "grad_norm": 0.2931499183177948, "learning_rate": 3.2241379310344824e-05, "loss": 0.1109, "step": 3090 }, { "epoch": 3.5829892052452363, "grad_norm": 0.34564685821533203, "learning_rate": 3.223563218390805e-05, "loss": 0.1222, "step": 3091 }, { "epoch": 3.584148373541984, "grad_norm": 0.40452006459236145, "learning_rate": 3.222988505747127e-05, "loss": 0.1101, "step": 3092 }, { "epoch": 3.5853075418387306, "grad_norm": 0.40536507964134216, "learning_rate": 3.222413793103448e-05, "loss": 0.1166, "step": 3093 }, { "epoch": 3.5864667101354777, "grad_norm": 0.30855557322502136, "learning_rate": 3.2218390804597704e-05, "loss": 0.1273, "step": 3094 }, { "epoch": 3.587625878432225, "grad_norm": 0.2754945755004883, "learning_rate": 3.221264367816092e-05, "loss": 0.1053, "step": 3095 }, { "epoch": 3.588785046728972, "grad_norm": 0.3107283413410187, "learning_rate": 3.220689655172414e-05, "loss": 0.1072, "step": 3096 }, { "epoch": 3.589944215025719, "grad_norm": 0.30340340733528137, "learning_rate": 3.2201149425287355e-05, "loss": 0.1079, "step": 3097 }, { "epoch": 3.591103383322466, "grad_norm": 0.32089850306510925, "learning_rate": 3.219540229885058e-05, "loss": 0.1076, "step": 3098 }, { "epoch": 3.5922625516192133, "grad_norm": 0.3080294728279114, "learning_rate": 3.21896551724138e-05, "loss": 0.1093, "step": 3099 }, { "epoch": 3.5934217199159604, "grad_norm": 0.28277286887168884, "learning_rate": 3.218390804597701e-05, "loss": 0.1045, "step": 3100 }, { "epoch": 3.5945808882127075, "grad_norm": 0.30949509143829346, "learning_rate": 3.217816091954023e-05, "loss": 0.1089, "step": 3101 }, { "epoch": 3.5957400565094546, "grad_norm": 0.24771907925605774, "learning_rate": 3.217241379310345e-05, "loss": 0.1061, "step": 3102 }, { "epoch": 3.5968992248062017, "grad_norm": 0.37252581119537354, "learning_rate": 3.2166666666666665e-05, "loss": 0.12, "step": 3103 }, { "epoch": 3.5980583931029484, "grad_norm": 0.3569035232067108, "learning_rate": 3.2160919540229886e-05, "loss": 0.1188, "step": 3104 }, { "epoch": 3.599217561399696, "grad_norm": 0.46223220229148865, "learning_rate": 3.215517241379311e-05, "loss": 0.1251, "step": 3105 }, { "epoch": 3.6003767296964426, "grad_norm": 0.42109113931655884, "learning_rate": 3.214942528735632e-05, "loss": 0.1117, "step": 3106 }, { "epoch": 3.6015358979931897, "grad_norm": 0.3445476293563843, "learning_rate": 3.2143678160919545e-05, "loss": 0.1015, "step": 3107 }, { "epoch": 3.602695066289937, "grad_norm": 0.3141631484031677, "learning_rate": 3.213793103448276e-05, "loss": 0.1086, "step": 3108 }, { "epoch": 3.603854234586684, "grad_norm": 0.4695309102535248, "learning_rate": 3.2132183908045974e-05, "loss": 0.1293, "step": 3109 }, { "epoch": 3.605013402883431, "grad_norm": 0.2591305077075958, "learning_rate": 3.2126436781609196e-05, "loss": 0.1013, "step": 3110 }, { "epoch": 3.606172571180178, "grad_norm": 0.26712268590927124, "learning_rate": 3.212068965517242e-05, "loss": 0.1071, "step": 3111 }, { "epoch": 3.6073317394769253, "grad_norm": 0.30240774154663086, "learning_rate": 3.211494252873564e-05, "loss": 0.1096, "step": 3112 }, { "epoch": 3.6084909077736724, "grad_norm": 0.3110927641391754, "learning_rate": 3.2109195402298854e-05, "loss": 0.1147, "step": 3113 }, { "epoch": 3.6096500760704195, "grad_norm": 0.3136695325374603, "learning_rate": 3.210344827586207e-05, "loss": 0.1206, "step": 3114 }, { "epoch": 3.6108092443671667, "grad_norm": 0.23492947220802307, "learning_rate": 3.209770114942529e-05, "loss": 0.1091, "step": 3115 }, { "epoch": 3.6119684126639138, "grad_norm": 0.29082900285720825, "learning_rate": 3.2091954022988506e-05, "loss": 0.1052, "step": 3116 }, { "epoch": 3.613127580960661, "grad_norm": 0.2898150086402893, "learning_rate": 3.208620689655172e-05, "loss": 0.1239, "step": 3117 }, { "epoch": 3.614286749257408, "grad_norm": 0.25189781188964844, "learning_rate": 3.208045977011494e-05, "loss": 0.1027, "step": 3118 }, { "epoch": 3.6154459175541547, "grad_norm": 0.2974126636981964, "learning_rate": 3.2074712643678164e-05, "loss": 0.0982, "step": 3119 }, { "epoch": 3.6166050858509022, "grad_norm": 0.2558945119380951, "learning_rate": 3.206896551724138e-05, "loss": 0.1072, "step": 3120 }, { "epoch": 3.617764254147649, "grad_norm": 0.2936224937438965, "learning_rate": 3.20632183908046e-05, "loss": 0.1153, "step": 3121 }, { "epoch": 3.618923422444396, "grad_norm": 0.24101215600967407, "learning_rate": 3.2057471264367816e-05, "loss": 0.1005, "step": 3122 }, { "epoch": 3.620082590741143, "grad_norm": 0.3117283582687378, "learning_rate": 3.205172413793104e-05, "loss": 0.1159, "step": 3123 }, { "epoch": 3.6212417590378903, "grad_norm": 0.3123790919780731, "learning_rate": 3.204597701149425e-05, "loss": 0.1102, "step": 3124 }, { "epoch": 3.6224009273346374, "grad_norm": 0.32129356265068054, "learning_rate": 3.2040229885057474e-05, "loss": 0.1247, "step": 3125 }, { "epoch": 3.6235600956313845, "grad_norm": 0.287790983915329, "learning_rate": 3.2034482758620696e-05, "loss": 0.1225, "step": 3126 }, { "epoch": 3.6247192639281316, "grad_norm": 0.33491361141204834, "learning_rate": 3.202873563218391e-05, "loss": 0.1189, "step": 3127 }, { "epoch": 3.6258784322248787, "grad_norm": 0.3275797963142395, "learning_rate": 3.2022988505747125e-05, "loss": 0.1246, "step": 3128 }, { "epoch": 3.627037600521626, "grad_norm": 0.30787503719329834, "learning_rate": 3.201724137931035e-05, "loss": 0.1204, "step": 3129 }, { "epoch": 3.628196768818373, "grad_norm": 0.254406601190567, "learning_rate": 3.201149425287356e-05, "loss": 0.1033, "step": 3130 }, { "epoch": 3.62935593711512, "grad_norm": 0.2660934329032898, "learning_rate": 3.2005747126436784e-05, "loss": 0.1096, "step": 3131 }, { "epoch": 3.6305151054118667, "grad_norm": 0.30187851190567017, "learning_rate": 3.2000000000000005e-05, "loss": 0.1064, "step": 3132 }, { "epoch": 3.6316742737086143, "grad_norm": 0.2814703583717346, "learning_rate": 3.199425287356322e-05, "loss": 0.1209, "step": 3133 }, { "epoch": 3.632833442005361, "grad_norm": 0.31418412923812866, "learning_rate": 3.198850574712644e-05, "loss": 0.1147, "step": 3134 }, { "epoch": 3.633992610302108, "grad_norm": 0.25230860710144043, "learning_rate": 3.198275862068966e-05, "loss": 0.1094, "step": 3135 }, { "epoch": 3.635151778598855, "grad_norm": 0.2896537482738495, "learning_rate": 3.197701149425287e-05, "loss": 0.1166, "step": 3136 }, { "epoch": 3.6363109468956023, "grad_norm": 0.2596437931060791, "learning_rate": 3.1971264367816093e-05, "loss": 0.1141, "step": 3137 }, { "epoch": 3.6374701151923494, "grad_norm": 0.29989323019981384, "learning_rate": 3.196551724137931e-05, "loss": 0.1154, "step": 3138 }, { "epoch": 3.6386292834890965, "grad_norm": 0.303406685590744, "learning_rate": 3.195977011494253e-05, "loss": 0.1147, "step": 3139 }, { "epoch": 3.6397884517858436, "grad_norm": 0.2839892506599426, "learning_rate": 3.195402298850575e-05, "loss": 0.1164, "step": 3140 }, { "epoch": 3.6409476200825908, "grad_norm": 0.29430249333381653, "learning_rate": 3.1948275862068967e-05, "loss": 0.1083, "step": 3141 }, { "epoch": 3.642106788379338, "grad_norm": 0.24778763949871063, "learning_rate": 3.194252873563219e-05, "loss": 0.1037, "step": 3142 }, { "epoch": 3.643265956676085, "grad_norm": 0.3377165198326111, "learning_rate": 3.19367816091954e-05, "loss": 0.1178, "step": 3143 }, { "epoch": 3.644425124972832, "grad_norm": 0.38875332474708557, "learning_rate": 3.193103448275862e-05, "loss": 0.1159, "step": 3144 }, { "epoch": 3.6455842932695792, "grad_norm": 0.3728177547454834, "learning_rate": 3.192528735632184e-05, "loss": 0.1088, "step": 3145 }, { "epoch": 3.6467434615663263, "grad_norm": 0.31668561697006226, "learning_rate": 3.191954022988506e-05, "loss": 0.1055, "step": 3146 }, { "epoch": 3.647902629863073, "grad_norm": 0.3226422965526581, "learning_rate": 3.1913793103448276e-05, "loss": 0.1252, "step": 3147 }, { "epoch": 3.6490617981598206, "grad_norm": 0.31225472688674927, "learning_rate": 3.19080459770115e-05, "loss": 0.1131, "step": 3148 }, { "epoch": 3.6502209664565672, "grad_norm": 0.3561670482158661, "learning_rate": 3.190229885057471e-05, "loss": 0.1341, "step": 3149 }, { "epoch": 3.6513801347533144, "grad_norm": 0.29987701773643494, "learning_rate": 3.1896551724137935e-05, "loss": 0.1263, "step": 3150 }, { "epoch": 3.6525393030500615, "grad_norm": 0.2811523675918579, "learning_rate": 3.189080459770115e-05, "loss": 0.1225, "step": 3151 }, { "epoch": 3.6536984713468086, "grad_norm": 0.31684601306915283, "learning_rate": 3.188505747126437e-05, "loss": 0.1086, "step": 3152 }, { "epoch": 3.6548576396435557, "grad_norm": 0.28780969977378845, "learning_rate": 3.187931034482759e-05, "loss": 0.1085, "step": 3153 }, { "epoch": 3.656016807940303, "grad_norm": 0.31450673937797546, "learning_rate": 3.187356321839081e-05, "loss": 0.109, "step": 3154 }, { "epoch": 3.65717597623705, "grad_norm": 0.2759770452976227, "learning_rate": 3.186781609195402e-05, "loss": 0.1049, "step": 3155 }, { "epoch": 3.658335144533797, "grad_norm": 0.3448818624019623, "learning_rate": 3.1862068965517244e-05, "loss": 0.1283, "step": 3156 }, { "epoch": 3.659494312830544, "grad_norm": 0.2501785457134247, "learning_rate": 3.185632183908046e-05, "loss": 0.1056, "step": 3157 }, { "epoch": 3.6606534811272913, "grad_norm": 0.288875550031662, "learning_rate": 3.1850574712643674e-05, "loss": 0.1018, "step": 3158 }, { "epoch": 3.6618126494240384, "grad_norm": 0.29204368591308594, "learning_rate": 3.1844827586206896e-05, "loss": 0.1082, "step": 3159 }, { "epoch": 3.662971817720785, "grad_norm": 0.3856884837150574, "learning_rate": 3.183908045977012e-05, "loss": 0.0979, "step": 3160 }, { "epoch": 3.6641309860175326, "grad_norm": 0.30575424432754517, "learning_rate": 3.183333333333334e-05, "loss": 0.1151, "step": 3161 }, { "epoch": 3.6652901543142793, "grad_norm": 0.3039374053478241, "learning_rate": 3.1827586206896554e-05, "loss": 0.1203, "step": 3162 }, { "epoch": 3.666449322611027, "grad_norm": 0.2760923206806183, "learning_rate": 3.182183908045977e-05, "loss": 0.1114, "step": 3163 }, { "epoch": 3.6676084909077735, "grad_norm": 0.3372345566749573, "learning_rate": 3.181609195402299e-05, "loss": 0.1204, "step": 3164 }, { "epoch": 3.6687676592045206, "grad_norm": 0.34602412581443787, "learning_rate": 3.1810344827586206e-05, "loss": 0.1149, "step": 3165 }, { "epoch": 3.6699268275012678, "grad_norm": 0.3075660467147827, "learning_rate": 3.180459770114943e-05, "loss": 0.109, "step": 3166 }, { "epoch": 3.671085995798015, "grad_norm": 0.4555627405643463, "learning_rate": 3.179885057471265e-05, "loss": 0.1119, "step": 3167 }, { "epoch": 3.672245164094762, "grad_norm": 0.3053813576698303, "learning_rate": 3.1793103448275864e-05, "loss": 0.117, "step": 3168 }, { "epoch": 3.673404332391509, "grad_norm": 0.34098416566848755, "learning_rate": 3.1787356321839085e-05, "loss": 0.1131, "step": 3169 }, { "epoch": 3.674563500688256, "grad_norm": 0.2684711813926697, "learning_rate": 3.17816091954023e-05, "loss": 0.1205, "step": 3170 }, { "epoch": 3.6757226689850033, "grad_norm": 0.2564740777015686, "learning_rate": 3.1775862068965515e-05, "loss": 0.1063, "step": 3171 }, { "epoch": 3.6768818372817504, "grad_norm": 0.27508318424224854, "learning_rate": 3.177011494252874e-05, "loss": 0.1106, "step": 3172 }, { "epoch": 3.6780410055784976, "grad_norm": 0.23721154034137726, "learning_rate": 3.176436781609196e-05, "loss": 0.107, "step": 3173 }, { "epoch": 3.6792001738752447, "grad_norm": 0.2993345260620117, "learning_rate": 3.1758620689655174e-05, "loss": 0.1075, "step": 3174 }, { "epoch": 3.6803593421719913, "grad_norm": 0.3475472927093506, "learning_rate": 3.1752873563218395e-05, "loss": 0.1034, "step": 3175 }, { "epoch": 3.681518510468739, "grad_norm": 0.25363993644714355, "learning_rate": 3.174712643678161e-05, "loss": 0.113, "step": 3176 }, { "epoch": 3.6826776787654856, "grad_norm": 0.27115947008132935, "learning_rate": 3.1741379310344825e-05, "loss": 0.1231, "step": 3177 }, { "epoch": 3.6838368470622327, "grad_norm": 0.2857735753059387, "learning_rate": 3.173563218390805e-05, "loss": 0.1171, "step": 3178 }, { "epoch": 3.68499601535898, "grad_norm": 0.2827852666378021, "learning_rate": 3.172988505747126e-05, "loss": 0.1089, "step": 3179 }, { "epoch": 3.686155183655727, "grad_norm": 0.31064677238464355, "learning_rate": 3.172413793103448e-05, "loss": 0.1281, "step": 3180 }, { "epoch": 3.687314351952474, "grad_norm": 0.38487324118614197, "learning_rate": 3.1718390804597705e-05, "loss": 0.1147, "step": 3181 }, { "epoch": 3.688473520249221, "grad_norm": 0.2798865735530853, "learning_rate": 3.171264367816092e-05, "loss": 0.1065, "step": 3182 }, { "epoch": 3.6896326885459683, "grad_norm": 0.3226175308227539, "learning_rate": 3.170689655172414e-05, "loss": 0.1131, "step": 3183 }, { "epoch": 3.6907918568427154, "grad_norm": 0.2735946774482727, "learning_rate": 3.1701149425287356e-05, "loss": 0.1149, "step": 3184 }, { "epoch": 3.6919510251394625, "grad_norm": 0.3574925661087036, "learning_rate": 3.169540229885057e-05, "loss": 0.1081, "step": 3185 }, { "epoch": 3.6931101934362096, "grad_norm": 0.24798470735549927, "learning_rate": 3.168965517241379e-05, "loss": 0.1078, "step": 3186 }, { "epoch": 3.6942693617329567, "grad_norm": 0.27632611989974976, "learning_rate": 3.1683908045977015e-05, "loss": 0.1247, "step": 3187 }, { "epoch": 3.695428530029704, "grad_norm": 0.21540036797523499, "learning_rate": 3.1678160919540236e-05, "loss": 0.0948, "step": 3188 }, { "epoch": 3.696587698326451, "grad_norm": 0.34586644172668457, "learning_rate": 3.167241379310345e-05, "loss": 0.1299, "step": 3189 }, { "epoch": 3.6977468666231976, "grad_norm": 0.3047373592853546, "learning_rate": 3.1666666666666666e-05, "loss": 0.1185, "step": 3190 }, { "epoch": 3.698906034919945, "grad_norm": 0.30293378233909607, "learning_rate": 3.166091954022989e-05, "loss": 0.114, "step": 3191 }, { "epoch": 3.700065203216692, "grad_norm": 0.3187353014945984, "learning_rate": 3.16551724137931e-05, "loss": 0.1255, "step": 3192 }, { "epoch": 3.701224371513439, "grad_norm": 0.25981852412223816, "learning_rate": 3.1649425287356324e-05, "loss": 0.114, "step": 3193 }, { "epoch": 3.702383539810186, "grad_norm": 0.2848886549472809, "learning_rate": 3.1643678160919546e-05, "loss": 0.104, "step": 3194 }, { "epoch": 3.703542708106933, "grad_norm": 0.2802824079990387, "learning_rate": 3.163793103448276e-05, "loss": 0.0887, "step": 3195 }, { "epoch": 3.7047018764036803, "grad_norm": 0.2874505817890167, "learning_rate": 3.1632183908045976e-05, "loss": 0.1067, "step": 3196 }, { "epoch": 3.7058610447004274, "grad_norm": 0.4384983479976654, "learning_rate": 3.16264367816092e-05, "loss": 0.1229, "step": 3197 }, { "epoch": 3.7070202129971745, "grad_norm": 0.3324441909790039, "learning_rate": 3.162068965517241e-05, "loss": 0.1089, "step": 3198 }, { "epoch": 3.7081793812939217, "grad_norm": 0.311897337436676, "learning_rate": 3.1614942528735634e-05, "loss": 0.1182, "step": 3199 }, { "epoch": 3.7093385495906688, "grad_norm": 0.28386661410331726, "learning_rate": 3.160919540229885e-05, "loss": 0.1092, "step": 3200 }, { "epoch": 3.710497717887416, "grad_norm": 0.27510008215904236, "learning_rate": 3.160344827586207e-05, "loss": 0.1098, "step": 3201 }, { "epoch": 3.711656886184163, "grad_norm": 0.25964272022247314, "learning_rate": 3.159770114942529e-05, "loss": 0.0969, "step": 3202 }, { "epoch": 3.7128160544809097, "grad_norm": 0.31421464681625366, "learning_rate": 3.159195402298851e-05, "loss": 0.1123, "step": 3203 }, { "epoch": 3.7139752227776572, "grad_norm": 0.2845355272293091, "learning_rate": 3.158620689655172e-05, "loss": 0.1089, "step": 3204 }, { "epoch": 3.715134391074404, "grad_norm": 0.5368481874465942, "learning_rate": 3.1580459770114944e-05, "loss": 0.1038, "step": 3205 }, { "epoch": 3.7162935593711515, "grad_norm": 0.5967825651168823, "learning_rate": 3.157471264367816e-05, "loss": 0.105, "step": 3206 }, { "epoch": 3.717452727667898, "grad_norm": 0.29082855582237244, "learning_rate": 3.156896551724138e-05, "loss": 0.1157, "step": 3207 }, { "epoch": 3.7186118959646453, "grad_norm": 0.337176114320755, "learning_rate": 3.15632183908046e-05, "loss": 0.1105, "step": 3208 }, { "epoch": 3.7197710642613924, "grad_norm": 0.4318813979625702, "learning_rate": 3.155747126436782e-05, "loss": 0.1172, "step": 3209 }, { "epoch": 3.7209302325581395, "grad_norm": 0.2882601320743561, "learning_rate": 3.155172413793104e-05, "loss": 0.1016, "step": 3210 }, { "epoch": 3.7220894008548866, "grad_norm": 0.2961920499801636, "learning_rate": 3.1545977011494254e-05, "loss": 0.1317, "step": 3211 }, { "epoch": 3.7232485691516337, "grad_norm": 0.2166760265827179, "learning_rate": 3.154022988505747e-05, "loss": 0.1111, "step": 3212 }, { "epoch": 3.724407737448381, "grad_norm": 0.28654518723487854, "learning_rate": 3.153448275862069e-05, "loss": 0.1187, "step": 3213 }, { "epoch": 3.725566905745128, "grad_norm": 0.34299778938293457, "learning_rate": 3.152873563218391e-05, "loss": 0.1278, "step": 3214 }, { "epoch": 3.726726074041875, "grad_norm": 0.25071898102760315, "learning_rate": 3.152298850574713e-05, "loss": 0.1067, "step": 3215 }, { "epoch": 3.727885242338622, "grad_norm": 0.2629477083683014, "learning_rate": 3.151724137931035e-05, "loss": 0.1121, "step": 3216 }, { "epoch": 3.7290444106353693, "grad_norm": 0.37242335081100464, "learning_rate": 3.151149425287356e-05, "loss": 0.1083, "step": 3217 }, { "epoch": 3.730203578932116, "grad_norm": 0.27490779757499695, "learning_rate": 3.1505747126436785e-05, "loss": 0.1174, "step": 3218 }, { "epoch": 3.7313627472288635, "grad_norm": 0.2524685263633728, "learning_rate": 3.15e-05, "loss": 0.118, "step": 3219 }, { "epoch": 3.73252191552561, "grad_norm": 0.27024635672569275, "learning_rate": 3.1494252873563215e-05, "loss": 0.1198, "step": 3220 }, { "epoch": 3.7336810838223573, "grad_norm": 0.2741260528564453, "learning_rate": 3.148850574712644e-05, "loss": 0.1144, "step": 3221 }, { "epoch": 3.7348402521191044, "grad_norm": 0.321684867143631, "learning_rate": 3.148275862068966e-05, "loss": 0.0993, "step": 3222 }, { "epoch": 3.7359994204158515, "grad_norm": 0.19403128325939178, "learning_rate": 3.147701149425287e-05, "loss": 0.0992, "step": 3223 }, { "epoch": 3.7371585887125987, "grad_norm": 0.26611411571502686, "learning_rate": 3.1471264367816095e-05, "loss": 0.1176, "step": 3224 }, { "epoch": 3.7383177570093458, "grad_norm": 0.36536505818367004, "learning_rate": 3.146551724137931e-05, "loss": 0.1099, "step": 3225 }, { "epoch": 3.739476925306093, "grad_norm": 0.2902340590953827, "learning_rate": 3.145977011494253e-05, "loss": 0.1109, "step": 3226 }, { "epoch": 3.74063609360284, "grad_norm": 0.24061763286590576, "learning_rate": 3.1454022988505746e-05, "loss": 0.111, "step": 3227 }, { "epoch": 3.741795261899587, "grad_norm": 0.32074034214019775, "learning_rate": 3.144827586206897e-05, "loss": 0.1056, "step": 3228 }, { "epoch": 3.7429544301963342, "grad_norm": 0.2975504994392395, "learning_rate": 3.144252873563219e-05, "loss": 0.1241, "step": 3229 }, { "epoch": 3.7441135984930813, "grad_norm": 0.33813774585723877, "learning_rate": 3.1436781609195405e-05, "loss": 0.11, "step": 3230 }, { "epoch": 3.7452727667898285, "grad_norm": 0.29403892159461975, "learning_rate": 3.143103448275862e-05, "loss": 0.1118, "step": 3231 }, { "epoch": 3.7464319350865756, "grad_norm": 0.2610683739185333, "learning_rate": 3.142528735632184e-05, "loss": 0.1055, "step": 3232 }, { "epoch": 3.7475911033833222, "grad_norm": 0.2696630358695984, "learning_rate": 3.1419540229885056e-05, "loss": 0.1062, "step": 3233 }, { "epoch": 3.74875027168007, "grad_norm": 0.3874567747116089, "learning_rate": 3.141379310344828e-05, "loss": 0.1243, "step": 3234 }, { "epoch": 3.7499094399768165, "grad_norm": 0.39800602197647095, "learning_rate": 3.14080459770115e-05, "loss": 0.124, "step": 3235 }, { "epoch": 3.7510686082735636, "grad_norm": 0.3324180245399475, "learning_rate": 3.1402298850574714e-05, "loss": 0.1292, "step": 3236 }, { "epoch": 3.7522277765703107, "grad_norm": 0.3287622630596161, "learning_rate": 3.1396551724137936e-05, "loss": 0.1152, "step": 3237 }, { "epoch": 3.753386944867058, "grad_norm": 0.22335346043109894, "learning_rate": 3.139080459770115e-05, "loss": 0.1006, "step": 3238 }, { "epoch": 3.754546113163805, "grad_norm": 0.32788801193237305, "learning_rate": 3.1385057471264366e-05, "loss": 0.1072, "step": 3239 }, { "epoch": 3.755705281460552, "grad_norm": 0.3683432340621948, "learning_rate": 3.137931034482759e-05, "loss": 0.1085, "step": 3240 }, { "epoch": 3.756864449757299, "grad_norm": 0.33938443660736084, "learning_rate": 3.13735632183908e-05, "loss": 0.1115, "step": 3241 }, { "epoch": 3.7580236180540463, "grad_norm": 0.27710798382759094, "learning_rate": 3.1367816091954024e-05, "loss": 0.1152, "step": 3242 }, { "epoch": 3.7591827863507934, "grad_norm": 0.4090668261051178, "learning_rate": 3.1362068965517246e-05, "loss": 0.1064, "step": 3243 }, { "epoch": 3.7603419546475405, "grad_norm": 0.33840107917785645, "learning_rate": 3.135632183908046e-05, "loss": 0.1124, "step": 3244 }, { "epoch": 3.7615011229442876, "grad_norm": 0.2494470775127411, "learning_rate": 3.135057471264368e-05, "loss": 0.1, "step": 3245 }, { "epoch": 3.7626602912410343, "grad_norm": 0.2987157106399536, "learning_rate": 3.13448275862069e-05, "loss": 0.1159, "step": 3246 }, { "epoch": 3.763819459537782, "grad_norm": 0.3439898192882538, "learning_rate": 3.133908045977011e-05, "loss": 0.1171, "step": 3247 }, { "epoch": 3.7649786278345285, "grad_norm": 0.4079365134239197, "learning_rate": 3.1333333333333334e-05, "loss": 0.1184, "step": 3248 }, { "epoch": 3.766137796131276, "grad_norm": 0.25229400396347046, "learning_rate": 3.1327586206896555e-05, "loss": 0.107, "step": 3249 }, { "epoch": 3.7672969644280228, "grad_norm": 0.28538668155670166, "learning_rate": 3.132183908045977e-05, "loss": 0.1143, "step": 3250 }, { "epoch": 3.76845613272477, "grad_norm": 0.3976428806781769, "learning_rate": 3.131609195402299e-05, "loss": 0.1152, "step": 3251 }, { "epoch": 3.769615301021517, "grad_norm": 0.3206281363964081, "learning_rate": 3.131034482758621e-05, "loss": 0.1219, "step": 3252 }, { "epoch": 3.770774469318264, "grad_norm": 0.34073176980018616, "learning_rate": 3.130459770114942e-05, "loss": 0.1111, "step": 3253 }, { "epoch": 3.771933637615011, "grad_norm": 0.2973473370075226, "learning_rate": 3.1298850574712643e-05, "loss": 0.1155, "step": 3254 }, { "epoch": 3.7730928059117583, "grad_norm": 0.268359512090683, "learning_rate": 3.1293103448275865e-05, "loss": 0.1207, "step": 3255 }, { "epoch": 3.7742519742085054, "grad_norm": 0.24991478025913239, "learning_rate": 3.128735632183909e-05, "loss": 0.1108, "step": 3256 }, { "epoch": 3.7754111425052526, "grad_norm": 0.28493621945381165, "learning_rate": 3.12816091954023e-05, "loss": 0.1153, "step": 3257 }, { "epoch": 3.7765703108019997, "grad_norm": 0.25810399651527405, "learning_rate": 3.127586206896552e-05, "loss": 0.1134, "step": 3258 }, { "epoch": 3.777729479098747, "grad_norm": 0.33807915449142456, "learning_rate": 3.127011494252874e-05, "loss": 0.1156, "step": 3259 }, { "epoch": 3.778888647395494, "grad_norm": 0.37843331694602966, "learning_rate": 3.126436781609195e-05, "loss": 0.1112, "step": 3260 }, { "epoch": 3.7800478156922406, "grad_norm": 0.3349437713623047, "learning_rate": 3.125862068965517e-05, "loss": 0.1004, "step": 3261 }, { "epoch": 3.781206983988988, "grad_norm": 0.24173638224601746, "learning_rate": 3.1252873563218397e-05, "loss": 0.1016, "step": 3262 }, { "epoch": 3.782366152285735, "grad_norm": 0.27250927686691284, "learning_rate": 3.124712643678161e-05, "loss": 0.1076, "step": 3263 }, { "epoch": 3.783525320582482, "grad_norm": 0.3555189073085785, "learning_rate": 3.124137931034483e-05, "loss": 0.1234, "step": 3264 }, { "epoch": 3.784684488879229, "grad_norm": 0.3194526433944702, "learning_rate": 3.123563218390805e-05, "loss": 0.1168, "step": 3265 }, { "epoch": 3.785843657175976, "grad_norm": 0.3455429673194885, "learning_rate": 3.122988505747126e-05, "loss": 0.1243, "step": 3266 }, { "epoch": 3.7870028254727233, "grad_norm": 0.31268778443336487, "learning_rate": 3.1224137931034485e-05, "loss": 0.1013, "step": 3267 }, { "epoch": 3.7881619937694704, "grad_norm": 0.3976519703865051, "learning_rate": 3.12183908045977e-05, "loss": 0.1106, "step": 3268 }, { "epoch": 3.7893211620662175, "grad_norm": 0.37077805399894714, "learning_rate": 3.121264367816092e-05, "loss": 0.115, "step": 3269 }, { "epoch": 3.7904803303629646, "grad_norm": 0.26564523577690125, "learning_rate": 3.120689655172414e-05, "loss": 0.1025, "step": 3270 }, { "epoch": 3.7916394986597117, "grad_norm": 0.30875927209854126, "learning_rate": 3.120114942528736e-05, "loss": 0.1061, "step": 3271 }, { "epoch": 3.792798666956459, "grad_norm": 0.2600082755088806, "learning_rate": 3.119540229885058e-05, "loss": 0.0994, "step": 3272 }, { "epoch": 3.793957835253206, "grad_norm": 0.2653372585773468, "learning_rate": 3.1189655172413794e-05, "loss": 0.1006, "step": 3273 }, { "epoch": 3.795117003549953, "grad_norm": 0.23784595727920532, "learning_rate": 3.118390804597701e-05, "loss": 0.1064, "step": 3274 }, { "epoch": 3.7962761718467, "grad_norm": 0.3346762955188751, "learning_rate": 3.117816091954023e-05, "loss": 0.1174, "step": 3275 }, { "epoch": 3.797435340143447, "grad_norm": 0.28256985545158386, "learning_rate": 3.117241379310345e-05, "loss": 0.1233, "step": 3276 }, { "epoch": 3.7985945084401944, "grad_norm": 0.3030017614364624, "learning_rate": 3.116666666666667e-05, "loss": 0.1022, "step": 3277 }, { "epoch": 3.799753676736941, "grad_norm": 0.28801146149635315, "learning_rate": 3.116091954022989e-05, "loss": 0.1084, "step": 3278 }, { "epoch": 3.800912845033688, "grad_norm": 0.3173084259033203, "learning_rate": 3.1155172413793104e-05, "loss": 0.1118, "step": 3279 }, { "epoch": 3.8020720133304353, "grad_norm": 0.22037595510482788, "learning_rate": 3.114942528735632e-05, "loss": 0.1013, "step": 3280 }, { "epoch": 3.8032311816271824, "grad_norm": 0.4173763394355774, "learning_rate": 3.114367816091954e-05, "loss": 0.1147, "step": 3281 }, { "epoch": 3.8043903499239295, "grad_norm": 0.38189518451690674, "learning_rate": 3.113793103448276e-05, "loss": 0.1191, "step": 3282 }, { "epoch": 3.8055495182206767, "grad_norm": 0.28138333559036255, "learning_rate": 3.1132183908045984e-05, "loss": 0.118, "step": 3283 }, { "epoch": 3.806708686517424, "grad_norm": 0.28683844208717346, "learning_rate": 3.11264367816092e-05, "loss": 0.1156, "step": 3284 }, { "epoch": 3.807867854814171, "grad_norm": 0.28898361325263977, "learning_rate": 3.1120689655172414e-05, "loss": 0.1054, "step": 3285 }, { "epoch": 3.809027023110918, "grad_norm": 0.2948157787322998, "learning_rate": 3.1114942528735636e-05, "loss": 0.0977, "step": 3286 }, { "epoch": 3.810186191407665, "grad_norm": 0.27834632992744446, "learning_rate": 3.110919540229885e-05, "loss": 0.1206, "step": 3287 }, { "epoch": 3.8113453597044122, "grad_norm": 0.2628282606601715, "learning_rate": 3.1103448275862065e-05, "loss": 0.1112, "step": 3288 }, { "epoch": 3.812504528001159, "grad_norm": 0.2561165988445282, "learning_rate": 3.109770114942529e-05, "loss": 0.1138, "step": 3289 }, { "epoch": 3.8136636962979065, "grad_norm": 0.23604638874530792, "learning_rate": 3.109195402298851e-05, "loss": 0.1081, "step": 3290 }, { "epoch": 3.814822864594653, "grad_norm": 0.2508888840675354, "learning_rate": 3.108620689655173e-05, "loss": 0.1002, "step": 3291 }, { "epoch": 3.8159820328914003, "grad_norm": 0.3218640089035034, "learning_rate": 3.1080459770114945e-05, "loss": 0.1064, "step": 3292 }, { "epoch": 3.8171412011881474, "grad_norm": 0.22713243961334229, "learning_rate": 3.107471264367816e-05, "loss": 0.1074, "step": 3293 }, { "epoch": 3.8183003694848945, "grad_norm": 0.2802816927433014, "learning_rate": 3.106896551724138e-05, "loss": 0.1089, "step": 3294 }, { "epoch": 3.8194595377816416, "grad_norm": 0.25849002599716187, "learning_rate": 3.10632183908046e-05, "loss": 0.1132, "step": 3295 }, { "epoch": 3.8206187060783887, "grad_norm": 0.3340066373348236, "learning_rate": 3.105747126436782e-05, "loss": 0.1287, "step": 3296 }, { "epoch": 3.821777874375136, "grad_norm": 0.3333747684955597, "learning_rate": 3.105172413793104e-05, "loss": 0.1157, "step": 3297 }, { "epoch": 3.822937042671883, "grad_norm": 0.26639991998672485, "learning_rate": 3.1045977011494255e-05, "loss": 0.1089, "step": 3298 }, { "epoch": 3.82409621096863, "grad_norm": 0.2500338852405548, "learning_rate": 3.104022988505747e-05, "loss": 0.1146, "step": 3299 }, { "epoch": 3.825255379265377, "grad_norm": 0.3188420534133911, "learning_rate": 3.103448275862069e-05, "loss": 0.1142, "step": 3300 }, { "epoch": 3.8264145475621243, "grad_norm": 0.22074191272258759, "learning_rate": 3.1028735632183907e-05, "loss": 0.1022, "step": 3301 }, { "epoch": 3.8275737158588714, "grad_norm": 0.32169198989868164, "learning_rate": 3.102298850574713e-05, "loss": 0.1234, "step": 3302 }, { "epoch": 3.8287328841556185, "grad_norm": 0.23068860173225403, "learning_rate": 3.101724137931035e-05, "loss": 0.1052, "step": 3303 }, { "epoch": 3.829892052452365, "grad_norm": 0.4056016206741333, "learning_rate": 3.1011494252873565e-05, "loss": 0.1122, "step": 3304 }, { "epoch": 3.8310512207491128, "grad_norm": 0.7055277228355408, "learning_rate": 3.1005747126436786e-05, "loss": 0.1154, "step": 3305 }, { "epoch": 3.8322103890458594, "grad_norm": 0.3167892098426819, "learning_rate": 3.1e-05, "loss": 0.1151, "step": 3306 }, { "epoch": 3.8333695573426065, "grad_norm": 0.3068336546421051, "learning_rate": 3.0994252873563216e-05, "loss": 0.1093, "step": 3307 }, { "epoch": 3.8345287256393537, "grad_norm": 0.28874102234840393, "learning_rate": 3.098850574712644e-05, "loss": 0.101, "step": 3308 }, { "epoch": 3.8356878939361008, "grad_norm": 0.30974501371383667, "learning_rate": 3.098275862068965e-05, "loss": 0.1292, "step": 3309 }, { "epoch": 3.836847062232848, "grad_norm": 0.2519693672657013, "learning_rate": 3.0977011494252875e-05, "loss": 0.0984, "step": 3310 }, { "epoch": 3.838006230529595, "grad_norm": 0.3312533497810364, "learning_rate": 3.0971264367816096e-05, "loss": 0.1092, "step": 3311 }, { "epoch": 3.839165398826342, "grad_norm": 0.33562129735946655, "learning_rate": 3.096551724137931e-05, "loss": 0.1045, "step": 3312 }, { "epoch": 3.8403245671230892, "grad_norm": 0.22888796031475067, "learning_rate": 3.095977011494253e-05, "loss": 0.1107, "step": 3313 }, { "epoch": 3.8414837354198363, "grad_norm": 0.338222861289978, "learning_rate": 3.095402298850575e-05, "loss": 0.1227, "step": 3314 }, { "epoch": 3.8426429037165835, "grad_norm": 0.3241053521633148, "learning_rate": 3.094827586206896e-05, "loss": 0.1243, "step": 3315 }, { "epoch": 3.8438020720133306, "grad_norm": 0.2689477503299713, "learning_rate": 3.0942528735632184e-05, "loss": 0.1091, "step": 3316 }, { "epoch": 3.8449612403100772, "grad_norm": 0.2917823791503906, "learning_rate": 3.0936781609195406e-05, "loss": 0.099, "step": 3317 }, { "epoch": 3.846120408606825, "grad_norm": 0.2831590473651886, "learning_rate": 3.093103448275862e-05, "loss": 0.0987, "step": 3318 }, { "epoch": 3.8472795769035715, "grad_norm": 0.2956966161727905, "learning_rate": 3.092528735632184e-05, "loss": 0.1038, "step": 3319 }, { "epoch": 3.848438745200319, "grad_norm": 0.41253408789634705, "learning_rate": 3.091954022988506e-05, "loss": 0.1173, "step": 3320 }, { "epoch": 3.8495979134970657, "grad_norm": 0.37665021419525146, "learning_rate": 3.091379310344828e-05, "loss": 0.1211, "step": 3321 }, { "epoch": 3.850757081793813, "grad_norm": 0.29172924160957336, "learning_rate": 3.0908045977011494e-05, "loss": 0.1087, "step": 3322 }, { "epoch": 3.85191625009056, "grad_norm": 0.4337025284767151, "learning_rate": 3.0902298850574716e-05, "loss": 0.1089, "step": 3323 }, { "epoch": 3.853075418387307, "grad_norm": 0.36892715096473694, "learning_rate": 3.089655172413794e-05, "loss": 0.108, "step": 3324 }, { "epoch": 3.854234586684054, "grad_norm": 0.34563952684402466, "learning_rate": 3.089080459770115e-05, "loss": 0.116, "step": 3325 }, { "epoch": 3.8553937549808013, "grad_norm": 0.3011876344680786, "learning_rate": 3.088505747126437e-05, "loss": 0.1105, "step": 3326 }, { "epoch": 3.8565529232775484, "grad_norm": 0.36122533679008484, "learning_rate": 3.087931034482759e-05, "loss": 0.1171, "step": 3327 }, { "epoch": 3.8577120915742955, "grad_norm": 0.31418511271476746, "learning_rate": 3.0873563218390804e-05, "loss": 0.1189, "step": 3328 }, { "epoch": 3.8588712598710426, "grad_norm": 0.2561214566230774, "learning_rate": 3.0867816091954025e-05, "loss": 0.1061, "step": 3329 }, { "epoch": 3.8600304281677897, "grad_norm": 0.28758811950683594, "learning_rate": 3.086206896551724e-05, "loss": 0.1142, "step": 3330 }, { "epoch": 3.861189596464537, "grad_norm": 0.2774584889411926, "learning_rate": 3.085632183908046e-05, "loss": 0.1097, "step": 3331 }, { "epoch": 3.8623487647612835, "grad_norm": 0.3186490535736084, "learning_rate": 3.0850574712643684e-05, "loss": 0.117, "step": 3332 }, { "epoch": 3.863507933058031, "grad_norm": 0.3841571807861328, "learning_rate": 3.08448275862069e-05, "loss": 0.1224, "step": 3333 }, { "epoch": 3.8646671013547778, "grad_norm": 0.2925528585910797, "learning_rate": 3.0839080459770113e-05, "loss": 0.1049, "step": 3334 }, { "epoch": 3.865826269651525, "grad_norm": 0.35688725113868713, "learning_rate": 3.0833333333333335e-05, "loss": 0.1102, "step": 3335 }, { "epoch": 3.866985437948272, "grad_norm": 0.27751150727272034, "learning_rate": 3.082758620689655e-05, "loss": 0.1114, "step": 3336 }, { "epoch": 3.868144606245019, "grad_norm": 0.25139451026916504, "learning_rate": 3.082183908045977e-05, "loss": 0.1159, "step": 3337 }, { "epoch": 3.869303774541766, "grad_norm": 0.4053381085395813, "learning_rate": 3.0816091954022993e-05, "loss": 0.1268, "step": 3338 }, { "epoch": 3.8704629428385133, "grad_norm": 0.22363197803497314, "learning_rate": 3.081034482758621e-05, "loss": 0.106, "step": 3339 }, { "epoch": 3.8716221111352604, "grad_norm": 0.310592919588089, "learning_rate": 3.080459770114943e-05, "loss": 0.1168, "step": 3340 }, { "epoch": 3.8727812794320076, "grad_norm": 0.29735296964645386, "learning_rate": 3.0798850574712645e-05, "loss": 0.1257, "step": 3341 }, { "epoch": 3.8739404477287547, "grad_norm": 0.2444659024477005, "learning_rate": 3.079310344827586e-05, "loss": 0.109, "step": 3342 }, { "epoch": 3.875099616025502, "grad_norm": 0.32433512806892395, "learning_rate": 3.078735632183908e-05, "loss": 0.1177, "step": 3343 }, { "epoch": 3.876258784322249, "grad_norm": 0.2731485366821289, "learning_rate": 3.07816091954023e-05, "loss": 0.1176, "step": 3344 }, { "epoch": 3.877417952618996, "grad_norm": 0.25933563709259033, "learning_rate": 3.077586206896552e-05, "loss": 0.0964, "step": 3345 }, { "epoch": 3.878577120915743, "grad_norm": 0.34049245715141296, "learning_rate": 3.077011494252874e-05, "loss": 0.1237, "step": 3346 }, { "epoch": 3.87973628921249, "grad_norm": 0.2941095232963562, "learning_rate": 3.0764367816091955e-05, "loss": 0.1159, "step": 3347 }, { "epoch": 3.8808954575092374, "grad_norm": 0.2893029451370239, "learning_rate": 3.0758620689655176e-05, "loss": 0.1072, "step": 3348 }, { "epoch": 3.882054625805984, "grad_norm": 0.2857878506183624, "learning_rate": 3.075287356321839e-05, "loss": 0.1117, "step": 3349 }, { "epoch": 3.883213794102731, "grad_norm": 0.3128105103969574, "learning_rate": 3.0747126436781606e-05, "loss": 0.1154, "step": 3350 }, { "epoch": 3.8843729623994783, "grad_norm": 0.34108099341392517, "learning_rate": 3.074137931034483e-05, "loss": 0.1083, "step": 3351 }, { "epoch": 3.8855321306962254, "grad_norm": 0.29815927147865295, "learning_rate": 3.073563218390805e-05, "loss": 0.1092, "step": 3352 }, { "epoch": 3.8866912989929725, "grad_norm": 0.32374799251556396, "learning_rate": 3.0729885057471264e-05, "loss": 0.1138, "step": 3353 }, { "epoch": 3.8878504672897196, "grad_norm": 0.30145663022994995, "learning_rate": 3.0724137931034486e-05, "loss": 0.1105, "step": 3354 }, { "epoch": 3.8890096355864667, "grad_norm": 0.27380499243736267, "learning_rate": 3.07183908045977e-05, "loss": 0.1038, "step": 3355 }, { "epoch": 3.890168803883214, "grad_norm": 0.33210647106170654, "learning_rate": 3.0712643678160916e-05, "loss": 0.1312, "step": 3356 }, { "epoch": 3.891327972179961, "grad_norm": 0.27347707748413086, "learning_rate": 3.070689655172414e-05, "loss": 0.104, "step": 3357 }, { "epoch": 3.892487140476708, "grad_norm": 0.31707629561424255, "learning_rate": 3.070114942528736e-05, "loss": 0.1097, "step": 3358 }, { "epoch": 3.893646308773455, "grad_norm": 0.3601592481136322, "learning_rate": 3.069540229885058e-05, "loss": 0.1209, "step": 3359 }, { "epoch": 3.894805477070202, "grad_norm": 0.29122745990753174, "learning_rate": 3.0689655172413796e-05, "loss": 0.1059, "step": 3360 }, { "epoch": 3.8959646453669494, "grad_norm": 0.3259906470775604, "learning_rate": 3.068390804597701e-05, "loss": 0.1168, "step": 3361 }, { "epoch": 3.897123813663696, "grad_norm": 0.31396666169166565, "learning_rate": 3.067816091954023e-05, "loss": 0.1048, "step": 3362 }, { "epoch": 3.8982829819604436, "grad_norm": 0.28336969017982483, "learning_rate": 3.067241379310345e-05, "loss": 0.1083, "step": 3363 }, { "epoch": 3.8994421502571903, "grad_norm": 0.29647666215896606, "learning_rate": 3.066666666666667e-05, "loss": 0.1163, "step": 3364 }, { "epoch": 3.9006013185539374, "grad_norm": 0.4049275815486908, "learning_rate": 3.066091954022989e-05, "loss": 0.1132, "step": 3365 }, { "epoch": 3.9017604868506846, "grad_norm": 0.2655840516090393, "learning_rate": 3.0655172413793106e-05, "loss": 0.0983, "step": 3366 }, { "epoch": 3.9029196551474317, "grad_norm": 0.5942867994308472, "learning_rate": 3.064942528735633e-05, "loss": 0.1326, "step": 3367 }, { "epoch": 3.904078823444179, "grad_norm": 0.24042394757270813, "learning_rate": 3.064367816091954e-05, "loss": 0.1083, "step": 3368 }, { "epoch": 3.905237991740926, "grad_norm": 0.24177686870098114, "learning_rate": 3.063793103448276e-05, "loss": 0.1151, "step": 3369 }, { "epoch": 3.906397160037673, "grad_norm": 0.33689844608306885, "learning_rate": 3.063218390804598e-05, "loss": 0.1128, "step": 3370 }, { "epoch": 3.90755632833442, "grad_norm": 0.3097141683101654, "learning_rate": 3.0626436781609194e-05, "loss": 0.1209, "step": 3371 }, { "epoch": 3.9087154966311672, "grad_norm": 0.2580316364765167, "learning_rate": 3.0620689655172415e-05, "loss": 0.1079, "step": 3372 }, { "epoch": 3.9098746649279144, "grad_norm": 0.2943916320800781, "learning_rate": 3.061494252873564e-05, "loss": 0.124, "step": 3373 }, { "epoch": 3.9110338332246615, "grad_norm": 0.27771899104118347, "learning_rate": 3.060919540229885e-05, "loss": 0.1084, "step": 3374 }, { "epoch": 3.912193001521408, "grad_norm": 0.30981943011283875, "learning_rate": 3.060344827586207e-05, "loss": 0.1076, "step": 3375 }, { "epoch": 3.9133521698181557, "grad_norm": 0.3062245547771454, "learning_rate": 3.059770114942529e-05, "loss": 0.1273, "step": 3376 }, { "epoch": 3.9145113381149024, "grad_norm": 0.25687843561172485, "learning_rate": 3.05919540229885e-05, "loss": 0.1053, "step": 3377 }, { "epoch": 3.9156705064116495, "grad_norm": 0.25983986258506775, "learning_rate": 3.0586206896551725e-05, "loss": 0.1062, "step": 3378 }, { "epoch": 3.9168296747083966, "grad_norm": 0.7442601323127747, "learning_rate": 3.058045977011495e-05, "loss": 0.1203, "step": 3379 }, { "epoch": 3.9179888430051437, "grad_norm": 0.289230078458786, "learning_rate": 3.057471264367816e-05, "loss": 0.1178, "step": 3380 }, { "epoch": 3.919148011301891, "grad_norm": 0.25480005145072937, "learning_rate": 3.056896551724138e-05, "loss": 0.1119, "step": 3381 }, { "epoch": 3.920307179598638, "grad_norm": 0.2439074069261551, "learning_rate": 3.05632183908046e-05, "loss": 0.103, "step": 3382 }, { "epoch": 3.921466347895385, "grad_norm": 0.32492339611053467, "learning_rate": 3.055747126436781e-05, "loss": 0.1203, "step": 3383 }, { "epoch": 3.922625516192132, "grad_norm": 0.2885267436504364, "learning_rate": 3.0551724137931035e-05, "loss": 0.1092, "step": 3384 }, { "epoch": 3.9237846844888793, "grad_norm": 0.5572437644004822, "learning_rate": 3.0545977011494256e-05, "loss": 0.1181, "step": 3385 }, { "epoch": 3.9249438527856264, "grad_norm": 0.3531005382537842, "learning_rate": 3.054022988505748e-05, "loss": 0.1129, "step": 3386 }, { "epoch": 3.9261030210823735, "grad_norm": 0.35553064942359924, "learning_rate": 3.053448275862069e-05, "loss": 0.1238, "step": 3387 }, { "epoch": 3.9272621893791206, "grad_norm": 0.31617647409439087, "learning_rate": 3.052873563218391e-05, "loss": 0.0986, "step": 3388 }, { "epoch": 3.9284213576758678, "grad_norm": 0.3103896379470825, "learning_rate": 3.052298850574713e-05, "loss": 0.1315, "step": 3389 }, { "epoch": 3.9295805259726144, "grad_norm": 0.31774574518203735, "learning_rate": 3.0517241379310348e-05, "loss": 0.1116, "step": 3390 }, { "epoch": 3.930739694269362, "grad_norm": 0.2169465720653534, "learning_rate": 3.0511494252873563e-05, "loss": 0.0987, "step": 3391 }, { "epoch": 3.9318988625661087, "grad_norm": 0.27531975507736206, "learning_rate": 3.0505747126436784e-05, "loss": 0.1173, "step": 3392 }, { "epoch": 3.9330580308628558, "grad_norm": 0.34132128953933716, "learning_rate": 3.05e-05, "loss": 0.1246, "step": 3393 }, { "epoch": 3.934217199159603, "grad_norm": 0.3186652362346649, "learning_rate": 3.0494252873563218e-05, "loss": 0.1182, "step": 3394 }, { "epoch": 3.93537636745635, "grad_norm": 0.2878797948360443, "learning_rate": 3.048850574712644e-05, "loss": 0.1155, "step": 3395 }, { "epoch": 3.936535535753097, "grad_norm": 0.27938562631607056, "learning_rate": 3.0482758620689654e-05, "loss": 0.1024, "step": 3396 }, { "epoch": 3.9376947040498442, "grad_norm": 0.3184402883052826, "learning_rate": 3.0477011494252876e-05, "loss": 0.1094, "step": 3397 }, { "epoch": 3.9388538723465913, "grad_norm": 0.2606091797351837, "learning_rate": 3.0471264367816094e-05, "loss": 0.1115, "step": 3398 }, { "epoch": 3.9400130406433385, "grad_norm": 0.3425968885421753, "learning_rate": 3.046551724137931e-05, "loss": 0.1085, "step": 3399 }, { "epoch": 3.9411722089400856, "grad_norm": 0.30267462134361267, "learning_rate": 3.045977011494253e-05, "loss": 0.1239, "step": 3400 }, { "epoch": 3.9423313772368327, "grad_norm": 0.31355491280555725, "learning_rate": 3.045402298850575e-05, "loss": 0.1179, "step": 3401 }, { "epoch": 3.94349054553358, "grad_norm": 0.2975825071334839, "learning_rate": 3.0448275862068964e-05, "loss": 0.1146, "step": 3402 }, { "epoch": 3.9446497138303265, "grad_norm": 0.3157767653465271, "learning_rate": 3.0442528735632186e-05, "loss": 0.1192, "step": 3403 }, { "epoch": 3.945808882127074, "grad_norm": 0.2541486620903015, "learning_rate": 3.0436781609195404e-05, "loss": 0.0994, "step": 3404 }, { "epoch": 3.9469680504238207, "grad_norm": 0.42458853125572205, "learning_rate": 3.0431034482758626e-05, "loss": 0.1295, "step": 3405 }, { "epoch": 3.9481272187205683, "grad_norm": 0.28080934286117554, "learning_rate": 3.042528735632184e-05, "loss": 0.1086, "step": 3406 }, { "epoch": 3.949286387017315, "grad_norm": 0.26001524925231934, "learning_rate": 3.041954022988506e-05, "loss": 0.1099, "step": 3407 }, { "epoch": 3.950445555314062, "grad_norm": 0.2862829267978668, "learning_rate": 3.041379310344828e-05, "loss": 0.1153, "step": 3408 }, { "epoch": 3.951604723610809, "grad_norm": 0.2606809735298157, "learning_rate": 3.0408045977011495e-05, "loss": 0.1151, "step": 3409 }, { "epoch": 3.9527638919075563, "grad_norm": 0.3059811294078827, "learning_rate": 3.040229885057471e-05, "loss": 0.1151, "step": 3410 }, { "epoch": 3.9539230602043034, "grad_norm": 0.30108916759490967, "learning_rate": 3.0396551724137935e-05, "loss": 0.1125, "step": 3411 }, { "epoch": 3.9550822285010505, "grad_norm": 0.21490095555782318, "learning_rate": 3.039080459770115e-05, "loss": 0.0883, "step": 3412 }, { "epoch": 3.9562413967977976, "grad_norm": 0.39761850237846375, "learning_rate": 3.0385057471264365e-05, "loss": 0.1313, "step": 3413 }, { "epoch": 3.9574005650945447, "grad_norm": 0.33936530351638794, "learning_rate": 3.0379310344827587e-05, "loss": 0.1119, "step": 3414 }, { "epoch": 3.958559733391292, "grad_norm": 0.33767202496528625, "learning_rate": 3.0373563218390805e-05, "loss": 0.1272, "step": 3415 }, { "epoch": 3.959718901688039, "grad_norm": 0.31474098563194275, "learning_rate": 3.0367816091954027e-05, "loss": 0.1093, "step": 3416 }, { "epoch": 3.960878069984786, "grad_norm": 0.394424170255661, "learning_rate": 3.0362068965517242e-05, "loss": 0.1262, "step": 3417 }, { "epoch": 3.9620372382815328, "grad_norm": 0.3407604992389679, "learning_rate": 3.035632183908046e-05, "loss": 0.1184, "step": 3418 }, { "epoch": 3.9631964065782803, "grad_norm": 0.25149932503700256, "learning_rate": 3.035057471264368e-05, "loss": 0.1003, "step": 3419 }, { "epoch": 3.964355574875027, "grad_norm": 0.22654342651367188, "learning_rate": 3.0344827586206897e-05, "loss": 0.1024, "step": 3420 }, { "epoch": 3.965514743171774, "grad_norm": 0.3020685911178589, "learning_rate": 3.0339080459770115e-05, "loss": 0.1096, "step": 3421 }, { "epoch": 3.966673911468521, "grad_norm": 0.25613948702812195, "learning_rate": 3.0333333333333337e-05, "loss": 0.1232, "step": 3422 }, { "epoch": 3.9678330797652683, "grad_norm": 0.3310185670852661, "learning_rate": 3.032758620689655e-05, "loss": 0.1222, "step": 3423 }, { "epoch": 3.9689922480620154, "grad_norm": 0.26435863971710205, "learning_rate": 3.0321839080459773e-05, "loss": 0.1013, "step": 3424 }, { "epoch": 3.9701514163587626, "grad_norm": 0.3062216341495514, "learning_rate": 3.031609195402299e-05, "loss": 0.1156, "step": 3425 }, { "epoch": 3.9713105846555097, "grad_norm": 0.3526940643787384, "learning_rate": 3.0310344827586206e-05, "loss": 0.1171, "step": 3426 }, { "epoch": 3.972469752952257, "grad_norm": 0.2813412547111511, "learning_rate": 3.0304597701149428e-05, "loss": 0.1031, "step": 3427 }, { "epoch": 3.973628921249004, "grad_norm": 0.3743179738521576, "learning_rate": 3.0298850574712646e-05, "loss": 0.1138, "step": 3428 }, { "epoch": 3.974788089545751, "grad_norm": 0.30746176838874817, "learning_rate": 3.029310344827586e-05, "loss": 0.1214, "step": 3429 }, { "epoch": 3.975947257842498, "grad_norm": 0.3143618106842041, "learning_rate": 3.0287356321839083e-05, "loss": 0.1114, "step": 3430 }, { "epoch": 3.9771064261392453, "grad_norm": 0.33030736446380615, "learning_rate": 3.02816091954023e-05, "loss": 0.1038, "step": 3431 }, { "epoch": 3.9782655944359924, "grad_norm": 0.40963098406791687, "learning_rate": 3.0275862068965523e-05, "loss": 0.1182, "step": 3432 }, { "epoch": 3.979424762732739, "grad_norm": 0.31964415311813354, "learning_rate": 3.0270114942528738e-05, "loss": 0.1074, "step": 3433 }, { "epoch": 3.9805839310294866, "grad_norm": 0.24795931577682495, "learning_rate": 3.0264367816091953e-05, "loss": 0.1048, "step": 3434 }, { "epoch": 3.9817430993262333, "grad_norm": 0.25050121545791626, "learning_rate": 3.0258620689655178e-05, "loss": 0.1026, "step": 3435 }, { "epoch": 3.9829022676229804, "grad_norm": 0.27471792697906494, "learning_rate": 3.0252873563218393e-05, "loss": 0.1081, "step": 3436 }, { "epoch": 3.9840614359197275, "grad_norm": 0.24730391800403595, "learning_rate": 3.0247126436781608e-05, "loss": 0.1041, "step": 3437 }, { "epoch": 3.9852206042164746, "grad_norm": 0.24231289327144623, "learning_rate": 3.024137931034483e-05, "loss": 0.1031, "step": 3438 }, { "epoch": 3.9863797725132217, "grad_norm": 0.3932971656322479, "learning_rate": 3.0235632183908047e-05, "loss": 0.1118, "step": 3439 }, { "epoch": 3.987538940809969, "grad_norm": 0.28917720913887024, "learning_rate": 3.0229885057471262e-05, "loss": 0.1139, "step": 3440 }, { "epoch": 3.988698109106716, "grad_norm": 0.2533641457557678, "learning_rate": 3.0224137931034484e-05, "loss": 0.0963, "step": 3441 }, { "epoch": 3.989857277403463, "grad_norm": 0.303200364112854, "learning_rate": 3.0218390804597702e-05, "loss": 0.1199, "step": 3442 }, { "epoch": 3.99101644570021, "grad_norm": 0.2749335765838623, "learning_rate": 3.0212643678160924e-05, "loss": 0.1025, "step": 3443 }, { "epoch": 3.9921756139969573, "grad_norm": 0.2617204785346985, "learning_rate": 3.020689655172414e-05, "loss": 0.1123, "step": 3444 }, { "epoch": 3.9933347822937044, "grad_norm": 0.2848406434059143, "learning_rate": 3.0201149425287357e-05, "loss": 0.1098, "step": 3445 }, { "epoch": 3.994493950590451, "grad_norm": 0.3450290262699127, "learning_rate": 3.019540229885058e-05, "loss": 0.1087, "step": 3446 }, { "epoch": 3.9956531188871987, "grad_norm": 0.3187498450279236, "learning_rate": 3.0189655172413794e-05, "loss": 0.1144, "step": 3447 }, { "epoch": 3.9968122871839453, "grad_norm": 0.3266300857067108, "learning_rate": 3.0183908045977012e-05, "loss": 0.1161, "step": 3448 }, { "epoch": 3.9979714554806924, "grad_norm": 0.2566640079021454, "learning_rate": 3.0178160919540234e-05, "loss": 0.1034, "step": 3449 }, { "epoch": 3.9991306237774396, "grad_norm": 0.31494787335395813, "learning_rate": 3.017241379310345e-05, "loss": 0.1086, "step": 3450 }, { "epoch": 3.9991306237774396, "eval_loss": 0.1329651027917862, "eval_runtime": 265.5812, "eval_samples_per_second": 5.776, "eval_steps_per_second": 5.776, "step": 3450 }, { "epoch": 4.000289792074187, "grad_norm": 0.3459082245826721, "learning_rate": 3.016666666666667e-05, "loss": 0.1203, "step": 3451 }, { "epoch": 4.001448960370934, "grad_norm": 0.25177237391471863, "learning_rate": 3.016091954022989e-05, "loss": 0.1, "step": 3452 }, { "epoch": 4.002608128667681, "grad_norm": 0.2541625201702118, "learning_rate": 3.0155172413793104e-05, "loss": 0.1011, "step": 3453 }, { "epoch": 4.003767296964428, "grad_norm": 0.23528751730918884, "learning_rate": 3.0149425287356325e-05, "loss": 0.1055, "step": 3454 }, { "epoch": 4.004926465261175, "grad_norm": 0.25770679116249084, "learning_rate": 3.014367816091954e-05, "loss": 0.0924, "step": 3455 }, { "epoch": 4.006085633557922, "grad_norm": 0.24531324207782745, "learning_rate": 3.013793103448276e-05, "loss": 0.1031, "step": 3456 }, { "epoch": 4.007244801854669, "grad_norm": 0.2368677854537964, "learning_rate": 3.013218390804598e-05, "loss": 0.1022, "step": 3457 }, { "epoch": 4.0084039701514165, "grad_norm": 0.2580307722091675, "learning_rate": 3.0126436781609195e-05, "loss": 0.0883, "step": 3458 }, { "epoch": 4.009563138448163, "grad_norm": 0.2701059579849243, "learning_rate": 3.0120689655172413e-05, "loss": 0.0988, "step": 3459 }, { "epoch": 4.010722306744911, "grad_norm": 0.3617304563522339, "learning_rate": 3.0114942528735635e-05, "loss": 0.1142, "step": 3460 }, { "epoch": 4.011881475041657, "grad_norm": 0.300418883562088, "learning_rate": 3.010919540229885e-05, "loss": 0.0961, "step": 3461 }, { "epoch": 4.013040643338405, "grad_norm": 0.34037333726882935, "learning_rate": 3.010344827586207e-05, "loss": 0.1101, "step": 3462 }, { "epoch": 4.014199811635152, "grad_norm": 0.30946558713912964, "learning_rate": 3.009770114942529e-05, "loss": 0.1017, "step": 3463 }, { "epoch": 4.015358979931899, "grad_norm": 0.2514745891094208, "learning_rate": 3.0091954022988505e-05, "loss": 0.0895, "step": 3464 }, { "epoch": 4.016518148228646, "grad_norm": 0.3249182403087616, "learning_rate": 3.0086206896551726e-05, "loss": 0.0983, "step": 3465 }, { "epoch": 4.017677316525393, "grad_norm": 0.30697742104530334, "learning_rate": 3.0080459770114945e-05, "loss": 0.1012, "step": 3466 }, { "epoch": 4.01883648482214, "grad_norm": 0.5410303473472595, "learning_rate": 3.007471264367816e-05, "loss": 0.101, "step": 3467 }, { "epoch": 4.019995653118888, "grad_norm": 0.367801308631897, "learning_rate": 3.006896551724138e-05, "loss": 0.0972, "step": 3468 }, { "epoch": 4.021154821415634, "grad_norm": 0.6262068152427673, "learning_rate": 3.00632183908046e-05, "loss": 0.1085, "step": 3469 }, { "epoch": 4.022313989712381, "grad_norm": 0.39368757605552673, "learning_rate": 3.005747126436782e-05, "loss": 0.1074, "step": 3470 }, { "epoch": 4.0234731580091285, "grad_norm": 0.3936510682106018, "learning_rate": 3.0051724137931036e-05, "loss": 0.0996, "step": 3471 }, { "epoch": 4.024632326305875, "grad_norm": 0.5199762582778931, "learning_rate": 3.0045977011494254e-05, "loss": 0.1011, "step": 3472 }, { "epoch": 4.025791494602623, "grad_norm": 0.3035762906074524, "learning_rate": 3.0040229885057476e-05, "loss": 0.103, "step": 3473 }, { "epoch": 4.026950662899369, "grad_norm": 0.34868428111076355, "learning_rate": 3.003448275862069e-05, "loss": 0.1066, "step": 3474 }, { "epoch": 4.028109831196117, "grad_norm": 0.4163806438446045, "learning_rate": 3.0028735632183906e-05, "loss": 0.103, "step": 3475 }, { "epoch": 4.029268999492864, "grad_norm": 0.4000799059867859, "learning_rate": 3.002298850574713e-05, "loss": 0.0984, "step": 3476 }, { "epoch": 4.030428167789611, "grad_norm": 0.3437565267086029, "learning_rate": 3.0017241379310346e-05, "loss": 0.0968, "step": 3477 }, { "epoch": 4.031587336086358, "grad_norm": 0.2822440564632416, "learning_rate": 3.001149425287356e-05, "loss": 0.1045, "step": 3478 }, { "epoch": 4.0327465043831054, "grad_norm": 0.44937509298324585, "learning_rate": 3.0005747126436782e-05, "loss": 0.1031, "step": 3479 }, { "epoch": 4.033905672679852, "grad_norm": 0.667371392250061, "learning_rate": 3e-05, "loss": 0.112, "step": 3480 }, { "epoch": 4.0350648409766, "grad_norm": 0.44788849353790283, "learning_rate": 2.9994252873563222e-05, "loss": 0.1159, "step": 3481 }, { "epoch": 4.036224009273346, "grad_norm": 0.3105098009109497, "learning_rate": 2.9988505747126437e-05, "loss": 0.1164, "step": 3482 }, { "epoch": 4.037383177570093, "grad_norm": 0.3047453463077545, "learning_rate": 2.9982758620689656e-05, "loss": 0.098, "step": 3483 }, { "epoch": 4.038542345866841, "grad_norm": 0.39169877767562866, "learning_rate": 2.9977011494252877e-05, "loss": 0.0997, "step": 3484 }, { "epoch": 4.039701514163587, "grad_norm": 0.31317397952079773, "learning_rate": 2.9971264367816092e-05, "loss": 0.1013, "step": 3485 }, { "epoch": 4.040860682460335, "grad_norm": 0.396670937538147, "learning_rate": 2.996551724137931e-05, "loss": 0.1137, "step": 3486 }, { "epoch": 4.0420198507570815, "grad_norm": 0.30599015951156616, "learning_rate": 2.9959770114942532e-05, "loss": 0.1096, "step": 3487 }, { "epoch": 4.043179019053829, "grad_norm": 0.35810816287994385, "learning_rate": 2.9954022988505747e-05, "loss": 0.0991, "step": 3488 }, { "epoch": 4.044338187350576, "grad_norm": 0.36599722504615784, "learning_rate": 2.994827586206897e-05, "loss": 0.1015, "step": 3489 }, { "epoch": 4.045497355647323, "grad_norm": 0.30756571888923645, "learning_rate": 2.9942528735632187e-05, "loss": 0.0986, "step": 3490 }, { "epoch": 4.04665652394407, "grad_norm": 0.42771488428115845, "learning_rate": 2.9936781609195402e-05, "loss": 0.1141, "step": 3491 }, { "epoch": 4.0478156922408175, "grad_norm": 0.24768565595149994, "learning_rate": 2.9931034482758624e-05, "loss": 0.1063, "step": 3492 }, { "epoch": 4.048974860537564, "grad_norm": 0.3938906490802765, "learning_rate": 2.9925287356321842e-05, "loss": 0.096, "step": 3493 }, { "epoch": 4.050134028834312, "grad_norm": 0.3523421883583069, "learning_rate": 2.9919540229885057e-05, "loss": 0.1066, "step": 3494 }, { "epoch": 4.051293197131058, "grad_norm": 0.32402533292770386, "learning_rate": 2.991379310344828e-05, "loss": 0.106, "step": 3495 }, { "epoch": 4.052452365427806, "grad_norm": 0.3420311510562897, "learning_rate": 2.9908045977011497e-05, "loss": 0.0934, "step": 3496 }, { "epoch": 4.053611533724553, "grad_norm": 0.5918967723846436, "learning_rate": 2.990229885057471e-05, "loss": 0.105, "step": 3497 }, { "epoch": 4.054770702021299, "grad_norm": 0.2808108627796173, "learning_rate": 2.9896551724137933e-05, "loss": 0.1022, "step": 3498 }, { "epoch": 4.055929870318047, "grad_norm": 0.43918702006340027, "learning_rate": 2.9890804597701148e-05, "loss": 0.1052, "step": 3499 }, { "epoch": 4.0570890386147935, "grad_norm": 0.32599446177482605, "learning_rate": 2.988505747126437e-05, "loss": 0.0984, "step": 3500 }, { "epoch": 4.058248206911541, "grad_norm": 0.3194628655910492, "learning_rate": 2.9879310344827588e-05, "loss": 0.0974, "step": 3501 }, { "epoch": 4.059407375208288, "grad_norm": 0.358102947473526, "learning_rate": 2.9873563218390803e-05, "loss": 0.1023, "step": 3502 }, { "epoch": 4.060566543505035, "grad_norm": 0.30983057618141174, "learning_rate": 2.9867816091954025e-05, "loss": 0.0888, "step": 3503 }, { "epoch": 4.061725711801782, "grad_norm": 0.38468021154403687, "learning_rate": 2.9862068965517243e-05, "loss": 0.0959, "step": 3504 }, { "epoch": 4.0628848800985295, "grad_norm": 0.4226651191711426, "learning_rate": 2.9856321839080458e-05, "loss": 0.1022, "step": 3505 }, { "epoch": 4.064044048395276, "grad_norm": 0.5442459583282471, "learning_rate": 2.985057471264368e-05, "loss": 0.1106, "step": 3506 }, { "epoch": 4.065203216692024, "grad_norm": 0.3217725455760956, "learning_rate": 2.9844827586206898e-05, "loss": 0.1021, "step": 3507 }, { "epoch": 4.0663623849887705, "grad_norm": 0.30229008197784424, "learning_rate": 2.983908045977012e-05, "loss": 0.1031, "step": 3508 }, { "epoch": 4.067521553285518, "grad_norm": 0.3460632562637329, "learning_rate": 2.9833333333333335e-05, "loss": 0.111, "step": 3509 }, { "epoch": 4.068680721582265, "grad_norm": 0.38224828243255615, "learning_rate": 2.9827586206896553e-05, "loss": 0.1013, "step": 3510 }, { "epoch": 4.069839889879012, "grad_norm": 0.35836493968963623, "learning_rate": 2.9821839080459775e-05, "loss": 0.105, "step": 3511 }, { "epoch": 4.070999058175759, "grad_norm": 0.293371319770813, "learning_rate": 2.981609195402299e-05, "loss": 0.093, "step": 3512 }, { "epoch": 4.072158226472506, "grad_norm": 0.37686097621917725, "learning_rate": 2.9810344827586208e-05, "loss": 0.1018, "step": 3513 }, { "epoch": 4.073317394769253, "grad_norm": 0.30317679047584534, "learning_rate": 2.980459770114943e-05, "loss": 0.1095, "step": 3514 }, { "epoch": 4.074476563066, "grad_norm": 0.3032994866371155, "learning_rate": 2.9798850574712644e-05, "loss": 0.1063, "step": 3515 }, { "epoch": 4.075635731362747, "grad_norm": 0.29661819338798523, "learning_rate": 2.979310344827586e-05, "loss": 0.1068, "step": 3516 }, { "epoch": 4.076794899659494, "grad_norm": 0.34293368458747864, "learning_rate": 2.9787356321839084e-05, "loss": 0.1063, "step": 3517 }, { "epoch": 4.077954067956242, "grad_norm": 0.3552305996417999, "learning_rate": 2.97816091954023e-05, "loss": 0.1002, "step": 3518 }, { "epoch": 4.079113236252988, "grad_norm": 0.3097629249095917, "learning_rate": 2.977586206896552e-05, "loss": 0.0955, "step": 3519 }, { "epoch": 4.080272404549736, "grad_norm": 0.35458773374557495, "learning_rate": 2.9770114942528736e-05, "loss": 0.098, "step": 3520 }, { "epoch": 4.0814315728464825, "grad_norm": 0.30128052830696106, "learning_rate": 2.9764367816091954e-05, "loss": 0.1087, "step": 3521 }, { "epoch": 4.08259074114323, "grad_norm": 0.407092422246933, "learning_rate": 2.9758620689655176e-05, "loss": 0.0927, "step": 3522 }, { "epoch": 4.083749909439977, "grad_norm": 0.43123817443847656, "learning_rate": 2.975287356321839e-05, "loss": 0.1045, "step": 3523 }, { "epoch": 4.084909077736724, "grad_norm": 0.47122377157211304, "learning_rate": 2.974712643678161e-05, "loss": 0.1043, "step": 3524 }, { "epoch": 4.086068246033471, "grad_norm": 0.4324159324169159, "learning_rate": 2.974137931034483e-05, "loss": 0.1095, "step": 3525 }, { "epoch": 4.087227414330218, "grad_norm": 0.3203006982803345, "learning_rate": 2.9735632183908045e-05, "loss": 0.0967, "step": 3526 }, { "epoch": 4.088386582626965, "grad_norm": 0.45111727714538574, "learning_rate": 2.9729885057471267e-05, "loss": 0.0979, "step": 3527 }, { "epoch": 4.089545750923712, "grad_norm": 0.3182992935180664, "learning_rate": 2.9724137931034485e-05, "loss": 0.0957, "step": 3528 }, { "epoch": 4.090704919220459, "grad_norm": 0.392347127199173, "learning_rate": 2.97183908045977e-05, "loss": 0.0982, "step": 3529 }, { "epoch": 4.091864087517206, "grad_norm": 0.41814011335372925, "learning_rate": 2.9712643678160922e-05, "loss": 0.1017, "step": 3530 }, { "epoch": 4.093023255813954, "grad_norm": 0.3611200749874115, "learning_rate": 2.970689655172414e-05, "loss": 0.0953, "step": 3531 }, { "epoch": 4.0941824241107, "grad_norm": 0.3499966859817505, "learning_rate": 2.9701149425287355e-05, "loss": 0.1012, "step": 3532 }, { "epoch": 4.095341592407448, "grad_norm": 0.33209457993507385, "learning_rate": 2.9695402298850577e-05, "loss": 0.0989, "step": 3533 }, { "epoch": 4.0965007607041946, "grad_norm": 0.421478807926178, "learning_rate": 2.9689655172413795e-05, "loss": 0.1052, "step": 3534 }, { "epoch": 4.097659929000942, "grad_norm": 0.3154635727405548, "learning_rate": 2.968390804597701e-05, "loss": 0.0984, "step": 3535 }, { "epoch": 4.098819097297689, "grad_norm": 0.7844545841217041, "learning_rate": 2.9678160919540232e-05, "loss": 0.1113, "step": 3536 }, { "epoch": 4.099978265594436, "grad_norm": 0.3256261646747589, "learning_rate": 2.967241379310345e-05, "loss": 0.0994, "step": 3537 }, { "epoch": 4.101137433891183, "grad_norm": 0.4824637770652771, "learning_rate": 2.9666666666666672e-05, "loss": 0.1117, "step": 3538 }, { "epoch": 4.102296602187931, "grad_norm": 0.46031931042671204, "learning_rate": 2.9660919540229887e-05, "loss": 0.1103, "step": 3539 }, { "epoch": 4.103455770484677, "grad_norm": 0.33370453119277954, "learning_rate": 2.96551724137931e-05, "loss": 0.1121, "step": 3540 }, { "epoch": 4.104614938781424, "grad_norm": 0.4630499482154846, "learning_rate": 2.9649425287356327e-05, "loss": 0.1128, "step": 3541 }, { "epoch": 4.1057741070781715, "grad_norm": 0.47765010595321655, "learning_rate": 2.964367816091954e-05, "loss": 0.0989, "step": 3542 }, { "epoch": 4.106933275374918, "grad_norm": 0.2912958562374115, "learning_rate": 2.9637931034482756e-05, "loss": 0.0968, "step": 3543 }, { "epoch": 4.108092443671666, "grad_norm": 0.4124128520488739, "learning_rate": 2.9632183908045978e-05, "loss": 0.1004, "step": 3544 }, { "epoch": 4.109251611968412, "grad_norm": 0.38116535544395447, "learning_rate": 2.9626436781609196e-05, "loss": 0.1079, "step": 3545 }, { "epoch": 4.11041078026516, "grad_norm": 0.35159099102020264, "learning_rate": 2.9620689655172418e-05, "loss": 0.105, "step": 3546 }, { "epoch": 4.111569948561907, "grad_norm": 0.38807129859924316, "learning_rate": 2.9614942528735633e-05, "loss": 0.1012, "step": 3547 }, { "epoch": 4.112729116858654, "grad_norm": 0.33173301815986633, "learning_rate": 2.960919540229885e-05, "loss": 0.1213, "step": 3548 }, { "epoch": 4.113888285155401, "grad_norm": 0.2943272888660431, "learning_rate": 2.9603448275862073e-05, "loss": 0.1063, "step": 3549 }, { "epoch": 4.115047453452148, "grad_norm": 0.4288482666015625, "learning_rate": 2.9597701149425288e-05, "loss": 0.1047, "step": 3550 }, { "epoch": 4.116206621748895, "grad_norm": 0.3704341948032379, "learning_rate": 2.9591954022988506e-05, "loss": 0.1166, "step": 3551 }, { "epoch": 4.117365790045643, "grad_norm": 0.34703579545021057, "learning_rate": 2.9586206896551728e-05, "loss": 0.1068, "step": 3552 }, { "epoch": 4.118524958342389, "grad_norm": 0.4057508111000061, "learning_rate": 2.9580459770114943e-05, "loss": 0.1048, "step": 3553 }, { "epoch": 4.119684126639137, "grad_norm": 0.30694398283958435, "learning_rate": 2.957471264367816e-05, "loss": 0.1057, "step": 3554 }, { "epoch": 4.1208432949358835, "grad_norm": 0.42865851521492004, "learning_rate": 2.9568965517241383e-05, "loss": 0.1085, "step": 3555 }, { "epoch": 4.12200246323263, "grad_norm": 0.3558603525161743, "learning_rate": 2.9563218390804598e-05, "loss": 0.1029, "step": 3556 }, { "epoch": 4.123161631529378, "grad_norm": 0.40978485345840454, "learning_rate": 2.955747126436782e-05, "loss": 0.095, "step": 3557 }, { "epoch": 4.124320799826124, "grad_norm": 0.3812257647514343, "learning_rate": 2.9551724137931038e-05, "loss": 0.1052, "step": 3558 }, { "epoch": 4.125479968122872, "grad_norm": 0.3055720925331116, "learning_rate": 2.9545977011494252e-05, "loss": 0.1113, "step": 3559 }, { "epoch": 4.126639136419619, "grad_norm": 0.2221103310585022, "learning_rate": 2.9540229885057474e-05, "loss": 0.0886, "step": 3560 }, { "epoch": 4.127798304716366, "grad_norm": 0.3389085531234741, "learning_rate": 2.953448275862069e-05, "loss": 0.113, "step": 3561 }, { "epoch": 4.128957473013113, "grad_norm": 0.3829638659954071, "learning_rate": 2.9528735632183907e-05, "loss": 0.1048, "step": 3562 }, { "epoch": 4.1301166413098604, "grad_norm": 0.5124924182891846, "learning_rate": 2.952298850574713e-05, "loss": 0.0942, "step": 3563 }, { "epoch": 4.131275809606607, "grad_norm": 0.2896119952201843, "learning_rate": 2.9517241379310344e-05, "loss": 0.1041, "step": 3564 }, { "epoch": 4.132434977903355, "grad_norm": 0.39416196942329407, "learning_rate": 2.9511494252873566e-05, "loss": 0.1046, "step": 3565 }, { "epoch": 4.133594146200101, "grad_norm": 0.4558121860027313, "learning_rate": 2.9505747126436784e-05, "loss": 0.1077, "step": 3566 }, { "epoch": 4.134753314496849, "grad_norm": 0.5235021114349365, "learning_rate": 2.95e-05, "loss": 0.0916, "step": 3567 }, { "epoch": 4.135912482793596, "grad_norm": 0.4148746728897095, "learning_rate": 2.949425287356322e-05, "loss": 0.1049, "step": 3568 }, { "epoch": 4.137071651090342, "grad_norm": 0.3960752785205841, "learning_rate": 2.948850574712644e-05, "loss": 0.0955, "step": 3569 }, { "epoch": 4.13823081938709, "grad_norm": 0.2922149896621704, "learning_rate": 2.9482758620689654e-05, "loss": 0.1046, "step": 3570 }, { "epoch": 4.1393899876838365, "grad_norm": 0.461208313703537, "learning_rate": 2.9477011494252875e-05, "loss": 0.1062, "step": 3571 }, { "epoch": 4.140549155980584, "grad_norm": 0.35071003437042236, "learning_rate": 2.9471264367816094e-05, "loss": 0.1028, "step": 3572 }, { "epoch": 4.141708324277331, "grad_norm": 0.3092256486415863, "learning_rate": 2.946551724137931e-05, "loss": 0.0979, "step": 3573 }, { "epoch": 4.142867492574078, "grad_norm": 0.36416730284690857, "learning_rate": 2.945977011494253e-05, "loss": 0.1096, "step": 3574 }, { "epoch": 4.144026660870825, "grad_norm": 0.4897642135620117, "learning_rate": 2.945402298850575e-05, "loss": 0.0942, "step": 3575 }, { "epoch": 4.1451858291675725, "grad_norm": 0.3243999481201172, "learning_rate": 2.944827586206897e-05, "loss": 0.1, "step": 3576 }, { "epoch": 4.146344997464319, "grad_norm": 0.3881404399871826, "learning_rate": 2.9442528735632185e-05, "loss": 0.1136, "step": 3577 }, { "epoch": 4.147504165761067, "grad_norm": 0.3916066884994507, "learning_rate": 2.9436781609195403e-05, "loss": 0.1066, "step": 3578 }, { "epoch": 4.148663334057813, "grad_norm": 0.3146432638168335, "learning_rate": 2.9431034482758625e-05, "loss": 0.0874, "step": 3579 }, { "epoch": 4.149822502354561, "grad_norm": 0.3404291272163391, "learning_rate": 2.942528735632184e-05, "loss": 0.095, "step": 3580 }, { "epoch": 4.150981670651308, "grad_norm": 0.4161677658557892, "learning_rate": 2.9419540229885055e-05, "loss": 0.0965, "step": 3581 }, { "epoch": 4.152140838948055, "grad_norm": 0.38748228549957275, "learning_rate": 2.941379310344828e-05, "loss": 0.1148, "step": 3582 }, { "epoch": 4.153300007244802, "grad_norm": 0.3862541615962982, "learning_rate": 2.9408045977011495e-05, "loss": 0.0994, "step": 3583 }, { "epoch": 4.1544591755415485, "grad_norm": 0.4990377724170685, "learning_rate": 2.9402298850574716e-05, "loss": 0.1023, "step": 3584 }, { "epoch": 4.155618343838296, "grad_norm": 0.3841322362422943, "learning_rate": 2.939655172413793e-05, "loss": 0.1137, "step": 3585 }, { "epoch": 4.156777512135043, "grad_norm": 0.2818942368030548, "learning_rate": 2.939080459770115e-05, "loss": 0.0971, "step": 3586 }, { "epoch": 4.15793668043179, "grad_norm": 0.4144044518470764, "learning_rate": 2.938505747126437e-05, "loss": 0.1, "step": 3587 }, { "epoch": 4.159095848728537, "grad_norm": 0.265598326921463, "learning_rate": 2.9379310344827586e-05, "loss": 0.0911, "step": 3588 }, { "epoch": 4.1602550170252846, "grad_norm": 0.40067845582962036, "learning_rate": 2.9373563218390805e-05, "loss": 0.1078, "step": 3589 }, { "epoch": 4.161414185322031, "grad_norm": 0.3143431842327118, "learning_rate": 2.9367816091954026e-05, "loss": 0.0986, "step": 3590 }, { "epoch": 4.162573353618779, "grad_norm": 0.2772219181060791, "learning_rate": 2.936206896551724e-05, "loss": 0.0944, "step": 3591 }, { "epoch": 4.1637325219155255, "grad_norm": 0.4990723729133606, "learning_rate": 2.935632183908046e-05, "loss": 0.0989, "step": 3592 }, { "epoch": 4.164891690212273, "grad_norm": 0.3033507168292999, "learning_rate": 2.935057471264368e-05, "loss": 0.1002, "step": 3593 }, { "epoch": 4.16605085850902, "grad_norm": 0.28623032569885254, "learning_rate": 2.9344827586206896e-05, "loss": 0.1109, "step": 3594 }, { "epoch": 4.167210026805767, "grad_norm": 0.28263723850250244, "learning_rate": 2.9339080459770118e-05, "loss": 0.1128, "step": 3595 }, { "epoch": 4.168369195102514, "grad_norm": 0.7730621099472046, "learning_rate": 2.9333333333333336e-05, "loss": 0.1082, "step": 3596 }, { "epoch": 4.169528363399261, "grad_norm": 0.306794673204422, "learning_rate": 2.932758620689655e-05, "loss": 0.1018, "step": 3597 }, { "epoch": 4.170687531696008, "grad_norm": 0.3806617558002472, "learning_rate": 2.9321839080459773e-05, "loss": 0.1206, "step": 3598 }, { "epoch": 4.171846699992755, "grad_norm": 0.3783591687679291, "learning_rate": 2.931609195402299e-05, "loss": 0.105, "step": 3599 }, { "epoch": 4.173005868289502, "grad_norm": 0.49170562624931335, "learning_rate": 2.9310344827586206e-05, "loss": 0.1053, "step": 3600 }, { "epoch": 4.174165036586249, "grad_norm": 0.36278071999549866, "learning_rate": 2.9304597701149427e-05, "loss": 0.1006, "step": 3601 }, { "epoch": 4.175324204882997, "grad_norm": 0.4412696957588196, "learning_rate": 2.9298850574712646e-05, "loss": 0.1136, "step": 3602 }, { "epoch": 4.176483373179743, "grad_norm": 0.33460313081741333, "learning_rate": 2.9293103448275867e-05, "loss": 0.1022, "step": 3603 }, { "epoch": 4.177642541476491, "grad_norm": 0.32124656438827515, "learning_rate": 2.9287356321839082e-05, "loss": 0.1033, "step": 3604 }, { "epoch": 4.1788017097732375, "grad_norm": 0.3389214873313904, "learning_rate": 2.9281609195402297e-05, "loss": 0.1074, "step": 3605 }, { "epoch": 4.179960878069985, "grad_norm": 0.3551189601421356, "learning_rate": 2.927586206896552e-05, "loss": 0.0946, "step": 3606 }, { "epoch": 4.181120046366732, "grad_norm": 0.3766753673553467, "learning_rate": 2.9270114942528737e-05, "loss": 0.0939, "step": 3607 }, { "epoch": 4.182279214663479, "grad_norm": 0.362870454788208, "learning_rate": 2.9264367816091952e-05, "loss": 0.1079, "step": 3608 }, { "epoch": 4.183438382960226, "grad_norm": 0.44273027777671814, "learning_rate": 2.9258620689655174e-05, "loss": 0.0969, "step": 3609 }, { "epoch": 4.1845975512569735, "grad_norm": 0.3742962181568146, "learning_rate": 2.9252873563218392e-05, "loss": 0.1129, "step": 3610 }, { "epoch": 4.18575671955372, "grad_norm": 0.4322942793369293, "learning_rate": 2.9247126436781614e-05, "loss": 0.1058, "step": 3611 }, { "epoch": 4.186915887850467, "grad_norm": 0.44302043318748474, "learning_rate": 2.924137931034483e-05, "loss": 0.1062, "step": 3612 }, { "epoch": 4.188075056147214, "grad_norm": 0.32489603757858276, "learning_rate": 2.9235632183908047e-05, "loss": 0.0974, "step": 3613 }, { "epoch": 4.189234224443961, "grad_norm": 0.282850444316864, "learning_rate": 2.922988505747127e-05, "loss": 0.0941, "step": 3614 }, { "epoch": 4.190393392740709, "grad_norm": 0.27374568581581116, "learning_rate": 2.9224137931034483e-05, "loss": 0.0963, "step": 3615 }, { "epoch": 4.191552561037455, "grad_norm": 0.31600022315979004, "learning_rate": 2.9218390804597702e-05, "loss": 0.1027, "step": 3616 }, { "epoch": 4.192711729334203, "grad_norm": 0.24971164762973785, "learning_rate": 2.9212643678160923e-05, "loss": 0.1103, "step": 3617 }, { "epoch": 4.19387089763095, "grad_norm": 0.3458114564418793, "learning_rate": 2.920689655172414e-05, "loss": 0.0996, "step": 3618 }, { "epoch": 4.195030065927697, "grad_norm": 0.3708142340183258, "learning_rate": 2.9201149425287357e-05, "loss": 0.0953, "step": 3619 }, { "epoch": 4.196189234224444, "grad_norm": 0.3767480254173279, "learning_rate": 2.919540229885058e-05, "loss": 0.1081, "step": 3620 }, { "epoch": 4.197348402521191, "grad_norm": 0.34636861085891724, "learning_rate": 2.9189655172413793e-05, "loss": 0.0996, "step": 3621 }, { "epoch": 4.198507570817938, "grad_norm": 0.4181584119796753, "learning_rate": 2.9183908045977015e-05, "loss": 0.0951, "step": 3622 }, { "epoch": 4.199666739114686, "grad_norm": 0.4052351117134094, "learning_rate": 2.9178160919540233e-05, "loss": 0.1059, "step": 3623 }, { "epoch": 4.200825907411432, "grad_norm": 0.28023508191108704, "learning_rate": 2.9172413793103448e-05, "loss": 0.0992, "step": 3624 }, { "epoch": 4.20198507570818, "grad_norm": 0.3520253896713257, "learning_rate": 2.916666666666667e-05, "loss": 0.107, "step": 3625 }, { "epoch": 4.2031442440049265, "grad_norm": 0.3649381697177887, "learning_rate": 2.9160919540229885e-05, "loss": 0.114, "step": 3626 }, { "epoch": 4.204303412301673, "grad_norm": 0.36334317922592163, "learning_rate": 2.9155172413793103e-05, "loss": 0.1012, "step": 3627 }, { "epoch": 4.205462580598421, "grad_norm": 0.3084128797054291, "learning_rate": 2.9149425287356325e-05, "loss": 0.1035, "step": 3628 }, { "epoch": 4.206621748895167, "grad_norm": 0.31549155712127686, "learning_rate": 2.914367816091954e-05, "loss": 0.1022, "step": 3629 }, { "epoch": 4.207780917191915, "grad_norm": 0.33115145564079285, "learning_rate": 2.913793103448276e-05, "loss": 0.1034, "step": 3630 }, { "epoch": 4.208940085488662, "grad_norm": 0.321877121925354, "learning_rate": 2.913218390804598e-05, "loss": 0.0948, "step": 3631 }, { "epoch": 4.210099253785409, "grad_norm": 0.34342601895332336, "learning_rate": 2.9126436781609194e-05, "loss": 0.0987, "step": 3632 }, { "epoch": 4.211258422082156, "grad_norm": 0.3793211579322815, "learning_rate": 2.9120689655172416e-05, "loss": 0.1032, "step": 3633 }, { "epoch": 4.212417590378903, "grad_norm": 0.46431970596313477, "learning_rate": 2.9114942528735634e-05, "loss": 0.0997, "step": 3634 }, { "epoch": 4.21357675867565, "grad_norm": 0.27705472707748413, "learning_rate": 2.910919540229885e-05, "loss": 0.0957, "step": 3635 }, { "epoch": 4.214735926972398, "grad_norm": 0.5416485667228699, "learning_rate": 2.910344827586207e-05, "loss": 0.1021, "step": 3636 }, { "epoch": 4.215895095269144, "grad_norm": 0.3653283715248108, "learning_rate": 2.909770114942529e-05, "loss": 0.0992, "step": 3637 }, { "epoch": 4.217054263565892, "grad_norm": 0.34338125586509705, "learning_rate": 2.9091954022988504e-05, "loss": 0.1045, "step": 3638 }, { "epoch": 4.2182134318626385, "grad_norm": 0.39666372537612915, "learning_rate": 2.9086206896551726e-05, "loss": 0.1029, "step": 3639 }, { "epoch": 4.219372600159385, "grad_norm": 0.46274009346961975, "learning_rate": 2.9080459770114944e-05, "loss": 0.1097, "step": 3640 }, { "epoch": 4.220531768456133, "grad_norm": 0.3507300615310669, "learning_rate": 2.9074712643678166e-05, "loss": 0.1001, "step": 3641 }, { "epoch": 4.221690936752879, "grad_norm": 0.5658450126647949, "learning_rate": 2.906896551724138e-05, "loss": 0.1062, "step": 3642 }, { "epoch": 4.222850105049627, "grad_norm": 0.33815431594848633, "learning_rate": 2.90632183908046e-05, "loss": 0.1002, "step": 3643 }, { "epoch": 4.224009273346374, "grad_norm": 0.3951635956764221, "learning_rate": 2.905747126436782e-05, "loss": 0.0966, "step": 3644 }, { "epoch": 4.225168441643121, "grad_norm": 0.4805266261100769, "learning_rate": 2.9051724137931036e-05, "loss": 0.1122, "step": 3645 }, { "epoch": 4.226327609939868, "grad_norm": 0.3607132136821747, "learning_rate": 2.904597701149425e-05, "loss": 0.0924, "step": 3646 }, { "epoch": 4.2274867782366155, "grad_norm": 0.3858696520328522, "learning_rate": 2.9040229885057476e-05, "loss": 0.0954, "step": 3647 }, { "epoch": 4.228645946533362, "grad_norm": 0.35056713223457336, "learning_rate": 2.903448275862069e-05, "loss": 0.1034, "step": 3648 }, { "epoch": 4.22980511483011, "grad_norm": 0.45169034600257874, "learning_rate": 2.9028735632183912e-05, "loss": 0.1089, "step": 3649 }, { "epoch": 4.230964283126856, "grad_norm": 0.33204302191734314, "learning_rate": 2.9022988505747127e-05, "loss": 0.1078, "step": 3650 }, { "epoch": 4.232123451423604, "grad_norm": 0.4049871563911438, "learning_rate": 2.9017241379310345e-05, "loss": 0.1022, "step": 3651 }, { "epoch": 4.233282619720351, "grad_norm": 0.37815043330192566, "learning_rate": 2.9011494252873567e-05, "loss": 0.1025, "step": 3652 }, { "epoch": 4.234441788017098, "grad_norm": 0.6017569899559021, "learning_rate": 2.9005747126436782e-05, "loss": 0.1095, "step": 3653 }, { "epoch": 4.235600956313845, "grad_norm": 0.44660142064094543, "learning_rate": 2.9e-05, "loss": 0.1027, "step": 3654 }, { "epoch": 4.2367601246105915, "grad_norm": 0.22865894436836243, "learning_rate": 2.8994252873563222e-05, "loss": 0.0953, "step": 3655 }, { "epoch": 4.237919292907339, "grad_norm": 0.28317615389823914, "learning_rate": 2.8988505747126437e-05, "loss": 0.1009, "step": 3656 }, { "epoch": 4.239078461204086, "grad_norm": 0.30559006333351135, "learning_rate": 2.8982758620689655e-05, "loss": 0.1079, "step": 3657 }, { "epoch": 4.240237629500833, "grad_norm": 0.3804382383823395, "learning_rate": 2.8977011494252877e-05, "loss": 0.115, "step": 3658 }, { "epoch": 4.24139679779758, "grad_norm": 0.4205724000930786, "learning_rate": 2.897126436781609e-05, "loss": 0.1013, "step": 3659 }, { "epoch": 4.2425559660943275, "grad_norm": 0.3758813142776489, "learning_rate": 2.8965517241379313e-05, "loss": 0.103, "step": 3660 }, { "epoch": 4.243715134391074, "grad_norm": 0.6086719632148743, "learning_rate": 2.895977011494253e-05, "loss": 0.1231, "step": 3661 }, { "epoch": 4.244874302687822, "grad_norm": 0.4006671607494354, "learning_rate": 2.8954022988505746e-05, "loss": 0.0924, "step": 3662 }, { "epoch": 4.246033470984568, "grad_norm": 0.41011175513267517, "learning_rate": 2.8948275862068968e-05, "loss": 0.0992, "step": 3663 }, { "epoch": 4.247192639281316, "grad_norm": 0.4147280156612396, "learning_rate": 2.8942528735632186e-05, "loss": 0.1034, "step": 3664 }, { "epoch": 4.248351807578063, "grad_norm": 0.32018283009529114, "learning_rate": 2.89367816091954e-05, "loss": 0.0927, "step": 3665 }, { "epoch": 4.24951097587481, "grad_norm": 0.3739756643772125, "learning_rate": 2.8931034482758623e-05, "loss": 0.11, "step": 3666 }, { "epoch": 4.250670144171557, "grad_norm": 0.3220639228820801, "learning_rate": 2.8925287356321838e-05, "loss": 0.0968, "step": 3667 }, { "epoch": 4.251829312468304, "grad_norm": 0.3207123279571533, "learning_rate": 2.8919540229885063e-05, "loss": 0.1002, "step": 3668 }, { "epoch": 4.252988480765051, "grad_norm": 0.38051706552505493, "learning_rate": 2.8913793103448278e-05, "loss": 0.1089, "step": 3669 }, { "epoch": 4.254147649061798, "grad_norm": 0.3652991056442261, "learning_rate": 2.8908045977011493e-05, "loss": 0.1044, "step": 3670 }, { "epoch": 4.255306817358545, "grad_norm": 0.4439450204372406, "learning_rate": 2.8902298850574714e-05, "loss": 0.1028, "step": 3671 }, { "epoch": 4.256465985655292, "grad_norm": 0.2599686086177826, "learning_rate": 2.8896551724137933e-05, "loss": 0.0919, "step": 3672 }, { "epoch": 4.2576251539520396, "grad_norm": 0.33243459463119507, "learning_rate": 2.8890804597701148e-05, "loss": 0.1044, "step": 3673 }, { "epoch": 4.258784322248786, "grad_norm": 0.305882066488266, "learning_rate": 2.888505747126437e-05, "loss": 0.0968, "step": 3674 }, { "epoch": 4.259943490545534, "grad_norm": 0.403209388256073, "learning_rate": 2.8879310344827588e-05, "loss": 0.1124, "step": 3675 }, { "epoch": 4.2611026588422805, "grad_norm": 0.32304686307907104, "learning_rate": 2.8873563218390803e-05, "loss": 0.0908, "step": 3676 }, { "epoch": 4.262261827139028, "grad_norm": 0.3692949116230011, "learning_rate": 2.8867816091954024e-05, "loss": 0.1067, "step": 3677 }, { "epoch": 4.263420995435775, "grad_norm": 0.2813093066215515, "learning_rate": 2.8862068965517243e-05, "loss": 0.1046, "step": 3678 }, { "epoch": 4.264580163732522, "grad_norm": 0.28762635588645935, "learning_rate": 2.8856321839080464e-05, "loss": 0.0968, "step": 3679 }, { "epoch": 4.265739332029269, "grad_norm": 0.447125643491745, "learning_rate": 2.885057471264368e-05, "loss": 0.115, "step": 3680 }, { "epoch": 4.2668985003260165, "grad_norm": 0.32157212495803833, "learning_rate": 2.8844827586206897e-05, "loss": 0.0963, "step": 3681 }, { "epoch": 4.268057668622763, "grad_norm": 0.5032519698143005, "learning_rate": 2.883908045977012e-05, "loss": 0.107, "step": 3682 }, { "epoch": 4.26921683691951, "grad_norm": 0.2923990786075592, "learning_rate": 2.8833333333333334e-05, "loss": 0.0884, "step": 3683 }, { "epoch": 4.270376005216257, "grad_norm": 0.4205268621444702, "learning_rate": 2.8827586206896552e-05, "loss": 0.1035, "step": 3684 }, { "epoch": 4.271535173513004, "grad_norm": 0.343766987323761, "learning_rate": 2.8821839080459774e-05, "loss": 0.1016, "step": 3685 }, { "epoch": 4.272694341809752, "grad_norm": 0.39418303966522217, "learning_rate": 2.881609195402299e-05, "loss": 0.1057, "step": 3686 }, { "epoch": 4.273853510106498, "grad_norm": 0.41105031967163086, "learning_rate": 2.881034482758621e-05, "loss": 0.0932, "step": 3687 }, { "epoch": 4.275012678403246, "grad_norm": 0.32232189178466797, "learning_rate": 2.880459770114943e-05, "loss": 0.1043, "step": 3688 }, { "epoch": 4.2761718466999925, "grad_norm": 0.49274659156799316, "learning_rate": 2.8798850574712644e-05, "loss": 0.1039, "step": 3689 }, { "epoch": 4.27733101499674, "grad_norm": 0.5565973520278931, "learning_rate": 2.8793103448275865e-05, "loss": 0.1054, "step": 3690 }, { "epoch": 4.278490183293487, "grad_norm": 0.366921603679657, "learning_rate": 2.878735632183908e-05, "loss": 0.1087, "step": 3691 }, { "epoch": 4.279649351590234, "grad_norm": 0.3326165974140167, "learning_rate": 2.87816091954023e-05, "loss": 0.0971, "step": 3692 }, { "epoch": 4.280808519886981, "grad_norm": 0.46312418580055237, "learning_rate": 2.877586206896552e-05, "loss": 0.1038, "step": 3693 }, { "epoch": 4.2819676881837285, "grad_norm": 0.2820906639099121, "learning_rate": 2.8770114942528735e-05, "loss": 0.0958, "step": 3694 }, { "epoch": 4.283126856480475, "grad_norm": 0.3850758969783783, "learning_rate": 2.8764367816091953e-05, "loss": 0.1098, "step": 3695 }, { "epoch": 4.284286024777223, "grad_norm": 0.4358065128326416, "learning_rate": 2.8758620689655175e-05, "loss": 0.1004, "step": 3696 }, { "epoch": 4.285445193073969, "grad_norm": 0.3014400601387024, "learning_rate": 2.875287356321839e-05, "loss": 0.1031, "step": 3697 }, { "epoch": 4.286604361370716, "grad_norm": 0.30815666913986206, "learning_rate": 2.8747126436781612e-05, "loss": 0.1043, "step": 3698 }, { "epoch": 4.287763529667464, "grad_norm": 0.39481833577156067, "learning_rate": 2.874137931034483e-05, "loss": 0.1068, "step": 3699 }, { "epoch": 4.28892269796421, "grad_norm": 0.28407761454582214, "learning_rate": 2.8735632183908045e-05, "loss": 0.0992, "step": 3700 }, { "epoch": 4.290081866260958, "grad_norm": 0.3631652891635895, "learning_rate": 2.8729885057471267e-05, "loss": 0.1069, "step": 3701 }, { "epoch": 4.291241034557705, "grad_norm": 0.3379894495010376, "learning_rate": 2.8724137931034485e-05, "loss": 0.1074, "step": 3702 }, { "epoch": 4.292400202854452, "grad_norm": 0.29723644256591797, "learning_rate": 2.87183908045977e-05, "loss": 0.1051, "step": 3703 }, { "epoch": 4.293559371151199, "grad_norm": 0.4353717863559723, "learning_rate": 2.871264367816092e-05, "loss": 0.1067, "step": 3704 }, { "epoch": 4.294718539447946, "grad_norm": 0.30414873361587524, "learning_rate": 2.870689655172414e-05, "loss": 0.0915, "step": 3705 }, { "epoch": 4.295877707744693, "grad_norm": 0.5142664909362793, "learning_rate": 2.870114942528736e-05, "loss": 0.1019, "step": 3706 }, { "epoch": 4.297036876041441, "grad_norm": 0.47813358902931213, "learning_rate": 2.8695402298850576e-05, "loss": 0.0917, "step": 3707 }, { "epoch": 4.298196044338187, "grad_norm": 0.418399840593338, "learning_rate": 2.8689655172413795e-05, "loss": 0.1045, "step": 3708 }, { "epoch": 4.299355212634935, "grad_norm": 0.36528536677360535, "learning_rate": 2.8683908045977016e-05, "loss": 0.112, "step": 3709 }, { "epoch": 4.3005143809316815, "grad_norm": 0.41568946838378906, "learning_rate": 2.867816091954023e-05, "loss": 0.0988, "step": 3710 }, { "epoch": 4.301673549228429, "grad_norm": 0.3442577123641968, "learning_rate": 2.8672413793103446e-05, "loss": 0.0949, "step": 3711 }, { "epoch": 4.302832717525176, "grad_norm": 0.4584302604198456, "learning_rate": 2.8666666666666668e-05, "loss": 0.1037, "step": 3712 }, { "epoch": 4.303991885821922, "grad_norm": 0.36319372057914734, "learning_rate": 2.8660919540229886e-05, "loss": 0.1082, "step": 3713 }, { "epoch": 4.30515105411867, "grad_norm": 0.2930583953857422, "learning_rate": 2.86551724137931e-05, "loss": 0.0955, "step": 3714 }, { "epoch": 4.306310222415417, "grad_norm": 0.4411957859992981, "learning_rate": 2.8649425287356323e-05, "loss": 0.1108, "step": 3715 }, { "epoch": 4.307469390712164, "grad_norm": 0.3296014070510864, "learning_rate": 2.864367816091954e-05, "loss": 0.1025, "step": 3716 }, { "epoch": 4.308628559008911, "grad_norm": 0.3820302188396454, "learning_rate": 2.8637931034482763e-05, "loss": 0.1105, "step": 3717 }, { "epoch": 4.309787727305658, "grad_norm": 0.30717596411705017, "learning_rate": 2.8632183908045978e-05, "loss": 0.1037, "step": 3718 }, { "epoch": 4.310946895602405, "grad_norm": 0.4110417664051056, "learning_rate": 2.8626436781609196e-05, "loss": 0.1076, "step": 3719 }, { "epoch": 4.312106063899153, "grad_norm": 0.3443779945373535, "learning_rate": 2.8620689655172417e-05, "loss": 0.0982, "step": 3720 }, { "epoch": 4.313265232195899, "grad_norm": 0.2677842080593109, "learning_rate": 2.8614942528735632e-05, "loss": 0.1009, "step": 3721 }, { "epoch": 4.314424400492647, "grad_norm": 0.28737112879753113, "learning_rate": 2.860919540229885e-05, "loss": 0.0953, "step": 3722 }, { "epoch": 4.3155835687893935, "grad_norm": 0.3234851658344269, "learning_rate": 2.8603448275862072e-05, "loss": 0.0992, "step": 3723 }, { "epoch": 4.316742737086141, "grad_norm": 0.2924320101737976, "learning_rate": 2.8597701149425287e-05, "loss": 0.1053, "step": 3724 }, { "epoch": 4.317901905382888, "grad_norm": 0.42933475971221924, "learning_rate": 2.859195402298851e-05, "loss": 0.1063, "step": 3725 }, { "epoch": 4.319061073679634, "grad_norm": 0.34258314967155457, "learning_rate": 2.8586206896551727e-05, "loss": 0.1059, "step": 3726 }, { "epoch": 4.320220241976382, "grad_norm": 0.3477523624897003, "learning_rate": 2.8580459770114942e-05, "loss": 0.0947, "step": 3727 }, { "epoch": 4.321379410273129, "grad_norm": 0.5333256125450134, "learning_rate": 2.8574712643678164e-05, "loss": 0.1111, "step": 3728 }, { "epoch": 4.322538578569876, "grad_norm": 0.3772802948951721, "learning_rate": 2.8568965517241382e-05, "loss": 0.0991, "step": 3729 }, { "epoch": 4.323697746866623, "grad_norm": 0.415400892496109, "learning_rate": 2.8563218390804597e-05, "loss": 0.108, "step": 3730 }, { "epoch": 4.3248569151633705, "grad_norm": 0.34160271286964417, "learning_rate": 2.855747126436782e-05, "loss": 0.0992, "step": 3731 }, { "epoch": 4.326016083460117, "grad_norm": 0.29234248399734497, "learning_rate": 2.8551724137931034e-05, "loss": 0.1012, "step": 3732 }, { "epoch": 4.327175251756865, "grad_norm": 0.36780115962028503, "learning_rate": 2.8545977011494252e-05, "loss": 0.1029, "step": 3733 }, { "epoch": 4.328334420053611, "grad_norm": 0.3559337556362152, "learning_rate": 2.8540229885057474e-05, "loss": 0.116, "step": 3734 }, { "epoch": 4.329493588350359, "grad_norm": 0.35530203580856323, "learning_rate": 2.853448275862069e-05, "loss": 0.1117, "step": 3735 }, { "epoch": 4.330652756647106, "grad_norm": 0.39250123500823975, "learning_rate": 2.852873563218391e-05, "loss": 0.1021, "step": 3736 }, { "epoch": 4.331811924943853, "grad_norm": 0.3159255087375641, "learning_rate": 2.852298850574713e-05, "loss": 0.1061, "step": 3737 }, { "epoch": 4.3329710932406, "grad_norm": 0.28667929768562317, "learning_rate": 2.8517241379310343e-05, "loss": 0.0963, "step": 3738 }, { "epoch": 4.3341302615373465, "grad_norm": 0.49704504013061523, "learning_rate": 2.8511494252873565e-05, "loss": 0.1079, "step": 3739 }, { "epoch": 4.335289429834094, "grad_norm": 0.383167564868927, "learning_rate": 2.8505747126436783e-05, "loss": 0.1031, "step": 3740 }, { "epoch": 4.336448598130841, "grad_norm": 0.28818759322166443, "learning_rate": 2.8499999999999998e-05, "loss": 0.1103, "step": 3741 }, { "epoch": 4.337607766427588, "grad_norm": 0.3614335358142853, "learning_rate": 2.849425287356322e-05, "loss": 0.1032, "step": 3742 }, { "epoch": 4.338766934724335, "grad_norm": 0.3309556841850281, "learning_rate": 2.8488505747126438e-05, "loss": 0.1053, "step": 3743 }, { "epoch": 4.3399261030210825, "grad_norm": 0.4003108739852905, "learning_rate": 2.848275862068966e-05, "loss": 0.1038, "step": 3744 }, { "epoch": 4.341085271317829, "grad_norm": 0.30309104919433594, "learning_rate": 2.8477011494252875e-05, "loss": 0.1019, "step": 3745 }, { "epoch": 4.342244439614577, "grad_norm": 0.3004578649997711, "learning_rate": 2.8471264367816093e-05, "loss": 0.0911, "step": 3746 }, { "epoch": 4.343403607911323, "grad_norm": 0.3777802586555481, "learning_rate": 2.8465517241379315e-05, "loss": 0.0995, "step": 3747 }, { "epoch": 4.344562776208071, "grad_norm": 0.29196664690971375, "learning_rate": 2.845977011494253e-05, "loss": 0.101, "step": 3748 }, { "epoch": 4.345721944504818, "grad_norm": 0.4231297969818115, "learning_rate": 2.8454022988505748e-05, "loss": 0.1033, "step": 3749 }, { "epoch": 4.346881112801565, "grad_norm": 0.38380828499794006, "learning_rate": 2.844827586206897e-05, "loss": 0.1193, "step": 3750 }, { "epoch": 4.348040281098312, "grad_norm": 0.37438878417015076, "learning_rate": 2.8442528735632184e-05, "loss": 0.0972, "step": 3751 }, { "epoch": 4.349199449395059, "grad_norm": 0.29754364490509033, "learning_rate": 2.84367816091954e-05, "loss": 0.0936, "step": 3752 }, { "epoch": 4.350358617691806, "grad_norm": 0.2982840836048126, "learning_rate": 2.8431034482758624e-05, "loss": 0.0892, "step": 3753 }, { "epoch": 4.351517785988554, "grad_norm": 0.4285561740398407, "learning_rate": 2.842528735632184e-05, "loss": 0.0987, "step": 3754 }, { "epoch": 4.3526769542853, "grad_norm": 0.356584370136261, "learning_rate": 2.841954022988506e-05, "loss": 0.1078, "step": 3755 }, { "epoch": 4.353836122582047, "grad_norm": 0.4146307408809662, "learning_rate": 2.8413793103448276e-05, "loss": 0.1098, "step": 3756 }, { "epoch": 4.354995290878795, "grad_norm": 0.39184170961380005, "learning_rate": 2.8408045977011494e-05, "loss": 0.0973, "step": 3757 }, { "epoch": 4.356154459175541, "grad_norm": 0.38053470849990845, "learning_rate": 2.8402298850574716e-05, "loss": 0.106, "step": 3758 }, { "epoch": 4.357313627472289, "grad_norm": 0.4440837800502777, "learning_rate": 2.839655172413793e-05, "loss": 0.1195, "step": 3759 }, { "epoch": 4.3584727957690355, "grad_norm": 0.4450792670249939, "learning_rate": 2.839080459770115e-05, "loss": 0.1102, "step": 3760 }, { "epoch": 4.359631964065783, "grad_norm": 0.29842203855514526, "learning_rate": 2.838505747126437e-05, "loss": 0.1029, "step": 3761 }, { "epoch": 4.36079113236253, "grad_norm": 0.35125017166137695, "learning_rate": 2.8379310344827586e-05, "loss": 0.1055, "step": 3762 }, { "epoch": 4.361950300659277, "grad_norm": 0.43805792927742004, "learning_rate": 2.8373563218390807e-05, "loss": 0.1018, "step": 3763 }, { "epoch": 4.363109468956024, "grad_norm": 0.4038124978542328, "learning_rate": 2.8367816091954026e-05, "loss": 0.1019, "step": 3764 }, { "epoch": 4.3642686372527715, "grad_norm": 0.3258083462715149, "learning_rate": 2.836206896551724e-05, "loss": 0.0994, "step": 3765 }, { "epoch": 4.365427805549518, "grad_norm": 0.3961776793003082, "learning_rate": 2.8356321839080462e-05, "loss": 0.1168, "step": 3766 }, { "epoch": 4.366586973846266, "grad_norm": 0.39625585079193115, "learning_rate": 2.835057471264368e-05, "loss": 0.1075, "step": 3767 }, { "epoch": 4.367746142143012, "grad_norm": 0.3018212914466858, "learning_rate": 2.8344827586206895e-05, "loss": 0.1109, "step": 3768 }, { "epoch": 4.368905310439759, "grad_norm": 0.32641318440437317, "learning_rate": 2.8339080459770117e-05, "loss": 0.1058, "step": 3769 }, { "epoch": 4.370064478736507, "grad_norm": 0.28830480575561523, "learning_rate": 2.8333333333333335e-05, "loss": 0.0888, "step": 3770 }, { "epoch": 4.371223647033253, "grad_norm": 0.299063116312027, "learning_rate": 2.8327586206896557e-05, "loss": 0.0994, "step": 3771 }, { "epoch": 4.372382815330001, "grad_norm": 0.3458064794540405, "learning_rate": 2.8321839080459772e-05, "loss": 0.099, "step": 3772 }, { "epoch": 4.3735419836267475, "grad_norm": 0.44503387808799744, "learning_rate": 2.8316091954022987e-05, "loss": 0.1047, "step": 3773 }, { "epoch": 4.374701151923495, "grad_norm": 0.34976091980934143, "learning_rate": 2.8310344827586212e-05, "loss": 0.1, "step": 3774 }, { "epoch": 4.375860320220242, "grad_norm": 0.6220278739929199, "learning_rate": 2.8304597701149427e-05, "loss": 0.1055, "step": 3775 }, { "epoch": 4.377019488516989, "grad_norm": 0.5796971917152405, "learning_rate": 2.8298850574712642e-05, "loss": 0.1164, "step": 3776 }, { "epoch": 4.378178656813736, "grad_norm": 0.38796380162239075, "learning_rate": 2.8293103448275863e-05, "loss": 0.0984, "step": 3777 }, { "epoch": 4.3793378251104835, "grad_norm": 0.33590441942214966, "learning_rate": 2.828735632183908e-05, "loss": 0.1074, "step": 3778 }, { "epoch": 4.38049699340723, "grad_norm": 0.3619459569454193, "learning_rate": 2.8281609195402297e-05, "loss": 0.1002, "step": 3779 }, { "epoch": 4.381656161703978, "grad_norm": 0.27559658885002136, "learning_rate": 2.8275862068965518e-05, "loss": 0.098, "step": 3780 }, { "epoch": 4.382815330000724, "grad_norm": 0.25252318382263184, "learning_rate": 2.8270114942528737e-05, "loss": 0.0882, "step": 3781 }, { "epoch": 4.383974498297471, "grad_norm": 0.35869985818862915, "learning_rate": 2.8264367816091958e-05, "loss": 0.106, "step": 3782 }, { "epoch": 4.385133666594219, "grad_norm": 0.4135657250881195, "learning_rate": 2.8258620689655173e-05, "loss": 0.1053, "step": 3783 }, { "epoch": 4.386292834890965, "grad_norm": 0.3181520104408264, "learning_rate": 2.825287356321839e-05, "loss": 0.1071, "step": 3784 }, { "epoch": 4.387452003187713, "grad_norm": 0.5326215624809265, "learning_rate": 2.8247126436781613e-05, "loss": 0.1095, "step": 3785 }, { "epoch": 4.38861117148446, "grad_norm": 0.278712660074234, "learning_rate": 2.8241379310344828e-05, "loss": 0.0979, "step": 3786 }, { "epoch": 4.389770339781207, "grad_norm": 0.30752044916152954, "learning_rate": 2.8235632183908046e-05, "loss": 0.0938, "step": 3787 }, { "epoch": 4.390929508077954, "grad_norm": 0.3980220854282379, "learning_rate": 2.8229885057471268e-05, "loss": 0.101, "step": 3788 }, { "epoch": 4.392088676374701, "grad_norm": 0.3621883690357208, "learning_rate": 2.8224137931034483e-05, "loss": 0.1064, "step": 3789 }, { "epoch": 4.393247844671448, "grad_norm": 0.3474581837654114, "learning_rate": 2.8218390804597705e-05, "loss": 0.1024, "step": 3790 }, { "epoch": 4.394407012968196, "grad_norm": 0.33540961146354675, "learning_rate": 2.8212643678160923e-05, "loss": 0.1032, "step": 3791 }, { "epoch": 4.395566181264942, "grad_norm": 0.39876723289489746, "learning_rate": 2.8206896551724138e-05, "loss": 0.0977, "step": 3792 }, { "epoch": 4.39672534956169, "grad_norm": 0.31449171900749207, "learning_rate": 2.820114942528736e-05, "loss": 0.0971, "step": 3793 }, { "epoch": 4.3978845178584365, "grad_norm": 0.30794376134872437, "learning_rate": 2.8195402298850578e-05, "loss": 0.11, "step": 3794 }, { "epoch": 4.399043686155184, "grad_norm": 0.30705249309539795, "learning_rate": 2.8189655172413793e-05, "loss": 0.1023, "step": 3795 }, { "epoch": 4.400202854451931, "grad_norm": 0.37450698018074036, "learning_rate": 2.8183908045977014e-05, "loss": 0.1003, "step": 3796 }, { "epoch": 4.401362022748678, "grad_norm": 0.4203515350818634, "learning_rate": 2.817816091954023e-05, "loss": 0.1178, "step": 3797 }, { "epoch": 4.402521191045425, "grad_norm": 0.31087568402290344, "learning_rate": 2.8172413793103447e-05, "loss": 0.115, "step": 3798 }, { "epoch": 4.403680359342172, "grad_norm": 0.32186147570610046, "learning_rate": 2.816666666666667e-05, "loss": 0.0939, "step": 3799 }, { "epoch": 4.404839527638919, "grad_norm": 0.38315415382385254, "learning_rate": 2.8160919540229884e-05, "loss": 0.0979, "step": 3800 }, { "epoch": 4.405998695935666, "grad_norm": 0.5825996398925781, "learning_rate": 2.8155172413793106e-05, "loss": 0.109, "step": 3801 }, { "epoch": 4.407157864232413, "grad_norm": 0.42345044016838074, "learning_rate": 2.8149425287356324e-05, "loss": 0.106, "step": 3802 }, { "epoch": 4.40831703252916, "grad_norm": 0.2959122657775879, "learning_rate": 2.814367816091954e-05, "loss": 0.0996, "step": 3803 }, { "epoch": 4.409476200825908, "grad_norm": 0.479018896818161, "learning_rate": 2.813793103448276e-05, "loss": 0.0952, "step": 3804 }, { "epoch": 4.410635369122654, "grad_norm": 0.3577905297279358, "learning_rate": 2.813218390804598e-05, "loss": 0.1047, "step": 3805 }, { "epoch": 4.411794537419402, "grad_norm": 0.5896329283714294, "learning_rate": 2.8126436781609194e-05, "loss": 0.1145, "step": 3806 }, { "epoch": 4.4129537057161485, "grad_norm": 0.31682226061820984, "learning_rate": 2.8120689655172415e-05, "loss": 0.0913, "step": 3807 }, { "epoch": 4.414112874012896, "grad_norm": 0.38672780990600586, "learning_rate": 2.8114942528735634e-05, "loss": 0.1036, "step": 3808 }, { "epoch": 4.415272042309643, "grad_norm": 0.3999164402484894, "learning_rate": 2.8109195402298855e-05, "loss": 0.0992, "step": 3809 }, { "epoch": 4.41643121060639, "grad_norm": 0.32970115542411804, "learning_rate": 2.810344827586207e-05, "loss": 0.0957, "step": 3810 }, { "epoch": 4.417590378903137, "grad_norm": 0.4063851535320282, "learning_rate": 2.809770114942529e-05, "loss": 0.1041, "step": 3811 }, { "epoch": 4.418749547199884, "grad_norm": 0.4128503203392029, "learning_rate": 2.809195402298851e-05, "loss": 0.1006, "step": 3812 }, { "epoch": 4.419908715496631, "grad_norm": 0.3817613124847412, "learning_rate": 2.8086206896551725e-05, "loss": 0.1138, "step": 3813 }, { "epoch": 4.421067883793378, "grad_norm": 0.4013262391090393, "learning_rate": 2.8080459770114944e-05, "loss": 0.1078, "step": 3814 }, { "epoch": 4.4222270520901255, "grad_norm": 0.4090496599674225, "learning_rate": 2.8074712643678165e-05, "loss": 0.1052, "step": 3815 }, { "epoch": 4.423386220386872, "grad_norm": 0.307972252368927, "learning_rate": 2.806896551724138e-05, "loss": 0.1008, "step": 3816 }, { "epoch": 4.42454538868362, "grad_norm": 0.4433940052986145, "learning_rate": 2.8063218390804595e-05, "loss": 0.1126, "step": 3817 }, { "epoch": 4.425704556980366, "grad_norm": 0.34657689929008484, "learning_rate": 2.8057471264367817e-05, "loss": 0.085, "step": 3818 }, { "epoch": 4.426863725277114, "grad_norm": 0.4442926049232483, "learning_rate": 2.8051724137931035e-05, "loss": 0.1089, "step": 3819 }, { "epoch": 4.428022893573861, "grad_norm": 0.5043272376060486, "learning_rate": 2.8045977011494257e-05, "loss": 0.1284, "step": 3820 }, { "epoch": 4.429182061870608, "grad_norm": 0.34867578744888306, "learning_rate": 2.804022988505747e-05, "loss": 0.101, "step": 3821 }, { "epoch": 4.430341230167355, "grad_norm": 0.3213669955730438, "learning_rate": 2.803448275862069e-05, "loss": 0.0922, "step": 3822 }, { "epoch": 4.431500398464102, "grad_norm": 1.2875010967254639, "learning_rate": 2.802873563218391e-05, "loss": 0.1089, "step": 3823 }, { "epoch": 4.432659566760849, "grad_norm": 0.3360012173652649, "learning_rate": 2.8022988505747126e-05, "loss": 0.1059, "step": 3824 }, { "epoch": 4.433818735057596, "grad_norm": 0.31028974056243896, "learning_rate": 2.8017241379310345e-05, "loss": 0.1003, "step": 3825 }, { "epoch": 4.434977903354343, "grad_norm": 0.45090678334236145, "learning_rate": 2.8011494252873566e-05, "loss": 0.1056, "step": 3826 }, { "epoch": 4.43613707165109, "grad_norm": 0.29718369245529175, "learning_rate": 2.800574712643678e-05, "loss": 0.1012, "step": 3827 }, { "epoch": 4.4372962399478375, "grad_norm": 0.3447516858577728, "learning_rate": 2.8000000000000003e-05, "loss": 0.0984, "step": 3828 }, { "epoch": 4.438455408244584, "grad_norm": 0.3421769440174103, "learning_rate": 2.799425287356322e-05, "loss": 0.1022, "step": 3829 }, { "epoch": 4.439614576541332, "grad_norm": 0.4500944912433624, "learning_rate": 2.7988505747126436e-05, "loss": 0.1007, "step": 3830 }, { "epoch": 4.440773744838078, "grad_norm": 0.434008926153183, "learning_rate": 2.7982758620689658e-05, "loss": 0.1144, "step": 3831 }, { "epoch": 4.441932913134826, "grad_norm": 0.3270074129104614, "learning_rate": 2.7977011494252876e-05, "loss": 0.1051, "step": 3832 }, { "epoch": 4.443092081431573, "grad_norm": 0.37319257855415344, "learning_rate": 2.797126436781609e-05, "loss": 0.1048, "step": 3833 }, { "epoch": 4.44425124972832, "grad_norm": 0.3999885320663452, "learning_rate": 2.7965517241379313e-05, "loss": 0.1119, "step": 3834 }, { "epoch": 4.445410418025067, "grad_norm": 0.3219684362411499, "learning_rate": 2.795977011494253e-05, "loss": 0.1173, "step": 3835 }, { "epoch": 4.446569586321814, "grad_norm": 0.3253685534000397, "learning_rate": 2.7954022988505746e-05, "loss": 0.1012, "step": 3836 }, { "epoch": 4.447728754618561, "grad_norm": 0.3479521870613098, "learning_rate": 2.7948275862068968e-05, "loss": 0.0977, "step": 3837 }, { "epoch": 4.448887922915309, "grad_norm": 0.33220139145851135, "learning_rate": 2.7942528735632182e-05, "loss": 0.1052, "step": 3838 }, { "epoch": 4.450047091212055, "grad_norm": 0.3189479410648346, "learning_rate": 2.7936781609195408e-05, "loss": 0.0937, "step": 3839 }, { "epoch": 4.451206259508803, "grad_norm": 0.34778445959091187, "learning_rate": 2.7931034482758622e-05, "loss": 0.1142, "step": 3840 }, { "epoch": 4.45236542780555, "grad_norm": 0.4988318979740143, "learning_rate": 2.7925287356321837e-05, "loss": 0.1025, "step": 3841 }, { "epoch": 4.453524596102296, "grad_norm": 0.41545069217681885, "learning_rate": 2.791954022988506e-05, "loss": 0.1075, "step": 3842 }, { "epoch": 4.454683764399044, "grad_norm": 0.38228297233581543, "learning_rate": 2.7913793103448277e-05, "loss": 0.1112, "step": 3843 }, { "epoch": 4.4558429326957905, "grad_norm": 0.34965816140174866, "learning_rate": 2.7908045977011492e-05, "loss": 0.1088, "step": 3844 }, { "epoch": 4.457002100992538, "grad_norm": 0.32655954360961914, "learning_rate": 2.7902298850574714e-05, "loss": 0.1071, "step": 3845 }, { "epoch": 4.458161269289285, "grad_norm": 0.35616791248321533, "learning_rate": 2.7896551724137932e-05, "loss": 0.1095, "step": 3846 }, { "epoch": 4.459320437586032, "grad_norm": 0.4080839157104492, "learning_rate": 2.7890804597701154e-05, "loss": 0.1029, "step": 3847 }, { "epoch": 4.460479605882779, "grad_norm": 0.36598852276802063, "learning_rate": 2.788505747126437e-05, "loss": 0.0919, "step": 3848 }, { "epoch": 4.4616387741795265, "grad_norm": 0.3768613040447235, "learning_rate": 2.7879310344827587e-05, "loss": 0.1065, "step": 3849 }, { "epoch": 4.462797942476273, "grad_norm": 0.3174971044063568, "learning_rate": 2.787356321839081e-05, "loss": 0.1003, "step": 3850 }, { "epoch": 4.463957110773021, "grad_norm": 0.34315162897109985, "learning_rate": 2.7867816091954024e-05, "loss": 0.1042, "step": 3851 }, { "epoch": 4.465116279069767, "grad_norm": 0.29958537220954895, "learning_rate": 2.7862068965517242e-05, "loss": 0.0927, "step": 3852 }, { "epoch": 4.466275447366515, "grad_norm": 0.40852096676826477, "learning_rate": 2.7856321839080464e-05, "loss": 0.1002, "step": 3853 }, { "epoch": 4.467434615663262, "grad_norm": 0.32836470007896423, "learning_rate": 2.785057471264368e-05, "loss": 0.1069, "step": 3854 }, { "epoch": 4.468593783960008, "grad_norm": 0.37109801173210144, "learning_rate": 2.7844827586206897e-05, "loss": 0.1069, "step": 3855 }, { "epoch": 4.469752952256756, "grad_norm": 0.4152434170246124, "learning_rate": 2.783908045977012e-05, "loss": 0.1026, "step": 3856 }, { "epoch": 4.4709121205535025, "grad_norm": 0.31486672163009644, "learning_rate": 2.7833333333333333e-05, "loss": 0.0983, "step": 3857 }, { "epoch": 4.47207128885025, "grad_norm": 0.2974405288696289, "learning_rate": 2.7827586206896555e-05, "loss": 0.0975, "step": 3858 }, { "epoch": 4.473230457146997, "grad_norm": 0.3772009015083313, "learning_rate": 2.7821839080459773e-05, "loss": 0.1035, "step": 3859 }, { "epoch": 4.474389625443744, "grad_norm": 0.3580301105976105, "learning_rate": 2.7816091954022988e-05, "loss": 0.1001, "step": 3860 }, { "epoch": 4.475548793740491, "grad_norm": 0.31176498532295227, "learning_rate": 2.781034482758621e-05, "loss": 0.0991, "step": 3861 }, { "epoch": 4.4767079620372385, "grad_norm": 0.36399632692337036, "learning_rate": 2.7804597701149425e-05, "loss": 0.1106, "step": 3862 }, { "epoch": 4.477867130333985, "grad_norm": 0.36921966075897217, "learning_rate": 2.7798850574712643e-05, "loss": 0.1045, "step": 3863 }, { "epoch": 4.479026298630733, "grad_norm": 0.28541839122772217, "learning_rate": 2.7793103448275865e-05, "loss": 0.0879, "step": 3864 }, { "epoch": 4.480185466927479, "grad_norm": 0.5838915109634399, "learning_rate": 2.778735632183908e-05, "loss": 0.1098, "step": 3865 }, { "epoch": 4.481344635224227, "grad_norm": 0.32612380385398865, "learning_rate": 2.77816091954023e-05, "loss": 0.107, "step": 3866 }, { "epoch": 4.482503803520974, "grad_norm": 0.33638033270835876, "learning_rate": 2.777586206896552e-05, "loss": 0.101, "step": 3867 }, { "epoch": 4.48366297181772, "grad_norm": 0.40769678354263306, "learning_rate": 2.7770114942528735e-05, "loss": 0.107, "step": 3868 }, { "epoch": 4.484822140114468, "grad_norm": 0.3535826504230499, "learning_rate": 2.7764367816091956e-05, "loss": 0.1087, "step": 3869 }, { "epoch": 4.485981308411215, "grad_norm": 0.3710591495037079, "learning_rate": 2.7758620689655175e-05, "loss": 0.1045, "step": 3870 }, { "epoch": 4.487140476707962, "grad_norm": 0.3905164301395416, "learning_rate": 2.775287356321839e-05, "loss": 0.1059, "step": 3871 }, { "epoch": 4.488299645004709, "grad_norm": 0.27448105812072754, "learning_rate": 2.774712643678161e-05, "loss": 0.1021, "step": 3872 }, { "epoch": 4.489458813301456, "grad_norm": 0.3772237002849579, "learning_rate": 2.774137931034483e-05, "loss": 0.1088, "step": 3873 }, { "epoch": 4.490617981598203, "grad_norm": 0.4715537130832672, "learning_rate": 2.7735632183908044e-05, "loss": 0.1001, "step": 3874 }, { "epoch": 4.491777149894951, "grad_norm": 0.4609696865081787, "learning_rate": 2.7729885057471266e-05, "loss": 0.1155, "step": 3875 }, { "epoch": 4.492936318191697, "grad_norm": 0.368083119392395, "learning_rate": 2.7724137931034484e-05, "loss": 0.1018, "step": 3876 }, { "epoch": 4.494095486488445, "grad_norm": 0.40971893072128296, "learning_rate": 2.7718390804597706e-05, "loss": 0.0994, "step": 3877 }, { "epoch": 4.4952546547851915, "grad_norm": 0.40606266260147095, "learning_rate": 2.771264367816092e-05, "loss": 0.1162, "step": 3878 }, { "epoch": 4.496413823081939, "grad_norm": 0.3284454047679901, "learning_rate": 2.7706896551724136e-05, "loss": 0.1001, "step": 3879 }, { "epoch": 4.497572991378686, "grad_norm": 0.3405894935131073, "learning_rate": 2.770114942528736e-05, "loss": 0.089, "step": 3880 }, { "epoch": 4.498732159675433, "grad_norm": 0.44441795349121094, "learning_rate": 2.7695402298850576e-05, "loss": 0.1161, "step": 3881 }, { "epoch": 4.49989132797218, "grad_norm": 0.3282816410064697, "learning_rate": 2.768965517241379e-05, "loss": 0.1014, "step": 3882 }, { "epoch": 4.5010504962689275, "grad_norm": 0.31686845421791077, "learning_rate": 2.7683908045977012e-05, "loss": 0.1002, "step": 3883 }, { "epoch": 4.502209664565674, "grad_norm": 0.33207643032073975, "learning_rate": 2.767816091954023e-05, "loss": 0.0948, "step": 3884 }, { "epoch": 4.503368832862421, "grad_norm": 0.3429032564163208, "learning_rate": 2.7672413793103452e-05, "loss": 0.1001, "step": 3885 }, { "epoch": 4.504528001159168, "grad_norm": 0.333465576171875, "learning_rate": 2.7666666666666667e-05, "loss": 0.1043, "step": 3886 }, { "epoch": 4.505687169455915, "grad_norm": 0.3548075556755066, "learning_rate": 2.7660919540229885e-05, "loss": 0.1056, "step": 3887 }, { "epoch": 4.506846337752663, "grad_norm": 0.41525760293006897, "learning_rate": 2.7655172413793107e-05, "loss": 0.1086, "step": 3888 }, { "epoch": 4.508005506049409, "grad_norm": 0.30972954630851746, "learning_rate": 2.7649425287356322e-05, "loss": 0.0972, "step": 3889 }, { "epoch": 4.509164674346157, "grad_norm": 0.46984899044036865, "learning_rate": 2.764367816091954e-05, "loss": 0.1082, "step": 3890 }, { "epoch": 4.5103238426429035, "grad_norm": 0.3709583580493927, "learning_rate": 2.7637931034482762e-05, "loss": 0.1007, "step": 3891 }, { "epoch": 4.511483010939651, "grad_norm": 0.45104119181632996, "learning_rate": 2.7632183908045977e-05, "loss": 0.1131, "step": 3892 }, { "epoch": 4.512642179236398, "grad_norm": 0.311952143907547, "learning_rate": 2.7626436781609195e-05, "loss": 0.1021, "step": 3893 }, { "epoch": 4.513801347533145, "grad_norm": 0.3778269588947296, "learning_rate": 2.7620689655172417e-05, "loss": 0.1046, "step": 3894 }, { "epoch": 4.514960515829892, "grad_norm": 0.3810199797153473, "learning_rate": 2.7614942528735632e-05, "loss": 0.1044, "step": 3895 }, { "epoch": 4.5161196841266396, "grad_norm": 0.3130066394805908, "learning_rate": 2.7609195402298853e-05, "loss": 0.1037, "step": 3896 }, { "epoch": 4.517278852423386, "grad_norm": 0.32273080945014954, "learning_rate": 2.7603448275862072e-05, "loss": 0.0984, "step": 3897 }, { "epoch": 4.518438020720133, "grad_norm": 0.4851730763912201, "learning_rate": 2.7597701149425287e-05, "loss": 0.1088, "step": 3898 }, { "epoch": 4.5195971890168805, "grad_norm": 0.30688047409057617, "learning_rate": 2.759195402298851e-05, "loss": 0.0997, "step": 3899 }, { "epoch": 4.520756357313627, "grad_norm": 0.29143083095550537, "learning_rate": 2.7586206896551727e-05, "loss": 0.1066, "step": 3900 }, { "epoch": 4.521915525610375, "grad_norm": 0.4881395697593689, "learning_rate": 2.758045977011494e-05, "loss": 0.103, "step": 3901 }, { "epoch": 4.523074693907121, "grad_norm": 0.2735028862953186, "learning_rate": 2.7574712643678163e-05, "loss": 0.093, "step": 3902 }, { "epoch": 4.524233862203869, "grad_norm": 0.3494533598423004, "learning_rate": 2.7568965517241378e-05, "loss": 0.1089, "step": 3903 }, { "epoch": 4.525393030500616, "grad_norm": 0.36798563599586487, "learning_rate": 2.7563218390804603e-05, "loss": 0.1049, "step": 3904 }, { "epoch": 4.526552198797363, "grad_norm": 0.3005774915218353, "learning_rate": 2.7557471264367818e-05, "loss": 0.1141, "step": 3905 }, { "epoch": 4.52771136709411, "grad_norm": 0.3312096893787384, "learning_rate": 2.7551724137931033e-05, "loss": 0.1006, "step": 3906 }, { "epoch": 4.528870535390857, "grad_norm": 0.3487379848957062, "learning_rate": 2.7545977011494255e-05, "loss": 0.1005, "step": 3907 }, { "epoch": 4.530029703687604, "grad_norm": 0.36595210433006287, "learning_rate": 2.7540229885057473e-05, "loss": 0.1016, "step": 3908 }, { "epoch": 4.531188871984352, "grad_norm": 0.2543398439884186, "learning_rate": 2.7534482758620688e-05, "loss": 0.0935, "step": 3909 }, { "epoch": 4.532348040281098, "grad_norm": 0.38063111901283264, "learning_rate": 2.752873563218391e-05, "loss": 0.106, "step": 3910 }, { "epoch": 4.533507208577845, "grad_norm": 0.37747347354888916, "learning_rate": 2.7522988505747128e-05, "loss": 0.1045, "step": 3911 }, { "epoch": 4.5346663768745925, "grad_norm": 0.4080142676830292, "learning_rate": 2.7517241379310343e-05, "loss": 0.1065, "step": 3912 }, { "epoch": 4.535825545171339, "grad_norm": 0.3274177014827728, "learning_rate": 2.7511494252873564e-05, "loss": 0.1012, "step": 3913 }, { "epoch": 4.536984713468087, "grad_norm": 0.2830631136894226, "learning_rate": 2.7505747126436783e-05, "loss": 0.1, "step": 3914 }, { "epoch": 4.538143881764833, "grad_norm": 0.27470821142196655, "learning_rate": 2.7500000000000004e-05, "loss": 0.0975, "step": 3915 }, { "epoch": 4.539303050061581, "grad_norm": 0.37115851044654846, "learning_rate": 2.749425287356322e-05, "loss": 0.1079, "step": 3916 }, { "epoch": 4.540462218358328, "grad_norm": 0.3276093900203705, "learning_rate": 2.7488505747126438e-05, "loss": 0.1043, "step": 3917 }, { "epoch": 4.541621386655075, "grad_norm": 0.3176075518131256, "learning_rate": 2.748275862068966e-05, "loss": 0.1021, "step": 3918 }, { "epoch": 4.542780554951822, "grad_norm": 0.32996928691864014, "learning_rate": 2.7477011494252874e-05, "loss": 0.095, "step": 3919 }, { "epoch": 4.543939723248569, "grad_norm": 0.3963630497455597, "learning_rate": 2.7471264367816092e-05, "loss": 0.1069, "step": 3920 }, { "epoch": 4.545098891545316, "grad_norm": 0.9190638065338135, "learning_rate": 2.7465517241379314e-05, "loss": 0.1153, "step": 3921 }, { "epoch": 4.546258059842064, "grad_norm": 0.3529224991798401, "learning_rate": 2.745977011494253e-05, "loss": 0.092, "step": 3922 }, { "epoch": 4.54741722813881, "grad_norm": 0.4127110540866852, "learning_rate": 2.745402298850575e-05, "loss": 0.1058, "step": 3923 }, { "epoch": 4.548576396435557, "grad_norm": 0.4567547142505646, "learning_rate": 2.744827586206897e-05, "loss": 0.1041, "step": 3924 }, { "epoch": 4.549735564732305, "grad_norm": 0.30119210481643677, "learning_rate": 2.7442528735632184e-05, "loss": 0.0998, "step": 3925 }, { "epoch": 4.550894733029052, "grad_norm": 0.3091825842857361, "learning_rate": 2.7436781609195406e-05, "loss": 0.0967, "step": 3926 }, { "epoch": 4.552053901325799, "grad_norm": 0.38511085510253906, "learning_rate": 2.743103448275862e-05, "loss": 0.1076, "step": 3927 }, { "epoch": 4.5532130696225455, "grad_norm": 0.33152782917022705, "learning_rate": 2.742528735632184e-05, "loss": 0.0974, "step": 3928 }, { "epoch": 4.554372237919293, "grad_norm": 0.35295701026916504, "learning_rate": 2.741954022988506e-05, "loss": 0.1044, "step": 3929 }, { "epoch": 4.55553140621604, "grad_norm": 0.40102672576904297, "learning_rate": 2.7413793103448275e-05, "loss": 0.1007, "step": 3930 }, { "epoch": 4.556690574512787, "grad_norm": 0.4231402575969696, "learning_rate": 2.7408045977011494e-05, "loss": 0.1146, "step": 3931 }, { "epoch": 4.557849742809534, "grad_norm": 0.43630826473236084, "learning_rate": 2.7402298850574715e-05, "loss": 0.1009, "step": 3932 }, { "epoch": 4.5590089111062815, "grad_norm": 0.2806771993637085, "learning_rate": 2.739655172413793e-05, "loss": 0.1037, "step": 3933 }, { "epoch": 4.560168079403028, "grad_norm": 0.3072310984134674, "learning_rate": 2.7390804597701152e-05, "loss": 0.0999, "step": 3934 }, { "epoch": 4.561327247699776, "grad_norm": 0.3378421366214752, "learning_rate": 2.738505747126437e-05, "loss": 0.1061, "step": 3935 }, { "epoch": 4.562486415996522, "grad_norm": 0.25815141201019287, "learning_rate": 2.7379310344827585e-05, "loss": 0.0916, "step": 3936 }, { "epoch": 4.56364558429327, "grad_norm": 0.3428052067756653, "learning_rate": 2.7373563218390807e-05, "loss": 0.1086, "step": 3937 }, { "epoch": 4.564804752590017, "grad_norm": 0.3696090579032898, "learning_rate": 2.7367816091954025e-05, "loss": 0.1022, "step": 3938 }, { "epoch": 4.565963920886764, "grad_norm": 0.31813839077949524, "learning_rate": 2.736206896551724e-05, "loss": 0.0936, "step": 3939 }, { "epoch": 4.567123089183511, "grad_norm": 0.40487226843833923, "learning_rate": 2.735632183908046e-05, "loss": 0.103, "step": 3940 }, { "epoch": 4.5682822574802575, "grad_norm": 0.3482483923435211, "learning_rate": 2.735057471264368e-05, "loss": 0.1005, "step": 3941 }, { "epoch": 4.569441425777005, "grad_norm": 0.7645699381828308, "learning_rate": 2.73448275862069e-05, "loss": 0.1048, "step": 3942 }, { "epoch": 4.570600594073752, "grad_norm": 0.2931617796421051, "learning_rate": 2.7339080459770116e-05, "loss": 0.0907, "step": 3943 }, { "epoch": 4.571759762370499, "grad_norm": 0.491020530462265, "learning_rate": 2.733333333333333e-05, "loss": 0.1099, "step": 3944 }, { "epoch": 4.572918930667246, "grad_norm": 0.28609660267829895, "learning_rate": 2.7327586206896556e-05, "loss": 0.104, "step": 3945 }, { "epoch": 4.5740780989639935, "grad_norm": 0.4021299183368683, "learning_rate": 2.732183908045977e-05, "loss": 0.1071, "step": 3946 }, { "epoch": 4.57523726726074, "grad_norm": 0.39661794900894165, "learning_rate": 2.7316091954022986e-05, "loss": 0.1019, "step": 3947 }, { "epoch": 4.576396435557488, "grad_norm": 0.3372730314731598, "learning_rate": 2.7310344827586208e-05, "loss": 0.1109, "step": 3948 }, { "epoch": 4.577555603854234, "grad_norm": 0.33836305141448975, "learning_rate": 2.7304597701149426e-05, "loss": 0.0944, "step": 3949 }, { "epoch": 4.578714772150982, "grad_norm": 0.3433241844177246, "learning_rate": 2.7298850574712648e-05, "loss": 0.1087, "step": 3950 }, { "epoch": 4.579873940447729, "grad_norm": 0.3279787600040436, "learning_rate": 2.7293103448275863e-05, "loss": 0.1091, "step": 3951 }, { "epoch": 4.581033108744476, "grad_norm": 0.3588603734970093, "learning_rate": 2.728735632183908e-05, "loss": 0.0953, "step": 3952 }, { "epoch": 4.582192277041223, "grad_norm": 0.2686539590358734, "learning_rate": 2.7281609195402303e-05, "loss": 0.099, "step": 3953 }, { "epoch": 4.58335144533797, "grad_norm": 0.42638182640075684, "learning_rate": 2.7275862068965518e-05, "loss": 0.1131, "step": 3954 }, { "epoch": 4.584510613634717, "grad_norm": 0.34486687183380127, "learning_rate": 2.7270114942528736e-05, "loss": 0.0993, "step": 3955 }, { "epoch": 4.585669781931464, "grad_norm": 0.3565883934497833, "learning_rate": 2.7264367816091958e-05, "loss": 0.1047, "step": 3956 }, { "epoch": 4.586828950228211, "grad_norm": 0.2582899332046509, "learning_rate": 2.7258620689655173e-05, "loss": 0.0902, "step": 3957 }, { "epoch": 4.587988118524958, "grad_norm": 0.3092981278896332, "learning_rate": 2.725287356321839e-05, "loss": 0.1102, "step": 3958 }, { "epoch": 4.589147286821706, "grad_norm": 0.41385966539382935, "learning_rate": 2.7247126436781613e-05, "loss": 0.0956, "step": 3959 }, { "epoch": 4.590306455118452, "grad_norm": 0.4211784303188324, "learning_rate": 2.7241379310344827e-05, "loss": 0.1025, "step": 3960 }, { "epoch": 4.5914656234152, "grad_norm": 0.3663957417011261, "learning_rate": 2.723563218390805e-05, "loss": 0.1041, "step": 3961 }, { "epoch": 4.5926247917119465, "grad_norm": 0.30467289686203003, "learning_rate": 2.7229885057471267e-05, "loss": 0.1013, "step": 3962 }, { "epoch": 4.593783960008694, "grad_norm": 0.41379502415657043, "learning_rate": 2.7224137931034482e-05, "loss": 0.1, "step": 3963 }, { "epoch": 4.594943128305441, "grad_norm": 0.31340333819389343, "learning_rate": 2.7218390804597704e-05, "loss": 0.0935, "step": 3964 }, { "epoch": 4.596102296602188, "grad_norm": 0.3276791274547577, "learning_rate": 2.7212643678160922e-05, "loss": 0.1058, "step": 3965 }, { "epoch": 4.597261464898935, "grad_norm": 0.4072125554084778, "learning_rate": 2.7206896551724137e-05, "loss": 0.0965, "step": 3966 }, { "epoch": 4.598420633195682, "grad_norm": 0.32723650336265564, "learning_rate": 2.720114942528736e-05, "loss": 0.1108, "step": 3967 }, { "epoch": 4.599579801492429, "grad_norm": 0.3264493942260742, "learning_rate": 2.7195402298850574e-05, "loss": 0.119, "step": 3968 }, { "epoch": 4.600738969789177, "grad_norm": 0.32702627778053284, "learning_rate": 2.71896551724138e-05, "loss": 0.1014, "step": 3969 }, { "epoch": 4.601898138085923, "grad_norm": 0.5226812958717346, "learning_rate": 2.7183908045977014e-05, "loss": 0.1009, "step": 3970 }, { "epoch": 4.60305730638267, "grad_norm": 0.3024195730686188, "learning_rate": 2.717816091954023e-05, "loss": 0.0955, "step": 3971 }, { "epoch": 4.604216474679418, "grad_norm": 0.3582437038421631, "learning_rate": 2.717241379310345e-05, "loss": 0.1102, "step": 3972 }, { "epoch": 4.605375642976164, "grad_norm": 0.38571134209632874, "learning_rate": 2.716666666666667e-05, "loss": 0.1012, "step": 3973 }, { "epoch": 4.606534811272912, "grad_norm": 0.30533286929130554, "learning_rate": 2.7160919540229883e-05, "loss": 0.1025, "step": 3974 }, { "epoch": 4.6076939795696585, "grad_norm": 0.38881656527519226, "learning_rate": 2.7155172413793105e-05, "loss": 0.0957, "step": 3975 }, { "epoch": 4.608853147866406, "grad_norm": 0.40366286039352417, "learning_rate": 2.7149425287356323e-05, "loss": 0.1178, "step": 3976 }, { "epoch": 4.610012316163153, "grad_norm": 0.3555791974067688, "learning_rate": 2.714367816091954e-05, "loss": 0.1039, "step": 3977 }, { "epoch": 4.6111714844599, "grad_norm": 0.405254602432251, "learning_rate": 2.713793103448276e-05, "loss": 0.1036, "step": 3978 }, { "epoch": 4.612330652756647, "grad_norm": 0.42055705189704895, "learning_rate": 2.713218390804598e-05, "loss": 0.1063, "step": 3979 }, { "epoch": 4.613489821053395, "grad_norm": 0.38437414169311523, "learning_rate": 2.71264367816092e-05, "loss": 0.1002, "step": 3980 }, { "epoch": 4.614648989350141, "grad_norm": 0.37615966796875, "learning_rate": 2.7120689655172415e-05, "loss": 0.101, "step": 3981 }, { "epoch": 4.615808157646889, "grad_norm": 0.363473117351532, "learning_rate": 2.7114942528735633e-05, "loss": 0.1121, "step": 3982 }, { "epoch": 4.6169673259436355, "grad_norm": 0.4437958896160126, "learning_rate": 2.7109195402298855e-05, "loss": 0.1137, "step": 3983 }, { "epoch": 4.618126494240382, "grad_norm": 0.2698152959346771, "learning_rate": 2.710344827586207e-05, "loss": 0.1011, "step": 3984 }, { "epoch": 4.61928566253713, "grad_norm": 0.360612154006958, "learning_rate": 2.7097701149425288e-05, "loss": 0.1123, "step": 3985 }, { "epoch": 4.620444830833876, "grad_norm": 0.43149617314338684, "learning_rate": 2.709195402298851e-05, "loss": 0.1087, "step": 3986 }, { "epoch": 4.621603999130624, "grad_norm": 0.37150686979293823, "learning_rate": 2.7086206896551725e-05, "loss": 0.0995, "step": 3987 }, { "epoch": 4.622763167427371, "grad_norm": 0.3497457802295685, "learning_rate": 2.7080459770114946e-05, "loss": 0.1014, "step": 3988 }, { "epoch": 4.623922335724118, "grad_norm": 0.33245381712913513, "learning_rate": 2.707471264367816e-05, "loss": 0.1003, "step": 3989 }, { "epoch": 4.625081504020865, "grad_norm": 0.44494423270225525, "learning_rate": 2.706896551724138e-05, "loss": 0.102, "step": 3990 }, { "epoch": 4.626240672317612, "grad_norm": 0.38262462615966797, "learning_rate": 2.70632183908046e-05, "loss": 0.1171, "step": 3991 }, { "epoch": 4.627399840614359, "grad_norm": 0.4167878329753876, "learning_rate": 2.7057471264367816e-05, "loss": 0.1033, "step": 3992 }, { "epoch": 4.628559008911107, "grad_norm": 0.39638566970825195, "learning_rate": 2.7051724137931034e-05, "loss": 0.0931, "step": 3993 }, { "epoch": 4.629718177207853, "grad_norm": 0.45487165451049805, "learning_rate": 2.7045977011494256e-05, "loss": 0.1104, "step": 3994 }, { "epoch": 4.630877345504601, "grad_norm": 0.42852312326431274, "learning_rate": 2.704022988505747e-05, "loss": 0.0995, "step": 3995 }, { "epoch": 4.6320365138013475, "grad_norm": 0.388959139585495, "learning_rate": 2.703448275862069e-05, "loss": 0.1152, "step": 3996 }, { "epoch": 4.633195682098094, "grad_norm": 0.33843693137168884, "learning_rate": 2.702873563218391e-05, "loss": 0.1018, "step": 3997 }, { "epoch": 4.634354850394842, "grad_norm": 0.3359590768814087, "learning_rate": 2.7022988505747126e-05, "loss": 0.1009, "step": 3998 }, { "epoch": 4.635514018691588, "grad_norm": 0.29807549715042114, "learning_rate": 2.7017241379310348e-05, "loss": 0.0949, "step": 3999 }, { "epoch": 4.636673186988336, "grad_norm": 0.3521331250667572, "learning_rate": 2.7011494252873566e-05, "loss": 0.0987, "step": 4000 }, { "epoch": 4.637832355285083, "grad_norm": 0.32744914293289185, "learning_rate": 2.700574712643678e-05, "loss": 0.1068, "step": 4001 }, { "epoch": 4.63899152358183, "grad_norm": 0.33140355348587036, "learning_rate": 2.7000000000000002e-05, "loss": 0.0941, "step": 4002 }, { "epoch": 4.640150691878577, "grad_norm": 0.35607364773750305, "learning_rate": 2.699425287356322e-05, "loss": 0.1128, "step": 4003 }, { "epoch": 4.641309860175324, "grad_norm": 0.28557470440864563, "learning_rate": 2.6988505747126436e-05, "loss": 0.1044, "step": 4004 }, { "epoch": 4.642469028472071, "grad_norm": 0.4250648617744446, "learning_rate": 2.6982758620689657e-05, "loss": 0.1022, "step": 4005 }, { "epoch": 4.643628196768819, "grad_norm": 0.2685200273990631, "learning_rate": 2.6977011494252876e-05, "loss": 0.0965, "step": 4006 }, { "epoch": 4.644787365065565, "grad_norm": 0.3097551465034485, "learning_rate": 2.6971264367816097e-05, "loss": 0.11, "step": 4007 }, { "epoch": 4.645946533362313, "grad_norm": 0.3279826045036316, "learning_rate": 2.6965517241379312e-05, "loss": 0.1003, "step": 4008 }, { "epoch": 4.64710570165906, "grad_norm": 0.3129337430000305, "learning_rate": 2.6959770114942527e-05, "loss": 0.1062, "step": 4009 }, { "epoch": 4.648264869955806, "grad_norm": 0.27965065836906433, "learning_rate": 2.6954022988505752e-05, "loss": 0.1017, "step": 4010 }, { "epoch": 4.649424038252554, "grad_norm": 0.3252604007720947, "learning_rate": 2.6948275862068967e-05, "loss": 0.0935, "step": 4011 }, { "epoch": 4.650583206549301, "grad_norm": 0.32502391934394836, "learning_rate": 2.6942528735632182e-05, "loss": 0.1059, "step": 4012 }, { "epoch": 4.651742374846048, "grad_norm": 0.3370577394962311, "learning_rate": 2.6936781609195404e-05, "loss": 0.0959, "step": 4013 }, { "epoch": 4.652901543142795, "grad_norm": 0.3003638684749603, "learning_rate": 2.6931034482758622e-05, "loss": 0.0969, "step": 4014 }, { "epoch": 4.654060711439542, "grad_norm": 0.35102033615112305, "learning_rate": 2.6925287356321837e-05, "loss": 0.104, "step": 4015 }, { "epoch": 4.655219879736289, "grad_norm": 0.355948269367218, "learning_rate": 2.691954022988506e-05, "loss": 0.1125, "step": 4016 }, { "epoch": 4.6563790480330365, "grad_norm": 0.526878833770752, "learning_rate": 2.6913793103448277e-05, "loss": 0.1195, "step": 4017 }, { "epoch": 4.657538216329783, "grad_norm": 0.3807942867279053, "learning_rate": 2.69080459770115e-05, "loss": 0.1092, "step": 4018 }, { "epoch": 4.658697384626531, "grad_norm": 0.36606287956237793, "learning_rate": 2.6902298850574713e-05, "loss": 0.1061, "step": 4019 }, { "epoch": 4.659856552923277, "grad_norm": 0.4438146650791168, "learning_rate": 2.689655172413793e-05, "loss": 0.1059, "step": 4020 }, { "epoch": 4.661015721220025, "grad_norm": 0.3968319296836853, "learning_rate": 2.6890804597701153e-05, "loss": 0.0976, "step": 4021 }, { "epoch": 4.662174889516772, "grad_norm": 0.34883856773376465, "learning_rate": 2.6885057471264368e-05, "loss": 0.1019, "step": 4022 }, { "epoch": 4.663334057813519, "grad_norm": 0.5616782903671265, "learning_rate": 2.6879310344827586e-05, "loss": 0.1028, "step": 4023 }, { "epoch": 4.664493226110266, "grad_norm": 0.3588404953479767, "learning_rate": 2.6873563218390808e-05, "loss": 0.0995, "step": 4024 }, { "epoch": 4.665652394407013, "grad_norm": 0.3863297998905182, "learning_rate": 2.6867816091954023e-05, "loss": 0.0863, "step": 4025 }, { "epoch": 4.66681156270376, "grad_norm": 0.2935449481010437, "learning_rate": 2.6862068965517245e-05, "loss": 0.0876, "step": 4026 }, { "epoch": 4.667970731000507, "grad_norm": 0.3793972432613373, "learning_rate": 2.6856321839080463e-05, "loss": 0.095, "step": 4027 }, { "epoch": 4.669129899297254, "grad_norm": 1.2369050979614258, "learning_rate": 2.6850574712643678e-05, "loss": 0.1036, "step": 4028 }, { "epoch": 4.670289067594001, "grad_norm": 0.4123905599117279, "learning_rate": 2.68448275862069e-05, "loss": 0.0965, "step": 4029 }, { "epoch": 4.6714482358907485, "grad_norm": 0.4004489481449127, "learning_rate": 2.6839080459770118e-05, "loss": 0.1136, "step": 4030 }, { "epoch": 4.672607404187495, "grad_norm": 0.37782585620880127, "learning_rate": 2.6833333333333333e-05, "loss": 0.1215, "step": 4031 }, { "epoch": 4.673766572484243, "grad_norm": 0.49208974838256836, "learning_rate": 2.6827586206896554e-05, "loss": 0.1135, "step": 4032 }, { "epoch": 4.674925740780989, "grad_norm": 0.38141676783561707, "learning_rate": 2.682183908045977e-05, "loss": 0.0966, "step": 4033 }, { "epoch": 4.676084909077737, "grad_norm": 0.3110532760620117, "learning_rate": 2.6816091954022988e-05, "loss": 0.1001, "step": 4034 }, { "epoch": 4.677244077374484, "grad_norm": 0.38810020685195923, "learning_rate": 2.681034482758621e-05, "loss": 0.1021, "step": 4035 }, { "epoch": 4.678403245671231, "grad_norm": 0.6681183576583862, "learning_rate": 2.6804597701149424e-05, "loss": 0.1168, "step": 4036 }, { "epoch": 4.679562413967978, "grad_norm": 0.28798437118530273, "learning_rate": 2.6798850574712646e-05, "loss": 0.1115, "step": 4037 }, { "epoch": 4.6807215822647255, "grad_norm": 0.43260279297828674, "learning_rate": 2.6793103448275864e-05, "loss": 0.1064, "step": 4038 }, { "epoch": 4.681880750561472, "grad_norm": 0.3560652732849121, "learning_rate": 2.678735632183908e-05, "loss": 0.0963, "step": 4039 }, { "epoch": 4.683039918858219, "grad_norm": 0.38259580731391907, "learning_rate": 2.67816091954023e-05, "loss": 0.1062, "step": 4040 }, { "epoch": 4.684199087154966, "grad_norm": 0.26703354716300964, "learning_rate": 2.677586206896552e-05, "loss": 0.0953, "step": 4041 }, { "epoch": 4.685358255451713, "grad_norm": 0.3569488227367401, "learning_rate": 2.6770114942528734e-05, "loss": 0.1066, "step": 4042 }, { "epoch": 4.686517423748461, "grad_norm": 0.3421025574207306, "learning_rate": 2.6764367816091956e-05, "loss": 0.104, "step": 4043 }, { "epoch": 4.687676592045207, "grad_norm": 0.31309759616851807, "learning_rate": 2.6758620689655174e-05, "loss": 0.1047, "step": 4044 }, { "epoch": 4.688835760341955, "grad_norm": 0.6280311346054077, "learning_rate": 2.6752873563218396e-05, "loss": 0.1096, "step": 4045 }, { "epoch": 4.6899949286387015, "grad_norm": 0.36928072571754456, "learning_rate": 2.674712643678161e-05, "loss": 0.1067, "step": 4046 }, { "epoch": 4.691154096935449, "grad_norm": 0.2873797118663788, "learning_rate": 2.674137931034483e-05, "loss": 0.102, "step": 4047 }, { "epoch": 4.692313265232196, "grad_norm": 0.3734436631202698, "learning_rate": 2.673563218390805e-05, "loss": 0.116, "step": 4048 }, { "epoch": 4.693472433528943, "grad_norm": 0.33200186491012573, "learning_rate": 2.6729885057471265e-05, "loss": 0.0983, "step": 4049 }, { "epoch": 4.69463160182569, "grad_norm": 0.3504514992237091, "learning_rate": 2.672413793103448e-05, "loss": 0.1072, "step": 4050 }, { "epoch": 4.6957907701224375, "grad_norm": 0.4115615487098694, "learning_rate": 2.6718390804597705e-05, "loss": 0.1073, "step": 4051 }, { "epoch": 4.696949938419184, "grad_norm": 0.35126233100891113, "learning_rate": 2.671264367816092e-05, "loss": 0.1048, "step": 4052 }, { "epoch": 4.698109106715931, "grad_norm": 0.2556185722351074, "learning_rate": 2.6706896551724135e-05, "loss": 0.1022, "step": 4053 }, { "epoch": 4.699268275012678, "grad_norm": 0.38733574748039246, "learning_rate": 2.6701149425287357e-05, "loss": 0.1198, "step": 4054 }, { "epoch": 4.700427443309426, "grad_norm": 0.45527157187461853, "learning_rate": 2.6695402298850575e-05, "loss": 0.1059, "step": 4055 }, { "epoch": 4.701586611606173, "grad_norm": 0.5728080868721008, "learning_rate": 2.6689655172413797e-05, "loss": 0.0996, "step": 4056 }, { "epoch": 4.702745779902919, "grad_norm": 0.3340456187725067, "learning_rate": 2.6683908045977012e-05, "loss": 0.108, "step": 4057 }, { "epoch": 4.703904948199667, "grad_norm": 0.35559847950935364, "learning_rate": 2.667816091954023e-05, "loss": 0.1056, "step": 4058 }, { "epoch": 4.7050641164964135, "grad_norm": 0.2684958279132843, "learning_rate": 2.667241379310345e-05, "loss": 0.0996, "step": 4059 }, { "epoch": 4.706223284793161, "grad_norm": 0.31825652718544006, "learning_rate": 2.6666666666666667e-05, "loss": 0.1091, "step": 4060 }, { "epoch": 4.707382453089908, "grad_norm": 0.3788098990917206, "learning_rate": 2.6660919540229885e-05, "loss": 0.0985, "step": 4061 }, { "epoch": 4.708541621386655, "grad_norm": 0.31583863496780396, "learning_rate": 2.6655172413793107e-05, "loss": 0.1064, "step": 4062 }, { "epoch": 4.709700789683402, "grad_norm": 0.5462097525596619, "learning_rate": 2.664942528735632e-05, "loss": 0.0976, "step": 4063 }, { "epoch": 4.71085995798015, "grad_norm": 0.4478006660938263, "learning_rate": 2.6643678160919543e-05, "loss": 0.1024, "step": 4064 }, { "epoch": 4.712019126276896, "grad_norm": 0.41845348477363586, "learning_rate": 2.663793103448276e-05, "loss": 0.1003, "step": 4065 }, { "epoch": 4.713178294573644, "grad_norm": 0.3364732265472412, "learning_rate": 2.6632183908045976e-05, "loss": 0.0996, "step": 4066 }, { "epoch": 4.7143374628703905, "grad_norm": 0.3141407370567322, "learning_rate": 2.6626436781609198e-05, "loss": 0.0995, "step": 4067 }, { "epoch": 4.715496631167138, "grad_norm": 0.2963089942932129, "learning_rate": 2.6620689655172416e-05, "loss": 0.1043, "step": 4068 }, { "epoch": 4.716655799463885, "grad_norm": 0.3780442774295807, "learning_rate": 2.661494252873563e-05, "loss": 0.1161, "step": 4069 }, { "epoch": 4.717814967760631, "grad_norm": 0.4881404936313629, "learning_rate": 2.6609195402298853e-05, "loss": 0.1049, "step": 4070 }, { "epoch": 4.718974136057379, "grad_norm": 0.39198312163352966, "learning_rate": 2.660344827586207e-05, "loss": 0.1134, "step": 4071 }, { "epoch": 4.720133304354126, "grad_norm": 0.35181596875190735, "learning_rate": 2.6597701149425286e-05, "loss": 0.1013, "step": 4072 }, { "epoch": 4.721292472650873, "grad_norm": 0.3244793713092804, "learning_rate": 2.6591954022988508e-05, "loss": 0.098, "step": 4073 }, { "epoch": 4.72245164094762, "grad_norm": 0.3857901692390442, "learning_rate": 2.6586206896551723e-05, "loss": 0.1198, "step": 4074 }, { "epoch": 4.723610809244367, "grad_norm": 0.3005591034889221, "learning_rate": 2.6580459770114948e-05, "loss": 0.0984, "step": 4075 }, { "epoch": 4.724769977541114, "grad_norm": 0.34256380796432495, "learning_rate": 2.6574712643678163e-05, "loss": 0.106, "step": 4076 }, { "epoch": 4.725929145837862, "grad_norm": 0.31626880168914795, "learning_rate": 2.6568965517241378e-05, "loss": 0.1079, "step": 4077 }, { "epoch": 4.727088314134608, "grad_norm": 0.3006041944026947, "learning_rate": 2.65632183908046e-05, "loss": 0.1084, "step": 4078 }, { "epoch": 4.728247482431356, "grad_norm": 0.31316351890563965, "learning_rate": 2.6557471264367817e-05, "loss": 0.1048, "step": 4079 }, { "epoch": 4.7294066507281025, "grad_norm": 0.30015018582344055, "learning_rate": 2.6551724137931032e-05, "loss": 0.1159, "step": 4080 }, { "epoch": 4.73056581902485, "grad_norm": 0.3358341455459595, "learning_rate": 2.6545977011494254e-05, "loss": 0.1077, "step": 4081 }, { "epoch": 4.731724987321597, "grad_norm": 0.3661000728607178, "learning_rate": 2.6540229885057472e-05, "loss": 0.1164, "step": 4082 }, { "epoch": 4.732884155618343, "grad_norm": 0.30234283208847046, "learning_rate": 2.6534482758620694e-05, "loss": 0.1018, "step": 4083 }, { "epoch": 4.734043323915091, "grad_norm": 0.29934728145599365, "learning_rate": 2.652873563218391e-05, "loss": 0.1044, "step": 4084 }, { "epoch": 4.735202492211838, "grad_norm": 0.3741641640663147, "learning_rate": 2.6522988505747127e-05, "loss": 0.1084, "step": 4085 }, { "epoch": 4.736361660508585, "grad_norm": 0.35596325993537903, "learning_rate": 2.651724137931035e-05, "loss": 0.1069, "step": 4086 }, { "epoch": 4.737520828805332, "grad_norm": 0.3338968753814697, "learning_rate": 2.6511494252873564e-05, "loss": 0.1055, "step": 4087 }, { "epoch": 4.738679997102079, "grad_norm": 0.290626585483551, "learning_rate": 2.6505747126436782e-05, "loss": 0.0997, "step": 4088 }, { "epoch": 4.739839165398826, "grad_norm": 0.6464786529541016, "learning_rate": 2.6500000000000004e-05, "loss": 0.1179, "step": 4089 }, { "epoch": 4.740998333695574, "grad_norm": 0.5353714227676392, "learning_rate": 2.649425287356322e-05, "loss": 0.1145, "step": 4090 }, { "epoch": 4.74215750199232, "grad_norm": 0.39538970589637756, "learning_rate": 2.6488505747126437e-05, "loss": 0.0998, "step": 4091 }, { "epoch": 4.743316670289068, "grad_norm": 0.36018985509872437, "learning_rate": 2.648275862068966e-05, "loss": 0.1087, "step": 4092 }, { "epoch": 4.744475838585815, "grad_norm": 0.4141652286052704, "learning_rate": 2.6477011494252874e-05, "loss": 0.1129, "step": 4093 }, { "epoch": 4.745635006882562, "grad_norm": 0.4767494201660156, "learning_rate": 2.6471264367816095e-05, "loss": 0.116, "step": 4094 }, { "epoch": 4.746794175179309, "grad_norm": 0.37823235988616943, "learning_rate": 2.646551724137931e-05, "loss": 0.1107, "step": 4095 }, { "epoch": 4.7479533434760555, "grad_norm": 0.3563480079174042, "learning_rate": 2.645977011494253e-05, "loss": 0.1083, "step": 4096 }, { "epoch": 4.749112511772803, "grad_norm": 0.27948468923568726, "learning_rate": 2.645402298850575e-05, "loss": 0.1067, "step": 4097 }, { "epoch": 4.750271680069551, "grad_norm": 0.311713844537735, "learning_rate": 2.6448275862068965e-05, "loss": 0.1077, "step": 4098 }, { "epoch": 4.751430848366297, "grad_norm": 0.2752847969532013, "learning_rate": 2.6442528735632183e-05, "loss": 0.0956, "step": 4099 }, { "epoch": 4.752590016663044, "grad_norm": 0.3526310622692108, "learning_rate": 2.6436781609195405e-05, "loss": 0.1152, "step": 4100 }, { "epoch": 4.7537491849597915, "grad_norm": 0.38243040442466736, "learning_rate": 2.643103448275862e-05, "loss": 0.1043, "step": 4101 }, { "epoch": 4.754908353256538, "grad_norm": 0.32026922702789307, "learning_rate": 2.642528735632184e-05, "loss": 0.1129, "step": 4102 }, { "epoch": 4.756067521553286, "grad_norm": 0.2896019220352173, "learning_rate": 2.641954022988506e-05, "loss": 0.0993, "step": 4103 }, { "epoch": 4.757226689850032, "grad_norm": 0.33304235339164734, "learning_rate": 2.6413793103448275e-05, "loss": 0.1097, "step": 4104 }, { "epoch": 4.75838585814678, "grad_norm": 0.2931871712207794, "learning_rate": 2.6408045977011496e-05, "loss": 0.0938, "step": 4105 }, { "epoch": 4.759545026443527, "grad_norm": 0.39289239048957825, "learning_rate": 2.6402298850574715e-05, "loss": 0.116, "step": 4106 }, { "epoch": 4.760704194740274, "grad_norm": 0.45948904752731323, "learning_rate": 2.639655172413793e-05, "loss": 0.1125, "step": 4107 }, { "epoch": 4.761863363037021, "grad_norm": 0.3527555465698242, "learning_rate": 2.639080459770115e-05, "loss": 0.0992, "step": 4108 }, { "epoch": 4.763022531333768, "grad_norm": 0.3574606478214264, "learning_rate": 2.638505747126437e-05, "loss": 0.1097, "step": 4109 }, { "epoch": 4.764181699630515, "grad_norm": 0.41160982847213745, "learning_rate": 2.637931034482759e-05, "loss": 0.0972, "step": 4110 }, { "epoch": 4.765340867927263, "grad_norm": 0.5347371697425842, "learning_rate": 2.6373563218390806e-05, "loss": 0.101, "step": 4111 }, { "epoch": 4.766500036224009, "grad_norm": 0.3305839002132416, "learning_rate": 2.6367816091954024e-05, "loss": 0.0983, "step": 4112 }, { "epoch": 4.767659204520756, "grad_norm": 0.4040708541870117, "learning_rate": 2.6362068965517246e-05, "loss": 0.1072, "step": 4113 }, { "epoch": 4.7688183728175035, "grad_norm": 0.3733755648136139, "learning_rate": 2.635632183908046e-05, "loss": 0.0923, "step": 4114 }, { "epoch": 4.76997754111425, "grad_norm": 0.4366565942764282, "learning_rate": 2.6350574712643676e-05, "loss": 0.0999, "step": 4115 }, { "epoch": 4.771136709410998, "grad_norm": 0.3389378786087036, "learning_rate": 2.63448275862069e-05, "loss": 0.1016, "step": 4116 }, { "epoch": 4.772295877707744, "grad_norm": 0.2860844135284424, "learning_rate": 2.6339080459770116e-05, "loss": 0.0976, "step": 4117 }, { "epoch": 4.773455046004492, "grad_norm": 0.4258897304534912, "learning_rate": 2.633333333333333e-05, "loss": 0.105, "step": 4118 }, { "epoch": 4.774614214301239, "grad_norm": 0.36527156829833984, "learning_rate": 2.6327586206896552e-05, "loss": 0.1089, "step": 4119 }, { "epoch": 4.775773382597986, "grad_norm": 0.4984228312969208, "learning_rate": 2.632183908045977e-05, "loss": 0.1178, "step": 4120 }, { "epoch": 4.776932550894733, "grad_norm": 0.3134072721004486, "learning_rate": 2.6316091954022992e-05, "loss": 0.0994, "step": 4121 }, { "epoch": 4.7780917191914805, "grad_norm": 0.28975674510002136, "learning_rate": 2.6310344827586207e-05, "loss": 0.112, "step": 4122 }, { "epoch": 4.779250887488227, "grad_norm": 0.4431288242340088, "learning_rate": 2.6304597701149426e-05, "loss": 0.1108, "step": 4123 }, { "epoch": 4.780410055784975, "grad_norm": 0.3906228542327881, "learning_rate": 2.6298850574712647e-05, "loss": 0.1187, "step": 4124 }, { "epoch": 4.781569224081721, "grad_norm": 0.32078051567077637, "learning_rate": 2.6293103448275862e-05, "loss": 0.1081, "step": 4125 }, { "epoch": 4.782728392378468, "grad_norm": 0.34437334537506104, "learning_rate": 2.628735632183908e-05, "loss": 0.1045, "step": 4126 }, { "epoch": 4.783887560675216, "grad_norm": 0.37716683745384216, "learning_rate": 2.6281609195402302e-05, "loss": 0.1112, "step": 4127 }, { "epoch": 4.785046728971962, "grad_norm": 0.3816121220588684, "learning_rate": 2.6275862068965517e-05, "loss": 0.1015, "step": 4128 }, { "epoch": 4.78620589726871, "grad_norm": 0.30392956733703613, "learning_rate": 2.627011494252874e-05, "loss": 0.096, "step": 4129 }, { "epoch": 4.7873650655654565, "grad_norm": 0.37450024485588074, "learning_rate": 2.6264367816091957e-05, "loss": 0.1037, "step": 4130 }, { "epoch": 4.788524233862204, "grad_norm": 0.3201805055141449, "learning_rate": 2.6258620689655172e-05, "loss": 0.0916, "step": 4131 }, { "epoch": 4.789683402158951, "grad_norm": 0.35114505887031555, "learning_rate": 2.6252873563218394e-05, "loss": 0.0957, "step": 4132 }, { "epoch": 4.790842570455698, "grad_norm": 0.3193678855895996, "learning_rate": 2.6247126436781612e-05, "loss": 0.0976, "step": 4133 }, { "epoch": 4.792001738752445, "grad_norm": 0.3565538227558136, "learning_rate": 2.6241379310344827e-05, "loss": 0.0974, "step": 4134 }, { "epoch": 4.7931609070491925, "grad_norm": 0.425341933965683, "learning_rate": 2.623563218390805e-05, "loss": 0.1083, "step": 4135 }, { "epoch": 4.794320075345939, "grad_norm": 0.3331052362918854, "learning_rate": 2.6229885057471267e-05, "loss": 0.1, "step": 4136 }, { "epoch": 4.795479243642687, "grad_norm": 0.3157724142074585, "learning_rate": 2.6224137931034482e-05, "loss": 0.103, "step": 4137 }, { "epoch": 4.796638411939433, "grad_norm": 0.3534112572669983, "learning_rate": 2.6218390804597703e-05, "loss": 0.1032, "step": 4138 }, { "epoch": 4.79779758023618, "grad_norm": 0.2811379134654999, "learning_rate": 2.6212643678160918e-05, "loss": 0.0926, "step": 4139 }, { "epoch": 4.798956748532928, "grad_norm": 0.3642650246620178, "learning_rate": 2.620689655172414e-05, "loss": 0.1109, "step": 4140 }, { "epoch": 4.800115916829675, "grad_norm": 0.4630174934864044, "learning_rate": 2.6201149425287358e-05, "loss": 0.1029, "step": 4141 }, { "epoch": 4.801275085126422, "grad_norm": 0.48048609495162964, "learning_rate": 2.6195402298850573e-05, "loss": 0.1111, "step": 4142 }, { "epoch": 4.8024342534231685, "grad_norm": 0.47064340114593506, "learning_rate": 2.6189655172413795e-05, "loss": 0.1048, "step": 4143 }, { "epoch": 4.803593421719916, "grad_norm": 0.30796119570732117, "learning_rate": 2.6183908045977013e-05, "loss": 0.1083, "step": 4144 }, { "epoch": 4.804752590016663, "grad_norm": 0.325536847114563, "learning_rate": 2.6178160919540228e-05, "loss": 0.1034, "step": 4145 }, { "epoch": 4.80591175831341, "grad_norm": 0.40425774455070496, "learning_rate": 2.617241379310345e-05, "loss": 0.106, "step": 4146 }, { "epoch": 4.807070926610157, "grad_norm": 0.33187299966812134, "learning_rate": 2.6166666666666668e-05, "loss": 0.1018, "step": 4147 }, { "epoch": 4.808230094906905, "grad_norm": 0.35282886028289795, "learning_rate": 2.616091954022989e-05, "loss": 0.1052, "step": 4148 }, { "epoch": 4.809389263203651, "grad_norm": 0.3740447163581848, "learning_rate": 2.6155172413793105e-05, "loss": 0.106, "step": 4149 }, { "epoch": 4.810548431500399, "grad_norm": 0.30205562710762024, "learning_rate": 2.6149425287356323e-05, "loss": 0.0938, "step": 4150 }, { "epoch": 4.8117075997971455, "grad_norm": 0.2860054671764374, "learning_rate": 2.6143678160919545e-05, "loss": 0.0934, "step": 4151 }, { "epoch": 4.812866768093892, "grad_norm": 0.465196430683136, "learning_rate": 2.613793103448276e-05, "loss": 0.098, "step": 4152 }, { "epoch": 4.81402593639064, "grad_norm": 0.3192795217037201, "learning_rate": 2.6132183908045978e-05, "loss": 0.0972, "step": 4153 }, { "epoch": 4.815185104687387, "grad_norm": 0.42049890756607056, "learning_rate": 2.61264367816092e-05, "loss": 0.1133, "step": 4154 }, { "epoch": 4.816344272984134, "grad_norm": 0.38218989968299866, "learning_rate": 2.6120689655172414e-05, "loss": 0.1014, "step": 4155 }, { "epoch": 4.817503441280881, "grad_norm": 0.46369317173957825, "learning_rate": 2.611494252873563e-05, "loss": 0.1097, "step": 4156 }, { "epoch": 4.818662609577628, "grad_norm": 0.3875451982021332, "learning_rate": 2.6109195402298854e-05, "loss": 0.1063, "step": 4157 }, { "epoch": 4.819821777874375, "grad_norm": 0.582574725151062, "learning_rate": 2.610344827586207e-05, "loss": 0.0954, "step": 4158 }, { "epoch": 4.820980946171122, "grad_norm": 0.2862607538700104, "learning_rate": 2.609770114942529e-05, "loss": 0.0948, "step": 4159 }, { "epoch": 4.822140114467869, "grad_norm": 0.47772908210754395, "learning_rate": 2.6091954022988506e-05, "loss": 0.0984, "step": 4160 }, { "epoch": 4.823299282764617, "grad_norm": 0.4275151789188385, "learning_rate": 2.6086206896551724e-05, "loss": 0.103, "step": 4161 }, { "epoch": 4.824458451061363, "grad_norm": 0.3913339674472809, "learning_rate": 2.6080459770114946e-05, "loss": 0.1175, "step": 4162 }, { "epoch": 4.825617619358111, "grad_norm": 0.37874650955200195, "learning_rate": 2.607471264367816e-05, "loss": 0.1093, "step": 4163 }, { "epoch": 4.8267767876548575, "grad_norm": 0.33609622716903687, "learning_rate": 2.606896551724138e-05, "loss": 0.1113, "step": 4164 }, { "epoch": 4.827935955951605, "grad_norm": 0.32703521847724915, "learning_rate": 2.60632183908046e-05, "loss": 0.1167, "step": 4165 }, { "epoch": 4.829095124248352, "grad_norm": 0.45330101251602173, "learning_rate": 2.6057471264367815e-05, "loss": 0.0996, "step": 4166 }, { "epoch": 4.830254292545099, "grad_norm": 0.3833322823047638, "learning_rate": 2.6051724137931037e-05, "loss": 0.1106, "step": 4167 }, { "epoch": 4.831413460841846, "grad_norm": 0.416790634393692, "learning_rate": 2.6045977011494255e-05, "loss": 0.1077, "step": 4168 }, { "epoch": 4.832572629138593, "grad_norm": 0.40256428718566895, "learning_rate": 2.604022988505747e-05, "loss": 0.0983, "step": 4169 }, { "epoch": 4.83373179743534, "grad_norm": 0.3929414451122284, "learning_rate": 2.6034482758620692e-05, "loss": 0.1143, "step": 4170 }, { "epoch": 4.834890965732087, "grad_norm": 0.3878033757209778, "learning_rate": 2.602873563218391e-05, "loss": 0.0966, "step": 4171 }, { "epoch": 4.836050134028834, "grad_norm": 0.40418297052383423, "learning_rate": 2.6022988505747125e-05, "loss": 0.1041, "step": 4172 }, { "epoch": 4.837209302325581, "grad_norm": 0.3119466006755829, "learning_rate": 2.6017241379310347e-05, "loss": 0.104, "step": 4173 }, { "epoch": 4.838368470622329, "grad_norm": 0.31403571367263794, "learning_rate": 2.6011494252873565e-05, "loss": 0.1008, "step": 4174 }, { "epoch": 4.839527638919075, "grad_norm": 0.32932233810424805, "learning_rate": 2.600574712643678e-05, "loss": 0.1069, "step": 4175 }, { "epoch": 4.840686807215823, "grad_norm": 0.3228423595428467, "learning_rate": 2.6000000000000002e-05, "loss": 0.1007, "step": 4176 }, { "epoch": 4.84184597551257, "grad_norm": 0.3488909900188446, "learning_rate": 2.599425287356322e-05, "loss": 0.1019, "step": 4177 }, { "epoch": 4.843005143809317, "grad_norm": 0.4107312858104706, "learning_rate": 2.5988505747126442e-05, "loss": 0.1235, "step": 4178 }, { "epoch": 4.844164312106064, "grad_norm": 0.2800889015197754, "learning_rate": 2.5982758620689657e-05, "loss": 0.1017, "step": 4179 }, { "epoch": 4.845323480402811, "grad_norm": 0.2643517851829529, "learning_rate": 2.597701149425287e-05, "loss": 0.1084, "step": 4180 }, { "epoch": 4.846482648699558, "grad_norm": 0.40645238757133484, "learning_rate": 2.5971264367816097e-05, "loss": 0.1026, "step": 4181 }, { "epoch": 4.847641816996305, "grad_norm": 0.5874878168106079, "learning_rate": 2.596551724137931e-05, "loss": 0.0999, "step": 4182 }, { "epoch": 4.848800985293052, "grad_norm": 0.3134619891643524, "learning_rate": 2.5959770114942526e-05, "loss": 0.1144, "step": 4183 }, { "epoch": 4.8499601535898, "grad_norm": 0.33643344044685364, "learning_rate": 2.5954022988505748e-05, "loss": 0.1035, "step": 4184 }, { "epoch": 4.8511193218865465, "grad_norm": 0.39385950565338135, "learning_rate": 2.5948275862068966e-05, "loss": 0.1082, "step": 4185 }, { "epoch": 4.852278490183293, "grad_norm": 0.32700175046920776, "learning_rate": 2.5942528735632188e-05, "loss": 0.1048, "step": 4186 }, { "epoch": 4.853437658480041, "grad_norm": 0.4382705092430115, "learning_rate": 2.5936781609195403e-05, "loss": 0.0977, "step": 4187 }, { "epoch": 4.854596826776787, "grad_norm": 0.4250156879425049, "learning_rate": 2.593103448275862e-05, "loss": 0.1147, "step": 4188 }, { "epoch": 4.855755995073535, "grad_norm": 0.29872676730155945, "learning_rate": 2.5925287356321843e-05, "loss": 0.0983, "step": 4189 }, { "epoch": 4.856915163370282, "grad_norm": 0.33435744047164917, "learning_rate": 2.5919540229885058e-05, "loss": 0.099, "step": 4190 }, { "epoch": 4.858074331667029, "grad_norm": 0.49045172333717346, "learning_rate": 2.5913793103448276e-05, "loss": 0.0921, "step": 4191 }, { "epoch": 4.859233499963776, "grad_norm": 0.3408951759338379, "learning_rate": 2.5908045977011498e-05, "loss": 0.0991, "step": 4192 }, { "epoch": 4.860392668260523, "grad_norm": 0.2995651960372925, "learning_rate": 2.5902298850574713e-05, "loss": 0.0979, "step": 4193 }, { "epoch": 4.86155183655727, "grad_norm": 0.462162584066391, "learning_rate": 2.589655172413793e-05, "loss": 0.1083, "step": 4194 }, { "epoch": 4.862711004854017, "grad_norm": 0.4822317659854889, "learning_rate": 2.5890804597701153e-05, "loss": 0.0976, "step": 4195 }, { "epoch": 4.863870173150764, "grad_norm": 0.34353649616241455, "learning_rate": 2.5885057471264368e-05, "loss": 0.1036, "step": 4196 }, { "epoch": 4.865029341447512, "grad_norm": 0.32048463821411133, "learning_rate": 2.587931034482759e-05, "loss": 0.1164, "step": 4197 }, { "epoch": 4.8661885097442585, "grad_norm": 0.37083297967910767, "learning_rate": 2.5873563218390808e-05, "loss": 0.107, "step": 4198 }, { "epoch": 4.867347678041005, "grad_norm": 0.2568010091781616, "learning_rate": 2.5867816091954022e-05, "loss": 0.0924, "step": 4199 }, { "epoch": 4.868506846337753, "grad_norm": 0.3095264136791229, "learning_rate": 2.5862068965517244e-05, "loss": 0.0984, "step": 4200 }, { "epoch": 4.869666014634499, "grad_norm": 0.34810638427734375, "learning_rate": 2.585632183908046e-05, "loss": 0.1055, "step": 4201 }, { "epoch": 4.870825182931247, "grad_norm": 0.3801257014274597, "learning_rate": 2.5850574712643677e-05, "loss": 0.1107, "step": 4202 }, { "epoch": 4.871984351227994, "grad_norm": 0.31729915738105774, "learning_rate": 2.58448275862069e-05, "loss": 0.1158, "step": 4203 }, { "epoch": 4.873143519524741, "grad_norm": 0.4292013347148895, "learning_rate": 2.5839080459770114e-05, "loss": 0.1062, "step": 4204 }, { "epoch": 4.874302687821488, "grad_norm": 0.35838496685028076, "learning_rate": 2.5833333333333336e-05, "loss": 0.1062, "step": 4205 }, { "epoch": 4.8754618561182355, "grad_norm": 0.479670912027359, "learning_rate": 2.5827586206896554e-05, "loss": 0.0975, "step": 4206 }, { "epoch": 4.876621024414982, "grad_norm": 0.3806205689907074, "learning_rate": 2.582183908045977e-05, "loss": 0.1128, "step": 4207 }, { "epoch": 4.87778019271173, "grad_norm": 0.2937345802783966, "learning_rate": 2.581609195402299e-05, "loss": 0.1024, "step": 4208 }, { "epoch": 4.878939361008476, "grad_norm": 0.28283917903900146, "learning_rate": 2.581034482758621e-05, "loss": 0.0977, "step": 4209 }, { "epoch": 4.880098529305224, "grad_norm": 0.4149821400642395, "learning_rate": 2.5804597701149424e-05, "loss": 0.1125, "step": 4210 }, { "epoch": 4.881257697601971, "grad_norm": 0.41813716292381287, "learning_rate": 2.5798850574712645e-05, "loss": 0.1096, "step": 4211 }, { "epoch": 4.882416865898717, "grad_norm": 0.40485164523124695, "learning_rate": 2.5793103448275864e-05, "loss": 0.1119, "step": 4212 }, { "epoch": 4.883576034195465, "grad_norm": 0.36861032247543335, "learning_rate": 2.578735632183908e-05, "loss": 0.0978, "step": 4213 }, { "epoch": 4.8847352024922115, "grad_norm": 0.4352358281612396, "learning_rate": 2.57816091954023e-05, "loss": 0.1026, "step": 4214 }, { "epoch": 4.885894370788959, "grad_norm": 0.3257162272930145, "learning_rate": 2.577586206896552e-05, "loss": 0.1098, "step": 4215 }, { "epoch": 4.887053539085706, "grad_norm": 0.3483160734176636, "learning_rate": 2.577011494252874e-05, "loss": 0.1115, "step": 4216 }, { "epoch": 4.888212707382453, "grad_norm": 0.30593305826187134, "learning_rate": 2.5764367816091955e-05, "loss": 0.1081, "step": 4217 }, { "epoch": 4.8893718756792, "grad_norm": 0.30475834012031555, "learning_rate": 2.5758620689655173e-05, "loss": 0.0919, "step": 4218 }, { "epoch": 4.8905310439759475, "grad_norm": 0.296648770570755, "learning_rate": 2.5752873563218395e-05, "loss": 0.0985, "step": 4219 }, { "epoch": 4.891690212272694, "grad_norm": 0.3274528980255127, "learning_rate": 2.574712643678161e-05, "loss": 0.1018, "step": 4220 }, { "epoch": 4.892849380569442, "grad_norm": 0.3239176571369171, "learning_rate": 2.5741379310344825e-05, "loss": 0.1053, "step": 4221 }, { "epoch": 4.894008548866188, "grad_norm": 0.3093695342540741, "learning_rate": 2.573563218390805e-05, "loss": 0.1006, "step": 4222 }, { "epoch": 4.895167717162936, "grad_norm": 0.29413002729415894, "learning_rate": 2.5729885057471265e-05, "loss": 0.1035, "step": 4223 }, { "epoch": 4.896326885459683, "grad_norm": 0.3807078003883362, "learning_rate": 2.5724137931034486e-05, "loss": 0.1111, "step": 4224 }, { "epoch": 4.897486053756429, "grad_norm": 0.357345312833786, "learning_rate": 2.57183908045977e-05, "loss": 0.1102, "step": 4225 }, { "epoch": 4.898645222053177, "grad_norm": 0.45739462971687317, "learning_rate": 2.571264367816092e-05, "loss": 0.0942, "step": 4226 }, { "epoch": 4.8998043903499235, "grad_norm": 0.325125515460968, "learning_rate": 2.570689655172414e-05, "loss": 0.0932, "step": 4227 }, { "epoch": 4.900963558646671, "grad_norm": 0.39498814940452576, "learning_rate": 2.5701149425287356e-05, "loss": 0.1001, "step": 4228 }, { "epoch": 4.902122726943418, "grad_norm": 0.3661857843399048, "learning_rate": 2.5695402298850575e-05, "loss": 0.1013, "step": 4229 }, { "epoch": 4.903281895240165, "grad_norm": 0.4122169315814972, "learning_rate": 2.5689655172413796e-05, "loss": 0.1106, "step": 4230 }, { "epoch": 4.904441063536912, "grad_norm": 0.6558185815811157, "learning_rate": 2.568390804597701e-05, "loss": 0.1162, "step": 4231 }, { "epoch": 4.90560023183366, "grad_norm": 0.2946203052997589, "learning_rate": 2.567816091954023e-05, "loss": 0.1005, "step": 4232 }, { "epoch": 4.906759400130406, "grad_norm": 0.3813852071762085, "learning_rate": 2.567241379310345e-05, "loss": 0.1039, "step": 4233 }, { "epoch": 4.907918568427154, "grad_norm": 0.33533862233161926, "learning_rate": 2.5666666666666666e-05, "loss": 0.0955, "step": 4234 }, { "epoch": 4.9090777367239005, "grad_norm": 0.3679860830307007, "learning_rate": 2.5660919540229888e-05, "loss": 0.1084, "step": 4235 }, { "epoch": 4.910236905020648, "grad_norm": 0.32838982343673706, "learning_rate": 2.5655172413793106e-05, "loss": 0.1006, "step": 4236 }, { "epoch": 4.911396073317395, "grad_norm": 0.40485119819641113, "learning_rate": 2.564942528735632e-05, "loss": 0.1078, "step": 4237 }, { "epoch": 4.912555241614141, "grad_norm": 0.43411320447921753, "learning_rate": 2.5643678160919543e-05, "loss": 0.1015, "step": 4238 }, { "epoch": 4.913714409910889, "grad_norm": 0.622022807598114, "learning_rate": 2.563793103448276e-05, "loss": 0.1006, "step": 4239 }, { "epoch": 4.9148735782076365, "grad_norm": 0.37312477827072144, "learning_rate": 2.5632183908045976e-05, "loss": 0.0984, "step": 4240 }, { "epoch": 4.916032746504383, "grad_norm": 0.4039802849292755, "learning_rate": 2.5626436781609197e-05, "loss": 0.1012, "step": 4241 }, { "epoch": 4.91719191480113, "grad_norm": 0.28577470779418945, "learning_rate": 2.5620689655172416e-05, "loss": 0.1062, "step": 4242 }, { "epoch": 4.918351083097877, "grad_norm": 0.4363808333873749, "learning_rate": 2.5614942528735637e-05, "loss": 0.1079, "step": 4243 }, { "epoch": 4.919510251394624, "grad_norm": 0.31524115800857544, "learning_rate": 2.5609195402298852e-05, "loss": 0.1027, "step": 4244 }, { "epoch": 4.920669419691372, "grad_norm": 0.324258416891098, "learning_rate": 2.5603448275862067e-05, "loss": 0.1011, "step": 4245 }, { "epoch": 4.921828587988118, "grad_norm": 0.3237408697605133, "learning_rate": 2.559770114942529e-05, "loss": 0.0999, "step": 4246 }, { "epoch": 4.922987756284866, "grad_norm": 0.3803741931915283, "learning_rate": 2.5591954022988507e-05, "loss": 0.1069, "step": 4247 }, { "epoch": 4.9241469245816125, "grad_norm": 0.36005181074142456, "learning_rate": 2.5586206896551722e-05, "loss": 0.1102, "step": 4248 }, { "epoch": 4.92530609287836, "grad_norm": 0.3981182873249054, "learning_rate": 2.5580459770114944e-05, "loss": 0.1076, "step": 4249 }, { "epoch": 4.926465261175107, "grad_norm": 0.37391379475593567, "learning_rate": 2.5574712643678162e-05, "loss": 0.0985, "step": 4250 }, { "epoch": 4.927624429471854, "grad_norm": 0.38138946890830994, "learning_rate": 2.5568965517241377e-05, "loss": 0.0987, "step": 4251 }, { "epoch": 4.928783597768601, "grad_norm": 0.5195475816726685, "learning_rate": 2.55632183908046e-05, "loss": 0.0944, "step": 4252 }, { "epoch": 4.9299427660653485, "grad_norm": 0.4090215265750885, "learning_rate": 2.5557471264367817e-05, "loss": 0.1082, "step": 4253 }, { "epoch": 4.931101934362095, "grad_norm": 0.2851259410381317, "learning_rate": 2.555172413793104e-05, "loss": 0.0993, "step": 4254 }, { "epoch": 4.932261102658842, "grad_norm": 0.37689948081970215, "learning_rate": 2.5545977011494253e-05, "loss": 0.108, "step": 4255 }, { "epoch": 4.933420270955589, "grad_norm": 0.3453246057033539, "learning_rate": 2.5540229885057472e-05, "loss": 0.1001, "step": 4256 }, { "epoch": 4.934579439252336, "grad_norm": 0.4019702970981598, "learning_rate": 2.5534482758620693e-05, "loss": 0.1084, "step": 4257 }, { "epoch": 4.935738607549084, "grad_norm": 0.30191949009895325, "learning_rate": 2.552873563218391e-05, "loss": 0.0991, "step": 4258 }, { "epoch": 4.93689777584583, "grad_norm": 0.34570759534835815, "learning_rate": 2.5522988505747127e-05, "loss": 0.0976, "step": 4259 }, { "epoch": 4.938056944142578, "grad_norm": 0.32790854573249817, "learning_rate": 2.551724137931035e-05, "loss": 0.088, "step": 4260 }, { "epoch": 4.939216112439325, "grad_norm": 0.32591426372528076, "learning_rate": 2.5511494252873563e-05, "loss": 0.0989, "step": 4261 }, { "epoch": 4.940375280736072, "grad_norm": 0.3843526542186737, "learning_rate": 2.5505747126436785e-05, "loss": 0.103, "step": 4262 }, { "epoch": 4.941534449032819, "grad_norm": 0.38071221113204956, "learning_rate": 2.5500000000000003e-05, "loss": 0.1045, "step": 4263 }, { "epoch": 4.942693617329566, "grad_norm": 0.3825156092643738, "learning_rate": 2.5494252873563218e-05, "loss": 0.0969, "step": 4264 }, { "epoch": 4.943852785626313, "grad_norm": 0.3230441212654114, "learning_rate": 2.548850574712644e-05, "loss": 0.1042, "step": 4265 }, { "epoch": 4.945011953923061, "grad_norm": 0.35447943210601807, "learning_rate": 2.5482758620689655e-05, "loss": 0.1133, "step": 4266 }, { "epoch": 4.946171122219807, "grad_norm": 0.2828787863254547, "learning_rate": 2.5477011494252873e-05, "loss": 0.0917, "step": 4267 }, { "epoch": 4.947330290516554, "grad_norm": 0.2748185694217682, "learning_rate": 2.5471264367816095e-05, "loss": 0.0973, "step": 4268 }, { "epoch": 4.9484894588133015, "grad_norm": 0.321551114320755, "learning_rate": 2.546551724137931e-05, "loss": 0.0973, "step": 4269 }, { "epoch": 4.949648627110048, "grad_norm": 0.3038749396800995, "learning_rate": 2.545977011494253e-05, "loss": 0.0888, "step": 4270 }, { "epoch": 4.950807795406796, "grad_norm": 0.4207051396369934, "learning_rate": 2.545402298850575e-05, "loss": 0.0991, "step": 4271 }, { "epoch": 4.951966963703542, "grad_norm": 0.44856876134872437, "learning_rate": 2.5448275862068964e-05, "loss": 0.1067, "step": 4272 }, { "epoch": 4.95312613200029, "grad_norm": 0.3357430398464203, "learning_rate": 2.5442528735632186e-05, "loss": 0.0877, "step": 4273 }, { "epoch": 4.954285300297037, "grad_norm": 0.33568447828292847, "learning_rate": 2.5436781609195404e-05, "loss": 0.0988, "step": 4274 }, { "epoch": 4.955444468593784, "grad_norm": 0.3613547384738922, "learning_rate": 2.543103448275862e-05, "loss": 0.1052, "step": 4275 }, { "epoch": 4.956603636890531, "grad_norm": 0.3529345393180847, "learning_rate": 2.542528735632184e-05, "loss": 0.1049, "step": 4276 }, { "epoch": 4.957762805187278, "grad_norm": 0.35080334544181824, "learning_rate": 2.541954022988506e-05, "loss": 0.1034, "step": 4277 }, { "epoch": 4.958921973484025, "grad_norm": 0.68172287940979, "learning_rate": 2.5413793103448274e-05, "loss": 0.1033, "step": 4278 }, { "epoch": 4.960081141780773, "grad_norm": 0.3911987543106079, "learning_rate": 2.5408045977011496e-05, "loss": 0.1144, "step": 4279 }, { "epoch": 4.961240310077519, "grad_norm": 0.3486397862434387, "learning_rate": 2.5402298850574714e-05, "loss": 0.1069, "step": 4280 }, { "epoch": 4.962399478374266, "grad_norm": 0.4205693006515503, "learning_rate": 2.5396551724137936e-05, "loss": 0.1115, "step": 4281 }, { "epoch": 4.9635586466710135, "grad_norm": 0.4175969660282135, "learning_rate": 2.539080459770115e-05, "loss": 0.106, "step": 4282 }, { "epoch": 4.964717814967761, "grad_norm": 0.3387162685394287, "learning_rate": 2.538505747126437e-05, "loss": 0.1113, "step": 4283 }, { "epoch": 4.965876983264508, "grad_norm": 0.3226720988750458, "learning_rate": 2.537931034482759e-05, "loss": 0.0975, "step": 4284 }, { "epoch": 4.967036151561254, "grad_norm": 0.3947923183441162, "learning_rate": 2.5373563218390806e-05, "loss": 0.0999, "step": 4285 }, { "epoch": 4.968195319858002, "grad_norm": 0.40375250577926636, "learning_rate": 2.536781609195402e-05, "loss": 0.1104, "step": 4286 }, { "epoch": 4.969354488154749, "grad_norm": 0.44277888536453247, "learning_rate": 2.5362068965517246e-05, "loss": 0.1052, "step": 4287 }, { "epoch": 4.970513656451496, "grad_norm": 0.38605281710624695, "learning_rate": 2.535632183908046e-05, "loss": 0.1062, "step": 4288 }, { "epoch": 4.971672824748243, "grad_norm": 0.3479140102863312, "learning_rate": 2.5350574712643682e-05, "loss": 0.1085, "step": 4289 }, { "epoch": 4.9728319930449905, "grad_norm": 0.38351091742515564, "learning_rate": 2.5344827586206897e-05, "loss": 0.1046, "step": 4290 }, { "epoch": 4.973991161341737, "grad_norm": 0.3506777882575989, "learning_rate": 2.5339080459770115e-05, "loss": 0.0964, "step": 4291 }, { "epoch": 4.975150329638485, "grad_norm": 0.4924359619617462, "learning_rate": 2.5333333333333337e-05, "loss": 0.116, "step": 4292 }, { "epoch": 4.976309497935231, "grad_norm": 0.31600138545036316, "learning_rate": 2.5327586206896552e-05, "loss": 0.1058, "step": 4293 }, { "epoch": 4.977468666231979, "grad_norm": 0.3318912386894226, "learning_rate": 2.532183908045977e-05, "loss": 0.0918, "step": 4294 }, { "epoch": 4.978627834528726, "grad_norm": 0.361688494682312, "learning_rate": 2.5316091954022992e-05, "loss": 0.1061, "step": 4295 }, { "epoch": 4.979787002825473, "grad_norm": 0.37395092844963074, "learning_rate": 2.5310344827586207e-05, "loss": 0.1067, "step": 4296 }, { "epoch": 4.98094617112222, "grad_norm": 0.48089030385017395, "learning_rate": 2.5304597701149425e-05, "loss": 0.1015, "step": 4297 }, { "epoch": 4.9821053394189665, "grad_norm": 0.305130273103714, "learning_rate": 2.5298850574712647e-05, "loss": 0.0966, "step": 4298 }, { "epoch": 4.983264507715714, "grad_norm": 0.30088353157043457, "learning_rate": 2.529310344827586e-05, "loss": 0.1099, "step": 4299 }, { "epoch": 4.984423676012461, "grad_norm": 0.4635865092277527, "learning_rate": 2.5287356321839083e-05, "loss": 0.1118, "step": 4300 }, { "epoch": 4.985582844309208, "grad_norm": 0.34878647327423096, "learning_rate": 2.52816091954023e-05, "loss": 0.1004, "step": 4301 }, { "epoch": 4.986742012605955, "grad_norm": 0.31814780831336975, "learning_rate": 2.5275862068965516e-05, "loss": 0.1032, "step": 4302 }, { "epoch": 4.9879011809027025, "grad_norm": 0.311321884393692, "learning_rate": 2.5270114942528738e-05, "loss": 0.0982, "step": 4303 }, { "epoch": 4.989060349199449, "grad_norm": 0.3587198853492737, "learning_rate": 2.5264367816091956e-05, "loss": 0.0981, "step": 4304 }, { "epoch": 4.990219517496197, "grad_norm": 0.33750030398368835, "learning_rate": 2.525862068965517e-05, "loss": 0.0996, "step": 4305 }, { "epoch": 4.991378685792943, "grad_norm": 0.3702969253063202, "learning_rate": 2.5252873563218393e-05, "loss": 0.1126, "step": 4306 }, { "epoch": 4.992537854089691, "grad_norm": 0.33090633153915405, "learning_rate": 2.5247126436781608e-05, "loss": 0.1072, "step": 4307 }, { "epoch": 4.993697022386438, "grad_norm": 0.4805144965648651, "learning_rate": 2.5241379310344833e-05, "loss": 0.0977, "step": 4308 }, { "epoch": 4.994856190683185, "grad_norm": 0.3834288418292999, "learning_rate": 2.5235632183908048e-05, "loss": 0.0978, "step": 4309 }, { "epoch": 4.996015358979932, "grad_norm": 0.39906617999076843, "learning_rate": 2.5229885057471263e-05, "loss": 0.1049, "step": 4310 }, { "epoch": 4.9971745272766785, "grad_norm": 0.2854554355144501, "learning_rate": 2.5224137931034484e-05, "loss": 0.1047, "step": 4311 }, { "epoch": 4.998333695573426, "grad_norm": 0.37985438108444214, "learning_rate": 2.5218390804597703e-05, "loss": 0.101, "step": 4312 }, { "epoch": 4.999492863870173, "grad_norm": 0.4384147822856903, "learning_rate": 2.5212643678160918e-05, "loss": 0.1011, "step": 4313 }, { "epoch": 4.999492863870173, "eval_loss": 0.1395179033279419, "eval_runtime": 265.5301, "eval_samples_per_second": 5.777, "eval_steps_per_second": 5.777, "step": 4313 }, { "epoch": 5.00065203216692, "grad_norm": 0.49006325006484985, "learning_rate": 2.520689655172414e-05, "loss": 0.1007, "step": 4314 }, { "epoch": 5.001811200463667, "grad_norm": 0.2520461082458496, "learning_rate": 2.5201149425287358e-05, "loss": 0.0905, "step": 4315 }, { "epoch": 5.002970368760415, "grad_norm": 0.2542027235031128, "learning_rate": 2.5195402298850573e-05, "loss": 0.0858, "step": 4316 }, { "epoch": 5.004129537057161, "grad_norm": 0.3190763592720032, "learning_rate": 2.5189655172413794e-05, "loss": 0.0818, "step": 4317 }, { "epoch": 5.005288705353909, "grad_norm": 0.2673119008541107, "learning_rate": 2.5183908045977013e-05, "loss": 0.0943, "step": 4318 }, { "epoch": 5.0064478736506555, "grad_norm": 0.31075116991996765, "learning_rate": 2.5178160919540234e-05, "loss": 0.0966, "step": 4319 }, { "epoch": 5.007607041947403, "grad_norm": 0.3015635311603546, "learning_rate": 2.517241379310345e-05, "loss": 0.0971, "step": 4320 }, { "epoch": 5.00876621024415, "grad_norm": 0.43081480264663696, "learning_rate": 2.5166666666666667e-05, "loss": 0.0955, "step": 4321 }, { "epoch": 5.009925378540897, "grad_norm": 0.3676944971084595, "learning_rate": 2.516091954022989e-05, "loss": 0.0966, "step": 4322 }, { "epoch": 5.011084546837644, "grad_norm": 0.34852761030197144, "learning_rate": 2.5155172413793104e-05, "loss": 0.0881, "step": 4323 }, { "epoch": 5.0122437151343915, "grad_norm": 0.3189998269081116, "learning_rate": 2.5149425287356322e-05, "loss": 0.0981, "step": 4324 }, { "epoch": 5.013402883431138, "grad_norm": 0.5241860151290894, "learning_rate": 2.5143678160919544e-05, "loss": 0.0923, "step": 4325 }, { "epoch": 5.014562051727885, "grad_norm": 0.4675620198249817, "learning_rate": 2.513793103448276e-05, "loss": 0.0899, "step": 4326 }, { "epoch": 5.015721220024632, "grad_norm": 0.37763670086860657, "learning_rate": 2.513218390804598e-05, "loss": 0.084, "step": 4327 }, { "epoch": 5.016880388321379, "grad_norm": 0.4259863495826721, "learning_rate": 2.51264367816092e-05, "loss": 0.1091, "step": 4328 }, { "epoch": 5.018039556618127, "grad_norm": 0.6199847459793091, "learning_rate": 2.5120689655172414e-05, "loss": 0.0982, "step": 4329 }, { "epoch": 5.019198724914873, "grad_norm": 0.4678066670894623, "learning_rate": 2.5114942528735635e-05, "loss": 0.0968, "step": 4330 }, { "epoch": 5.020357893211621, "grad_norm": 0.46299999952316284, "learning_rate": 2.510919540229885e-05, "loss": 0.0903, "step": 4331 }, { "epoch": 5.0215170615083675, "grad_norm": 0.3906437158584595, "learning_rate": 2.510344827586207e-05, "loss": 0.0929, "step": 4332 }, { "epoch": 5.022676229805115, "grad_norm": 0.4142506420612335, "learning_rate": 2.509770114942529e-05, "loss": 0.1009, "step": 4333 }, { "epoch": 5.023835398101862, "grad_norm": 0.3510461151599884, "learning_rate": 2.5091954022988505e-05, "loss": 0.0879, "step": 4334 }, { "epoch": 5.024994566398609, "grad_norm": 0.3772026598453522, "learning_rate": 2.5086206896551723e-05, "loss": 0.0947, "step": 4335 }, { "epoch": 5.026153734695356, "grad_norm": 0.3540421426296234, "learning_rate": 2.5080459770114945e-05, "loss": 0.0897, "step": 4336 }, { "epoch": 5.0273129029921035, "grad_norm": 0.39619699120521545, "learning_rate": 2.507471264367816e-05, "loss": 0.0932, "step": 4337 }, { "epoch": 5.02847207128885, "grad_norm": 0.4173181354999542, "learning_rate": 2.5068965517241382e-05, "loss": 0.0857, "step": 4338 }, { "epoch": 5.029631239585598, "grad_norm": 0.4821605682373047, "learning_rate": 2.50632183908046e-05, "loss": 0.0936, "step": 4339 }, { "epoch": 5.030790407882344, "grad_norm": 0.3579258918762207, "learning_rate": 2.5057471264367815e-05, "loss": 0.0945, "step": 4340 }, { "epoch": 5.031949576179091, "grad_norm": 0.3280121982097626, "learning_rate": 2.5051724137931037e-05, "loss": 0.0958, "step": 4341 }, { "epoch": 5.033108744475839, "grad_norm": 0.4641885757446289, "learning_rate": 2.5045977011494255e-05, "loss": 0.0982, "step": 4342 }, { "epoch": 5.034267912772585, "grad_norm": 0.4874429404735565, "learning_rate": 2.504022988505747e-05, "loss": 0.0996, "step": 4343 }, { "epoch": 5.035427081069333, "grad_norm": 0.45620620250701904, "learning_rate": 2.503448275862069e-05, "loss": 0.0894, "step": 4344 }, { "epoch": 5.03658624936608, "grad_norm": 0.42721596360206604, "learning_rate": 2.502873563218391e-05, "loss": 0.1, "step": 4345 }, { "epoch": 5.037745417662827, "grad_norm": 0.5702162981033325, "learning_rate": 2.502298850574713e-05, "loss": 0.0954, "step": 4346 }, { "epoch": 5.038904585959574, "grad_norm": 0.2864901125431061, "learning_rate": 2.5017241379310346e-05, "loss": 0.0902, "step": 4347 }, { "epoch": 5.040063754256321, "grad_norm": 0.31084322929382324, "learning_rate": 2.5011494252873565e-05, "loss": 0.0924, "step": 4348 }, { "epoch": 5.041222922553068, "grad_norm": 0.5150142312049866, "learning_rate": 2.5005747126436786e-05, "loss": 0.0936, "step": 4349 }, { "epoch": 5.042382090849816, "grad_norm": 0.572697103023529, "learning_rate": 2.5e-05, "loss": 0.0926, "step": 4350 }, { "epoch": 5.043541259146562, "grad_norm": 0.4093453288078308, "learning_rate": 2.499425287356322e-05, "loss": 0.0913, "step": 4351 }, { "epoch": 5.04470042744331, "grad_norm": 0.3991166949272156, "learning_rate": 2.4988505747126438e-05, "loss": 0.0965, "step": 4352 }, { "epoch": 5.0458595957400565, "grad_norm": 0.6385246515274048, "learning_rate": 2.4982758620689656e-05, "loss": 0.0879, "step": 4353 }, { "epoch": 5.047018764036803, "grad_norm": 0.4218897223472595, "learning_rate": 2.4977011494252874e-05, "loss": 0.0874, "step": 4354 }, { "epoch": 5.048177932333551, "grad_norm": 0.3741064667701721, "learning_rate": 2.4971264367816093e-05, "loss": 0.0987, "step": 4355 }, { "epoch": 5.049337100630297, "grad_norm": 0.4053811728954315, "learning_rate": 2.496551724137931e-05, "loss": 0.0972, "step": 4356 }, { "epoch": 5.050496268927045, "grad_norm": 0.521132230758667, "learning_rate": 2.495977011494253e-05, "loss": 0.0903, "step": 4357 }, { "epoch": 5.051655437223792, "grad_norm": 0.3791821002960205, "learning_rate": 2.4954022988505748e-05, "loss": 0.0827, "step": 4358 }, { "epoch": 5.052814605520539, "grad_norm": 0.5845307111740112, "learning_rate": 2.494827586206897e-05, "loss": 0.1025, "step": 4359 }, { "epoch": 5.053973773817286, "grad_norm": 0.5024420022964478, "learning_rate": 2.4942528735632184e-05, "loss": 0.0903, "step": 4360 }, { "epoch": 5.055132942114033, "grad_norm": 0.3732399046421051, "learning_rate": 2.4936781609195402e-05, "loss": 0.0925, "step": 4361 }, { "epoch": 5.05629211041078, "grad_norm": 0.3285994827747345, "learning_rate": 2.493103448275862e-05, "loss": 0.0922, "step": 4362 }, { "epoch": 5.057451278707528, "grad_norm": 0.4758022129535675, "learning_rate": 2.4925287356321842e-05, "loss": 0.0955, "step": 4363 }, { "epoch": 5.058610447004274, "grad_norm": 0.558384358882904, "learning_rate": 2.4919540229885057e-05, "loss": 0.0947, "step": 4364 }, { "epoch": 5.059769615301022, "grad_norm": 0.7286369800567627, "learning_rate": 2.4913793103448276e-05, "loss": 0.0965, "step": 4365 }, { "epoch": 5.0609287835977685, "grad_norm": 0.35651078820228577, "learning_rate": 2.4908045977011497e-05, "loss": 0.0874, "step": 4366 }, { "epoch": 5.062087951894516, "grad_norm": 0.5121673345565796, "learning_rate": 2.4902298850574716e-05, "loss": 0.0997, "step": 4367 }, { "epoch": 5.063247120191263, "grad_norm": 0.506806492805481, "learning_rate": 2.489655172413793e-05, "loss": 0.0869, "step": 4368 }, { "epoch": 5.0644062884880094, "grad_norm": 1.1122589111328125, "learning_rate": 2.4890804597701152e-05, "loss": 0.1026, "step": 4369 }, { "epoch": 5.065565456784757, "grad_norm": 0.42941662669181824, "learning_rate": 2.488505747126437e-05, "loss": 0.0868, "step": 4370 }, { "epoch": 5.066724625081504, "grad_norm": 0.4219612181186676, "learning_rate": 2.4879310344827585e-05, "loss": 0.0924, "step": 4371 }, { "epoch": 5.067883793378251, "grad_norm": 0.3930709660053253, "learning_rate": 2.4873563218390804e-05, "loss": 0.0868, "step": 4372 }, { "epoch": 5.069042961674998, "grad_norm": 0.3940015137195587, "learning_rate": 2.4867816091954025e-05, "loss": 0.0815, "step": 4373 }, { "epoch": 5.0702021299717455, "grad_norm": 0.496271014213562, "learning_rate": 2.4862068965517244e-05, "loss": 0.0987, "step": 4374 }, { "epoch": 5.071361298268492, "grad_norm": 0.3629682958126068, "learning_rate": 2.485632183908046e-05, "loss": 0.0875, "step": 4375 }, { "epoch": 5.07252046656524, "grad_norm": 0.5022166967391968, "learning_rate": 2.485057471264368e-05, "loss": 0.0989, "step": 4376 }, { "epoch": 5.073679634861986, "grad_norm": 0.7410409450531006, "learning_rate": 2.48448275862069e-05, "loss": 0.0991, "step": 4377 }, { "epoch": 5.074838803158734, "grad_norm": 0.30620214343070984, "learning_rate": 2.4839080459770117e-05, "loss": 0.0834, "step": 4378 }, { "epoch": 5.075997971455481, "grad_norm": 0.5494264960289001, "learning_rate": 2.4833333333333335e-05, "loss": 0.0888, "step": 4379 }, { "epoch": 5.077157139752228, "grad_norm": 0.45978859066963196, "learning_rate": 2.4827586206896553e-05, "loss": 0.1035, "step": 4380 }, { "epoch": 5.078316308048975, "grad_norm": 0.4293060004711151, "learning_rate": 2.482183908045977e-05, "loss": 0.0898, "step": 4381 }, { "epoch": 5.079475476345722, "grad_norm": 0.3662157654762268, "learning_rate": 2.481609195402299e-05, "loss": 0.1001, "step": 4382 }, { "epoch": 5.080634644642469, "grad_norm": 0.3850663900375366, "learning_rate": 2.4810344827586208e-05, "loss": 0.0901, "step": 4383 }, { "epoch": 5.081793812939216, "grad_norm": 0.32764530181884766, "learning_rate": 2.4804597701149426e-05, "loss": 0.0922, "step": 4384 }, { "epoch": 5.082952981235963, "grad_norm": 0.8065647482872009, "learning_rate": 2.4798850574712645e-05, "loss": 0.0949, "step": 4385 }, { "epoch": 5.08411214953271, "grad_norm": 0.4645923376083374, "learning_rate": 2.4793103448275863e-05, "loss": 0.0899, "step": 4386 }, { "epoch": 5.0852713178294575, "grad_norm": 0.3961159288883209, "learning_rate": 2.478735632183908e-05, "loss": 0.0918, "step": 4387 }, { "epoch": 5.086430486126204, "grad_norm": 0.3519403636455536, "learning_rate": 2.47816091954023e-05, "loss": 0.0942, "step": 4388 }, { "epoch": 5.087589654422952, "grad_norm": 0.45820310711860657, "learning_rate": 2.4775862068965518e-05, "loss": 0.0977, "step": 4389 }, { "epoch": 5.088748822719698, "grad_norm": 0.810590386390686, "learning_rate": 2.4770114942528736e-05, "loss": 0.0983, "step": 4390 }, { "epoch": 5.089907991016446, "grad_norm": 0.4400874972343445, "learning_rate": 2.4764367816091954e-05, "loss": 0.0962, "step": 4391 }, { "epoch": 5.091067159313193, "grad_norm": 0.3362007737159729, "learning_rate": 2.4758620689655173e-05, "loss": 0.0957, "step": 4392 }, { "epoch": 5.09222632760994, "grad_norm": 0.5532455444335938, "learning_rate": 2.4752873563218394e-05, "loss": 0.1131, "step": 4393 }, { "epoch": 5.093385495906687, "grad_norm": 0.3802591860294342, "learning_rate": 2.474712643678161e-05, "loss": 0.091, "step": 4394 }, { "epoch": 5.094544664203434, "grad_norm": 0.45833706855773926, "learning_rate": 2.4741379310344828e-05, "loss": 0.0942, "step": 4395 }, { "epoch": 5.095703832500181, "grad_norm": 0.3089923858642578, "learning_rate": 2.4735632183908046e-05, "loss": 0.0918, "step": 4396 }, { "epoch": 5.096863000796928, "grad_norm": 0.43163448572158813, "learning_rate": 2.4729885057471268e-05, "loss": 0.0963, "step": 4397 }, { "epoch": 5.098022169093675, "grad_norm": 0.388334721326828, "learning_rate": 2.4724137931034483e-05, "loss": 0.0901, "step": 4398 }, { "epoch": 5.099181337390422, "grad_norm": 0.5277796387672424, "learning_rate": 2.47183908045977e-05, "loss": 0.0934, "step": 4399 }, { "epoch": 5.10034050568717, "grad_norm": 0.42713913321495056, "learning_rate": 2.4712643678160922e-05, "loss": 0.1081, "step": 4400 }, { "epoch": 5.101499673983916, "grad_norm": 0.4533218741416931, "learning_rate": 2.470689655172414e-05, "loss": 0.088, "step": 4401 }, { "epoch": 5.102658842280664, "grad_norm": 0.4103233516216278, "learning_rate": 2.4701149425287356e-05, "loss": 0.1027, "step": 4402 }, { "epoch": 5.1038180105774105, "grad_norm": 0.5257367491722107, "learning_rate": 2.4695402298850577e-05, "loss": 0.0954, "step": 4403 }, { "epoch": 5.104977178874158, "grad_norm": 0.5127593874931335, "learning_rate": 2.4689655172413796e-05, "loss": 0.0903, "step": 4404 }, { "epoch": 5.106136347170905, "grad_norm": 0.4334225058555603, "learning_rate": 2.4683908045977014e-05, "loss": 0.0976, "step": 4405 }, { "epoch": 5.107295515467652, "grad_norm": 0.5956823229789734, "learning_rate": 2.467816091954023e-05, "loss": 0.1057, "step": 4406 }, { "epoch": 5.108454683764399, "grad_norm": 0.448223352432251, "learning_rate": 2.467241379310345e-05, "loss": 0.0955, "step": 4407 }, { "epoch": 5.1096138520611465, "grad_norm": 0.5333186984062195, "learning_rate": 2.466666666666667e-05, "loss": 0.0998, "step": 4408 }, { "epoch": 5.110773020357893, "grad_norm": 0.5879203081130981, "learning_rate": 2.4660919540229887e-05, "loss": 0.0954, "step": 4409 }, { "epoch": 5.111932188654641, "grad_norm": 0.4079529047012329, "learning_rate": 2.4655172413793105e-05, "loss": 0.0925, "step": 4410 }, { "epoch": 5.113091356951387, "grad_norm": 0.3445991277694702, "learning_rate": 2.4649425287356324e-05, "loss": 0.0894, "step": 4411 }, { "epoch": 5.114250525248134, "grad_norm": 0.371430903673172, "learning_rate": 2.4643678160919542e-05, "loss": 0.0904, "step": 4412 }, { "epoch": 5.115409693544882, "grad_norm": 0.4925328195095062, "learning_rate": 2.4637931034482757e-05, "loss": 0.0954, "step": 4413 }, { "epoch": 5.116568861841628, "grad_norm": 0.4289555847644806, "learning_rate": 2.463218390804598e-05, "loss": 0.0909, "step": 4414 }, { "epoch": 5.117728030138376, "grad_norm": 0.6553305983543396, "learning_rate": 2.4626436781609197e-05, "loss": 0.1009, "step": 4415 }, { "epoch": 5.1188871984351225, "grad_norm": 0.4029340445995331, "learning_rate": 2.4620689655172415e-05, "loss": 0.0964, "step": 4416 }, { "epoch": 5.12004636673187, "grad_norm": 0.48213991522789, "learning_rate": 2.4614942528735633e-05, "loss": 0.099, "step": 4417 }, { "epoch": 5.121205535028617, "grad_norm": 0.5332958698272705, "learning_rate": 2.4609195402298852e-05, "loss": 0.0959, "step": 4418 }, { "epoch": 5.122364703325364, "grad_norm": 0.48427054286003113, "learning_rate": 2.460344827586207e-05, "loss": 0.0936, "step": 4419 }, { "epoch": 5.123523871622111, "grad_norm": 0.32984548807144165, "learning_rate": 2.4597701149425288e-05, "loss": 0.0927, "step": 4420 }, { "epoch": 5.1246830399188585, "grad_norm": 0.6007506847381592, "learning_rate": 2.4591954022988507e-05, "loss": 0.0944, "step": 4421 }, { "epoch": 5.125842208215605, "grad_norm": 0.4519351124763489, "learning_rate": 2.4586206896551725e-05, "loss": 0.0904, "step": 4422 }, { "epoch": 5.127001376512353, "grad_norm": 0.44407084584236145, "learning_rate": 2.4580459770114943e-05, "loss": 0.0938, "step": 4423 }, { "epoch": 5.128160544809099, "grad_norm": 0.4065387547016144, "learning_rate": 2.4574712643678165e-05, "loss": 0.0949, "step": 4424 }, { "epoch": 5.129319713105847, "grad_norm": 0.42463329434394836, "learning_rate": 2.456896551724138e-05, "loss": 0.0859, "step": 4425 }, { "epoch": 5.130478881402594, "grad_norm": 0.5019070506095886, "learning_rate": 2.4563218390804598e-05, "loss": 0.1039, "step": 4426 }, { "epoch": 5.13163804969934, "grad_norm": 0.463072806596756, "learning_rate": 2.4557471264367816e-05, "loss": 0.1004, "step": 4427 }, { "epoch": 5.132797217996088, "grad_norm": 0.32865774631500244, "learning_rate": 2.4551724137931038e-05, "loss": 0.0915, "step": 4428 }, { "epoch": 5.133956386292835, "grad_norm": 0.26548025012016296, "learning_rate": 2.4545977011494253e-05, "loss": 0.0837, "step": 4429 }, { "epoch": 5.135115554589582, "grad_norm": 0.3461511731147766, "learning_rate": 2.454022988505747e-05, "loss": 0.0882, "step": 4430 }, { "epoch": 5.136274722886329, "grad_norm": 0.816858172416687, "learning_rate": 2.4534482758620693e-05, "loss": 0.0924, "step": 4431 }, { "epoch": 5.137433891183076, "grad_norm": 0.3309417963027954, "learning_rate": 2.4528735632183908e-05, "loss": 0.0926, "step": 4432 }, { "epoch": 5.138593059479823, "grad_norm": 0.6483179926872253, "learning_rate": 2.4522988505747126e-05, "loss": 0.0908, "step": 4433 }, { "epoch": 5.139752227776571, "grad_norm": 0.3910187780857086, "learning_rate": 2.4517241379310348e-05, "loss": 0.0956, "step": 4434 }, { "epoch": 5.140911396073317, "grad_norm": 0.4869095981121063, "learning_rate": 2.4511494252873566e-05, "loss": 0.0881, "step": 4435 }, { "epoch": 5.142070564370065, "grad_norm": 0.6011160612106323, "learning_rate": 2.450574712643678e-05, "loss": 0.1055, "step": 4436 }, { "epoch": 5.1432297326668115, "grad_norm": 0.38875725865364075, "learning_rate": 2.45e-05, "loss": 0.0919, "step": 4437 }, { "epoch": 5.144388900963559, "grad_norm": 0.28228649497032166, "learning_rate": 2.449425287356322e-05, "loss": 0.0942, "step": 4438 }, { "epoch": 5.145548069260306, "grad_norm": 0.3258858919143677, "learning_rate": 2.448850574712644e-05, "loss": 0.0845, "step": 4439 }, { "epoch": 5.146707237557052, "grad_norm": 0.3680334985256195, "learning_rate": 2.4482758620689654e-05, "loss": 0.0922, "step": 4440 }, { "epoch": 5.1478664058538, "grad_norm": 0.38462620973587036, "learning_rate": 2.4477011494252876e-05, "loss": 0.1046, "step": 4441 }, { "epoch": 5.149025574150547, "grad_norm": 0.48732998967170715, "learning_rate": 2.4471264367816094e-05, "loss": 0.1, "step": 4442 }, { "epoch": 5.150184742447294, "grad_norm": 0.45189419388771057, "learning_rate": 2.4465517241379312e-05, "loss": 0.0962, "step": 4443 }, { "epoch": 5.151343910744041, "grad_norm": 0.4464831054210663, "learning_rate": 2.445977011494253e-05, "loss": 0.0986, "step": 4444 }, { "epoch": 5.152503079040788, "grad_norm": 0.3762010931968689, "learning_rate": 2.445402298850575e-05, "loss": 0.1022, "step": 4445 }, { "epoch": 5.153662247337535, "grad_norm": 0.3228253424167633, "learning_rate": 2.4448275862068967e-05, "loss": 0.0903, "step": 4446 }, { "epoch": 5.154821415634283, "grad_norm": 0.40066200494766235, "learning_rate": 2.4442528735632185e-05, "loss": 0.103, "step": 4447 }, { "epoch": 5.155980583931029, "grad_norm": 0.42761850357055664, "learning_rate": 2.4436781609195404e-05, "loss": 0.0986, "step": 4448 }, { "epoch": 5.157139752227777, "grad_norm": 0.3788778483867645, "learning_rate": 2.4431034482758622e-05, "loss": 0.0954, "step": 4449 }, { "epoch": 5.1582989205245235, "grad_norm": 0.35833922028541565, "learning_rate": 2.442528735632184e-05, "loss": 0.0897, "step": 4450 }, { "epoch": 5.159458088821271, "grad_norm": 0.522954523563385, "learning_rate": 2.441954022988506e-05, "loss": 0.0864, "step": 4451 }, { "epoch": 5.160617257118018, "grad_norm": 0.43595683574676514, "learning_rate": 2.4413793103448277e-05, "loss": 0.0897, "step": 4452 }, { "epoch": 5.161776425414765, "grad_norm": 0.4088694155216217, "learning_rate": 2.4408045977011495e-05, "loss": 0.1002, "step": 4453 }, { "epoch": 5.162935593711512, "grad_norm": 0.4870903491973877, "learning_rate": 2.4402298850574714e-05, "loss": 0.0898, "step": 4454 }, { "epoch": 5.164094762008259, "grad_norm": 0.3365190029144287, "learning_rate": 2.4396551724137932e-05, "loss": 0.0888, "step": 4455 }, { "epoch": 5.165253930305006, "grad_norm": 0.5863637328147888, "learning_rate": 2.439080459770115e-05, "loss": 0.1003, "step": 4456 }, { "epoch": 5.166413098601753, "grad_norm": 0.3489425778388977, "learning_rate": 2.438505747126437e-05, "loss": 0.0891, "step": 4457 }, { "epoch": 5.1675722668985005, "grad_norm": 0.42141303420066833, "learning_rate": 2.4379310344827587e-05, "loss": 0.0868, "step": 4458 }, { "epoch": 5.168731435195247, "grad_norm": 0.5264760255813599, "learning_rate": 2.4373563218390805e-05, "loss": 0.1003, "step": 4459 }, { "epoch": 5.169890603491995, "grad_norm": 0.4587344825267792, "learning_rate": 2.4367816091954023e-05, "loss": 0.0844, "step": 4460 }, { "epoch": 5.171049771788741, "grad_norm": 0.5122568607330322, "learning_rate": 2.436206896551724e-05, "loss": 0.0842, "step": 4461 }, { "epoch": 5.172208940085489, "grad_norm": 0.38229185342788696, "learning_rate": 2.4356321839080463e-05, "loss": 0.0903, "step": 4462 }, { "epoch": 5.173368108382236, "grad_norm": 0.5392225980758667, "learning_rate": 2.4350574712643678e-05, "loss": 0.0971, "step": 4463 }, { "epoch": 5.174527276678983, "grad_norm": 0.4981425106525421, "learning_rate": 2.4344827586206896e-05, "loss": 0.0872, "step": 4464 }, { "epoch": 5.17568644497573, "grad_norm": 0.4396236538887024, "learning_rate": 2.4339080459770118e-05, "loss": 0.0926, "step": 4465 }, { "epoch": 5.176845613272477, "grad_norm": 0.42154166102409363, "learning_rate": 2.4333333333333336e-05, "loss": 0.0882, "step": 4466 }, { "epoch": 5.178004781569224, "grad_norm": 0.4124399721622467, "learning_rate": 2.432758620689655e-05, "loss": 0.092, "step": 4467 }, { "epoch": 5.179163949865972, "grad_norm": 0.378948450088501, "learning_rate": 2.432183908045977e-05, "loss": 0.0951, "step": 4468 }, { "epoch": 5.180323118162718, "grad_norm": 0.3714349567890167, "learning_rate": 2.431609195402299e-05, "loss": 0.0981, "step": 4469 }, { "epoch": 5.181482286459465, "grad_norm": 0.3504725396633148, "learning_rate": 2.4310344827586206e-05, "loss": 0.0973, "step": 4470 }, { "epoch": 5.1826414547562125, "grad_norm": 0.4103260636329651, "learning_rate": 2.4304597701149424e-05, "loss": 0.0957, "step": 4471 }, { "epoch": 5.183800623052959, "grad_norm": 0.5066686868667603, "learning_rate": 2.4298850574712646e-05, "loss": 0.0986, "step": 4472 }, { "epoch": 5.184959791349707, "grad_norm": 0.6200160980224609, "learning_rate": 2.4293103448275864e-05, "loss": 0.1011, "step": 4473 }, { "epoch": 5.186118959646453, "grad_norm": 0.4090159833431244, "learning_rate": 2.428735632183908e-05, "loss": 0.0977, "step": 4474 }, { "epoch": 5.187278127943201, "grad_norm": 0.45171108841896057, "learning_rate": 2.42816091954023e-05, "loss": 0.0855, "step": 4475 }, { "epoch": 5.188437296239948, "grad_norm": 0.3407220244407654, "learning_rate": 2.427586206896552e-05, "loss": 0.0917, "step": 4476 }, { "epoch": 5.189596464536695, "grad_norm": 0.3243277370929718, "learning_rate": 2.4270114942528738e-05, "loss": 0.0855, "step": 4477 }, { "epoch": 5.190755632833442, "grad_norm": 0.5616956949234009, "learning_rate": 2.4264367816091952e-05, "loss": 0.0855, "step": 4478 }, { "epoch": 5.191914801130189, "grad_norm": 0.369253009557724, "learning_rate": 2.4258620689655174e-05, "loss": 0.0832, "step": 4479 }, { "epoch": 5.193073969426936, "grad_norm": 0.3325676918029785, "learning_rate": 2.4252873563218392e-05, "loss": 0.0883, "step": 4480 }, { "epoch": 5.194233137723684, "grad_norm": 0.4389870762825012, "learning_rate": 2.424712643678161e-05, "loss": 0.1078, "step": 4481 }, { "epoch": 5.19539230602043, "grad_norm": 0.3416706919670105, "learning_rate": 2.424137931034483e-05, "loss": 0.0885, "step": 4482 }, { "epoch": 5.196551474317177, "grad_norm": 0.3958418369293213, "learning_rate": 2.4235632183908047e-05, "loss": 0.0959, "step": 4483 }, { "epoch": 5.197710642613925, "grad_norm": 0.41249221563339233, "learning_rate": 2.4229885057471266e-05, "loss": 0.0972, "step": 4484 }, { "epoch": 5.198869810910671, "grad_norm": 0.3895527720451355, "learning_rate": 2.4224137931034484e-05, "loss": 0.0847, "step": 4485 }, { "epoch": 5.200028979207419, "grad_norm": 0.3692401349544525, "learning_rate": 2.4218390804597702e-05, "loss": 0.095, "step": 4486 }, { "epoch": 5.2011881475041655, "grad_norm": 0.5421669483184814, "learning_rate": 2.421264367816092e-05, "loss": 0.0923, "step": 4487 }, { "epoch": 5.202347315800913, "grad_norm": 0.40526431798934937, "learning_rate": 2.420689655172414e-05, "loss": 0.0903, "step": 4488 }, { "epoch": 5.20350648409766, "grad_norm": 0.35065868496894836, "learning_rate": 2.420114942528736e-05, "loss": 0.0945, "step": 4489 }, { "epoch": 5.204665652394407, "grad_norm": 0.34505459666252136, "learning_rate": 2.4195402298850575e-05, "loss": 0.0952, "step": 4490 }, { "epoch": 5.205824820691154, "grad_norm": 0.43336015939712524, "learning_rate": 2.4189655172413794e-05, "loss": 0.0989, "step": 4491 }, { "epoch": 5.2069839889879015, "grad_norm": 0.3147088885307312, "learning_rate": 2.4183908045977012e-05, "loss": 0.089, "step": 4492 }, { "epoch": 5.208143157284648, "grad_norm": 0.567955493927002, "learning_rate": 2.417816091954023e-05, "loss": 0.088, "step": 4493 }, { "epoch": 5.209302325581396, "grad_norm": 0.4204098582267761, "learning_rate": 2.417241379310345e-05, "loss": 0.0927, "step": 4494 }, { "epoch": 5.210461493878142, "grad_norm": 0.4277987778186798, "learning_rate": 2.4166666666666667e-05, "loss": 0.0955, "step": 4495 }, { "epoch": 5.211620662174889, "grad_norm": 0.44593873620033264, "learning_rate": 2.416091954022989e-05, "loss": 0.1008, "step": 4496 }, { "epoch": 5.212779830471637, "grad_norm": 0.39049631357192993, "learning_rate": 2.4155172413793103e-05, "loss": 0.0957, "step": 4497 }, { "epoch": 5.213938998768383, "grad_norm": 0.3604249954223633, "learning_rate": 2.414942528735632e-05, "loss": 0.1004, "step": 4498 }, { "epoch": 5.215098167065131, "grad_norm": 0.3670724630355835, "learning_rate": 2.4143678160919543e-05, "loss": 0.0904, "step": 4499 }, { "epoch": 5.2162573353618775, "grad_norm": 0.34518417716026306, "learning_rate": 2.413793103448276e-05, "loss": 0.0893, "step": 4500 }, { "epoch": 5.217416503658625, "grad_norm": 0.39651983976364136, "learning_rate": 2.4132183908045977e-05, "loss": 0.0948, "step": 4501 }, { "epoch": 5.218575671955372, "grad_norm": 0.4444870948791504, "learning_rate": 2.4126436781609195e-05, "loss": 0.0863, "step": 4502 }, { "epoch": 5.219734840252119, "grad_norm": 0.5257106423377991, "learning_rate": 2.4120689655172417e-05, "loss": 0.0919, "step": 4503 }, { "epoch": 5.220894008548866, "grad_norm": 0.45813608169555664, "learning_rate": 2.4114942528735635e-05, "loss": 0.1027, "step": 4504 }, { "epoch": 5.2220531768456135, "grad_norm": 0.4071040749549866, "learning_rate": 2.410919540229885e-05, "loss": 0.0958, "step": 4505 }, { "epoch": 5.22321234514236, "grad_norm": 0.4606774151325226, "learning_rate": 2.410344827586207e-05, "loss": 0.0926, "step": 4506 }, { "epoch": 5.224371513439108, "grad_norm": 0.47253385186195374, "learning_rate": 2.409770114942529e-05, "loss": 0.1084, "step": 4507 }, { "epoch": 5.225530681735854, "grad_norm": 0.4298096299171448, "learning_rate": 2.4091954022988508e-05, "loss": 0.0866, "step": 4508 }, { "epoch": 5.226689850032602, "grad_norm": 0.843309760093689, "learning_rate": 2.4086206896551726e-05, "loss": 0.104, "step": 4509 }, { "epoch": 5.227849018329349, "grad_norm": 0.5684033632278442, "learning_rate": 2.4080459770114945e-05, "loss": 0.0958, "step": 4510 }, { "epoch": 5.229008186626096, "grad_norm": 0.6145815849304199, "learning_rate": 2.4074712643678163e-05, "loss": 0.0957, "step": 4511 }, { "epoch": 5.230167354922843, "grad_norm": 0.5087504386901855, "learning_rate": 2.4068965517241378e-05, "loss": 0.0846, "step": 4512 }, { "epoch": 5.23132652321959, "grad_norm": 0.49333977699279785, "learning_rate": 2.40632183908046e-05, "loss": 0.088, "step": 4513 }, { "epoch": 5.232485691516337, "grad_norm": 0.29817384481430054, "learning_rate": 2.4057471264367818e-05, "loss": 0.086, "step": 4514 }, { "epoch": 5.233644859813084, "grad_norm": 0.3775709867477417, "learning_rate": 2.4051724137931036e-05, "loss": 0.0947, "step": 4515 }, { "epoch": 5.234804028109831, "grad_norm": 0.31703516840934753, "learning_rate": 2.4045977011494254e-05, "loss": 0.0883, "step": 4516 }, { "epoch": 5.235963196406578, "grad_norm": 0.42791876196861267, "learning_rate": 2.4040229885057473e-05, "loss": 0.0933, "step": 4517 }, { "epoch": 5.237122364703326, "grad_norm": 0.5185449123382568, "learning_rate": 2.403448275862069e-05, "loss": 0.1011, "step": 4518 }, { "epoch": 5.238281533000072, "grad_norm": 0.4122971296310425, "learning_rate": 2.402873563218391e-05, "loss": 0.1013, "step": 4519 }, { "epoch": 5.23944070129682, "grad_norm": 0.3461478054523468, "learning_rate": 2.4022988505747127e-05, "loss": 0.0909, "step": 4520 }, { "epoch": 5.2405998695935665, "grad_norm": 0.38688674569129944, "learning_rate": 2.4017241379310346e-05, "loss": 0.0908, "step": 4521 }, { "epoch": 5.241759037890314, "grad_norm": 0.3087342381477356, "learning_rate": 2.4011494252873564e-05, "loss": 0.0818, "step": 4522 }, { "epoch": 5.242918206187061, "grad_norm": 0.46722832322120667, "learning_rate": 2.4005747126436782e-05, "loss": 0.09, "step": 4523 }, { "epoch": 5.244077374483808, "grad_norm": 0.4601225256919861, "learning_rate": 2.4e-05, "loss": 0.1053, "step": 4524 }, { "epoch": 5.245236542780555, "grad_norm": 0.3325468599796295, "learning_rate": 2.399425287356322e-05, "loss": 0.0875, "step": 4525 }, { "epoch": 5.246395711077302, "grad_norm": 0.4317132830619812, "learning_rate": 2.3988505747126437e-05, "loss": 0.0961, "step": 4526 }, { "epoch": 5.247554879374049, "grad_norm": 0.3786173462867737, "learning_rate": 2.398275862068966e-05, "loss": 0.088, "step": 4527 }, { "epoch": 5.248714047670796, "grad_norm": 0.34360384941101074, "learning_rate": 2.3977011494252874e-05, "loss": 0.0882, "step": 4528 }, { "epoch": 5.249873215967543, "grad_norm": 0.3245190382003784, "learning_rate": 2.3971264367816092e-05, "loss": 0.0888, "step": 4529 }, { "epoch": 5.25103238426429, "grad_norm": 0.3508089780807495, "learning_rate": 2.3965517241379314e-05, "loss": 0.0838, "step": 4530 }, { "epoch": 5.252191552561038, "grad_norm": 0.7918804287910461, "learning_rate": 2.395977011494253e-05, "loss": 0.1023, "step": 4531 }, { "epoch": 5.253350720857784, "grad_norm": 0.5230761766433716, "learning_rate": 2.3954022988505747e-05, "loss": 0.0951, "step": 4532 }, { "epoch": 5.254509889154532, "grad_norm": 0.3835771381855011, "learning_rate": 2.3948275862068965e-05, "loss": 0.0938, "step": 4533 }, { "epoch": 5.2556690574512785, "grad_norm": 0.28895124793052673, "learning_rate": 2.3942528735632187e-05, "loss": 0.081, "step": 4534 }, { "epoch": 5.256828225748026, "grad_norm": 0.36572563648223877, "learning_rate": 2.3936781609195402e-05, "loss": 0.0896, "step": 4535 }, { "epoch": 5.257987394044773, "grad_norm": 0.4853745996952057, "learning_rate": 2.393103448275862e-05, "loss": 0.0952, "step": 4536 }, { "epoch": 5.25914656234152, "grad_norm": 0.37866243720054626, "learning_rate": 2.3925287356321842e-05, "loss": 0.0936, "step": 4537 }, { "epoch": 5.260305730638267, "grad_norm": 0.42627018690109253, "learning_rate": 2.391954022988506e-05, "loss": 0.0868, "step": 4538 }, { "epoch": 5.261464898935014, "grad_norm": 0.3555832803249359, "learning_rate": 2.3913793103448275e-05, "loss": 0.0904, "step": 4539 }, { "epoch": 5.262624067231761, "grad_norm": 0.4143363833427429, "learning_rate": 2.3908045977011497e-05, "loss": 0.0939, "step": 4540 }, { "epoch": 5.263783235528508, "grad_norm": 0.3686874806880951, "learning_rate": 2.3902298850574715e-05, "loss": 0.0899, "step": 4541 }, { "epoch": 5.2649424038252555, "grad_norm": 0.34536075592041016, "learning_rate": 2.3896551724137933e-05, "loss": 0.0869, "step": 4542 }, { "epoch": 5.266101572122002, "grad_norm": 0.6285473704338074, "learning_rate": 2.3890804597701148e-05, "loss": 0.1043, "step": 4543 }, { "epoch": 5.26726074041875, "grad_norm": 0.4580352306365967, "learning_rate": 2.388505747126437e-05, "loss": 0.0931, "step": 4544 }, { "epoch": 5.268419908715496, "grad_norm": 0.3974003493785858, "learning_rate": 2.3879310344827588e-05, "loss": 0.0933, "step": 4545 }, { "epoch": 5.269579077012244, "grad_norm": 0.4980361759662628, "learning_rate": 2.3873563218390806e-05, "loss": 0.095, "step": 4546 }, { "epoch": 5.270738245308991, "grad_norm": 0.3423739969730377, "learning_rate": 2.3867816091954025e-05, "loss": 0.0922, "step": 4547 }, { "epoch": 5.271897413605738, "grad_norm": 0.47561824321746826, "learning_rate": 2.3862068965517243e-05, "loss": 0.0896, "step": 4548 }, { "epoch": 5.273056581902485, "grad_norm": 0.36675018072128296, "learning_rate": 2.385632183908046e-05, "loss": 0.0984, "step": 4549 }, { "epoch": 5.274215750199232, "grad_norm": 0.507520854473114, "learning_rate": 2.385057471264368e-05, "loss": 0.092, "step": 4550 }, { "epoch": 5.275374918495979, "grad_norm": 0.43633562326431274, "learning_rate": 2.3844827586206898e-05, "loss": 0.0979, "step": 4551 }, { "epoch": 5.276534086792727, "grad_norm": 0.41792401671409607, "learning_rate": 2.3839080459770116e-05, "loss": 0.0897, "step": 4552 }, { "epoch": 5.277693255089473, "grad_norm": 0.44738197326660156, "learning_rate": 2.3833333333333334e-05, "loss": 0.0915, "step": 4553 }, { "epoch": 5.278852423386221, "grad_norm": 0.49772951006889343, "learning_rate": 2.3827586206896553e-05, "loss": 0.0982, "step": 4554 }, { "epoch": 5.2800115916829675, "grad_norm": 0.5207464098930359, "learning_rate": 2.382183908045977e-05, "loss": 0.0929, "step": 4555 }, { "epoch": 5.281170759979714, "grad_norm": 0.8150321841239929, "learning_rate": 2.381609195402299e-05, "loss": 0.1011, "step": 4556 }, { "epoch": 5.282329928276462, "grad_norm": 0.4668427109718323, "learning_rate": 2.3810344827586208e-05, "loss": 0.0894, "step": 4557 }, { "epoch": 5.283489096573208, "grad_norm": 0.42127272486686707, "learning_rate": 2.3804597701149426e-05, "loss": 0.0844, "step": 4558 }, { "epoch": 5.284648264869956, "grad_norm": 0.6911650896072388, "learning_rate": 2.3798850574712644e-05, "loss": 0.1004, "step": 4559 }, { "epoch": 5.285807433166703, "grad_norm": 0.37736475467681885, "learning_rate": 2.3793103448275862e-05, "loss": 0.0964, "step": 4560 }, { "epoch": 5.28696660146345, "grad_norm": 0.4046756327152252, "learning_rate": 2.3787356321839084e-05, "loss": 0.0899, "step": 4561 }, { "epoch": 5.288125769760197, "grad_norm": 0.3828308582305908, "learning_rate": 2.37816091954023e-05, "loss": 0.1046, "step": 4562 }, { "epoch": 5.289284938056944, "grad_norm": 0.3368671238422394, "learning_rate": 2.3775862068965517e-05, "loss": 0.0895, "step": 4563 }, { "epoch": 5.290444106353691, "grad_norm": 0.3442758321762085, "learning_rate": 2.3770114942528736e-05, "loss": 0.0895, "step": 4564 }, { "epoch": 5.291603274650439, "grad_norm": 0.4719181954860687, "learning_rate": 2.3764367816091957e-05, "loss": 0.1, "step": 4565 }, { "epoch": 5.292762442947185, "grad_norm": 0.37224331498146057, "learning_rate": 2.3758620689655172e-05, "loss": 0.1004, "step": 4566 }, { "epoch": 5.293921611243933, "grad_norm": 0.4549705982208252, "learning_rate": 2.375287356321839e-05, "loss": 0.0958, "step": 4567 }, { "epoch": 5.29508077954068, "grad_norm": 0.3962538242340088, "learning_rate": 2.3747126436781612e-05, "loss": 0.0984, "step": 4568 }, { "epoch": 5.296239947837426, "grad_norm": 0.42686334252357483, "learning_rate": 2.374137931034483e-05, "loss": 0.0917, "step": 4569 }, { "epoch": 5.297399116134174, "grad_norm": 0.4089353382587433, "learning_rate": 2.3735632183908045e-05, "loss": 0.0938, "step": 4570 }, { "epoch": 5.2985582844309205, "grad_norm": 0.43393051624298096, "learning_rate": 2.3729885057471267e-05, "loss": 0.0894, "step": 4571 }, { "epoch": 5.299717452727668, "grad_norm": 0.4832133650779724, "learning_rate": 2.3724137931034485e-05, "loss": 0.1016, "step": 4572 }, { "epoch": 5.300876621024415, "grad_norm": 0.4066667854785919, "learning_rate": 2.37183908045977e-05, "loss": 0.0954, "step": 4573 }, { "epoch": 5.302035789321162, "grad_norm": 0.40719154477119446, "learning_rate": 2.371264367816092e-05, "loss": 0.0924, "step": 4574 }, { "epoch": 5.303194957617909, "grad_norm": 0.38603299856185913, "learning_rate": 2.370689655172414e-05, "loss": 0.0946, "step": 4575 }, { "epoch": 5.3043541259146565, "grad_norm": 0.45713695883750916, "learning_rate": 2.370114942528736e-05, "loss": 0.0946, "step": 4576 }, { "epoch": 5.305513294211403, "grad_norm": 0.43153274059295654, "learning_rate": 2.3695402298850573e-05, "loss": 0.0941, "step": 4577 }, { "epoch": 5.306672462508151, "grad_norm": 0.5558722019195557, "learning_rate": 2.3689655172413795e-05, "loss": 0.0991, "step": 4578 }, { "epoch": 5.307831630804897, "grad_norm": 0.3364887833595276, "learning_rate": 2.3683908045977013e-05, "loss": 0.0903, "step": 4579 }, { "epoch": 5.308990799101645, "grad_norm": 0.39872419834136963, "learning_rate": 2.367816091954023e-05, "loss": 0.1015, "step": 4580 }, { "epoch": 5.310149967398392, "grad_norm": 0.4545663297176361, "learning_rate": 2.367241379310345e-05, "loss": 0.1017, "step": 4581 }, { "epoch": 5.311309135695138, "grad_norm": 0.4072364866733551, "learning_rate": 2.3666666666666668e-05, "loss": 0.0888, "step": 4582 }, { "epoch": 5.312468303991886, "grad_norm": 0.373927503824234, "learning_rate": 2.3660919540229886e-05, "loss": 0.0973, "step": 4583 }, { "epoch": 5.3136274722886325, "grad_norm": 0.35121285915374756, "learning_rate": 2.3655172413793105e-05, "loss": 0.0938, "step": 4584 }, { "epoch": 5.31478664058538, "grad_norm": 0.4112243950366974, "learning_rate": 2.3649425287356323e-05, "loss": 0.1031, "step": 4585 }, { "epoch": 5.315945808882127, "grad_norm": 0.4238256812095642, "learning_rate": 2.364367816091954e-05, "loss": 0.0929, "step": 4586 }, { "epoch": 5.317104977178874, "grad_norm": 0.531015157699585, "learning_rate": 2.363793103448276e-05, "loss": 0.0988, "step": 4587 }, { "epoch": 5.318264145475621, "grad_norm": 0.4540468752384186, "learning_rate": 2.3632183908045978e-05, "loss": 0.1005, "step": 4588 }, { "epoch": 5.3194233137723685, "grad_norm": 0.30282753705978394, "learning_rate": 2.3626436781609196e-05, "loss": 0.0845, "step": 4589 }, { "epoch": 5.320582482069115, "grad_norm": 0.3676490783691406, "learning_rate": 2.3620689655172415e-05, "loss": 0.0846, "step": 4590 }, { "epoch": 5.321741650365863, "grad_norm": 0.4564397931098938, "learning_rate": 2.3614942528735633e-05, "loss": 0.086, "step": 4591 }, { "epoch": 5.3229008186626094, "grad_norm": 0.3975692689418793, "learning_rate": 2.360919540229885e-05, "loss": 0.0934, "step": 4592 }, { "epoch": 5.324059986959357, "grad_norm": 0.35405778884887695, "learning_rate": 2.360344827586207e-05, "loss": 0.0997, "step": 4593 }, { "epoch": 5.325219155256104, "grad_norm": 0.3428707718849182, "learning_rate": 2.3597701149425288e-05, "loss": 0.0859, "step": 4594 }, { "epoch": 5.326378323552851, "grad_norm": 0.41354939341545105, "learning_rate": 2.359195402298851e-05, "loss": 0.0982, "step": 4595 }, { "epoch": 5.327537491849598, "grad_norm": 0.5002196431159973, "learning_rate": 2.3586206896551724e-05, "loss": 0.0968, "step": 4596 }, { "epoch": 5.3286966601463455, "grad_norm": 0.40941765904426575, "learning_rate": 2.3580459770114943e-05, "loss": 0.0923, "step": 4597 }, { "epoch": 5.329855828443092, "grad_norm": 0.37814703583717346, "learning_rate": 2.357471264367816e-05, "loss": 0.0916, "step": 4598 }, { "epoch": 5.331014996739839, "grad_norm": 0.3758867681026459, "learning_rate": 2.3568965517241383e-05, "loss": 0.0851, "step": 4599 }, { "epoch": 5.332174165036586, "grad_norm": 0.48879507184028625, "learning_rate": 2.3563218390804597e-05, "loss": 0.1009, "step": 4600 }, { "epoch": 5.333333333333333, "grad_norm": 0.6215201616287231, "learning_rate": 2.3557471264367816e-05, "loss": 0.098, "step": 4601 }, { "epoch": 5.334492501630081, "grad_norm": 0.48262956738471985, "learning_rate": 2.3551724137931037e-05, "loss": 0.0904, "step": 4602 }, { "epoch": 5.335651669926827, "grad_norm": 0.40028294920921326, "learning_rate": 2.3545977011494256e-05, "loss": 0.0851, "step": 4603 }, { "epoch": 5.336810838223575, "grad_norm": 0.47819679975509644, "learning_rate": 2.354022988505747e-05, "loss": 0.096, "step": 4604 }, { "epoch": 5.3379700065203215, "grad_norm": 0.36950936913490295, "learning_rate": 2.3534482758620692e-05, "loss": 0.0909, "step": 4605 }, { "epoch": 5.339129174817069, "grad_norm": 0.42461180686950684, "learning_rate": 2.352873563218391e-05, "loss": 0.0992, "step": 4606 }, { "epoch": 5.340288343113816, "grad_norm": 0.3869633674621582, "learning_rate": 2.352298850574713e-05, "loss": 0.0915, "step": 4607 }, { "epoch": 5.341447511410563, "grad_norm": 0.36021578311920166, "learning_rate": 2.3517241379310344e-05, "loss": 0.0952, "step": 4608 }, { "epoch": 5.34260667970731, "grad_norm": 0.3799790143966675, "learning_rate": 2.3511494252873565e-05, "loss": 0.0881, "step": 4609 }, { "epoch": 5.3437658480040575, "grad_norm": 0.4257470667362213, "learning_rate": 2.3505747126436784e-05, "loss": 0.1011, "step": 4610 }, { "epoch": 5.344925016300804, "grad_norm": 0.5010349154472351, "learning_rate": 2.35e-05, "loss": 0.0897, "step": 4611 }, { "epoch": 5.346084184597551, "grad_norm": 0.3810718357563019, "learning_rate": 2.349425287356322e-05, "loss": 0.0969, "step": 4612 }, { "epoch": 5.347243352894298, "grad_norm": 0.4444500505924225, "learning_rate": 2.348850574712644e-05, "loss": 0.0941, "step": 4613 }, { "epoch": 5.348402521191045, "grad_norm": 0.5119409561157227, "learning_rate": 2.3482758620689657e-05, "loss": 0.1, "step": 4614 }, { "epoch": 5.349561689487793, "grad_norm": 0.31901487708091736, "learning_rate": 2.3477011494252875e-05, "loss": 0.0996, "step": 4615 }, { "epoch": 5.350720857784539, "grad_norm": 0.33912405371665955, "learning_rate": 2.3471264367816093e-05, "loss": 0.0938, "step": 4616 }, { "epoch": 5.351880026081287, "grad_norm": 0.4147634208202362, "learning_rate": 2.3465517241379312e-05, "loss": 0.0975, "step": 4617 }, { "epoch": 5.3530391943780335, "grad_norm": 0.33564382791519165, "learning_rate": 2.345977011494253e-05, "loss": 0.0827, "step": 4618 }, { "epoch": 5.354198362674781, "grad_norm": 0.47823843359947205, "learning_rate": 2.345402298850575e-05, "loss": 0.1004, "step": 4619 }, { "epoch": 5.355357530971528, "grad_norm": 0.43068960309028625, "learning_rate": 2.3448275862068967e-05, "loss": 0.0898, "step": 4620 }, { "epoch": 5.356516699268275, "grad_norm": 0.4093489646911621, "learning_rate": 2.3442528735632185e-05, "loss": 0.099, "step": 4621 }, { "epoch": 5.357675867565022, "grad_norm": 0.4601415991783142, "learning_rate": 2.3436781609195403e-05, "loss": 0.0959, "step": 4622 }, { "epoch": 5.35883503586177, "grad_norm": 0.42657336592674255, "learning_rate": 2.343103448275862e-05, "loss": 0.0862, "step": 4623 }, { "epoch": 5.359994204158516, "grad_norm": 0.41843754053115845, "learning_rate": 2.342528735632184e-05, "loss": 0.0896, "step": 4624 }, { "epoch": 5.361153372455263, "grad_norm": 0.4212128818035126, "learning_rate": 2.3419540229885058e-05, "loss": 0.0981, "step": 4625 }, { "epoch": 5.3623125407520105, "grad_norm": 0.3441298305988312, "learning_rate": 2.341379310344828e-05, "loss": 0.0958, "step": 4626 }, { "epoch": 5.363471709048757, "grad_norm": 0.398131787776947, "learning_rate": 2.3408045977011495e-05, "loss": 0.0944, "step": 4627 }, { "epoch": 5.364630877345505, "grad_norm": 0.5075594186782837, "learning_rate": 2.3402298850574713e-05, "loss": 0.0958, "step": 4628 }, { "epoch": 5.365790045642251, "grad_norm": 0.3779784142971039, "learning_rate": 2.339655172413793e-05, "loss": 0.1002, "step": 4629 }, { "epoch": 5.366949213938999, "grad_norm": 0.39865589141845703, "learning_rate": 2.339080459770115e-05, "loss": 0.0853, "step": 4630 }, { "epoch": 5.368108382235746, "grad_norm": 0.7066544890403748, "learning_rate": 2.3385057471264368e-05, "loss": 0.0903, "step": 4631 }, { "epoch": 5.369267550532493, "grad_norm": 0.3441803455352783, "learning_rate": 2.3379310344827586e-05, "loss": 0.084, "step": 4632 }, { "epoch": 5.37042671882924, "grad_norm": 0.4163109362125397, "learning_rate": 2.3373563218390808e-05, "loss": 0.095, "step": 4633 }, { "epoch": 5.371585887125987, "grad_norm": 0.43912622332572937, "learning_rate": 2.3367816091954023e-05, "loss": 0.0922, "step": 4634 }, { "epoch": 5.372745055422734, "grad_norm": 0.304113507270813, "learning_rate": 2.336206896551724e-05, "loss": 0.0875, "step": 4635 }, { "epoch": 5.373904223719482, "grad_norm": 0.3352859318256378, "learning_rate": 2.3356321839080463e-05, "loss": 0.0966, "step": 4636 }, { "epoch": 5.375063392016228, "grad_norm": 0.4194989800453186, "learning_rate": 2.335057471264368e-05, "loss": 0.1005, "step": 4637 }, { "epoch": 5.376222560312976, "grad_norm": 0.3426196873188019, "learning_rate": 2.3344827586206896e-05, "loss": 0.0988, "step": 4638 }, { "epoch": 5.3773817286097225, "grad_norm": 0.35888779163360596, "learning_rate": 2.3339080459770114e-05, "loss": 0.0897, "step": 4639 }, { "epoch": 5.378540896906469, "grad_norm": 0.35968539118766785, "learning_rate": 2.3333333333333336e-05, "loss": 0.0873, "step": 4640 }, { "epoch": 5.379700065203217, "grad_norm": 0.4251978099346161, "learning_rate": 2.3327586206896554e-05, "loss": 0.0904, "step": 4641 }, { "epoch": 5.380859233499963, "grad_norm": 0.5857329964637756, "learning_rate": 2.332183908045977e-05, "loss": 0.0879, "step": 4642 }, { "epoch": 5.382018401796711, "grad_norm": 0.5313498377799988, "learning_rate": 2.331609195402299e-05, "loss": 0.0975, "step": 4643 }, { "epoch": 5.383177570093458, "grad_norm": 0.4486619234085083, "learning_rate": 2.331034482758621e-05, "loss": 0.09, "step": 4644 }, { "epoch": 5.384336738390205, "grad_norm": 0.40548866987228394, "learning_rate": 2.3304597701149427e-05, "loss": 0.0929, "step": 4645 }, { "epoch": 5.385495906686952, "grad_norm": 0.5880759954452515, "learning_rate": 2.3298850574712646e-05, "loss": 0.0945, "step": 4646 }, { "epoch": 5.386655074983699, "grad_norm": 0.5098415017127991, "learning_rate": 2.3293103448275864e-05, "loss": 0.0874, "step": 4647 }, { "epoch": 5.387814243280446, "grad_norm": 0.42802801728248596, "learning_rate": 2.3287356321839082e-05, "loss": 0.0853, "step": 4648 }, { "epoch": 5.388973411577194, "grad_norm": 0.5465481281280518, "learning_rate": 2.32816091954023e-05, "loss": 0.0944, "step": 4649 }, { "epoch": 5.39013257987394, "grad_norm": 0.4456057548522949, "learning_rate": 2.327586206896552e-05, "loss": 0.1031, "step": 4650 }, { "epoch": 5.391291748170688, "grad_norm": 0.36883285641670227, "learning_rate": 2.3270114942528737e-05, "loss": 0.0992, "step": 4651 }, { "epoch": 5.392450916467435, "grad_norm": 0.39192837476730347, "learning_rate": 2.3264367816091955e-05, "loss": 0.0943, "step": 4652 }, { "epoch": 5.393610084764182, "grad_norm": 0.36094704270362854, "learning_rate": 2.3258620689655174e-05, "loss": 0.0896, "step": 4653 }, { "epoch": 5.394769253060929, "grad_norm": 0.8563114404678345, "learning_rate": 2.3252873563218392e-05, "loss": 0.0834, "step": 4654 }, { "epoch": 5.3959284213576755, "grad_norm": 0.4293108880519867, "learning_rate": 2.324712643678161e-05, "loss": 0.1027, "step": 4655 }, { "epoch": 5.397087589654423, "grad_norm": 0.5573574900627136, "learning_rate": 2.324137931034483e-05, "loss": 0.0978, "step": 4656 }, { "epoch": 5.39824675795117, "grad_norm": 0.3921107351779938, "learning_rate": 2.3235632183908047e-05, "loss": 0.09, "step": 4657 }, { "epoch": 5.399405926247917, "grad_norm": 0.41322219371795654, "learning_rate": 2.3229885057471265e-05, "loss": 0.0815, "step": 4658 }, { "epoch": 5.400565094544664, "grad_norm": 0.41785478591918945, "learning_rate": 2.3224137931034483e-05, "loss": 0.0907, "step": 4659 }, { "epoch": 5.4017242628414115, "grad_norm": 0.39730241894721985, "learning_rate": 2.3218390804597705e-05, "loss": 0.098, "step": 4660 }, { "epoch": 5.402883431138158, "grad_norm": 0.46797794103622437, "learning_rate": 2.321264367816092e-05, "loss": 0.0967, "step": 4661 }, { "epoch": 5.404042599434906, "grad_norm": 0.42924076318740845, "learning_rate": 2.3206896551724138e-05, "loss": 0.0872, "step": 4662 }, { "epoch": 5.405201767731652, "grad_norm": 0.4479629695415497, "learning_rate": 2.3201149425287356e-05, "loss": 0.0882, "step": 4663 }, { "epoch": 5.4063609360284, "grad_norm": 0.32181426882743835, "learning_rate": 2.3195402298850578e-05, "loss": 0.0851, "step": 4664 }, { "epoch": 5.407520104325147, "grad_norm": 0.44183018803596497, "learning_rate": 2.3189655172413793e-05, "loss": 0.0861, "step": 4665 }, { "epoch": 5.408679272621894, "grad_norm": 0.4371998608112335, "learning_rate": 2.318390804597701e-05, "loss": 0.0945, "step": 4666 }, { "epoch": 5.409838440918641, "grad_norm": 0.3353933095932007, "learning_rate": 2.3178160919540233e-05, "loss": 0.0867, "step": 4667 }, { "epoch": 5.4109976092153875, "grad_norm": 0.35332971811294556, "learning_rate": 2.317241379310345e-05, "loss": 0.0795, "step": 4668 }, { "epoch": 5.412156777512135, "grad_norm": 0.3787877857685089, "learning_rate": 2.3166666666666666e-05, "loss": 0.0977, "step": 4669 }, { "epoch": 5.413315945808882, "grad_norm": 0.3937995135784149, "learning_rate": 2.3160919540229885e-05, "loss": 0.101, "step": 4670 }, { "epoch": 5.414475114105629, "grad_norm": 0.3958158493041992, "learning_rate": 2.3155172413793106e-05, "loss": 0.0962, "step": 4671 }, { "epoch": 5.415634282402376, "grad_norm": 0.5695372819900513, "learning_rate": 2.314942528735632e-05, "loss": 0.1024, "step": 4672 }, { "epoch": 5.4167934506991235, "grad_norm": 0.4417628049850464, "learning_rate": 2.314367816091954e-05, "loss": 0.0931, "step": 4673 }, { "epoch": 5.41795261899587, "grad_norm": 0.44972625374794006, "learning_rate": 2.313793103448276e-05, "loss": 0.1041, "step": 4674 }, { "epoch": 5.419111787292618, "grad_norm": 0.375545471906662, "learning_rate": 2.313218390804598e-05, "loss": 0.0948, "step": 4675 }, { "epoch": 5.4202709555893644, "grad_norm": 0.4629718065261841, "learning_rate": 2.3126436781609194e-05, "loss": 0.09, "step": 4676 }, { "epoch": 5.421430123886112, "grad_norm": 0.462091326713562, "learning_rate": 2.3120689655172416e-05, "loss": 0.0955, "step": 4677 }, { "epoch": 5.422589292182859, "grad_norm": 0.43810778856277466, "learning_rate": 2.3114942528735634e-05, "loss": 0.0946, "step": 4678 }, { "epoch": 5.423748460479606, "grad_norm": 0.48332980275154114, "learning_rate": 2.3109195402298853e-05, "loss": 0.097, "step": 4679 }, { "epoch": 5.424907628776353, "grad_norm": 0.4389806091785431, "learning_rate": 2.3103448275862067e-05, "loss": 0.0881, "step": 4680 }, { "epoch": 5.4260667970731, "grad_norm": 0.4335387349128723, "learning_rate": 2.309770114942529e-05, "loss": 0.0946, "step": 4681 }, { "epoch": 5.427225965369847, "grad_norm": 0.4705444872379303, "learning_rate": 2.3091954022988507e-05, "loss": 0.0989, "step": 4682 }, { "epoch": 5.428385133666594, "grad_norm": 0.4291810989379883, "learning_rate": 2.3086206896551726e-05, "loss": 0.0942, "step": 4683 }, { "epoch": 5.429544301963341, "grad_norm": 0.6216928958892822, "learning_rate": 2.3080459770114944e-05, "loss": 0.096, "step": 4684 }, { "epoch": 5.430703470260088, "grad_norm": 0.3836531937122345, "learning_rate": 2.3074712643678162e-05, "loss": 0.088, "step": 4685 }, { "epoch": 5.431862638556836, "grad_norm": 0.46026742458343506, "learning_rate": 2.306896551724138e-05, "loss": 0.0907, "step": 4686 }, { "epoch": 5.433021806853582, "grad_norm": 0.45022183656692505, "learning_rate": 2.30632183908046e-05, "loss": 0.0981, "step": 4687 }, { "epoch": 5.43418097515033, "grad_norm": 0.3708306550979614, "learning_rate": 2.3057471264367817e-05, "loss": 0.0984, "step": 4688 }, { "epoch": 5.4353401434470765, "grad_norm": 0.3784964084625244, "learning_rate": 2.3051724137931035e-05, "loss": 0.0918, "step": 4689 }, { "epoch": 5.436499311743824, "grad_norm": 0.6409464478492737, "learning_rate": 2.3045977011494254e-05, "loss": 0.0958, "step": 4690 }, { "epoch": 5.437658480040571, "grad_norm": 0.5641512274742126, "learning_rate": 2.3040229885057472e-05, "loss": 0.0927, "step": 4691 }, { "epoch": 5.438817648337318, "grad_norm": 0.42215609550476074, "learning_rate": 2.303448275862069e-05, "loss": 0.0903, "step": 4692 }, { "epoch": 5.439976816634065, "grad_norm": 0.5272184610366821, "learning_rate": 2.302873563218391e-05, "loss": 0.0919, "step": 4693 }, { "epoch": 5.4411359849308125, "grad_norm": 0.37865811586380005, "learning_rate": 2.3022988505747127e-05, "loss": 0.0928, "step": 4694 }, { "epoch": 5.442295153227559, "grad_norm": 0.48235297203063965, "learning_rate": 2.3017241379310345e-05, "loss": 0.0885, "step": 4695 }, { "epoch": 5.443454321524307, "grad_norm": 0.3501775562763214, "learning_rate": 2.3011494252873563e-05, "loss": 0.0847, "step": 4696 }, { "epoch": 5.444613489821053, "grad_norm": 0.5579177141189575, "learning_rate": 2.3005747126436782e-05, "loss": 0.1053, "step": 4697 }, { "epoch": 5.4457726581178, "grad_norm": 0.423701673746109, "learning_rate": 2.3000000000000003e-05, "loss": 0.0932, "step": 4698 }, { "epoch": 5.446931826414548, "grad_norm": 0.4396893084049225, "learning_rate": 2.299425287356322e-05, "loss": 0.0968, "step": 4699 }, { "epoch": 5.448090994711294, "grad_norm": 0.3700418472290039, "learning_rate": 2.2988505747126437e-05, "loss": 0.0908, "step": 4700 }, { "epoch": 5.449250163008042, "grad_norm": 0.3896975815296173, "learning_rate": 2.2982758620689658e-05, "loss": 0.0947, "step": 4701 }, { "epoch": 5.4504093313047886, "grad_norm": 0.3217882812023163, "learning_rate": 2.2977011494252877e-05, "loss": 0.0881, "step": 4702 }, { "epoch": 5.451568499601536, "grad_norm": 0.47387391328811646, "learning_rate": 2.297126436781609e-05, "loss": 0.1008, "step": 4703 }, { "epoch": 5.452727667898283, "grad_norm": 0.2585291862487793, "learning_rate": 2.296551724137931e-05, "loss": 0.0952, "step": 4704 }, { "epoch": 5.45388683619503, "grad_norm": 0.4016747772693634, "learning_rate": 2.295977011494253e-05, "loss": 0.0852, "step": 4705 }, { "epoch": 5.455046004491777, "grad_norm": 0.29344117641448975, "learning_rate": 2.295402298850575e-05, "loss": 0.0946, "step": 4706 }, { "epoch": 5.456205172788525, "grad_norm": 0.334484338760376, "learning_rate": 2.2948275862068965e-05, "loss": 0.088, "step": 4707 }, { "epoch": 5.457364341085271, "grad_norm": 0.5377522706985474, "learning_rate": 2.2942528735632186e-05, "loss": 0.0911, "step": 4708 }, { "epoch": 5.458523509382019, "grad_norm": 0.3403340280056, "learning_rate": 2.2936781609195405e-05, "loss": 0.0961, "step": 4709 }, { "epoch": 5.4596826776787655, "grad_norm": 0.370523601770401, "learning_rate": 2.293103448275862e-05, "loss": 0.0951, "step": 4710 }, { "epoch": 5.460841845975512, "grad_norm": 0.3723559081554413, "learning_rate": 2.292528735632184e-05, "loss": 0.0967, "step": 4711 }, { "epoch": 5.46200101427226, "grad_norm": 0.4718587398529053, "learning_rate": 2.291954022988506e-05, "loss": 0.0917, "step": 4712 }, { "epoch": 5.463160182569006, "grad_norm": 0.3738349974155426, "learning_rate": 2.2913793103448278e-05, "loss": 0.0973, "step": 4713 }, { "epoch": 5.464319350865754, "grad_norm": 0.4696074426174164, "learning_rate": 2.2908045977011493e-05, "loss": 0.095, "step": 4714 }, { "epoch": 5.465478519162501, "grad_norm": 0.39924389123916626, "learning_rate": 2.2902298850574714e-05, "loss": 0.0899, "step": 4715 }, { "epoch": 5.466637687459248, "grad_norm": 0.4714980125427246, "learning_rate": 2.2896551724137933e-05, "loss": 0.0892, "step": 4716 }, { "epoch": 5.467796855755995, "grad_norm": 0.3208000361919403, "learning_rate": 2.289080459770115e-05, "loss": 0.0917, "step": 4717 }, { "epoch": 5.468956024052742, "grad_norm": 0.5692875385284424, "learning_rate": 2.288505747126437e-05, "loss": 0.0917, "step": 4718 }, { "epoch": 5.470115192349489, "grad_norm": 0.5145659446716309, "learning_rate": 2.2879310344827587e-05, "loss": 0.0994, "step": 4719 }, { "epoch": 5.471274360646237, "grad_norm": 0.4095035791397095, "learning_rate": 2.2873563218390806e-05, "loss": 0.0857, "step": 4720 }, { "epoch": 5.472433528942983, "grad_norm": 0.38376545906066895, "learning_rate": 2.2867816091954024e-05, "loss": 0.0959, "step": 4721 }, { "epoch": 5.473592697239731, "grad_norm": 0.5485108494758606, "learning_rate": 2.2862068965517242e-05, "loss": 0.0857, "step": 4722 }, { "epoch": 5.4747518655364775, "grad_norm": 0.6178170442581177, "learning_rate": 2.285632183908046e-05, "loss": 0.1075, "step": 4723 }, { "epoch": 5.475911033833224, "grad_norm": 0.44804081320762634, "learning_rate": 2.285057471264368e-05, "loss": 0.0933, "step": 4724 }, { "epoch": 5.477070202129972, "grad_norm": 0.7503616213798523, "learning_rate": 2.2844827586206897e-05, "loss": 0.0994, "step": 4725 }, { "epoch": 5.478229370426718, "grad_norm": 0.4127942621707916, "learning_rate": 2.2839080459770116e-05, "loss": 0.0947, "step": 4726 }, { "epoch": 5.479388538723466, "grad_norm": 0.4280507266521454, "learning_rate": 2.2833333333333334e-05, "loss": 0.0837, "step": 4727 }, { "epoch": 5.480547707020213, "grad_norm": 0.41026708483695984, "learning_rate": 2.2827586206896552e-05, "loss": 0.0915, "step": 4728 }, { "epoch": 5.48170687531696, "grad_norm": 0.3558439016342163, "learning_rate": 2.2821839080459774e-05, "loss": 0.0843, "step": 4729 }, { "epoch": 5.482866043613707, "grad_norm": 0.3199481964111328, "learning_rate": 2.281609195402299e-05, "loss": 0.088, "step": 4730 }, { "epoch": 5.4840252119104544, "grad_norm": 0.3802203834056854, "learning_rate": 2.2810344827586207e-05, "loss": 0.0955, "step": 4731 }, { "epoch": 5.485184380207201, "grad_norm": 0.6358181238174438, "learning_rate": 2.280459770114943e-05, "loss": 0.1024, "step": 4732 }, { "epoch": 5.486343548503949, "grad_norm": 0.3913860619068146, "learning_rate": 2.2798850574712644e-05, "loss": 0.0922, "step": 4733 }, { "epoch": 5.487502716800695, "grad_norm": 0.4372271001338959, "learning_rate": 2.2793103448275862e-05, "loss": 0.0905, "step": 4734 }, { "epoch": 5.488661885097443, "grad_norm": 0.4437418282032013, "learning_rate": 2.278735632183908e-05, "loss": 0.0815, "step": 4735 }, { "epoch": 5.48982105339419, "grad_norm": 0.38980501890182495, "learning_rate": 2.2781609195402302e-05, "loss": 0.0888, "step": 4736 }, { "epoch": 5.490980221690937, "grad_norm": 0.6082010865211487, "learning_rate": 2.2775862068965517e-05, "loss": 0.1109, "step": 4737 }, { "epoch": 5.492139389987684, "grad_norm": 0.4929609000682831, "learning_rate": 2.2770114942528735e-05, "loss": 0.1048, "step": 4738 }, { "epoch": 5.493298558284431, "grad_norm": 0.3931887745857239, "learning_rate": 2.2764367816091957e-05, "loss": 0.087, "step": 4739 }, { "epoch": 5.494457726581178, "grad_norm": 0.4002040922641754, "learning_rate": 2.2758620689655175e-05, "loss": 0.0938, "step": 4740 }, { "epoch": 5.495616894877925, "grad_norm": 0.6055549383163452, "learning_rate": 2.275287356321839e-05, "loss": 0.1004, "step": 4741 }, { "epoch": 5.496776063174672, "grad_norm": 0.4150696098804474, "learning_rate": 2.274712643678161e-05, "loss": 0.0971, "step": 4742 }, { "epoch": 5.497935231471419, "grad_norm": 0.43481725454330444, "learning_rate": 2.274137931034483e-05, "loss": 0.0936, "step": 4743 }, { "epoch": 5.4990943997681665, "grad_norm": 0.4212779998779297, "learning_rate": 2.2735632183908048e-05, "loss": 0.0863, "step": 4744 }, { "epoch": 5.500253568064913, "grad_norm": 0.3982880711555481, "learning_rate": 2.2729885057471263e-05, "loss": 0.092, "step": 4745 }, { "epoch": 5.501412736361661, "grad_norm": 0.7657057642936707, "learning_rate": 2.2724137931034485e-05, "loss": 0.0915, "step": 4746 }, { "epoch": 5.502571904658407, "grad_norm": 0.47100207209587097, "learning_rate": 2.2718390804597703e-05, "loss": 0.0896, "step": 4747 }, { "epoch": 5.503731072955155, "grad_norm": 0.41695326566696167, "learning_rate": 2.271264367816092e-05, "loss": 0.1028, "step": 4748 }, { "epoch": 5.504890241251902, "grad_norm": 0.38724851608276367, "learning_rate": 2.270689655172414e-05, "loss": 0.0922, "step": 4749 }, { "epoch": 5.506049409548649, "grad_norm": 0.32382097840309143, "learning_rate": 2.2701149425287358e-05, "loss": 0.0894, "step": 4750 }, { "epoch": 5.507208577845396, "grad_norm": 0.7724230289459229, "learning_rate": 2.2695402298850576e-05, "loss": 0.0947, "step": 4751 }, { "epoch": 5.508367746142143, "grad_norm": 0.35658082365989685, "learning_rate": 2.2689655172413794e-05, "loss": 0.086, "step": 4752 }, { "epoch": 5.50952691443889, "grad_norm": 0.38609573245048523, "learning_rate": 2.2683908045977013e-05, "loss": 0.0961, "step": 4753 }, { "epoch": 5.510686082735637, "grad_norm": 0.37133869528770447, "learning_rate": 2.267816091954023e-05, "loss": 0.0916, "step": 4754 }, { "epoch": 5.511845251032384, "grad_norm": 0.34857529401779175, "learning_rate": 2.267241379310345e-05, "loss": 0.0952, "step": 4755 }, { "epoch": 5.513004419329131, "grad_norm": 0.3999149799346924, "learning_rate": 2.2666666666666668e-05, "loss": 0.1025, "step": 4756 }, { "epoch": 5.5141635876258785, "grad_norm": 0.38613250851631165, "learning_rate": 2.2660919540229886e-05, "loss": 0.0912, "step": 4757 }, { "epoch": 5.515322755922625, "grad_norm": 0.4043678939342499, "learning_rate": 2.2655172413793104e-05, "loss": 0.0991, "step": 4758 }, { "epoch": 5.516481924219373, "grad_norm": 0.460321843624115, "learning_rate": 2.2649425287356322e-05, "loss": 0.0861, "step": 4759 }, { "epoch": 5.5176410925161194, "grad_norm": 0.5290979146957397, "learning_rate": 2.264367816091954e-05, "loss": 0.0946, "step": 4760 }, { "epoch": 5.518800260812867, "grad_norm": 0.4141453802585602, "learning_rate": 2.263793103448276e-05, "loss": 0.0918, "step": 4761 }, { "epoch": 5.519959429109614, "grad_norm": 0.5237475633621216, "learning_rate": 2.2632183908045977e-05, "loss": 0.0966, "step": 4762 }, { "epoch": 5.521118597406361, "grad_norm": 0.392376571893692, "learning_rate": 2.26264367816092e-05, "loss": 0.1033, "step": 4763 }, { "epoch": 5.522277765703108, "grad_norm": 0.43914276361465454, "learning_rate": 2.2620689655172414e-05, "loss": 0.09, "step": 4764 }, { "epoch": 5.5234369339998555, "grad_norm": 0.5471196174621582, "learning_rate": 2.2614942528735632e-05, "loss": 0.0959, "step": 4765 }, { "epoch": 5.524596102296602, "grad_norm": 0.47647860646247864, "learning_rate": 2.2609195402298854e-05, "loss": 0.0975, "step": 4766 }, { "epoch": 5.525755270593349, "grad_norm": 0.5217294096946716, "learning_rate": 2.2603448275862072e-05, "loss": 0.1029, "step": 4767 }, { "epoch": 5.526914438890096, "grad_norm": 0.3615458309650421, "learning_rate": 2.2597701149425287e-05, "loss": 0.0837, "step": 4768 }, { "epoch": 5.528073607186844, "grad_norm": 0.3933470547199249, "learning_rate": 2.2591954022988505e-05, "loss": 0.0959, "step": 4769 }, { "epoch": 5.529232775483591, "grad_norm": 0.32372987270355225, "learning_rate": 2.2586206896551727e-05, "loss": 0.0963, "step": 4770 }, { "epoch": 5.530391943780337, "grad_norm": 0.3662286102771759, "learning_rate": 2.2580459770114942e-05, "loss": 0.103, "step": 4771 }, { "epoch": 5.531551112077085, "grad_norm": 0.5479937791824341, "learning_rate": 2.257471264367816e-05, "loss": 0.1051, "step": 4772 }, { "epoch": 5.5327102803738315, "grad_norm": 0.43857720494270325, "learning_rate": 2.2568965517241382e-05, "loss": 0.0945, "step": 4773 }, { "epoch": 5.533869448670579, "grad_norm": 0.45895788073539734, "learning_rate": 2.25632183908046e-05, "loss": 0.0961, "step": 4774 }, { "epoch": 5.535028616967326, "grad_norm": 0.3341083526611328, "learning_rate": 2.2557471264367815e-05, "loss": 0.0899, "step": 4775 }, { "epoch": 5.536187785264073, "grad_norm": 0.45365577936172485, "learning_rate": 2.2551724137931033e-05, "loss": 0.0899, "step": 4776 }, { "epoch": 5.53734695356082, "grad_norm": 0.3847201466560364, "learning_rate": 2.2545977011494255e-05, "loss": 0.1055, "step": 4777 }, { "epoch": 5.5385061218575675, "grad_norm": 0.570408821105957, "learning_rate": 2.2540229885057473e-05, "loss": 0.0844, "step": 4778 }, { "epoch": 5.539665290154314, "grad_norm": 0.4045121669769287, "learning_rate": 2.2534482758620688e-05, "loss": 0.0953, "step": 4779 }, { "epoch": 5.540824458451062, "grad_norm": 0.321380615234375, "learning_rate": 2.252873563218391e-05, "loss": 0.0908, "step": 4780 }, { "epoch": 5.541983626747808, "grad_norm": 0.40816688537597656, "learning_rate": 2.2522988505747128e-05, "loss": 0.0952, "step": 4781 }, { "epoch": 5.543142795044556, "grad_norm": 0.7444669604301453, "learning_rate": 2.2517241379310347e-05, "loss": 0.1046, "step": 4782 }, { "epoch": 5.544301963341303, "grad_norm": 0.41582420468330383, "learning_rate": 2.2511494252873565e-05, "loss": 0.094, "step": 4783 }, { "epoch": 5.545461131638049, "grad_norm": 0.2793687880039215, "learning_rate": 2.2505747126436783e-05, "loss": 0.0919, "step": 4784 }, { "epoch": 5.546620299934797, "grad_norm": 0.530275821685791, "learning_rate": 2.25e-05, "loss": 0.0953, "step": 4785 }, { "epoch": 5.5477794682315436, "grad_norm": 0.484336793422699, "learning_rate": 2.249425287356322e-05, "loss": 0.0929, "step": 4786 }, { "epoch": 5.548938636528291, "grad_norm": 0.4025559723377228, "learning_rate": 2.2488505747126438e-05, "loss": 0.0982, "step": 4787 }, { "epoch": 5.550097804825038, "grad_norm": 0.639001727104187, "learning_rate": 2.2482758620689656e-05, "loss": 0.0926, "step": 4788 }, { "epoch": 5.551256973121785, "grad_norm": 0.4068092107772827, "learning_rate": 2.2477011494252875e-05, "loss": 0.0983, "step": 4789 }, { "epoch": 5.552416141418532, "grad_norm": 0.4974704384803772, "learning_rate": 2.2471264367816093e-05, "loss": 0.0971, "step": 4790 }, { "epoch": 5.55357530971528, "grad_norm": 0.5406764149665833, "learning_rate": 2.246551724137931e-05, "loss": 0.0927, "step": 4791 }, { "epoch": 5.554734478012026, "grad_norm": 0.3397584855556488, "learning_rate": 2.245977011494253e-05, "loss": 0.0869, "step": 4792 }, { "epoch": 5.555893646308774, "grad_norm": 0.3097866475582123, "learning_rate": 2.2454022988505748e-05, "loss": 0.0877, "step": 4793 }, { "epoch": 5.5570528146055205, "grad_norm": 0.4206595718860626, "learning_rate": 2.2448275862068966e-05, "loss": 0.0911, "step": 4794 }, { "epoch": 5.558211982902268, "grad_norm": 0.2879236042499542, "learning_rate": 2.2442528735632184e-05, "loss": 0.1, "step": 4795 }, { "epoch": 5.559371151199015, "grad_norm": 0.3888731300830841, "learning_rate": 2.2436781609195403e-05, "loss": 0.0943, "step": 4796 }, { "epoch": 5.560530319495761, "grad_norm": 0.3849053382873535, "learning_rate": 2.2431034482758624e-05, "loss": 0.0925, "step": 4797 }, { "epoch": 5.561689487792509, "grad_norm": 0.464656263589859, "learning_rate": 2.242528735632184e-05, "loss": 0.1005, "step": 4798 }, { "epoch": 5.562848656089256, "grad_norm": 0.4027196764945984, "learning_rate": 2.2419540229885057e-05, "loss": 0.1009, "step": 4799 }, { "epoch": 5.564007824386003, "grad_norm": 0.47038623690605164, "learning_rate": 2.2413793103448276e-05, "loss": 0.0972, "step": 4800 }, { "epoch": 5.56516699268275, "grad_norm": 0.35763296484947205, "learning_rate": 2.2408045977011497e-05, "loss": 0.0836, "step": 4801 }, { "epoch": 5.566326160979497, "grad_norm": 0.4369265139102936, "learning_rate": 2.2402298850574712e-05, "loss": 0.0853, "step": 4802 }, { "epoch": 5.567485329276244, "grad_norm": 0.43006211519241333, "learning_rate": 2.239655172413793e-05, "loss": 0.0973, "step": 4803 }, { "epoch": 5.568644497572992, "grad_norm": 0.4107172191143036, "learning_rate": 2.2390804597701152e-05, "loss": 0.0965, "step": 4804 }, { "epoch": 5.569803665869738, "grad_norm": 0.3978167176246643, "learning_rate": 2.238505747126437e-05, "loss": 0.0923, "step": 4805 }, { "epoch": 5.570962834166486, "grad_norm": 0.49095186591148376, "learning_rate": 2.2379310344827586e-05, "loss": 0.0946, "step": 4806 }, { "epoch": 5.5721220024632325, "grad_norm": 0.32563725113868713, "learning_rate": 2.2373563218390807e-05, "loss": 0.092, "step": 4807 }, { "epoch": 5.57328117075998, "grad_norm": 0.42100727558135986, "learning_rate": 2.2367816091954025e-05, "loss": 0.0942, "step": 4808 }, { "epoch": 5.574440339056727, "grad_norm": 0.39792951941490173, "learning_rate": 2.236206896551724e-05, "loss": 0.09, "step": 4809 }, { "epoch": 5.575599507353473, "grad_norm": 0.5242913365364075, "learning_rate": 2.235632183908046e-05, "loss": 0.0943, "step": 4810 }, { "epoch": 5.576758675650221, "grad_norm": 0.5006229281425476, "learning_rate": 2.235057471264368e-05, "loss": 0.0987, "step": 4811 }, { "epoch": 5.5779178439469685, "grad_norm": 0.37062880396842957, "learning_rate": 2.23448275862069e-05, "loss": 0.0929, "step": 4812 }, { "epoch": 5.579077012243715, "grad_norm": 0.5827232003211975, "learning_rate": 2.2339080459770114e-05, "loss": 0.1032, "step": 4813 }, { "epoch": 5.580236180540462, "grad_norm": 0.30347374081611633, "learning_rate": 2.2333333333333335e-05, "loss": 0.0965, "step": 4814 }, { "epoch": 5.5813953488372094, "grad_norm": 0.5079871416091919, "learning_rate": 2.2327586206896554e-05, "loss": 0.0942, "step": 4815 }, { "epoch": 5.582554517133956, "grad_norm": 0.3671127259731293, "learning_rate": 2.2321839080459772e-05, "loss": 0.0885, "step": 4816 }, { "epoch": 5.583713685430704, "grad_norm": 0.4037294089794159, "learning_rate": 2.231609195402299e-05, "loss": 0.0965, "step": 4817 }, { "epoch": 5.58487285372745, "grad_norm": 0.44176700711250305, "learning_rate": 2.231034482758621e-05, "loss": 0.0977, "step": 4818 }, { "epoch": 5.586032022024198, "grad_norm": 0.5387149453163147, "learning_rate": 2.2304597701149427e-05, "loss": 0.0961, "step": 4819 }, { "epoch": 5.587191190320945, "grad_norm": 0.4820229411125183, "learning_rate": 2.2298850574712645e-05, "loss": 0.1048, "step": 4820 }, { "epoch": 5.588350358617692, "grad_norm": 0.46356868743896484, "learning_rate": 2.2293103448275863e-05, "loss": 0.0917, "step": 4821 }, { "epoch": 5.589509526914439, "grad_norm": 0.3790029287338257, "learning_rate": 2.228735632183908e-05, "loss": 0.0897, "step": 4822 }, { "epoch": 5.5906686952111855, "grad_norm": 0.4080239534378052, "learning_rate": 2.22816091954023e-05, "loss": 0.0932, "step": 4823 }, { "epoch": 5.591827863507933, "grad_norm": 0.5119190216064453, "learning_rate": 2.2275862068965518e-05, "loss": 0.099, "step": 4824 }, { "epoch": 5.592987031804681, "grad_norm": 0.40316665172576904, "learning_rate": 2.2270114942528736e-05, "loss": 0.1001, "step": 4825 }, { "epoch": 5.594146200101427, "grad_norm": 0.3797335922718048, "learning_rate": 2.2264367816091955e-05, "loss": 0.0963, "step": 4826 }, { "epoch": 5.595305368398174, "grad_norm": 0.4197828471660614, "learning_rate": 2.2258620689655173e-05, "loss": 0.0921, "step": 4827 }, { "epoch": 5.5964645366949215, "grad_norm": 0.41356906294822693, "learning_rate": 2.2252873563218395e-05, "loss": 0.0948, "step": 4828 }, { "epoch": 5.597623704991668, "grad_norm": 0.5795599818229675, "learning_rate": 2.224712643678161e-05, "loss": 0.0966, "step": 4829 }, { "epoch": 5.598782873288416, "grad_norm": 0.3532811999320984, "learning_rate": 2.2241379310344828e-05, "loss": 0.0926, "step": 4830 }, { "epoch": 5.599942041585162, "grad_norm": 0.32101044058799744, "learning_rate": 2.2235632183908046e-05, "loss": 0.0955, "step": 4831 }, { "epoch": 5.60110120988191, "grad_norm": 0.3435097634792328, "learning_rate": 2.2229885057471264e-05, "loss": 0.1026, "step": 4832 }, { "epoch": 5.602260378178657, "grad_norm": 0.42557913064956665, "learning_rate": 2.2224137931034483e-05, "loss": 0.0857, "step": 4833 }, { "epoch": 5.603419546475404, "grad_norm": 0.36306434869766235, "learning_rate": 2.22183908045977e-05, "loss": 0.0925, "step": 4834 }, { "epoch": 5.604578714772151, "grad_norm": 0.45969343185424805, "learning_rate": 2.2212643678160923e-05, "loss": 0.0987, "step": 4835 }, { "epoch": 5.605737883068898, "grad_norm": 0.39553216099739075, "learning_rate": 2.2206896551724138e-05, "loss": 0.0932, "step": 4836 }, { "epoch": 5.606897051365645, "grad_norm": 0.6770229339599609, "learning_rate": 2.2201149425287356e-05, "loss": 0.0989, "step": 4837 }, { "epoch": 5.608056219662393, "grad_norm": 0.5006446838378906, "learning_rate": 2.2195402298850578e-05, "loss": 0.0959, "step": 4838 }, { "epoch": 5.609215387959139, "grad_norm": 0.33049294352531433, "learning_rate": 2.2189655172413796e-05, "loss": 0.0861, "step": 4839 }, { "epoch": 5.610374556255886, "grad_norm": 0.5273866653442383, "learning_rate": 2.218390804597701e-05, "loss": 0.1108, "step": 4840 }, { "epoch": 5.6115337245526336, "grad_norm": 0.45553502440452576, "learning_rate": 2.217816091954023e-05, "loss": 0.0959, "step": 4841 }, { "epoch": 5.61269289284938, "grad_norm": 0.3138861358165741, "learning_rate": 2.217241379310345e-05, "loss": 0.086, "step": 4842 }, { "epoch": 5.613852061146128, "grad_norm": 0.5423036217689514, "learning_rate": 2.216666666666667e-05, "loss": 0.102, "step": 4843 }, { "epoch": 5.6150112294428745, "grad_norm": 0.5598729848861694, "learning_rate": 2.2160919540229884e-05, "loss": 0.0991, "step": 4844 }, { "epoch": 5.616170397739622, "grad_norm": 0.5099043846130371, "learning_rate": 2.2155172413793106e-05, "loss": 0.0908, "step": 4845 }, { "epoch": 5.617329566036369, "grad_norm": 0.40814903378486633, "learning_rate": 2.2149425287356324e-05, "loss": 0.0911, "step": 4846 }, { "epoch": 5.618488734333116, "grad_norm": 0.3231344223022461, "learning_rate": 2.2143678160919542e-05, "loss": 0.0867, "step": 4847 }, { "epoch": 5.619647902629863, "grad_norm": 0.3704584538936615, "learning_rate": 2.213793103448276e-05, "loss": 0.0857, "step": 4848 }, { "epoch": 5.6208070709266105, "grad_norm": 0.49821195006370544, "learning_rate": 2.213218390804598e-05, "loss": 0.1002, "step": 4849 }, { "epoch": 5.621966239223357, "grad_norm": 0.35286957025527954, "learning_rate": 2.2126436781609197e-05, "loss": 0.0895, "step": 4850 }, { "epoch": 5.623125407520105, "grad_norm": 0.46417170763015747, "learning_rate": 2.2120689655172412e-05, "loss": 0.0919, "step": 4851 }, { "epoch": 5.624284575816851, "grad_norm": 0.4302671253681183, "learning_rate": 2.2114942528735634e-05, "loss": 0.0954, "step": 4852 }, { "epoch": 5.625443744113598, "grad_norm": 0.3605821132659912, "learning_rate": 2.2109195402298852e-05, "loss": 0.0913, "step": 4853 }, { "epoch": 5.626602912410346, "grad_norm": 0.4404606223106384, "learning_rate": 2.210344827586207e-05, "loss": 0.0899, "step": 4854 }, { "epoch": 5.627762080707092, "grad_norm": 0.4587152898311615, "learning_rate": 2.209770114942529e-05, "loss": 0.0897, "step": 4855 }, { "epoch": 5.62892124900384, "grad_norm": 0.4051382839679718, "learning_rate": 2.2091954022988507e-05, "loss": 0.1017, "step": 4856 }, { "epoch": 5.6300804173005865, "grad_norm": 0.4511359930038452, "learning_rate": 2.2086206896551725e-05, "loss": 0.0966, "step": 4857 }, { "epoch": 5.631239585597334, "grad_norm": 0.43057432770729065, "learning_rate": 2.2080459770114943e-05, "loss": 0.1004, "step": 4858 }, { "epoch": 5.632398753894081, "grad_norm": 0.41975724697113037, "learning_rate": 2.207471264367816e-05, "loss": 0.1025, "step": 4859 }, { "epoch": 5.633557922190828, "grad_norm": 0.4029335677623749, "learning_rate": 2.206896551724138e-05, "loss": 0.0968, "step": 4860 }, { "epoch": 5.634717090487575, "grad_norm": 0.296252965927124, "learning_rate": 2.2063218390804598e-05, "loss": 0.0929, "step": 4861 }, { "epoch": 5.6358762587843225, "grad_norm": 0.3576434552669525, "learning_rate": 2.205747126436782e-05, "loss": 0.0906, "step": 4862 }, { "epoch": 5.637035427081069, "grad_norm": 0.37971681356430054, "learning_rate": 2.2051724137931035e-05, "loss": 0.0954, "step": 4863 }, { "epoch": 5.638194595377817, "grad_norm": 0.4097626805305481, "learning_rate": 2.2045977011494253e-05, "loss": 0.0987, "step": 4864 }, { "epoch": 5.639353763674563, "grad_norm": 0.34203460812568665, "learning_rate": 2.204022988505747e-05, "loss": 0.0954, "step": 4865 }, { "epoch": 5.64051293197131, "grad_norm": 0.4253942370414734, "learning_rate": 2.2034482758620693e-05, "loss": 0.0997, "step": 4866 }, { "epoch": 5.641672100268058, "grad_norm": 0.5293704271316528, "learning_rate": 2.2028735632183908e-05, "loss": 0.0933, "step": 4867 }, { "epoch": 5.642831268564805, "grad_norm": 0.36057615280151367, "learning_rate": 2.2022988505747126e-05, "loss": 0.0883, "step": 4868 }, { "epoch": 5.643990436861552, "grad_norm": 0.44225749373435974, "learning_rate": 2.2017241379310348e-05, "loss": 0.0927, "step": 4869 }, { "epoch": 5.6451496051582986, "grad_norm": 0.34227925539016724, "learning_rate": 2.2011494252873563e-05, "loss": 0.0883, "step": 4870 }, { "epoch": 5.646308773455046, "grad_norm": 0.44135892391204834, "learning_rate": 2.200574712643678e-05, "loss": 0.094, "step": 4871 }, { "epoch": 5.647467941751793, "grad_norm": 0.5417119860649109, "learning_rate": 2.2000000000000003e-05, "loss": 0.0943, "step": 4872 }, { "epoch": 5.64862711004854, "grad_norm": 0.47763824462890625, "learning_rate": 2.199425287356322e-05, "loss": 0.1042, "step": 4873 }, { "epoch": 5.649786278345287, "grad_norm": 0.4434187710285187, "learning_rate": 2.1988505747126436e-05, "loss": 0.098, "step": 4874 }, { "epoch": 5.650945446642035, "grad_norm": 0.466353178024292, "learning_rate": 2.1982758620689654e-05, "loss": 0.109, "step": 4875 }, { "epoch": 5.652104614938781, "grad_norm": 0.5113919377326965, "learning_rate": 2.1977011494252876e-05, "loss": 0.0935, "step": 4876 }, { "epoch": 5.653263783235529, "grad_norm": 0.43153828382492065, "learning_rate": 2.1971264367816094e-05, "loss": 0.1003, "step": 4877 }, { "epoch": 5.6544229515322755, "grad_norm": 0.40823087096214294, "learning_rate": 2.196551724137931e-05, "loss": 0.0954, "step": 4878 }, { "epoch": 5.655582119829023, "grad_norm": 0.4650181531906128, "learning_rate": 2.195977011494253e-05, "loss": 0.0951, "step": 4879 }, { "epoch": 5.65674128812577, "grad_norm": 0.41202208399772644, "learning_rate": 2.195402298850575e-05, "loss": 0.1001, "step": 4880 }, { "epoch": 5.657900456422517, "grad_norm": 0.5160025954246521, "learning_rate": 2.1948275862068967e-05, "loss": 0.0914, "step": 4881 }, { "epoch": 5.659059624719264, "grad_norm": 0.5344350337982178, "learning_rate": 2.1942528735632186e-05, "loss": 0.0995, "step": 4882 }, { "epoch": 5.660218793016011, "grad_norm": 1.5580607652664185, "learning_rate": 2.1936781609195404e-05, "loss": 0.1032, "step": 4883 }, { "epoch": 5.661377961312758, "grad_norm": 0.43727943301200867, "learning_rate": 2.1931034482758622e-05, "loss": 0.1002, "step": 4884 }, { "epoch": 5.662537129609505, "grad_norm": 0.35894694924354553, "learning_rate": 2.192528735632184e-05, "loss": 0.1022, "step": 4885 }, { "epoch": 5.663696297906252, "grad_norm": 0.46932268142700195, "learning_rate": 2.191954022988506e-05, "loss": 0.0913, "step": 4886 }, { "epoch": 5.664855466202999, "grad_norm": 0.5814934372901917, "learning_rate": 2.1913793103448277e-05, "loss": 0.1021, "step": 4887 }, { "epoch": 5.666014634499747, "grad_norm": 0.41701653599739075, "learning_rate": 2.1908045977011495e-05, "loss": 0.0984, "step": 4888 }, { "epoch": 5.667173802796493, "grad_norm": 0.564414381980896, "learning_rate": 2.1902298850574714e-05, "loss": 0.1031, "step": 4889 }, { "epoch": 5.668332971093241, "grad_norm": 0.40304380655288696, "learning_rate": 2.1896551724137932e-05, "loss": 0.0933, "step": 4890 }, { "epoch": 5.6694921393899875, "grad_norm": 0.40032997727394104, "learning_rate": 2.189080459770115e-05, "loss": 0.0899, "step": 4891 }, { "epoch": 5.670651307686735, "grad_norm": 0.48019522428512573, "learning_rate": 2.188505747126437e-05, "loss": 0.0989, "step": 4892 }, { "epoch": 5.671810475983482, "grad_norm": 0.5299116969108582, "learning_rate": 2.1879310344827587e-05, "loss": 0.1001, "step": 4893 }, { "epoch": 5.672969644280229, "grad_norm": 0.3729042112827301, "learning_rate": 2.1873563218390805e-05, "loss": 0.0886, "step": 4894 }, { "epoch": 5.674128812576976, "grad_norm": 0.4367959201335907, "learning_rate": 2.1867816091954023e-05, "loss": 0.0877, "step": 4895 }, { "epoch": 5.675287980873723, "grad_norm": 0.36830776929855347, "learning_rate": 2.1862068965517242e-05, "loss": 0.0926, "step": 4896 }, { "epoch": 5.67644714917047, "grad_norm": 0.37894847989082336, "learning_rate": 2.185632183908046e-05, "loss": 0.0922, "step": 4897 }, { "epoch": 5.677606317467217, "grad_norm": 0.4589332640171051, "learning_rate": 2.185057471264368e-05, "loss": 0.0984, "step": 4898 }, { "epoch": 5.6787654857639644, "grad_norm": 0.3483442962169647, "learning_rate": 2.1844827586206897e-05, "loss": 0.0949, "step": 4899 }, { "epoch": 5.679924654060711, "grad_norm": 0.4337550103664398, "learning_rate": 2.183908045977012e-05, "loss": 0.1002, "step": 4900 }, { "epoch": 5.681083822357459, "grad_norm": 0.42401912808418274, "learning_rate": 2.1833333333333333e-05, "loss": 0.087, "step": 4901 }, { "epoch": 5.682242990654205, "grad_norm": 0.40039029717445374, "learning_rate": 2.182758620689655e-05, "loss": 0.1024, "step": 4902 }, { "epoch": 5.683402158950953, "grad_norm": 0.3798970878124237, "learning_rate": 2.1821839080459773e-05, "loss": 0.0913, "step": 4903 }, { "epoch": 5.6845613272477, "grad_norm": 0.41023609042167664, "learning_rate": 2.181609195402299e-05, "loss": 0.0864, "step": 4904 }, { "epoch": 5.685720495544447, "grad_norm": 0.5003272891044617, "learning_rate": 2.1810344827586206e-05, "loss": 0.096, "step": 4905 }, { "epoch": 5.686879663841194, "grad_norm": 0.673970103263855, "learning_rate": 2.1804597701149425e-05, "loss": 0.0967, "step": 4906 }, { "epoch": 5.688038832137941, "grad_norm": 0.4125644564628601, "learning_rate": 2.1798850574712646e-05, "loss": 0.097, "step": 4907 }, { "epoch": 5.689198000434688, "grad_norm": 0.4496941566467285, "learning_rate": 2.1793103448275865e-05, "loss": 0.0917, "step": 4908 }, { "epoch": 5.690357168731435, "grad_norm": 0.37504181265830994, "learning_rate": 2.178735632183908e-05, "loss": 0.0965, "step": 4909 }, { "epoch": 5.691516337028182, "grad_norm": 0.4096089005470276, "learning_rate": 2.17816091954023e-05, "loss": 0.0958, "step": 4910 }, { "epoch": 5.69267550532493, "grad_norm": 0.43474438786506653, "learning_rate": 2.177586206896552e-05, "loss": 0.0888, "step": 4911 }, { "epoch": 5.6938346736216765, "grad_norm": 0.39973193407058716, "learning_rate": 2.1770114942528734e-05, "loss": 0.0946, "step": 4912 }, { "epoch": 5.694993841918423, "grad_norm": 0.5031002163887024, "learning_rate": 2.1764367816091956e-05, "loss": 0.0978, "step": 4913 }, { "epoch": 5.696153010215171, "grad_norm": 0.5082476139068604, "learning_rate": 2.1758620689655174e-05, "loss": 0.0917, "step": 4914 }, { "epoch": 5.697312178511917, "grad_norm": 0.48969969153404236, "learning_rate": 2.1752873563218393e-05, "loss": 0.1036, "step": 4915 }, { "epoch": 5.698471346808665, "grad_norm": 0.41776567697525024, "learning_rate": 2.1747126436781608e-05, "loss": 0.0936, "step": 4916 }, { "epoch": 5.699630515105412, "grad_norm": 0.43276071548461914, "learning_rate": 2.174137931034483e-05, "loss": 0.0969, "step": 4917 }, { "epoch": 5.700789683402159, "grad_norm": 0.44773709774017334, "learning_rate": 2.1735632183908048e-05, "loss": 0.0923, "step": 4918 }, { "epoch": 5.701948851698906, "grad_norm": 0.3338507115840912, "learning_rate": 2.1729885057471266e-05, "loss": 0.0912, "step": 4919 }, { "epoch": 5.703108019995653, "grad_norm": 0.6209862232208252, "learning_rate": 2.1724137931034484e-05, "loss": 0.096, "step": 4920 }, { "epoch": 5.7042671882924, "grad_norm": 0.482950896024704, "learning_rate": 2.1718390804597702e-05, "loss": 0.0981, "step": 4921 }, { "epoch": 5.705426356589148, "grad_norm": 0.4542859196662903, "learning_rate": 2.171264367816092e-05, "loss": 0.0949, "step": 4922 }, { "epoch": 5.706585524885894, "grad_norm": 0.4882400333881378, "learning_rate": 2.170689655172414e-05, "loss": 0.0961, "step": 4923 }, { "epoch": 5.707744693182642, "grad_norm": 0.4771440923213959, "learning_rate": 2.1701149425287357e-05, "loss": 0.1034, "step": 4924 }, { "epoch": 5.7089038614793886, "grad_norm": 0.5598927140235901, "learning_rate": 2.1695402298850576e-05, "loss": 0.0996, "step": 4925 }, { "epoch": 5.710063029776135, "grad_norm": 0.45021867752075195, "learning_rate": 2.1689655172413794e-05, "loss": 0.1006, "step": 4926 }, { "epoch": 5.711222198072883, "grad_norm": 0.38733410835266113, "learning_rate": 2.1683908045977016e-05, "loss": 0.0896, "step": 4927 }, { "epoch": 5.7123813663696295, "grad_norm": 0.3161242604255676, "learning_rate": 2.167816091954023e-05, "loss": 0.0883, "step": 4928 }, { "epoch": 5.713540534666377, "grad_norm": 0.43492892384529114, "learning_rate": 2.167241379310345e-05, "loss": 0.0994, "step": 4929 }, { "epoch": 5.714699702963124, "grad_norm": 0.34343111515045166, "learning_rate": 2.1666666666666667e-05, "loss": 0.0947, "step": 4930 }, { "epoch": 5.715858871259871, "grad_norm": 1.0595885515213013, "learning_rate": 2.1660919540229885e-05, "loss": 0.1021, "step": 4931 }, { "epoch": 5.717018039556618, "grad_norm": 0.4083106815814972, "learning_rate": 2.1655172413793104e-05, "loss": 0.0991, "step": 4932 }, { "epoch": 5.7181772078533655, "grad_norm": 0.3929539918899536, "learning_rate": 2.1649425287356322e-05, "loss": 0.0956, "step": 4933 }, { "epoch": 5.719336376150112, "grad_norm": 0.3941984176635742, "learning_rate": 2.1643678160919544e-05, "loss": 0.1015, "step": 4934 }, { "epoch": 5.72049554444686, "grad_norm": 0.5016578435897827, "learning_rate": 2.163793103448276e-05, "loss": 0.1034, "step": 4935 }, { "epoch": 5.721654712743606, "grad_norm": 0.41408535838127136, "learning_rate": 2.1632183908045977e-05, "loss": 0.0969, "step": 4936 }, { "epoch": 5.722813881040354, "grad_norm": 0.3475428521633148, "learning_rate": 2.1626436781609195e-05, "loss": 0.0932, "step": 4937 }, { "epoch": 5.723973049337101, "grad_norm": 0.315900057554245, "learning_rate": 2.1620689655172417e-05, "loss": 0.0907, "step": 4938 }, { "epoch": 5.725132217633847, "grad_norm": 0.5354477763175964, "learning_rate": 2.161494252873563e-05, "loss": 0.0927, "step": 4939 }, { "epoch": 5.726291385930595, "grad_norm": 0.40579307079315186, "learning_rate": 2.160919540229885e-05, "loss": 0.1103, "step": 4940 }, { "epoch": 5.7274505542273415, "grad_norm": 0.3445356786251068, "learning_rate": 2.160344827586207e-05, "loss": 0.1003, "step": 4941 }, { "epoch": 5.728609722524089, "grad_norm": 0.5686594247817993, "learning_rate": 2.159770114942529e-05, "loss": 0.1108, "step": 4942 }, { "epoch": 5.729768890820836, "grad_norm": 0.47493574023246765, "learning_rate": 2.1591954022988505e-05, "loss": 0.1049, "step": 4943 }, { "epoch": 5.730928059117583, "grad_norm": 0.3111097514629364, "learning_rate": 2.1586206896551726e-05, "loss": 0.0867, "step": 4944 }, { "epoch": 5.73208722741433, "grad_norm": 0.4092866778373718, "learning_rate": 2.1580459770114945e-05, "loss": 0.0945, "step": 4945 }, { "epoch": 5.7332463957110775, "grad_norm": 0.4263165593147278, "learning_rate": 2.1574712643678163e-05, "loss": 0.0818, "step": 4946 }, { "epoch": 5.734405564007824, "grad_norm": 0.3667573928833008, "learning_rate": 2.1568965517241378e-05, "loss": 0.0882, "step": 4947 }, { "epoch": 5.735564732304572, "grad_norm": 0.505990207195282, "learning_rate": 2.15632183908046e-05, "loss": 0.0843, "step": 4948 }, { "epoch": 5.736723900601318, "grad_norm": 0.4376978278160095, "learning_rate": 2.1557471264367818e-05, "loss": 0.0911, "step": 4949 }, { "epoch": 5.737883068898066, "grad_norm": 0.4979074001312256, "learning_rate": 2.1551724137931033e-05, "loss": 0.1033, "step": 4950 }, { "epoch": 5.739042237194813, "grad_norm": 0.48788338899612427, "learning_rate": 2.1545977011494255e-05, "loss": 0.1081, "step": 4951 }, { "epoch": 5.740201405491559, "grad_norm": 0.6209906935691833, "learning_rate": 2.1540229885057473e-05, "loss": 0.1015, "step": 4952 }, { "epoch": 5.741360573788307, "grad_norm": 0.40411826968193054, "learning_rate": 2.153448275862069e-05, "loss": 0.0942, "step": 4953 }, { "epoch": 5.7425197420850544, "grad_norm": 0.4680006206035614, "learning_rate": 2.152873563218391e-05, "loss": 0.0898, "step": 4954 }, { "epoch": 5.743678910381801, "grad_norm": 0.4377489984035492, "learning_rate": 2.1522988505747128e-05, "loss": 0.0927, "step": 4955 }, { "epoch": 5.744838078678548, "grad_norm": 0.6448618173599243, "learning_rate": 2.1517241379310346e-05, "loss": 0.0984, "step": 4956 }, { "epoch": 5.745997246975295, "grad_norm": 0.44574832916259766, "learning_rate": 2.1511494252873564e-05, "loss": 0.1009, "step": 4957 }, { "epoch": 5.747156415272042, "grad_norm": 0.5613508224487305, "learning_rate": 2.1505747126436783e-05, "loss": 0.0987, "step": 4958 }, { "epoch": 5.74831558356879, "grad_norm": 0.5421482920646667, "learning_rate": 2.15e-05, "loss": 0.0975, "step": 4959 }, { "epoch": 5.749474751865536, "grad_norm": 0.4239422082901001, "learning_rate": 2.149425287356322e-05, "loss": 0.0974, "step": 4960 }, { "epoch": 5.750633920162284, "grad_norm": 0.7376547455787659, "learning_rate": 2.1488505747126437e-05, "loss": 0.0881, "step": 4961 }, { "epoch": 5.7517930884590305, "grad_norm": 0.449830561876297, "learning_rate": 2.1482758620689656e-05, "loss": 0.098, "step": 4962 }, { "epoch": 5.752952256755778, "grad_norm": 0.6881018280982971, "learning_rate": 2.1477011494252874e-05, "loss": 0.1013, "step": 4963 }, { "epoch": 5.754111425052525, "grad_norm": 0.4860606789588928, "learning_rate": 2.1471264367816092e-05, "loss": 0.0979, "step": 4964 }, { "epoch": 5.755270593349272, "grad_norm": 0.3773813247680664, "learning_rate": 2.1465517241379314e-05, "loss": 0.0878, "step": 4965 }, { "epoch": 5.756429761646019, "grad_norm": 0.3880038857460022, "learning_rate": 2.145977011494253e-05, "loss": 0.0956, "step": 4966 }, { "epoch": 5.7575889299427665, "grad_norm": 0.6223691701889038, "learning_rate": 2.1454022988505747e-05, "loss": 0.1005, "step": 4967 }, { "epoch": 5.758748098239513, "grad_norm": 0.3865233361721039, "learning_rate": 2.144827586206897e-05, "loss": 0.1, "step": 4968 }, { "epoch": 5.75990726653626, "grad_norm": 0.426666259765625, "learning_rate": 2.1442528735632184e-05, "loss": 0.0984, "step": 4969 }, { "epoch": 5.761066434833007, "grad_norm": 0.3938763439655304, "learning_rate": 2.1436781609195402e-05, "loss": 0.096, "step": 4970 }, { "epoch": 5.762225603129754, "grad_norm": 0.4771715998649597, "learning_rate": 2.143103448275862e-05, "loss": 0.0964, "step": 4971 }, { "epoch": 5.763384771426502, "grad_norm": 0.3489798307418823, "learning_rate": 2.1425287356321842e-05, "loss": 0.0877, "step": 4972 }, { "epoch": 5.764543939723248, "grad_norm": 0.36042097210884094, "learning_rate": 2.1419540229885057e-05, "loss": 0.0889, "step": 4973 }, { "epoch": 5.765703108019996, "grad_norm": 0.41603612899780273, "learning_rate": 2.1413793103448275e-05, "loss": 0.095, "step": 4974 }, { "epoch": 5.7668622763167425, "grad_norm": 0.4094390273094177, "learning_rate": 2.1408045977011497e-05, "loss": 0.0955, "step": 4975 }, { "epoch": 5.76802144461349, "grad_norm": 0.4778774678707123, "learning_rate": 2.1402298850574715e-05, "loss": 0.1005, "step": 4976 }, { "epoch": 5.769180612910237, "grad_norm": 0.4297574758529663, "learning_rate": 2.139655172413793e-05, "loss": 0.0954, "step": 4977 }, { "epoch": 5.770339781206984, "grad_norm": 0.3127455711364746, "learning_rate": 2.1390804597701152e-05, "loss": 0.0896, "step": 4978 }, { "epoch": 5.771498949503731, "grad_norm": 0.37766167521476746, "learning_rate": 2.138505747126437e-05, "loss": 0.0906, "step": 4979 }, { "epoch": 5.7726581178004785, "grad_norm": 0.49125292897224426, "learning_rate": 2.137931034482759e-05, "loss": 0.0963, "step": 4980 }, { "epoch": 5.773817286097225, "grad_norm": 0.4246441423892975, "learning_rate": 2.1373563218390803e-05, "loss": 0.0919, "step": 4981 }, { "epoch": 5.774976454393972, "grad_norm": 0.366273432970047, "learning_rate": 2.1367816091954025e-05, "loss": 0.0896, "step": 4982 }, { "epoch": 5.7761356226907195, "grad_norm": 0.4927791655063629, "learning_rate": 2.1362068965517243e-05, "loss": 0.1007, "step": 4983 }, { "epoch": 5.777294790987466, "grad_norm": 0.29492560029029846, "learning_rate": 2.135632183908046e-05, "loss": 0.0959, "step": 4984 }, { "epoch": 5.778453959284214, "grad_norm": 0.5132672786712646, "learning_rate": 2.135057471264368e-05, "loss": 0.0917, "step": 4985 }, { "epoch": 5.77961312758096, "grad_norm": 0.3356097340583801, "learning_rate": 2.1344827586206898e-05, "loss": 0.083, "step": 4986 }, { "epoch": 5.780772295877708, "grad_norm": 0.43617239594459534, "learning_rate": 2.1339080459770116e-05, "loss": 0.1005, "step": 4987 }, { "epoch": 5.781931464174455, "grad_norm": 0.4654097259044647, "learning_rate": 2.1333333333333335e-05, "loss": 0.0922, "step": 4988 }, { "epoch": 5.783090632471202, "grad_norm": 0.42662137746810913, "learning_rate": 2.1327586206896553e-05, "loss": 0.099, "step": 4989 }, { "epoch": 5.784249800767949, "grad_norm": 0.4461085796356201, "learning_rate": 2.132183908045977e-05, "loss": 0.0907, "step": 4990 }, { "epoch": 5.785408969064696, "grad_norm": 0.6144752502441406, "learning_rate": 2.131609195402299e-05, "loss": 0.0959, "step": 4991 }, { "epoch": 5.786568137361443, "grad_norm": 0.4535113275051117, "learning_rate": 2.1310344827586208e-05, "loss": 0.0941, "step": 4992 }, { "epoch": 5.787727305658191, "grad_norm": 0.556451141834259, "learning_rate": 2.1304597701149426e-05, "loss": 0.1009, "step": 4993 }, { "epoch": 5.788886473954937, "grad_norm": 0.40173500776290894, "learning_rate": 2.1298850574712644e-05, "loss": 0.0952, "step": 4994 }, { "epoch": 5.790045642251684, "grad_norm": 0.485877126455307, "learning_rate": 2.1293103448275863e-05, "loss": 0.0937, "step": 4995 }, { "epoch": 5.7912048105484315, "grad_norm": 0.425545334815979, "learning_rate": 2.128735632183908e-05, "loss": 0.0954, "step": 4996 }, { "epoch": 5.792363978845179, "grad_norm": 0.463670939207077, "learning_rate": 2.12816091954023e-05, "loss": 0.09, "step": 4997 }, { "epoch": 5.793523147141926, "grad_norm": 0.3836846947669983, "learning_rate": 2.1275862068965518e-05, "loss": 0.0934, "step": 4998 }, { "epoch": 5.794682315438672, "grad_norm": 0.7452934980392456, "learning_rate": 2.127011494252874e-05, "loss": 0.094, "step": 4999 }, { "epoch": 5.79584148373542, "grad_norm": 0.364729106426239, "learning_rate": 2.1264367816091954e-05, "loss": 0.092, "step": 5000 }, { "epoch": 5.797000652032167, "grad_norm": 0.560750424861908, "learning_rate": 2.1258620689655172e-05, "loss": 0.0987, "step": 5001 }, { "epoch": 5.798159820328914, "grad_norm": 0.3720407485961914, "learning_rate": 2.125287356321839e-05, "loss": 0.0894, "step": 5002 }, { "epoch": 5.799318988625661, "grad_norm": 0.3824548125267029, "learning_rate": 2.1247126436781612e-05, "loss": 0.0929, "step": 5003 }, { "epoch": 5.800478156922408, "grad_norm": 0.3994615972042084, "learning_rate": 2.1241379310344827e-05, "loss": 0.1029, "step": 5004 }, { "epoch": 5.801637325219155, "grad_norm": 0.4771389067173004, "learning_rate": 2.1235632183908046e-05, "loss": 0.0842, "step": 5005 }, { "epoch": 5.802796493515903, "grad_norm": 0.4739455580711365, "learning_rate": 2.1229885057471267e-05, "loss": 0.1055, "step": 5006 }, { "epoch": 5.803955661812649, "grad_norm": 0.5355760455131531, "learning_rate": 2.1224137931034486e-05, "loss": 0.0892, "step": 5007 }, { "epoch": 5.805114830109397, "grad_norm": 0.5140407681465149, "learning_rate": 2.12183908045977e-05, "loss": 0.0986, "step": 5008 }, { "epoch": 5.8062739984061436, "grad_norm": 0.3573043644428253, "learning_rate": 2.1212643678160922e-05, "loss": 0.0904, "step": 5009 }, { "epoch": 5.807433166702891, "grad_norm": 0.3605383634567261, "learning_rate": 2.120689655172414e-05, "loss": 0.0984, "step": 5010 }, { "epoch": 5.808592334999638, "grad_norm": 0.3857939541339874, "learning_rate": 2.1201149425287355e-05, "loss": 0.0911, "step": 5011 }, { "epoch": 5.8097515032963845, "grad_norm": 0.3786557614803314, "learning_rate": 2.1195402298850574e-05, "loss": 0.0969, "step": 5012 }, { "epoch": 5.810910671593132, "grad_norm": 0.4758943021297455, "learning_rate": 2.1189655172413795e-05, "loss": 0.1068, "step": 5013 }, { "epoch": 5.812069839889879, "grad_norm": 0.5058685541152954, "learning_rate": 2.1183908045977014e-05, "loss": 0.0927, "step": 5014 }, { "epoch": 5.813229008186626, "grad_norm": 0.31769004464149475, "learning_rate": 2.117816091954023e-05, "loss": 0.0904, "step": 5015 }, { "epoch": 5.814388176483373, "grad_norm": 0.354096919298172, "learning_rate": 2.117241379310345e-05, "loss": 0.1018, "step": 5016 }, { "epoch": 5.8155473447801205, "grad_norm": 0.41636335849761963, "learning_rate": 2.116666666666667e-05, "loss": 0.0969, "step": 5017 }, { "epoch": 5.816706513076867, "grad_norm": 0.46119603514671326, "learning_rate": 2.1160919540229887e-05, "loss": 0.1032, "step": 5018 }, { "epoch": 5.817865681373615, "grad_norm": 0.38647714257240295, "learning_rate": 2.1155172413793105e-05, "loss": 0.0972, "step": 5019 }, { "epoch": 5.819024849670361, "grad_norm": 0.3750416338443756, "learning_rate": 2.1149425287356323e-05, "loss": 0.0897, "step": 5020 }, { "epoch": 5.820184017967109, "grad_norm": 0.4357173442840576, "learning_rate": 2.114367816091954e-05, "loss": 0.0988, "step": 5021 }, { "epoch": 5.821343186263856, "grad_norm": 0.2895772457122803, "learning_rate": 2.113793103448276e-05, "loss": 0.0955, "step": 5022 }, { "epoch": 5.822502354560603, "grad_norm": 0.3872501850128174, "learning_rate": 2.1132183908045978e-05, "loss": 0.103, "step": 5023 }, { "epoch": 5.82366152285735, "grad_norm": 0.40317487716674805, "learning_rate": 2.1126436781609196e-05, "loss": 0.0928, "step": 5024 }, { "epoch": 5.8248206911540965, "grad_norm": 0.49441027641296387, "learning_rate": 2.1120689655172415e-05, "loss": 0.0944, "step": 5025 }, { "epoch": 5.825979859450844, "grad_norm": 0.39836275577545166, "learning_rate": 2.1114942528735633e-05, "loss": 0.0926, "step": 5026 }, { "epoch": 5.827139027747591, "grad_norm": 0.5158929824829102, "learning_rate": 2.110919540229885e-05, "loss": 0.0888, "step": 5027 }, { "epoch": 5.828298196044338, "grad_norm": 0.4455549716949463, "learning_rate": 2.110344827586207e-05, "loss": 0.0907, "step": 5028 }, { "epoch": 5.829457364341085, "grad_norm": 0.4579765498638153, "learning_rate": 2.1097701149425288e-05, "loss": 0.0903, "step": 5029 }, { "epoch": 5.8306165326378325, "grad_norm": 0.6447916030883789, "learning_rate": 2.1091954022988506e-05, "loss": 0.0945, "step": 5030 }, { "epoch": 5.831775700934579, "grad_norm": 0.44061580300331116, "learning_rate": 2.1086206896551724e-05, "loss": 0.1007, "step": 5031 }, { "epoch": 5.832934869231327, "grad_norm": 0.46343931555747986, "learning_rate": 2.1080459770114943e-05, "loss": 0.1078, "step": 5032 }, { "epoch": 5.834094037528073, "grad_norm": 0.4852432310581207, "learning_rate": 2.1074712643678164e-05, "loss": 0.1009, "step": 5033 }, { "epoch": 5.835253205824821, "grad_norm": 0.2604130208492279, "learning_rate": 2.106896551724138e-05, "loss": 0.09, "step": 5034 }, { "epoch": 5.836412374121568, "grad_norm": 0.6115668416023254, "learning_rate": 2.1063218390804598e-05, "loss": 0.1032, "step": 5035 }, { "epoch": 5.837571542418315, "grad_norm": 0.4292698800563812, "learning_rate": 2.1057471264367816e-05, "loss": 0.0984, "step": 5036 }, { "epoch": 5.838730710715062, "grad_norm": 0.42464151978492737, "learning_rate": 2.1051724137931038e-05, "loss": 0.0854, "step": 5037 }, { "epoch": 5.839889879011809, "grad_norm": 0.36907604336738586, "learning_rate": 2.1045977011494253e-05, "loss": 0.0948, "step": 5038 }, { "epoch": 5.841049047308556, "grad_norm": 0.43384525179862976, "learning_rate": 2.104022988505747e-05, "loss": 0.0955, "step": 5039 }, { "epoch": 5.842208215605304, "grad_norm": 0.40040022134780884, "learning_rate": 2.1034482758620692e-05, "loss": 0.0971, "step": 5040 }, { "epoch": 5.84336738390205, "grad_norm": 0.4272783100605011, "learning_rate": 2.102873563218391e-05, "loss": 0.0898, "step": 5041 }, { "epoch": 5.844526552198797, "grad_norm": 0.3518434762954712, "learning_rate": 2.1022988505747126e-05, "loss": 0.0905, "step": 5042 }, { "epoch": 5.845685720495545, "grad_norm": 0.39061328768730164, "learning_rate": 2.1017241379310344e-05, "loss": 0.0924, "step": 5043 }, { "epoch": 5.846844888792291, "grad_norm": 0.4481010437011719, "learning_rate": 2.1011494252873566e-05, "loss": 0.0922, "step": 5044 }, { "epoch": 5.848004057089039, "grad_norm": 0.5339946746826172, "learning_rate": 2.1005747126436784e-05, "loss": 0.1021, "step": 5045 }, { "epoch": 5.8491632253857855, "grad_norm": 0.6023551821708679, "learning_rate": 2.1e-05, "loss": 0.0971, "step": 5046 }, { "epoch": 5.850322393682533, "grad_norm": 0.41510429978370667, "learning_rate": 2.099425287356322e-05, "loss": 0.1031, "step": 5047 }, { "epoch": 5.85148156197928, "grad_norm": 0.4894249439239502, "learning_rate": 2.098850574712644e-05, "loss": 0.0986, "step": 5048 }, { "epoch": 5.852640730276027, "grad_norm": 0.3392792046070099, "learning_rate": 2.0982758620689654e-05, "loss": 0.0904, "step": 5049 }, { "epoch": 5.853799898572774, "grad_norm": 0.43131858110427856, "learning_rate": 2.0977011494252875e-05, "loss": 0.0933, "step": 5050 }, { "epoch": 5.8549590668695215, "grad_norm": 0.3799227476119995, "learning_rate": 2.0971264367816094e-05, "loss": 0.0892, "step": 5051 }, { "epoch": 5.856118235166268, "grad_norm": 0.44021299481391907, "learning_rate": 2.0965517241379312e-05, "loss": 0.0978, "step": 5052 }, { "epoch": 5.857277403463016, "grad_norm": 0.39485037326812744, "learning_rate": 2.0959770114942527e-05, "loss": 0.0918, "step": 5053 }, { "epoch": 5.858436571759762, "grad_norm": 0.367180734872818, "learning_rate": 2.095402298850575e-05, "loss": 0.1015, "step": 5054 }, { "epoch": 5.859595740056509, "grad_norm": 0.6080685257911682, "learning_rate": 2.0948275862068967e-05, "loss": 0.0916, "step": 5055 }, { "epoch": 5.860754908353257, "grad_norm": 0.3705676198005676, "learning_rate": 2.0942528735632185e-05, "loss": 0.0935, "step": 5056 }, { "epoch": 5.861914076650003, "grad_norm": 0.41616174578666687, "learning_rate": 2.0936781609195403e-05, "loss": 0.096, "step": 5057 }, { "epoch": 5.863073244946751, "grad_norm": 0.3993401825428009, "learning_rate": 2.0931034482758622e-05, "loss": 0.0943, "step": 5058 }, { "epoch": 5.8642324132434975, "grad_norm": 0.4943077564239502, "learning_rate": 2.092528735632184e-05, "loss": 0.096, "step": 5059 }, { "epoch": 5.865391581540245, "grad_norm": 0.3290475010871887, "learning_rate": 2.0919540229885058e-05, "loss": 0.0948, "step": 5060 }, { "epoch": 5.866550749836992, "grad_norm": 0.39300331473350525, "learning_rate": 2.0913793103448277e-05, "loss": 0.1004, "step": 5061 }, { "epoch": 5.867709918133739, "grad_norm": 0.4835030138492584, "learning_rate": 2.0908045977011495e-05, "loss": 0.0884, "step": 5062 }, { "epoch": 5.868869086430486, "grad_norm": 0.5800598859786987, "learning_rate": 2.0902298850574713e-05, "loss": 0.0988, "step": 5063 }, { "epoch": 5.8700282547272336, "grad_norm": 0.4258168041706085, "learning_rate": 2.0896551724137935e-05, "loss": 0.1035, "step": 5064 }, { "epoch": 5.87118742302398, "grad_norm": 0.4930982291698456, "learning_rate": 2.089080459770115e-05, "loss": 0.0923, "step": 5065 }, { "epoch": 5.872346591320728, "grad_norm": 0.3994540870189667, "learning_rate": 2.0885057471264368e-05, "loss": 0.0916, "step": 5066 }, { "epoch": 5.8735057596174745, "grad_norm": 0.35855406522750854, "learning_rate": 2.0879310344827586e-05, "loss": 0.0976, "step": 5067 }, { "epoch": 5.874664927914221, "grad_norm": 0.37041381001472473, "learning_rate": 2.0873563218390808e-05, "loss": 0.0863, "step": 5068 }, { "epoch": 5.875824096210969, "grad_norm": 0.44822248816490173, "learning_rate": 2.0867816091954023e-05, "loss": 0.0963, "step": 5069 }, { "epoch": 5.876983264507715, "grad_norm": 0.3357725739479065, "learning_rate": 2.086206896551724e-05, "loss": 0.0877, "step": 5070 }, { "epoch": 5.878142432804463, "grad_norm": 0.37162381410598755, "learning_rate": 2.0856321839080463e-05, "loss": 0.0874, "step": 5071 }, { "epoch": 5.87930160110121, "grad_norm": 0.4773227274417877, "learning_rate": 2.0850574712643678e-05, "loss": 0.0965, "step": 5072 }, { "epoch": 5.880460769397957, "grad_norm": 0.47884953022003174, "learning_rate": 2.0844827586206896e-05, "loss": 0.0946, "step": 5073 }, { "epoch": 5.881619937694704, "grad_norm": 0.3373884856700897, "learning_rate": 2.0839080459770118e-05, "loss": 0.0949, "step": 5074 }, { "epoch": 5.882779105991451, "grad_norm": 0.399009644985199, "learning_rate": 2.0833333333333336e-05, "loss": 0.0953, "step": 5075 }, { "epoch": 5.883938274288198, "grad_norm": 0.44031986594200134, "learning_rate": 2.082758620689655e-05, "loss": 0.1024, "step": 5076 }, { "epoch": 5.885097442584946, "grad_norm": 0.3464363217353821, "learning_rate": 2.082183908045977e-05, "loss": 0.1056, "step": 5077 }, { "epoch": 5.886256610881692, "grad_norm": 0.4489310383796692, "learning_rate": 2.081609195402299e-05, "loss": 0.0995, "step": 5078 }, { "epoch": 5.88741577917844, "grad_norm": 0.43739187717437744, "learning_rate": 2.081034482758621e-05, "loss": 0.0965, "step": 5079 }, { "epoch": 5.8885749474751865, "grad_norm": 0.35651272535324097, "learning_rate": 2.0804597701149424e-05, "loss": 0.0944, "step": 5080 }, { "epoch": 5.889734115771933, "grad_norm": 0.4365938603878021, "learning_rate": 2.0798850574712646e-05, "loss": 0.101, "step": 5081 }, { "epoch": 5.890893284068681, "grad_norm": 0.4614748954772949, "learning_rate": 2.0793103448275864e-05, "loss": 0.0986, "step": 5082 }, { "epoch": 5.892052452365428, "grad_norm": 0.3819468915462494, "learning_rate": 2.0787356321839082e-05, "loss": 0.0882, "step": 5083 }, { "epoch": 5.893211620662175, "grad_norm": 0.5456834435462952, "learning_rate": 2.07816091954023e-05, "loss": 0.0931, "step": 5084 }, { "epoch": 5.894370788958922, "grad_norm": 0.48817238211631775, "learning_rate": 2.077586206896552e-05, "loss": 0.0946, "step": 5085 }, { "epoch": 5.895529957255669, "grad_norm": 0.5912837386131287, "learning_rate": 2.0770114942528737e-05, "loss": 0.0907, "step": 5086 }, { "epoch": 5.896689125552416, "grad_norm": 0.38435614109039307, "learning_rate": 2.0764367816091956e-05, "loss": 0.0919, "step": 5087 }, { "epoch": 5.897848293849163, "grad_norm": 0.3552255928516388, "learning_rate": 2.0758620689655174e-05, "loss": 0.0925, "step": 5088 }, { "epoch": 5.89900746214591, "grad_norm": 0.3824387192726135, "learning_rate": 2.0752873563218392e-05, "loss": 0.1059, "step": 5089 }, { "epoch": 5.900166630442658, "grad_norm": 0.43972963094711304, "learning_rate": 2.074712643678161e-05, "loss": 0.0966, "step": 5090 }, { "epoch": 5.901325798739404, "grad_norm": 0.3910900354385376, "learning_rate": 2.074137931034483e-05, "loss": 0.0889, "step": 5091 }, { "epoch": 5.902484967036152, "grad_norm": 0.45151054859161377, "learning_rate": 2.0735632183908047e-05, "loss": 0.1027, "step": 5092 }, { "epoch": 5.903644135332899, "grad_norm": 0.3880411982536316, "learning_rate": 2.0729885057471265e-05, "loss": 0.0947, "step": 5093 }, { "epoch": 5.904803303629646, "grad_norm": 0.40984082221984863, "learning_rate": 2.0724137931034484e-05, "loss": 0.0929, "step": 5094 }, { "epoch": 5.905962471926393, "grad_norm": 0.32482361793518066, "learning_rate": 2.0718390804597702e-05, "loss": 0.0848, "step": 5095 }, { "epoch": 5.90712164022314, "grad_norm": 0.438496470451355, "learning_rate": 2.071264367816092e-05, "loss": 0.088, "step": 5096 }, { "epoch": 5.908280808519887, "grad_norm": 0.47613009810447693, "learning_rate": 2.070689655172414e-05, "loss": 0.0942, "step": 5097 }, { "epoch": 5.909439976816634, "grad_norm": 0.3807111978530884, "learning_rate": 2.0701149425287357e-05, "loss": 0.096, "step": 5098 }, { "epoch": 5.910599145113381, "grad_norm": 0.4180908799171448, "learning_rate": 2.0695402298850575e-05, "loss": 0.0892, "step": 5099 }, { "epoch": 5.911758313410128, "grad_norm": 0.5334672927856445, "learning_rate": 2.0689655172413793e-05, "loss": 0.0952, "step": 5100 }, { "epoch": 5.9129174817068755, "grad_norm": 0.4528830945491791, "learning_rate": 2.068390804597701e-05, "loss": 0.0967, "step": 5101 }, { "epoch": 5.914076650003622, "grad_norm": 0.4010438323020935, "learning_rate": 2.0678160919540233e-05, "loss": 0.0969, "step": 5102 }, { "epoch": 5.91523581830037, "grad_norm": 0.3919377624988556, "learning_rate": 2.0672413793103448e-05, "loss": 0.0836, "step": 5103 }, { "epoch": 5.916394986597116, "grad_norm": 0.3746892511844635, "learning_rate": 2.0666666666666666e-05, "loss": 0.0957, "step": 5104 }, { "epoch": 5.917554154893864, "grad_norm": 0.5136653780937195, "learning_rate": 2.0660919540229888e-05, "loss": 0.0989, "step": 5105 }, { "epoch": 5.918713323190611, "grad_norm": 0.4311780631542206, "learning_rate": 2.0655172413793106e-05, "loss": 0.0998, "step": 5106 }, { "epoch": 5.919872491487358, "grad_norm": 0.4143439829349518, "learning_rate": 2.064942528735632e-05, "loss": 0.0921, "step": 5107 }, { "epoch": 5.921031659784105, "grad_norm": 0.857010543346405, "learning_rate": 2.064367816091954e-05, "loss": 0.0987, "step": 5108 }, { "epoch": 5.922190828080852, "grad_norm": 0.5835907459259033, "learning_rate": 2.063793103448276e-05, "loss": 0.1125, "step": 5109 }, { "epoch": 5.923349996377599, "grad_norm": 0.3759761154651642, "learning_rate": 2.0632183908045976e-05, "loss": 0.0884, "step": 5110 }, { "epoch": 5.924509164674346, "grad_norm": 0.4615977704524994, "learning_rate": 2.0626436781609194e-05, "loss": 0.0969, "step": 5111 }, { "epoch": 5.925668332971093, "grad_norm": 0.3750910758972168, "learning_rate": 2.0620689655172416e-05, "loss": 0.0927, "step": 5112 }, { "epoch": 5.92682750126784, "grad_norm": 0.42511269450187683, "learning_rate": 2.0614942528735634e-05, "loss": 0.0899, "step": 5113 }, { "epoch": 5.9279866695645875, "grad_norm": 0.5754655003547668, "learning_rate": 2.060919540229885e-05, "loss": 0.0914, "step": 5114 }, { "epoch": 5.929145837861334, "grad_norm": 0.43884530663490295, "learning_rate": 2.060344827586207e-05, "loss": 0.0984, "step": 5115 }, { "epoch": 5.930305006158082, "grad_norm": 0.3772994577884674, "learning_rate": 2.059770114942529e-05, "loss": 0.0964, "step": 5116 }, { "epoch": 5.931464174454828, "grad_norm": 0.46012642979621887, "learning_rate": 2.0591954022988508e-05, "loss": 0.0927, "step": 5117 }, { "epoch": 5.932623342751576, "grad_norm": 0.39248010516166687, "learning_rate": 2.0586206896551722e-05, "loss": 0.0925, "step": 5118 }, { "epoch": 5.933782511048323, "grad_norm": 0.42726606130599976, "learning_rate": 2.0580459770114944e-05, "loss": 0.1009, "step": 5119 }, { "epoch": 5.93494167934507, "grad_norm": 0.4554595947265625, "learning_rate": 2.0574712643678162e-05, "loss": 0.1023, "step": 5120 }, { "epoch": 5.936100847641817, "grad_norm": 0.34201934933662415, "learning_rate": 2.056896551724138e-05, "loss": 0.095, "step": 5121 }, { "epoch": 5.9372600159385645, "grad_norm": 0.4734445810317993, "learning_rate": 2.05632183908046e-05, "loss": 0.0948, "step": 5122 }, { "epoch": 5.938419184235311, "grad_norm": 0.4069608449935913, "learning_rate": 2.0557471264367817e-05, "loss": 0.0912, "step": 5123 }, { "epoch": 5.939578352532058, "grad_norm": 0.39140281081199646, "learning_rate": 2.0551724137931036e-05, "loss": 0.09, "step": 5124 }, { "epoch": 5.940737520828805, "grad_norm": 0.547514796257019, "learning_rate": 2.0545977011494254e-05, "loss": 0.0972, "step": 5125 }, { "epoch": 5.941896689125553, "grad_norm": 0.34587743878364563, "learning_rate": 2.0540229885057472e-05, "loss": 0.0912, "step": 5126 }, { "epoch": 5.9430558574223, "grad_norm": 0.5674780607223511, "learning_rate": 2.053448275862069e-05, "loss": 0.1001, "step": 5127 }, { "epoch": 5.944215025719046, "grad_norm": 0.3166492283344269, "learning_rate": 2.052873563218391e-05, "loss": 0.0838, "step": 5128 }, { "epoch": 5.945374194015794, "grad_norm": 0.3704341948032379, "learning_rate": 2.0522988505747127e-05, "loss": 0.0883, "step": 5129 }, { "epoch": 5.9465333623125405, "grad_norm": 0.3320915699005127, "learning_rate": 2.0517241379310345e-05, "loss": 0.0836, "step": 5130 }, { "epoch": 5.947692530609288, "grad_norm": 0.40248364210128784, "learning_rate": 2.0511494252873564e-05, "loss": 0.0946, "step": 5131 }, { "epoch": 5.948851698906035, "grad_norm": 0.523185670375824, "learning_rate": 2.0505747126436782e-05, "loss": 0.0928, "step": 5132 }, { "epoch": 5.950010867202782, "grad_norm": 0.3942788243293762, "learning_rate": 2.05e-05, "loss": 0.0931, "step": 5133 }, { "epoch": 5.951170035499529, "grad_norm": 0.39801275730133057, "learning_rate": 2.049425287356322e-05, "loss": 0.096, "step": 5134 }, { "epoch": 5.9523292037962765, "grad_norm": 0.5294638276100159, "learning_rate": 2.0488505747126437e-05, "loss": 0.0982, "step": 5135 }, { "epoch": 5.953488372093023, "grad_norm": 0.732921838760376, "learning_rate": 2.048275862068966e-05, "loss": 0.1096, "step": 5136 }, { "epoch": 5.95464754038977, "grad_norm": 0.39925023913383484, "learning_rate": 2.0477011494252873e-05, "loss": 0.0964, "step": 5137 }, { "epoch": 5.955806708686517, "grad_norm": 0.47731050848960876, "learning_rate": 2.047126436781609e-05, "loss": 0.0994, "step": 5138 }, { "epoch": 5.956965876983265, "grad_norm": 0.48555245995521545, "learning_rate": 2.0465517241379313e-05, "loss": 0.0893, "step": 5139 }, { "epoch": 5.958125045280012, "grad_norm": 0.33314958214759827, "learning_rate": 2.045977011494253e-05, "loss": 0.0874, "step": 5140 }, { "epoch": 5.959284213576758, "grad_norm": 0.5869859457015991, "learning_rate": 2.0454022988505747e-05, "loss": 0.0995, "step": 5141 }, { "epoch": 5.960443381873506, "grad_norm": 0.3285931348800659, "learning_rate": 2.0448275862068965e-05, "loss": 0.0937, "step": 5142 }, { "epoch": 5.9616025501702525, "grad_norm": 0.3716854751110077, "learning_rate": 2.0442528735632187e-05, "loss": 0.0976, "step": 5143 }, { "epoch": 5.962761718467, "grad_norm": 0.3806571960449219, "learning_rate": 2.0436781609195405e-05, "loss": 0.096, "step": 5144 }, { "epoch": 5.963920886763747, "grad_norm": 0.4217926859855652, "learning_rate": 2.043103448275862e-05, "loss": 0.0966, "step": 5145 }, { "epoch": 5.965080055060494, "grad_norm": 0.3270195424556732, "learning_rate": 2.042528735632184e-05, "loss": 0.0873, "step": 5146 }, { "epoch": 5.966239223357241, "grad_norm": 0.4076954126358032, "learning_rate": 2.041954022988506e-05, "loss": 0.0909, "step": 5147 }, { "epoch": 5.9673983916539886, "grad_norm": 0.491375595331192, "learning_rate": 2.0413793103448278e-05, "loss": 0.092, "step": 5148 }, { "epoch": 5.968557559950735, "grad_norm": 0.42915698885917664, "learning_rate": 2.0408045977011493e-05, "loss": 0.0966, "step": 5149 }, { "epoch": 5.969716728247483, "grad_norm": 0.38473838567733765, "learning_rate": 2.0402298850574715e-05, "loss": 0.0911, "step": 5150 }, { "epoch": 5.9708758965442295, "grad_norm": 0.4680832028388977, "learning_rate": 2.0396551724137933e-05, "loss": 0.0904, "step": 5151 }, { "epoch": 5.972035064840977, "grad_norm": 0.4412045180797577, "learning_rate": 2.0390804597701148e-05, "loss": 0.1048, "step": 5152 }, { "epoch": 5.973194233137724, "grad_norm": 0.4312341809272766, "learning_rate": 2.038505747126437e-05, "loss": 0.096, "step": 5153 }, { "epoch": 5.97435340143447, "grad_norm": 0.5846208333969116, "learning_rate": 2.0379310344827588e-05, "loss": 0.0986, "step": 5154 }, { "epoch": 5.975512569731218, "grad_norm": 0.3806995749473572, "learning_rate": 2.0373563218390806e-05, "loss": 0.091, "step": 5155 }, { "epoch": 5.976671738027965, "grad_norm": 0.3785600960254669, "learning_rate": 2.0367816091954024e-05, "loss": 0.0969, "step": 5156 }, { "epoch": 5.977830906324712, "grad_norm": 0.3626193106174469, "learning_rate": 2.0362068965517243e-05, "loss": 0.09, "step": 5157 }, { "epoch": 5.978990074621459, "grad_norm": 0.33364805579185486, "learning_rate": 2.035632183908046e-05, "loss": 0.0933, "step": 5158 }, { "epoch": 5.980149242918206, "grad_norm": 0.4983973503112793, "learning_rate": 2.035057471264368e-05, "loss": 0.1003, "step": 5159 }, { "epoch": 5.981308411214953, "grad_norm": 0.5828344225883484, "learning_rate": 2.0344827586206897e-05, "loss": 0.0986, "step": 5160 }, { "epoch": 5.982467579511701, "grad_norm": 0.3898671269416809, "learning_rate": 2.0339080459770116e-05, "loss": 0.0839, "step": 5161 }, { "epoch": 5.983626747808447, "grad_norm": 0.3521953523159027, "learning_rate": 2.0333333333333334e-05, "loss": 0.0837, "step": 5162 }, { "epoch": 5.984785916105195, "grad_norm": 0.40425774455070496, "learning_rate": 2.0327586206896552e-05, "loss": 0.095, "step": 5163 }, { "epoch": 5.9859450844019415, "grad_norm": 0.45948293805122375, "learning_rate": 2.032183908045977e-05, "loss": 0.0974, "step": 5164 }, { "epoch": 5.987104252698689, "grad_norm": 0.40446847677230835, "learning_rate": 2.031609195402299e-05, "loss": 0.0932, "step": 5165 }, { "epoch": 5.988263420995436, "grad_norm": 0.49766218662261963, "learning_rate": 2.0310344827586207e-05, "loss": 0.0998, "step": 5166 }, { "epoch": 5.989422589292182, "grad_norm": 0.5645685195922852, "learning_rate": 2.030459770114943e-05, "loss": 0.1052, "step": 5167 }, { "epoch": 5.99058175758893, "grad_norm": 0.28001391887664795, "learning_rate": 2.0298850574712644e-05, "loss": 0.0839, "step": 5168 }, { "epoch": 5.991740925885677, "grad_norm": 0.3594200611114502, "learning_rate": 2.0293103448275862e-05, "loss": 0.0934, "step": 5169 }, { "epoch": 5.992900094182424, "grad_norm": 0.34900590777397156, "learning_rate": 2.0287356321839084e-05, "loss": 0.0856, "step": 5170 }, { "epoch": 5.994059262479171, "grad_norm": 0.4731251001358032, "learning_rate": 2.02816091954023e-05, "loss": 0.1025, "step": 5171 }, { "epoch": 5.995218430775918, "grad_norm": 0.3666514754295349, "learning_rate": 2.0275862068965517e-05, "loss": 0.0929, "step": 5172 }, { "epoch": 5.996377599072665, "grad_norm": 0.348015695810318, "learning_rate": 2.0270114942528735e-05, "loss": 0.0902, "step": 5173 }, { "epoch": 5.997536767369413, "grad_norm": 0.3184606432914734, "learning_rate": 2.0264367816091957e-05, "loss": 0.0891, "step": 5174 }, { "epoch": 5.998695935666159, "grad_norm": 0.7606669664382935, "learning_rate": 2.0258620689655172e-05, "loss": 0.1031, "step": 5175 }, { "epoch": 5.999855103962907, "grad_norm": 0.6448565721511841, "learning_rate": 2.025287356321839e-05, "loss": 0.1023, "step": 5176 }, { "epoch": 5.999855103962907, "eval_loss": 0.148734450340271, "eval_runtime": 265.936, "eval_samples_per_second": 5.768, "eval_steps_per_second": 5.768, "step": 5176 } ], "logging_steps": 1, "max_steps": 8620, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.3198416576181862e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }