diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,36313 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.999855103962907, + "eval_steps": 500, + "global_step": 5176, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001159168296747084, + "grad_norm": 2.7580204010009766, + "learning_rate": 4.999425287356322e-05, + "loss": 0.9725, + "step": 1 + }, + { + "epoch": 0.002318336593494168, + "grad_norm": 1.794647216796875, + "learning_rate": 4.998850574712644e-05, + "loss": 0.8067, + "step": 2 + }, + { + "epoch": 0.003477504890241252, + "grad_norm": 1.9236769676208496, + "learning_rate": 4.9982758620689654e-05, + "loss": 0.7883, + "step": 3 + }, + { + "epoch": 0.004636673186988336, + "grad_norm": 1.9243451356887817, + "learning_rate": 4.9977011494252876e-05, + "loss": 0.694, + "step": 4 + }, + { + "epoch": 0.00579584148373542, + "grad_norm": 2.325658082962036, + "learning_rate": 4.99712643678161e-05, + "loss": 0.658, + "step": 5 + }, + { + "epoch": 0.006955009780482504, + "grad_norm": 1.9371262788772583, + "learning_rate": 4.996551724137931e-05, + "loss": 0.6053, + "step": 6 + }, + { + "epoch": 0.008114178077229587, + "grad_norm": 2.50772762298584, + "learning_rate": 4.9959770114942534e-05, + "loss": 0.5673, + "step": 7 + }, + { + "epoch": 0.009273346373976673, + "grad_norm": 1.7069337368011475, + "learning_rate": 4.995402298850575e-05, + "loss": 0.5236, + "step": 8 + }, + { + "epoch": 0.010432514670723756, + "grad_norm": 2.124687433242798, + "learning_rate": 4.9948275862068964e-05, + "loss": 0.4998, + "step": 9 + }, + { + "epoch": 0.01159168296747084, + "grad_norm": 0.8905035257339478, + "learning_rate": 4.9942528735632185e-05, + "loss": 0.4632, + "step": 10 + }, + { + "epoch": 0.012750851264217924, + "grad_norm": 4.519129276275635, + "learning_rate": 4.993678160919541e-05, + "loss": 0.3914, + "step": 11 + }, + { + "epoch": 0.013910019560965008, + "grad_norm": 0.8016653060913086, + "learning_rate": 4.993103448275862e-05, + "loss": 0.3829, + "step": 12 + }, + { + "epoch": 0.015069187857712092, + "grad_norm": 0.8521615862846375, + "learning_rate": 4.9925287356321844e-05, + "loss": 0.3719, + "step": 13 + }, + { + "epoch": 0.016228356154459174, + "grad_norm": 0.664720356464386, + "learning_rate": 4.991954022988506e-05, + "loss": 0.3444, + "step": 14 + }, + { + "epoch": 0.017387524451206258, + "grad_norm": 1.236467719078064, + "learning_rate": 4.991379310344828e-05, + "loss": 0.3283, + "step": 15 + }, + { + "epoch": 0.018546692747953345, + "grad_norm": 1.1017507314682007, + "learning_rate": 4.9908045977011495e-05, + "loss": 0.3348, + "step": 16 + }, + { + "epoch": 0.01970586104470043, + "grad_norm": 0.70470130443573, + "learning_rate": 4.990229885057472e-05, + "loss": 0.318, + "step": 17 + }, + { + "epoch": 0.020865029341447513, + "grad_norm": 0.6223506927490234, + "learning_rate": 4.989655172413794e-05, + "loss": 0.3032, + "step": 18 + }, + { + "epoch": 0.022024197638194597, + "grad_norm": 0.7839768528938293, + "learning_rate": 4.989080459770115e-05, + "loss": 0.2854, + "step": 19 + }, + { + "epoch": 0.02318336593494168, + "grad_norm": 1.5129585266113281, + "learning_rate": 4.988505747126437e-05, + "loss": 0.2912, + "step": 20 + }, + { + "epoch": 0.024342534231688764, + "grad_norm": 0.6539391279220581, + "learning_rate": 4.987931034482759e-05, + "loss": 0.27, + "step": 21 + }, + { + "epoch": 0.025501702528435848, + "grad_norm": 0.693023145198822, + "learning_rate": 4.9873563218390805e-05, + "loss": 0.2558, + "step": 22 + }, + { + "epoch": 0.026660870825182932, + "grad_norm": 0.5335514545440674, + "learning_rate": 4.986781609195402e-05, + "loss": 0.2276, + "step": 23 + }, + { + "epoch": 0.027820039121930016, + "grad_norm": 0.6515309810638428, + "learning_rate": 4.986206896551724e-05, + "loss": 0.2594, + "step": 24 + }, + { + "epoch": 0.0289792074186771, + "grad_norm": 0.6215941905975342, + "learning_rate": 4.985632183908046e-05, + "loss": 0.255, + "step": 25 + }, + { + "epoch": 0.030138375715424184, + "grad_norm": 0.6825059056282043, + "learning_rate": 4.9850574712643685e-05, + "loss": 0.2719, + "step": 26 + }, + { + "epoch": 0.03129754401217127, + "grad_norm": 0.5802633166313171, + "learning_rate": 4.98448275862069e-05, + "loss": 0.2256, + "step": 27 + }, + { + "epoch": 0.03245671230891835, + "grad_norm": 0.5147871971130371, + "learning_rate": 4.9839080459770115e-05, + "loss": 0.214, + "step": 28 + }, + { + "epoch": 0.033615880605665435, + "grad_norm": 0.6146939396858215, + "learning_rate": 4.9833333333333336e-05, + "loss": 0.2401, + "step": 29 + }, + { + "epoch": 0.034775048902412516, + "grad_norm": 0.62993323802948, + "learning_rate": 4.982758620689655e-05, + "loss": 0.246, + "step": 30 + }, + { + "epoch": 0.0359342171991596, + "grad_norm": 0.6217732429504395, + "learning_rate": 4.982183908045977e-05, + "loss": 0.2162, + "step": 31 + }, + { + "epoch": 0.03709338549590669, + "grad_norm": 0.5851067304611206, + "learning_rate": 4.9816091954022994e-05, + "loss": 0.2226, + "step": 32 + }, + { + "epoch": 0.03825255379265377, + "grad_norm": 0.6002720594406128, + "learning_rate": 4.981034482758621e-05, + "loss": 0.226, + "step": 33 + }, + { + "epoch": 0.03941172208940086, + "grad_norm": 0.6244901418685913, + "learning_rate": 4.980459770114943e-05, + "loss": 0.2536, + "step": 34 + }, + { + "epoch": 0.04057089038614794, + "grad_norm": 0.6513347625732422, + "learning_rate": 4.9798850574712646e-05, + "loss": 0.2388, + "step": 35 + }, + { + "epoch": 0.041730058682895026, + "grad_norm": 0.45215773582458496, + "learning_rate": 4.979310344827586e-05, + "loss": 0.197, + "step": 36 + }, + { + "epoch": 0.042889226979642106, + "grad_norm": 0.45586729049682617, + "learning_rate": 4.978735632183908e-05, + "loss": 0.1992, + "step": 37 + }, + { + "epoch": 0.04404839527638919, + "grad_norm": 0.49368321895599365, + "learning_rate": 4.9781609195402304e-05, + "loss": 0.2319, + "step": 38 + }, + { + "epoch": 0.045207563573136274, + "grad_norm": 0.4354698956012726, + "learning_rate": 4.977586206896552e-05, + "loss": 0.2072, + "step": 39 + }, + { + "epoch": 0.04636673186988336, + "grad_norm": 0.40046998858451843, + "learning_rate": 4.977011494252874e-05, + "loss": 0.183, + "step": 40 + }, + { + "epoch": 0.04752590016663044, + "grad_norm": 0.41561418771743774, + "learning_rate": 4.9764367816091956e-05, + "loss": 0.196, + "step": 41 + }, + { + "epoch": 0.04868506846337753, + "grad_norm": 0.4249739646911621, + "learning_rate": 4.975862068965517e-05, + "loss": 0.1903, + "step": 42 + }, + { + "epoch": 0.04984423676012461, + "grad_norm": 0.5787274837493896, + "learning_rate": 4.975287356321839e-05, + "loss": 0.2098, + "step": 43 + }, + { + "epoch": 0.051003405056871697, + "grad_norm": 0.4138984978199005, + "learning_rate": 4.974712643678161e-05, + "loss": 0.1942, + "step": 44 + }, + { + "epoch": 0.05216257335361878, + "grad_norm": 0.4079653322696686, + "learning_rate": 4.9741379310344836e-05, + "loss": 0.2141, + "step": 45 + }, + { + "epoch": 0.053321741650365864, + "grad_norm": 0.4127311408519745, + "learning_rate": 4.973563218390805e-05, + "loss": 0.1795, + "step": 46 + }, + { + "epoch": 0.054480909947112945, + "grad_norm": 0.48196831345558167, + "learning_rate": 4.9729885057471265e-05, + "loss": 0.2008, + "step": 47 + }, + { + "epoch": 0.05564007824386003, + "grad_norm": 0.43922609090805054, + "learning_rate": 4.972413793103449e-05, + "loss": 0.1897, + "step": 48 + }, + { + "epoch": 0.05679924654060711, + "grad_norm": 0.455390065908432, + "learning_rate": 4.97183908045977e-05, + "loss": 0.1964, + "step": 49 + }, + { + "epoch": 0.0579584148373542, + "grad_norm": 0.4199633300304413, + "learning_rate": 4.971264367816092e-05, + "loss": 0.1984, + "step": 50 + }, + { + "epoch": 0.05911758313410128, + "grad_norm": 0.4313494861125946, + "learning_rate": 4.970689655172414e-05, + "loss": 0.2044, + "step": 51 + }, + { + "epoch": 0.06027675143084837, + "grad_norm": 0.431208997964859, + "learning_rate": 4.970114942528736e-05, + "loss": 0.2028, + "step": 52 + }, + { + "epoch": 0.06143591972759545, + "grad_norm": 0.4196656048297882, + "learning_rate": 4.969540229885058e-05, + "loss": 0.1892, + "step": 53 + }, + { + "epoch": 0.06259508802434254, + "grad_norm": 0.43570151925086975, + "learning_rate": 4.96896551724138e-05, + "loss": 0.2068, + "step": 54 + }, + { + "epoch": 0.06375425632108962, + "grad_norm": 0.3962918519973755, + "learning_rate": 4.968390804597701e-05, + "loss": 0.1936, + "step": 55 + }, + { + "epoch": 0.0649134246178367, + "grad_norm": 0.6284250617027283, + "learning_rate": 4.9678160919540233e-05, + "loss": 0.1999, + "step": 56 + }, + { + "epoch": 0.06607259291458378, + "grad_norm": 0.45828792452812195, + "learning_rate": 4.967241379310345e-05, + "loss": 0.1757, + "step": 57 + }, + { + "epoch": 0.06723176121133087, + "grad_norm": 0.43321043252944946, + "learning_rate": 4.966666666666667e-05, + "loss": 0.1908, + "step": 58 + }, + { + "epoch": 0.06839092950807796, + "grad_norm": 0.38716545701026917, + "learning_rate": 4.966091954022989e-05, + "loss": 0.1838, + "step": 59 + }, + { + "epoch": 0.06955009780482503, + "grad_norm": 0.5458523035049438, + "learning_rate": 4.9655172413793107e-05, + "loss": 0.1996, + "step": 60 + }, + { + "epoch": 0.07070926610157212, + "grad_norm": 0.5392327904701233, + "learning_rate": 4.964942528735632e-05, + "loss": 0.1895, + "step": 61 + }, + { + "epoch": 0.0718684343983192, + "grad_norm": 0.4704841077327728, + "learning_rate": 4.964367816091954e-05, + "loss": 0.1984, + "step": 62 + }, + { + "epoch": 0.0730276026950663, + "grad_norm": 0.3919082283973694, + "learning_rate": 4.963793103448276e-05, + "loss": 0.1642, + "step": 63 + }, + { + "epoch": 0.07418677099181338, + "grad_norm": 0.46172910928726196, + "learning_rate": 4.963218390804598e-05, + "loss": 0.1665, + "step": 64 + }, + { + "epoch": 0.07534593928856045, + "grad_norm": 0.42953500151634216, + "learning_rate": 4.9626436781609195e-05, + "loss": 0.1807, + "step": 65 + }, + { + "epoch": 0.07650510758530754, + "grad_norm": 0.4844302535057068, + "learning_rate": 4.9620689655172416e-05, + "loss": 0.1978, + "step": 66 + }, + { + "epoch": 0.07766427588205463, + "grad_norm": 0.5020731091499329, + "learning_rate": 4.961494252873564e-05, + "loss": 0.1943, + "step": 67 + }, + { + "epoch": 0.07882344417880172, + "grad_norm": 0.38344666361808777, + "learning_rate": 4.960919540229885e-05, + "loss": 0.1893, + "step": 68 + }, + { + "epoch": 0.07998261247554879, + "grad_norm": 0.46443063020706177, + "learning_rate": 4.960344827586207e-05, + "loss": 0.1896, + "step": 69 + }, + { + "epoch": 0.08114178077229588, + "grad_norm": 0.49002230167388916, + "learning_rate": 4.959770114942529e-05, + "loss": 0.1754, + "step": 70 + }, + { + "epoch": 0.08230094906904296, + "grad_norm": 0.40396377444267273, + "learning_rate": 4.9591954022988504e-05, + "loss": 0.1712, + "step": 71 + }, + { + "epoch": 0.08346011736579005, + "grad_norm": 0.44212767481803894, + "learning_rate": 4.9586206896551726e-05, + "loss": 0.1745, + "step": 72 + }, + { + "epoch": 0.08461928566253712, + "grad_norm": 0.40350252389907837, + "learning_rate": 4.958045977011495e-05, + "loss": 0.1664, + "step": 73 + }, + { + "epoch": 0.08577845395928421, + "grad_norm": 0.45109373331069946, + "learning_rate": 4.957471264367816e-05, + "loss": 0.1794, + "step": 74 + }, + { + "epoch": 0.0869376222560313, + "grad_norm": 0.4555060863494873, + "learning_rate": 4.9568965517241384e-05, + "loss": 0.1976, + "step": 75 + }, + { + "epoch": 0.08809679055277839, + "grad_norm": 0.39049214124679565, + "learning_rate": 4.95632183908046e-05, + "loss": 0.1642, + "step": 76 + }, + { + "epoch": 0.08925595884952546, + "grad_norm": 0.4325527250766754, + "learning_rate": 4.9557471264367814e-05, + "loss": 0.1735, + "step": 77 + }, + { + "epoch": 0.09041512714627255, + "grad_norm": 0.38487985730171204, + "learning_rate": 4.9551724137931036e-05, + "loss": 0.1727, + "step": 78 + }, + { + "epoch": 0.09157429544301963, + "grad_norm": 0.38345035910606384, + "learning_rate": 4.954597701149426e-05, + "loss": 0.153, + "step": 79 + }, + { + "epoch": 0.09273346373976672, + "grad_norm": 0.504005491733551, + "learning_rate": 4.954022988505747e-05, + "loss": 0.1852, + "step": 80 + }, + { + "epoch": 0.0938926320365138, + "grad_norm": 0.40111425518989563, + "learning_rate": 4.9534482758620694e-05, + "loss": 0.1834, + "step": 81 + }, + { + "epoch": 0.09505180033326088, + "grad_norm": 0.3611973226070404, + "learning_rate": 4.952873563218391e-05, + "loss": 0.1714, + "step": 82 + }, + { + "epoch": 0.09621096863000797, + "grad_norm": 0.37155240774154663, + "learning_rate": 4.952298850574713e-05, + "loss": 0.1589, + "step": 83 + }, + { + "epoch": 0.09737013692675506, + "grad_norm": 0.4023563861846924, + "learning_rate": 4.9517241379310346e-05, + "loss": 0.1751, + "step": 84 + }, + { + "epoch": 0.09852930522350213, + "grad_norm": 0.3763880431652069, + "learning_rate": 4.951149425287356e-05, + "loss": 0.1624, + "step": 85 + }, + { + "epoch": 0.09968847352024922, + "grad_norm": 0.38636812567710876, + "learning_rate": 4.950574712643679e-05, + "loss": 0.1609, + "step": 86 + }, + { + "epoch": 0.1008476418169963, + "grad_norm": 0.4320867657661438, + "learning_rate": 4.9500000000000004e-05, + "loss": 0.1945, + "step": 87 + }, + { + "epoch": 0.10200681011374339, + "grad_norm": 0.3686438798904419, + "learning_rate": 4.949425287356322e-05, + "loss": 0.167, + "step": 88 + }, + { + "epoch": 0.10316597841049047, + "grad_norm": 0.4408303201198578, + "learning_rate": 4.948850574712644e-05, + "loss": 0.1628, + "step": 89 + }, + { + "epoch": 0.10432514670723755, + "grad_norm": 0.5211153030395508, + "learning_rate": 4.9482758620689655e-05, + "loss": 0.1902, + "step": 90 + }, + { + "epoch": 0.10548431500398464, + "grad_norm": 0.3960166275501251, + "learning_rate": 4.947701149425288e-05, + "loss": 0.1615, + "step": 91 + }, + { + "epoch": 0.10664348330073173, + "grad_norm": 0.3887445628643036, + "learning_rate": 4.947126436781609e-05, + "loss": 0.1663, + "step": 92 + }, + { + "epoch": 0.10780265159747882, + "grad_norm": 0.4395574927330017, + "learning_rate": 4.9465517241379314e-05, + "loss": 0.1825, + "step": 93 + }, + { + "epoch": 0.10896181989422589, + "grad_norm": 0.4707461893558502, + "learning_rate": 4.9459770114942535e-05, + "loss": 0.1955, + "step": 94 + }, + { + "epoch": 0.11012098819097298, + "grad_norm": 0.4846711754798889, + "learning_rate": 4.945402298850575e-05, + "loss": 0.2082, + "step": 95 + }, + { + "epoch": 0.11128015648772006, + "grad_norm": 0.45903241634368896, + "learning_rate": 4.9448275862068965e-05, + "loss": 0.1856, + "step": 96 + }, + { + "epoch": 0.11243932478446715, + "grad_norm": 0.4076662063598633, + "learning_rate": 4.944252873563219e-05, + "loss": 0.1644, + "step": 97 + }, + { + "epoch": 0.11359849308121422, + "grad_norm": 0.4092468321323395, + "learning_rate": 4.94367816091954e-05, + "loss": 0.1709, + "step": 98 + }, + { + "epoch": 0.11475766137796131, + "grad_norm": 0.3405769169330597, + "learning_rate": 4.943103448275862e-05, + "loss": 0.1428, + "step": 99 + }, + { + "epoch": 0.1159168296747084, + "grad_norm": 0.38396161794662476, + "learning_rate": 4.9425287356321845e-05, + "loss": 0.1582, + "step": 100 + }, + { + "epoch": 0.11707599797145549, + "grad_norm": 0.38220831751823425, + "learning_rate": 4.941954022988506e-05, + "loss": 0.1645, + "step": 101 + }, + { + "epoch": 0.11823516626820256, + "grad_norm": 0.3981216549873352, + "learning_rate": 4.941379310344828e-05, + "loss": 0.1587, + "step": 102 + }, + { + "epoch": 0.11939433456494965, + "grad_norm": 0.3643917739391327, + "learning_rate": 4.9408045977011496e-05, + "loss": 0.1534, + "step": 103 + }, + { + "epoch": 0.12055350286169673, + "grad_norm": 0.42615723609924316, + "learning_rate": 4.940229885057471e-05, + "loss": 0.169, + "step": 104 + }, + { + "epoch": 0.12171267115844382, + "grad_norm": 0.4193412959575653, + "learning_rate": 4.939655172413793e-05, + "loss": 0.1586, + "step": 105 + }, + { + "epoch": 0.1228718394551909, + "grad_norm": 0.4014485478401184, + "learning_rate": 4.9390804597701155e-05, + "loss": 0.1523, + "step": 106 + }, + { + "epoch": 0.12403100775193798, + "grad_norm": 0.41445374488830566, + "learning_rate": 4.938505747126437e-05, + "loss": 0.1555, + "step": 107 + }, + { + "epoch": 0.12519017604868507, + "grad_norm": 0.39025428891181946, + "learning_rate": 4.937931034482759e-05, + "loss": 0.1719, + "step": 108 + }, + { + "epoch": 0.12634934434543216, + "grad_norm": 0.36380714178085327, + "learning_rate": 4.9373563218390806e-05, + "loss": 0.1539, + "step": 109 + }, + { + "epoch": 0.12750851264217924, + "grad_norm": 0.4117802679538727, + "learning_rate": 4.936781609195403e-05, + "loss": 0.162, + "step": 110 + }, + { + "epoch": 0.12866768093892633, + "grad_norm": 0.3868742883205414, + "learning_rate": 4.936206896551724e-05, + "loss": 0.1589, + "step": 111 + }, + { + "epoch": 0.1298268492356734, + "grad_norm": 0.34659501910209656, + "learning_rate": 4.935632183908046e-05, + "loss": 0.1594, + "step": 112 + }, + { + "epoch": 0.13098601753242048, + "grad_norm": 0.38265547156333923, + "learning_rate": 4.935057471264368e-05, + "loss": 0.1459, + "step": 113 + }, + { + "epoch": 0.13214518582916757, + "grad_norm": 0.39254799485206604, + "learning_rate": 4.93448275862069e-05, + "loss": 0.1608, + "step": 114 + }, + { + "epoch": 0.13330435412591465, + "grad_norm": 0.3889453411102295, + "learning_rate": 4.9339080459770116e-05, + "loss": 0.1637, + "step": 115 + }, + { + "epoch": 0.13446352242266174, + "grad_norm": 0.41454294323921204, + "learning_rate": 4.933333333333334e-05, + "loss": 0.1532, + "step": 116 + }, + { + "epoch": 0.13562269071940883, + "grad_norm": 0.4028087854385376, + "learning_rate": 4.932758620689655e-05, + "loss": 0.1569, + "step": 117 + }, + { + "epoch": 0.13678185901615592, + "grad_norm": 0.4449205994606018, + "learning_rate": 4.9321839080459774e-05, + "loss": 0.1745, + "step": 118 + }, + { + "epoch": 0.137941027312903, + "grad_norm": 0.3685709536075592, + "learning_rate": 4.931609195402299e-05, + "loss": 0.1476, + "step": 119 + }, + { + "epoch": 0.13910019560965006, + "grad_norm": 0.44437339901924133, + "learning_rate": 4.931034482758621e-05, + "loss": 0.1607, + "step": 120 + }, + { + "epoch": 0.14025936390639715, + "grad_norm": 0.5566555261611938, + "learning_rate": 4.930459770114943e-05, + "loss": 0.1943, + "step": 121 + }, + { + "epoch": 0.14141853220314424, + "grad_norm": 0.46472108364105225, + "learning_rate": 4.929885057471265e-05, + "loss": 0.1634, + "step": 122 + }, + { + "epoch": 0.14257770049989132, + "grad_norm": 0.38040968775749207, + "learning_rate": 4.929310344827586e-05, + "loss": 0.1411, + "step": 123 + }, + { + "epoch": 0.1437368687966384, + "grad_norm": 0.3785533607006073, + "learning_rate": 4.9287356321839084e-05, + "loss": 0.1515, + "step": 124 + }, + { + "epoch": 0.1448960370933855, + "grad_norm": 0.402432918548584, + "learning_rate": 4.92816091954023e-05, + "loss": 0.1529, + "step": 125 + }, + { + "epoch": 0.1460552053901326, + "grad_norm": 0.3900763690471649, + "learning_rate": 4.9275862068965514e-05, + "loss": 0.1716, + "step": 126 + }, + { + "epoch": 0.14721437368687967, + "grad_norm": 0.39079996943473816, + "learning_rate": 4.927011494252874e-05, + "loss": 0.1563, + "step": 127 + }, + { + "epoch": 0.14837354198362676, + "grad_norm": 0.38309377431869507, + "learning_rate": 4.926436781609196e-05, + "loss": 0.1602, + "step": 128 + }, + { + "epoch": 0.14953271028037382, + "grad_norm": 0.37955620884895325, + "learning_rate": 4.925862068965518e-05, + "loss": 0.1703, + "step": 129 + }, + { + "epoch": 0.1506918785771209, + "grad_norm": 0.37275955080986023, + "learning_rate": 4.9252873563218394e-05, + "loss": 0.1599, + "step": 130 + }, + { + "epoch": 0.151851046873868, + "grad_norm": 0.48944053053855896, + "learning_rate": 4.924712643678161e-05, + "loss": 0.1932, + "step": 131 + }, + { + "epoch": 0.15301021517061508, + "grad_norm": 0.4082753360271454, + "learning_rate": 4.924137931034483e-05, + "loss": 0.1674, + "step": 132 + }, + { + "epoch": 0.15416938346736217, + "grad_norm": 0.5443015098571777, + "learning_rate": 4.9235632183908045e-05, + "loss": 0.1679, + "step": 133 + }, + { + "epoch": 0.15532855176410926, + "grad_norm": 0.3231983184814453, + "learning_rate": 4.922988505747127e-05, + "loss": 0.1397, + "step": 134 + }, + { + "epoch": 0.15648772006085634, + "grad_norm": 0.384142130613327, + "learning_rate": 4.922413793103449e-05, + "loss": 0.1601, + "step": 135 + }, + { + "epoch": 0.15764688835760343, + "grad_norm": 0.4432188868522644, + "learning_rate": 4.9218390804597703e-05, + "loss": 0.1746, + "step": 136 + }, + { + "epoch": 0.1588060566543505, + "grad_norm": 0.3848440647125244, + "learning_rate": 4.9212643678160925e-05, + "loss": 0.1514, + "step": 137 + }, + { + "epoch": 0.15996522495109758, + "grad_norm": 0.40900081396102905, + "learning_rate": 4.920689655172414e-05, + "loss": 0.1728, + "step": 138 + }, + { + "epoch": 0.16112439324784467, + "grad_norm": 0.35513079166412354, + "learning_rate": 4.9201149425287355e-05, + "loss": 0.141, + "step": 139 + }, + { + "epoch": 0.16228356154459175, + "grad_norm": 0.4035043716430664, + "learning_rate": 4.9195402298850577e-05, + "loss": 0.1507, + "step": 140 + }, + { + "epoch": 0.16344272984133884, + "grad_norm": 0.4026646614074707, + "learning_rate": 4.91896551724138e-05, + "loss": 0.1629, + "step": 141 + }, + { + "epoch": 0.16460189813808593, + "grad_norm": 0.35455062985420227, + "learning_rate": 4.918390804597701e-05, + "loss": 0.1671, + "step": 142 + }, + { + "epoch": 0.16576106643483302, + "grad_norm": 0.3873222768306732, + "learning_rate": 4.9178160919540235e-05, + "loss": 0.1606, + "step": 143 + }, + { + "epoch": 0.1669202347315801, + "grad_norm": 0.34480100870132446, + "learning_rate": 4.917241379310345e-05, + "loss": 0.1446, + "step": 144 + }, + { + "epoch": 0.16807940302832716, + "grad_norm": 0.3599451184272766, + "learning_rate": 4.9166666666666665e-05, + "loss": 0.1432, + "step": 145 + }, + { + "epoch": 0.16923857132507425, + "grad_norm": 0.38096269965171814, + "learning_rate": 4.9160919540229886e-05, + "loss": 0.1726, + "step": 146 + }, + { + "epoch": 0.17039773962182134, + "grad_norm": 0.384888619184494, + "learning_rate": 4.915517241379311e-05, + "loss": 0.1579, + "step": 147 + }, + { + "epoch": 0.17155690791856842, + "grad_norm": 0.3715001344680786, + "learning_rate": 4.914942528735633e-05, + "loss": 0.1734, + "step": 148 + }, + { + "epoch": 0.1727160762153155, + "grad_norm": 0.4220834970474243, + "learning_rate": 4.9143678160919545e-05, + "loss": 0.1752, + "step": 149 + }, + { + "epoch": 0.1738752445120626, + "grad_norm": 0.3289235830307007, + "learning_rate": 4.913793103448276e-05, + "loss": 0.1441, + "step": 150 + }, + { + "epoch": 0.1750344128088097, + "grad_norm": 0.3469524085521698, + "learning_rate": 4.913218390804598e-05, + "loss": 0.1416, + "step": 151 + }, + { + "epoch": 0.17619358110555677, + "grad_norm": 0.38370615243911743, + "learning_rate": 4.9126436781609196e-05, + "loss": 0.1604, + "step": 152 + }, + { + "epoch": 0.17735274940230383, + "grad_norm": 0.33676958084106445, + "learning_rate": 4.912068965517241e-05, + "loss": 0.1585, + "step": 153 + }, + { + "epoch": 0.17851191769905092, + "grad_norm": 0.3484872281551361, + "learning_rate": 4.911494252873563e-05, + "loss": 0.1516, + "step": 154 + }, + { + "epoch": 0.179671085995798, + "grad_norm": 0.3425162434577942, + "learning_rate": 4.9109195402298854e-05, + "loss": 0.1468, + "step": 155 + }, + { + "epoch": 0.1808302542925451, + "grad_norm": 0.3834597170352936, + "learning_rate": 4.9103448275862076e-05, + "loss": 0.1902, + "step": 156 + }, + { + "epoch": 0.18198942258929218, + "grad_norm": 0.4861818253993988, + "learning_rate": 4.909770114942529e-05, + "loss": 0.1666, + "step": 157 + }, + { + "epoch": 0.18314859088603927, + "grad_norm": 0.4293661415576935, + "learning_rate": 4.9091954022988506e-05, + "loss": 0.1476, + "step": 158 + }, + { + "epoch": 0.18430775918278636, + "grad_norm": 0.4156044125556946, + "learning_rate": 4.908620689655173e-05, + "loss": 0.1427, + "step": 159 + }, + { + "epoch": 0.18546692747953344, + "grad_norm": 0.3734503388404846, + "learning_rate": 4.908045977011494e-05, + "loss": 0.1461, + "step": 160 + }, + { + "epoch": 0.18662609577628053, + "grad_norm": 0.4116658568382263, + "learning_rate": 4.9074712643678164e-05, + "loss": 0.1538, + "step": 161 + }, + { + "epoch": 0.1877852640730276, + "grad_norm": 0.40862905979156494, + "learning_rate": 4.9068965517241386e-05, + "loss": 0.1567, + "step": 162 + }, + { + "epoch": 0.18894443236977468, + "grad_norm": 0.3499276041984558, + "learning_rate": 4.90632183908046e-05, + "loss": 0.1413, + "step": 163 + }, + { + "epoch": 0.19010360066652177, + "grad_norm": 0.40991586446762085, + "learning_rate": 4.9057471264367816e-05, + "loss": 0.1686, + "step": 164 + }, + { + "epoch": 0.19126276896326885, + "grad_norm": 0.3891771733760834, + "learning_rate": 4.905172413793104e-05, + "loss": 0.157, + "step": 165 + }, + { + "epoch": 0.19242193726001594, + "grad_norm": 0.3548631966114044, + "learning_rate": 4.904597701149425e-05, + "loss": 0.1384, + "step": 166 + }, + { + "epoch": 0.19358110555676303, + "grad_norm": 0.36914005875587463, + "learning_rate": 4.9040229885057474e-05, + "loss": 0.1409, + "step": 167 + }, + { + "epoch": 0.19474027385351012, + "grad_norm": 0.36561641097068787, + "learning_rate": 4.9034482758620695e-05, + "loss": 0.155, + "step": 168 + }, + { + "epoch": 0.1958994421502572, + "grad_norm": 0.38047492504119873, + "learning_rate": 4.902873563218391e-05, + "loss": 0.153, + "step": 169 + }, + { + "epoch": 0.19705861044700426, + "grad_norm": 0.3930132985115051, + "learning_rate": 4.902298850574713e-05, + "loss": 0.168, + "step": 170 + }, + { + "epoch": 0.19821777874375135, + "grad_norm": 0.4097788333892822, + "learning_rate": 4.901724137931035e-05, + "loss": 0.1684, + "step": 171 + }, + { + "epoch": 0.19937694704049844, + "grad_norm": 0.3460334837436676, + "learning_rate": 4.901149425287356e-05, + "loss": 0.1477, + "step": 172 + }, + { + "epoch": 0.20053611533724552, + "grad_norm": 0.31552115082740784, + "learning_rate": 4.9005747126436784e-05, + "loss": 0.1382, + "step": 173 + }, + { + "epoch": 0.2016952836339926, + "grad_norm": 0.36335986852645874, + "learning_rate": 4.9e-05, + "loss": 0.1429, + "step": 174 + }, + { + "epoch": 0.2028544519307397, + "grad_norm": 0.32661738991737366, + "learning_rate": 4.899425287356322e-05, + "loss": 0.1515, + "step": 175 + }, + { + "epoch": 0.20401362022748679, + "grad_norm": 0.386250764131546, + "learning_rate": 4.898850574712644e-05, + "loss": 0.1558, + "step": 176 + }, + { + "epoch": 0.20517278852423387, + "grad_norm": 0.4087134599685669, + "learning_rate": 4.898275862068966e-05, + "loss": 0.1639, + "step": 177 + }, + { + "epoch": 0.20633195682098093, + "grad_norm": 0.39643535017967224, + "learning_rate": 4.897701149425288e-05, + "loss": 0.1487, + "step": 178 + }, + { + "epoch": 0.20749112511772802, + "grad_norm": 0.36634111404418945, + "learning_rate": 4.897126436781609e-05, + "loss": 0.1383, + "step": 179 + }, + { + "epoch": 0.2086502934144751, + "grad_norm": 0.32414987683296204, + "learning_rate": 4.896551724137931e-05, + "loss": 0.1345, + "step": 180 + }, + { + "epoch": 0.2098094617112222, + "grad_norm": 0.3729398548603058, + "learning_rate": 4.895977011494253e-05, + "loss": 0.14, + "step": 181 + }, + { + "epoch": 0.21096863000796928, + "grad_norm": 0.32442769408226013, + "learning_rate": 4.895402298850575e-05, + "loss": 0.1573, + "step": 182 + }, + { + "epoch": 0.21212779830471637, + "grad_norm": 0.33038130402565, + "learning_rate": 4.8948275862068966e-05, + "loss": 0.1539, + "step": 183 + }, + { + "epoch": 0.21328696660146346, + "grad_norm": 0.3526158630847931, + "learning_rate": 4.894252873563219e-05, + "loss": 0.1471, + "step": 184 + }, + { + "epoch": 0.21444613489821054, + "grad_norm": 0.3698391020298004, + "learning_rate": 4.89367816091954e-05, + "loss": 0.155, + "step": 185 + }, + { + "epoch": 0.21560530319495763, + "grad_norm": 0.35763153433799744, + "learning_rate": 4.8931034482758625e-05, + "loss": 0.145, + "step": 186 + }, + { + "epoch": 0.2167644714917047, + "grad_norm": 0.396771639585495, + "learning_rate": 4.892528735632184e-05, + "loss": 0.1616, + "step": 187 + }, + { + "epoch": 0.21792363978845178, + "grad_norm": 0.3490588963031769, + "learning_rate": 4.891954022988506e-05, + "loss": 0.1491, + "step": 188 + }, + { + "epoch": 0.21908280808519887, + "grad_norm": 0.3624797761440277, + "learning_rate": 4.891379310344828e-05, + "loss": 0.1731, + "step": 189 + }, + { + "epoch": 0.22024197638194595, + "grad_norm": 0.45168307423591614, + "learning_rate": 4.89080459770115e-05, + "loss": 0.1598, + "step": 190 + }, + { + "epoch": 0.22140114467869304, + "grad_norm": 0.35063374042510986, + "learning_rate": 4.890229885057471e-05, + "loss": 0.1479, + "step": 191 + }, + { + "epoch": 0.22256031297544013, + "grad_norm": 0.317920982837677, + "learning_rate": 4.8896551724137934e-05, + "loss": 0.1432, + "step": 192 + }, + { + "epoch": 0.22371948127218722, + "grad_norm": 0.31059324741363525, + "learning_rate": 4.889080459770115e-05, + "loss": 0.1419, + "step": 193 + }, + { + "epoch": 0.2248786495689343, + "grad_norm": 0.3705626428127289, + "learning_rate": 4.888505747126437e-05, + "loss": 0.1583, + "step": 194 + }, + { + "epoch": 0.22603781786568136, + "grad_norm": 0.3402438759803772, + "learning_rate": 4.8879310344827586e-05, + "loss": 0.138, + "step": 195 + }, + { + "epoch": 0.22719698616242845, + "grad_norm": 0.3639499247074127, + "learning_rate": 4.887356321839081e-05, + "loss": 0.154, + "step": 196 + }, + { + "epoch": 0.22835615445917554, + "grad_norm": 0.33627521991729736, + "learning_rate": 4.886781609195403e-05, + "loss": 0.1435, + "step": 197 + }, + { + "epoch": 0.22951532275592262, + "grad_norm": 0.35936155915260315, + "learning_rate": 4.8862068965517244e-05, + "loss": 0.1567, + "step": 198 + }, + { + "epoch": 0.2306744910526697, + "grad_norm": 0.4516816735267639, + "learning_rate": 4.885632183908046e-05, + "loss": 0.14, + "step": 199 + }, + { + "epoch": 0.2318336593494168, + "grad_norm": 0.3906489610671997, + "learning_rate": 4.885057471264368e-05, + "loss": 0.1405, + "step": 200 + }, + { + "epoch": 0.23299282764616389, + "grad_norm": 0.34470584988594055, + "learning_rate": 4.8844827586206896e-05, + "loss": 0.1519, + "step": 201 + }, + { + "epoch": 0.23415199594291097, + "grad_norm": 0.3606179654598236, + "learning_rate": 4.883908045977012e-05, + "loss": 0.1514, + "step": 202 + }, + { + "epoch": 0.23531116423965803, + "grad_norm": 0.31473562121391296, + "learning_rate": 4.883333333333334e-05, + "loss": 0.1472, + "step": 203 + }, + { + "epoch": 0.23647033253640512, + "grad_norm": 0.3649226725101471, + "learning_rate": 4.8827586206896554e-05, + "loss": 0.1565, + "step": 204 + }, + { + "epoch": 0.2376295008331522, + "grad_norm": 0.3161579370498657, + "learning_rate": 4.8821839080459776e-05, + "loss": 0.1357, + "step": 205 + }, + { + "epoch": 0.2387886691298993, + "grad_norm": 0.3359610140323639, + "learning_rate": 4.881609195402299e-05, + "loss": 0.1486, + "step": 206 + }, + { + "epoch": 0.23994783742664638, + "grad_norm": 0.38040071725845337, + "learning_rate": 4.8810344827586205e-05, + "loss": 0.1607, + "step": 207 + }, + { + "epoch": 0.24110700572339347, + "grad_norm": 0.37179407477378845, + "learning_rate": 4.880459770114943e-05, + "loss": 0.1472, + "step": 208 + }, + { + "epoch": 0.24226617402014056, + "grad_norm": 0.3661664128303528, + "learning_rate": 4.879885057471265e-05, + "loss": 0.1528, + "step": 209 + }, + { + "epoch": 0.24342534231688764, + "grad_norm": 0.39830392599105835, + "learning_rate": 4.8793103448275864e-05, + "loss": 0.1465, + "step": 210 + }, + { + "epoch": 0.2445845106136347, + "grad_norm": 0.4218502938747406, + "learning_rate": 4.8787356321839085e-05, + "loss": 0.1552, + "step": 211 + }, + { + "epoch": 0.2457436789103818, + "grad_norm": 0.45591163635253906, + "learning_rate": 4.87816091954023e-05, + "loss": 0.1648, + "step": 212 + }, + { + "epoch": 0.24690284720712888, + "grad_norm": 0.33883577585220337, + "learning_rate": 4.877586206896552e-05, + "loss": 0.1471, + "step": 213 + }, + { + "epoch": 0.24806201550387597, + "grad_norm": 0.4139156937599182, + "learning_rate": 4.877011494252874e-05, + "loss": 0.1603, + "step": 214 + }, + { + "epoch": 0.24922118380062305, + "grad_norm": 0.4006992280483246, + "learning_rate": 4.876436781609195e-05, + "loss": 0.1698, + "step": 215 + }, + { + "epoch": 0.25038035209737014, + "grad_norm": 0.3272416889667511, + "learning_rate": 4.875862068965517e-05, + "loss": 0.1271, + "step": 216 + }, + { + "epoch": 0.2515395203941172, + "grad_norm": 0.3487521708011627, + "learning_rate": 4.8752873563218395e-05, + "loss": 0.1528, + "step": 217 + }, + { + "epoch": 0.2526986886908643, + "grad_norm": 0.39113104343414307, + "learning_rate": 4.874712643678161e-05, + "loss": 0.1763, + "step": 218 + }, + { + "epoch": 0.2538578569876114, + "grad_norm": 0.32122161984443665, + "learning_rate": 4.874137931034483e-05, + "loss": 0.1324, + "step": 219 + }, + { + "epoch": 0.2550170252843585, + "grad_norm": 0.32524266839027405, + "learning_rate": 4.8735632183908047e-05, + "loss": 0.1483, + "step": 220 + }, + { + "epoch": 0.25617619358110555, + "grad_norm": 0.3443804085254669, + "learning_rate": 4.872988505747126e-05, + "loss": 0.1656, + "step": 221 + }, + { + "epoch": 0.25733536187785266, + "grad_norm": 0.33968833088874817, + "learning_rate": 4.872413793103448e-05, + "loss": 0.1425, + "step": 222 + }, + { + "epoch": 0.2584945301745997, + "grad_norm": 0.29967644810676575, + "learning_rate": 4.8718390804597705e-05, + "loss": 0.1458, + "step": 223 + }, + { + "epoch": 0.2596536984713468, + "grad_norm": 0.36407482624053955, + "learning_rate": 4.8712643678160926e-05, + "loss": 0.1467, + "step": 224 + }, + { + "epoch": 0.2608128667680939, + "grad_norm": 0.42614683508872986, + "learning_rate": 4.870689655172414e-05, + "loss": 0.1665, + "step": 225 + }, + { + "epoch": 0.26197203506484096, + "grad_norm": 0.3307810425758362, + "learning_rate": 4.8701149425287356e-05, + "loss": 0.1389, + "step": 226 + }, + { + "epoch": 0.2631312033615881, + "grad_norm": 0.3416149616241455, + "learning_rate": 4.869540229885058e-05, + "loss": 0.1616, + "step": 227 + }, + { + "epoch": 0.26429037165833513, + "grad_norm": 0.3303369879722595, + "learning_rate": 4.868965517241379e-05, + "loss": 0.1472, + "step": 228 + }, + { + "epoch": 0.26544953995508225, + "grad_norm": 0.31898778676986694, + "learning_rate": 4.8683908045977015e-05, + "loss": 0.1374, + "step": 229 + }, + { + "epoch": 0.2666087082518293, + "grad_norm": 0.33968180418014526, + "learning_rate": 4.8678160919540236e-05, + "loss": 0.1549, + "step": 230 + }, + { + "epoch": 0.2677678765485764, + "grad_norm": 0.3462338149547577, + "learning_rate": 4.867241379310345e-05, + "loss": 0.1548, + "step": 231 + }, + { + "epoch": 0.2689270448453235, + "grad_norm": 0.27784019708633423, + "learning_rate": 4.866666666666667e-05, + "loss": 0.1323, + "step": 232 + }, + { + "epoch": 0.27008621314207054, + "grad_norm": 0.3605106770992279, + "learning_rate": 4.866091954022989e-05, + "loss": 0.149, + "step": 233 + }, + { + "epoch": 0.27124538143881766, + "grad_norm": 0.39858150482177734, + "learning_rate": 4.86551724137931e-05, + "loss": 0.1625, + "step": 234 + }, + { + "epoch": 0.2724045497355647, + "grad_norm": 0.34399569034576416, + "learning_rate": 4.8649425287356324e-05, + "loss": 0.1536, + "step": 235 + }, + { + "epoch": 0.27356371803231183, + "grad_norm": 0.33953142166137695, + "learning_rate": 4.864367816091954e-05, + "loss": 0.1471, + "step": 236 + }, + { + "epoch": 0.2747228863290589, + "grad_norm": 0.3053218722343445, + "learning_rate": 4.863793103448276e-05, + "loss": 0.1288, + "step": 237 + }, + { + "epoch": 0.275882054625806, + "grad_norm": 0.40773364901542664, + "learning_rate": 4.863218390804598e-05, + "loss": 0.1814, + "step": 238 + }, + { + "epoch": 0.27704122292255307, + "grad_norm": 0.35415759682655334, + "learning_rate": 4.86264367816092e-05, + "loss": 0.1579, + "step": 239 + }, + { + "epoch": 0.2782003912193001, + "grad_norm": 0.39274096488952637, + "learning_rate": 4.862068965517241e-05, + "loss": 0.1678, + "step": 240 + }, + { + "epoch": 0.27935955951604724, + "grad_norm": 0.3034732937812805, + "learning_rate": 4.8614942528735634e-05, + "loss": 0.1498, + "step": 241 + }, + { + "epoch": 0.2805187278127943, + "grad_norm": 0.3109908998012543, + "learning_rate": 4.860919540229885e-05, + "loss": 0.149, + "step": 242 + }, + { + "epoch": 0.2816778961095414, + "grad_norm": 0.31770509481430054, + "learning_rate": 4.860344827586207e-05, + "loss": 0.1479, + "step": 243 + }, + { + "epoch": 0.2828370644062885, + "grad_norm": 0.3324630856513977, + "learning_rate": 4.859770114942529e-05, + "loss": 0.15, + "step": 244 + }, + { + "epoch": 0.2839962327030356, + "grad_norm": 0.30650344491004944, + "learning_rate": 4.859195402298851e-05, + "loss": 0.1328, + "step": 245 + }, + { + "epoch": 0.28515540099978265, + "grad_norm": 0.3292284309864044, + "learning_rate": 4.858620689655173e-05, + "loss": 0.1432, + "step": 246 + }, + { + "epoch": 0.28631456929652976, + "grad_norm": 0.3721246123313904, + "learning_rate": 4.8580459770114944e-05, + "loss": 0.1695, + "step": 247 + }, + { + "epoch": 0.2874737375932768, + "grad_norm": 0.39138564467430115, + "learning_rate": 4.857471264367816e-05, + "loss": 0.1595, + "step": 248 + }, + { + "epoch": 0.2886329058900239, + "grad_norm": 0.3113502860069275, + "learning_rate": 4.856896551724138e-05, + "loss": 0.1474, + "step": 249 + }, + { + "epoch": 0.289792074186771, + "grad_norm": 0.3272615373134613, + "learning_rate": 4.85632183908046e-05, + "loss": 0.1557, + "step": 250 + }, + { + "epoch": 0.29095124248351806, + "grad_norm": 0.32985273003578186, + "learning_rate": 4.8557471264367824e-05, + "loss": 0.1543, + "step": 251 + }, + { + "epoch": 0.2921104107802652, + "grad_norm": 0.4038926362991333, + "learning_rate": 4.855172413793104e-05, + "loss": 0.1762, + "step": 252 + }, + { + "epoch": 0.29326957907701223, + "grad_norm": 0.5201258063316345, + "learning_rate": 4.8545977011494253e-05, + "loss": 0.1435, + "step": 253 + }, + { + "epoch": 0.29442874737375935, + "grad_norm": 0.43428415060043335, + "learning_rate": 4.8540229885057475e-05, + "loss": 0.1461, + "step": 254 + }, + { + "epoch": 0.2955879156705064, + "grad_norm": 0.29131442308425903, + "learning_rate": 4.853448275862069e-05, + "loss": 0.1384, + "step": 255 + }, + { + "epoch": 0.2967470839672535, + "grad_norm": 0.40190252661705017, + "learning_rate": 4.8528735632183905e-05, + "loss": 0.1433, + "step": 256 + }, + { + "epoch": 0.2979062522640006, + "grad_norm": 0.34535473585128784, + "learning_rate": 4.8522988505747133e-05, + "loss": 0.147, + "step": 257 + }, + { + "epoch": 0.29906542056074764, + "grad_norm": 0.3541038930416107, + "learning_rate": 4.851724137931035e-05, + "loss": 0.1658, + "step": 258 + }, + { + "epoch": 0.30022458885749476, + "grad_norm": 0.39234408736228943, + "learning_rate": 4.851149425287357e-05, + "loss": 0.1711, + "step": 259 + }, + { + "epoch": 0.3013837571542418, + "grad_norm": 0.37314096093177795, + "learning_rate": 4.8505747126436785e-05, + "loss": 0.1475, + "step": 260 + }, + { + "epoch": 0.30254292545098893, + "grad_norm": 0.3385733366012573, + "learning_rate": 4.85e-05, + "loss": 0.1406, + "step": 261 + }, + { + "epoch": 0.303702093747736, + "grad_norm": 0.6898816823959351, + "learning_rate": 4.849425287356322e-05, + "loss": 0.1584, + "step": 262 + }, + { + "epoch": 0.3048612620444831, + "grad_norm": 0.30045434832572937, + "learning_rate": 4.8488505747126436e-05, + "loss": 0.1329, + "step": 263 + }, + { + "epoch": 0.30602043034123017, + "grad_norm": 0.40219223499298096, + "learning_rate": 4.848275862068966e-05, + "loss": 0.1529, + "step": 264 + }, + { + "epoch": 0.3071795986379772, + "grad_norm": 0.30509233474731445, + "learning_rate": 4.847701149425288e-05, + "loss": 0.1591, + "step": 265 + }, + { + "epoch": 0.30833876693472434, + "grad_norm": 0.3983840048313141, + "learning_rate": 4.8471264367816095e-05, + "loss": 0.1598, + "step": 266 + }, + { + "epoch": 0.3094979352314714, + "grad_norm": 0.39158061146736145, + "learning_rate": 4.846551724137931e-05, + "loss": 0.164, + "step": 267 + }, + { + "epoch": 0.3106571035282185, + "grad_norm": 0.3619145452976227, + "learning_rate": 4.845977011494253e-05, + "loss": 0.152, + "step": 268 + }, + { + "epoch": 0.3118162718249656, + "grad_norm": 0.34043577313423157, + "learning_rate": 4.8454022988505746e-05, + "loss": 0.1465, + "step": 269 + }, + { + "epoch": 0.3129754401217127, + "grad_norm": 0.38417112827301025, + "learning_rate": 4.844827586206897e-05, + "loss": 0.1398, + "step": 270 + }, + { + "epoch": 0.31413460841845975, + "grad_norm": 0.33110880851745605, + "learning_rate": 4.844252873563219e-05, + "loss": 0.1435, + "step": 271 + }, + { + "epoch": 0.31529377671520686, + "grad_norm": 0.3095364272594452, + "learning_rate": 4.8436781609195404e-05, + "loss": 0.1474, + "step": 272 + }, + { + "epoch": 0.3164529450119539, + "grad_norm": 0.34774062037467957, + "learning_rate": 4.8431034482758626e-05, + "loss": 0.156, + "step": 273 + }, + { + "epoch": 0.317612113308701, + "grad_norm": 0.3605648875236511, + "learning_rate": 4.842528735632184e-05, + "loss": 0.1545, + "step": 274 + }, + { + "epoch": 0.3187712816054481, + "grad_norm": 0.32547488808631897, + "learning_rate": 4.8419540229885056e-05, + "loss": 0.1448, + "step": 275 + }, + { + "epoch": 0.31993044990219516, + "grad_norm": 0.4434783458709717, + "learning_rate": 4.841379310344828e-05, + "loss": 0.1837, + "step": 276 + }, + { + "epoch": 0.3210896181989423, + "grad_norm": 0.3950771391391754, + "learning_rate": 4.840804597701149e-05, + "loss": 0.1667, + "step": 277 + }, + { + "epoch": 0.32224878649568933, + "grad_norm": 0.27604764699935913, + "learning_rate": 4.840229885057472e-05, + "loss": 0.1311, + "step": 278 + }, + { + "epoch": 0.32340795479243645, + "grad_norm": 0.29781222343444824, + "learning_rate": 4.8396551724137936e-05, + "loss": 0.1507, + "step": 279 + }, + { + "epoch": 0.3245671230891835, + "grad_norm": 0.300820529460907, + "learning_rate": 4.839080459770115e-05, + "loss": 0.1395, + "step": 280 + }, + { + "epoch": 0.3257262913859306, + "grad_norm": 0.3563086688518524, + "learning_rate": 4.838505747126437e-05, + "loss": 0.1542, + "step": 281 + }, + { + "epoch": 0.3268854596826777, + "grad_norm": 0.3500363826751709, + "learning_rate": 4.837931034482759e-05, + "loss": 0.1531, + "step": 282 + }, + { + "epoch": 0.32804462797942474, + "grad_norm": 0.340751975774765, + "learning_rate": 4.83735632183908e-05, + "loss": 0.154, + "step": 283 + }, + { + "epoch": 0.32920379627617186, + "grad_norm": 0.3770763874053955, + "learning_rate": 4.8367816091954024e-05, + "loss": 0.1632, + "step": 284 + }, + { + "epoch": 0.3303629645729189, + "grad_norm": 0.3394136130809784, + "learning_rate": 4.8362068965517246e-05, + "loss": 0.1575, + "step": 285 + }, + { + "epoch": 0.33152213286966603, + "grad_norm": 0.3633858263492584, + "learning_rate": 4.835632183908046e-05, + "loss": 0.1511, + "step": 286 + }, + { + "epoch": 0.3326813011664131, + "grad_norm": 0.27468162775039673, + "learning_rate": 4.835057471264368e-05, + "loss": 0.1445, + "step": 287 + }, + { + "epoch": 0.3338404694631602, + "grad_norm": 0.29763033986091614, + "learning_rate": 4.83448275862069e-05, + "loss": 0.1462, + "step": 288 + }, + { + "epoch": 0.33499963775990726, + "grad_norm": 0.2883772850036621, + "learning_rate": 4.833908045977012e-05, + "loss": 0.135, + "step": 289 + }, + { + "epoch": 0.3361588060566543, + "grad_norm": 0.3238728642463684, + "learning_rate": 4.8333333333333334e-05, + "loss": 0.1576, + "step": 290 + }, + { + "epoch": 0.33731797435340144, + "grad_norm": 0.4045778214931488, + "learning_rate": 4.8327586206896555e-05, + "loss": 0.1555, + "step": 291 + }, + { + "epoch": 0.3384771426501485, + "grad_norm": 0.3956005871295929, + "learning_rate": 4.832183908045978e-05, + "loss": 0.17, + "step": 292 + }, + { + "epoch": 0.3396363109468956, + "grad_norm": 0.2896331548690796, + "learning_rate": 4.831609195402299e-05, + "loss": 0.1436, + "step": 293 + }, + { + "epoch": 0.3407954792436427, + "grad_norm": 0.29543447494506836, + "learning_rate": 4.831034482758621e-05, + "loss": 0.1499, + "step": 294 + }, + { + "epoch": 0.3419546475403898, + "grad_norm": 0.32724907994270325, + "learning_rate": 4.830459770114943e-05, + "loss": 0.1568, + "step": 295 + }, + { + "epoch": 0.34311381583713685, + "grad_norm": 0.376208633184433, + "learning_rate": 4.829885057471264e-05, + "loss": 0.1449, + "step": 296 + }, + { + "epoch": 0.34427298413388396, + "grad_norm": 0.29767367243766785, + "learning_rate": 4.8293103448275865e-05, + "loss": 0.1444, + "step": 297 + }, + { + "epoch": 0.345432152430631, + "grad_norm": 0.30211395025253296, + "learning_rate": 4.828735632183909e-05, + "loss": 0.1375, + "step": 298 + }, + { + "epoch": 0.3465913207273781, + "grad_norm": 0.29970985651016235, + "learning_rate": 4.82816091954023e-05, + "loss": 0.1319, + "step": 299 + }, + { + "epoch": 0.3477504890241252, + "grad_norm": 0.30623510479927063, + "learning_rate": 4.827586206896552e-05, + "loss": 0.1489, + "step": 300 + }, + { + "epoch": 0.34890965732087226, + "grad_norm": 0.31533321738243103, + "learning_rate": 4.827011494252874e-05, + "loss": 0.1525, + "step": 301 + }, + { + "epoch": 0.3500688256176194, + "grad_norm": 0.35718074440956116, + "learning_rate": 4.826436781609195e-05, + "loss": 0.1465, + "step": 302 + }, + { + "epoch": 0.35122799391436643, + "grad_norm": 0.3315868675708771, + "learning_rate": 4.8258620689655175e-05, + "loss": 0.1438, + "step": 303 + }, + { + "epoch": 0.35238716221111355, + "grad_norm": 0.3789491355419159, + "learning_rate": 4.825287356321839e-05, + "loss": 0.1624, + "step": 304 + }, + { + "epoch": 0.3535463305078606, + "grad_norm": 0.3216198980808258, + "learning_rate": 4.824712643678161e-05, + "loss": 0.1495, + "step": 305 + }, + { + "epoch": 0.35470549880460767, + "grad_norm": 0.24692386388778687, + "learning_rate": 4.824137931034483e-05, + "loss": 0.1333, + "step": 306 + }, + { + "epoch": 0.3558646671013548, + "grad_norm": 0.26879554986953735, + "learning_rate": 4.823563218390805e-05, + "loss": 0.1328, + "step": 307 + }, + { + "epoch": 0.35702383539810184, + "grad_norm": 0.3616696894168854, + "learning_rate": 4.822988505747127e-05, + "loss": 0.1642, + "step": 308 + }, + { + "epoch": 0.35818300369484896, + "grad_norm": 0.4611773192882538, + "learning_rate": 4.8224137931034485e-05, + "loss": 0.1342, + "step": 309 + }, + { + "epoch": 0.359342171991596, + "grad_norm": 0.26914867758750916, + "learning_rate": 4.82183908045977e-05, + "loss": 0.1393, + "step": 310 + }, + { + "epoch": 0.36050134028834313, + "grad_norm": 0.3391886353492737, + "learning_rate": 4.821264367816092e-05, + "loss": 0.1502, + "step": 311 + }, + { + "epoch": 0.3616605085850902, + "grad_norm": 0.34171319007873535, + "learning_rate": 4.820689655172414e-05, + "loss": 0.1655, + "step": 312 + }, + { + "epoch": 0.3628196768818373, + "grad_norm": 0.2735805809497833, + "learning_rate": 4.820114942528736e-05, + "loss": 0.1211, + "step": 313 + }, + { + "epoch": 0.36397884517858436, + "grad_norm": 0.25337541103363037, + "learning_rate": 4.819540229885058e-05, + "loss": 0.1343, + "step": 314 + }, + { + "epoch": 0.3651380134753314, + "grad_norm": 0.3965268135070801, + "learning_rate": 4.8189655172413794e-05, + "loss": 0.1507, + "step": 315 + }, + { + "epoch": 0.36629718177207854, + "grad_norm": 0.26941707730293274, + "learning_rate": 4.8183908045977016e-05, + "loss": 0.1258, + "step": 316 + }, + { + "epoch": 0.3674563500688256, + "grad_norm": 0.31832900643348694, + "learning_rate": 4.817816091954023e-05, + "loss": 0.1693, + "step": 317 + }, + { + "epoch": 0.3686155183655727, + "grad_norm": 0.3194613456726074, + "learning_rate": 4.817241379310345e-05, + "loss": 0.1574, + "step": 318 + }, + { + "epoch": 0.3697746866623198, + "grad_norm": 0.3136231601238251, + "learning_rate": 4.8166666666666674e-05, + "loss": 0.1301, + "step": 319 + }, + { + "epoch": 0.3709338549590669, + "grad_norm": 0.302774578332901, + "learning_rate": 4.816091954022989e-05, + "loss": 0.1329, + "step": 320 + }, + { + "epoch": 0.37209302325581395, + "grad_norm": 0.3740023672580719, + "learning_rate": 4.8155172413793104e-05, + "loss": 0.1398, + "step": 321 + }, + { + "epoch": 0.37325219155256106, + "grad_norm": 0.35367128252983093, + "learning_rate": 4.8149425287356326e-05, + "loss": 0.1461, + "step": 322 + }, + { + "epoch": 0.3744113598493081, + "grad_norm": 0.35140734910964966, + "learning_rate": 4.814367816091954e-05, + "loss": 0.1545, + "step": 323 + }, + { + "epoch": 0.3755705281460552, + "grad_norm": 0.3153414726257324, + "learning_rate": 4.8137931034482755e-05, + "loss": 0.1283, + "step": 324 + }, + { + "epoch": 0.3767296964428023, + "grad_norm": 0.30042028427124023, + "learning_rate": 4.813218390804598e-05, + "loss": 0.151, + "step": 325 + }, + { + "epoch": 0.37788886473954936, + "grad_norm": 0.29036179184913635, + "learning_rate": 4.81264367816092e-05, + "loss": 0.1486, + "step": 326 + }, + { + "epoch": 0.3790480330362965, + "grad_norm": 0.2988058924674988, + "learning_rate": 4.812068965517242e-05, + "loss": 0.1545, + "step": 327 + }, + { + "epoch": 0.38020720133304353, + "grad_norm": 0.3292902410030365, + "learning_rate": 4.8114942528735635e-05, + "loss": 0.1629, + "step": 328 + }, + { + "epoch": 0.38136636962979065, + "grad_norm": 0.3768965005874634, + "learning_rate": 4.810919540229885e-05, + "loss": 0.1692, + "step": 329 + }, + { + "epoch": 0.3825255379265377, + "grad_norm": 0.28651896119117737, + "learning_rate": 4.810344827586207e-05, + "loss": 0.1534, + "step": 330 + }, + { + "epoch": 0.38368470622328477, + "grad_norm": 0.2976914942264557, + "learning_rate": 4.809770114942529e-05, + "loss": 0.1451, + "step": 331 + }, + { + "epoch": 0.3848438745200319, + "grad_norm": 0.3275349736213684, + "learning_rate": 4.809195402298851e-05, + "loss": 0.1632, + "step": 332 + }, + { + "epoch": 0.38600304281677894, + "grad_norm": 0.26491785049438477, + "learning_rate": 4.808620689655173e-05, + "loss": 0.1334, + "step": 333 + }, + { + "epoch": 0.38716221111352606, + "grad_norm": 0.28282400965690613, + "learning_rate": 4.8080459770114945e-05, + "loss": 0.1349, + "step": 334 + }, + { + "epoch": 0.3883213794102731, + "grad_norm": 0.31815335154533386, + "learning_rate": 4.807471264367817e-05, + "loss": 0.1514, + "step": 335 + }, + { + "epoch": 0.38948054770702023, + "grad_norm": 0.29894590377807617, + "learning_rate": 4.806896551724138e-05, + "loss": 0.1563, + "step": 336 + }, + { + "epoch": 0.3906397160037673, + "grad_norm": 0.31386902928352356, + "learning_rate": 4.80632183908046e-05, + "loss": 0.1553, + "step": 337 + }, + { + "epoch": 0.3917988843005144, + "grad_norm": 0.2859923541545868, + "learning_rate": 4.805747126436782e-05, + "loss": 0.1531, + "step": 338 + }, + { + "epoch": 0.39295805259726146, + "grad_norm": 0.3352300524711609, + "learning_rate": 4.805172413793104e-05, + "loss": 0.1412, + "step": 339 + }, + { + "epoch": 0.3941172208940085, + "grad_norm": 0.3148883283138275, + "learning_rate": 4.8045977011494255e-05, + "loss": 0.1297, + "step": 340 + }, + { + "epoch": 0.39527638919075564, + "grad_norm": 0.27482402324676514, + "learning_rate": 4.8040229885057477e-05, + "loss": 0.1333, + "step": 341 + }, + { + "epoch": 0.3964355574875027, + "grad_norm": 0.3114891052246094, + "learning_rate": 4.803448275862069e-05, + "loss": 0.1494, + "step": 342 + }, + { + "epoch": 0.3975947257842498, + "grad_norm": 0.3976518213748932, + "learning_rate": 4.8028735632183906e-05, + "loss": 0.1575, + "step": 343 + }, + { + "epoch": 0.3987538940809969, + "grad_norm": 0.4233146607875824, + "learning_rate": 4.802298850574713e-05, + "loss": 0.1459, + "step": 344 + }, + { + "epoch": 0.399913062377744, + "grad_norm": 0.30527806282043457, + "learning_rate": 4.801724137931034e-05, + "loss": 0.1369, + "step": 345 + }, + { + "epoch": 0.40107223067449105, + "grad_norm": 0.28466179966926575, + "learning_rate": 4.8011494252873565e-05, + "loss": 0.1283, + "step": 346 + }, + { + "epoch": 0.40223139897123816, + "grad_norm": 0.3994863033294678, + "learning_rate": 4.8005747126436786e-05, + "loss": 0.1623, + "step": 347 + }, + { + "epoch": 0.4033905672679852, + "grad_norm": 0.32415613532066345, + "learning_rate": 4.8e-05, + "loss": 0.155, + "step": 348 + }, + { + "epoch": 0.4045497355647323, + "grad_norm": 0.37542182207107544, + "learning_rate": 4.799425287356322e-05, + "loss": 0.1594, + "step": 349 + }, + { + "epoch": 0.4057089038614794, + "grad_norm": 0.33119359612464905, + "learning_rate": 4.798850574712644e-05, + "loss": 0.1555, + "step": 350 + }, + { + "epoch": 0.40686807215822646, + "grad_norm": 0.2560279369354248, + "learning_rate": 4.798275862068965e-05, + "loss": 0.1241, + "step": 351 + }, + { + "epoch": 0.40802724045497357, + "grad_norm": 0.2843601405620575, + "learning_rate": 4.7977011494252874e-05, + "loss": 0.1289, + "step": 352 + }, + { + "epoch": 0.40918640875172063, + "grad_norm": 0.29628679156303406, + "learning_rate": 4.7971264367816096e-05, + "loss": 0.1476, + "step": 353 + }, + { + "epoch": 0.41034557704846775, + "grad_norm": 0.36033913493156433, + "learning_rate": 4.796551724137932e-05, + "loss": 0.1579, + "step": 354 + }, + { + "epoch": 0.4115047453452148, + "grad_norm": 0.2819597125053406, + "learning_rate": 4.795977011494253e-05, + "loss": 0.1389, + "step": 355 + }, + { + "epoch": 0.41266391364196187, + "grad_norm": 0.2536991834640503, + "learning_rate": 4.795402298850575e-05, + "loss": 0.1342, + "step": 356 + }, + { + "epoch": 0.413823081938709, + "grad_norm": 0.339702844619751, + "learning_rate": 4.794827586206897e-05, + "loss": 0.1452, + "step": 357 + }, + { + "epoch": 0.41498225023545604, + "grad_norm": 0.29577532410621643, + "learning_rate": 4.7942528735632184e-05, + "loss": 0.1516, + "step": 358 + }, + { + "epoch": 0.41614141853220316, + "grad_norm": 0.30197975039482117, + "learning_rate": 4.7936781609195406e-05, + "loss": 0.1341, + "step": 359 + }, + { + "epoch": 0.4173005868289502, + "grad_norm": 0.2787219285964966, + "learning_rate": 4.793103448275863e-05, + "loss": 0.1426, + "step": 360 + }, + { + "epoch": 0.41845975512569733, + "grad_norm": 0.3281581997871399, + "learning_rate": 4.792528735632184e-05, + "loss": 0.1708, + "step": 361 + }, + { + "epoch": 0.4196189234224444, + "grad_norm": 0.2598758041858673, + "learning_rate": 4.791954022988506e-05, + "loss": 0.1323, + "step": 362 + }, + { + "epoch": 0.4207780917191915, + "grad_norm": 0.26790300011634827, + "learning_rate": 4.791379310344828e-05, + "loss": 0.1202, + "step": 363 + }, + { + "epoch": 0.42193726001593856, + "grad_norm": 0.2569654583930969, + "learning_rate": 4.7908045977011494e-05, + "loss": 0.1312, + "step": 364 + }, + { + "epoch": 0.4230964283126856, + "grad_norm": 0.3506622314453125, + "learning_rate": 4.7902298850574716e-05, + "loss": 0.1634, + "step": 365 + }, + { + "epoch": 0.42425559660943274, + "grad_norm": 0.3123525083065033, + "learning_rate": 4.789655172413793e-05, + "loss": 0.1588, + "step": 366 + }, + { + "epoch": 0.4254147649061798, + "grad_norm": 0.26198455691337585, + "learning_rate": 4.789080459770115e-05, + "loss": 0.1387, + "step": 367 + }, + { + "epoch": 0.4265739332029269, + "grad_norm": 0.3061137795448303, + "learning_rate": 4.7885057471264374e-05, + "loss": 0.1444, + "step": 368 + }, + { + "epoch": 0.427733101499674, + "grad_norm": 0.48860934376716614, + "learning_rate": 4.787931034482759e-05, + "loss": 0.151, + "step": 369 + }, + { + "epoch": 0.4288922697964211, + "grad_norm": 0.3341166377067566, + "learning_rate": 4.7873563218390804e-05, + "loss": 0.1377, + "step": 370 + }, + { + "epoch": 0.43005143809316815, + "grad_norm": 0.38944321870803833, + "learning_rate": 4.7867816091954025e-05, + "loss": 0.1635, + "step": 371 + }, + { + "epoch": 0.43121060638991526, + "grad_norm": 0.3289624750614166, + "learning_rate": 4.786206896551724e-05, + "loss": 0.1492, + "step": 372 + }, + { + "epoch": 0.4323697746866623, + "grad_norm": 0.2940187454223633, + "learning_rate": 4.785632183908046e-05, + "loss": 0.1386, + "step": 373 + }, + { + "epoch": 0.4335289429834094, + "grad_norm": 0.3036423921585083, + "learning_rate": 4.7850574712643684e-05, + "loss": 0.1383, + "step": 374 + }, + { + "epoch": 0.4346881112801565, + "grad_norm": 0.343669593334198, + "learning_rate": 4.78448275862069e-05, + "loss": 0.1605, + "step": 375 + }, + { + "epoch": 0.43584727957690356, + "grad_norm": 0.3061246871948242, + "learning_rate": 4.783908045977012e-05, + "loss": 0.1286, + "step": 376 + }, + { + "epoch": 0.43700644787365067, + "grad_norm": 0.33070918917655945, + "learning_rate": 4.7833333333333335e-05, + "loss": 0.1458, + "step": 377 + }, + { + "epoch": 0.43816561617039773, + "grad_norm": 0.9597157835960388, + "learning_rate": 4.782758620689655e-05, + "loss": 0.1461, + "step": 378 + }, + { + "epoch": 0.43932478446714485, + "grad_norm": 0.28434523940086365, + "learning_rate": 4.782183908045977e-05, + "loss": 0.1383, + "step": 379 + }, + { + "epoch": 0.4404839527638919, + "grad_norm": 0.2993975579738617, + "learning_rate": 4.781609195402299e-05, + "loss": 0.1398, + "step": 380 + }, + { + "epoch": 0.44164312106063897, + "grad_norm": 0.2816145420074463, + "learning_rate": 4.781034482758621e-05, + "loss": 0.1337, + "step": 381 + }, + { + "epoch": 0.4428022893573861, + "grad_norm": 0.906739354133606, + "learning_rate": 4.780459770114943e-05, + "loss": 0.1285, + "step": 382 + }, + { + "epoch": 0.44396145765413314, + "grad_norm": 0.25160372257232666, + "learning_rate": 4.7798850574712645e-05, + "loss": 0.1219, + "step": 383 + }, + { + "epoch": 0.44512062595088026, + "grad_norm": 0.3275229036808014, + "learning_rate": 4.7793103448275866e-05, + "loss": 0.1546, + "step": 384 + }, + { + "epoch": 0.4462797942476273, + "grad_norm": 0.2726188600063324, + "learning_rate": 4.778735632183908e-05, + "loss": 0.1479, + "step": 385 + }, + { + "epoch": 0.44743896254437443, + "grad_norm": 0.2804552912712097, + "learning_rate": 4.7781609195402296e-05, + "loss": 0.1386, + "step": 386 + }, + { + "epoch": 0.4485981308411215, + "grad_norm": 0.3268015682697296, + "learning_rate": 4.777586206896552e-05, + "loss": 0.1549, + "step": 387 + }, + { + "epoch": 0.4497572991378686, + "grad_norm": 0.263644814491272, + "learning_rate": 4.777011494252874e-05, + "loss": 0.1401, + "step": 388 + }, + { + "epoch": 0.45091646743461566, + "grad_norm": 0.28181982040405273, + "learning_rate": 4.7764367816091954e-05, + "loss": 0.1351, + "step": 389 + }, + { + "epoch": 0.4520756357313627, + "grad_norm": 0.2432672381401062, + "learning_rate": 4.7758620689655176e-05, + "loss": 0.1279, + "step": 390 + }, + { + "epoch": 0.45323480402810984, + "grad_norm": 0.24589940905570984, + "learning_rate": 4.775287356321839e-05, + "loss": 0.133, + "step": 391 + }, + { + "epoch": 0.4543939723248569, + "grad_norm": 0.29978325963020325, + "learning_rate": 4.774712643678161e-05, + "loss": 0.1392, + "step": 392 + }, + { + "epoch": 0.455553140621604, + "grad_norm": 0.32140663266181946, + "learning_rate": 4.774137931034483e-05, + "loss": 0.1376, + "step": 393 + }, + { + "epoch": 0.4567123089183511, + "grad_norm": 0.3055313527584076, + "learning_rate": 4.773563218390805e-05, + "loss": 0.148, + "step": 394 + }, + { + "epoch": 0.4578714772150982, + "grad_norm": 0.31229299306869507, + "learning_rate": 4.772988505747127e-05, + "loss": 0.1446, + "step": 395 + }, + { + "epoch": 0.45903064551184525, + "grad_norm": 0.3736218214035034, + "learning_rate": 4.7724137931034486e-05, + "loss": 0.1704, + "step": 396 + }, + { + "epoch": 0.4601898138085923, + "grad_norm": 0.2979601323604584, + "learning_rate": 4.77183908045977e-05, + "loss": 0.1347, + "step": 397 + }, + { + "epoch": 0.4613489821053394, + "grad_norm": 0.288699746131897, + "learning_rate": 4.771264367816092e-05, + "loss": 0.1335, + "step": 398 + }, + { + "epoch": 0.4625081504020865, + "grad_norm": 0.30045926570892334, + "learning_rate": 4.770689655172414e-05, + "loss": 0.1466, + "step": 399 + }, + { + "epoch": 0.4636673186988336, + "grad_norm": 0.41966867446899414, + "learning_rate": 4.770114942528736e-05, + "loss": 0.1628, + "step": 400 + }, + { + "epoch": 0.46482648699558066, + "grad_norm": 0.3311939537525177, + "learning_rate": 4.769540229885058e-05, + "loss": 0.1448, + "step": 401 + }, + { + "epoch": 0.46598565529232777, + "grad_norm": 0.2968727648258209, + "learning_rate": 4.7689655172413796e-05, + "loss": 0.1471, + "step": 402 + }, + { + "epoch": 0.46714482358907483, + "grad_norm": 0.3325969874858856, + "learning_rate": 4.768390804597702e-05, + "loss": 0.1527, + "step": 403 + }, + { + "epoch": 0.46830399188582195, + "grad_norm": 0.32422512769699097, + "learning_rate": 4.767816091954023e-05, + "loss": 0.1516, + "step": 404 + }, + { + "epoch": 0.469463160182569, + "grad_norm": 0.28346890211105347, + "learning_rate": 4.767241379310345e-05, + "loss": 0.1359, + "step": 405 + }, + { + "epoch": 0.47062232847931607, + "grad_norm": 0.27612075209617615, + "learning_rate": 4.766666666666667e-05, + "loss": 0.1373, + "step": 406 + }, + { + "epoch": 0.4717814967760632, + "grad_norm": 0.2808459401130676, + "learning_rate": 4.7660919540229884e-05, + "loss": 0.1408, + "step": 407 + }, + { + "epoch": 0.47294066507281024, + "grad_norm": 0.2832328677177429, + "learning_rate": 4.7655172413793105e-05, + "loss": 0.139, + "step": 408 + }, + { + "epoch": 0.47409983336955736, + "grad_norm": 0.27688664197921753, + "learning_rate": 4.764942528735633e-05, + "loss": 0.1392, + "step": 409 + }, + { + "epoch": 0.4752590016663044, + "grad_norm": 0.3172287344932556, + "learning_rate": 4.764367816091954e-05, + "loss": 0.1512, + "step": 410 + }, + { + "epoch": 0.47641816996305153, + "grad_norm": 0.2498215287923813, + "learning_rate": 4.7637931034482764e-05, + "loss": 0.12, + "step": 411 + }, + { + "epoch": 0.4775773382597986, + "grad_norm": 0.3091871440410614, + "learning_rate": 4.763218390804598e-05, + "loss": 0.1403, + "step": 412 + }, + { + "epoch": 0.4787365065565457, + "grad_norm": 0.29513412714004517, + "learning_rate": 4.7626436781609193e-05, + "loss": 0.1553, + "step": 413 + }, + { + "epoch": 0.47989567485329276, + "grad_norm": 0.31259381771087646, + "learning_rate": 4.7620689655172415e-05, + "loss": 0.138, + "step": 414 + }, + { + "epoch": 0.4810548431500398, + "grad_norm": 0.3411064147949219, + "learning_rate": 4.761494252873564e-05, + "loss": 0.1385, + "step": 415 + }, + { + "epoch": 0.48221401144678694, + "grad_norm": 0.5548298358917236, + "learning_rate": 4.760919540229885e-05, + "loss": 0.1357, + "step": 416 + }, + { + "epoch": 0.483373179743534, + "grad_norm": 0.3198058605194092, + "learning_rate": 4.7603448275862073e-05, + "loss": 0.1578, + "step": 417 + }, + { + "epoch": 0.4845323480402811, + "grad_norm": 0.2812342643737793, + "learning_rate": 4.759770114942529e-05, + "loss": 0.1559, + "step": 418 + }, + { + "epoch": 0.4856915163370282, + "grad_norm": 0.36544597148895264, + "learning_rate": 4.759195402298851e-05, + "loss": 0.1513, + "step": 419 + }, + { + "epoch": 0.4868506846337753, + "grad_norm": 0.29660576581954956, + "learning_rate": 4.7586206896551725e-05, + "loss": 0.1506, + "step": 420 + }, + { + "epoch": 0.48800985293052235, + "grad_norm": 0.2850414216518402, + "learning_rate": 4.7580459770114947e-05, + "loss": 0.1371, + "step": 421 + }, + { + "epoch": 0.4891690212272694, + "grad_norm": 0.3292869031429291, + "learning_rate": 4.757471264367817e-05, + "loss": 0.1758, + "step": 422 + }, + { + "epoch": 0.4903281895240165, + "grad_norm": 0.2677929997444153, + "learning_rate": 4.756896551724138e-05, + "loss": 0.1345, + "step": 423 + }, + { + "epoch": 0.4914873578207636, + "grad_norm": 0.32202059030532837, + "learning_rate": 4.75632183908046e-05, + "loss": 0.1465, + "step": 424 + }, + { + "epoch": 0.4926465261175107, + "grad_norm": 0.2961694300174713, + "learning_rate": 4.755747126436782e-05, + "loss": 0.1461, + "step": 425 + }, + { + "epoch": 0.49380569441425776, + "grad_norm": 0.27797091007232666, + "learning_rate": 4.7551724137931035e-05, + "loss": 0.1186, + "step": 426 + }, + { + "epoch": 0.49496486271100487, + "grad_norm": 0.42147669196128845, + "learning_rate": 4.754597701149425e-05, + "loss": 0.1351, + "step": 427 + }, + { + "epoch": 0.49612403100775193, + "grad_norm": 0.41197213530540466, + "learning_rate": 4.754022988505747e-05, + "loss": 0.1242, + "step": 428 + }, + { + "epoch": 0.49728319930449905, + "grad_norm": 0.352716326713562, + "learning_rate": 4.753448275862069e-05, + "loss": 0.1541, + "step": 429 + }, + { + "epoch": 0.4984423676012461, + "grad_norm": 0.29174548387527466, + "learning_rate": 4.7528735632183915e-05, + "loss": 0.1466, + "step": 430 + }, + { + "epoch": 0.49960153589799317, + "grad_norm": 0.32256200909614563, + "learning_rate": 4.752298850574713e-05, + "loss": 0.1365, + "step": 431 + }, + { + "epoch": 0.5007607041947403, + "grad_norm": 0.35533028841018677, + "learning_rate": 4.7517241379310344e-05, + "loss": 0.1489, + "step": 432 + }, + { + "epoch": 0.5019198724914874, + "grad_norm": 0.3250570297241211, + "learning_rate": 4.7511494252873566e-05, + "loss": 0.157, + "step": 433 + }, + { + "epoch": 0.5030790407882344, + "grad_norm": 0.36209189891815186, + "learning_rate": 4.750574712643678e-05, + "loss": 0.1579, + "step": 434 + }, + { + "epoch": 0.5042382090849815, + "grad_norm": 0.33881399035453796, + "learning_rate": 4.75e-05, + "loss": 0.1516, + "step": 435 + }, + { + "epoch": 0.5053973773817286, + "grad_norm": 0.2697497010231018, + "learning_rate": 4.7494252873563224e-05, + "loss": 0.1347, + "step": 436 + }, + { + "epoch": 0.5065565456784757, + "grad_norm": 0.3097374439239502, + "learning_rate": 4.748850574712644e-05, + "loss": 0.1506, + "step": 437 + }, + { + "epoch": 0.5077157139752227, + "grad_norm": 0.28055351972579956, + "learning_rate": 4.748275862068966e-05, + "loss": 0.1418, + "step": 438 + }, + { + "epoch": 0.5088748822719699, + "grad_norm": 0.31767186522483826, + "learning_rate": 4.7477011494252876e-05, + "loss": 0.1502, + "step": 439 + }, + { + "epoch": 0.510034050568717, + "grad_norm": 0.3027343451976776, + "learning_rate": 4.747126436781609e-05, + "loss": 0.1437, + "step": 440 + }, + { + "epoch": 0.511193218865464, + "grad_norm": 0.2773297131061554, + "learning_rate": 4.746551724137931e-05, + "loss": 0.139, + "step": 441 + }, + { + "epoch": 0.5123523871622111, + "grad_norm": 0.2976417541503906, + "learning_rate": 4.7459770114942534e-05, + "loss": 0.147, + "step": 442 + }, + { + "epoch": 0.5135115554589582, + "grad_norm": 0.30484509468078613, + "learning_rate": 4.745402298850575e-05, + "loss": 0.1371, + "step": 443 + }, + { + "epoch": 0.5146707237557053, + "grad_norm": 0.3369823694229126, + "learning_rate": 4.744827586206897e-05, + "loss": 0.1674, + "step": 444 + }, + { + "epoch": 0.5158298920524523, + "grad_norm": 0.27961060404777527, + "learning_rate": 4.7442528735632186e-05, + "loss": 0.141, + "step": 445 + }, + { + "epoch": 0.5169890603491994, + "grad_norm": 0.29113152623176575, + "learning_rate": 4.74367816091954e-05, + "loss": 0.1432, + "step": 446 + }, + { + "epoch": 0.5181482286459466, + "grad_norm": 0.363936185836792, + "learning_rate": 4.743103448275862e-05, + "loss": 0.1637, + "step": 447 + }, + { + "epoch": 0.5193073969426936, + "grad_norm": 0.3023361563682556, + "learning_rate": 4.742528735632184e-05, + "loss": 0.1368, + "step": 448 + }, + { + "epoch": 0.5204665652394407, + "grad_norm": 0.33404213190078735, + "learning_rate": 4.7419540229885065e-05, + "loss": 0.1555, + "step": 449 + }, + { + "epoch": 0.5216257335361878, + "grad_norm": 0.302898645401001, + "learning_rate": 4.741379310344828e-05, + "loss": 0.1354, + "step": 450 + }, + { + "epoch": 0.5227849018329349, + "grad_norm": 0.35367482900619507, + "learning_rate": 4.7408045977011495e-05, + "loss": 0.1583, + "step": 451 + }, + { + "epoch": 0.5239440701296819, + "grad_norm": 0.31517934799194336, + "learning_rate": 4.740229885057472e-05, + "loss": 0.1457, + "step": 452 + }, + { + "epoch": 0.525103238426429, + "grad_norm": 0.2723785936832428, + "learning_rate": 4.739655172413793e-05, + "loss": 0.1381, + "step": 453 + }, + { + "epoch": 0.5262624067231761, + "grad_norm": 0.2963480055332184, + "learning_rate": 4.739080459770115e-05, + "loss": 0.1408, + "step": 454 + }, + { + "epoch": 0.5274215750199232, + "grad_norm": 0.31834137439727783, + "learning_rate": 4.738505747126437e-05, + "loss": 0.1411, + "step": 455 + }, + { + "epoch": 0.5285807433166703, + "grad_norm": 0.31244561076164246, + "learning_rate": 4.737931034482759e-05, + "loss": 0.1368, + "step": 456 + }, + { + "epoch": 0.5297399116134174, + "grad_norm": 0.29169803857803345, + "learning_rate": 4.737356321839081e-05, + "loss": 0.1367, + "step": 457 + }, + { + "epoch": 0.5308990799101645, + "grad_norm": 0.2684137523174286, + "learning_rate": 4.736781609195403e-05, + "loss": 0.1482, + "step": 458 + }, + { + "epoch": 0.5320582482069115, + "grad_norm": 0.26147085428237915, + "learning_rate": 4.736206896551724e-05, + "loss": 0.1406, + "step": 459 + }, + { + "epoch": 0.5332174165036586, + "grad_norm": 0.2554711401462555, + "learning_rate": 4.735632183908046e-05, + "loss": 0.131, + "step": 460 + }, + { + "epoch": 0.5343765848004057, + "grad_norm": 0.2564930319786072, + "learning_rate": 4.735057471264368e-05, + "loss": 0.126, + "step": 461 + }, + { + "epoch": 0.5355357530971528, + "grad_norm": 0.2714848220348358, + "learning_rate": 4.73448275862069e-05, + "loss": 0.1308, + "step": 462 + }, + { + "epoch": 0.5366949213938998, + "grad_norm": 0.2883203327655792, + "learning_rate": 4.733908045977012e-05, + "loss": 0.1286, + "step": 463 + }, + { + "epoch": 0.537854089690647, + "grad_norm": 0.29676780104637146, + "learning_rate": 4.7333333333333336e-05, + "loss": 0.1499, + "step": 464 + }, + { + "epoch": 0.5390132579873941, + "grad_norm": 0.2832816541194916, + "learning_rate": 4.732758620689655e-05, + "loss": 0.1452, + "step": 465 + }, + { + "epoch": 0.5401724262841411, + "grad_norm": 0.28993991017341614, + "learning_rate": 4.732183908045977e-05, + "loss": 0.1501, + "step": 466 + }, + { + "epoch": 0.5413315945808882, + "grad_norm": 0.3065154254436493, + "learning_rate": 4.731609195402299e-05, + "loss": 0.1545, + "step": 467 + }, + { + "epoch": 0.5424907628776353, + "grad_norm": 0.28335148096084595, + "learning_rate": 4.731034482758621e-05, + "loss": 0.1305, + "step": 468 + }, + { + "epoch": 0.5436499311743824, + "grad_norm": 0.29181987047195435, + "learning_rate": 4.730459770114943e-05, + "loss": 0.1379, + "step": 469 + }, + { + "epoch": 0.5448090994711294, + "grad_norm": 0.2934599220752716, + "learning_rate": 4.7298850574712646e-05, + "loss": 0.1435, + "step": 470 + }, + { + "epoch": 0.5459682677678765, + "grad_norm": 0.2953728437423706, + "learning_rate": 4.729310344827587e-05, + "loss": 0.1453, + "step": 471 + }, + { + "epoch": 0.5471274360646237, + "grad_norm": 0.2823188006877899, + "learning_rate": 4.728735632183908e-05, + "loss": 0.1312, + "step": 472 + }, + { + "epoch": 0.5482866043613707, + "grad_norm": 0.2421528846025467, + "learning_rate": 4.72816091954023e-05, + "loss": 0.1207, + "step": 473 + }, + { + "epoch": 0.5494457726581178, + "grad_norm": 0.3098517954349518, + "learning_rate": 4.727586206896552e-05, + "loss": 0.1492, + "step": 474 + }, + { + "epoch": 0.5506049409548649, + "grad_norm": 0.2447943240404129, + "learning_rate": 4.7270114942528734e-05, + "loss": 0.128, + "step": 475 + }, + { + "epoch": 0.551764109251612, + "grad_norm": 0.37644991278648376, + "learning_rate": 4.7264367816091956e-05, + "loss": 0.1414, + "step": 476 + }, + { + "epoch": 0.552923277548359, + "grad_norm": 0.2898421883583069, + "learning_rate": 4.725862068965518e-05, + "loss": 0.1382, + "step": 477 + }, + { + "epoch": 0.5540824458451061, + "grad_norm": 0.3101235032081604, + "learning_rate": 4.725287356321839e-05, + "loss": 0.1507, + "step": 478 + }, + { + "epoch": 0.5552416141418532, + "grad_norm": 0.31535589694976807, + "learning_rate": 4.7247126436781614e-05, + "loss": 0.1479, + "step": 479 + }, + { + "epoch": 0.5564007824386002, + "grad_norm": 0.3017010986804962, + "learning_rate": 4.724137931034483e-05, + "loss": 0.1491, + "step": 480 + }, + { + "epoch": 0.5575599507353474, + "grad_norm": 0.27675431966781616, + "learning_rate": 4.7235632183908044e-05, + "loss": 0.1343, + "step": 481 + }, + { + "epoch": 0.5587191190320945, + "grad_norm": 0.28481778502464294, + "learning_rate": 4.7229885057471266e-05, + "loss": 0.1598, + "step": 482 + }, + { + "epoch": 0.5598782873288416, + "grad_norm": 0.2603989541530609, + "learning_rate": 4.722413793103449e-05, + "loss": 0.1365, + "step": 483 + }, + { + "epoch": 0.5610374556255886, + "grad_norm": 0.2759985625743866, + "learning_rate": 4.72183908045977e-05, + "loss": 0.1291, + "step": 484 + }, + { + "epoch": 0.5621966239223357, + "grad_norm": 0.27497798204421997, + "learning_rate": 4.7212643678160924e-05, + "loss": 0.1302, + "step": 485 + }, + { + "epoch": 0.5633557922190828, + "grad_norm": 0.4083801209926605, + "learning_rate": 4.720689655172414e-05, + "loss": 0.1512, + "step": 486 + }, + { + "epoch": 0.5645149605158299, + "grad_norm": 0.25597989559173584, + "learning_rate": 4.720114942528736e-05, + "loss": 0.125, + "step": 487 + }, + { + "epoch": 0.565674128812577, + "grad_norm": 0.34843572974205017, + "learning_rate": 4.7195402298850575e-05, + "loss": 0.1537, + "step": 488 + }, + { + "epoch": 0.5668332971093241, + "grad_norm": 0.34892401099205017, + "learning_rate": 4.718965517241379e-05, + "loss": 0.1342, + "step": 489 + }, + { + "epoch": 0.5679924654060712, + "grad_norm": 0.28030136227607727, + "learning_rate": 4.718390804597702e-05, + "loss": 0.1477, + "step": 490 + }, + { + "epoch": 0.5691516337028182, + "grad_norm": 0.27783283591270447, + "learning_rate": 4.7178160919540234e-05, + "loss": 0.1431, + "step": 491 + }, + { + "epoch": 0.5703108019995653, + "grad_norm": 0.23067143559455872, + "learning_rate": 4.717241379310345e-05, + "loss": 0.1242, + "step": 492 + }, + { + "epoch": 0.5714699702963124, + "grad_norm": 0.25984445214271545, + "learning_rate": 4.716666666666667e-05, + "loss": 0.1298, + "step": 493 + }, + { + "epoch": 0.5726291385930595, + "grad_norm": 0.255754292011261, + "learning_rate": 4.7160919540229885e-05, + "loss": 0.1398, + "step": 494 + }, + { + "epoch": 0.5737883068898065, + "grad_norm": 0.28669071197509766, + "learning_rate": 4.715517241379311e-05, + "loss": 0.1428, + "step": 495 + }, + { + "epoch": 0.5749474751865536, + "grad_norm": 0.29052338004112244, + "learning_rate": 4.714942528735632e-05, + "loss": 0.1399, + "step": 496 + }, + { + "epoch": 0.5761066434833008, + "grad_norm": 0.24293456971645355, + "learning_rate": 4.714367816091954e-05, + "loss": 0.122, + "step": 497 + }, + { + "epoch": 0.5772658117800478, + "grad_norm": 0.278358519077301, + "learning_rate": 4.7137931034482765e-05, + "loss": 0.146, + "step": 498 + }, + { + "epoch": 0.5784249800767949, + "grad_norm": 0.25874063372612, + "learning_rate": 4.713218390804598e-05, + "loss": 0.1298, + "step": 499 + }, + { + "epoch": 0.579584148373542, + "grad_norm": 0.4070255756378174, + "learning_rate": 4.7126436781609195e-05, + "loss": 0.1511, + "step": 500 + }, + { + "epoch": 0.5807433166702891, + "grad_norm": 0.4098628759384155, + "learning_rate": 4.7120689655172417e-05, + "loss": 0.1399, + "step": 501 + }, + { + "epoch": 0.5819024849670361, + "grad_norm": 0.33255958557128906, + "learning_rate": 4.711494252873563e-05, + "loss": 0.1489, + "step": 502 + }, + { + "epoch": 0.5830616532637832, + "grad_norm": 0.2659994959831238, + "learning_rate": 4.710919540229885e-05, + "loss": 0.1258, + "step": 503 + }, + { + "epoch": 0.5842208215605303, + "grad_norm": 0.30784937739372253, + "learning_rate": 4.7103448275862075e-05, + "loss": 0.1468, + "step": 504 + }, + { + "epoch": 0.5853799898572773, + "grad_norm": 0.30625712871551514, + "learning_rate": 4.709770114942529e-05, + "loss": 0.1355, + "step": 505 + }, + { + "epoch": 0.5865391581540245, + "grad_norm": 0.32840919494628906, + "learning_rate": 4.709195402298851e-05, + "loss": 0.1346, + "step": 506 + }, + { + "epoch": 0.5876983264507716, + "grad_norm": 0.2901023030281067, + "learning_rate": 4.7086206896551726e-05, + "loss": 0.1621, + "step": 507 + }, + { + "epoch": 0.5888574947475187, + "grad_norm": 0.29218408465385437, + "learning_rate": 4.708045977011494e-05, + "loss": 0.1415, + "step": 508 + }, + { + "epoch": 0.5900166630442657, + "grad_norm": 0.2910397946834564, + "learning_rate": 4.707471264367816e-05, + "loss": 0.1536, + "step": 509 + }, + { + "epoch": 0.5911758313410128, + "grad_norm": 0.23362290859222412, + "learning_rate": 4.7068965517241385e-05, + "loss": 0.1204, + "step": 510 + }, + { + "epoch": 0.5923349996377599, + "grad_norm": 0.29354870319366455, + "learning_rate": 4.70632183908046e-05, + "loss": 0.1392, + "step": 511 + }, + { + "epoch": 0.593494167934507, + "grad_norm": 0.26072168350219727, + "learning_rate": 4.705747126436782e-05, + "loss": 0.1376, + "step": 512 + }, + { + "epoch": 0.594653336231254, + "grad_norm": 0.32599276304244995, + "learning_rate": 4.7051724137931036e-05, + "loss": 0.1371, + "step": 513 + }, + { + "epoch": 0.5958125045280012, + "grad_norm": 0.2970302402973175, + "learning_rate": 4.704597701149426e-05, + "loss": 0.1376, + "step": 514 + }, + { + "epoch": 0.5969716728247483, + "grad_norm": 0.2292068749666214, + "learning_rate": 4.704022988505747e-05, + "loss": 0.1264, + "step": 515 + }, + { + "epoch": 0.5981308411214953, + "grad_norm": 0.2724842131137848, + "learning_rate": 4.703448275862069e-05, + "loss": 0.136, + "step": 516 + }, + { + "epoch": 0.5992900094182424, + "grad_norm": 0.2579404413700104, + "learning_rate": 4.702873563218391e-05, + "loss": 0.1378, + "step": 517 + }, + { + "epoch": 0.6004491777149895, + "grad_norm": 0.28821900486946106, + "learning_rate": 4.702298850574713e-05, + "loss": 0.1481, + "step": 518 + }, + { + "epoch": 0.6016083460117366, + "grad_norm": 0.31739768385887146, + "learning_rate": 4.7017241379310346e-05, + "loss": 0.1536, + "step": 519 + }, + { + "epoch": 0.6027675143084836, + "grad_norm": 0.2741948366165161, + "learning_rate": 4.701149425287357e-05, + "loss": 0.1293, + "step": 520 + }, + { + "epoch": 0.6039266826052307, + "grad_norm": 0.2746334969997406, + "learning_rate": 4.700574712643678e-05, + "loss": 0.1309, + "step": 521 + }, + { + "epoch": 0.6050858509019779, + "grad_norm": 0.2569972276687622, + "learning_rate": 4.7e-05, + "loss": 0.1192, + "step": 522 + }, + { + "epoch": 0.6062450191987249, + "grad_norm": 0.3422163128852844, + "learning_rate": 4.699425287356322e-05, + "loss": 0.1518, + "step": 523 + }, + { + "epoch": 0.607404187495472, + "grad_norm": 0.30587247014045715, + "learning_rate": 4.698850574712644e-05, + "loss": 0.1599, + "step": 524 + }, + { + "epoch": 0.6085633557922191, + "grad_norm": 0.27892470359802246, + "learning_rate": 4.698275862068966e-05, + "loss": 0.1373, + "step": 525 + }, + { + "epoch": 0.6097225240889662, + "grad_norm": 0.3484020233154297, + "learning_rate": 4.697701149425288e-05, + "loss": 0.1368, + "step": 526 + }, + { + "epoch": 0.6108816923857132, + "grad_norm": 0.2913890779018402, + "learning_rate": 4.697126436781609e-05, + "loss": 0.1351, + "step": 527 + }, + { + "epoch": 0.6120408606824603, + "grad_norm": 0.3555639982223511, + "learning_rate": 4.6965517241379314e-05, + "loss": 0.1608, + "step": 528 + }, + { + "epoch": 0.6132000289792074, + "grad_norm": 0.2778078317642212, + "learning_rate": 4.695977011494253e-05, + "loss": 0.1446, + "step": 529 + }, + { + "epoch": 0.6143591972759544, + "grad_norm": 0.24142858386039734, + "learning_rate": 4.695402298850575e-05, + "loss": 0.1161, + "step": 530 + }, + { + "epoch": 0.6155183655727016, + "grad_norm": 0.2555319368839264, + "learning_rate": 4.694827586206897e-05, + "loss": 0.1229, + "step": 531 + }, + { + "epoch": 0.6166775338694487, + "grad_norm": 0.2874903678894043, + "learning_rate": 4.694252873563219e-05, + "loss": 0.1487, + "step": 532 + }, + { + "epoch": 0.6178367021661958, + "grad_norm": 0.27659285068511963, + "learning_rate": 4.693678160919541e-05, + "loss": 0.1458, + "step": 533 + }, + { + "epoch": 0.6189958704629428, + "grad_norm": 0.3088036775588989, + "learning_rate": 4.6931034482758623e-05, + "loss": 0.1448, + "step": 534 + }, + { + "epoch": 0.6201550387596899, + "grad_norm": 0.338383287191391, + "learning_rate": 4.692528735632184e-05, + "loss": 0.124, + "step": 535 + }, + { + "epoch": 0.621314207056437, + "grad_norm": 0.27692559361457825, + "learning_rate": 4.691954022988506e-05, + "loss": 0.1417, + "step": 536 + }, + { + "epoch": 0.6224733753531841, + "grad_norm": 0.2864556610584259, + "learning_rate": 4.6913793103448275e-05, + "loss": 0.1406, + "step": 537 + }, + { + "epoch": 0.6236325436499311, + "grad_norm": 0.3573722243309021, + "learning_rate": 4.69080459770115e-05, + "loss": 0.1445, + "step": 538 + }, + { + "epoch": 0.6247917119466783, + "grad_norm": 0.27817103266716003, + "learning_rate": 4.690229885057472e-05, + "loss": 0.1359, + "step": 539 + }, + { + "epoch": 0.6259508802434254, + "grad_norm": 0.3294277787208557, + "learning_rate": 4.689655172413793e-05, + "loss": 0.1455, + "step": 540 + }, + { + "epoch": 0.6271100485401724, + "grad_norm": 0.2555405795574188, + "learning_rate": 4.689080459770115e-05, + "loss": 0.1352, + "step": 541 + }, + { + "epoch": 0.6282692168369195, + "grad_norm": 0.27222466468811035, + "learning_rate": 4.688505747126437e-05, + "loss": 0.1391, + "step": 542 + }, + { + "epoch": 0.6294283851336666, + "grad_norm": 0.26646101474761963, + "learning_rate": 4.6879310344827585e-05, + "loss": 0.1405, + "step": 543 + }, + { + "epoch": 0.6305875534304137, + "grad_norm": 0.295244961977005, + "learning_rate": 4.6873563218390806e-05, + "loss": 0.1401, + "step": 544 + }, + { + "epoch": 0.6317467217271607, + "grad_norm": 0.26528140902519226, + "learning_rate": 4.686781609195403e-05, + "loss": 0.1342, + "step": 545 + }, + { + "epoch": 0.6329058900239078, + "grad_norm": 0.2771555781364441, + "learning_rate": 4.686206896551724e-05, + "loss": 0.1525, + "step": 546 + }, + { + "epoch": 0.634065058320655, + "grad_norm": 0.3281809091567993, + "learning_rate": 4.6856321839080465e-05, + "loss": 0.1653, + "step": 547 + }, + { + "epoch": 0.635224226617402, + "grad_norm": 0.3082864284515381, + "learning_rate": 4.685057471264368e-05, + "loss": 0.125, + "step": 548 + }, + { + "epoch": 0.6363833949141491, + "grad_norm": 0.30822286009788513, + "learning_rate": 4.6844827586206894e-05, + "loss": 0.1455, + "step": 549 + }, + { + "epoch": 0.6375425632108962, + "grad_norm": 0.3183049261569977, + "learning_rate": 4.6839080459770116e-05, + "loss": 0.1363, + "step": 550 + }, + { + "epoch": 0.6387017315076433, + "grad_norm": 0.2250736802816391, + "learning_rate": 4.683333333333334e-05, + "loss": 0.1147, + "step": 551 + }, + { + "epoch": 0.6398608998043903, + "grad_norm": 0.2534612715244293, + "learning_rate": 4.682758620689656e-05, + "loss": 0.1447, + "step": 552 + }, + { + "epoch": 0.6410200681011374, + "grad_norm": 0.2627923786640167, + "learning_rate": 4.6821839080459774e-05, + "loss": 0.1404, + "step": 553 + }, + { + "epoch": 0.6421792363978845, + "grad_norm": 0.25790974497795105, + "learning_rate": 4.681609195402299e-05, + "loss": 0.1267, + "step": 554 + }, + { + "epoch": 0.6433384046946315, + "grad_norm": 0.2535199224948883, + "learning_rate": 4.681034482758621e-05, + "loss": 0.1377, + "step": 555 + }, + { + "epoch": 0.6444975729913787, + "grad_norm": 0.27798426151275635, + "learning_rate": 4.6804597701149426e-05, + "loss": 0.1422, + "step": 556 + }, + { + "epoch": 0.6456567412881258, + "grad_norm": 0.26984500885009766, + "learning_rate": 4.679885057471264e-05, + "loss": 0.1383, + "step": 557 + }, + { + "epoch": 0.6468159095848729, + "grad_norm": 0.28631749749183655, + "learning_rate": 4.679310344827586e-05, + "loss": 0.1402, + "step": 558 + }, + { + "epoch": 0.6479750778816199, + "grad_norm": 0.29483139514923096, + "learning_rate": 4.6787356321839084e-05, + "loss": 0.1384, + "step": 559 + }, + { + "epoch": 0.649134246178367, + "grad_norm": 0.23459696769714355, + "learning_rate": 4.67816091954023e-05, + "loss": 0.1242, + "step": 560 + }, + { + "epoch": 0.6502934144751141, + "grad_norm": 0.28944167494773865, + "learning_rate": 4.677586206896552e-05, + "loss": 0.1505, + "step": 561 + }, + { + "epoch": 0.6514525827718612, + "grad_norm": 0.3058931529521942, + "learning_rate": 4.6770114942528736e-05, + "loss": 0.1369, + "step": 562 + }, + { + "epoch": 0.6526117510686082, + "grad_norm": 0.28087151050567627, + "learning_rate": 4.676436781609196e-05, + "loss": 0.1372, + "step": 563 + }, + { + "epoch": 0.6537709193653554, + "grad_norm": 0.3356817364692688, + "learning_rate": 4.675862068965517e-05, + "loss": 0.1603, + "step": 564 + }, + { + "epoch": 0.6549300876621025, + "grad_norm": 0.24305835366249084, + "learning_rate": 4.6752873563218394e-05, + "loss": 0.1336, + "step": 565 + }, + { + "epoch": 0.6560892559588495, + "grad_norm": 0.26663926243782043, + "learning_rate": 4.6747126436781616e-05, + "loss": 0.1326, + "step": 566 + }, + { + "epoch": 0.6572484242555966, + "grad_norm": 0.3176979124546051, + "learning_rate": 4.674137931034483e-05, + "loss": 0.1586, + "step": 567 + }, + { + "epoch": 0.6584075925523437, + "grad_norm": 0.2597542405128479, + "learning_rate": 4.6735632183908045e-05, + "loss": 0.1325, + "step": 568 + }, + { + "epoch": 0.6595667608490908, + "grad_norm": 0.2565508782863617, + "learning_rate": 4.672988505747127e-05, + "loss": 0.1403, + "step": 569 + }, + { + "epoch": 0.6607259291458378, + "grad_norm": 0.22864797711372375, + "learning_rate": 4.672413793103448e-05, + "loss": 0.13, + "step": 570 + }, + { + "epoch": 0.661885097442585, + "grad_norm": 0.27961140871047974, + "learning_rate": 4.6718390804597704e-05, + "loss": 0.145, + "step": 571 + }, + { + "epoch": 0.6630442657393321, + "grad_norm": 0.23423443734645844, + "learning_rate": 4.6712643678160925e-05, + "loss": 0.1325, + "step": 572 + }, + { + "epoch": 0.6642034340360791, + "grad_norm": 0.26428887248039246, + "learning_rate": 4.670689655172414e-05, + "loss": 0.1299, + "step": 573 + }, + { + "epoch": 0.6653626023328262, + "grad_norm": 0.2903366684913635, + "learning_rate": 4.670114942528736e-05, + "loss": 0.1445, + "step": 574 + }, + { + "epoch": 0.6665217706295733, + "grad_norm": 0.3106566369533539, + "learning_rate": 4.669540229885058e-05, + "loss": 0.1396, + "step": 575 + }, + { + "epoch": 0.6676809389263204, + "grad_norm": 0.2907349169254303, + "learning_rate": 4.668965517241379e-05, + "loss": 0.1224, + "step": 576 + }, + { + "epoch": 0.6688401072230674, + "grad_norm": 0.2840725779533386, + "learning_rate": 4.668390804597701e-05, + "loss": 0.1356, + "step": 577 + }, + { + "epoch": 0.6699992755198145, + "grad_norm": 0.3185088634490967, + "learning_rate": 4.667816091954023e-05, + "loss": 0.1499, + "step": 578 + }, + { + "epoch": 0.6711584438165616, + "grad_norm": 0.2769455909729004, + "learning_rate": 4.667241379310345e-05, + "loss": 0.1354, + "step": 579 + }, + { + "epoch": 0.6723176121133086, + "grad_norm": 0.3882596492767334, + "learning_rate": 4.666666666666667e-05, + "loss": 0.1551, + "step": 580 + }, + { + "epoch": 0.6734767804100558, + "grad_norm": 0.29669293761253357, + "learning_rate": 4.6660919540229887e-05, + "loss": 0.1437, + "step": 581 + }, + { + "epoch": 0.6746359487068029, + "grad_norm": 0.24709312617778778, + "learning_rate": 4.665517241379311e-05, + "loss": 0.1349, + "step": 582 + }, + { + "epoch": 0.67579511700355, + "grad_norm": 0.29217076301574707, + "learning_rate": 4.664942528735632e-05, + "loss": 0.1327, + "step": 583 + }, + { + "epoch": 0.676954285300297, + "grad_norm": 0.2767103612422943, + "learning_rate": 4.664367816091954e-05, + "loss": 0.1341, + "step": 584 + }, + { + "epoch": 0.6781134535970441, + "grad_norm": 0.3487448990345001, + "learning_rate": 4.663793103448276e-05, + "loss": 0.1567, + "step": 585 + }, + { + "epoch": 0.6792726218937912, + "grad_norm": 0.30940744280815125, + "learning_rate": 4.663218390804598e-05, + "loss": 0.1486, + "step": 586 + }, + { + "epoch": 0.6804317901905383, + "grad_norm": 0.2427985668182373, + "learning_rate": 4.6626436781609196e-05, + "loss": 0.1298, + "step": 587 + }, + { + "epoch": 0.6815909584872853, + "grad_norm": 0.2587006092071533, + "learning_rate": 4.662068965517242e-05, + "loss": 0.1557, + "step": 588 + }, + { + "epoch": 0.6827501267840325, + "grad_norm": 0.2949361205101013, + "learning_rate": 4.661494252873563e-05, + "loss": 0.1513, + "step": 589 + }, + { + "epoch": 0.6839092950807796, + "grad_norm": 0.2784793972969055, + "learning_rate": 4.6609195402298855e-05, + "loss": 0.1407, + "step": 590 + }, + { + "epoch": 0.6850684633775266, + "grad_norm": 0.28113847970962524, + "learning_rate": 4.660344827586207e-05, + "loss": 0.1346, + "step": 591 + }, + { + "epoch": 0.6862276316742737, + "grad_norm": 0.30229294300079346, + "learning_rate": 4.659770114942529e-05, + "loss": 0.1533, + "step": 592 + }, + { + "epoch": 0.6873867999710208, + "grad_norm": 0.30034735798835754, + "learning_rate": 4.659195402298851e-05, + "loss": 0.1382, + "step": 593 + }, + { + "epoch": 0.6885459682677679, + "grad_norm": 0.23939953744411469, + "learning_rate": 4.658620689655173e-05, + "loss": 0.1248, + "step": 594 + }, + { + "epoch": 0.6897051365645149, + "grad_norm": 0.2364337295293808, + "learning_rate": 4.658045977011494e-05, + "loss": 0.1335, + "step": 595 + }, + { + "epoch": 0.690864304861262, + "grad_norm": 0.2641420364379883, + "learning_rate": 4.6574712643678164e-05, + "loss": 0.1492, + "step": 596 + }, + { + "epoch": 0.6920234731580092, + "grad_norm": 0.29211926460266113, + "learning_rate": 4.656896551724138e-05, + "loss": 0.1397, + "step": 597 + }, + { + "epoch": 0.6931826414547562, + "grad_norm": 0.2939442992210388, + "learning_rate": 4.65632183908046e-05, + "loss": 0.1345, + "step": 598 + }, + { + "epoch": 0.6943418097515033, + "grad_norm": 0.2861159145832062, + "learning_rate": 4.6557471264367816e-05, + "loss": 0.1359, + "step": 599 + }, + { + "epoch": 0.6955009780482504, + "grad_norm": 0.3505733907222748, + "learning_rate": 4.655172413793104e-05, + "loss": 0.1464, + "step": 600 + }, + { + "epoch": 0.6966601463449975, + "grad_norm": 0.26291075348854065, + "learning_rate": 4.654597701149426e-05, + "loss": 0.1354, + "step": 601 + }, + { + "epoch": 0.6978193146417445, + "grad_norm": 0.27308157086372375, + "learning_rate": 4.6540229885057474e-05, + "loss": 0.1374, + "step": 602 + }, + { + "epoch": 0.6989784829384916, + "grad_norm": 0.3093547224998474, + "learning_rate": 4.653448275862069e-05, + "loss": 0.1366, + "step": 603 + }, + { + "epoch": 0.7001376512352387, + "grad_norm": 0.29862943291664124, + "learning_rate": 4.652873563218391e-05, + "loss": 0.1417, + "step": 604 + }, + { + "epoch": 0.7012968195319857, + "grad_norm": 0.35645461082458496, + "learning_rate": 4.6522988505747125e-05, + "loss": 0.1348, + "step": 605 + }, + { + "epoch": 0.7024559878287329, + "grad_norm": 0.3341381847858429, + "learning_rate": 4.651724137931035e-05, + "loss": 0.1426, + "step": 606 + }, + { + "epoch": 0.70361515612548, + "grad_norm": 0.22590871155261993, + "learning_rate": 4.651149425287357e-05, + "loss": 0.1192, + "step": 607 + }, + { + "epoch": 0.7047743244222271, + "grad_norm": 0.22133195400238037, + "learning_rate": 4.6505747126436784e-05, + "loss": 0.1176, + "step": 608 + }, + { + "epoch": 0.7059334927189741, + "grad_norm": 0.2593124806880951, + "learning_rate": 4.6500000000000005e-05, + "loss": 0.1356, + "step": 609 + }, + { + "epoch": 0.7070926610157212, + "grad_norm": 0.28317561745643616, + "learning_rate": 4.649425287356322e-05, + "loss": 0.144, + "step": 610 + }, + { + "epoch": 0.7082518293124683, + "grad_norm": 0.23904190957546234, + "learning_rate": 4.6488505747126435e-05, + "loss": 0.1371, + "step": 611 + }, + { + "epoch": 0.7094109976092153, + "grad_norm": 0.23972614109516144, + "learning_rate": 4.648275862068966e-05, + "loss": 0.1268, + "step": 612 + }, + { + "epoch": 0.7105701659059624, + "grad_norm": 0.27289271354675293, + "learning_rate": 4.647701149425288e-05, + "loss": 0.1257, + "step": 613 + }, + { + "epoch": 0.7117293342027096, + "grad_norm": 0.2559848427772522, + "learning_rate": 4.6471264367816093e-05, + "loss": 0.139, + "step": 614 + }, + { + "epoch": 0.7128885024994567, + "grad_norm": 0.28095269203186035, + "learning_rate": 4.6465517241379315e-05, + "loss": 0.1508, + "step": 615 + }, + { + "epoch": 0.7140476707962037, + "grad_norm": 0.25798696279525757, + "learning_rate": 4.645977011494253e-05, + "loss": 0.1302, + "step": 616 + }, + { + "epoch": 0.7152068390929508, + "grad_norm": 0.28479090332984924, + "learning_rate": 4.645402298850575e-05, + "loss": 0.1462, + "step": 617 + }, + { + "epoch": 0.7163660073896979, + "grad_norm": 0.3032209575176239, + "learning_rate": 4.644827586206897e-05, + "loss": 0.1581, + "step": 618 + }, + { + "epoch": 0.717525175686445, + "grad_norm": 0.26038414239883423, + "learning_rate": 4.644252873563218e-05, + "loss": 0.133, + "step": 619 + }, + { + "epoch": 0.718684343983192, + "grad_norm": 0.2712029218673706, + "learning_rate": 4.643678160919541e-05, + "loss": 0.1456, + "step": 620 + }, + { + "epoch": 0.7198435122799391, + "grad_norm": 0.2711297571659088, + "learning_rate": 4.6431034482758625e-05, + "loss": 0.1485, + "step": 621 + }, + { + "epoch": 0.7210026805766863, + "grad_norm": 0.26714852452278137, + "learning_rate": 4.642528735632184e-05, + "loss": 0.1409, + "step": 622 + }, + { + "epoch": 0.7221618488734333, + "grad_norm": 0.2638694941997528, + "learning_rate": 4.641954022988506e-05, + "loss": 0.1415, + "step": 623 + }, + { + "epoch": 0.7233210171701804, + "grad_norm": 0.24792924523353577, + "learning_rate": 4.6413793103448276e-05, + "loss": 0.144, + "step": 624 + }, + { + "epoch": 0.7244801854669275, + "grad_norm": 0.33576005697250366, + "learning_rate": 4.640804597701149e-05, + "loss": 0.1677, + "step": 625 + }, + { + "epoch": 0.7256393537636746, + "grad_norm": 0.26434826850891113, + "learning_rate": 4.640229885057471e-05, + "loss": 0.1413, + "step": 626 + }, + { + "epoch": 0.7267985220604216, + "grad_norm": 0.22275975346565247, + "learning_rate": 4.6396551724137935e-05, + "loss": 0.1265, + "step": 627 + }, + { + "epoch": 0.7279576903571687, + "grad_norm": 0.25366446375846863, + "learning_rate": 4.6390804597701156e-05, + "loss": 0.1353, + "step": 628 + }, + { + "epoch": 0.7291168586539158, + "grad_norm": 0.24983662366867065, + "learning_rate": 4.638505747126437e-05, + "loss": 0.1338, + "step": 629 + }, + { + "epoch": 0.7302760269506628, + "grad_norm": 0.24769724905490875, + "learning_rate": 4.6379310344827586e-05, + "loss": 0.1242, + "step": 630 + }, + { + "epoch": 0.73143519524741, + "grad_norm": 0.2604047358036041, + "learning_rate": 4.637356321839081e-05, + "loss": 0.126, + "step": 631 + }, + { + "epoch": 0.7325943635441571, + "grad_norm": 0.2878481447696686, + "learning_rate": 4.636781609195402e-05, + "loss": 0.1573, + "step": 632 + }, + { + "epoch": 0.7337535318409042, + "grad_norm": 0.25279513001441956, + "learning_rate": 4.6362068965517244e-05, + "loss": 0.1361, + "step": 633 + }, + { + "epoch": 0.7349127001376512, + "grad_norm": 0.28761938214302063, + "learning_rate": 4.6356321839080466e-05, + "loss": 0.1437, + "step": 634 + }, + { + "epoch": 0.7360718684343983, + "grad_norm": 0.30495256185531616, + "learning_rate": 4.635057471264368e-05, + "loss": 0.1549, + "step": 635 + }, + { + "epoch": 0.7372310367311454, + "grad_norm": 0.2510074973106384, + "learning_rate": 4.63448275862069e-05, + "loss": 0.1286, + "step": 636 + }, + { + "epoch": 0.7383902050278924, + "grad_norm": 0.2822047472000122, + "learning_rate": 4.633908045977012e-05, + "loss": 0.1279, + "step": 637 + }, + { + "epoch": 0.7395493733246395, + "grad_norm": 0.23230963945388794, + "learning_rate": 4.633333333333333e-05, + "loss": 0.1406, + "step": 638 + }, + { + "epoch": 0.7407085416213867, + "grad_norm": 0.2472311556339264, + "learning_rate": 4.6327586206896554e-05, + "loss": 0.1244, + "step": 639 + }, + { + "epoch": 0.7418677099181338, + "grad_norm": 0.2912672460079193, + "learning_rate": 4.632183908045977e-05, + "loss": 0.136, + "step": 640 + }, + { + "epoch": 0.7430268782148808, + "grad_norm": 0.35469523072242737, + "learning_rate": 4.631609195402299e-05, + "loss": 0.1484, + "step": 641 + }, + { + "epoch": 0.7441860465116279, + "grad_norm": 0.26528200507164, + "learning_rate": 4.631034482758621e-05, + "loss": 0.1252, + "step": 642 + }, + { + "epoch": 0.745345214808375, + "grad_norm": 0.2616005539894104, + "learning_rate": 4.630459770114943e-05, + "loss": 0.1222, + "step": 643 + }, + { + "epoch": 0.7465043831051221, + "grad_norm": 0.26047012209892273, + "learning_rate": 4.629885057471264e-05, + "loss": 0.1401, + "step": 644 + }, + { + "epoch": 0.7476635514018691, + "grad_norm": 0.27605798840522766, + "learning_rate": 4.6293103448275864e-05, + "loss": 0.13, + "step": 645 + }, + { + "epoch": 0.7488227196986162, + "grad_norm": 0.30614152550697327, + "learning_rate": 4.628735632183908e-05, + "loss": 0.1486, + "step": 646 + }, + { + "epoch": 0.7499818879953634, + "grad_norm": 0.34578898549079895, + "learning_rate": 4.62816091954023e-05, + "loss": 0.1515, + "step": 647 + }, + { + "epoch": 0.7511410562921104, + "grad_norm": 0.24491111934185028, + "learning_rate": 4.627586206896552e-05, + "loss": 0.1234, + "step": 648 + }, + { + "epoch": 0.7523002245888575, + "grad_norm": 0.2561955153942108, + "learning_rate": 4.627011494252874e-05, + "loss": 0.1255, + "step": 649 + }, + { + "epoch": 0.7534593928856046, + "grad_norm": 0.24703799188137054, + "learning_rate": 4.626436781609196e-05, + "loss": 0.139, + "step": 650 + }, + { + "epoch": 0.7546185611823517, + "grad_norm": 0.26947158575057983, + "learning_rate": 4.6258620689655174e-05, + "loss": 0.1435, + "step": 651 + }, + { + "epoch": 0.7557777294790987, + "grad_norm": 0.2430969476699829, + "learning_rate": 4.625287356321839e-05, + "loss": 0.1316, + "step": 652 + }, + { + "epoch": 0.7569368977758458, + "grad_norm": 0.40103209018707275, + "learning_rate": 4.624712643678161e-05, + "loss": 0.1605, + "step": 653 + }, + { + "epoch": 0.758096066072593, + "grad_norm": 0.25342556834220886, + "learning_rate": 4.624137931034483e-05, + "loss": 0.1357, + "step": 654 + }, + { + "epoch": 0.75925523436934, + "grad_norm": 0.27793052792549133, + "learning_rate": 4.6235632183908054e-05, + "loss": 0.1392, + "step": 655 + }, + { + "epoch": 0.7604144026660871, + "grad_norm": 0.27625927329063416, + "learning_rate": 4.622988505747127e-05, + "loss": 0.1312, + "step": 656 + }, + { + "epoch": 0.7615735709628342, + "grad_norm": 0.2726586163043976, + "learning_rate": 4.622413793103448e-05, + "loss": 0.1439, + "step": 657 + }, + { + "epoch": 0.7627327392595813, + "grad_norm": 0.25028151273727417, + "learning_rate": 4.6218390804597705e-05, + "loss": 0.14, + "step": 658 + }, + { + "epoch": 0.7638919075563283, + "grad_norm": 0.2604377865791321, + "learning_rate": 4.621264367816092e-05, + "loss": 0.1318, + "step": 659 + }, + { + "epoch": 0.7650510758530754, + "grad_norm": 0.35959815979003906, + "learning_rate": 4.6206896551724135e-05, + "loss": 0.149, + "step": 660 + }, + { + "epoch": 0.7662102441498225, + "grad_norm": 0.21311073005199432, + "learning_rate": 4.620114942528736e-05, + "loss": 0.1123, + "step": 661 + }, + { + "epoch": 0.7673694124465695, + "grad_norm": 0.24613121151924133, + "learning_rate": 4.619540229885058e-05, + "loss": 0.1293, + "step": 662 + }, + { + "epoch": 0.7685285807433166, + "grad_norm": 0.27359437942504883, + "learning_rate": 4.618965517241379e-05, + "loss": 0.1359, + "step": 663 + }, + { + "epoch": 0.7696877490400638, + "grad_norm": 0.24835547804832458, + "learning_rate": 4.6183908045977015e-05, + "loss": 0.1335, + "step": 664 + }, + { + "epoch": 0.7708469173368109, + "grad_norm": 0.3183259665966034, + "learning_rate": 4.617816091954023e-05, + "loss": 0.1455, + "step": 665 + }, + { + "epoch": 0.7720060856335579, + "grad_norm": 0.23825454711914062, + "learning_rate": 4.617241379310345e-05, + "loss": 0.1327, + "step": 666 + }, + { + "epoch": 0.773165253930305, + "grad_norm": 0.24269577860832214, + "learning_rate": 4.6166666666666666e-05, + "loss": 0.1321, + "step": 667 + }, + { + "epoch": 0.7743244222270521, + "grad_norm": 0.26813822984695435, + "learning_rate": 4.616091954022989e-05, + "loss": 0.1385, + "step": 668 + }, + { + "epoch": 0.7754835905237992, + "grad_norm": 0.2735162377357483, + "learning_rate": 4.615517241379311e-05, + "loss": 0.1382, + "step": 669 + }, + { + "epoch": 0.7766427588205462, + "grad_norm": 0.26851484179496765, + "learning_rate": 4.6149425287356324e-05, + "loss": 0.1323, + "step": 670 + }, + { + "epoch": 0.7778019271172933, + "grad_norm": 0.3302837312221527, + "learning_rate": 4.614367816091954e-05, + "loss": 0.1644, + "step": 671 + }, + { + "epoch": 0.7789610954140405, + "grad_norm": 0.28512895107269287, + "learning_rate": 4.613793103448276e-05, + "loss": 0.1541, + "step": 672 + }, + { + "epoch": 0.7801202637107875, + "grad_norm": 0.26242977380752563, + "learning_rate": 4.6132183908045976e-05, + "loss": 0.1327, + "step": 673 + }, + { + "epoch": 0.7812794320075346, + "grad_norm": 0.286178320646286, + "learning_rate": 4.61264367816092e-05, + "loss": 0.1301, + "step": 674 + }, + { + "epoch": 0.7824386003042817, + "grad_norm": 0.22549080848693848, + "learning_rate": 4.612068965517242e-05, + "loss": 0.1237, + "step": 675 + }, + { + "epoch": 0.7835977686010288, + "grad_norm": 0.24352504312992096, + "learning_rate": 4.6114942528735634e-05, + "loss": 0.1298, + "step": 676 + }, + { + "epoch": 0.7847569368977758, + "grad_norm": 0.23466962575912476, + "learning_rate": 4.6109195402298856e-05, + "loss": 0.133, + "step": 677 + }, + { + "epoch": 0.7859161051945229, + "grad_norm": 0.2620813250541687, + "learning_rate": 4.610344827586207e-05, + "loss": 0.1311, + "step": 678 + }, + { + "epoch": 0.78707527349127, + "grad_norm": 0.2720955014228821, + "learning_rate": 4.6097701149425286e-05, + "loss": 0.1207, + "step": 679 + }, + { + "epoch": 0.788234441788017, + "grad_norm": 0.2539190649986267, + "learning_rate": 4.609195402298851e-05, + "loss": 0.1287, + "step": 680 + }, + { + "epoch": 0.7893936100847642, + "grad_norm": 0.2616521716117859, + "learning_rate": 4.608620689655173e-05, + "loss": 0.144, + "step": 681 + }, + { + "epoch": 0.7905527783815113, + "grad_norm": 0.2984738349914551, + "learning_rate": 4.6080459770114944e-05, + "loss": 0.1408, + "step": 682 + }, + { + "epoch": 0.7917119466782584, + "grad_norm": 0.25363418459892273, + "learning_rate": 4.6074712643678166e-05, + "loss": 0.121, + "step": 683 + }, + { + "epoch": 0.7928711149750054, + "grad_norm": 0.22894874215126038, + "learning_rate": 4.606896551724138e-05, + "loss": 0.1137, + "step": 684 + }, + { + "epoch": 0.7940302832717525, + "grad_norm": 0.25778335332870483, + "learning_rate": 4.60632183908046e-05, + "loss": 0.1338, + "step": 685 + }, + { + "epoch": 0.7951894515684996, + "grad_norm": 0.2576935887336731, + "learning_rate": 4.605747126436782e-05, + "loss": 0.1416, + "step": 686 + }, + { + "epoch": 0.7963486198652466, + "grad_norm": 0.3121855556964874, + "learning_rate": 4.605172413793103e-05, + "loss": 0.1284, + "step": 687 + }, + { + "epoch": 0.7975077881619937, + "grad_norm": 0.24522997438907623, + "learning_rate": 4.6045977011494254e-05, + "loss": 0.1367, + "step": 688 + }, + { + "epoch": 0.7986669564587409, + "grad_norm": 0.3264451324939728, + "learning_rate": 4.6040229885057475e-05, + "loss": 0.1574, + "step": 689 + }, + { + "epoch": 0.799826124755488, + "grad_norm": 0.29765743017196655, + "learning_rate": 4.603448275862069e-05, + "loss": 0.1336, + "step": 690 + }, + { + "epoch": 0.800985293052235, + "grad_norm": 0.302561491727829, + "learning_rate": 4.602873563218391e-05, + "loss": 0.1449, + "step": 691 + }, + { + "epoch": 0.8021444613489821, + "grad_norm": 0.2951429486274719, + "learning_rate": 4.602298850574713e-05, + "loss": 0.1373, + "step": 692 + }, + { + "epoch": 0.8033036296457292, + "grad_norm": 0.26344770193099976, + "learning_rate": 4.601724137931035e-05, + "loss": 0.1355, + "step": 693 + }, + { + "epoch": 0.8044627979424763, + "grad_norm": 0.23573601245880127, + "learning_rate": 4.6011494252873563e-05, + "loss": 0.1264, + "step": 694 + }, + { + "epoch": 0.8056219662392233, + "grad_norm": 0.2914588153362274, + "learning_rate": 4.6005747126436785e-05, + "loss": 0.1496, + "step": 695 + }, + { + "epoch": 0.8067811345359704, + "grad_norm": 0.23518706858158112, + "learning_rate": 4.600000000000001e-05, + "loss": 0.125, + "step": 696 + }, + { + "epoch": 0.8079403028327176, + "grad_norm": 0.2899051606655121, + "learning_rate": 4.599425287356322e-05, + "loss": 0.1379, + "step": 697 + }, + { + "epoch": 0.8090994711294646, + "grad_norm": 0.25654926896095276, + "learning_rate": 4.598850574712644e-05, + "loss": 0.1469, + "step": 698 + }, + { + "epoch": 0.8102586394262117, + "grad_norm": 0.36468809843063354, + "learning_rate": 4.598275862068966e-05, + "loss": 0.1424, + "step": 699 + }, + { + "epoch": 0.8114178077229588, + "grad_norm": 0.2551107108592987, + "learning_rate": 4.597701149425287e-05, + "loss": 0.1441, + "step": 700 + }, + { + "epoch": 0.8125769760197059, + "grad_norm": 0.21090874075889587, + "learning_rate": 4.597126436781609e-05, + "loss": 0.1253, + "step": 701 + }, + { + "epoch": 0.8137361443164529, + "grad_norm": 0.2588905096054077, + "learning_rate": 4.5965517241379317e-05, + "loss": 0.1327, + "step": 702 + }, + { + "epoch": 0.8148953126132, + "grad_norm": 0.24421721696853638, + "learning_rate": 4.595977011494253e-05, + "loss": 0.1294, + "step": 703 + }, + { + "epoch": 0.8160544809099471, + "grad_norm": 0.25245368480682373, + "learning_rate": 4.595402298850575e-05, + "loss": 0.1247, + "step": 704 + }, + { + "epoch": 0.8172136492066941, + "grad_norm": 0.3936619758605957, + "learning_rate": 4.594827586206897e-05, + "loss": 0.1321, + "step": 705 + }, + { + "epoch": 0.8183728175034413, + "grad_norm": 0.25811803340911865, + "learning_rate": 4.594252873563218e-05, + "loss": 0.1337, + "step": 706 + }, + { + "epoch": 0.8195319858001884, + "grad_norm": 0.23244990408420563, + "learning_rate": 4.5936781609195405e-05, + "loss": 0.1219, + "step": 707 + }, + { + "epoch": 0.8206911540969355, + "grad_norm": 0.2513156235218048, + "learning_rate": 4.593103448275862e-05, + "loss": 0.1422, + "step": 708 + }, + { + "epoch": 0.8218503223936825, + "grad_norm": 0.26262450218200684, + "learning_rate": 4.592528735632184e-05, + "loss": 0.1419, + "step": 709 + }, + { + "epoch": 0.8230094906904296, + "grad_norm": 0.268996924161911, + "learning_rate": 4.591954022988506e-05, + "loss": 0.1427, + "step": 710 + }, + { + "epoch": 0.8241686589871767, + "grad_norm": 0.2628903090953827, + "learning_rate": 4.591379310344828e-05, + "loss": 0.1406, + "step": 711 + }, + { + "epoch": 0.8253278272839237, + "grad_norm": 0.24836388230323792, + "learning_rate": 4.59080459770115e-05, + "loss": 0.1344, + "step": 712 + }, + { + "epoch": 0.8264869955806708, + "grad_norm": 0.29568374156951904, + "learning_rate": 4.5902298850574714e-05, + "loss": 0.1353, + "step": 713 + }, + { + "epoch": 0.827646163877418, + "grad_norm": 0.30942583084106445, + "learning_rate": 4.589655172413793e-05, + "loss": 0.1381, + "step": 714 + }, + { + "epoch": 0.8288053321741651, + "grad_norm": 0.3070472478866577, + "learning_rate": 4.589080459770115e-05, + "loss": 0.142, + "step": 715 + }, + { + "epoch": 0.8299645004709121, + "grad_norm": 0.27771466970443726, + "learning_rate": 4.588505747126437e-05, + "loss": 0.1397, + "step": 716 + }, + { + "epoch": 0.8311236687676592, + "grad_norm": 0.2712878882884979, + "learning_rate": 4.587931034482759e-05, + "loss": 0.1252, + "step": 717 + }, + { + "epoch": 0.8322828370644063, + "grad_norm": 0.24696029722690582, + "learning_rate": 4.587356321839081e-05, + "loss": 0.1327, + "step": 718 + }, + { + "epoch": 0.8334420053611534, + "grad_norm": 0.26035380363464355, + "learning_rate": 4.5867816091954024e-05, + "loss": 0.1276, + "step": 719 + }, + { + "epoch": 0.8346011736579004, + "grad_norm": 0.23014302551746368, + "learning_rate": 4.586206896551724e-05, + "loss": 0.1294, + "step": 720 + }, + { + "epoch": 0.8357603419546475, + "grad_norm": 0.22767721116542816, + "learning_rate": 4.585632183908046e-05, + "loss": 0.1204, + "step": 721 + }, + { + "epoch": 0.8369195102513947, + "grad_norm": 0.24614818394184113, + "learning_rate": 4.585057471264368e-05, + "loss": 0.1412, + "step": 722 + }, + { + "epoch": 0.8380786785481417, + "grad_norm": 0.2815050184726715, + "learning_rate": 4.5844827586206904e-05, + "loss": 0.142, + "step": 723 + }, + { + "epoch": 0.8392378468448888, + "grad_norm": 0.24825121462345123, + "learning_rate": 4.583908045977012e-05, + "loss": 0.1477, + "step": 724 + }, + { + "epoch": 0.8403970151416359, + "grad_norm": 0.2300599366426468, + "learning_rate": 4.5833333333333334e-05, + "loss": 0.1413, + "step": 725 + }, + { + "epoch": 0.841556183438383, + "grad_norm": 0.27150389552116394, + "learning_rate": 4.5827586206896556e-05, + "loss": 0.1553, + "step": 726 + }, + { + "epoch": 0.84271535173513, + "grad_norm": 0.33325132727622986, + "learning_rate": 4.582183908045977e-05, + "loss": 0.1699, + "step": 727 + }, + { + "epoch": 0.8438745200318771, + "grad_norm": 0.21544939279556274, + "learning_rate": 4.5816091954022985e-05, + "loss": 0.1279, + "step": 728 + }, + { + "epoch": 0.8450336883286242, + "grad_norm": 0.23038536310195923, + "learning_rate": 4.581034482758621e-05, + "loss": 0.1227, + "step": 729 + }, + { + "epoch": 0.8461928566253712, + "grad_norm": 0.25293296575546265, + "learning_rate": 4.580459770114943e-05, + "loss": 0.1497, + "step": 730 + }, + { + "epoch": 0.8473520249221184, + "grad_norm": 0.2448996752500534, + "learning_rate": 4.579885057471265e-05, + "loss": 0.1475, + "step": 731 + }, + { + "epoch": 0.8485111932188655, + "grad_norm": 0.2752504050731659, + "learning_rate": 4.5793103448275865e-05, + "loss": 0.1453, + "step": 732 + }, + { + "epoch": 0.8496703615156126, + "grad_norm": 0.2612292170524597, + "learning_rate": 4.578735632183908e-05, + "loss": 0.1306, + "step": 733 + }, + { + "epoch": 0.8508295298123596, + "grad_norm": 0.2557094395160675, + "learning_rate": 4.57816091954023e-05, + "loss": 0.1288, + "step": 734 + }, + { + "epoch": 0.8519886981091067, + "grad_norm": 0.24499356746673584, + "learning_rate": 4.577586206896552e-05, + "loss": 0.1391, + "step": 735 + }, + { + "epoch": 0.8531478664058538, + "grad_norm": 0.23510660231113434, + "learning_rate": 4.577011494252874e-05, + "loss": 0.1277, + "step": 736 + }, + { + "epoch": 0.8543070347026008, + "grad_norm": 0.25060582160949707, + "learning_rate": 4.576436781609196e-05, + "loss": 0.1301, + "step": 737 + }, + { + "epoch": 0.855466202999348, + "grad_norm": 0.2850019931793213, + "learning_rate": 4.5758620689655175e-05, + "loss": 0.1406, + "step": 738 + }, + { + "epoch": 0.8566253712960951, + "grad_norm": 0.30304649472236633, + "learning_rate": 4.575287356321839e-05, + "loss": 0.1455, + "step": 739 + }, + { + "epoch": 0.8577845395928422, + "grad_norm": 0.27735939621925354, + "learning_rate": 4.574712643678161e-05, + "loss": 0.133, + "step": 740 + }, + { + "epoch": 0.8589437078895892, + "grad_norm": 0.258037805557251, + "learning_rate": 4.5741379310344826e-05, + "loss": 0.1359, + "step": 741 + }, + { + "epoch": 0.8601028761863363, + "grad_norm": 0.2623947858810425, + "learning_rate": 4.573563218390805e-05, + "loss": 0.141, + "step": 742 + }, + { + "epoch": 0.8612620444830834, + "grad_norm": 0.2677944004535675, + "learning_rate": 4.572988505747127e-05, + "loss": 0.1398, + "step": 743 + }, + { + "epoch": 0.8624212127798305, + "grad_norm": 0.28370678424835205, + "learning_rate": 4.5724137931034485e-05, + "loss": 0.1551, + "step": 744 + }, + { + "epoch": 0.8635803810765775, + "grad_norm": 0.259971022605896, + "learning_rate": 4.5718390804597706e-05, + "loss": 0.1488, + "step": 745 + }, + { + "epoch": 0.8647395493733246, + "grad_norm": 0.22784096002578735, + "learning_rate": 4.571264367816092e-05, + "loss": 0.123, + "step": 746 + }, + { + "epoch": 0.8658987176700718, + "grad_norm": 0.25658029317855835, + "learning_rate": 4.5706896551724136e-05, + "loss": 0.1364, + "step": 747 + }, + { + "epoch": 0.8670578859668188, + "grad_norm": 0.24773739278316498, + "learning_rate": 4.570114942528736e-05, + "loss": 0.128, + "step": 748 + }, + { + "epoch": 0.8682170542635659, + "grad_norm": 0.2921466827392578, + "learning_rate": 4.569540229885057e-05, + "loss": 0.1432, + "step": 749 + }, + { + "epoch": 0.869376222560313, + "grad_norm": 0.26510924100875854, + "learning_rate": 4.5689655172413794e-05, + "loss": 0.1334, + "step": 750 + }, + { + "epoch": 0.8705353908570601, + "grad_norm": 0.2811342179775238, + "learning_rate": 4.5683908045977016e-05, + "loss": 0.1446, + "step": 751 + }, + { + "epoch": 0.8716945591538071, + "grad_norm": 0.24528606235980988, + "learning_rate": 4.567816091954023e-05, + "loss": 0.1254, + "step": 752 + }, + { + "epoch": 0.8728537274505542, + "grad_norm": 0.31388193368911743, + "learning_rate": 4.567241379310345e-05, + "loss": 0.148, + "step": 753 + }, + { + "epoch": 0.8740128957473013, + "grad_norm": 0.26188236474990845, + "learning_rate": 4.566666666666667e-05, + "loss": 0.1347, + "step": 754 + }, + { + "epoch": 0.8751720640440483, + "grad_norm": 0.2681477665901184, + "learning_rate": 4.566091954022988e-05, + "loss": 0.1251, + "step": 755 + }, + { + "epoch": 0.8763312323407955, + "grad_norm": 0.24694494903087616, + "learning_rate": 4.5655172413793104e-05, + "loss": 0.1341, + "step": 756 + }, + { + "epoch": 0.8774904006375426, + "grad_norm": 0.27526146173477173, + "learning_rate": 4.5649425287356326e-05, + "loss": 0.1425, + "step": 757 + }, + { + "epoch": 0.8786495689342897, + "grad_norm": 0.28800222277641296, + "learning_rate": 4.564367816091955e-05, + "loss": 0.1396, + "step": 758 + }, + { + "epoch": 0.8798087372310367, + "grad_norm": 0.29053691029548645, + "learning_rate": 4.563793103448276e-05, + "loss": 0.1427, + "step": 759 + }, + { + "epoch": 0.8809679055277838, + "grad_norm": 0.29743799567222595, + "learning_rate": 4.563218390804598e-05, + "loss": 0.1513, + "step": 760 + }, + { + "epoch": 0.8821270738245309, + "grad_norm": 0.27560529112815857, + "learning_rate": 4.56264367816092e-05, + "loss": 0.1356, + "step": 761 + }, + { + "epoch": 0.8832862421212779, + "grad_norm": 0.23586055636405945, + "learning_rate": 4.5620689655172414e-05, + "loss": 0.132, + "step": 762 + }, + { + "epoch": 0.884445410418025, + "grad_norm": 0.26681259274482727, + "learning_rate": 4.5614942528735636e-05, + "loss": 0.1584, + "step": 763 + }, + { + "epoch": 0.8856045787147722, + "grad_norm": 0.3042534589767456, + "learning_rate": 4.560919540229886e-05, + "loss": 0.149, + "step": 764 + }, + { + "epoch": 0.8867637470115193, + "grad_norm": 0.24351336061954498, + "learning_rate": 4.560344827586207e-05, + "loss": 0.1311, + "step": 765 + }, + { + "epoch": 0.8879229153082663, + "grad_norm": 0.2620246708393097, + "learning_rate": 4.559770114942529e-05, + "loss": 0.1448, + "step": 766 + }, + { + "epoch": 0.8890820836050134, + "grad_norm": 0.2437165081501007, + "learning_rate": 4.559195402298851e-05, + "loss": 0.1435, + "step": 767 + }, + { + "epoch": 0.8902412519017605, + "grad_norm": 0.231397345662117, + "learning_rate": 4.5586206896551724e-05, + "loss": 0.1378, + "step": 768 + }, + { + "epoch": 0.8914004201985076, + "grad_norm": 0.22732099890708923, + "learning_rate": 4.5580459770114945e-05, + "loss": 0.1399, + "step": 769 + }, + { + "epoch": 0.8925595884952546, + "grad_norm": 0.2291109710931778, + "learning_rate": 4.557471264367816e-05, + "loss": 0.1207, + "step": 770 + }, + { + "epoch": 0.8937187567920017, + "grad_norm": 0.23525553941726685, + "learning_rate": 4.556896551724138e-05, + "loss": 0.1269, + "step": 771 + }, + { + "epoch": 0.8948779250887489, + "grad_norm": 0.2587391436100006, + "learning_rate": 4.5563218390804604e-05, + "loss": 0.1208, + "step": 772 + }, + { + "epoch": 0.8960370933854959, + "grad_norm": 0.24662849307060242, + "learning_rate": 4.555747126436782e-05, + "loss": 0.1413, + "step": 773 + }, + { + "epoch": 0.897196261682243, + "grad_norm": 0.2599044144153595, + "learning_rate": 4.5551724137931033e-05, + "loss": 0.1322, + "step": 774 + }, + { + "epoch": 0.8983554299789901, + "grad_norm": 0.2424292415380478, + "learning_rate": 4.5545977011494255e-05, + "loss": 0.1359, + "step": 775 + }, + { + "epoch": 0.8995145982757372, + "grad_norm": 0.25603169202804565, + "learning_rate": 4.554022988505747e-05, + "loss": 0.1324, + "step": 776 + }, + { + "epoch": 0.9006737665724842, + "grad_norm": 0.2570304870605469, + "learning_rate": 4.553448275862069e-05, + "loss": 0.122, + "step": 777 + }, + { + "epoch": 0.9018329348692313, + "grad_norm": 0.2565818130970001, + "learning_rate": 4.552873563218391e-05, + "loss": 0.1246, + "step": 778 + }, + { + "epoch": 0.9029921031659784, + "grad_norm": 0.2947520613670349, + "learning_rate": 4.552298850574713e-05, + "loss": 0.1376, + "step": 779 + }, + { + "epoch": 0.9041512714627254, + "grad_norm": 0.31323108077049255, + "learning_rate": 4.551724137931035e-05, + "loss": 0.148, + "step": 780 + }, + { + "epoch": 0.9053104397594726, + "grad_norm": 0.3410814702510834, + "learning_rate": 4.5511494252873565e-05, + "loss": 0.1493, + "step": 781 + }, + { + "epoch": 0.9064696080562197, + "grad_norm": 0.2659667134284973, + "learning_rate": 4.550574712643678e-05, + "loss": 0.1303, + "step": 782 + }, + { + "epoch": 0.9076287763529668, + "grad_norm": 0.2651742696762085, + "learning_rate": 4.55e-05, + "loss": 0.1327, + "step": 783 + }, + { + "epoch": 0.9087879446497138, + "grad_norm": 0.24942578375339508, + "learning_rate": 4.549425287356322e-05, + "loss": 0.1342, + "step": 784 + }, + { + "epoch": 0.9099471129464609, + "grad_norm": 0.2682301104068756, + "learning_rate": 4.548850574712644e-05, + "loss": 0.1449, + "step": 785 + }, + { + "epoch": 0.911106281243208, + "grad_norm": 0.2612859606742859, + "learning_rate": 4.548275862068966e-05, + "loss": 0.1371, + "step": 786 + }, + { + "epoch": 0.912265449539955, + "grad_norm": 0.30771300196647644, + "learning_rate": 4.5477011494252875e-05, + "loss": 0.1446, + "step": 787 + }, + { + "epoch": 0.9134246178367021, + "grad_norm": 0.33343127369880676, + "learning_rate": 4.5471264367816096e-05, + "loss": 0.1375, + "step": 788 + }, + { + "epoch": 0.9145837861334493, + "grad_norm": 0.2678926885128021, + "learning_rate": 4.546551724137931e-05, + "loss": 0.1347, + "step": 789 + }, + { + "epoch": 0.9157429544301964, + "grad_norm": 0.2823614478111267, + "learning_rate": 4.5459770114942526e-05, + "loss": 0.1461, + "step": 790 + }, + { + "epoch": 0.9169021227269434, + "grad_norm": 0.27924710512161255, + "learning_rate": 4.545402298850575e-05, + "loss": 0.1343, + "step": 791 + }, + { + "epoch": 0.9180612910236905, + "grad_norm": 0.27781474590301514, + "learning_rate": 4.544827586206897e-05, + "loss": 0.1433, + "step": 792 + }, + { + "epoch": 0.9192204593204376, + "grad_norm": 0.23757725954055786, + "learning_rate": 4.5442528735632184e-05, + "loss": 0.1214, + "step": 793 + }, + { + "epoch": 0.9203796276171846, + "grad_norm": 0.305899053812027, + "learning_rate": 4.5436781609195406e-05, + "loss": 0.136, + "step": 794 + }, + { + "epoch": 0.9215387959139317, + "grad_norm": 0.29127955436706543, + "learning_rate": 4.543103448275862e-05, + "loss": 0.1551, + "step": 795 + }, + { + "epoch": 0.9226979642106788, + "grad_norm": 0.29604771733283997, + "learning_rate": 4.542528735632184e-05, + "loss": 0.1378, + "step": 796 + }, + { + "epoch": 0.923857132507426, + "grad_norm": 0.23887404799461365, + "learning_rate": 4.541954022988506e-05, + "loss": 0.1268, + "step": 797 + }, + { + "epoch": 0.925016300804173, + "grad_norm": 0.2829357981681824, + "learning_rate": 4.541379310344828e-05, + "loss": 0.1432, + "step": 798 + }, + { + "epoch": 0.9261754691009201, + "grad_norm": 0.2251252681016922, + "learning_rate": 4.54080459770115e-05, + "loss": 0.1298, + "step": 799 + }, + { + "epoch": 0.9273346373976672, + "grad_norm": 0.26304125785827637, + "learning_rate": 4.5402298850574716e-05, + "loss": 0.1376, + "step": 800 + }, + { + "epoch": 0.9284938056944143, + "grad_norm": 0.22513070702552795, + "learning_rate": 4.539655172413793e-05, + "loss": 0.127, + "step": 801 + }, + { + "epoch": 0.9296529739911613, + "grad_norm": 0.25344815850257874, + "learning_rate": 4.539080459770115e-05, + "loss": 0.1395, + "step": 802 + }, + { + "epoch": 0.9308121422879084, + "grad_norm": 0.23576590418815613, + "learning_rate": 4.538505747126437e-05, + "loss": 0.1269, + "step": 803 + }, + { + "epoch": 0.9319713105846555, + "grad_norm": 0.24553732573986053, + "learning_rate": 4.537931034482759e-05, + "loss": 0.1408, + "step": 804 + }, + { + "epoch": 0.9331304788814025, + "grad_norm": 0.24802154302597046, + "learning_rate": 4.537356321839081e-05, + "loss": 0.1307, + "step": 805 + }, + { + "epoch": 0.9342896471781497, + "grad_norm": 0.255938321352005, + "learning_rate": 4.5367816091954025e-05, + "loss": 0.1446, + "step": 806 + }, + { + "epoch": 0.9354488154748968, + "grad_norm": 0.23458512127399445, + "learning_rate": 4.536206896551725e-05, + "loss": 0.1371, + "step": 807 + }, + { + "epoch": 0.9366079837716439, + "grad_norm": 0.2571949064731598, + "learning_rate": 4.535632183908046e-05, + "loss": 0.1319, + "step": 808 + }, + { + "epoch": 0.9377671520683909, + "grad_norm": 0.2464578002691269, + "learning_rate": 4.535057471264368e-05, + "loss": 0.1425, + "step": 809 + }, + { + "epoch": 0.938926320365138, + "grad_norm": 0.23102332651615143, + "learning_rate": 4.53448275862069e-05, + "loss": 0.1351, + "step": 810 + }, + { + "epoch": 0.9400854886618851, + "grad_norm": 0.27257415652275085, + "learning_rate": 4.5339080459770114e-05, + "loss": 0.1359, + "step": 811 + }, + { + "epoch": 0.9412446569586321, + "grad_norm": 0.24599824845790863, + "learning_rate": 4.5333333333333335e-05, + "loss": 0.1356, + "step": 812 + }, + { + "epoch": 0.9424038252553792, + "grad_norm": 0.23127569258213043, + "learning_rate": 4.532758620689656e-05, + "loss": 0.1412, + "step": 813 + }, + { + "epoch": 0.9435629935521264, + "grad_norm": 0.2365388125181198, + "learning_rate": 4.532183908045977e-05, + "loss": 0.1118, + "step": 814 + }, + { + "epoch": 0.9447221618488735, + "grad_norm": 0.2809307873249054, + "learning_rate": 4.5316091954022993e-05, + "loss": 0.1363, + "step": 815 + }, + { + "epoch": 0.9458813301456205, + "grad_norm": 0.258364737033844, + "learning_rate": 4.531034482758621e-05, + "loss": 0.1378, + "step": 816 + }, + { + "epoch": 0.9470404984423676, + "grad_norm": 0.27401670813560486, + "learning_rate": 4.530459770114942e-05, + "loss": 0.1465, + "step": 817 + }, + { + "epoch": 0.9481996667391147, + "grad_norm": 0.22026842832565308, + "learning_rate": 4.5298850574712645e-05, + "loss": 0.1283, + "step": 818 + }, + { + "epoch": 0.9493588350358617, + "grad_norm": 0.2709653675556183, + "learning_rate": 4.529310344827587e-05, + "loss": 0.1284, + "step": 819 + }, + { + "epoch": 0.9505180033326088, + "grad_norm": 0.2417244166135788, + "learning_rate": 4.528735632183908e-05, + "loss": 0.1363, + "step": 820 + }, + { + "epoch": 0.951677171629356, + "grad_norm": 0.2279566526412964, + "learning_rate": 4.52816091954023e-05, + "loss": 0.1285, + "step": 821 + }, + { + "epoch": 0.9528363399261031, + "grad_norm": 0.32052162289619446, + "learning_rate": 4.527586206896552e-05, + "loss": 0.1433, + "step": 822 + }, + { + "epoch": 0.9539955082228501, + "grad_norm": 0.3256911635398865, + "learning_rate": 4.527011494252873e-05, + "loss": 0.145, + "step": 823 + }, + { + "epoch": 0.9551546765195972, + "grad_norm": 0.30633530020713806, + "learning_rate": 4.5264367816091955e-05, + "loss": 0.1458, + "step": 824 + }, + { + "epoch": 0.9563138448163443, + "grad_norm": 0.24086791276931763, + "learning_rate": 4.5258620689655176e-05, + "loss": 0.1213, + "step": 825 + }, + { + "epoch": 0.9574730131130914, + "grad_norm": 0.3224896788597107, + "learning_rate": 4.52528735632184e-05, + "loss": 0.1543, + "step": 826 + }, + { + "epoch": 0.9586321814098384, + "grad_norm": 0.24555163085460663, + "learning_rate": 4.524712643678161e-05, + "loss": 0.1295, + "step": 827 + }, + { + "epoch": 0.9597913497065855, + "grad_norm": 0.2528969645500183, + "learning_rate": 4.524137931034483e-05, + "loss": 0.1389, + "step": 828 + }, + { + "epoch": 0.9609505180033326, + "grad_norm": 0.2498130351305008, + "learning_rate": 4.523563218390805e-05, + "loss": 0.1529, + "step": 829 + }, + { + "epoch": 0.9621096863000796, + "grad_norm": 0.26962774991989136, + "learning_rate": 4.5229885057471264e-05, + "loss": 0.1346, + "step": 830 + }, + { + "epoch": 0.9632688545968268, + "grad_norm": 0.2615809142589569, + "learning_rate": 4.522413793103448e-05, + "loss": 0.1549, + "step": 831 + }, + { + "epoch": 0.9644280228935739, + "grad_norm": 0.25109943747520447, + "learning_rate": 4.521839080459771e-05, + "loss": 0.1402, + "step": 832 + }, + { + "epoch": 0.965587191190321, + "grad_norm": 0.2414674311876297, + "learning_rate": 4.521264367816092e-05, + "loss": 0.1329, + "step": 833 + }, + { + "epoch": 0.966746359487068, + "grad_norm": 0.2999972701072693, + "learning_rate": 4.5206896551724144e-05, + "loss": 0.1524, + "step": 834 + }, + { + "epoch": 0.9679055277838151, + "grad_norm": 0.22573649883270264, + "learning_rate": 4.520114942528736e-05, + "loss": 0.1248, + "step": 835 + }, + { + "epoch": 0.9690646960805622, + "grad_norm": 0.2691646218299866, + "learning_rate": 4.5195402298850574e-05, + "loss": 0.1259, + "step": 836 + }, + { + "epoch": 0.9702238643773092, + "grad_norm": 0.2566002607345581, + "learning_rate": 4.5189655172413796e-05, + "loss": 0.1338, + "step": 837 + }, + { + "epoch": 0.9713830326740563, + "grad_norm": 0.30500656366348267, + "learning_rate": 4.518390804597701e-05, + "loss": 0.1393, + "step": 838 + }, + { + "epoch": 0.9725422009708035, + "grad_norm": 0.2786789834499359, + "learning_rate": 4.517816091954023e-05, + "loss": 0.1418, + "step": 839 + }, + { + "epoch": 0.9737013692675506, + "grad_norm": 0.2511579096317291, + "learning_rate": 4.5172413793103454e-05, + "loss": 0.1261, + "step": 840 + }, + { + "epoch": 0.9748605375642976, + "grad_norm": 0.2628697454929352, + "learning_rate": 4.516666666666667e-05, + "loss": 0.1255, + "step": 841 + }, + { + "epoch": 0.9760197058610447, + "grad_norm": 0.2691211700439453, + "learning_rate": 4.5160919540229884e-05, + "loss": 0.1469, + "step": 842 + }, + { + "epoch": 0.9771788741577918, + "grad_norm": 0.4240913689136505, + "learning_rate": 4.5155172413793106e-05, + "loss": 0.1345, + "step": 843 + }, + { + "epoch": 0.9783380424545388, + "grad_norm": 0.26439380645751953, + "learning_rate": 4.514942528735632e-05, + "loss": 0.1389, + "step": 844 + }, + { + "epoch": 0.9794972107512859, + "grad_norm": 0.2187066376209259, + "learning_rate": 4.514367816091954e-05, + "loss": 0.1191, + "step": 845 + }, + { + "epoch": 0.980656379048033, + "grad_norm": 0.2630019187927246, + "learning_rate": 4.5137931034482764e-05, + "loss": 0.1255, + "step": 846 + }, + { + "epoch": 0.9818155473447802, + "grad_norm": 0.2619337737560272, + "learning_rate": 4.513218390804598e-05, + "loss": 0.146, + "step": 847 + }, + { + "epoch": 0.9829747156415272, + "grad_norm": 0.24650199711322784, + "learning_rate": 4.51264367816092e-05, + "loss": 0.1246, + "step": 848 + }, + { + "epoch": 0.9841338839382743, + "grad_norm": 0.33000877499580383, + "learning_rate": 4.5120689655172415e-05, + "loss": 0.1448, + "step": 849 + }, + { + "epoch": 0.9852930522350214, + "grad_norm": 0.39497750997543335, + "learning_rate": 4.511494252873563e-05, + "loss": 0.1332, + "step": 850 + }, + { + "epoch": 0.9864522205317685, + "grad_norm": 0.27251842617988586, + "learning_rate": 4.510919540229885e-05, + "loss": 0.127, + "step": 851 + }, + { + "epoch": 0.9876113888285155, + "grad_norm": 0.4220561981201172, + "learning_rate": 4.510344827586207e-05, + "loss": 0.1353, + "step": 852 + }, + { + "epoch": 0.9887705571252626, + "grad_norm": 0.24371755123138428, + "learning_rate": 4.5097701149425295e-05, + "loss": 0.1436, + "step": 853 + }, + { + "epoch": 0.9899297254220097, + "grad_norm": 0.2443200647830963, + "learning_rate": 4.509195402298851e-05, + "loss": 0.1405, + "step": 854 + }, + { + "epoch": 0.9910888937187567, + "grad_norm": 0.22042927145957947, + "learning_rate": 4.5086206896551725e-05, + "loss": 0.1294, + "step": 855 + }, + { + "epoch": 0.9922480620155039, + "grad_norm": 0.26495447754859924, + "learning_rate": 4.508045977011495e-05, + "loss": 0.1442, + "step": 856 + }, + { + "epoch": 0.993407230312251, + "grad_norm": 0.28576844930648804, + "learning_rate": 4.507471264367816e-05, + "loss": 0.1422, + "step": 857 + }, + { + "epoch": 0.9945663986089981, + "grad_norm": 0.24880261719226837, + "learning_rate": 4.5068965517241377e-05, + "loss": 0.1404, + "step": 858 + }, + { + "epoch": 0.9957255669057451, + "grad_norm": 0.2431262731552124, + "learning_rate": 4.50632183908046e-05, + "loss": 0.1376, + "step": 859 + }, + { + "epoch": 0.9968847352024922, + "grad_norm": 0.2703196704387665, + "learning_rate": 4.505747126436782e-05, + "loss": 0.1407, + "step": 860 + }, + { + "epoch": 0.9980439034992393, + "grad_norm": 0.3191419839859009, + "learning_rate": 4.5051724137931035e-05, + "loss": 0.1558, + "step": 861 + }, + { + "epoch": 0.9992030717959863, + "grad_norm": 0.25417354702949524, + "learning_rate": 4.5045977011494257e-05, + "loss": 0.1386, + "step": 862 + }, + { + "epoch": 0.9992030717959863, + "eval_loss": 0.13573063910007477, + "eval_runtime": 265.6894, + "eval_samples_per_second": 5.774, + "eval_steps_per_second": 5.774, + "step": 862 + }, + { + "epoch": 1.0003622400927334, + "grad_norm": 0.21144159138202667, + "learning_rate": 4.504022988505747e-05, + "loss": 0.119, + "step": 863 + }, + { + "epoch": 1.0015214083894806, + "grad_norm": 0.27048224210739136, + "learning_rate": 4.503448275862069e-05, + "loss": 0.1282, + "step": 864 + }, + { + "epoch": 1.0026805766862277, + "grad_norm": 0.2059887945652008, + "learning_rate": 4.502873563218391e-05, + "loss": 0.1389, + "step": 865 + }, + { + "epoch": 1.0038397449829748, + "grad_norm": 0.25673580169677734, + "learning_rate": 4.502298850574713e-05, + "loss": 0.134, + "step": 866 + }, + { + "epoch": 1.004998913279722, + "grad_norm": 0.2381593883037567, + "learning_rate": 4.501724137931035e-05, + "loss": 0.1271, + "step": 867 + }, + { + "epoch": 1.0061580815764688, + "grad_norm": 0.2609866261482239, + "learning_rate": 4.5011494252873566e-05, + "loss": 0.1349, + "step": 868 + }, + { + "epoch": 1.007317249873216, + "grad_norm": 0.24942117929458618, + "learning_rate": 4.500574712643678e-05, + "loss": 0.1251, + "step": 869 + }, + { + "epoch": 1.008476418169963, + "grad_norm": 0.2552180886268616, + "learning_rate": 4.5e-05, + "loss": 0.1333, + "step": 870 + }, + { + "epoch": 1.0096355864667101, + "grad_norm": 0.2939591407775879, + "learning_rate": 4.499425287356322e-05, + "loss": 0.1433, + "step": 871 + }, + { + "epoch": 1.0107947547634573, + "grad_norm": 0.2345464825630188, + "learning_rate": 4.498850574712644e-05, + "loss": 0.1273, + "step": 872 + }, + { + "epoch": 1.0119539230602044, + "grad_norm": 0.3161166310310364, + "learning_rate": 4.498275862068966e-05, + "loss": 0.1301, + "step": 873 + }, + { + "epoch": 1.0131130913569515, + "grad_norm": 0.2409006655216217, + "learning_rate": 4.4977011494252876e-05, + "loss": 0.1145, + "step": 874 + }, + { + "epoch": 1.0142722596536984, + "grad_norm": 0.26523053646087646, + "learning_rate": 4.49712643678161e-05, + "loss": 0.1386, + "step": 875 + }, + { + "epoch": 1.0154314279504455, + "grad_norm": 0.2150416225194931, + "learning_rate": 4.496551724137931e-05, + "loss": 0.1285, + "step": 876 + }, + { + "epoch": 1.0165905962471926, + "grad_norm": 0.23828035593032837, + "learning_rate": 4.495977011494253e-05, + "loss": 0.1161, + "step": 877 + }, + { + "epoch": 1.0177497645439397, + "grad_norm": 0.319975882768631, + "learning_rate": 4.495402298850575e-05, + "loss": 0.1444, + "step": 878 + }, + { + "epoch": 1.0189089328406868, + "grad_norm": 0.22034992277622223, + "learning_rate": 4.4948275862068964e-05, + "loss": 0.1195, + "step": 879 + }, + { + "epoch": 1.020068101137434, + "grad_norm": 0.2223641574382782, + "learning_rate": 4.4942528735632186e-05, + "loss": 0.1181, + "step": 880 + }, + { + "epoch": 1.021227269434181, + "grad_norm": 0.21697430312633514, + "learning_rate": 4.493678160919541e-05, + "loss": 0.1131, + "step": 881 + }, + { + "epoch": 1.022386437730928, + "grad_norm": 0.24257703125476837, + "learning_rate": 4.493103448275862e-05, + "loss": 0.123, + "step": 882 + }, + { + "epoch": 1.023545606027675, + "grad_norm": 0.2622239291667938, + "learning_rate": 4.4925287356321844e-05, + "loss": 0.1325, + "step": 883 + }, + { + "epoch": 1.0247047743244222, + "grad_norm": 0.2764127254486084, + "learning_rate": 4.491954022988506e-05, + "loss": 0.1228, + "step": 884 + }, + { + "epoch": 1.0258639426211693, + "grad_norm": 0.25700658559799194, + "learning_rate": 4.4913793103448274e-05, + "loss": 0.1212, + "step": 885 + }, + { + "epoch": 1.0270231109179164, + "grad_norm": 0.27795660495758057, + "learning_rate": 4.4908045977011495e-05, + "loss": 0.1328, + "step": 886 + }, + { + "epoch": 1.0281822792146635, + "grad_norm": 0.28292378783226013, + "learning_rate": 4.490229885057472e-05, + "loss": 0.1197, + "step": 887 + }, + { + "epoch": 1.0293414475114107, + "grad_norm": 0.27904993295669556, + "learning_rate": 4.489655172413793e-05, + "loss": 0.1208, + "step": 888 + }, + { + "epoch": 1.0305006158081575, + "grad_norm": 0.23822936415672302, + "learning_rate": 4.4890804597701154e-05, + "loss": 0.1187, + "step": 889 + }, + { + "epoch": 1.0316597841049047, + "grad_norm": 0.23615434765815735, + "learning_rate": 4.488505747126437e-05, + "loss": 0.119, + "step": 890 + }, + { + "epoch": 1.0328189524016518, + "grad_norm": 0.2193615883588791, + "learning_rate": 4.487931034482759e-05, + "loss": 0.1251, + "step": 891 + }, + { + "epoch": 1.033978120698399, + "grad_norm": 0.2385019063949585, + "learning_rate": 4.4873563218390805e-05, + "loss": 0.1261, + "step": 892 + }, + { + "epoch": 1.035137288995146, + "grad_norm": 0.262351393699646, + "learning_rate": 4.486781609195403e-05, + "loss": 0.1285, + "step": 893 + }, + { + "epoch": 1.0362964572918931, + "grad_norm": 0.2790246307849884, + "learning_rate": 4.486206896551725e-05, + "loss": 0.13, + "step": 894 + }, + { + "epoch": 1.0374556255886402, + "grad_norm": 0.28902125358581543, + "learning_rate": 4.4856321839080463e-05, + "loss": 0.1266, + "step": 895 + }, + { + "epoch": 1.0386147938853871, + "grad_norm": 0.2775196433067322, + "learning_rate": 4.485057471264368e-05, + "loss": 0.1453, + "step": 896 + }, + { + "epoch": 1.0397739621821342, + "grad_norm": 0.3169780671596527, + "learning_rate": 4.48448275862069e-05, + "loss": 0.1372, + "step": 897 + }, + { + "epoch": 1.0409331304788814, + "grad_norm": 0.24293041229248047, + "learning_rate": 4.4839080459770115e-05, + "loss": 0.1267, + "step": 898 + }, + { + "epoch": 1.0420922987756285, + "grad_norm": 0.3456263244152069, + "learning_rate": 4.483333333333333e-05, + "loss": 0.1566, + "step": 899 + }, + { + "epoch": 1.0432514670723756, + "grad_norm": 0.2542770504951477, + "learning_rate": 4.482758620689655e-05, + "loss": 0.1292, + "step": 900 + }, + { + "epoch": 1.0444106353691227, + "grad_norm": 0.308881551027298, + "learning_rate": 4.482183908045977e-05, + "loss": 0.1492, + "step": 901 + }, + { + "epoch": 1.0455698036658698, + "grad_norm": 0.26184189319610596, + "learning_rate": 4.4816091954022995e-05, + "loss": 0.1225, + "step": 902 + }, + { + "epoch": 1.0467289719626167, + "grad_norm": 0.22493098676204681, + "learning_rate": 4.481034482758621e-05, + "loss": 0.1239, + "step": 903 + }, + { + "epoch": 1.0478881402593638, + "grad_norm": 0.23027804493904114, + "learning_rate": 4.4804597701149425e-05, + "loss": 0.1253, + "step": 904 + }, + { + "epoch": 1.049047308556111, + "grad_norm": 0.2515313923358917, + "learning_rate": 4.4798850574712646e-05, + "loss": 0.1268, + "step": 905 + }, + { + "epoch": 1.050206476852858, + "grad_norm": 0.30108925700187683, + "learning_rate": 4.479310344827586e-05, + "loss": 0.1306, + "step": 906 + }, + { + "epoch": 1.0513656451496052, + "grad_norm": 0.25616785883903503, + "learning_rate": 4.478735632183908e-05, + "loss": 0.1292, + "step": 907 + }, + { + "epoch": 1.0525248134463523, + "grad_norm": 0.27651333808898926, + "learning_rate": 4.4781609195402305e-05, + "loss": 0.1293, + "step": 908 + }, + { + "epoch": 1.0536839817430994, + "grad_norm": 0.26399362087249756, + "learning_rate": 4.477586206896552e-05, + "loss": 0.1379, + "step": 909 + }, + { + "epoch": 1.0548431500398463, + "grad_norm": 0.27588364481925964, + "learning_rate": 4.477011494252874e-05, + "loss": 0.1369, + "step": 910 + }, + { + "epoch": 1.0560023183365934, + "grad_norm": 0.22311675548553467, + "learning_rate": 4.4764367816091956e-05, + "loss": 0.119, + "step": 911 + }, + { + "epoch": 1.0571614866333405, + "grad_norm": 0.25451788306236267, + "learning_rate": 4.475862068965517e-05, + "loss": 0.1204, + "step": 912 + }, + { + "epoch": 1.0583206549300876, + "grad_norm": 0.25941815972328186, + "learning_rate": 4.475287356321839e-05, + "loss": 0.1291, + "step": 913 + }, + { + "epoch": 1.0594798232268348, + "grad_norm": 0.2863259017467499, + "learning_rate": 4.4747126436781614e-05, + "loss": 0.1269, + "step": 914 + }, + { + "epoch": 1.0606389915235819, + "grad_norm": 0.26459577679634094, + "learning_rate": 4.474137931034483e-05, + "loss": 0.1291, + "step": 915 + }, + { + "epoch": 1.061798159820329, + "grad_norm": 0.2957836985588074, + "learning_rate": 4.473563218390805e-05, + "loss": 0.1313, + "step": 916 + }, + { + "epoch": 1.062957328117076, + "grad_norm": 0.33022555708885193, + "learning_rate": 4.4729885057471266e-05, + "loss": 0.1444, + "step": 917 + }, + { + "epoch": 1.064116496413823, + "grad_norm": 0.26044270396232605, + "learning_rate": 4.472413793103448e-05, + "loss": 0.1297, + "step": 918 + }, + { + "epoch": 1.0652756647105701, + "grad_norm": 0.25342807173728943, + "learning_rate": 4.47183908045977e-05, + "loss": 0.1134, + "step": 919 + }, + { + "epoch": 1.0664348330073172, + "grad_norm": 0.27540135383605957, + "learning_rate": 4.471264367816092e-05, + "loss": 0.1218, + "step": 920 + }, + { + "epoch": 1.0675940013040643, + "grad_norm": 0.24183742702007294, + "learning_rate": 4.470689655172414e-05, + "loss": 0.1227, + "step": 921 + }, + { + "epoch": 1.0687531696008115, + "grad_norm": 0.493832528591156, + "learning_rate": 4.470114942528736e-05, + "loss": 0.132, + "step": 922 + }, + { + "epoch": 1.0699123378975586, + "grad_norm": 0.31524115800857544, + "learning_rate": 4.4695402298850576e-05, + "loss": 0.135, + "step": 923 + }, + { + "epoch": 1.0710715061943057, + "grad_norm": 0.272127240896225, + "learning_rate": 4.46896551724138e-05, + "loss": 0.13, + "step": 924 + }, + { + "epoch": 1.0722306744910526, + "grad_norm": 0.24013520777225494, + "learning_rate": 4.468390804597701e-05, + "loss": 0.1231, + "step": 925 + }, + { + "epoch": 1.0733898427877997, + "grad_norm": 0.44068050384521484, + "learning_rate": 4.467816091954023e-05, + "loss": 0.1219, + "step": 926 + }, + { + "epoch": 1.0745490110845468, + "grad_norm": 0.2556682825088501, + "learning_rate": 4.467241379310345e-05, + "loss": 0.132, + "step": 927 + }, + { + "epoch": 1.075708179381294, + "grad_norm": 0.22146081924438477, + "learning_rate": 4.466666666666667e-05, + "loss": 0.1174, + "step": 928 + }, + { + "epoch": 1.076867347678041, + "grad_norm": 0.2589811086654663, + "learning_rate": 4.466091954022989e-05, + "loss": 0.1235, + "step": 929 + }, + { + "epoch": 1.0780265159747882, + "grad_norm": 0.26327693462371826, + "learning_rate": 4.465517241379311e-05, + "loss": 0.1303, + "step": 930 + }, + { + "epoch": 1.0791856842715353, + "grad_norm": 0.25532066822052, + "learning_rate": 4.464942528735632e-05, + "loss": 0.1442, + "step": 931 + }, + { + "epoch": 1.0803448525682822, + "grad_norm": 0.31189385056495667, + "learning_rate": 4.4643678160919544e-05, + "loss": 0.1448, + "step": 932 + }, + { + "epoch": 1.0815040208650293, + "grad_norm": 0.26370447874069214, + "learning_rate": 4.463793103448276e-05, + "loss": 0.1257, + "step": 933 + }, + { + "epoch": 1.0826631891617764, + "grad_norm": 0.2603430151939392, + "learning_rate": 4.463218390804598e-05, + "loss": 0.1248, + "step": 934 + }, + { + "epoch": 1.0838223574585235, + "grad_norm": 0.2369643896818161, + "learning_rate": 4.46264367816092e-05, + "loss": 0.1211, + "step": 935 + }, + { + "epoch": 1.0849815257552706, + "grad_norm": 0.2757248878479004, + "learning_rate": 4.462068965517242e-05, + "loss": 0.1319, + "step": 936 + }, + { + "epoch": 1.0861406940520177, + "grad_norm": 0.249775692820549, + "learning_rate": 4.461494252873564e-05, + "loss": 0.1349, + "step": 937 + }, + { + "epoch": 1.0872998623487649, + "grad_norm": 0.25326481461524963, + "learning_rate": 4.460919540229885e-05, + "loss": 0.1313, + "step": 938 + }, + { + "epoch": 1.0884590306455117, + "grad_norm": 0.20008519291877747, + "learning_rate": 4.460344827586207e-05, + "loss": 0.1142, + "step": 939 + }, + { + "epoch": 1.0896181989422589, + "grad_norm": 0.29447588324546814, + "learning_rate": 4.459770114942529e-05, + "loss": 0.1376, + "step": 940 + }, + { + "epoch": 1.090777367239006, + "grad_norm": 0.22719112038612366, + "learning_rate": 4.4591954022988505e-05, + "loss": 0.1122, + "step": 941 + }, + { + "epoch": 1.091936535535753, + "grad_norm": 0.2523985207080841, + "learning_rate": 4.4586206896551726e-05, + "loss": 0.1289, + "step": 942 + }, + { + "epoch": 1.0930957038325002, + "grad_norm": 0.2906104028224945, + "learning_rate": 4.458045977011495e-05, + "loss": 0.1285, + "step": 943 + }, + { + "epoch": 1.0942548721292473, + "grad_norm": 0.344566285610199, + "learning_rate": 4.457471264367816e-05, + "loss": 0.1237, + "step": 944 + }, + { + "epoch": 1.0954140404259944, + "grad_norm": 0.2776789963245392, + "learning_rate": 4.456896551724138e-05, + "loss": 0.1273, + "step": 945 + }, + { + "epoch": 1.0965732087227413, + "grad_norm": 0.2559017241001129, + "learning_rate": 4.45632183908046e-05, + "loss": 0.1239, + "step": 946 + }, + { + "epoch": 1.0977323770194884, + "grad_norm": 0.2877829670906067, + "learning_rate": 4.4557471264367815e-05, + "loss": 0.1337, + "step": 947 + }, + { + "epoch": 1.0988915453162356, + "grad_norm": 0.3006168007850647, + "learning_rate": 4.4551724137931036e-05, + "loss": 0.1387, + "step": 948 + }, + { + "epoch": 1.1000507136129827, + "grad_norm": 0.2508884370326996, + "learning_rate": 4.454597701149426e-05, + "loss": 0.1225, + "step": 949 + }, + { + "epoch": 1.1012098819097298, + "grad_norm": 0.2920970618724823, + "learning_rate": 4.454022988505747e-05, + "loss": 0.1384, + "step": 950 + }, + { + "epoch": 1.102369050206477, + "grad_norm": 0.28268909454345703, + "learning_rate": 4.4534482758620694e-05, + "loss": 0.1344, + "step": 951 + }, + { + "epoch": 1.103528218503224, + "grad_norm": 0.41611185669898987, + "learning_rate": 4.452873563218391e-05, + "loss": 0.1416, + "step": 952 + }, + { + "epoch": 1.104687386799971, + "grad_norm": 0.27131763100624084, + "learning_rate": 4.4522988505747124e-05, + "loss": 0.1279, + "step": 953 + }, + { + "epoch": 1.105846555096718, + "grad_norm": 0.250305712223053, + "learning_rate": 4.4517241379310346e-05, + "loss": 0.1136, + "step": 954 + }, + { + "epoch": 1.1070057233934651, + "grad_norm": 0.35008758306503296, + "learning_rate": 4.451149425287357e-05, + "loss": 0.1449, + "step": 955 + }, + { + "epoch": 1.1081648916902123, + "grad_norm": 0.23740458488464355, + "learning_rate": 4.450574712643679e-05, + "loss": 0.1094, + "step": 956 + }, + { + "epoch": 1.1093240599869594, + "grad_norm": 0.25829577445983887, + "learning_rate": 4.4500000000000004e-05, + "loss": 0.1097, + "step": 957 + }, + { + "epoch": 1.1104832282837065, + "grad_norm": 0.24298064410686493, + "learning_rate": 4.449425287356322e-05, + "loss": 0.1226, + "step": 958 + }, + { + "epoch": 1.1116423965804536, + "grad_norm": 0.27012696862220764, + "learning_rate": 4.448850574712644e-05, + "loss": 0.1289, + "step": 959 + }, + { + "epoch": 1.1128015648772007, + "grad_norm": 0.289583683013916, + "learning_rate": 4.4482758620689656e-05, + "loss": 0.138, + "step": 960 + }, + { + "epoch": 1.1139607331739476, + "grad_norm": 0.23818224668502808, + "learning_rate": 4.447701149425287e-05, + "loss": 0.1302, + "step": 961 + }, + { + "epoch": 1.1151199014706947, + "grad_norm": 0.2659807801246643, + "learning_rate": 4.447126436781609e-05, + "loss": 0.1248, + "step": 962 + }, + { + "epoch": 1.1162790697674418, + "grad_norm": 0.27049756050109863, + "learning_rate": 4.4465517241379314e-05, + "loss": 0.1355, + "step": 963 + }, + { + "epoch": 1.117438238064189, + "grad_norm": 0.2728050947189331, + "learning_rate": 4.445977011494253e-05, + "loss": 0.1264, + "step": 964 + }, + { + "epoch": 1.118597406360936, + "grad_norm": 0.2654384672641754, + "learning_rate": 4.445402298850575e-05, + "loss": 0.1303, + "step": 965 + }, + { + "epoch": 1.1197565746576832, + "grad_norm": 0.25535136461257935, + "learning_rate": 4.4448275862068965e-05, + "loss": 0.1189, + "step": 966 + }, + { + "epoch": 1.12091574295443, + "grad_norm": 0.2649828791618347, + "learning_rate": 4.444252873563219e-05, + "loss": 0.1289, + "step": 967 + }, + { + "epoch": 1.1220749112511772, + "grad_norm": 0.26751139760017395, + "learning_rate": 4.44367816091954e-05, + "loss": 0.1306, + "step": 968 + }, + { + "epoch": 1.1232340795479243, + "grad_norm": 0.2819879353046417, + "learning_rate": 4.4431034482758624e-05, + "loss": 0.132, + "step": 969 + }, + { + "epoch": 1.1243932478446714, + "grad_norm": 0.24419303238391876, + "learning_rate": 4.4425287356321845e-05, + "loss": 0.1302, + "step": 970 + }, + { + "epoch": 1.1255524161414185, + "grad_norm": 0.23565685749053955, + "learning_rate": 4.441954022988506e-05, + "loss": 0.1215, + "step": 971 + }, + { + "epoch": 1.1267115844381657, + "grad_norm": 0.24753518402576447, + "learning_rate": 4.4413793103448275e-05, + "loss": 0.1195, + "step": 972 + }, + { + "epoch": 1.1278707527349128, + "grad_norm": 0.26664602756500244, + "learning_rate": 4.44080459770115e-05, + "loss": 0.1289, + "step": 973 + }, + { + "epoch": 1.1290299210316599, + "grad_norm": 0.2846687436103821, + "learning_rate": 4.440229885057471e-05, + "loss": 0.1374, + "step": 974 + }, + { + "epoch": 1.1301890893284068, + "grad_norm": 0.2790849506855011, + "learning_rate": 4.4396551724137933e-05, + "loss": 0.1427, + "step": 975 + }, + { + "epoch": 1.131348257625154, + "grad_norm": 0.27603012323379517, + "learning_rate": 4.4390804597701155e-05, + "loss": 0.1335, + "step": 976 + }, + { + "epoch": 1.132507425921901, + "grad_norm": 0.2694251835346222, + "learning_rate": 4.438505747126437e-05, + "loss": 0.1278, + "step": 977 + }, + { + "epoch": 1.1336665942186481, + "grad_norm": 0.25316059589385986, + "learning_rate": 4.437931034482759e-05, + "loss": 0.1387, + "step": 978 + }, + { + "epoch": 1.1348257625153952, + "grad_norm": 0.2663499116897583, + "learning_rate": 4.437356321839081e-05, + "loss": 0.1206, + "step": 979 + }, + { + "epoch": 1.1359849308121424, + "grad_norm": 0.2719017267227173, + "learning_rate": 4.436781609195402e-05, + "loss": 0.1401, + "step": 980 + }, + { + "epoch": 1.1371440991088893, + "grad_norm": 0.4000224769115448, + "learning_rate": 4.436206896551724e-05, + "loss": 0.1259, + "step": 981 + }, + { + "epoch": 1.1383032674056364, + "grad_norm": 0.2817740738391876, + "learning_rate": 4.435632183908046e-05, + "loss": 0.1307, + "step": 982 + }, + { + "epoch": 1.1394624357023835, + "grad_norm": 0.28263017535209656, + "learning_rate": 4.435057471264368e-05, + "loss": 0.1328, + "step": 983 + }, + { + "epoch": 1.1406216039991306, + "grad_norm": 0.2730136513710022, + "learning_rate": 4.43448275862069e-05, + "loss": 0.125, + "step": 984 + }, + { + "epoch": 1.1417807722958777, + "grad_norm": 0.24145883321762085, + "learning_rate": 4.4339080459770116e-05, + "loss": 0.1206, + "step": 985 + }, + { + "epoch": 1.1429399405926248, + "grad_norm": 0.29111185669898987, + "learning_rate": 4.433333333333334e-05, + "loss": 0.1469, + "step": 986 + }, + { + "epoch": 1.144099108889372, + "grad_norm": 0.28760775923728943, + "learning_rate": 4.432758620689655e-05, + "loss": 0.1338, + "step": 987 + }, + { + "epoch": 1.145258277186119, + "grad_norm": 0.2725701630115509, + "learning_rate": 4.432183908045977e-05, + "loss": 0.1314, + "step": 988 + }, + { + "epoch": 1.146417445482866, + "grad_norm": 0.2519955635070801, + "learning_rate": 4.431609195402299e-05, + "loss": 0.1277, + "step": 989 + }, + { + "epoch": 1.147576613779613, + "grad_norm": 0.28241047263145447, + "learning_rate": 4.431034482758621e-05, + "loss": 0.1309, + "step": 990 + }, + { + "epoch": 1.1487357820763602, + "grad_norm": 0.284697562456131, + "learning_rate": 4.4304597701149426e-05, + "loss": 0.1396, + "step": 991 + }, + { + "epoch": 1.1498949503731073, + "grad_norm": 0.2153967171907425, + "learning_rate": 4.429885057471265e-05, + "loss": 0.1074, + "step": 992 + }, + { + "epoch": 1.1510541186698544, + "grad_norm": 0.28190892934799194, + "learning_rate": 4.429310344827586e-05, + "loss": 0.1408, + "step": 993 + }, + { + "epoch": 1.1522132869666015, + "grad_norm": 0.26553207635879517, + "learning_rate": 4.4287356321839084e-05, + "loss": 0.1191, + "step": 994 + }, + { + "epoch": 1.1533724552633486, + "grad_norm": 0.3092179596424103, + "learning_rate": 4.42816091954023e-05, + "loss": 0.1408, + "step": 995 + }, + { + "epoch": 1.1545316235600955, + "grad_norm": 0.2503480315208435, + "learning_rate": 4.427586206896552e-05, + "loss": 0.1152, + "step": 996 + }, + { + "epoch": 1.1556907918568426, + "grad_norm": 0.279032826423645, + "learning_rate": 4.427011494252874e-05, + "loss": 0.118, + "step": 997 + }, + { + "epoch": 1.1568499601535898, + "grad_norm": 0.283666729927063, + "learning_rate": 4.426436781609196e-05, + "loss": 0.143, + "step": 998 + }, + { + "epoch": 1.1580091284503369, + "grad_norm": 0.3145494759082794, + "learning_rate": 4.425862068965517e-05, + "loss": 0.1467, + "step": 999 + }, + { + "epoch": 1.159168296747084, + "grad_norm": 0.24866163730621338, + "learning_rate": 4.4252873563218394e-05, + "loss": 0.1217, + "step": 1000 + }, + { + "epoch": 1.160327465043831, + "grad_norm": 0.29811036586761475, + "learning_rate": 4.424712643678161e-05, + "loss": 0.1343, + "step": 1001 + }, + { + "epoch": 1.1614866333405782, + "grad_norm": 0.2376304566860199, + "learning_rate": 4.4241379310344824e-05, + "loss": 0.1111, + "step": 1002 + }, + { + "epoch": 1.1626458016373253, + "grad_norm": 0.2960997223854065, + "learning_rate": 4.423563218390805e-05, + "loss": 0.1537, + "step": 1003 + }, + { + "epoch": 1.1638049699340722, + "grad_norm": 0.2618663012981415, + "learning_rate": 4.422988505747127e-05, + "loss": 0.1244, + "step": 1004 + }, + { + "epoch": 1.1649641382308193, + "grad_norm": 0.2316623479127884, + "learning_rate": 4.422413793103449e-05, + "loss": 0.1276, + "step": 1005 + }, + { + "epoch": 1.1661233065275665, + "grad_norm": 0.225045844912529, + "learning_rate": 4.4218390804597704e-05, + "loss": 0.12, + "step": 1006 + }, + { + "epoch": 1.1672824748243136, + "grad_norm": 0.2224287986755371, + "learning_rate": 4.421264367816092e-05, + "loss": 0.1281, + "step": 1007 + }, + { + "epoch": 1.1684416431210607, + "grad_norm": 0.23176699876785278, + "learning_rate": 4.420689655172414e-05, + "loss": 0.1115, + "step": 1008 + }, + { + "epoch": 1.1696008114178078, + "grad_norm": 0.2502862215042114, + "learning_rate": 4.4201149425287355e-05, + "loss": 0.1183, + "step": 1009 + }, + { + "epoch": 1.1707599797145547, + "grad_norm": 0.24579674005508423, + "learning_rate": 4.419540229885058e-05, + "loss": 0.1302, + "step": 1010 + }, + { + "epoch": 1.1719191480113018, + "grad_norm": 0.26011398434638977, + "learning_rate": 4.41896551724138e-05, + "loss": 0.1386, + "step": 1011 + }, + { + "epoch": 1.173078316308049, + "grad_norm": 0.26081839203834534, + "learning_rate": 4.4183908045977014e-05, + "loss": 0.1511, + "step": 1012 + }, + { + "epoch": 1.174237484604796, + "grad_norm": 0.22634555399417877, + "learning_rate": 4.4178160919540235e-05, + "loss": 0.1304, + "step": 1013 + }, + { + "epoch": 1.1753966529015432, + "grad_norm": 0.23882032930850983, + "learning_rate": 4.417241379310345e-05, + "loss": 0.1289, + "step": 1014 + }, + { + "epoch": 1.1765558211982903, + "grad_norm": 0.26218292117118835, + "learning_rate": 4.4166666666666665e-05, + "loss": 0.1373, + "step": 1015 + }, + { + "epoch": 1.1777149894950374, + "grad_norm": 0.3141017556190491, + "learning_rate": 4.416091954022989e-05, + "loss": 0.1389, + "step": 1016 + }, + { + "epoch": 1.1788741577917845, + "grad_norm": 0.2978722155094147, + "learning_rate": 4.415517241379311e-05, + "loss": 0.14, + "step": 1017 + }, + { + "epoch": 1.1800333260885314, + "grad_norm": 0.30704426765441895, + "learning_rate": 4.414942528735632e-05, + "loss": 0.1286, + "step": 1018 + }, + { + "epoch": 1.1811924943852785, + "grad_norm": 0.3598279058933258, + "learning_rate": 4.4143678160919545e-05, + "loss": 0.1354, + "step": 1019 + }, + { + "epoch": 1.1823516626820256, + "grad_norm": 0.2837865650653839, + "learning_rate": 4.413793103448276e-05, + "loss": 0.1311, + "step": 1020 + }, + { + "epoch": 1.1835108309787727, + "grad_norm": 0.2859646677970886, + "learning_rate": 4.4132183908045975e-05, + "loss": 0.1153, + "step": 1021 + }, + { + "epoch": 1.1846699992755199, + "grad_norm": 0.2848678529262543, + "learning_rate": 4.4126436781609196e-05, + "loss": 0.1344, + "step": 1022 + }, + { + "epoch": 1.185829167572267, + "grad_norm": 0.27502962946891785, + "learning_rate": 4.412068965517241e-05, + "loss": 0.1336, + "step": 1023 + }, + { + "epoch": 1.1869883358690139, + "grad_norm": 0.23905035853385925, + "learning_rate": 4.411494252873564e-05, + "loss": 0.1415, + "step": 1024 + }, + { + "epoch": 1.188147504165761, + "grad_norm": 0.3138931691646576, + "learning_rate": 4.4109195402298855e-05, + "loss": 0.1388, + "step": 1025 + }, + { + "epoch": 1.189306672462508, + "grad_norm": 0.2796839773654938, + "learning_rate": 4.410344827586207e-05, + "loss": 0.126, + "step": 1026 + }, + { + "epoch": 1.1904658407592552, + "grad_norm": 0.24468503892421722, + "learning_rate": 4.409770114942529e-05, + "loss": 0.1356, + "step": 1027 + }, + { + "epoch": 1.1916250090560023, + "grad_norm": 0.2509666383266449, + "learning_rate": 4.4091954022988506e-05, + "loss": 0.1223, + "step": 1028 + }, + { + "epoch": 1.1927841773527494, + "grad_norm": 0.2291174829006195, + "learning_rate": 4.408620689655172e-05, + "loss": 0.1114, + "step": 1029 + }, + { + "epoch": 1.1939433456494966, + "grad_norm": 0.21360966563224792, + "learning_rate": 4.408045977011494e-05, + "loss": 0.1149, + "step": 1030 + }, + { + "epoch": 1.1951025139462437, + "grad_norm": 0.273369163274765, + "learning_rate": 4.4074712643678164e-05, + "loss": 0.14, + "step": 1031 + }, + { + "epoch": 1.1962616822429906, + "grad_norm": 0.24241317808628082, + "learning_rate": 4.4068965517241386e-05, + "loss": 0.1242, + "step": 1032 + }, + { + "epoch": 1.1974208505397377, + "grad_norm": 0.29952454566955566, + "learning_rate": 4.40632183908046e-05, + "loss": 0.1363, + "step": 1033 + }, + { + "epoch": 1.1985800188364848, + "grad_norm": 0.2337886095046997, + "learning_rate": 4.4057471264367816e-05, + "loss": 0.1229, + "step": 1034 + }, + { + "epoch": 1.199739187133232, + "grad_norm": 0.21567285060882568, + "learning_rate": 4.405172413793104e-05, + "loss": 0.1208, + "step": 1035 + }, + { + "epoch": 1.200898355429979, + "grad_norm": 0.2876308262348175, + "learning_rate": 4.404597701149425e-05, + "loss": 0.1376, + "step": 1036 + }, + { + "epoch": 1.2020575237267261, + "grad_norm": 0.30270469188690186, + "learning_rate": 4.4040229885057474e-05, + "loss": 0.1444, + "step": 1037 + }, + { + "epoch": 1.203216692023473, + "grad_norm": 0.2885245680809021, + "learning_rate": 4.4034482758620696e-05, + "loss": 0.1327, + "step": 1038 + }, + { + "epoch": 1.2043758603202201, + "grad_norm": 0.3696737289428711, + "learning_rate": 4.402873563218391e-05, + "loss": 0.1444, + "step": 1039 + }, + { + "epoch": 1.2055350286169673, + "grad_norm": 0.26480332016944885, + "learning_rate": 4.4022988505747126e-05, + "loss": 0.1332, + "step": 1040 + }, + { + "epoch": 1.2066941969137144, + "grad_norm": 0.25360003113746643, + "learning_rate": 4.401724137931035e-05, + "loss": 0.1315, + "step": 1041 + }, + { + "epoch": 1.2078533652104615, + "grad_norm": 0.22617582976818085, + "learning_rate": 4.401149425287356e-05, + "loss": 0.1289, + "step": 1042 + }, + { + "epoch": 1.2090125335072086, + "grad_norm": 0.22125768661499023, + "learning_rate": 4.4005747126436784e-05, + "loss": 0.125, + "step": 1043 + }, + { + "epoch": 1.2101717018039557, + "grad_norm": 0.23760303854942322, + "learning_rate": 4.4000000000000006e-05, + "loss": 0.1201, + "step": 1044 + }, + { + "epoch": 1.2113308701007028, + "grad_norm": 0.28465020656585693, + "learning_rate": 4.399425287356322e-05, + "loss": 0.1306, + "step": 1045 + }, + { + "epoch": 1.21249003839745, + "grad_norm": 0.2427520602941513, + "learning_rate": 4.398850574712644e-05, + "loss": 0.1252, + "step": 1046 + }, + { + "epoch": 1.2136492066941968, + "grad_norm": 0.29493942856788635, + "learning_rate": 4.398275862068966e-05, + "loss": 0.1131, + "step": 1047 + }, + { + "epoch": 1.214808374990944, + "grad_norm": 0.24606949090957642, + "learning_rate": 4.397701149425287e-05, + "loss": 0.1254, + "step": 1048 + }, + { + "epoch": 1.215967543287691, + "grad_norm": 0.26658526062965393, + "learning_rate": 4.3971264367816094e-05, + "loss": 0.1299, + "step": 1049 + }, + { + "epoch": 1.2171267115844382, + "grad_norm": 0.22725453972816467, + "learning_rate": 4.396551724137931e-05, + "loss": 0.1209, + "step": 1050 + }, + { + "epoch": 1.2182858798811853, + "grad_norm": 0.26789578795433044, + "learning_rate": 4.395977011494253e-05, + "loss": 0.131, + "step": 1051 + }, + { + "epoch": 1.2194450481779324, + "grad_norm": 0.2898029685020447, + "learning_rate": 4.395402298850575e-05, + "loss": 0.1321, + "step": 1052 + }, + { + "epoch": 1.2206042164746793, + "grad_norm": 0.2893812656402588, + "learning_rate": 4.394827586206897e-05, + "loss": 0.151, + "step": 1053 + }, + { + "epoch": 1.2217633847714264, + "grad_norm": 0.2428007870912552, + "learning_rate": 4.394252873563219e-05, + "loss": 0.1221, + "step": 1054 + }, + { + "epoch": 1.2229225530681735, + "grad_norm": 0.31453728675842285, + "learning_rate": 4.3936781609195403e-05, + "loss": 0.127, + "step": 1055 + }, + { + "epoch": 1.2240817213649207, + "grad_norm": 0.22855356335639954, + "learning_rate": 4.393103448275862e-05, + "loss": 0.1232, + "step": 1056 + }, + { + "epoch": 1.2252408896616678, + "grad_norm": 0.21666891872882843, + "learning_rate": 4.392528735632184e-05, + "loss": 0.1174, + "step": 1057 + }, + { + "epoch": 1.226400057958415, + "grad_norm": 0.2279151976108551, + "learning_rate": 4.391954022988506e-05, + "loss": 0.1307, + "step": 1058 + }, + { + "epoch": 1.227559226255162, + "grad_norm": 0.2704702615737915, + "learning_rate": 4.3913793103448277e-05, + "loss": 0.1328, + "step": 1059 + }, + { + "epoch": 1.2287183945519091, + "grad_norm": 0.2585502564907074, + "learning_rate": 4.39080459770115e-05, + "loss": 0.1359, + "step": 1060 + }, + { + "epoch": 1.229877562848656, + "grad_norm": 0.29662445187568665, + "learning_rate": 4.390229885057471e-05, + "loss": 0.1302, + "step": 1061 + }, + { + "epoch": 1.2310367311454031, + "grad_norm": 0.24643655121326447, + "learning_rate": 4.3896551724137935e-05, + "loss": 0.1233, + "step": 1062 + }, + { + "epoch": 1.2321958994421502, + "grad_norm": 0.238943949341774, + "learning_rate": 4.389080459770115e-05, + "loss": 0.1271, + "step": 1063 + }, + { + "epoch": 1.2333550677388974, + "grad_norm": 0.27087101340293884, + "learning_rate": 4.388505747126437e-05, + "loss": 0.125, + "step": 1064 + }, + { + "epoch": 1.2345142360356445, + "grad_norm": 0.29983842372894287, + "learning_rate": 4.387931034482759e-05, + "loss": 0.135, + "step": 1065 + }, + { + "epoch": 1.2356734043323916, + "grad_norm": 0.24061062932014465, + "learning_rate": 4.387356321839081e-05, + "loss": 0.1365, + "step": 1066 + }, + { + "epoch": 1.2368325726291385, + "grad_norm": 0.24052466452121735, + "learning_rate": 4.386781609195402e-05, + "loss": 0.1175, + "step": 1067 + }, + { + "epoch": 1.2379917409258856, + "grad_norm": 0.2357567995786667, + "learning_rate": 4.3862068965517245e-05, + "loss": 0.1337, + "step": 1068 + }, + { + "epoch": 1.2391509092226327, + "grad_norm": 0.3319186270236969, + "learning_rate": 4.385632183908046e-05, + "loss": 0.1434, + "step": 1069 + }, + { + "epoch": 1.2403100775193798, + "grad_norm": 0.34508636593818665, + "learning_rate": 4.385057471264368e-05, + "loss": 0.1511, + "step": 1070 + }, + { + "epoch": 1.241469245816127, + "grad_norm": 0.22875401377677917, + "learning_rate": 4.3844827586206896e-05, + "loss": 0.1278, + "step": 1071 + }, + { + "epoch": 1.242628414112874, + "grad_norm": 0.2499420940876007, + "learning_rate": 4.383908045977012e-05, + "loss": 0.1204, + "step": 1072 + }, + { + "epoch": 1.2437875824096212, + "grad_norm": 0.24523814022541046, + "learning_rate": 4.383333333333334e-05, + "loss": 0.1351, + "step": 1073 + }, + { + "epoch": 1.2449467507063683, + "grad_norm": 0.28577345609664917, + "learning_rate": 4.3827586206896554e-05, + "loss": 0.121, + "step": 1074 + }, + { + "epoch": 1.2461059190031152, + "grad_norm": 0.21815964579582214, + "learning_rate": 4.382183908045977e-05, + "loss": 0.113, + "step": 1075 + }, + { + "epoch": 1.2472650872998623, + "grad_norm": 0.24391821026802063, + "learning_rate": 4.381609195402299e-05, + "loss": 0.1196, + "step": 1076 + }, + { + "epoch": 1.2484242555966094, + "grad_norm": 0.2738696336746216, + "learning_rate": 4.3810344827586206e-05, + "loss": 0.1436, + "step": 1077 + }, + { + "epoch": 1.2495834238933565, + "grad_norm": 0.24671784043312073, + "learning_rate": 4.380459770114943e-05, + "loss": 0.1221, + "step": 1078 + }, + { + "epoch": 1.2507425921901036, + "grad_norm": 0.249884694814682, + "learning_rate": 4.379885057471265e-05, + "loss": 0.1199, + "step": 1079 + }, + { + "epoch": 1.2519017604868508, + "grad_norm": 0.25286155939102173, + "learning_rate": 4.3793103448275864e-05, + "loss": 0.1381, + "step": 1080 + }, + { + "epoch": 1.2530609287835976, + "grad_norm": 0.36500653624534607, + "learning_rate": 4.3787356321839086e-05, + "loss": 0.13, + "step": 1081 + }, + { + "epoch": 1.2542200970803448, + "grad_norm": 0.2865230441093445, + "learning_rate": 4.37816091954023e-05, + "loss": 0.1223, + "step": 1082 + }, + { + "epoch": 1.2553792653770919, + "grad_norm": 0.2717812657356262, + "learning_rate": 4.3775862068965516e-05, + "loss": 0.118, + "step": 1083 + }, + { + "epoch": 1.256538433673839, + "grad_norm": 0.23932139575481415, + "learning_rate": 4.377011494252874e-05, + "loss": 0.1171, + "step": 1084 + }, + { + "epoch": 1.257697601970586, + "grad_norm": 0.25386103987693787, + "learning_rate": 4.376436781609196e-05, + "loss": 0.127, + "step": 1085 + }, + { + "epoch": 1.2588567702673332, + "grad_norm": 0.25412800908088684, + "learning_rate": 4.3758620689655174e-05, + "loss": 0.1394, + "step": 1086 + }, + { + "epoch": 1.2600159385640803, + "grad_norm": 0.30397337675094604, + "learning_rate": 4.3752873563218395e-05, + "loss": 0.1087, + "step": 1087 + }, + { + "epoch": 1.2611751068608275, + "grad_norm": 0.2501704692840576, + "learning_rate": 4.374712643678161e-05, + "loss": 0.1342, + "step": 1088 + }, + { + "epoch": 1.2623342751575746, + "grad_norm": 0.2684697210788727, + "learning_rate": 4.374137931034483e-05, + "loss": 0.1225, + "step": 1089 + }, + { + "epoch": 1.2634934434543215, + "grad_norm": 0.2351657599210739, + "learning_rate": 4.373563218390805e-05, + "loss": 0.1106, + "step": 1090 + }, + { + "epoch": 1.2646526117510686, + "grad_norm": 0.3316408097743988, + "learning_rate": 4.372988505747126e-05, + "loss": 0.1367, + "step": 1091 + }, + { + "epoch": 1.2658117800478157, + "grad_norm": 0.2677469849586487, + "learning_rate": 4.3724137931034484e-05, + "loss": 0.1221, + "step": 1092 + }, + { + "epoch": 1.2669709483445628, + "grad_norm": 0.5877587199211121, + "learning_rate": 4.3718390804597705e-05, + "loss": 0.1538, + "step": 1093 + }, + { + "epoch": 1.26813011664131, + "grad_norm": 0.2598472833633423, + "learning_rate": 4.371264367816092e-05, + "loss": 0.1325, + "step": 1094 + }, + { + "epoch": 1.2692892849380568, + "grad_norm": 0.3260670602321625, + "learning_rate": 4.370689655172414e-05, + "loss": 0.1372, + "step": 1095 + }, + { + "epoch": 1.270448453234804, + "grad_norm": 0.24232131242752075, + "learning_rate": 4.370114942528736e-05, + "loss": 0.1255, + "step": 1096 + }, + { + "epoch": 1.271607621531551, + "grad_norm": 0.3213041126728058, + "learning_rate": 4.369540229885058e-05, + "loss": 0.1325, + "step": 1097 + }, + { + "epoch": 1.2727667898282982, + "grad_norm": 0.2717669904232025, + "learning_rate": 4.368965517241379e-05, + "loss": 0.1447, + "step": 1098 + }, + { + "epoch": 1.2739259581250453, + "grad_norm": 0.2611459791660309, + "learning_rate": 4.3683908045977015e-05, + "loss": 0.1349, + "step": 1099 + }, + { + "epoch": 1.2750851264217924, + "grad_norm": 0.21271906793117523, + "learning_rate": 4.367816091954024e-05, + "loss": 0.1062, + "step": 1100 + }, + { + "epoch": 1.2762442947185395, + "grad_norm": 0.28656983375549316, + "learning_rate": 4.367241379310345e-05, + "loss": 0.1535, + "step": 1101 + }, + { + "epoch": 1.2774034630152866, + "grad_norm": 0.24881134927272797, + "learning_rate": 4.3666666666666666e-05, + "loss": 0.132, + "step": 1102 + }, + { + "epoch": 1.2785626313120337, + "grad_norm": 0.3379434645175934, + "learning_rate": 4.366091954022989e-05, + "loss": 0.1276, + "step": 1103 + }, + { + "epoch": 1.2797217996087806, + "grad_norm": 0.32010799646377563, + "learning_rate": 4.36551724137931e-05, + "loss": 0.133, + "step": 1104 + }, + { + "epoch": 1.2808809679055277, + "grad_norm": 0.22083823382854462, + "learning_rate": 4.3649425287356325e-05, + "loss": 0.1258, + "step": 1105 + }, + { + "epoch": 1.2820401362022749, + "grad_norm": 0.27177348732948303, + "learning_rate": 4.3643678160919546e-05, + "loss": 0.1299, + "step": 1106 + }, + { + "epoch": 1.283199304499022, + "grad_norm": 0.2737819254398346, + "learning_rate": 4.363793103448276e-05, + "loss": 0.1322, + "step": 1107 + }, + { + "epoch": 1.284358472795769, + "grad_norm": 0.2882055640220642, + "learning_rate": 4.363218390804598e-05, + "loss": 0.1317, + "step": 1108 + }, + { + "epoch": 1.285517641092516, + "grad_norm": 0.24766948819160461, + "learning_rate": 4.36264367816092e-05, + "loss": 0.1247, + "step": 1109 + }, + { + "epoch": 1.286676809389263, + "grad_norm": 0.27174562215805054, + "learning_rate": 4.362068965517241e-05, + "loss": 0.1404, + "step": 1110 + }, + { + "epoch": 1.2878359776860102, + "grad_norm": 0.25926515460014343, + "learning_rate": 4.3614942528735634e-05, + "loss": 0.1275, + "step": 1111 + }, + { + "epoch": 1.2889951459827573, + "grad_norm": 0.2558022439479828, + "learning_rate": 4.360919540229885e-05, + "loss": 0.1224, + "step": 1112 + }, + { + "epoch": 1.2901543142795044, + "grad_norm": 0.24418483674526215, + "learning_rate": 4.360344827586207e-05, + "loss": 0.1274, + "step": 1113 + }, + { + "epoch": 1.2913134825762516, + "grad_norm": 0.2943531274795532, + "learning_rate": 4.359770114942529e-05, + "loss": 0.1242, + "step": 1114 + }, + { + "epoch": 1.2924726508729987, + "grad_norm": 0.24752426147460938, + "learning_rate": 4.359195402298851e-05, + "loss": 0.1336, + "step": 1115 + }, + { + "epoch": 1.2936318191697458, + "grad_norm": 0.24908795952796936, + "learning_rate": 4.358620689655173e-05, + "loss": 0.1176, + "step": 1116 + }, + { + "epoch": 1.294790987466493, + "grad_norm": 0.290752649307251, + "learning_rate": 4.3580459770114944e-05, + "loss": 0.1532, + "step": 1117 + }, + { + "epoch": 1.2959501557632398, + "grad_norm": 0.2473299503326416, + "learning_rate": 4.357471264367816e-05, + "loss": 0.1166, + "step": 1118 + }, + { + "epoch": 1.297109324059987, + "grad_norm": 0.24173453450202942, + "learning_rate": 4.356896551724138e-05, + "loss": 0.1069, + "step": 1119 + }, + { + "epoch": 1.298268492356734, + "grad_norm": 0.2775953412055969, + "learning_rate": 4.35632183908046e-05, + "loss": 0.1163, + "step": 1120 + }, + { + "epoch": 1.2994276606534811, + "grad_norm": 0.2543400824069977, + "learning_rate": 4.355747126436782e-05, + "loss": 0.1359, + "step": 1121 + }, + { + "epoch": 1.3005868289502283, + "grad_norm": 0.2600906193256378, + "learning_rate": 4.355172413793104e-05, + "loss": 0.1297, + "step": 1122 + }, + { + "epoch": 1.3017459972469754, + "grad_norm": 0.21949701011180878, + "learning_rate": 4.3545977011494254e-05, + "loss": 0.1223, + "step": 1123 + }, + { + "epoch": 1.3029051655437223, + "grad_norm": 0.2765035629272461, + "learning_rate": 4.354022988505747e-05, + "loss": 0.1302, + "step": 1124 + }, + { + "epoch": 1.3040643338404694, + "grad_norm": 0.2629549205303192, + "learning_rate": 4.353448275862069e-05, + "loss": 0.1321, + "step": 1125 + }, + { + "epoch": 1.3052235021372165, + "grad_norm": 0.278767853975296, + "learning_rate": 4.352873563218391e-05, + "loss": 0.1252, + "step": 1126 + }, + { + "epoch": 1.3063826704339636, + "grad_norm": 0.3192477226257324, + "learning_rate": 4.3522988505747134e-05, + "loss": 0.1559, + "step": 1127 + }, + { + "epoch": 1.3075418387307107, + "grad_norm": 0.24097472429275513, + "learning_rate": 4.351724137931035e-05, + "loss": 0.1239, + "step": 1128 + }, + { + "epoch": 1.3087010070274578, + "grad_norm": 0.27867624163627625, + "learning_rate": 4.3511494252873564e-05, + "loss": 0.142, + "step": 1129 + }, + { + "epoch": 1.309860175324205, + "grad_norm": 0.2419125735759735, + "learning_rate": 4.3505747126436785e-05, + "loss": 0.1126, + "step": 1130 + }, + { + "epoch": 1.311019343620952, + "grad_norm": 0.24018104374408722, + "learning_rate": 4.35e-05, + "loss": 0.1324, + "step": 1131 + }, + { + "epoch": 1.3121785119176992, + "grad_norm": 0.2754385769367218, + "learning_rate": 4.3494252873563215e-05, + "loss": 0.1422, + "step": 1132 + }, + { + "epoch": 1.313337680214446, + "grad_norm": 0.24360622465610504, + "learning_rate": 4.348850574712644e-05, + "loss": 0.1409, + "step": 1133 + }, + { + "epoch": 1.3144968485111932, + "grad_norm": 0.2462378591299057, + "learning_rate": 4.348275862068966e-05, + "loss": 0.1299, + "step": 1134 + }, + { + "epoch": 1.3156560168079403, + "grad_norm": 0.22275815904140472, + "learning_rate": 4.347701149425288e-05, + "loss": 0.1283, + "step": 1135 + }, + { + "epoch": 1.3168151851046874, + "grad_norm": 0.388450026512146, + "learning_rate": 4.3471264367816095e-05, + "loss": 0.152, + "step": 1136 + }, + { + "epoch": 1.3179743534014345, + "grad_norm": 0.24839439988136292, + "learning_rate": 4.346551724137931e-05, + "loss": 0.1333, + "step": 1137 + }, + { + "epoch": 1.3191335216981814, + "grad_norm": 0.24341842532157898, + "learning_rate": 4.345977011494253e-05, + "loss": 0.1176, + "step": 1138 + }, + { + "epoch": 1.3202926899949285, + "grad_norm": 0.22959062457084656, + "learning_rate": 4.3454022988505747e-05, + "loss": 0.1337, + "step": 1139 + }, + { + "epoch": 1.3214518582916757, + "grad_norm": 0.2183791995048523, + "learning_rate": 4.344827586206897e-05, + "loss": 0.1157, + "step": 1140 + }, + { + "epoch": 1.3226110265884228, + "grad_norm": 0.3063965141773224, + "learning_rate": 4.344252873563219e-05, + "loss": 0.1365, + "step": 1141 + }, + { + "epoch": 1.32377019488517, + "grad_norm": 0.24136781692504883, + "learning_rate": 4.3436781609195405e-05, + "loss": 0.1255, + "step": 1142 + }, + { + "epoch": 1.324929363181917, + "grad_norm": 0.21908266842365265, + "learning_rate": 4.343103448275862e-05, + "loss": 0.1144, + "step": 1143 + }, + { + "epoch": 1.3260885314786641, + "grad_norm": 0.2468622326850891, + "learning_rate": 4.342528735632184e-05, + "loss": 0.1298, + "step": 1144 + }, + { + "epoch": 1.3272476997754112, + "grad_norm": 0.28959596157073975, + "learning_rate": 4.3419540229885056e-05, + "loss": 0.1318, + "step": 1145 + }, + { + "epoch": 1.3284068680721584, + "grad_norm": 0.2605699300765991, + "learning_rate": 4.341379310344828e-05, + "loss": 0.137, + "step": 1146 + }, + { + "epoch": 1.3295660363689052, + "grad_norm": 0.3156377971172333, + "learning_rate": 4.34080459770115e-05, + "loss": 0.1417, + "step": 1147 + }, + { + "epoch": 1.3307252046656524, + "grad_norm": 0.24471645057201385, + "learning_rate": 4.3402298850574715e-05, + "loss": 0.1268, + "step": 1148 + }, + { + "epoch": 1.3318843729623995, + "grad_norm": 0.24388937652111053, + "learning_rate": 4.3396551724137936e-05, + "loss": 0.13, + "step": 1149 + }, + { + "epoch": 1.3330435412591466, + "grad_norm": 0.21283157169818878, + "learning_rate": 4.339080459770115e-05, + "loss": 0.1213, + "step": 1150 + }, + { + "epoch": 1.3342027095558937, + "grad_norm": 0.232645183801651, + "learning_rate": 4.3385057471264366e-05, + "loss": 0.1245, + "step": 1151 + }, + { + "epoch": 1.3353618778526406, + "grad_norm": 0.25331243872642517, + "learning_rate": 4.337931034482759e-05, + "loss": 0.1232, + "step": 1152 + }, + { + "epoch": 1.3365210461493877, + "grad_norm": 0.24029718339443207, + "learning_rate": 4.33735632183908e-05, + "loss": 0.1161, + "step": 1153 + }, + { + "epoch": 1.3376802144461348, + "grad_norm": 0.2674920856952667, + "learning_rate": 4.336781609195403e-05, + "loss": 0.1361, + "step": 1154 + }, + { + "epoch": 1.338839382742882, + "grad_norm": 0.19390049576759338, + "learning_rate": 4.3362068965517246e-05, + "loss": 0.1138, + "step": 1155 + }, + { + "epoch": 1.339998551039629, + "grad_norm": 0.2703855633735657, + "learning_rate": 4.335632183908046e-05, + "loss": 0.1334, + "step": 1156 + }, + { + "epoch": 1.3411577193363762, + "grad_norm": 0.23119378089904785, + "learning_rate": 4.335057471264368e-05, + "loss": 0.1278, + "step": 1157 + }, + { + "epoch": 1.3423168876331233, + "grad_norm": 0.22249457240104675, + "learning_rate": 4.33448275862069e-05, + "loss": 0.1122, + "step": 1158 + }, + { + "epoch": 1.3434760559298704, + "grad_norm": 0.22288426756858826, + "learning_rate": 4.333908045977011e-05, + "loss": 0.1251, + "step": 1159 + }, + { + "epoch": 1.3446352242266175, + "grad_norm": 0.22571322321891785, + "learning_rate": 4.3333333333333334e-05, + "loss": 0.1167, + "step": 1160 + }, + { + "epoch": 1.3457943925233644, + "grad_norm": 0.25278669595718384, + "learning_rate": 4.3327586206896556e-05, + "loss": 0.1247, + "step": 1161 + }, + { + "epoch": 1.3469535608201115, + "grad_norm": 0.24313043057918549, + "learning_rate": 4.332183908045977e-05, + "loss": 0.132, + "step": 1162 + }, + { + "epoch": 1.3481127291168586, + "grad_norm": 0.26298508048057556, + "learning_rate": 4.331609195402299e-05, + "loss": 0.1262, + "step": 1163 + }, + { + "epoch": 1.3492718974136058, + "grad_norm": 0.24116156995296478, + "learning_rate": 4.331034482758621e-05, + "loss": 0.1313, + "step": 1164 + }, + { + "epoch": 1.3504310657103529, + "grad_norm": 0.24901200830936432, + "learning_rate": 4.330459770114943e-05, + "loss": 0.1097, + "step": 1165 + }, + { + "epoch": 1.3515902340071, + "grad_norm": 0.30485498905181885, + "learning_rate": 4.3298850574712644e-05, + "loss": 0.1178, + "step": 1166 + }, + { + "epoch": 1.3527494023038469, + "grad_norm": 0.3070286810398102, + "learning_rate": 4.3293103448275865e-05, + "loss": 0.1386, + "step": 1167 + }, + { + "epoch": 1.353908570600594, + "grad_norm": 0.27904579043388367, + "learning_rate": 4.328735632183909e-05, + "loss": 0.1504, + "step": 1168 + }, + { + "epoch": 1.3550677388973411, + "grad_norm": 0.28574293851852417, + "learning_rate": 4.32816091954023e-05, + "loss": 0.1273, + "step": 1169 + }, + { + "epoch": 1.3562269071940882, + "grad_norm": 0.27783069014549255, + "learning_rate": 4.327586206896552e-05, + "loss": 0.13, + "step": 1170 + }, + { + "epoch": 1.3573860754908353, + "grad_norm": 0.21755783259868622, + "learning_rate": 4.327011494252874e-05, + "loss": 0.1189, + "step": 1171 + }, + { + "epoch": 1.3585452437875825, + "grad_norm": 0.27805066108703613, + "learning_rate": 4.3264367816091954e-05, + "loss": 0.1295, + "step": 1172 + }, + { + "epoch": 1.3597044120843296, + "grad_norm": 0.26228439807891846, + "learning_rate": 4.3258620689655175e-05, + "loss": 0.1312, + "step": 1173 + }, + { + "epoch": 1.3608635803810767, + "grad_norm": 0.31947600841522217, + "learning_rate": 4.325287356321839e-05, + "loss": 0.1407, + "step": 1174 + }, + { + "epoch": 1.3620227486778236, + "grad_norm": 0.2713332176208496, + "learning_rate": 4.324712643678161e-05, + "loss": 0.1229, + "step": 1175 + }, + { + "epoch": 1.3631819169745707, + "grad_norm": 0.3652999699115753, + "learning_rate": 4.3241379310344833e-05, + "loss": 0.133, + "step": 1176 + }, + { + "epoch": 1.3643410852713178, + "grad_norm": 0.26640525460243225, + "learning_rate": 4.323563218390805e-05, + "loss": 0.1254, + "step": 1177 + }, + { + "epoch": 1.365500253568065, + "grad_norm": 0.24308764934539795, + "learning_rate": 4.322988505747126e-05, + "loss": 0.1311, + "step": 1178 + }, + { + "epoch": 1.366659421864812, + "grad_norm": 0.2529830038547516, + "learning_rate": 4.3224137931034485e-05, + "loss": 0.1257, + "step": 1179 + }, + { + "epoch": 1.3678185901615592, + "grad_norm": 0.2467876523733139, + "learning_rate": 4.32183908045977e-05, + "loss": 0.1387, + "step": 1180 + }, + { + "epoch": 1.368977758458306, + "grad_norm": 0.27905914187431335, + "learning_rate": 4.321264367816092e-05, + "loss": 0.119, + "step": 1181 + }, + { + "epoch": 1.3701369267550532, + "grad_norm": 0.2613096535205841, + "learning_rate": 4.320689655172414e-05, + "loss": 0.1392, + "step": 1182 + }, + { + "epoch": 1.3712960950518003, + "grad_norm": 0.3503858745098114, + "learning_rate": 4.320114942528736e-05, + "loss": 0.1457, + "step": 1183 + }, + { + "epoch": 1.3724552633485474, + "grad_norm": 0.26688286662101746, + "learning_rate": 4.319540229885058e-05, + "loss": 0.141, + "step": 1184 + }, + { + "epoch": 1.3736144316452945, + "grad_norm": 0.2871832251548767, + "learning_rate": 4.3189655172413795e-05, + "loss": 0.1323, + "step": 1185 + }, + { + "epoch": 1.3747735999420416, + "grad_norm": 0.27119460701942444, + "learning_rate": 4.318390804597701e-05, + "loss": 0.1207, + "step": 1186 + }, + { + "epoch": 1.3759327682387887, + "grad_norm": 0.23951269686222076, + "learning_rate": 4.317816091954023e-05, + "loss": 0.1135, + "step": 1187 + }, + { + "epoch": 1.3770919365355359, + "grad_norm": 0.221263587474823, + "learning_rate": 4.317241379310345e-05, + "loss": 0.1169, + "step": 1188 + }, + { + "epoch": 1.378251104832283, + "grad_norm": 0.2704179883003235, + "learning_rate": 4.316666666666667e-05, + "loss": 0.1341, + "step": 1189 + }, + { + "epoch": 1.3794102731290299, + "grad_norm": 0.2637876272201538, + "learning_rate": 4.316091954022989e-05, + "loss": 0.13, + "step": 1190 + }, + { + "epoch": 1.380569441425777, + "grad_norm": 0.2561126947402954, + "learning_rate": 4.3155172413793104e-05, + "loss": 0.128, + "step": 1191 + }, + { + "epoch": 1.381728609722524, + "grad_norm": 0.2545618712902069, + "learning_rate": 4.3149425287356326e-05, + "loss": 0.1245, + "step": 1192 + }, + { + "epoch": 1.3828877780192712, + "grad_norm": 0.20275923609733582, + "learning_rate": 4.314367816091954e-05, + "loss": 0.1168, + "step": 1193 + }, + { + "epoch": 1.3840469463160183, + "grad_norm": 0.2271161824464798, + "learning_rate": 4.3137931034482756e-05, + "loss": 0.1326, + "step": 1194 + }, + { + "epoch": 1.3852061146127652, + "grad_norm": 0.21496017277240753, + "learning_rate": 4.3132183908045984e-05, + "loss": 0.1248, + "step": 1195 + }, + { + "epoch": 1.3863652829095123, + "grad_norm": 0.2341107428073883, + "learning_rate": 4.31264367816092e-05, + "loss": 0.1268, + "step": 1196 + }, + { + "epoch": 1.3875244512062594, + "grad_norm": 0.2202032208442688, + "learning_rate": 4.3120689655172414e-05, + "loss": 0.1246, + "step": 1197 + }, + { + "epoch": 1.3886836195030066, + "grad_norm": 0.2526317834854126, + "learning_rate": 4.3114942528735636e-05, + "loss": 0.1276, + "step": 1198 + }, + { + "epoch": 1.3898427877997537, + "grad_norm": 0.29777175188064575, + "learning_rate": 4.310919540229885e-05, + "loss": 0.119, + "step": 1199 + }, + { + "epoch": 1.3910019560965008, + "grad_norm": 0.28763195872306824, + "learning_rate": 4.3103448275862066e-05, + "loss": 0.1363, + "step": 1200 + }, + { + "epoch": 1.392161124393248, + "grad_norm": 0.22191934287548065, + "learning_rate": 4.309770114942529e-05, + "loss": 0.1113, + "step": 1201 + }, + { + "epoch": 1.393320292689995, + "grad_norm": 0.36686575412750244, + "learning_rate": 4.309195402298851e-05, + "loss": 0.1451, + "step": 1202 + }, + { + "epoch": 1.3944794609867421, + "grad_norm": 0.2628101110458374, + "learning_rate": 4.308620689655173e-05, + "loss": 0.1255, + "step": 1203 + }, + { + "epoch": 1.395638629283489, + "grad_norm": 0.2633892893791199, + "learning_rate": 4.3080459770114946e-05, + "loss": 0.1391, + "step": 1204 + }, + { + "epoch": 1.3967977975802361, + "grad_norm": 0.28212958574295044, + "learning_rate": 4.307471264367816e-05, + "loss": 0.1383, + "step": 1205 + }, + { + "epoch": 1.3979569658769833, + "grad_norm": 0.24369271099567413, + "learning_rate": 4.306896551724138e-05, + "loss": 0.1227, + "step": 1206 + }, + { + "epoch": 1.3991161341737304, + "grad_norm": 0.23191668093204498, + "learning_rate": 4.30632183908046e-05, + "loss": 0.1255, + "step": 1207 + }, + { + "epoch": 1.4002753024704775, + "grad_norm": 0.2150292694568634, + "learning_rate": 4.305747126436782e-05, + "loss": 0.1172, + "step": 1208 + }, + { + "epoch": 1.4014344707672244, + "grad_norm": 0.2306421399116516, + "learning_rate": 4.305172413793104e-05, + "loss": 0.1213, + "step": 1209 + }, + { + "epoch": 1.4025936390639715, + "grad_norm": 0.22703979909420013, + "learning_rate": 4.3045977011494255e-05, + "loss": 0.1059, + "step": 1210 + }, + { + "epoch": 1.4037528073607186, + "grad_norm": 0.2609024941921234, + "learning_rate": 4.304022988505748e-05, + "loss": 0.1393, + "step": 1211 + }, + { + "epoch": 1.4049119756574657, + "grad_norm": 0.23271793127059937, + "learning_rate": 4.303448275862069e-05, + "loss": 0.1306, + "step": 1212 + }, + { + "epoch": 1.4060711439542128, + "grad_norm": 0.25955161452293396, + "learning_rate": 4.302873563218391e-05, + "loss": 0.1345, + "step": 1213 + }, + { + "epoch": 1.40723031225096, + "grad_norm": 0.20886264741420746, + "learning_rate": 4.302298850574713e-05, + "loss": 0.1255, + "step": 1214 + }, + { + "epoch": 1.408389480547707, + "grad_norm": 0.2347596287727356, + "learning_rate": 4.301724137931035e-05, + "loss": 0.1219, + "step": 1215 + }, + { + "epoch": 1.4095486488444542, + "grad_norm": 0.25307637453079224, + "learning_rate": 4.3011494252873565e-05, + "loss": 0.1122, + "step": 1216 + }, + { + "epoch": 1.4107078171412013, + "grad_norm": 0.28534573316574097, + "learning_rate": 4.300574712643679e-05, + "loss": 0.1356, + "step": 1217 + }, + { + "epoch": 1.4118669854379482, + "grad_norm": 0.2502347528934479, + "learning_rate": 4.3e-05, + "loss": 0.1256, + "step": 1218 + }, + { + "epoch": 1.4130261537346953, + "grad_norm": 0.31428301334381104, + "learning_rate": 4.2994252873563217e-05, + "loss": 0.1375, + "step": 1219 + }, + { + "epoch": 1.4141853220314424, + "grad_norm": 0.23887260258197784, + "learning_rate": 4.298850574712644e-05, + "loss": 0.127, + "step": 1220 + }, + { + "epoch": 1.4153444903281895, + "grad_norm": 0.2826530933380127, + "learning_rate": 4.298275862068965e-05, + "loss": 0.1431, + "step": 1221 + }, + { + "epoch": 1.4165036586249367, + "grad_norm": 0.22156329452991486, + "learning_rate": 4.2977011494252875e-05, + "loss": 0.1132, + "step": 1222 + }, + { + "epoch": 1.4176628269216838, + "grad_norm": 0.23711679875850677, + "learning_rate": 4.2971264367816096e-05, + "loss": 0.1234, + "step": 1223 + }, + { + "epoch": 1.4188219952184307, + "grad_norm": 0.21596437692642212, + "learning_rate": 4.296551724137931e-05, + "loss": 0.1195, + "step": 1224 + }, + { + "epoch": 1.4199811635151778, + "grad_norm": 0.26402637362480164, + "learning_rate": 4.295977011494253e-05, + "loss": 0.1268, + "step": 1225 + }, + { + "epoch": 1.421140331811925, + "grad_norm": 0.20864778757095337, + "learning_rate": 4.295402298850575e-05, + "loss": 0.1287, + "step": 1226 + }, + { + "epoch": 1.422299500108672, + "grad_norm": 0.23605617880821228, + "learning_rate": 4.294827586206896e-05, + "loss": 0.1432, + "step": 1227 + }, + { + "epoch": 1.4234586684054191, + "grad_norm": 0.2601798176765442, + "learning_rate": 4.2942528735632185e-05, + "loss": 0.1317, + "step": 1228 + }, + { + "epoch": 1.4246178367021662, + "grad_norm": 0.2347661405801773, + "learning_rate": 4.2936781609195406e-05, + "loss": 0.1233, + "step": 1229 + }, + { + "epoch": 1.4257770049989134, + "grad_norm": 0.22487115859985352, + "learning_rate": 4.293103448275863e-05, + "loss": 0.1314, + "step": 1230 + }, + { + "epoch": 1.4269361732956605, + "grad_norm": 0.2523593306541443, + "learning_rate": 4.292528735632184e-05, + "loss": 0.127, + "step": 1231 + }, + { + "epoch": 1.4280953415924076, + "grad_norm": 0.25972914695739746, + "learning_rate": 4.291954022988506e-05, + "loss": 0.1404, + "step": 1232 + }, + { + "epoch": 1.4292545098891545, + "grad_norm": 0.2787970006465912, + "learning_rate": 4.291379310344828e-05, + "loss": 0.1309, + "step": 1233 + }, + { + "epoch": 1.4304136781859016, + "grad_norm": 0.3272540271282196, + "learning_rate": 4.2908045977011494e-05, + "loss": 0.1271, + "step": 1234 + }, + { + "epoch": 1.4315728464826487, + "grad_norm": 0.302356094121933, + "learning_rate": 4.290229885057471e-05, + "loss": 0.1341, + "step": 1235 + }, + { + "epoch": 1.4327320147793958, + "grad_norm": 0.19032296538352966, + "learning_rate": 4.289655172413794e-05, + "loss": 0.1179, + "step": 1236 + }, + { + "epoch": 1.433891183076143, + "grad_norm": 0.28379547595977783, + "learning_rate": 4.289080459770115e-05, + "loss": 0.127, + "step": 1237 + }, + { + "epoch": 1.4350503513728898, + "grad_norm": 0.25689971446990967, + "learning_rate": 4.288505747126437e-05, + "loss": 0.1253, + "step": 1238 + }, + { + "epoch": 1.436209519669637, + "grad_norm": 0.235792875289917, + "learning_rate": 4.287931034482759e-05, + "loss": 0.1287, + "step": 1239 + }, + { + "epoch": 1.437368687966384, + "grad_norm": 0.24741043150424957, + "learning_rate": 4.2873563218390804e-05, + "loss": 0.1317, + "step": 1240 + }, + { + "epoch": 1.4385278562631312, + "grad_norm": 0.25098156929016113, + "learning_rate": 4.2867816091954026e-05, + "loss": 0.1291, + "step": 1241 + }, + { + "epoch": 1.4396870245598783, + "grad_norm": 0.26148924231529236, + "learning_rate": 4.286206896551724e-05, + "loss": 0.1356, + "step": 1242 + }, + { + "epoch": 1.4408461928566254, + "grad_norm": 0.27373388409614563, + "learning_rate": 4.285632183908046e-05, + "loss": 0.1457, + "step": 1243 + }, + { + "epoch": 1.4420053611533725, + "grad_norm": 0.23837333917617798, + "learning_rate": 4.2850574712643684e-05, + "loss": 0.132, + "step": 1244 + }, + { + "epoch": 1.4431645294501196, + "grad_norm": 0.2650034725666046, + "learning_rate": 4.28448275862069e-05, + "loss": 0.1317, + "step": 1245 + }, + { + "epoch": 1.4443236977468668, + "grad_norm": 0.2445426732301712, + "learning_rate": 4.2839080459770114e-05, + "loss": 0.1183, + "step": 1246 + }, + { + "epoch": 1.4454828660436136, + "grad_norm": 0.2151249349117279, + "learning_rate": 4.2833333333333335e-05, + "loss": 0.1185, + "step": 1247 + }, + { + "epoch": 1.4466420343403608, + "grad_norm": 0.2612224817276001, + "learning_rate": 4.282758620689655e-05, + "loss": 0.1225, + "step": 1248 + }, + { + "epoch": 1.4478012026371079, + "grad_norm": 0.2132655531167984, + "learning_rate": 4.282183908045977e-05, + "loss": 0.1083, + "step": 1249 + }, + { + "epoch": 1.448960370933855, + "grad_norm": 0.28365272283554077, + "learning_rate": 4.2816091954022994e-05, + "loss": 0.1269, + "step": 1250 + }, + { + "epoch": 1.450119539230602, + "grad_norm": 0.24323928356170654, + "learning_rate": 4.281034482758621e-05, + "loss": 0.138, + "step": 1251 + }, + { + "epoch": 1.451278707527349, + "grad_norm": 0.2997300624847412, + "learning_rate": 4.280459770114943e-05, + "loss": 0.1181, + "step": 1252 + }, + { + "epoch": 1.4524378758240961, + "grad_norm": 0.28692933917045593, + "learning_rate": 4.2798850574712645e-05, + "loss": 0.136, + "step": 1253 + }, + { + "epoch": 1.4535970441208432, + "grad_norm": 0.3969144821166992, + "learning_rate": 4.279310344827586e-05, + "loss": 0.1519, + "step": 1254 + }, + { + "epoch": 1.4547562124175903, + "grad_norm": 0.29358184337615967, + "learning_rate": 4.278735632183908e-05, + "loss": 0.1243, + "step": 1255 + }, + { + "epoch": 1.4559153807143375, + "grad_norm": 0.2625267803668976, + "learning_rate": 4.2781609195402303e-05, + "loss": 0.1228, + "step": 1256 + }, + { + "epoch": 1.4570745490110846, + "grad_norm": 0.24338027834892273, + "learning_rate": 4.2775862068965525e-05, + "loss": 0.1182, + "step": 1257 + }, + { + "epoch": 1.4582337173078317, + "grad_norm": 0.25080496072769165, + "learning_rate": 4.277011494252874e-05, + "loss": 0.1171, + "step": 1258 + }, + { + "epoch": 1.4593928856045788, + "grad_norm": 0.2200382798910141, + "learning_rate": 4.2764367816091955e-05, + "loss": 0.1089, + "step": 1259 + }, + { + "epoch": 1.460552053901326, + "grad_norm": 0.5557222962379456, + "learning_rate": 4.275862068965518e-05, + "loss": 0.1408, + "step": 1260 + }, + { + "epoch": 1.4617112221980728, + "grad_norm": 0.21948528289794922, + "learning_rate": 4.275287356321839e-05, + "loss": 0.1188, + "step": 1261 + }, + { + "epoch": 1.46287039049482, + "grad_norm": 0.2826273441314697, + "learning_rate": 4.2747126436781606e-05, + "loss": 0.1208, + "step": 1262 + }, + { + "epoch": 1.464029558791567, + "grad_norm": 0.3197879195213318, + "learning_rate": 4.274137931034483e-05, + "loss": 0.1428, + "step": 1263 + }, + { + "epoch": 1.4651887270883142, + "grad_norm": 0.26854708790779114, + "learning_rate": 4.273563218390805e-05, + "loss": 0.1354, + "step": 1264 + }, + { + "epoch": 1.4663478953850613, + "grad_norm": 0.20866204798221588, + "learning_rate": 4.2729885057471265e-05, + "loss": 0.1205, + "step": 1265 + }, + { + "epoch": 1.4675070636818082, + "grad_norm": 0.22703410685062408, + "learning_rate": 4.2724137931034486e-05, + "loss": 0.1165, + "step": 1266 + }, + { + "epoch": 1.4686662319785553, + "grad_norm": 0.2805153727531433, + "learning_rate": 4.27183908045977e-05, + "loss": 0.129, + "step": 1267 + }, + { + "epoch": 1.4698254002753024, + "grad_norm": 0.25746089220046997, + "learning_rate": 4.271264367816092e-05, + "loss": 0.113, + "step": 1268 + }, + { + "epoch": 1.4709845685720495, + "grad_norm": 0.24136294424533844, + "learning_rate": 4.270689655172414e-05, + "loss": 0.1215, + "step": 1269 + }, + { + "epoch": 1.4721437368687966, + "grad_norm": 0.21415215730667114, + "learning_rate": 4.270114942528736e-05, + "loss": 0.113, + "step": 1270 + }, + { + "epoch": 1.4733029051655437, + "grad_norm": 0.23764324188232422, + "learning_rate": 4.269540229885058e-05, + "loss": 0.1283, + "step": 1271 + }, + { + "epoch": 1.4744620734622909, + "grad_norm": 0.3138484060764313, + "learning_rate": 4.2689655172413796e-05, + "loss": 0.1294, + "step": 1272 + }, + { + "epoch": 1.475621241759038, + "grad_norm": 0.29593080282211304, + "learning_rate": 4.268390804597701e-05, + "loss": 0.1109, + "step": 1273 + }, + { + "epoch": 1.476780410055785, + "grad_norm": 0.19137899577617645, + "learning_rate": 4.267816091954023e-05, + "loss": 0.1062, + "step": 1274 + }, + { + "epoch": 1.477939578352532, + "grad_norm": 0.2660485506057739, + "learning_rate": 4.267241379310345e-05, + "loss": 0.1331, + "step": 1275 + }, + { + "epoch": 1.479098746649279, + "grad_norm": 0.2462538778781891, + "learning_rate": 4.266666666666667e-05, + "loss": 0.1241, + "step": 1276 + }, + { + "epoch": 1.4802579149460262, + "grad_norm": 0.28121456503868103, + "learning_rate": 4.266091954022989e-05, + "loss": 0.1301, + "step": 1277 + }, + { + "epoch": 1.4814170832427733, + "grad_norm": 0.2737043797969818, + "learning_rate": 4.2655172413793106e-05, + "loss": 0.139, + "step": 1278 + }, + { + "epoch": 1.4825762515395204, + "grad_norm": 0.271465539932251, + "learning_rate": 4.264942528735633e-05, + "loss": 0.1272, + "step": 1279 + }, + { + "epoch": 1.4837354198362676, + "grad_norm": 0.35305821895599365, + "learning_rate": 4.264367816091954e-05, + "loss": 0.13, + "step": 1280 + }, + { + "epoch": 1.4848945881330144, + "grad_norm": 0.26508283615112305, + "learning_rate": 4.263793103448276e-05, + "loss": 0.1277, + "step": 1281 + }, + { + "epoch": 1.4860537564297616, + "grad_norm": 0.2650332450866699, + "learning_rate": 4.263218390804598e-05, + "loss": 0.1276, + "step": 1282 + }, + { + "epoch": 1.4872129247265087, + "grad_norm": 0.234884575009346, + "learning_rate": 4.2626436781609194e-05, + "loss": 0.1385, + "step": 1283 + }, + { + "epoch": 1.4883720930232558, + "grad_norm": 0.26508384943008423, + "learning_rate": 4.2620689655172416e-05, + "loss": 0.1308, + "step": 1284 + }, + { + "epoch": 1.489531261320003, + "grad_norm": 0.2862217426300049, + "learning_rate": 4.261494252873564e-05, + "loss": 0.1252, + "step": 1285 + }, + { + "epoch": 1.49069042961675, + "grad_norm": 0.2019922137260437, + "learning_rate": 4.260919540229885e-05, + "loss": 0.1138, + "step": 1286 + }, + { + "epoch": 1.4918495979134971, + "grad_norm": 0.278028279542923, + "learning_rate": 4.2603448275862074e-05, + "loss": 0.1363, + "step": 1287 + }, + { + "epoch": 1.4930087662102443, + "grad_norm": 0.29411014914512634, + "learning_rate": 4.259770114942529e-05, + "loss": 0.1316, + "step": 1288 + }, + { + "epoch": 1.4941679345069914, + "grad_norm": 0.22040338814258575, + "learning_rate": 4.2591954022988504e-05, + "loss": 0.1376, + "step": 1289 + }, + { + "epoch": 1.4953271028037383, + "grad_norm": 0.2261519730091095, + "learning_rate": 4.2586206896551725e-05, + "loss": 0.1237, + "step": 1290 + }, + { + "epoch": 1.4964862711004854, + "grad_norm": 0.3036960959434509, + "learning_rate": 4.258045977011495e-05, + "loss": 0.126, + "step": 1291 + }, + { + "epoch": 1.4976454393972325, + "grad_norm": 0.2828342020511627, + "learning_rate": 4.257471264367816e-05, + "loss": 0.1441, + "step": 1292 + }, + { + "epoch": 1.4988046076939796, + "grad_norm": 0.2660335302352905, + "learning_rate": 4.2568965517241384e-05, + "loss": 0.1235, + "step": 1293 + }, + { + "epoch": 1.4999637759907267, + "grad_norm": 0.22510606050491333, + "learning_rate": 4.25632183908046e-05, + "loss": 0.1087, + "step": 1294 + }, + { + "epoch": 1.5011229442874736, + "grad_norm": 0.2649793028831482, + "learning_rate": 4.255747126436782e-05, + "loss": 0.1258, + "step": 1295 + }, + { + "epoch": 1.5022821125842207, + "grad_norm": 0.2566182315349579, + "learning_rate": 4.2551724137931035e-05, + "loss": 0.1272, + "step": 1296 + }, + { + "epoch": 1.5034412808809678, + "grad_norm": 0.2644808888435364, + "learning_rate": 4.254597701149426e-05, + "loss": 0.1268, + "step": 1297 + }, + { + "epoch": 1.504600449177715, + "grad_norm": 0.32291561365127563, + "learning_rate": 4.254022988505748e-05, + "loss": 0.1329, + "step": 1298 + }, + { + "epoch": 1.505759617474462, + "grad_norm": 0.34708988666534424, + "learning_rate": 4.253448275862069e-05, + "loss": 0.1569, + "step": 1299 + }, + { + "epoch": 1.5069187857712092, + "grad_norm": 0.23418490588665009, + "learning_rate": 4.252873563218391e-05, + "loss": 0.1303, + "step": 1300 + }, + { + "epoch": 1.5080779540679563, + "grad_norm": 0.35759875178337097, + "learning_rate": 4.252298850574713e-05, + "loss": 0.1226, + "step": 1301 + }, + { + "epoch": 1.5092371223647034, + "grad_norm": 0.255636066198349, + "learning_rate": 4.2517241379310345e-05, + "loss": 0.12, + "step": 1302 + }, + { + "epoch": 1.5103962906614505, + "grad_norm": 0.2545788586139679, + "learning_rate": 4.251149425287356e-05, + "loss": 0.1326, + "step": 1303 + }, + { + "epoch": 1.5115554589581977, + "grad_norm": 0.2811221480369568, + "learning_rate": 4.250574712643678e-05, + "loss": 0.1441, + "step": 1304 + }, + { + "epoch": 1.5127146272549445, + "grad_norm": 0.2685137987136841, + "learning_rate": 4.25e-05, + "loss": 0.1367, + "step": 1305 + }, + { + "epoch": 1.5138737955516917, + "grad_norm": 0.26587024331092834, + "learning_rate": 4.2494252873563225e-05, + "loss": 0.1196, + "step": 1306 + }, + { + "epoch": 1.5150329638484388, + "grad_norm": 0.296543151140213, + "learning_rate": 4.248850574712644e-05, + "loss": 0.1502, + "step": 1307 + }, + { + "epoch": 1.5161921321451857, + "grad_norm": 0.2668646275997162, + "learning_rate": 4.2482758620689655e-05, + "loss": 0.1351, + "step": 1308 + }, + { + "epoch": 1.5173513004419328, + "grad_norm": 0.2467673420906067, + "learning_rate": 4.2477011494252876e-05, + "loss": 0.1281, + "step": 1309 + }, + { + "epoch": 1.51851046873868, + "grad_norm": 0.25315842032432556, + "learning_rate": 4.247126436781609e-05, + "loss": 0.1286, + "step": 1310 + }, + { + "epoch": 1.519669637035427, + "grad_norm": 0.25049182772636414, + "learning_rate": 4.246551724137931e-05, + "loss": 0.1356, + "step": 1311 + }, + { + "epoch": 1.5208288053321741, + "grad_norm": 0.23967556655406952, + "learning_rate": 4.2459770114942534e-05, + "loss": 0.1135, + "step": 1312 + }, + { + "epoch": 1.5219879736289212, + "grad_norm": 0.2868068218231201, + "learning_rate": 4.245402298850575e-05, + "loss": 0.1309, + "step": 1313 + }, + { + "epoch": 1.5231471419256684, + "grad_norm": 0.21902504563331604, + "learning_rate": 4.244827586206897e-05, + "loss": 0.1156, + "step": 1314 + }, + { + "epoch": 1.5243063102224155, + "grad_norm": 0.24290820956230164, + "learning_rate": 4.2442528735632186e-05, + "loss": 0.1296, + "step": 1315 + }, + { + "epoch": 1.5254654785191626, + "grad_norm": 0.21558572351932526, + "learning_rate": 4.24367816091954e-05, + "loss": 0.1213, + "step": 1316 + }, + { + "epoch": 1.5266246468159097, + "grad_norm": 0.28430676460266113, + "learning_rate": 4.243103448275862e-05, + "loss": 0.1275, + "step": 1317 + }, + { + "epoch": 1.5277838151126568, + "grad_norm": 0.28674018383026123, + "learning_rate": 4.2425287356321844e-05, + "loss": 0.1315, + "step": 1318 + }, + { + "epoch": 1.5289429834094037, + "grad_norm": 0.31648436188697815, + "learning_rate": 4.241954022988506e-05, + "loss": 0.1457, + "step": 1319 + }, + { + "epoch": 1.5301021517061508, + "grad_norm": 0.2692152261734009, + "learning_rate": 4.241379310344828e-05, + "loss": 0.1261, + "step": 1320 + }, + { + "epoch": 1.531261320002898, + "grad_norm": 0.21987690031528473, + "learning_rate": 4.2408045977011496e-05, + "loss": 0.1224, + "step": 1321 + }, + { + "epoch": 1.532420488299645, + "grad_norm": 0.32818254828453064, + "learning_rate": 4.240229885057471e-05, + "loss": 0.128, + "step": 1322 + }, + { + "epoch": 1.533579656596392, + "grad_norm": 0.2926555275917053, + "learning_rate": 4.239655172413793e-05, + "loss": 0.1425, + "step": 1323 + }, + { + "epoch": 1.534738824893139, + "grad_norm": 0.2183428853750229, + "learning_rate": 4.239080459770115e-05, + "loss": 0.121, + "step": 1324 + }, + { + "epoch": 1.5358979931898862, + "grad_norm": 0.26924946904182434, + "learning_rate": 4.238505747126437e-05, + "loss": 0.1474, + "step": 1325 + }, + { + "epoch": 1.5370571614866333, + "grad_norm": 0.1868240386247635, + "learning_rate": 4.237931034482759e-05, + "loss": 0.1293, + "step": 1326 + }, + { + "epoch": 1.5382163297833804, + "grad_norm": 0.20834980905056, + "learning_rate": 4.2373563218390805e-05, + "loss": 0.1159, + "step": 1327 + }, + { + "epoch": 1.5393754980801275, + "grad_norm": 0.2959633767604828, + "learning_rate": 4.236781609195403e-05, + "loss": 0.1421, + "step": 1328 + }, + { + "epoch": 1.5405346663768746, + "grad_norm": 0.222950279712677, + "learning_rate": 4.236206896551724e-05, + "loss": 0.1279, + "step": 1329 + }, + { + "epoch": 1.5416938346736218, + "grad_norm": 0.2185351401567459, + "learning_rate": 4.235632183908046e-05, + "loss": 0.1247, + "step": 1330 + }, + { + "epoch": 1.5428530029703689, + "grad_norm": 0.16755683720111847, + "learning_rate": 4.235057471264368e-05, + "loss": 0.101, + "step": 1331 + }, + { + "epoch": 1.544012171267116, + "grad_norm": 0.2677669823169708, + "learning_rate": 4.23448275862069e-05, + "loss": 0.1339, + "step": 1332 + }, + { + "epoch": 1.5451713395638629, + "grad_norm": 0.252986341714859, + "learning_rate": 4.233908045977012e-05, + "loss": 0.1202, + "step": 1333 + }, + { + "epoch": 1.54633050786061, + "grad_norm": 0.2726978659629822, + "learning_rate": 4.233333333333334e-05, + "loss": 0.1274, + "step": 1334 + }, + { + "epoch": 1.547489676157357, + "grad_norm": 0.24727889895439148, + "learning_rate": 4.232758620689655e-05, + "loss": 0.1292, + "step": 1335 + }, + { + "epoch": 1.5486488444541042, + "grad_norm": 0.24011863768100739, + "learning_rate": 4.2321839080459773e-05, + "loss": 0.1145, + "step": 1336 + }, + { + "epoch": 1.5498080127508511, + "grad_norm": 0.26071444153785706, + "learning_rate": 4.231609195402299e-05, + "loss": 0.1208, + "step": 1337 + }, + { + "epoch": 1.5509671810475982, + "grad_norm": 0.2095252424478531, + "learning_rate": 4.231034482758621e-05, + "loss": 0.1164, + "step": 1338 + }, + { + "epoch": 1.5521263493443453, + "grad_norm": 0.3264429569244385, + "learning_rate": 4.230459770114943e-05, + "loss": 0.1344, + "step": 1339 + }, + { + "epoch": 1.5532855176410925, + "grad_norm": 0.24655984342098236, + "learning_rate": 4.2298850574712647e-05, + "loss": 0.124, + "step": 1340 + }, + { + "epoch": 1.5544446859378396, + "grad_norm": 0.28563857078552246, + "learning_rate": 4.229310344827586e-05, + "loss": 0.1312, + "step": 1341 + }, + { + "epoch": 1.5556038542345867, + "grad_norm": 0.2881647050380707, + "learning_rate": 4.228735632183908e-05, + "loss": 0.1418, + "step": 1342 + }, + { + "epoch": 1.5567630225313338, + "grad_norm": 0.2571274936199188, + "learning_rate": 4.22816091954023e-05, + "loss": 0.1213, + "step": 1343 + }, + { + "epoch": 1.557922190828081, + "grad_norm": 0.26905322074890137, + "learning_rate": 4.227586206896552e-05, + "loss": 0.1294, + "step": 1344 + }, + { + "epoch": 1.559081359124828, + "grad_norm": 0.3038524389266968, + "learning_rate": 4.2270114942528735e-05, + "loss": 0.1398, + "step": 1345 + }, + { + "epoch": 1.5602405274215752, + "grad_norm": 0.31705963611602783, + "learning_rate": 4.2264367816091956e-05, + "loss": 0.1284, + "step": 1346 + }, + { + "epoch": 1.5613996957183223, + "grad_norm": 0.24810083210468292, + "learning_rate": 4.225862068965518e-05, + "loss": 0.126, + "step": 1347 + }, + { + "epoch": 1.5625588640150692, + "grad_norm": 0.2836195230484009, + "learning_rate": 4.225287356321839e-05, + "loss": 0.1318, + "step": 1348 + }, + { + "epoch": 1.5637180323118163, + "grad_norm": 0.26509571075439453, + "learning_rate": 4.224712643678161e-05, + "loss": 0.1279, + "step": 1349 + }, + { + "epoch": 1.5648772006085634, + "grad_norm": 0.24432741105556488, + "learning_rate": 4.224137931034483e-05, + "loss": 0.1238, + "step": 1350 + }, + { + "epoch": 1.5660363689053103, + "grad_norm": 0.25199276208877563, + "learning_rate": 4.2235632183908044e-05, + "loss": 0.1153, + "step": 1351 + }, + { + "epoch": 1.5671955372020574, + "grad_norm": 0.2422485649585724, + "learning_rate": 4.2229885057471266e-05, + "loss": 0.1222, + "step": 1352 + }, + { + "epoch": 1.5683547054988045, + "grad_norm": 0.28373682498931885, + "learning_rate": 4.222413793103449e-05, + "loss": 0.143, + "step": 1353 + }, + { + "epoch": 1.5695138737955516, + "grad_norm": 0.323743999004364, + "learning_rate": 4.22183908045977e-05, + "loss": 0.1185, + "step": 1354 + }, + { + "epoch": 1.5706730420922987, + "grad_norm": 0.27437567710876465, + "learning_rate": 4.2212643678160924e-05, + "loss": 0.1209, + "step": 1355 + }, + { + "epoch": 1.5718322103890459, + "grad_norm": 0.20446433126926422, + "learning_rate": 4.220689655172414e-05, + "loss": 0.1164, + "step": 1356 + }, + { + "epoch": 1.572991378685793, + "grad_norm": 0.21873438358306885, + "learning_rate": 4.2201149425287354e-05, + "loss": 0.1209, + "step": 1357 + }, + { + "epoch": 1.57415054698254, + "grad_norm": 0.2323230654001236, + "learning_rate": 4.2195402298850576e-05, + "loss": 0.1233, + "step": 1358 + }, + { + "epoch": 1.5753097152792872, + "grad_norm": 0.2679866552352905, + "learning_rate": 4.21896551724138e-05, + "loss": 0.1219, + "step": 1359 + }, + { + "epoch": 1.5764688835760343, + "grad_norm": 0.29342472553253174, + "learning_rate": 4.218390804597701e-05, + "loss": 0.1449, + "step": 1360 + }, + { + "epoch": 1.5776280518727814, + "grad_norm": 0.29322659969329834, + "learning_rate": 4.2178160919540234e-05, + "loss": 0.1494, + "step": 1361 + }, + { + "epoch": 1.5787872201695283, + "grad_norm": 0.23010887205600739, + "learning_rate": 4.217241379310345e-05, + "loss": 0.1304, + "step": 1362 + }, + { + "epoch": 1.5799463884662754, + "grad_norm": 0.20732039213180542, + "learning_rate": 4.216666666666667e-05, + "loss": 0.113, + "step": 1363 + }, + { + "epoch": 1.5811055567630226, + "grad_norm": 0.21723432838916779, + "learning_rate": 4.2160919540229886e-05, + "loss": 0.1193, + "step": 1364 + }, + { + "epoch": 1.5822647250597695, + "grad_norm": 0.268611878156662, + "learning_rate": 4.21551724137931e-05, + "loss": 0.1447, + "step": 1365 + }, + { + "epoch": 1.5834238933565166, + "grad_norm": 0.2262781858444214, + "learning_rate": 4.214942528735633e-05, + "loss": 0.1188, + "step": 1366 + }, + { + "epoch": 1.5845830616532637, + "grad_norm": 0.21195755898952484, + "learning_rate": 4.2143678160919544e-05, + "loss": 0.1156, + "step": 1367 + }, + { + "epoch": 1.5857422299500108, + "grad_norm": 0.26163250207901, + "learning_rate": 4.213793103448276e-05, + "loss": 0.1348, + "step": 1368 + }, + { + "epoch": 1.586901398246758, + "grad_norm": 0.3384718894958496, + "learning_rate": 4.213218390804598e-05, + "loss": 0.1473, + "step": 1369 + }, + { + "epoch": 1.588060566543505, + "grad_norm": 0.26248493790626526, + "learning_rate": 4.2126436781609195e-05, + "loss": 0.1212, + "step": 1370 + }, + { + "epoch": 1.5892197348402521, + "grad_norm": 0.21204310655593872, + "learning_rate": 4.212068965517242e-05, + "loss": 0.1131, + "step": 1371 + }, + { + "epoch": 1.5903789031369993, + "grad_norm": 0.2920737862586975, + "learning_rate": 4.211494252873563e-05, + "loss": 0.134, + "step": 1372 + }, + { + "epoch": 1.5915380714337464, + "grad_norm": 0.2683486044406891, + "learning_rate": 4.2109195402298854e-05, + "loss": 0.1297, + "step": 1373 + }, + { + "epoch": 1.5926972397304935, + "grad_norm": 0.25629928708076477, + "learning_rate": 4.2103448275862075e-05, + "loss": 0.1364, + "step": 1374 + }, + { + "epoch": 1.5938564080272406, + "grad_norm": 0.22611750662326813, + "learning_rate": 4.209770114942529e-05, + "loss": 0.1189, + "step": 1375 + }, + { + "epoch": 1.5950155763239875, + "grad_norm": 0.2171134501695633, + "learning_rate": 4.2091954022988505e-05, + "loss": 0.1224, + "step": 1376 + }, + { + "epoch": 1.5961747446207346, + "grad_norm": 0.3188816010951996, + "learning_rate": 4.208620689655173e-05, + "loss": 0.1325, + "step": 1377 + }, + { + "epoch": 1.5973339129174817, + "grad_norm": 0.25183406472206116, + "learning_rate": 4.208045977011494e-05, + "loss": 0.116, + "step": 1378 + }, + { + "epoch": 1.5984930812142288, + "grad_norm": 0.274368017911911, + "learning_rate": 4.207471264367816e-05, + "loss": 0.1388, + "step": 1379 + }, + { + "epoch": 1.5996522495109757, + "grad_norm": 0.2406664341688156, + "learning_rate": 4.2068965517241385e-05, + "loss": 0.1225, + "step": 1380 + }, + { + "epoch": 1.6008114178077228, + "grad_norm": 0.22044144570827484, + "learning_rate": 4.20632183908046e-05, + "loss": 0.1212, + "step": 1381 + }, + { + "epoch": 1.60197058610447, + "grad_norm": 0.2523006200790405, + "learning_rate": 4.205747126436782e-05, + "loss": 0.1142, + "step": 1382 + }, + { + "epoch": 1.603129754401217, + "grad_norm": 0.26407095789909363, + "learning_rate": 4.2051724137931036e-05, + "loss": 0.1252, + "step": 1383 + }, + { + "epoch": 1.6042889226979642, + "grad_norm": 0.22272075712680817, + "learning_rate": 4.204597701149425e-05, + "loss": 0.1177, + "step": 1384 + }, + { + "epoch": 1.6054480909947113, + "grad_norm": 0.2437649518251419, + "learning_rate": 4.204022988505747e-05, + "loss": 0.1238, + "step": 1385 + }, + { + "epoch": 1.6066072592914584, + "grad_norm": 0.29245513677597046, + "learning_rate": 4.203448275862069e-05, + "loss": 0.1389, + "step": 1386 + }, + { + "epoch": 1.6077664275882055, + "grad_norm": 0.25167253613471985, + "learning_rate": 4.202873563218391e-05, + "loss": 0.131, + "step": 1387 + }, + { + "epoch": 1.6089255958849527, + "grad_norm": 0.2610798478126526, + "learning_rate": 4.202298850574713e-05, + "loss": 0.1415, + "step": 1388 + }, + { + "epoch": 1.6100847641816998, + "grad_norm": 0.21162807941436768, + "learning_rate": 4.2017241379310346e-05, + "loss": 0.1296, + "step": 1389 + }, + { + "epoch": 1.6112439324784467, + "grad_norm": 0.26718345284461975, + "learning_rate": 4.201149425287357e-05, + "loss": 0.1346, + "step": 1390 + }, + { + "epoch": 1.6124031007751938, + "grad_norm": 0.2872393727302551, + "learning_rate": 4.200574712643678e-05, + "loss": 0.1452, + "step": 1391 + }, + { + "epoch": 1.613562269071941, + "grad_norm": 0.2846381366252899, + "learning_rate": 4.2e-05, + "loss": 0.1281, + "step": 1392 + }, + { + "epoch": 1.614721437368688, + "grad_norm": 0.2739090919494629, + "learning_rate": 4.199425287356322e-05, + "loss": 0.1248, + "step": 1393 + }, + { + "epoch": 1.615880605665435, + "grad_norm": 0.24870949983596802, + "learning_rate": 4.198850574712644e-05, + "loss": 0.1377, + "step": 1394 + }, + { + "epoch": 1.617039773962182, + "grad_norm": 0.26134321093559265, + "learning_rate": 4.1982758620689656e-05, + "loss": 0.1348, + "step": 1395 + }, + { + "epoch": 1.6181989422589291, + "grad_norm": 0.2896818220615387, + "learning_rate": 4.197701149425288e-05, + "loss": 0.1376, + "step": 1396 + }, + { + "epoch": 1.6193581105556762, + "grad_norm": 0.24523970484733582, + "learning_rate": 4.197126436781609e-05, + "loss": 0.1289, + "step": 1397 + }, + { + "epoch": 1.6205172788524234, + "grad_norm": 0.2446785867214203, + "learning_rate": 4.196551724137931e-05, + "loss": 0.1219, + "step": 1398 + }, + { + "epoch": 1.6216764471491705, + "grad_norm": 0.2920502722263336, + "learning_rate": 4.195977011494253e-05, + "loss": 0.1609, + "step": 1399 + }, + { + "epoch": 1.6228356154459176, + "grad_norm": 0.24321280419826508, + "learning_rate": 4.195402298850575e-05, + "loss": 0.1217, + "step": 1400 + }, + { + "epoch": 1.6239947837426647, + "grad_norm": 0.20927608013153076, + "learning_rate": 4.194827586206897e-05, + "loss": 0.1071, + "step": 1401 + }, + { + "epoch": 1.6251539520394118, + "grad_norm": 0.2517341077327728, + "learning_rate": 4.194252873563219e-05, + "loss": 0.1359, + "step": 1402 + }, + { + "epoch": 1.626313120336159, + "grad_norm": 0.2402840554714203, + "learning_rate": 4.19367816091954e-05, + "loss": 0.1352, + "step": 1403 + }, + { + "epoch": 1.627472288632906, + "grad_norm": 0.22628480195999146, + "learning_rate": 4.1931034482758624e-05, + "loss": 0.1271, + "step": 1404 + }, + { + "epoch": 1.628631456929653, + "grad_norm": 0.24545376002788544, + "learning_rate": 4.192528735632184e-05, + "loss": 0.1338, + "step": 1405 + }, + { + "epoch": 1.6297906252264, + "grad_norm": 0.2462320774793625, + "learning_rate": 4.1919540229885054e-05, + "loss": 0.1375, + "step": 1406 + }, + { + "epoch": 1.6309497935231472, + "grad_norm": 0.2645837068557739, + "learning_rate": 4.191379310344828e-05, + "loss": 0.1327, + "step": 1407 + }, + { + "epoch": 1.632108961819894, + "grad_norm": 0.2443280965089798, + "learning_rate": 4.19080459770115e-05, + "loss": 0.1211, + "step": 1408 + }, + { + "epoch": 1.6332681301166412, + "grad_norm": 0.260450154542923, + "learning_rate": 4.190229885057472e-05, + "loss": 0.1318, + "step": 1409 + }, + { + "epoch": 1.6344272984133883, + "grad_norm": 0.22310788929462433, + "learning_rate": 4.1896551724137934e-05, + "loss": 0.123, + "step": 1410 + }, + { + "epoch": 1.6355864667101354, + "grad_norm": 0.24180607497692108, + "learning_rate": 4.189080459770115e-05, + "loss": 0.1214, + "step": 1411 + }, + { + "epoch": 1.6367456350068825, + "grad_norm": 0.33246803283691406, + "learning_rate": 4.188505747126437e-05, + "loss": 0.159, + "step": 1412 + }, + { + "epoch": 1.6379048033036296, + "grad_norm": 0.2154608517885208, + "learning_rate": 4.1879310344827585e-05, + "loss": 0.1242, + "step": 1413 + }, + { + "epoch": 1.6390639716003768, + "grad_norm": 0.24036885797977448, + "learning_rate": 4.187356321839081e-05, + "loss": 0.1097, + "step": 1414 + }, + { + "epoch": 1.6402231398971239, + "grad_norm": 0.21203725039958954, + "learning_rate": 4.186781609195403e-05, + "loss": 0.1232, + "step": 1415 + }, + { + "epoch": 1.641382308193871, + "grad_norm": 0.24248747527599335, + "learning_rate": 4.1862068965517243e-05, + "loss": 0.1223, + "step": 1416 + }, + { + "epoch": 1.642541476490618, + "grad_norm": 0.2965581715106964, + "learning_rate": 4.185632183908046e-05, + "loss": 0.1277, + "step": 1417 + }, + { + "epoch": 1.6437006447873652, + "grad_norm": 0.24066859483718872, + "learning_rate": 4.185057471264368e-05, + "loss": 0.1388, + "step": 1418 + }, + { + "epoch": 1.644859813084112, + "grad_norm": 0.23044906556606293, + "learning_rate": 4.1844827586206895e-05, + "loss": 0.1173, + "step": 1419 + }, + { + "epoch": 1.6460189813808592, + "grad_norm": 0.2829865515232086, + "learning_rate": 4.1839080459770117e-05, + "loss": 0.1264, + "step": 1420 + }, + { + "epoch": 1.6471781496776063, + "grad_norm": 0.37790974974632263, + "learning_rate": 4.183333333333334e-05, + "loss": 0.1158, + "step": 1421 + }, + { + "epoch": 1.6483373179743535, + "grad_norm": 0.23219698667526245, + "learning_rate": 4.182758620689655e-05, + "loss": 0.1293, + "step": 1422 + }, + { + "epoch": 1.6494964862711003, + "grad_norm": 0.26675254106521606, + "learning_rate": 4.1821839080459775e-05, + "loss": 0.1245, + "step": 1423 + }, + { + "epoch": 1.6506556545678475, + "grad_norm": 0.2584593594074249, + "learning_rate": 4.181609195402299e-05, + "loss": 0.1132, + "step": 1424 + }, + { + "epoch": 1.6518148228645946, + "grad_norm": 0.2532110810279846, + "learning_rate": 4.1810344827586205e-05, + "loss": 0.1265, + "step": 1425 + }, + { + "epoch": 1.6529739911613417, + "grad_norm": 0.27470529079437256, + "learning_rate": 4.1804597701149426e-05, + "loss": 0.1422, + "step": 1426 + }, + { + "epoch": 1.6541331594580888, + "grad_norm": 0.24903489649295807, + "learning_rate": 4.179885057471265e-05, + "loss": 0.1065, + "step": 1427 + }, + { + "epoch": 1.655292327754836, + "grad_norm": 0.29945898056030273, + "learning_rate": 4.179310344827587e-05, + "loss": 0.1418, + "step": 1428 + }, + { + "epoch": 1.656451496051583, + "grad_norm": 0.19737482070922852, + "learning_rate": 4.1787356321839085e-05, + "loss": 0.1162, + "step": 1429 + }, + { + "epoch": 1.6576106643483302, + "grad_norm": 0.31681379675865173, + "learning_rate": 4.17816091954023e-05, + "loss": 0.1521, + "step": 1430 + }, + { + "epoch": 1.6587698326450773, + "grad_norm": 0.23246780037879944, + "learning_rate": 4.177586206896552e-05, + "loss": 0.1156, + "step": 1431 + }, + { + "epoch": 1.6599290009418244, + "grad_norm": 0.2456778883934021, + "learning_rate": 4.1770114942528736e-05, + "loss": 0.1194, + "step": 1432 + }, + { + "epoch": 1.6610881692385713, + "grad_norm": 0.23620393872261047, + "learning_rate": 4.176436781609195e-05, + "loss": 0.1231, + "step": 1433 + }, + { + "epoch": 1.6622473375353184, + "grad_norm": 0.20471753180027008, + "learning_rate": 4.175862068965517e-05, + "loss": 0.109, + "step": 1434 + }, + { + "epoch": 1.6634065058320655, + "grad_norm": 0.2827279269695282, + "learning_rate": 4.1752873563218394e-05, + "loss": 0.1467, + "step": 1435 + }, + { + "epoch": 1.6645656741288126, + "grad_norm": 0.21254539489746094, + "learning_rate": 4.1747126436781616e-05, + "loss": 0.1182, + "step": 1436 + }, + { + "epoch": 1.6657248424255595, + "grad_norm": 0.24518734216690063, + "learning_rate": 4.174137931034483e-05, + "loss": 0.1326, + "step": 1437 + }, + { + "epoch": 1.6668840107223066, + "grad_norm": 0.2632474899291992, + "learning_rate": 4.1735632183908046e-05, + "loss": 0.1357, + "step": 1438 + }, + { + "epoch": 1.6680431790190537, + "grad_norm": 0.23681853711605072, + "learning_rate": 4.172988505747127e-05, + "loss": 0.1272, + "step": 1439 + }, + { + "epoch": 1.6692023473158009, + "grad_norm": 0.25653204321861267, + "learning_rate": 4.172413793103448e-05, + "loss": 0.1305, + "step": 1440 + }, + { + "epoch": 1.670361515612548, + "grad_norm": 0.22474606335163116, + "learning_rate": 4.1718390804597704e-05, + "loss": 0.1192, + "step": 1441 + }, + { + "epoch": 1.671520683909295, + "grad_norm": 0.2875736951828003, + "learning_rate": 4.1712643678160926e-05, + "loss": 0.1271, + "step": 1442 + }, + { + "epoch": 1.6726798522060422, + "grad_norm": 0.27800050377845764, + "learning_rate": 4.170689655172414e-05, + "loss": 0.1348, + "step": 1443 + }, + { + "epoch": 1.6738390205027893, + "grad_norm": 0.2516680955886841, + "learning_rate": 4.1701149425287356e-05, + "loss": 0.1128, + "step": 1444 + }, + { + "epoch": 1.6749981887995364, + "grad_norm": 0.3367115259170532, + "learning_rate": 4.169540229885058e-05, + "loss": 0.1362, + "step": 1445 + }, + { + "epoch": 1.6761573570962836, + "grad_norm": 0.3319508731365204, + "learning_rate": 4.168965517241379e-05, + "loss": 0.1358, + "step": 1446 + }, + { + "epoch": 1.6773165253930307, + "grad_norm": 0.2614979147911072, + "learning_rate": 4.1683908045977014e-05, + "loss": 0.1244, + "step": 1447 + }, + { + "epoch": 1.6784756936897776, + "grad_norm": 0.24296019971370697, + "learning_rate": 4.1678160919540235e-05, + "loss": 0.1223, + "step": 1448 + }, + { + "epoch": 1.6796348619865247, + "grad_norm": 0.26165443658828735, + "learning_rate": 4.167241379310345e-05, + "loss": 0.1326, + "step": 1449 + }, + { + "epoch": 1.6807940302832718, + "grad_norm": 0.24626079201698303, + "learning_rate": 4.166666666666667e-05, + "loss": 0.1189, + "step": 1450 + }, + { + "epoch": 1.6819531985800187, + "grad_norm": 0.24544042348861694, + "learning_rate": 4.166091954022989e-05, + "loss": 0.1343, + "step": 1451 + }, + { + "epoch": 1.6831123668767658, + "grad_norm": 0.23547019064426422, + "learning_rate": 4.16551724137931e-05, + "loss": 0.1176, + "step": 1452 + }, + { + "epoch": 1.684271535173513, + "grad_norm": 0.2702592611312866, + "learning_rate": 4.1649425287356324e-05, + "loss": 0.1488, + "step": 1453 + }, + { + "epoch": 1.68543070347026, + "grad_norm": 0.24035340547561646, + "learning_rate": 4.164367816091954e-05, + "loss": 0.1348, + "step": 1454 + }, + { + "epoch": 1.6865898717670071, + "grad_norm": 0.22424210608005524, + "learning_rate": 4.163793103448276e-05, + "loss": 0.13, + "step": 1455 + }, + { + "epoch": 1.6877490400637543, + "grad_norm": 0.2859560549259186, + "learning_rate": 4.163218390804598e-05, + "loss": 0.1486, + "step": 1456 + }, + { + "epoch": 1.6889082083605014, + "grad_norm": 0.21824337542057037, + "learning_rate": 4.16264367816092e-05, + "loss": 0.1174, + "step": 1457 + }, + { + "epoch": 1.6900673766572485, + "grad_norm": 0.24200040102005005, + "learning_rate": 4.162068965517242e-05, + "loss": 0.1324, + "step": 1458 + }, + { + "epoch": 1.6912265449539956, + "grad_norm": 0.2552363872528076, + "learning_rate": 4.161494252873563e-05, + "loss": 0.1313, + "step": 1459 + }, + { + "epoch": 1.6923857132507427, + "grad_norm": 0.19345875084400177, + "learning_rate": 4.160919540229885e-05, + "loss": 0.1174, + "step": 1460 + }, + { + "epoch": 1.6935448815474898, + "grad_norm": 0.25094732642173767, + "learning_rate": 4.160344827586207e-05, + "loss": 0.1326, + "step": 1461 + }, + { + "epoch": 1.6947040498442367, + "grad_norm": 0.24605903029441833, + "learning_rate": 4.159770114942529e-05, + "loss": 0.1155, + "step": 1462 + }, + { + "epoch": 1.6958632181409838, + "grad_norm": 0.3044376075267792, + "learning_rate": 4.1591954022988506e-05, + "loss": 0.1273, + "step": 1463 + }, + { + "epoch": 1.697022386437731, + "grad_norm": 0.25969693064689636, + "learning_rate": 4.158620689655173e-05, + "loss": 0.1261, + "step": 1464 + }, + { + "epoch": 1.6981815547344778, + "grad_norm": 0.2134069949388504, + "learning_rate": 4.158045977011494e-05, + "loss": 0.129, + "step": 1465 + }, + { + "epoch": 1.699340723031225, + "grad_norm": 0.24624426662921906, + "learning_rate": 4.1574712643678165e-05, + "loss": 0.1254, + "step": 1466 + }, + { + "epoch": 1.700499891327972, + "grad_norm": 0.27227187156677246, + "learning_rate": 4.156896551724138e-05, + "loss": 0.1241, + "step": 1467 + }, + { + "epoch": 1.7016590596247192, + "grad_norm": 0.2617661952972412, + "learning_rate": 4.15632183908046e-05, + "loss": 0.1241, + "step": 1468 + }, + { + "epoch": 1.7028182279214663, + "grad_norm": 0.2572365701198578, + "learning_rate": 4.155747126436782e-05, + "loss": 0.1308, + "step": 1469 + }, + { + "epoch": 1.7039773962182134, + "grad_norm": 0.24519048631191254, + "learning_rate": 4.155172413793104e-05, + "loss": 0.1333, + "step": 1470 + }, + { + "epoch": 1.7051365645149605, + "grad_norm": 0.23390959203243256, + "learning_rate": 4.154597701149425e-05, + "loss": 0.1335, + "step": 1471 + }, + { + "epoch": 1.7062957328117077, + "grad_norm": 0.19565661251544952, + "learning_rate": 4.1540229885057474e-05, + "loss": 0.1161, + "step": 1472 + }, + { + "epoch": 1.7074549011084548, + "grad_norm": 0.19742251932621002, + "learning_rate": 4.153448275862069e-05, + "loss": 0.1153, + "step": 1473 + }, + { + "epoch": 1.7086140694052019, + "grad_norm": 0.2252473384141922, + "learning_rate": 4.152873563218391e-05, + "loss": 0.1223, + "step": 1474 + }, + { + "epoch": 1.709773237701949, + "grad_norm": 0.25856614112854004, + "learning_rate": 4.1522988505747126e-05, + "loss": 0.1225, + "step": 1475 + }, + { + "epoch": 1.710932405998696, + "grad_norm": 0.20960158109664917, + "learning_rate": 4.151724137931035e-05, + "loss": 0.1257, + "step": 1476 + }, + { + "epoch": 1.712091574295443, + "grad_norm": 0.26653629541397095, + "learning_rate": 4.151149425287357e-05, + "loss": 0.123, + "step": 1477 + }, + { + "epoch": 1.7132507425921901, + "grad_norm": 0.19926515221595764, + "learning_rate": 4.1505747126436784e-05, + "loss": 0.1199, + "step": 1478 + }, + { + "epoch": 1.7144099108889372, + "grad_norm": 0.2077583223581314, + "learning_rate": 4.15e-05, + "loss": 0.1212, + "step": 1479 + }, + { + "epoch": 1.7155690791856841, + "grad_norm": 0.22840292751789093, + "learning_rate": 4.149425287356322e-05, + "loss": 0.1244, + "step": 1480 + }, + { + "epoch": 1.7167282474824312, + "grad_norm": 0.26933014392852783, + "learning_rate": 4.1488505747126436e-05, + "loss": 0.1388, + "step": 1481 + }, + { + "epoch": 1.7178874157791784, + "grad_norm": 0.2531909942626953, + "learning_rate": 4.148275862068966e-05, + "loss": 0.128, + "step": 1482 + }, + { + "epoch": 1.7190465840759255, + "grad_norm": 0.3011757731437683, + "learning_rate": 4.147701149425288e-05, + "loss": 0.1574, + "step": 1483 + }, + { + "epoch": 1.7202057523726726, + "grad_norm": 0.27856916189193726, + "learning_rate": 4.1471264367816094e-05, + "loss": 0.1275, + "step": 1484 + }, + { + "epoch": 1.7213649206694197, + "grad_norm": 0.23599520325660706, + "learning_rate": 4.1465517241379316e-05, + "loss": 0.1206, + "step": 1485 + }, + { + "epoch": 1.7225240889661668, + "grad_norm": 0.2702353000640869, + "learning_rate": 4.145977011494253e-05, + "loss": 0.1296, + "step": 1486 + }, + { + "epoch": 1.723683257262914, + "grad_norm": 0.21505098044872284, + "learning_rate": 4.1454022988505745e-05, + "loss": 0.1131, + "step": 1487 + }, + { + "epoch": 1.724842425559661, + "grad_norm": 0.27001845836639404, + "learning_rate": 4.144827586206897e-05, + "loss": 0.1265, + "step": 1488 + }, + { + "epoch": 1.7260015938564082, + "grad_norm": 0.2392754852771759, + "learning_rate": 4.144252873563219e-05, + "loss": 0.1159, + "step": 1489 + }, + { + "epoch": 1.727160762153155, + "grad_norm": 0.24627234041690826, + "learning_rate": 4.1436781609195404e-05, + "loss": 0.1284, + "step": 1490 + }, + { + "epoch": 1.7283199304499022, + "grad_norm": 0.23222601413726807, + "learning_rate": 4.1431034482758625e-05, + "loss": 0.1288, + "step": 1491 + }, + { + "epoch": 1.7294790987466493, + "grad_norm": 0.2943344712257385, + "learning_rate": 4.142528735632184e-05, + "loss": 0.1299, + "step": 1492 + }, + { + "epoch": 1.7306382670433964, + "grad_norm": 0.2248132824897766, + "learning_rate": 4.141954022988506e-05, + "loss": 0.1191, + "step": 1493 + }, + { + "epoch": 1.7317974353401433, + "grad_norm": 0.24535109102725983, + "learning_rate": 4.141379310344828e-05, + "loss": 0.129, + "step": 1494 + }, + { + "epoch": 1.7329566036368904, + "grad_norm": 0.2560446262359619, + "learning_rate": 4.140804597701149e-05, + "loss": 0.1343, + "step": 1495 + }, + { + "epoch": 1.7341157719336375, + "grad_norm": 0.23877164721488953, + "learning_rate": 4.1402298850574713e-05, + "loss": 0.1026, + "step": 1496 + }, + { + "epoch": 1.7352749402303846, + "grad_norm": 0.2307940572500229, + "learning_rate": 4.1396551724137935e-05, + "loss": 0.1158, + "step": 1497 + }, + { + "epoch": 1.7364341085271318, + "grad_norm": 0.23847486078739166, + "learning_rate": 4.139080459770115e-05, + "loss": 0.1177, + "step": 1498 + }, + { + "epoch": 1.7375932768238789, + "grad_norm": 0.2667318880558014, + "learning_rate": 4.138505747126437e-05, + "loss": 0.1483, + "step": 1499 + }, + { + "epoch": 1.738752445120626, + "grad_norm": 0.2531675100326538, + "learning_rate": 4.1379310344827587e-05, + "loss": 0.1209, + "step": 1500 + }, + { + "epoch": 1.739911613417373, + "grad_norm": 0.4089076817035675, + "learning_rate": 4.13735632183908e-05, + "loss": 0.1485, + "step": 1501 + }, + { + "epoch": 1.7410707817141202, + "grad_norm": 0.2458110749721527, + "learning_rate": 4.136781609195402e-05, + "loss": 0.1252, + "step": 1502 + }, + { + "epoch": 1.7422299500108673, + "grad_norm": 0.23186376690864563, + "learning_rate": 4.1362068965517245e-05, + "loss": 0.1292, + "step": 1503 + }, + { + "epoch": 1.7433891183076144, + "grad_norm": 0.24878862500190735, + "learning_rate": 4.1356321839080466e-05, + "loss": 0.1307, + "step": 1504 + }, + { + "epoch": 1.7445482866043613, + "grad_norm": 0.2361961454153061, + "learning_rate": 4.135057471264368e-05, + "loss": 0.118, + "step": 1505 + }, + { + "epoch": 1.7457074549011085, + "grad_norm": 0.24874532222747803, + "learning_rate": 4.1344827586206896e-05, + "loss": 0.1122, + "step": 1506 + }, + { + "epoch": 1.7468666231978556, + "grad_norm": 0.2600504159927368, + "learning_rate": 4.133908045977012e-05, + "loss": 0.1229, + "step": 1507 + }, + { + "epoch": 1.7480257914946025, + "grad_norm": 0.28750351071357727, + "learning_rate": 4.133333333333333e-05, + "loss": 0.1385, + "step": 1508 + }, + { + "epoch": 1.7491849597913496, + "grad_norm": 0.26855355501174927, + "learning_rate": 4.1327586206896555e-05, + "loss": 0.1394, + "step": 1509 + }, + { + "epoch": 1.7503441280880967, + "grad_norm": 0.28046318888664246, + "learning_rate": 4.1321839080459776e-05, + "loss": 0.1362, + "step": 1510 + }, + { + "epoch": 1.7515032963848438, + "grad_norm": 0.27256345748901367, + "learning_rate": 4.131609195402299e-05, + "loss": 0.1283, + "step": 1511 + }, + { + "epoch": 1.752662464681591, + "grad_norm": 0.22228793799877167, + "learning_rate": 4.131034482758621e-05, + "loss": 0.1104, + "step": 1512 + }, + { + "epoch": 1.753821632978338, + "grad_norm": 0.24911850690841675, + "learning_rate": 4.130459770114943e-05, + "loss": 0.1212, + "step": 1513 + }, + { + "epoch": 1.7549808012750852, + "grad_norm": 0.21466509997844696, + "learning_rate": 4.129885057471264e-05, + "loss": 0.1141, + "step": 1514 + }, + { + "epoch": 1.7561399695718323, + "grad_norm": 0.2404317706823349, + "learning_rate": 4.1293103448275864e-05, + "loss": 0.119, + "step": 1515 + }, + { + "epoch": 1.7572991378685794, + "grad_norm": 0.25505301356315613, + "learning_rate": 4.128735632183908e-05, + "loss": 0.115, + "step": 1516 + }, + { + "epoch": 1.7584583061653265, + "grad_norm": 0.2898204028606415, + "learning_rate": 4.12816091954023e-05, + "loss": 0.1508, + "step": 1517 + }, + { + "epoch": 1.7596174744620736, + "grad_norm": 0.24544641375541687, + "learning_rate": 4.127586206896552e-05, + "loss": 0.1278, + "step": 1518 + }, + { + "epoch": 1.7607766427588205, + "grad_norm": 0.21617454290390015, + "learning_rate": 4.127011494252874e-05, + "loss": 0.1206, + "step": 1519 + }, + { + "epoch": 1.7619358110555676, + "grad_norm": 0.22762687504291534, + "learning_rate": 4.126436781609195e-05, + "loss": 0.1354, + "step": 1520 + }, + { + "epoch": 1.7630949793523147, + "grad_norm": 0.22020873427391052, + "learning_rate": 4.1258620689655174e-05, + "loss": 0.1361, + "step": 1521 + }, + { + "epoch": 1.7642541476490616, + "grad_norm": 0.21312019228935242, + "learning_rate": 4.125287356321839e-05, + "loss": 0.1258, + "step": 1522 + }, + { + "epoch": 1.7654133159458087, + "grad_norm": 0.22142820060253143, + "learning_rate": 4.124712643678161e-05, + "loss": 0.1167, + "step": 1523 + }, + { + "epoch": 1.7665724842425559, + "grad_norm": 0.23192018270492554, + "learning_rate": 4.124137931034483e-05, + "loss": 0.1149, + "step": 1524 + }, + { + "epoch": 1.767731652539303, + "grad_norm": 0.23699016869068146, + "learning_rate": 4.123563218390805e-05, + "loss": 0.1356, + "step": 1525 + }, + { + "epoch": 1.76889082083605, + "grad_norm": 0.25063955783843994, + "learning_rate": 4.122988505747127e-05, + "loss": 0.1232, + "step": 1526 + }, + { + "epoch": 1.7700499891327972, + "grad_norm": 0.2514595687389374, + "learning_rate": 4.1224137931034484e-05, + "loss": 0.1332, + "step": 1527 + }, + { + "epoch": 1.7712091574295443, + "grad_norm": 0.24346187710762024, + "learning_rate": 4.12183908045977e-05, + "loss": 0.1239, + "step": 1528 + }, + { + "epoch": 1.7723683257262914, + "grad_norm": 0.22228626906871796, + "learning_rate": 4.121264367816092e-05, + "loss": 0.1061, + "step": 1529 + }, + { + "epoch": 1.7735274940230386, + "grad_norm": 0.2860787510871887, + "learning_rate": 4.120689655172414e-05, + "loss": 0.1254, + "step": 1530 + }, + { + "epoch": 1.7746866623197857, + "grad_norm": 0.2642548680305481, + "learning_rate": 4.1201149425287364e-05, + "loss": 0.126, + "step": 1531 + }, + { + "epoch": 1.7758458306165328, + "grad_norm": 0.2488432377576828, + "learning_rate": 4.119540229885058e-05, + "loss": 0.1175, + "step": 1532 + }, + { + "epoch": 1.7770049989132797, + "grad_norm": 0.23602858185768127, + "learning_rate": 4.1189655172413794e-05, + "loss": 0.1244, + "step": 1533 + }, + { + "epoch": 1.7781641672100268, + "grad_norm": 0.23683659732341766, + "learning_rate": 4.1183908045977015e-05, + "loss": 0.1186, + "step": 1534 + }, + { + "epoch": 1.779323335506774, + "grad_norm": 0.2419913113117218, + "learning_rate": 4.117816091954023e-05, + "loss": 0.1364, + "step": 1535 + }, + { + "epoch": 1.780482503803521, + "grad_norm": 0.26852262020111084, + "learning_rate": 4.1172413793103445e-05, + "loss": 0.1287, + "step": 1536 + }, + { + "epoch": 1.781641672100268, + "grad_norm": 0.2520958185195923, + "learning_rate": 4.116666666666667e-05, + "loss": 0.1381, + "step": 1537 + }, + { + "epoch": 1.782800840397015, + "grad_norm": 0.2294151484966278, + "learning_rate": 4.116091954022989e-05, + "loss": 0.1458, + "step": 1538 + }, + { + "epoch": 1.7839600086937621, + "grad_norm": 0.20725025236606598, + "learning_rate": 4.11551724137931e-05, + "loss": 0.1358, + "step": 1539 + }, + { + "epoch": 1.7851191769905093, + "grad_norm": 0.22536805272102356, + "learning_rate": 4.1149425287356325e-05, + "loss": 0.1273, + "step": 1540 + }, + { + "epoch": 1.7862783452872564, + "grad_norm": 0.2271406203508377, + "learning_rate": 4.114367816091954e-05, + "loss": 0.1254, + "step": 1541 + }, + { + "epoch": 1.7874375135840035, + "grad_norm": 0.25207868218421936, + "learning_rate": 4.113793103448276e-05, + "loss": 0.1325, + "step": 1542 + }, + { + "epoch": 1.7885966818807506, + "grad_norm": 0.22057871520519257, + "learning_rate": 4.1132183908045976e-05, + "loss": 0.1287, + "step": 1543 + }, + { + "epoch": 1.7897558501774977, + "grad_norm": 0.21857698261737823, + "learning_rate": 4.11264367816092e-05, + "loss": 0.118, + "step": 1544 + }, + { + "epoch": 1.7909150184742448, + "grad_norm": 0.26904162764549255, + "learning_rate": 4.112068965517242e-05, + "loss": 0.137, + "step": 1545 + }, + { + "epoch": 1.792074186770992, + "grad_norm": 0.20811136066913605, + "learning_rate": 4.1114942528735635e-05, + "loss": 0.1148, + "step": 1546 + }, + { + "epoch": 1.7932333550677388, + "grad_norm": 0.2898993194103241, + "learning_rate": 4.110919540229885e-05, + "loss": 0.1396, + "step": 1547 + }, + { + "epoch": 1.794392523364486, + "grad_norm": 0.2406226396560669, + "learning_rate": 4.110344827586207e-05, + "loss": 0.1177, + "step": 1548 + }, + { + "epoch": 1.795551691661233, + "grad_norm": 0.21909652650356293, + "learning_rate": 4.1097701149425286e-05, + "loss": 0.1221, + "step": 1549 + }, + { + "epoch": 1.7967108599579802, + "grad_norm": 0.24359643459320068, + "learning_rate": 4.109195402298851e-05, + "loss": 0.1249, + "step": 1550 + }, + { + "epoch": 1.797870028254727, + "grad_norm": 0.20829685032367706, + "learning_rate": 4.108620689655173e-05, + "loss": 0.1155, + "step": 1551 + }, + { + "epoch": 1.7990291965514742, + "grad_norm": 0.34604066610336304, + "learning_rate": 4.1080459770114944e-05, + "loss": 0.1366, + "step": 1552 + }, + { + "epoch": 1.8001883648482213, + "grad_norm": 0.2357710599899292, + "learning_rate": 4.1074712643678166e-05, + "loss": 0.1238, + "step": 1553 + }, + { + "epoch": 1.8013475331449684, + "grad_norm": 0.2505914866924286, + "learning_rate": 4.106896551724138e-05, + "loss": 0.1381, + "step": 1554 + }, + { + "epoch": 1.8025067014417155, + "grad_norm": 0.3045206665992737, + "learning_rate": 4.1063218390804596e-05, + "loss": 0.125, + "step": 1555 + }, + { + "epoch": 1.8036658697384627, + "grad_norm": 0.22790023684501648, + "learning_rate": 4.105747126436782e-05, + "loss": 0.1189, + "step": 1556 + }, + { + "epoch": 1.8048250380352098, + "grad_norm": 0.267311155796051, + "learning_rate": 4.105172413793103e-05, + "loss": 0.133, + "step": 1557 + }, + { + "epoch": 1.8059842063319569, + "grad_norm": 0.2867993712425232, + "learning_rate": 4.1045977011494254e-05, + "loss": 0.1197, + "step": 1558 + }, + { + "epoch": 1.807143374628704, + "grad_norm": 0.2740671634674072, + "learning_rate": 4.1040229885057476e-05, + "loss": 0.1247, + "step": 1559 + }, + { + "epoch": 1.8083025429254511, + "grad_norm": 0.23246349394321442, + "learning_rate": 4.103448275862069e-05, + "loss": 0.1204, + "step": 1560 + }, + { + "epoch": 1.8094617112221982, + "grad_norm": 0.2556561231613159, + "learning_rate": 4.102873563218391e-05, + "loss": 0.1381, + "step": 1561 + }, + { + "epoch": 1.8106208795189451, + "grad_norm": 0.22063712775707245, + "learning_rate": 4.102298850574713e-05, + "loss": 0.1288, + "step": 1562 + }, + { + "epoch": 1.8117800478156922, + "grad_norm": 0.26971542835235596, + "learning_rate": 4.101724137931034e-05, + "loss": 0.1348, + "step": 1563 + }, + { + "epoch": 1.8129392161124394, + "grad_norm": 0.28952643275260925, + "learning_rate": 4.1011494252873564e-05, + "loss": 0.1408, + "step": 1564 + }, + { + "epoch": 1.8140983844091862, + "grad_norm": 0.24396105110645294, + "learning_rate": 4.1005747126436786e-05, + "loss": 0.1284, + "step": 1565 + }, + { + "epoch": 1.8152575527059334, + "grad_norm": 0.2277979552745819, + "learning_rate": 4.1e-05, + "loss": 0.1234, + "step": 1566 + }, + { + "epoch": 1.8164167210026805, + "grad_norm": 0.1920943409204483, + "learning_rate": 4.099425287356322e-05, + "loss": 0.1183, + "step": 1567 + }, + { + "epoch": 1.8175758892994276, + "grad_norm": 0.25110727548599243, + "learning_rate": 4.098850574712644e-05, + "loss": 0.1338, + "step": 1568 + }, + { + "epoch": 1.8187350575961747, + "grad_norm": 0.2476564645767212, + "learning_rate": 4.098275862068966e-05, + "loss": 0.1311, + "step": 1569 + }, + { + "epoch": 1.8198942258929218, + "grad_norm": 0.2378225028514862, + "learning_rate": 4.0977011494252874e-05, + "loss": 0.131, + "step": 1570 + }, + { + "epoch": 1.821053394189669, + "grad_norm": 0.2154211848974228, + "learning_rate": 4.0971264367816095e-05, + "loss": 0.1192, + "step": 1571 + }, + { + "epoch": 1.822212562486416, + "grad_norm": 0.23698244988918304, + "learning_rate": 4.096551724137932e-05, + "loss": 0.1302, + "step": 1572 + }, + { + "epoch": 1.8233717307831632, + "grad_norm": 0.22462879121303558, + "learning_rate": 4.095977011494253e-05, + "loss": 0.1225, + "step": 1573 + }, + { + "epoch": 1.8245308990799103, + "grad_norm": 0.21854135394096375, + "learning_rate": 4.095402298850575e-05, + "loss": 0.1185, + "step": 1574 + }, + { + "epoch": 1.8256900673766574, + "grad_norm": 0.24488233029842377, + "learning_rate": 4.094827586206897e-05, + "loss": 0.1434, + "step": 1575 + }, + { + "epoch": 1.8268492356734043, + "grad_norm": 0.22526147961616516, + "learning_rate": 4.094252873563218e-05, + "loss": 0.1272, + "step": 1576 + }, + { + "epoch": 1.8280084039701514, + "grad_norm": 0.206687793135643, + "learning_rate": 4.09367816091954e-05, + "loss": 0.1109, + "step": 1577 + }, + { + "epoch": 1.8291675722668985, + "grad_norm": 0.26105666160583496, + "learning_rate": 4.093103448275863e-05, + "loss": 0.1389, + "step": 1578 + }, + { + "epoch": 1.8303267405636456, + "grad_norm": 0.3510555326938629, + "learning_rate": 4.092528735632184e-05, + "loss": 0.1294, + "step": 1579 + }, + { + "epoch": 1.8314859088603925, + "grad_norm": 0.238253653049469, + "learning_rate": 4.091954022988506e-05, + "loss": 0.123, + "step": 1580 + }, + { + "epoch": 1.8326450771571396, + "grad_norm": 0.2679709792137146, + "learning_rate": 4.091379310344828e-05, + "loss": 0.1326, + "step": 1581 + }, + { + "epoch": 1.8338042454538868, + "grad_norm": 0.22641777992248535, + "learning_rate": 4.090804597701149e-05, + "loss": 0.1299, + "step": 1582 + }, + { + "epoch": 1.8349634137506339, + "grad_norm": 0.25320807099342346, + "learning_rate": 4.0902298850574715e-05, + "loss": 0.1223, + "step": 1583 + }, + { + "epoch": 1.836122582047381, + "grad_norm": 0.24081723392009735, + "learning_rate": 4.089655172413793e-05, + "loss": 0.1327, + "step": 1584 + }, + { + "epoch": 1.837281750344128, + "grad_norm": 0.2431461215019226, + "learning_rate": 4.089080459770115e-05, + "loss": 0.1347, + "step": 1585 + }, + { + "epoch": 1.8384409186408752, + "grad_norm": 0.2519475817680359, + "learning_rate": 4.088505747126437e-05, + "loss": 0.1438, + "step": 1586 + }, + { + "epoch": 1.8396000869376223, + "grad_norm": 0.252816766500473, + "learning_rate": 4.087931034482759e-05, + "loss": 0.138, + "step": 1587 + }, + { + "epoch": 1.8407592552343695, + "grad_norm": 0.22462011873722076, + "learning_rate": 4.087356321839081e-05, + "loss": 0.1192, + "step": 1588 + }, + { + "epoch": 1.8419184235311166, + "grad_norm": 0.28114578127861023, + "learning_rate": 4.0867816091954025e-05, + "loss": 0.1384, + "step": 1589 + }, + { + "epoch": 1.8430775918278635, + "grad_norm": 0.24173279106616974, + "learning_rate": 4.086206896551724e-05, + "loss": 0.1389, + "step": 1590 + }, + { + "epoch": 1.8442367601246106, + "grad_norm": 0.19824741780757904, + "learning_rate": 4.085632183908046e-05, + "loss": 0.1117, + "step": 1591 + }, + { + "epoch": 1.8453959284213577, + "grad_norm": 0.2025434970855713, + "learning_rate": 4.085057471264368e-05, + "loss": 0.1103, + "step": 1592 + }, + { + "epoch": 1.8465550967181048, + "grad_norm": 0.2460830956697464, + "learning_rate": 4.08448275862069e-05, + "loss": 0.1327, + "step": 1593 + }, + { + "epoch": 1.8477142650148517, + "grad_norm": 0.2609751224517822, + "learning_rate": 4.083908045977012e-05, + "loss": 0.1315, + "step": 1594 + }, + { + "epoch": 1.8488734333115988, + "grad_norm": 0.20882025361061096, + "learning_rate": 4.0833333333333334e-05, + "loss": 0.1076, + "step": 1595 + }, + { + "epoch": 1.850032601608346, + "grad_norm": 0.20100267231464386, + "learning_rate": 4.0827586206896556e-05, + "loss": 0.1116, + "step": 1596 + }, + { + "epoch": 1.851191769905093, + "grad_norm": 0.2182759940624237, + "learning_rate": 4.082183908045977e-05, + "loss": 0.135, + "step": 1597 + }, + { + "epoch": 1.8523509382018402, + "grad_norm": 0.21987977623939514, + "learning_rate": 4.0816091954022986e-05, + "loss": 0.1287, + "step": 1598 + }, + { + "epoch": 1.8535101064985873, + "grad_norm": 0.2634388506412506, + "learning_rate": 4.0810344827586214e-05, + "loss": 0.1267, + "step": 1599 + }, + { + "epoch": 1.8546692747953344, + "grad_norm": 0.2175815850496292, + "learning_rate": 4.080459770114943e-05, + "loss": 0.1175, + "step": 1600 + }, + { + "epoch": 1.8558284430920815, + "grad_norm": 0.24021421372890472, + "learning_rate": 4.0798850574712644e-05, + "loss": 0.1298, + "step": 1601 + }, + { + "epoch": 1.8569876113888286, + "grad_norm": 0.2455061972141266, + "learning_rate": 4.0793103448275866e-05, + "loss": 0.1318, + "step": 1602 + }, + { + "epoch": 1.8581467796855757, + "grad_norm": 0.5049688220024109, + "learning_rate": 4.078735632183908e-05, + "loss": 0.1285, + "step": 1603 + }, + { + "epoch": 1.8593059479823228, + "grad_norm": 0.3722595274448395, + "learning_rate": 4.0781609195402295e-05, + "loss": 0.1359, + "step": 1604 + }, + { + "epoch": 1.8604651162790697, + "grad_norm": 0.3047929108142853, + "learning_rate": 4.077586206896552e-05, + "loss": 0.1284, + "step": 1605 + }, + { + "epoch": 1.8616242845758169, + "grad_norm": 0.48975053429603577, + "learning_rate": 4.077011494252874e-05, + "loss": 0.1326, + "step": 1606 + }, + { + "epoch": 1.862783452872564, + "grad_norm": 0.27146080136299133, + "learning_rate": 4.076436781609196e-05, + "loss": 0.1237, + "step": 1607 + }, + { + "epoch": 1.8639426211693109, + "grad_norm": 0.26589420437812805, + "learning_rate": 4.0758620689655175e-05, + "loss": 0.1225, + "step": 1608 + }, + { + "epoch": 1.865101789466058, + "grad_norm": 0.2550943195819855, + "learning_rate": 4.075287356321839e-05, + "loss": 0.1348, + "step": 1609 + }, + { + "epoch": 1.866260957762805, + "grad_norm": 0.22643890976905823, + "learning_rate": 4.074712643678161e-05, + "loss": 0.1199, + "step": 1610 + }, + { + "epoch": 1.8674201260595522, + "grad_norm": 0.28529438376426697, + "learning_rate": 4.074137931034483e-05, + "loss": 0.1186, + "step": 1611 + }, + { + "epoch": 1.8685792943562993, + "grad_norm": 0.24935007095336914, + "learning_rate": 4.073563218390805e-05, + "loss": 0.1218, + "step": 1612 + }, + { + "epoch": 1.8697384626530464, + "grad_norm": 0.23699697852134705, + "learning_rate": 4.072988505747127e-05, + "loss": 0.136, + "step": 1613 + }, + { + "epoch": 1.8708976309497936, + "grad_norm": 0.4210551381111145, + "learning_rate": 4.0724137931034485e-05, + "loss": 0.1095, + "step": 1614 + }, + { + "epoch": 1.8720567992465407, + "grad_norm": 0.2276671975851059, + "learning_rate": 4.071839080459771e-05, + "loss": 0.1346, + "step": 1615 + }, + { + "epoch": 1.8732159675432878, + "grad_norm": 0.22973059117794037, + "learning_rate": 4.071264367816092e-05, + "loss": 0.122, + "step": 1616 + }, + { + "epoch": 1.874375135840035, + "grad_norm": 0.37576723098754883, + "learning_rate": 4.070689655172414e-05, + "loss": 0.1173, + "step": 1617 + }, + { + "epoch": 1.875534304136782, + "grad_norm": 0.23602044582366943, + "learning_rate": 4.070114942528736e-05, + "loss": 0.1426, + "step": 1618 + }, + { + "epoch": 1.876693472433529, + "grad_norm": 0.22846823930740356, + "learning_rate": 4.069540229885058e-05, + "loss": 0.1252, + "step": 1619 + }, + { + "epoch": 1.877852640730276, + "grad_norm": 0.21576084196567535, + "learning_rate": 4.0689655172413795e-05, + "loss": 0.1176, + "step": 1620 + }, + { + "epoch": 1.8790118090270231, + "grad_norm": 0.21635417640209198, + "learning_rate": 4.0683908045977017e-05, + "loss": 0.1256, + "step": 1621 + }, + { + "epoch": 1.88017097732377, + "grad_norm": 0.20854774117469788, + "learning_rate": 4.067816091954023e-05, + "loss": 0.1167, + "step": 1622 + }, + { + "epoch": 1.8813301456205171, + "grad_norm": 0.25999489426612854, + "learning_rate": 4.0672413793103446e-05, + "loss": 0.1225, + "step": 1623 + }, + { + "epoch": 1.8824893139172643, + "grad_norm": 0.2581290602684021, + "learning_rate": 4.066666666666667e-05, + "loss": 0.1254, + "step": 1624 + }, + { + "epoch": 1.8836484822140114, + "grad_norm": 0.2655247449874878, + "learning_rate": 4.066091954022988e-05, + "loss": 0.1326, + "step": 1625 + }, + { + "epoch": 1.8848076505107585, + "grad_norm": 0.2115846574306488, + "learning_rate": 4.0655172413793105e-05, + "loss": 0.1275, + "step": 1626 + }, + { + "epoch": 1.8859668188075056, + "grad_norm": 0.23177579045295715, + "learning_rate": 4.0649425287356326e-05, + "loss": 0.1169, + "step": 1627 + }, + { + "epoch": 1.8871259871042527, + "grad_norm": 0.45042136311531067, + "learning_rate": 4.064367816091954e-05, + "loss": 0.1309, + "step": 1628 + }, + { + "epoch": 1.8882851554009998, + "grad_norm": 0.26189303398132324, + "learning_rate": 4.063793103448276e-05, + "loss": 0.1309, + "step": 1629 + }, + { + "epoch": 1.889444323697747, + "grad_norm": 0.2205989956855774, + "learning_rate": 4.063218390804598e-05, + "loss": 0.1242, + "step": 1630 + }, + { + "epoch": 1.890603491994494, + "grad_norm": 0.3108113706111908, + "learning_rate": 4.062643678160919e-05, + "loss": 0.1392, + "step": 1631 + }, + { + "epoch": 1.8917626602912412, + "grad_norm": 0.2217811793088913, + "learning_rate": 4.0620689655172414e-05, + "loss": 0.1243, + "step": 1632 + }, + { + "epoch": 1.892921828587988, + "grad_norm": 0.21977943181991577, + "learning_rate": 4.0614942528735636e-05, + "loss": 0.1202, + "step": 1633 + }, + { + "epoch": 1.8940809968847352, + "grad_norm": 0.20839521288871765, + "learning_rate": 4.060919540229886e-05, + "loss": 0.1205, + "step": 1634 + }, + { + "epoch": 1.8952401651814823, + "grad_norm": 0.2656031548976898, + "learning_rate": 4.060344827586207e-05, + "loss": 0.1355, + "step": 1635 + }, + { + "epoch": 1.8963993334782294, + "grad_norm": 0.23771442472934723, + "learning_rate": 4.059770114942529e-05, + "loss": 0.1294, + "step": 1636 + }, + { + "epoch": 1.8975585017749763, + "grad_norm": 0.27091866731643677, + "learning_rate": 4.059195402298851e-05, + "loss": 0.1364, + "step": 1637 + }, + { + "epoch": 1.8987176700717234, + "grad_norm": 0.250888854265213, + "learning_rate": 4.0586206896551724e-05, + "loss": 0.1362, + "step": 1638 + }, + { + "epoch": 1.8998768383684705, + "grad_norm": 0.26287785172462463, + "learning_rate": 4.0580459770114946e-05, + "loss": 0.1268, + "step": 1639 + }, + { + "epoch": 1.9010360066652177, + "grad_norm": 0.2068604677915573, + "learning_rate": 4.057471264367817e-05, + "loss": 0.1186, + "step": 1640 + }, + { + "epoch": 1.9021951749619648, + "grad_norm": 0.27759426832199097, + "learning_rate": 4.056896551724138e-05, + "loss": 0.1424, + "step": 1641 + }, + { + "epoch": 1.903354343258712, + "grad_norm": 0.2885616719722748, + "learning_rate": 4.05632183908046e-05, + "loss": 0.1431, + "step": 1642 + }, + { + "epoch": 1.904513511555459, + "grad_norm": 0.2596352994441986, + "learning_rate": 4.055747126436782e-05, + "loss": 0.1301, + "step": 1643 + }, + { + "epoch": 1.9056726798522061, + "grad_norm": 0.22041890025138855, + "learning_rate": 4.0551724137931034e-05, + "loss": 0.1252, + "step": 1644 + }, + { + "epoch": 1.9068318481489532, + "grad_norm": 0.23935045301914215, + "learning_rate": 4.0545977011494256e-05, + "loss": 0.1234, + "step": 1645 + }, + { + "epoch": 1.9079910164457003, + "grad_norm": 0.23104766011238098, + "learning_rate": 4.054022988505747e-05, + "loss": 0.1247, + "step": 1646 + }, + { + "epoch": 1.9091501847424472, + "grad_norm": 0.25777825713157654, + "learning_rate": 4.053448275862069e-05, + "loss": 0.1221, + "step": 1647 + }, + { + "epoch": 1.9103093530391944, + "grad_norm": 0.21101577579975128, + "learning_rate": 4.0528735632183914e-05, + "loss": 0.1213, + "step": 1648 + }, + { + "epoch": 1.9114685213359415, + "grad_norm": 0.27736735343933105, + "learning_rate": 4.052298850574713e-05, + "loss": 0.1315, + "step": 1649 + }, + { + "epoch": 1.9126276896326886, + "grad_norm": 0.2874712646007538, + "learning_rate": 4.0517241379310344e-05, + "loss": 0.1267, + "step": 1650 + }, + { + "epoch": 1.9137868579294355, + "grad_norm": 0.27713993191719055, + "learning_rate": 4.0511494252873565e-05, + "loss": 0.1236, + "step": 1651 + }, + { + "epoch": 1.9149460262261826, + "grad_norm": 0.22386078536510468, + "learning_rate": 4.050574712643678e-05, + "loss": 0.1147, + "step": 1652 + }, + { + "epoch": 1.9161051945229297, + "grad_norm": 0.21959929168224335, + "learning_rate": 4.05e-05, + "loss": 0.1079, + "step": 1653 + }, + { + "epoch": 1.9172643628196768, + "grad_norm": 0.2387046068906784, + "learning_rate": 4.0494252873563224e-05, + "loss": 0.1267, + "step": 1654 + }, + { + "epoch": 1.918423531116424, + "grad_norm": 0.22587932646274567, + "learning_rate": 4.048850574712644e-05, + "loss": 0.124, + "step": 1655 + }, + { + "epoch": 1.919582699413171, + "grad_norm": 0.2760310471057892, + "learning_rate": 4.048275862068966e-05, + "loss": 0.1384, + "step": 1656 + }, + { + "epoch": 1.9207418677099182, + "grad_norm": 0.24281680583953857, + "learning_rate": 4.0477011494252875e-05, + "loss": 0.1264, + "step": 1657 + }, + { + "epoch": 1.9219010360066653, + "grad_norm": 0.23182785511016846, + "learning_rate": 4.047126436781609e-05, + "loss": 0.1184, + "step": 1658 + }, + { + "epoch": 1.9230602043034124, + "grad_norm": 0.2864990532398224, + "learning_rate": 4.046551724137931e-05, + "loss": 0.1398, + "step": 1659 + }, + { + "epoch": 1.9242193726001595, + "grad_norm": 0.267501562833786, + "learning_rate": 4.045977011494253e-05, + "loss": 0.135, + "step": 1660 + }, + { + "epoch": 1.9253785408969066, + "grad_norm": 0.6067526936531067, + "learning_rate": 4.045402298850575e-05, + "loss": 0.1321, + "step": 1661 + }, + { + "epoch": 1.9265377091936535, + "grad_norm": 0.2144288271665573, + "learning_rate": 4.044827586206897e-05, + "loss": 0.1195, + "step": 1662 + }, + { + "epoch": 1.9276968774904006, + "grad_norm": 0.2767769396305084, + "learning_rate": 4.0442528735632185e-05, + "loss": 0.121, + "step": 1663 + }, + { + "epoch": 1.9288560457871478, + "grad_norm": 0.258683443069458, + "learning_rate": 4.0436781609195406e-05, + "loss": 0.1358, + "step": 1664 + }, + { + "epoch": 1.9300152140838946, + "grad_norm": 0.3497917056083679, + "learning_rate": 4.043103448275862e-05, + "loss": 0.1271, + "step": 1665 + }, + { + "epoch": 1.9311743823806418, + "grad_norm": 0.2050793319940567, + "learning_rate": 4.0425287356321836e-05, + "loss": 0.0987, + "step": 1666 + }, + { + "epoch": 1.9323335506773889, + "grad_norm": 0.25068920850753784, + "learning_rate": 4.041954022988506e-05, + "loss": 0.1228, + "step": 1667 + }, + { + "epoch": 1.933492718974136, + "grad_norm": 0.31791725754737854, + "learning_rate": 4.041379310344828e-05, + "loss": 0.1245, + "step": 1668 + }, + { + "epoch": 1.934651887270883, + "grad_norm": 0.38801631331443787, + "learning_rate": 4.0408045977011495e-05, + "loss": 0.1264, + "step": 1669 + }, + { + "epoch": 1.9358110555676302, + "grad_norm": 0.26199209690093994, + "learning_rate": 4.0402298850574716e-05, + "loss": 0.1332, + "step": 1670 + }, + { + "epoch": 1.9369702238643773, + "grad_norm": 0.2558247148990631, + "learning_rate": 4.039655172413793e-05, + "loss": 0.126, + "step": 1671 + }, + { + "epoch": 1.9381293921611245, + "grad_norm": 0.2440565675497055, + "learning_rate": 4.039080459770115e-05, + "loss": 0.1265, + "step": 1672 + }, + { + "epoch": 1.9392885604578716, + "grad_norm": 0.35634905099868774, + "learning_rate": 4.038505747126437e-05, + "loss": 0.1492, + "step": 1673 + }, + { + "epoch": 1.9404477287546187, + "grad_norm": 0.2178521454334259, + "learning_rate": 4.037931034482759e-05, + "loss": 0.1166, + "step": 1674 + }, + { + "epoch": 1.9416068970513658, + "grad_norm": 0.25191420316696167, + "learning_rate": 4.037356321839081e-05, + "loss": 0.1329, + "step": 1675 + }, + { + "epoch": 1.9427660653481127, + "grad_norm": 0.25486209988594055, + "learning_rate": 4.0367816091954026e-05, + "loss": 0.1294, + "step": 1676 + }, + { + "epoch": 1.9439252336448598, + "grad_norm": 0.21692296862602234, + "learning_rate": 4.036206896551724e-05, + "loss": 0.1204, + "step": 1677 + }, + { + "epoch": 1.945084401941607, + "grad_norm": 0.24165906012058258, + "learning_rate": 4.035632183908046e-05, + "loss": 0.1244, + "step": 1678 + }, + { + "epoch": 1.9462435702383538, + "grad_norm": 0.2571072578430176, + "learning_rate": 4.035057471264368e-05, + "loss": 0.1387, + "step": 1679 + }, + { + "epoch": 1.947402738535101, + "grad_norm": 0.22820697724819183, + "learning_rate": 4.03448275862069e-05, + "loss": 0.1331, + "step": 1680 + }, + { + "epoch": 1.948561906831848, + "grad_norm": 0.2674961984157562, + "learning_rate": 4.033908045977012e-05, + "loss": 0.128, + "step": 1681 + }, + { + "epoch": 1.9497210751285952, + "grad_norm": 0.23374570906162262, + "learning_rate": 4.0333333333333336e-05, + "loss": 0.1256, + "step": 1682 + }, + { + "epoch": 1.9508802434253423, + "grad_norm": 0.23953098058700562, + "learning_rate": 4.032758620689656e-05, + "loss": 0.1354, + "step": 1683 + }, + { + "epoch": 1.9520394117220894, + "grad_norm": 0.21969419717788696, + "learning_rate": 4.032183908045977e-05, + "loss": 0.1273, + "step": 1684 + }, + { + "epoch": 1.9531985800188365, + "grad_norm": 0.2675417959690094, + "learning_rate": 4.031609195402299e-05, + "loss": 0.145, + "step": 1685 + }, + { + "epoch": 1.9543577483155836, + "grad_norm": 0.22775276005268097, + "learning_rate": 4.031034482758621e-05, + "loss": 0.1368, + "step": 1686 + }, + { + "epoch": 1.9555169166123307, + "grad_norm": 0.25551965832710266, + "learning_rate": 4.0304597701149424e-05, + "loss": 0.1395, + "step": 1687 + }, + { + "epoch": 1.9566760849090779, + "grad_norm": 0.20460672676563263, + "learning_rate": 4.0298850574712645e-05, + "loss": 0.1111, + "step": 1688 + }, + { + "epoch": 1.957835253205825, + "grad_norm": 0.42244213819503784, + "learning_rate": 4.029310344827587e-05, + "loss": 0.1504, + "step": 1689 + }, + { + "epoch": 1.9589944215025719, + "grad_norm": 0.2375163733959198, + "learning_rate": 4.028735632183908e-05, + "loss": 0.1243, + "step": 1690 + }, + { + "epoch": 1.960153589799319, + "grad_norm": 0.23860155045986176, + "learning_rate": 4.0281609195402304e-05, + "loss": 0.1317, + "step": 1691 + }, + { + "epoch": 1.961312758096066, + "grad_norm": 0.20756982266902924, + "learning_rate": 4.027586206896552e-05, + "loss": 0.1252, + "step": 1692 + }, + { + "epoch": 1.9624719263928132, + "grad_norm": 0.25197523832321167, + "learning_rate": 4.0270114942528733e-05, + "loss": 0.1406, + "step": 1693 + }, + { + "epoch": 1.96363109468956, + "grad_norm": 0.2613564133644104, + "learning_rate": 4.0264367816091955e-05, + "loss": 0.1342, + "step": 1694 + }, + { + "epoch": 1.9647902629863072, + "grad_norm": 0.21359771490097046, + "learning_rate": 4.025862068965518e-05, + "loss": 0.1171, + "step": 1695 + }, + { + "epoch": 1.9659494312830543, + "grad_norm": 0.2883491516113281, + "learning_rate": 4.025287356321839e-05, + "loss": 0.1262, + "step": 1696 + }, + { + "epoch": 1.9671085995798014, + "grad_norm": 0.22171908617019653, + "learning_rate": 4.0247126436781613e-05, + "loss": 0.1259, + "step": 1697 + }, + { + "epoch": 1.9682677678765486, + "grad_norm": 0.2951849699020386, + "learning_rate": 4.024137931034483e-05, + "loss": 0.1447, + "step": 1698 + }, + { + "epoch": 1.9694269361732957, + "grad_norm": 0.2592881917953491, + "learning_rate": 4.023563218390804e-05, + "loss": 0.1294, + "step": 1699 + }, + { + "epoch": 1.9705861044700428, + "grad_norm": 0.22371990978717804, + "learning_rate": 4.0229885057471265e-05, + "loss": 0.1238, + "step": 1700 + }, + { + "epoch": 1.97174527276679, + "grad_norm": 0.23679739236831665, + "learning_rate": 4.0224137931034487e-05, + "loss": 0.128, + "step": 1701 + }, + { + "epoch": 1.972904441063537, + "grad_norm": 0.208289235830307, + "learning_rate": 4.021839080459771e-05, + "loss": 0.1221, + "step": 1702 + }, + { + "epoch": 1.9740636093602841, + "grad_norm": 0.3069906532764435, + "learning_rate": 4.021264367816092e-05, + "loss": 0.1264, + "step": 1703 + }, + { + "epoch": 1.975222777657031, + "grad_norm": 0.23526400327682495, + "learning_rate": 4.020689655172414e-05, + "loss": 0.1314, + "step": 1704 + }, + { + "epoch": 1.9763819459537781, + "grad_norm": 0.36365655064582825, + "learning_rate": 4.020114942528736e-05, + "loss": 0.135, + "step": 1705 + }, + { + "epoch": 1.9775411142505253, + "grad_norm": 0.21487149596214294, + "learning_rate": 4.0195402298850575e-05, + "loss": 0.1166, + "step": 1706 + }, + { + "epoch": 1.9787002825472724, + "grad_norm": 0.1872834861278534, + "learning_rate": 4.018965517241379e-05, + "loss": 0.114, + "step": 1707 + }, + { + "epoch": 1.9798594508440193, + "grad_norm": 0.2666516602039337, + "learning_rate": 4.018390804597701e-05, + "loss": 0.1396, + "step": 1708 + }, + { + "epoch": 1.9810186191407664, + "grad_norm": 0.23788093030452728, + "learning_rate": 4.017816091954023e-05, + "loss": 0.1303, + "step": 1709 + }, + { + "epoch": 1.9821777874375135, + "grad_norm": 0.23278290033340454, + "learning_rate": 4.0172413793103455e-05, + "loss": 0.127, + "step": 1710 + }, + { + "epoch": 1.9833369557342606, + "grad_norm": 0.2589450180530548, + "learning_rate": 4.016666666666667e-05, + "loss": 0.1311, + "step": 1711 + }, + { + "epoch": 1.9844961240310077, + "grad_norm": 0.20115303993225098, + "learning_rate": 4.0160919540229884e-05, + "loss": 0.1255, + "step": 1712 + }, + { + "epoch": 1.9856552923277548, + "grad_norm": 0.24653758108615875, + "learning_rate": 4.0155172413793106e-05, + "loss": 0.1346, + "step": 1713 + }, + { + "epoch": 1.986814460624502, + "grad_norm": 0.25631844997406006, + "learning_rate": 4.014942528735632e-05, + "loss": 0.1385, + "step": 1714 + }, + { + "epoch": 1.987973628921249, + "grad_norm": 0.21562731266021729, + "learning_rate": 4.014367816091954e-05, + "loss": 0.118, + "step": 1715 + }, + { + "epoch": 1.9891327972179962, + "grad_norm": 0.20288534462451935, + "learning_rate": 4.0137931034482764e-05, + "loss": 0.1183, + "step": 1716 + }, + { + "epoch": 1.9902919655147433, + "grad_norm": 0.2522091865539551, + "learning_rate": 4.013218390804598e-05, + "loss": 0.1337, + "step": 1717 + }, + { + "epoch": 1.9914511338114904, + "grad_norm": 0.2482483983039856, + "learning_rate": 4.0126436781609194e-05, + "loss": 0.1347, + "step": 1718 + }, + { + "epoch": 1.9926103021082373, + "grad_norm": 0.26897165179252625, + "learning_rate": 4.0120689655172416e-05, + "loss": 0.1463, + "step": 1719 + }, + { + "epoch": 1.9937694704049844, + "grad_norm": 0.2053130716085434, + "learning_rate": 4.011494252873563e-05, + "loss": 0.1138, + "step": 1720 + }, + { + "epoch": 1.9949286387017315, + "grad_norm": 0.22863611578941345, + "learning_rate": 4.010919540229885e-05, + "loss": 0.1255, + "step": 1721 + }, + { + "epoch": 1.9960878069984784, + "grad_norm": 0.25083255767822266, + "learning_rate": 4.0103448275862074e-05, + "loss": 0.137, + "step": 1722 + }, + { + "epoch": 1.9972469752952255, + "grad_norm": 0.2726476788520813, + "learning_rate": 4.009770114942529e-05, + "loss": 0.1459, + "step": 1723 + }, + { + "epoch": 1.9984061435919727, + "grad_norm": 0.18482446670532227, + "learning_rate": 4.009195402298851e-05, + "loss": 0.1116, + "step": 1724 + }, + { + "epoch": 1.9995653118887198, + "grad_norm": 0.22042682766914368, + "learning_rate": 4.0086206896551726e-05, + "loss": 0.1282, + "step": 1725 + }, + { + "epoch": 1.9995653118887198, + "eval_loss": 0.13071221113204956, + "eval_runtime": 265.8343, + "eval_samples_per_second": 5.771, + "eval_steps_per_second": 5.771, + "step": 1725 + }, + { + "epoch": 2.000724480185467, + "grad_norm": 0.20493850111961365, + "learning_rate": 4.008045977011494e-05, + "loss": 0.115, + "step": 1726 + }, + { + "epoch": 2.001883648482214, + "grad_norm": 0.21877332031726837, + "learning_rate": 4.007471264367816e-05, + "loss": 0.1288, + "step": 1727 + }, + { + "epoch": 2.003042816778961, + "grad_norm": 0.17066803574562073, + "learning_rate": 4.006896551724138e-05, + "loss": 0.103, + "step": 1728 + }, + { + "epoch": 2.0042019850757082, + "grad_norm": 0.2311452031135559, + "learning_rate": 4.0063218390804605e-05, + "loss": 0.123, + "step": 1729 + }, + { + "epoch": 2.0053611533724554, + "grad_norm": 0.18617844581604004, + "learning_rate": 4.005747126436782e-05, + "loss": 0.1075, + "step": 1730 + }, + { + "epoch": 2.0065203216692025, + "grad_norm": 0.17932945489883423, + "learning_rate": 4.0051724137931035e-05, + "loss": 0.1124, + "step": 1731 + }, + { + "epoch": 2.0076794899659496, + "grad_norm": 0.2430291473865509, + "learning_rate": 4.004597701149426e-05, + "loss": 0.112, + "step": 1732 + }, + { + "epoch": 2.0088386582626967, + "grad_norm": 0.1980556845664978, + "learning_rate": 4.004022988505747e-05, + "loss": 0.1257, + "step": 1733 + }, + { + "epoch": 2.009997826559444, + "grad_norm": 0.2553955614566803, + "learning_rate": 4.003448275862069e-05, + "loss": 0.1168, + "step": 1734 + }, + { + "epoch": 2.0111569948561905, + "grad_norm": 0.5730363130569458, + "learning_rate": 4.002873563218391e-05, + "loss": 0.1191, + "step": 1735 + }, + { + "epoch": 2.0123161631529376, + "grad_norm": 0.27074134349823, + "learning_rate": 4.002298850574713e-05, + "loss": 0.1228, + "step": 1736 + }, + { + "epoch": 2.0134753314496847, + "grad_norm": 0.2566532492637634, + "learning_rate": 4.0017241379310345e-05, + "loss": 0.1185, + "step": 1737 + }, + { + "epoch": 2.014634499746432, + "grad_norm": 0.2553810179233551, + "learning_rate": 4.001149425287357e-05, + "loss": 0.1207, + "step": 1738 + }, + { + "epoch": 2.015793668043179, + "grad_norm": 0.314544677734375, + "learning_rate": 4.000574712643678e-05, + "loss": 0.1338, + "step": 1739 + }, + { + "epoch": 2.016952836339926, + "grad_norm": 0.26933231949806213, + "learning_rate": 4e-05, + "loss": 0.1175, + "step": 1740 + }, + { + "epoch": 2.018112004636673, + "grad_norm": 0.4147557020187378, + "learning_rate": 3.999425287356322e-05, + "loss": 0.1191, + "step": 1741 + }, + { + "epoch": 2.0192711729334203, + "grad_norm": 0.4365951716899872, + "learning_rate": 3.998850574712644e-05, + "loss": 0.1246, + "step": 1742 + }, + { + "epoch": 2.0204303412301674, + "grad_norm": 0.22776885330677032, + "learning_rate": 3.998275862068966e-05, + "loss": 0.1277, + "step": 1743 + }, + { + "epoch": 2.0215895095269145, + "grad_norm": 0.5201489925384521, + "learning_rate": 3.9977011494252876e-05, + "loss": 0.108, + "step": 1744 + }, + { + "epoch": 2.0227486778236616, + "grad_norm": 0.2805781960487366, + "learning_rate": 3.997126436781609e-05, + "loss": 0.1168, + "step": 1745 + }, + { + "epoch": 2.0239078461204087, + "grad_norm": 0.25759008526802063, + "learning_rate": 3.996551724137931e-05, + "loss": 0.1312, + "step": 1746 + }, + { + "epoch": 2.025067014417156, + "grad_norm": 0.2724314332008362, + "learning_rate": 3.995977011494253e-05, + "loss": 0.1102, + "step": 1747 + }, + { + "epoch": 2.026226182713903, + "grad_norm": 0.2714131772518158, + "learning_rate": 3.995402298850575e-05, + "loss": 0.1136, + "step": 1748 + }, + { + "epoch": 2.0273853510106497, + "grad_norm": 0.5604748725891113, + "learning_rate": 3.994827586206897e-05, + "loss": 0.1387, + "step": 1749 + }, + { + "epoch": 2.0285445193073968, + "grad_norm": 0.20117241144180298, + "learning_rate": 3.9942528735632186e-05, + "loss": 0.1045, + "step": 1750 + }, + { + "epoch": 2.029703687604144, + "grad_norm": 0.2628777027130127, + "learning_rate": 3.993678160919541e-05, + "loss": 0.1219, + "step": 1751 + }, + { + "epoch": 2.030862855900891, + "grad_norm": 0.29843881726264954, + "learning_rate": 3.993103448275862e-05, + "loss": 0.1282, + "step": 1752 + }, + { + "epoch": 2.032022024197638, + "grad_norm": 0.4298798441886902, + "learning_rate": 3.992528735632184e-05, + "loss": 0.1108, + "step": 1753 + }, + { + "epoch": 2.0331811924943852, + "grad_norm": 0.24070972204208374, + "learning_rate": 3.991954022988506e-05, + "loss": 0.1171, + "step": 1754 + }, + { + "epoch": 2.0343403607911323, + "grad_norm": 0.2516588568687439, + "learning_rate": 3.9913793103448274e-05, + "loss": 0.1178, + "step": 1755 + }, + { + "epoch": 2.0354995290878795, + "grad_norm": 0.3218182623386383, + "learning_rate": 3.9908045977011496e-05, + "loss": 0.1193, + "step": 1756 + }, + { + "epoch": 2.0366586973846266, + "grad_norm": 0.2040579468011856, + "learning_rate": 3.990229885057472e-05, + "loss": 0.1125, + "step": 1757 + }, + { + "epoch": 2.0378178656813737, + "grad_norm": 0.2935560643672943, + "learning_rate": 3.989655172413793e-05, + "loss": 0.1269, + "step": 1758 + }, + { + "epoch": 2.038977033978121, + "grad_norm": 0.22673268616199493, + "learning_rate": 3.9890804597701154e-05, + "loss": 0.1145, + "step": 1759 + }, + { + "epoch": 2.040136202274868, + "grad_norm": 0.37315088510513306, + "learning_rate": 3.988505747126437e-05, + "loss": 0.1226, + "step": 1760 + }, + { + "epoch": 2.041295370571615, + "grad_norm": 0.21398569643497467, + "learning_rate": 3.9879310344827584e-05, + "loss": 0.1196, + "step": 1761 + }, + { + "epoch": 2.042454538868362, + "grad_norm": 0.3148459494113922, + "learning_rate": 3.9873563218390806e-05, + "loss": 0.1321, + "step": 1762 + }, + { + "epoch": 2.043613707165109, + "grad_norm": 0.5063501596450806, + "learning_rate": 3.986781609195403e-05, + "loss": 0.1237, + "step": 1763 + }, + { + "epoch": 2.044772875461856, + "grad_norm": 0.25269943475723267, + "learning_rate": 3.986206896551724e-05, + "loss": 0.1149, + "step": 1764 + }, + { + "epoch": 2.045932043758603, + "grad_norm": 0.32295992970466614, + "learning_rate": 3.9856321839080464e-05, + "loss": 0.1087, + "step": 1765 + }, + { + "epoch": 2.04709121205535, + "grad_norm": 0.23823142051696777, + "learning_rate": 3.985057471264368e-05, + "loss": 0.1246, + "step": 1766 + }, + { + "epoch": 2.0482503803520973, + "grad_norm": 0.3159748315811157, + "learning_rate": 3.98448275862069e-05, + "loss": 0.1114, + "step": 1767 + }, + { + "epoch": 2.0494095486488444, + "grad_norm": 0.21690689027309418, + "learning_rate": 3.9839080459770115e-05, + "loss": 0.1105, + "step": 1768 + }, + { + "epoch": 2.0505687169455915, + "grad_norm": 0.26003125309944153, + "learning_rate": 3.983333333333333e-05, + "loss": 0.1257, + "step": 1769 + }, + { + "epoch": 2.0517278852423386, + "grad_norm": 0.27152520418167114, + "learning_rate": 3.982758620689656e-05, + "loss": 0.1207, + "step": 1770 + }, + { + "epoch": 2.0528870535390857, + "grad_norm": 0.2450665384531021, + "learning_rate": 3.9821839080459774e-05, + "loss": 0.1198, + "step": 1771 + }, + { + "epoch": 2.054046221835833, + "grad_norm": 0.20077472925186157, + "learning_rate": 3.981609195402299e-05, + "loss": 0.107, + "step": 1772 + }, + { + "epoch": 2.05520539013258, + "grad_norm": 0.4394693374633789, + "learning_rate": 3.981034482758621e-05, + "loss": 0.1103, + "step": 1773 + }, + { + "epoch": 2.056364558429327, + "grad_norm": 0.26243650913238525, + "learning_rate": 3.9804597701149425e-05, + "loss": 0.1128, + "step": 1774 + }, + { + "epoch": 2.057523726726074, + "grad_norm": 0.2425994873046875, + "learning_rate": 3.979885057471265e-05, + "loss": 0.123, + "step": 1775 + }, + { + "epoch": 2.0586828950228213, + "grad_norm": 0.23687340319156647, + "learning_rate": 3.979310344827586e-05, + "loss": 0.122, + "step": 1776 + }, + { + "epoch": 2.0598420633195684, + "grad_norm": 0.2466685026884079, + "learning_rate": 3.9787356321839083e-05, + "loss": 0.1201, + "step": 1777 + }, + { + "epoch": 2.061001231616315, + "grad_norm": 0.26380082964897156, + "learning_rate": 3.9781609195402305e-05, + "loss": 0.1216, + "step": 1778 + }, + { + "epoch": 2.062160399913062, + "grad_norm": 0.2581118643283844, + "learning_rate": 3.977586206896552e-05, + "loss": 0.1261, + "step": 1779 + }, + { + "epoch": 2.0633195682098093, + "grad_norm": 0.24851976335048676, + "learning_rate": 3.9770114942528735e-05, + "loss": 0.1212, + "step": 1780 + }, + { + "epoch": 2.0644787365065564, + "grad_norm": 0.3059890568256378, + "learning_rate": 3.9764367816091957e-05, + "loss": 0.1233, + "step": 1781 + }, + { + "epoch": 2.0656379048033036, + "grad_norm": 0.2686820924282074, + "learning_rate": 3.975862068965517e-05, + "loss": 0.1291, + "step": 1782 + }, + { + "epoch": 2.0667970731000507, + "grad_norm": 0.24418821930885315, + "learning_rate": 3.975287356321839e-05, + "loss": 0.115, + "step": 1783 + }, + { + "epoch": 2.067956241396798, + "grad_norm": 0.27120018005371094, + "learning_rate": 3.9747126436781615e-05, + "loss": 0.1192, + "step": 1784 + }, + { + "epoch": 2.069115409693545, + "grad_norm": 0.22046048939228058, + "learning_rate": 3.974137931034483e-05, + "loss": 0.1033, + "step": 1785 + }, + { + "epoch": 2.070274577990292, + "grad_norm": 0.24162054061889648, + "learning_rate": 3.973563218390805e-05, + "loss": 0.116, + "step": 1786 + }, + { + "epoch": 2.071433746287039, + "grad_norm": 0.26987624168395996, + "learning_rate": 3.9729885057471266e-05, + "loss": 0.127, + "step": 1787 + }, + { + "epoch": 2.0725929145837862, + "grad_norm": 0.2681949734687805, + "learning_rate": 3.972413793103448e-05, + "loss": 0.1266, + "step": 1788 + }, + { + "epoch": 2.0737520828805334, + "grad_norm": 0.26860347390174866, + "learning_rate": 3.97183908045977e-05, + "loss": 0.1186, + "step": 1789 + }, + { + "epoch": 2.0749112511772805, + "grad_norm": 0.2559060752391815, + "learning_rate": 3.9712643678160925e-05, + "loss": 0.108, + "step": 1790 + }, + { + "epoch": 2.0760704194740276, + "grad_norm": 0.1907990276813507, + "learning_rate": 3.970689655172414e-05, + "loss": 0.0989, + "step": 1791 + }, + { + "epoch": 2.0772295877707743, + "grad_norm": 0.2304263859987259, + "learning_rate": 3.970114942528736e-05, + "loss": 0.1186, + "step": 1792 + }, + { + "epoch": 2.0783887560675214, + "grad_norm": 0.21806973218917847, + "learning_rate": 3.9695402298850576e-05, + "loss": 0.1055, + "step": 1793 + }, + { + "epoch": 2.0795479243642685, + "grad_norm": 0.27678120136260986, + "learning_rate": 3.96896551724138e-05, + "loss": 0.1309, + "step": 1794 + }, + { + "epoch": 2.0807070926610156, + "grad_norm": 0.25782880187034607, + "learning_rate": 3.968390804597701e-05, + "loss": 0.1066, + "step": 1795 + }, + { + "epoch": 2.0818662609577627, + "grad_norm": 0.2833830416202545, + "learning_rate": 3.967816091954023e-05, + "loss": 0.1258, + "step": 1796 + }, + { + "epoch": 2.08302542925451, + "grad_norm": 0.231010302901268, + "learning_rate": 3.967241379310345e-05, + "loss": 0.1035, + "step": 1797 + }, + { + "epoch": 2.084184597551257, + "grad_norm": 0.3302299678325653, + "learning_rate": 3.966666666666667e-05, + "loss": 0.1137, + "step": 1798 + }, + { + "epoch": 2.085343765848004, + "grad_norm": 0.270266056060791, + "learning_rate": 3.9660919540229886e-05, + "loss": 0.1117, + "step": 1799 + }, + { + "epoch": 2.086502934144751, + "grad_norm": 0.23533837497234344, + "learning_rate": 3.965517241379311e-05, + "loss": 0.1112, + "step": 1800 + }, + { + "epoch": 2.0876621024414983, + "grad_norm": 0.316643089056015, + "learning_rate": 3.964942528735632e-05, + "loss": 0.1243, + "step": 1801 + }, + { + "epoch": 2.0888212707382454, + "grad_norm": 0.2741736173629761, + "learning_rate": 3.964367816091954e-05, + "loss": 0.1095, + "step": 1802 + }, + { + "epoch": 2.0899804390349925, + "grad_norm": 0.2951594889163971, + "learning_rate": 3.963793103448276e-05, + "loss": 0.1246, + "step": 1803 + }, + { + "epoch": 2.0911396073317396, + "grad_norm": 0.26919370889663696, + "learning_rate": 3.963218390804598e-05, + "loss": 0.1181, + "step": 1804 + }, + { + "epoch": 2.0922987756284868, + "grad_norm": 0.31926748156547546, + "learning_rate": 3.96264367816092e-05, + "loss": 0.1216, + "step": 1805 + }, + { + "epoch": 2.0934579439252334, + "grad_norm": 0.30833154916763306, + "learning_rate": 3.962068965517242e-05, + "loss": 0.1245, + "step": 1806 + }, + { + "epoch": 2.0946171122219805, + "grad_norm": 0.2637370824813843, + "learning_rate": 3.961494252873563e-05, + "loss": 0.1168, + "step": 1807 + }, + { + "epoch": 2.0957762805187277, + "grad_norm": 0.2496730089187622, + "learning_rate": 3.9609195402298854e-05, + "loss": 0.1152, + "step": 1808 + }, + { + "epoch": 2.096935448815475, + "grad_norm": 0.241145521402359, + "learning_rate": 3.960344827586207e-05, + "loss": 0.1257, + "step": 1809 + }, + { + "epoch": 2.098094617112222, + "grad_norm": 0.2335578054189682, + "learning_rate": 3.959770114942529e-05, + "loss": 0.1138, + "step": 1810 + }, + { + "epoch": 2.099253785408969, + "grad_norm": 0.23772041499614716, + "learning_rate": 3.959195402298851e-05, + "loss": 0.1115, + "step": 1811 + }, + { + "epoch": 2.100412953705716, + "grad_norm": 0.25079551339149475, + "learning_rate": 3.958620689655173e-05, + "loss": 0.1185, + "step": 1812 + }, + { + "epoch": 2.1015721220024632, + "grad_norm": 0.26366278529167175, + "learning_rate": 3.958045977011495e-05, + "loss": 0.1177, + "step": 1813 + }, + { + "epoch": 2.1027312902992104, + "grad_norm": 0.2486356645822525, + "learning_rate": 3.9574712643678164e-05, + "loss": 0.1204, + "step": 1814 + }, + { + "epoch": 2.1038904585959575, + "grad_norm": 0.24018944799900055, + "learning_rate": 3.956896551724138e-05, + "loss": 0.1333, + "step": 1815 + }, + { + "epoch": 2.1050496268927046, + "grad_norm": 0.25310537219047546, + "learning_rate": 3.95632183908046e-05, + "loss": 0.1243, + "step": 1816 + }, + { + "epoch": 2.1062087951894517, + "grad_norm": 0.26761263608932495, + "learning_rate": 3.9557471264367815e-05, + "loss": 0.1184, + "step": 1817 + }, + { + "epoch": 2.107367963486199, + "grad_norm": 0.32471829652786255, + "learning_rate": 3.955172413793104e-05, + "loss": 0.1234, + "step": 1818 + }, + { + "epoch": 2.108527131782946, + "grad_norm": 0.25897079706192017, + "learning_rate": 3.954597701149426e-05, + "loss": 0.1142, + "step": 1819 + }, + { + "epoch": 2.1096863000796926, + "grad_norm": 0.2632877826690674, + "learning_rate": 3.954022988505747e-05, + "loss": 0.1189, + "step": 1820 + }, + { + "epoch": 2.1108454683764397, + "grad_norm": 0.3021185100078583, + "learning_rate": 3.953448275862069e-05, + "loss": 0.1283, + "step": 1821 + }, + { + "epoch": 2.112004636673187, + "grad_norm": 0.2785482704639435, + "learning_rate": 3.952873563218391e-05, + "loss": 0.1302, + "step": 1822 + }, + { + "epoch": 2.113163804969934, + "grad_norm": 0.33356723189353943, + "learning_rate": 3.9522988505747125e-05, + "loss": 0.1194, + "step": 1823 + }, + { + "epoch": 2.114322973266681, + "grad_norm": 0.29633092880249023, + "learning_rate": 3.9517241379310346e-05, + "loss": 0.1243, + "step": 1824 + }, + { + "epoch": 2.115482141563428, + "grad_norm": 0.2905955910682678, + "learning_rate": 3.951149425287357e-05, + "loss": 0.1128, + "step": 1825 + }, + { + "epoch": 2.1166413098601753, + "grad_norm": 0.24889424443244934, + "learning_rate": 3.950574712643678e-05, + "loss": 0.1029, + "step": 1826 + }, + { + "epoch": 2.1178004781569224, + "grad_norm": 0.24264870584011078, + "learning_rate": 3.9500000000000005e-05, + "loss": 0.1125, + "step": 1827 + }, + { + "epoch": 2.1189596464536695, + "grad_norm": 0.2387196123600006, + "learning_rate": 3.949425287356322e-05, + "loss": 0.1123, + "step": 1828 + }, + { + "epoch": 2.1201188147504166, + "grad_norm": 0.23533257842063904, + "learning_rate": 3.9488505747126434e-05, + "loss": 0.118, + "step": 1829 + }, + { + "epoch": 2.1212779830471638, + "grad_norm": 0.24829024076461792, + "learning_rate": 3.9482758620689656e-05, + "loss": 0.1194, + "step": 1830 + }, + { + "epoch": 2.122437151343911, + "grad_norm": 0.26718825101852417, + "learning_rate": 3.947701149425288e-05, + "loss": 0.1139, + "step": 1831 + }, + { + "epoch": 2.123596319640658, + "grad_norm": 0.24464312195777893, + "learning_rate": 3.94712643678161e-05, + "loss": 0.1101, + "step": 1832 + }, + { + "epoch": 2.124755487937405, + "grad_norm": 0.22388160228729248, + "learning_rate": 3.9465517241379314e-05, + "loss": 0.1088, + "step": 1833 + }, + { + "epoch": 2.125914656234152, + "grad_norm": 0.25908219814300537, + "learning_rate": 3.945977011494253e-05, + "loss": 0.1202, + "step": 1834 + }, + { + "epoch": 2.127073824530899, + "grad_norm": 0.2020428478717804, + "learning_rate": 3.945402298850575e-05, + "loss": 0.108, + "step": 1835 + }, + { + "epoch": 2.128232992827646, + "grad_norm": 0.26430872082710266, + "learning_rate": 3.9448275862068966e-05, + "loss": 0.1235, + "step": 1836 + }, + { + "epoch": 2.129392161124393, + "grad_norm": 0.3125210702419281, + "learning_rate": 3.944252873563218e-05, + "loss": 0.1134, + "step": 1837 + }, + { + "epoch": 2.1305513294211402, + "grad_norm": 0.24071168899536133, + "learning_rate": 3.94367816091954e-05, + "loss": 0.1153, + "step": 1838 + }, + { + "epoch": 2.1317104977178873, + "grad_norm": 0.22722084820270538, + "learning_rate": 3.9431034482758624e-05, + "loss": 0.1135, + "step": 1839 + }, + { + "epoch": 2.1328696660146345, + "grad_norm": 0.2722453474998474, + "learning_rate": 3.942528735632184e-05, + "loss": 0.1159, + "step": 1840 + }, + { + "epoch": 2.1340288343113816, + "grad_norm": 0.26020506024360657, + "learning_rate": 3.941954022988506e-05, + "loss": 0.1083, + "step": 1841 + }, + { + "epoch": 2.1351880026081287, + "grad_norm": 0.38387203216552734, + "learning_rate": 3.9413793103448276e-05, + "loss": 0.1304, + "step": 1842 + }, + { + "epoch": 2.136347170904876, + "grad_norm": 0.28882285952568054, + "learning_rate": 3.94080459770115e-05, + "loss": 0.1188, + "step": 1843 + }, + { + "epoch": 2.137506339201623, + "grad_norm": 0.2755765914916992, + "learning_rate": 3.940229885057471e-05, + "loss": 0.1346, + "step": 1844 + }, + { + "epoch": 2.13866550749837, + "grad_norm": 0.2223798632621765, + "learning_rate": 3.9396551724137934e-05, + "loss": 0.1112, + "step": 1845 + }, + { + "epoch": 2.139824675795117, + "grad_norm": 0.30250582098960876, + "learning_rate": 3.9390804597701156e-05, + "loss": 0.1267, + "step": 1846 + }, + { + "epoch": 2.1409838440918643, + "grad_norm": 0.2759857177734375, + "learning_rate": 3.938505747126437e-05, + "loss": 0.1198, + "step": 1847 + }, + { + "epoch": 2.1421430123886114, + "grad_norm": 0.2635420560836792, + "learning_rate": 3.9379310344827585e-05, + "loss": 0.116, + "step": 1848 + }, + { + "epoch": 2.143302180685358, + "grad_norm": 0.22998470067977905, + "learning_rate": 3.937356321839081e-05, + "loss": 0.1124, + "step": 1849 + }, + { + "epoch": 2.144461348982105, + "grad_norm": 0.22204086184501648, + "learning_rate": 3.936781609195402e-05, + "loss": 0.1087, + "step": 1850 + }, + { + "epoch": 2.1456205172788523, + "grad_norm": 0.2902623116970062, + "learning_rate": 3.9362068965517244e-05, + "loss": 0.1307, + "step": 1851 + }, + { + "epoch": 2.1467796855755994, + "grad_norm": 0.2636187970638275, + "learning_rate": 3.9356321839080465e-05, + "loss": 0.1168, + "step": 1852 + }, + { + "epoch": 2.1479388538723465, + "grad_norm": 0.29677683115005493, + "learning_rate": 3.935057471264368e-05, + "loss": 0.1328, + "step": 1853 + }, + { + "epoch": 2.1490980221690936, + "grad_norm": 0.2324220985174179, + "learning_rate": 3.93448275862069e-05, + "loss": 0.1198, + "step": 1854 + }, + { + "epoch": 2.1502571904658407, + "grad_norm": 0.2689639627933502, + "learning_rate": 3.933908045977012e-05, + "loss": 0.1285, + "step": 1855 + }, + { + "epoch": 2.151416358762588, + "grad_norm": 0.264710932970047, + "learning_rate": 3.933333333333333e-05, + "loss": 0.1235, + "step": 1856 + }, + { + "epoch": 2.152575527059335, + "grad_norm": 0.26649346947669983, + "learning_rate": 3.932758620689655e-05, + "loss": 0.1212, + "step": 1857 + }, + { + "epoch": 2.153734695356082, + "grad_norm": 0.27043336629867554, + "learning_rate": 3.932183908045977e-05, + "loss": 0.1123, + "step": 1858 + }, + { + "epoch": 2.154893863652829, + "grad_norm": 0.24677076935768127, + "learning_rate": 3.931609195402299e-05, + "loss": 0.1058, + "step": 1859 + }, + { + "epoch": 2.1560530319495763, + "grad_norm": 0.3722219169139862, + "learning_rate": 3.931034482758621e-05, + "loss": 0.13, + "step": 1860 + }, + { + "epoch": 2.1572122002463234, + "grad_norm": 0.3465796411037445, + "learning_rate": 3.9304597701149427e-05, + "loss": 0.1323, + "step": 1861 + }, + { + "epoch": 2.1583713685430705, + "grad_norm": 0.24927546083927155, + "learning_rate": 3.929885057471265e-05, + "loss": 0.1133, + "step": 1862 + }, + { + "epoch": 2.159530536839817, + "grad_norm": 0.2714444696903229, + "learning_rate": 3.929310344827586e-05, + "loss": 0.114, + "step": 1863 + }, + { + "epoch": 2.1606897051365643, + "grad_norm": 0.2666366994380951, + "learning_rate": 3.928735632183908e-05, + "loss": 0.1182, + "step": 1864 + }, + { + "epoch": 2.1618488734333114, + "grad_norm": 0.24803896248340607, + "learning_rate": 3.92816091954023e-05, + "loss": 0.1162, + "step": 1865 + }, + { + "epoch": 2.1630080417300586, + "grad_norm": 0.23639126121997833, + "learning_rate": 3.927586206896552e-05, + "loss": 0.1154, + "step": 1866 + }, + { + "epoch": 2.1641672100268057, + "grad_norm": 0.2096773087978363, + "learning_rate": 3.9270114942528736e-05, + "loss": 0.1079, + "step": 1867 + }, + { + "epoch": 2.165326378323553, + "grad_norm": 0.3063262104988098, + "learning_rate": 3.926436781609196e-05, + "loss": 0.1197, + "step": 1868 + }, + { + "epoch": 2.1664855466203, + "grad_norm": 0.24331098794937134, + "learning_rate": 3.925862068965517e-05, + "loss": 0.1114, + "step": 1869 + }, + { + "epoch": 2.167644714917047, + "grad_norm": 0.25354528427124023, + "learning_rate": 3.9252873563218395e-05, + "loss": 0.1231, + "step": 1870 + }, + { + "epoch": 2.168803883213794, + "grad_norm": 0.2932867109775543, + "learning_rate": 3.924712643678161e-05, + "loss": 0.131, + "step": 1871 + }, + { + "epoch": 2.1699630515105413, + "grad_norm": 0.22387802600860596, + "learning_rate": 3.924137931034483e-05, + "loss": 0.1101, + "step": 1872 + }, + { + "epoch": 2.1711222198072884, + "grad_norm": 0.250288724899292, + "learning_rate": 3.923563218390805e-05, + "loss": 0.1229, + "step": 1873 + }, + { + "epoch": 2.1722813881040355, + "grad_norm": 0.39069998264312744, + "learning_rate": 3.922988505747127e-05, + "loss": 0.1259, + "step": 1874 + }, + { + "epoch": 2.1734405564007826, + "grad_norm": 0.20509329438209534, + "learning_rate": 3.922413793103448e-05, + "loss": 0.1273, + "step": 1875 + }, + { + "epoch": 2.1745997246975297, + "grad_norm": 0.27866530418395996, + "learning_rate": 3.9218390804597704e-05, + "loss": 0.1229, + "step": 1876 + }, + { + "epoch": 2.175758892994277, + "grad_norm": 0.2512810528278351, + "learning_rate": 3.921264367816092e-05, + "loss": 0.1142, + "step": 1877 + }, + { + "epoch": 2.1769180612910235, + "grad_norm": 0.2705281674861908, + "learning_rate": 3.9206896551724134e-05, + "loss": 0.1132, + "step": 1878 + }, + { + "epoch": 2.1780772295877706, + "grad_norm": 0.22448644042015076, + "learning_rate": 3.9201149425287356e-05, + "loss": 0.1077, + "step": 1879 + }, + { + "epoch": 2.1792363978845177, + "grad_norm": 0.30466312170028687, + "learning_rate": 3.919540229885058e-05, + "loss": 0.1246, + "step": 1880 + }, + { + "epoch": 2.180395566181265, + "grad_norm": 0.2996283769607544, + "learning_rate": 3.91896551724138e-05, + "loss": 0.1262, + "step": 1881 + }, + { + "epoch": 2.181554734478012, + "grad_norm": 0.3339924216270447, + "learning_rate": 3.9183908045977014e-05, + "loss": 0.1344, + "step": 1882 + }, + { + "epoch": 2.182713902774759, + "grad_norm": 0.24632121622562408, + "learning_rate": 3.917816091954023e-05, + "loss": 0.1099, + "step": 1883 + }, + { + "epoch": 2.183873071071506, + "grad_norm": 0.24614660441875458, + "learning_rate": 3.917241379310345e-05, + "loss": 0.1136, + "step": 1884 + }, + { + "epoch": 2.1850322393682533, + "grad_norm": 0.31324732303619385, + "learning_rate": 3.9166666666666665e-05, + "loss": 0.1242, + "step": 1885 + }, + { + "epoch": 2.1861914076650004, + "grad_norm": 0.3063158392906189, + "learning_rate": 3.916091954022989e-05, + "loss": 0.1313, + "step": 1886 + }, + { + "epoch": 2.1873505759617475, + "grad_norm": 0.3112230598926544, + "learning_rate": 3.915517241379311e-05, + "loss": 0.1244, + "step": 1887 + }, + { + "epoch": 2.1885097442584946, + "grad_norm": 0.2197721004486084, + "learning_rate": 3.9149425287356324e-05, + "loss": 0.1093, + "step": 1888 + }, + { + "epoch": 2.1896689125552418, + "grad_norm": 0.2518065273761749, + "learning_rate": 3.9143678160919545e-05, + "loss": 0.1126, + "step": 1889 + }, + { + "epoch": 2.190828080851989, + "grad_norm": 0.24468325078487396, + "learning_rate": 3.913793103448276e-05, + "loss": 0.1264, + "step": 1890 + }, + { + "epoch": 2.1919872491487356, + "grad_norm": 0.2638467252254486, + "learning_rate": 3.9132183908045975e-05, + "loss": 0.1299, + "step": 1891 + }, + { + "epoch": 2.1931464174454827, + "grad_norm": 0.29071396589279175, + "learning_rate": 3.91264367816092e-05, + "loss": 0.1406, + "step": 1892 + }, + { + "epoch": 2.19430558574223, + "grad_norm": 0.26825428009033203, + "learning_rate": 3.912068965517242e-05, + "loss": 0.133, + "step": 1893 + }, + { + "epoch": 2.195464754038977, + "grad_norm": 0.28922030329704285, + "learning_rate": 3.9114942528735633e-05, + "loss": 0.1303, + "step": 1894 + }, + { + "epoch": 2.196623922335724, + "grad_norm": 0.22582325339317322, + "learning_rate": 3.9109195402298855e-05, + "loss": 0.1118, + "step": 1895 + }, + { + "epoch": 2.197783090632471, + "grad_norm": 0.31042325496673584, + "learning_rate": 3.910344827586207e-05, + "loss": 0.1292, + "step": 1896 + }, + { + "epoch": 2.1989422589292182, + "grad_norm": 0.207808256149292, + "learning_rate": 3.9097701149425285e-05, + "loss": 0.1151, + "step": 1897 + }, + { + "epoch": 2.2001014272259654, + "grad_norm": 0.2200191169977188, + "learning_rate": 3.909195402298851e-05, + "loss": 0.1085, + "step": 1898 + }, + { + "epoch": 2.2012605955227125, + "grad_norm": 0.2516978979110718, + "learning_rate": 3.908620689655172e-05, + "loss": 0.122, + "step": 1899 + }, + { + "epoch": 2.2024197638194596, + "grad_norm": 0.2803818881511688, + "learning_rate": 3.908045977011495e-05, + "loss": 0.1214, + "step": 1900 + }, + { + "epoch": 2.2035789321162067, + "grad_norm": 0.24263282120227814, + "learning_rate": 3.9074712643678165e-05, + "loss": 0.1072, + "step": 1901 + }, + { + "epoch": 2.204738100412954, + "grad_norm": 0.24859733879566193, + "learning_rate": 3.906896551724138e-05, + "loss": 0.1286, + "step": 1902 + }, + { + "epoch": 2.205897268709701, + "grad_norm": 0.2725147306919098, + "learning_rate": 3.90632183908046e-05, + "loss": 0.1282, + "step": 1903 + }, + { + "epoch": 2.207056437006448, + "grad_norm": 0.42582276463508606, + "learning_rate": 3.9057471264367816e-05, + "loss": 0.1239, + "step": 1904 + }, + { + "epoch": 2.208215605303195, + "grad_norm": 0.2799397110939026, + "learning_rate": 3.905172413793103e-05, + "loss": 0.126, + "step": 1905 + }, + { + "epoch": 2.209374773599942, + "grad_norm": 0.23212167620658875, + "learning_rate": 3.904597701149425e-05, + "loss": 0.1143, + "step": 1906 + }, + { + "epoch": 2.210533941896689, + "grad_norm": 0.26310837268829346, + "learning_rate": 3.9040229885057475e-05, + "loss": 0.1218, + "step": 1907 + }, + { + "epoch": 2.211693110193436, + "grad_norm": 0.289028137922287, + "learning_rate": 3.9034482758620696e-05, + "loss": 0.1142, + "step": 1908 + }, + { + "epoch": 2.212852278490183, + "grad_norm": 0.2884521484375, + "learning_rate": 3.902873563218391e-05, + "loss": 0.1215, + "step": 1909 + }, + { + "epoch": 2.2140114467869303, + "grad_norm": 0.2507173717021942, + "learning_rate": 3.9022988505747126e-05, + "loss": 0.1184, + "step": 1910 + }, + { + "epoch": 2.2151706150836774, + "grad_norm": 0.23451244831085205, + "learning_rate": 3.901724137931035e-05, + "loss": 0.1123, + "step": 1911 + }, + { + "epoch": 2.2163297833804245, + "grad_norm": 0.2503047287464142, + "learning_rate": 3.901149425287356e-05, + "loss": 0.1204, + "step": 1912 + }, + { + "epoch": 2.2174889516771716, + "grad_norm": 0.2952342927455902, + "learning_rate": 3.9005747126436784e-05, + "loss": 0.1154, + "step": 1913 + }, + { + "epoch": 2.2186481199739188, + "grad_norm": 0.28569087386131287, + "learning_rate": 3.9000000000000006e-05, + "loss": 0.1348, + "step": 1914 + }, + { + "epoch": 2.219807288270666, + "grad_norm": 0.3290340304374695, + "learning_rate": 3.899425287356322e-05, + "loss": 0.1197, + "step": 1915 + }, + { + "epoch": 2.220966456567413, + "grad_norm": 0.20089855790138245, + "learning_rate": 3.8988505747126436e-05, + "loss": 0.1161, + "step": 1916 + }, + { + "epoch": 2.22212562486416, + "grad_norm": 0.2501339912414551, + "learning_rate": 3.898275862068966e-05, + "loss": 0.1122, + "step": 1917 + }, + { + "epoch": 2.223284793160907, + "grad_norm": 0.27215123176574707, + "learning_rate": 3.897701149425287e-05, + "loss": 0.1152, + "step": 1918 + }, + { + "epoch": 2.2244439614576543, + "grad_norm": 0.2323666214942932, + "learning_rate": 3.8971264367816094e-05, + "loss": 0.1029, + "step": 1919 + }, + { + "epoch": 2.2256031297544014, + "grad_norm": 0.3195997476577759, + "learning_rate": 3.896551724137931e-05, + "loss": 0.1307, + "step": 1920 + }, + { + "epoch": 2.226762298051148, + "grad_norm": 0.34427744150161743, + "learning_rate": 3.895977011494253e-05, + "loss": 0.1279, + "step": 1921 + }, + { + "epoch": 2.2279214663478952, + "grad_norm": 0.3101141154766083, + "learning_rate": 3.895402298850575e-05, + "loss": 0.1152, + "step": 1922 + }, + { + "epoch": 2.2290806346446423, + "grad_norm": 0.3297821283340454, + "learning_rate": 3.894827586206897e-05, + "loss": 0.1249, + "step": 1923 + }, + { + "epoch": 2.2302398029413895, + "grad_norm": 0.27412423491477966, + "learning_rate": 3.894252873563218e-05, + "loss": 0.1201, + "step": 1924 + }, + { + "epoch": 2.2313989712381366, + "grad_norm": 0.2107117623090744, + "learning_rate": 3.8936781609195404e-05, + "loss": 0.1083, + "step": 1925 + }, + { + "epoch": 2.2325581395348837, + "grad_norm": 0.3204614222049713, + "learning_rate": 3.893103448275862e-05, + "loss": 0.1319, + "step": 1926 + }, + { + "epoch": 2.233717307831631, + "grad_norm": 0.32438015937805176, + "learning_rate": 3.892528735632184e-05, + "loss": 0.1343, + "step": 1927 + }, + { + "epoch": 2.234876476128378, + "grad_norm": 0.22636090219020844, + "learning_rate": 3.891954022988506e-05, + "loss": 0.1092, + "step": 1928 + }, + { + "epoch": 2.236035644425125, + "grad_norm": 0.27245032787323, + "learning_rate": 3.891379310344828e-05, + "loss": 0.1141, + "step": 1929 + }, + { + "epoch": 2.237194812721872, + "grad_norm": 0.3505677580833435, + "learning_rate": 3.89080459770115e-05, + "loss": 0.1366, + "step": 1930 + }, + { + "epoch": 2.2383539810186193, + "grad_norm": 0.2798290252685547, + "learning_rate": 3.8902298850574714e-05, + "loss": 0.1153, + "step": 1931 + }, + { + "epoch": 2.2395131493153664, + "grad_norm": 0.20595116913318634, + "learning_rate": 3.889655172413793e-05, + "loss": 0.1109, + "step": 1932 + }, + { + "epoch": 2.2406723176121135, + "grad_norm": 0.26562798023223877, + "learning_rate": 3.889080459770115e-05, + "loss": 0.1309, + "step": 1933 + }, + { + "epoch": 2.24183148590886, + "grad_norm": 0.28645437955856323, + "learning_rate": 3.888505747126437e-05, + "loss": 0.124, + "step": 1934 + }, + { + "epoch": 2.2429906542056073, + "grad_norm": 0.26411932706832886, + "learning_rate": 3.8879310344827594e-05, + "loss": 0.1231, + "step": 1935 + }, + { + "epoch": 2.2441498225023544, + "grad_norm": 0.2324962168931961, + "learning_rate": 3.887356321839081e-05, + "loss": 0.1213, + "step": 1936 + }, + { + "epoch": 2.2453089907991015, + "grad_norm": 0.2609705924987793, + "learning_rate": 3.886781609195402e-05, + "loss": 0.1246, + "step": 1937 + }, + { + "epoch": 2.2464681590958486, + "grad_norm": 0.24605713784694672, + "learning_rate": 3.8862068965517245e-05, + "loss": 0.1193, + "step": 1938 + }, + { + "epoch": 2.2476273273925957, + "grad_norm": 0.3854371905326843, + "learning_rate": 3.885632183908046e-05, + "loss": 0.1324, + "step": 1939 + }, + { + "epoch": 2.248786495689343, + "grad_norm": 0.2347278594970703, + "learning_rate": 3.8850574712643675e-05, + "loss": 0.1185, + "step": 1940 + }, + { + "epoch": 2.24994566398609, + "grad_norm": 0.25814783573150635, + "learning_rate": 3.88448275862069e-05, + "loss": 0.1163, + "step": 1941 + }, + { + "epoch": 2.251104832282837, + "grad_norm": 0.2590005695819855, + "learning_rate": 3.883908045977012e-05, + "loss": 0.1159, + "step": 1942 + }, + { + "epoch": 2.252264000579584, + "grad_norm": 0.2866111099720001, + "learning_rate": 3.883333333333333e-05, + "loss": 0.119, + "step": 1943 + }, + { + "epoch": 2.2534231688763313, + "grad_norm": 0.20185330510139465, + "learning_rate": 3.8827586206896555e-05, + "loss": 0.1127, + "step": 1944 + }, + { + "epoch": 2.2545823371730784, + "grad_norm": 0.2572873830795288, + "learning_rate": 3.882183908045977e-05, + "loss": 0.1235, + "step": 1945 + }, + { + "epoch": 2.2557415054698255, + "grad_norm": 0.29143568873405457, + "learning_rate": 3.881609195402299e-05, + "loss": 0.1161, + "step": 1946 + }, + { + "epoch": 2.2569006737665727, + "grad_norm": 0.25425946712493896, + "learning_rate": 3.8810344827586206e-05, + "loss": 0.1022, + "step": 1947 + }, + { + "epoch": 2.2580598420633198, + "grad_norm": 0.25637203454971313, + "learning_rate": 3.880459770114943e-05, + "loss": 0.1139, + "step": 1948 + }, + { + "epoch": 2.2592190103600664, + "grad_norm": 0.30507326126098633, + "learning_rate": 3.879885057471265e-05, + "loss": 0.1264, + "step": 1949 + }, + { + "epoch": 2.2603781786568136, + "grad_norm": 0.28318384289741516, + "learning_rate": 3.8793103448275865e-05, + "loss": 0.1243, + "step": 1950 + }, + { + "epoch": 2.2615373469535607, + "grad_norm": 0.3025624752044678, + "learning_rate": 3.878735632183908e-05, + "loss": 0.1223, + "step": 1951 + }, + { + "epoch": 2.262696515250308, + "grad_norm": 0.3020472824573517, + "learning_rate": 3.87816091954023e-05, + "loss": 0.1205, + "step": 1952 + }, + { + "epoch": 2.263855683547055, + "grad_norm": 0.3075084388256073, + "learning_rate": 3.8775862068965516e-05, + "loss": 0.1151, + "step": 1953 + }, + { + "epoch": 2.265014851843802, + "grad_norm": 0.3189045786857605, + "learning_rate": 3.877011494252874e-05, + "loss": 0.1102, + "step": 1954 + }, + { + "epoch": 2.266174020140549, + "grad_norm": 0.347697913646698, + "learning_rate": 3.876436781609196e-05, + "loss": 0.1333, + "step": 1955 + }, + { + "epoch": 2.2673331884372963, + "grad_norm": 0.27193987369537354, + "learning_rate": 3.8758620689655174e-05, + "loss": 0.1237, + "step": 1956 + }, + { + "epoch": 2.2684923567340434, + "grad_norm": 0.22569780051708221, + "learning_rate": 3.8752873563218396e-05, + "loss": 0.1018, + "step": 1957 + }, + { + "epoch": 2.2696515250307905, + "grad_norm": 0.2848230302333832, + "learning_rate": 3.874712643678161e-05, + "loss": 0.1088, + "step": 1958 + }, + { + "epoch": 2.2708106933275376, + "grad_norm": 0.20593014359474182, + "learning_rate": 3.8741379310344826e-05, + "loss": 0.1036, + "step": 1959 + }, + { + "epoch": 2.2719698616242847, + "grad_norm": 0.2758769989013672, + "learning_rate": 3.873563218390805e-05, + "loss": 0.1227, + "step": 1960 + }, + { + "epoch": 2.273129029921032, + "grad_norm": 0.2546898126602173, + "learning_rate": 3.872988505747127e-05, + "loss": 0.1075, + "step": 1961 + }, + { + "epoch": 2.2742881982177785, + "grad_norm": 0.3119022250175476, + "learning_rate": 3.8724137931034484e-05, + "loss": 0.1343, + "step": 1962 + }, + { + "epoch": 2.275447366514526, + "grad_norm": 0.2919405400753021, + "learning_rate": 3.8718390804597706e-05, + "loss": 0.1392, + "step": 1963 + }, + { + "epoch": 2.2766065348112727, + "grad_norm": 0.29379087686538696, + "learning_rate": 3.871264367816092e-05, + "loss": 0.137, + "step": 1964 + }, + { + "epoch": 2.27776570310802, + "grad_norm": 0.32056736946105957, + "learning_rate": 3.870689655172414e-05, + "loss": 0.1112, + "step": 1965 + }, + { + "epoch": 2.278924871404767, + "grad_norm": 0.25349029898643494, + "learning_rate": 3.870114942528736e-05, + "loss": 0.1178, + "step": 1966 + }, + { + "epoch": 2.280084039701514, + "grad_norm": 0.24394166469573975, + "learning_rate": 3.869540229885057e-05, + "loss": 0.1072, + "step": 1967 + }, + { + "epoch": 2.281243207998261, + "grad_norm": 0.23791316151618958, + "learning_rate": 3.8689655172413794e-05, + "loss": 0.1076, + "step": 1968 + }, + { + "epoch": 2.2824023762950083, + "grad_norm": 0.24641607701778412, + "learning_rate": 3.8683908045977015e-05, + "loss": 0.1218, + "step": 1969 + }, + { + "epoch": 2.2835615445917554, + "grad_norm": 0.23730486631393433, + "learning_rate": 3.867816091954023e-05, + "loss": 0.1135, + "step": 1970 + }, + { + "epoch": 2.2847207128885025, + "grad_norm": 0.2812163531780243, + "learning_rate": 3.867241379310345e-05, + "loss": 0.1196, + "step": 1971 + }, + { + "epoch": 2.2858798811852497, + "grad_norm": 0.25188305974006653, + "learning_rate": 3.866666666666667e-05, + "loss": 0.1174, + "step": 1972 + }, + { + "epoch": 2.2870390494819968, + "grad_norm": 0.2505231499671936, + "learning_rate": 3.866091954022989e-05, + "loss": 0.1175, + "step": 1973 + }, + { + "epoch": 2.288198217778744, + "grad_norm": 0.36217159032821655, + "learning_rate": 3.8655172413793103e-05, + "loss": 0.1261, + "step": 1974 + }, + { + "epoch": 2.289357386075491, + "grad_norm": 0.26633283495903015, + "learning_rate": 3.8649425287356325e-05, + "loss": 0.126, + "step": 1975 + }, + { + "epoch": 2.290516554372238, + "grad_norm": 0.286703884601593, + "learning_rate": 3.864367816091955e-05, + "loss": 0.1083, + "step": 1976 + }, + { + "epoch": 2.291675722668985, + "grad_norm": 0.2949945032596588, + "learning_rate": 3.863793103448276e-05, + "loss": 0.1201, + "step": 1977 + }, + { + "epoch": 2.292834890965732, + "grad_norm": 0.22798071801662445, + "learning_rate": 3.863218390804598e-05, + "loss": 0.1158, + "step": 1978 + }, + { + "epoch": 2.293994059262479, + "grad_norm": 0.2760827839374542, + "learning_rate": 3.86264367816092e-05, + "loss": 0.1322, + "step": 1979 + }, + { + "epoch": 2.295153227559226, + "grad_norm": 0.25793665647506714, + "learning_rate": 3.862068965517241e-05, + "loss": 0.1113, + "step": 1980 + }, + { + "epoch": 2.2963123958559732, + "grad_norm": 0.28689324855804443, + "learning_rate": 3.861494252873563e-05, + "loss": 0.1064, + "step": 1981 + }, + { + "epoch": 2.2974715641527204, + "grad_norm": 0.2645778954029083, + "learning_rate": 3.8609195402298857e-05, + "loss": 0.1239, + "step": 1982 + }, + { + "epoch": 2.2986307324494675, + "grad_norm": 0.26537734270095825, + "learning_rate": 3.860344827586207e-05, + "loss": 0.1201, + "step": 1983 + }, + { + "epoch": 2.2997899007462146, + "grad_norm": 0.281444251537323, + "learning_rate": 3.859770114942529e-05, + "loss": 0.1199, + "step": 1984 + }, + { + "epoch": 2.3009490690429617, + "grad_norm": 0.25172534584999084, + "learning_rate": 3.859195402298851e-05, + "loss": 0.1133, + "step": 1985 + }, + { + "epoch": 2.302108237339709, + "grad_norm": 0.25721946358680725, + "learning_rate": 3.858620689655172e-05, + "loss": 0.1117, + "step": 1986 + }, + { + "epoch": 2.303267405636456, + "grad_norm": 0.24373944103717804, + "learning_rate": 3.8580459770114945e-05, + "loss": 0.1043, + "step": 1987 + }, + { + "epoch": 2.304426573933203, + "grad_norm": 0.2496458888053894, + "learning_rate": 3.857471264367816e-05, + "loss": 0.1147, + "step": 1988 + }, + { + "epoch": 2.30558574222995, + "grad_norm": 0.21120178699493408, + "learning_rate": 3.856896551724138e-05, + "loss": 0.1069, + "step": 1989 + }, + { + "epoch": 2.3067449105266973, + "grad_norm": 0.30783671140670776, + "learning_rate": 3.85632183908046e-05, + "loss": 0.125, + "step": 1990 + }, + { + "epoch": 2.3079040788234444, + "grad_norm": 0.22991196811199188, + "learning_rate": 3.855747126436782e-05, + "loss": 0.1206, + "step": 1991 + }, + { + "epoch": 2.309063247120191, + "grad_norm": 0.2827235758304596, + "learning_rate": 3.855172413793104e-05, + "loss": 0.1333, + "step": 1992 + }, + { + "epoch": 2.310222415416938, + "grad_norm": 0.3185621500015259, + "learning_rate": 3.8545977011494254e-05, + "loss": 0.135, + "step": 1993 + }, + { + "epoch": 2.3113815837136853, + "grad_norm": 0.25081896781921387, + "learning_rate": 3.854022988505747e-05, + "loss": 0.1197, + "step": 1994 + }, + { + "epoch": 2.3125407520104324, + "grad_norm": 0.3022123873233795, + "learning_rate": 3.853448275862069e-05, + "loss": 0.1232, + "step": 1995 + }, + { + "epoch": 2.3136999203071795, + "grad_norm": 0.3585253357887268, + "learning_rate": 3.852873563218391e-05, + "loss": 0.1371, + "step": 1996 + }, + { + "epoch": 2.3148590886039266, + "grad_norm": 0.22381120920181274, + "learning_rate": 3.852298850574713e-05, + "loss": 0.1093, + "step": 1997 + }, + { + "epoch": 2.3160182569006738, + "grad_norm": 0.3148757219314575, + "learning_rate": 3.851724137931035e-05, + "loss": 0.1487, + "step": 1998 + }, + { + "epoch": 2.317177425197421, + "grad_norm": 0.298365980386734, + "learning_rate": 3.8511494252873564e-05, + "loss": 0.1225, + "step": 1999 + }, + { + "epoch": 2.318336593494168, + "grad_norm": 0.27845144271850586, + "learning_rate": 3.850574712643678e-05, + "loss": 0.1202, + "step": 2000 + }, + { + "epoch": 2.319495761790915, + "grad_norm": 0.2849871814250946, + "learning_rate": 3.85e-05, + "loss": 0.1209, + "step": 2001 + }, + { + "epoch": 2.320654930087662, + "grad_norm": 0.30878233909606934, + "learning_rate": 3.849425287356322e-05, + "loss": 0.1256, + "step": 2002 + }, + { + "epoch": 2.3218140983844093, + "grad_norm": 0.27193957567214966, + "learning_rate": 3.8488505747126444e-05, + "loss": 0.1206, + "step": 2003 + }, + { + "epoch": 2.3229732666811564, + "grad_norm": 0.23535265028476715, + "learning_rate": 3.848275862068966e-05, + "loss": 0.1188, + "step": 2004 + }, + { + "epoch": 2.324132434977903, + "grad_norm": 0.2573367953300476, + "learning_rate": 3.8477011494252874e-05, + "loss": 0.1165, + "step": 2005 + }, + { + "epoch": 2.3252916032746507, + "grad_norm": 0.2889256179332733, + "learning_rate": 3.8471264367816096e-05, + "loss": 0.1293, + "step": 2006 + }, + { + "epoch": 2.3264507715713973, + "grad_norm": 0.29052430391311646, + "learning_rate": 3.846551724137931e-05, + "loss": 0.1194, + "step": 2007 + }, + { + "epoch": 2.3276099398681445, + "grad_norm": 0.2844145596027374, + "learning_rate": 3.8459770114942525e-05, + "loss": 0.1342, + "step": 2008 + }, + { + "epoch": 2.3287691081648916, + "grad_norm": 0.2795712649822235, + "learning_rate": 3.845402298850575e-05, + "loss": 0.1216, + "step": 2009 + }, + { + "epoch": 2.3299282764616387, + "grad_norm": 0.27312591671943665, + "learning_rate": 3.844827586206897e-05, + "loss": 0.1224, + "step": 2010 + }, + { + "epoch": 2.331087444758386, + "grad_norm": 0.20868080854415894, + "learning_rate": 3.844252873563219e-05, + "loss": 0.1051, + "step": 2011 + }, + { + "epoch": 2.332246613055133, + "grad_norm": 0.27325716614723206, + "learning_rate": 3.8436781609195405e-05, + "loss": 0.1154, + "step": 2012 + }, + { + "epoch": 2.33340578135188, + "grad_norm": 0.2669987082481384, + "learning_rate": 3.843103448275862e-05, + "loss": 0.1131, + "step": 2013 + }, + { + "epoch": 2.334564949648627, + "grad_norm": 0.25275173783302307, + "learning_rate": 3.842528735632184e-05, + "loss": 0.1209, + "step": 2014 + }, + { + "epoch": 2.3357241179453743, + "grad_norm": 0.2534005045890808, + "learning_rate": 3.841954022988506e-05, + "loss": 0.1266, + "step": 2015 + }, + { + "epoch": 2.3368832862421214, + "grad_norm": 0.26923778653144836, + "learning_rate": 3.841379310344828e-05, + "loss": 0.1289, + "step": 2016 + }, + { + "epoch": 2.3380424545388685, + "grad_norm": 0.23470233380794525, + "learning_rate": 3.84080459770115e-05, + "loss": 0.1097, + "step": 2017 + }, + { + "epoch": 2.3392016228356156, + "grad_norm": 0.29632678627967834, + "learning_rate": 3.8402298850574715e-05, + "loss": 0.1136, + "step": 2018 + }, + { + "epoch": 2.3403607911323627, + "grad_norm": 0.22054457664489746, + "learning_rate": 3.839655172413793e-05, + "loss": 0.1087, + "step": 2019 + }, + { + "epoch": 2.3415199594291094, + "grad_norm": 0.29391518235206604, + "learning_rate": 3.839080459770115e-05, + "loss": 0.1152, + "step": 2020 + }, + { + "epoch": 2.3426791277258565, + "grad_norm": 0.2556963264942169, + "learning_rate": 3.8385057471264366e-05, + "loss": 0.1298, + "step": 2021 + }, + { + "epoch": 2.3438382960226036, + "grad_norm": 0.20713378489017487, + "learning_rate": 3.837931034482759e-05, + "loss": 0.1121, + "step": 2022 + }, + { + "epoch": 2.3449974643193507, + "grad_norm": 0.30257120728492737, + "learning_rate": 3.837356321839081e-05, + "loss": 0.1267, + "step": 2023 + }, + { + "epoch": 2.346156632616098, + "grad_norm": 0.3050404489040375, + "learning_rate": 3.8367816091954025e-05, + "loss": 0.1361, + "step": 2024 + }, + { + "epoch": 2.347315800912845, + "grad_norm": 0.1983451396226883, + "learning_rate": 3.8362068965517246e-05, + "loss": 0.1103, + "step": 2025 + }, + { + "epoch": 2.348474969209592, + "grad_norm": 0.23524372279644012, + "learning_rate": 3.835632183908046e-05, + "loss": 0.1147, + "step": 2026 + }, + { + "epoch": 2.349634137506339, + "grad_norm": 0.2787211537361145, + "learning_rate": 3.8350574712643676e-05, + "loss": 0.1175, + "step": 2027 + }, + { + "epoch": 2.3507933058030863, + "grad_norm": 0.2609887421131134, + "learning_rate": 3.83448275862069e-05, + "loss": 0.118, + "step": 2028 + }, + { + "epoch": 2.3519524740998334, + "grad_norm": 0.2685818076133728, + "learning_rate": 3.833908045977011e-05, + "loss": 0.1245, + "step": 2029 + }, + { + "epoch": 2.3531116423965805, + "grad_norm": 0.25744444131851196, + "learning_rate": 3.8333333333333334e-05, + "loss": 0.1141, + "step": 2030 + }, + { + "epoch": 2.3542708106933277, + "grad_norm": 0.22216452658176422, + "learning_rate": 3.8327586206896556e-05, + "loss": 0.1097, + "step": 2031 + }, + { + "epoch": 2.355429978990075, + "grad_norm": 0.23163087666034698, + "learning_rate": 3.832183908045977e-05, + "loss": 0.1042, + "step": 2032 + }, + { + "epoch": 2.356589147286822, + "grad_norm": 0.3125256896018982, + "learning_rate": 3.831609195402299e-05, + "loss": 0.1074, + "step": 2033 + }, + { + "epoch": 2.357748315583569, + "grad_norm": 0.35050761699676514, + "learning_rate": 3.831034482758621e-05, + "loss": 0.1332, + "step": 2034 + }, + { + "epoch": 2.3589074838803157, + "grad_norm": 0.2466825395822525, + "learning_rate": 3.830459770114942e-05, + "loss": 0.1192, + "step": 2035 + }, + { + "epoch": 2.360066652177063, + "grad_norm": 0.26629340648651123, + "learning_rate": 3.8298850574712644e-05, + "loss": 0.1227, + "step": 2036 + }, + { + "epoch": 2.36122582047381, + "grad_norm": 0.3505549728870392, + "learning_rate": 3.8293103448275866e-05, + "loss": 0.1138, + "step": 2037 + }, + { + "epoch": 2.362384988770557, + "grad_norm": 0.22414235770702362, + "learning_rate": 3.828735632183908e-05, + "loss": 0.1087, + "step": 2038 + }, + { + "epoch": 2.363544157067304, + "grad_norm": 0.23031394183635712, + "learning_rate": 3.82816091954023e-05, + "loss": 0.1178, + "step": 2039 + }, + { + "epoch": 2.3647033253640513, + "grad_norm": 0.37229323387145996, + "learning_rate": 3.827586206896552e-05, + "loss": 0.1398, + "step": 2040 + }, + { + "epoch": 2.3658624936607984, + "grad_norm": 0.2622588872909546, + "learning_rate": 3.827011494252874e-05, + "loss": 0.1189, + "step": 2041 + }, + { + "epoch": 2.3670216619575455, + "grad_norm": 0.2855938673019409, + "learning_rate": 3.8264367816091954e-05, + "loss": 0.1256, + "step": 2042 + }, + { + "epoch": 2.3681808302542926, + "grad_norm": 0.3094041049480438, + "learning_rate": 3.8258620689655176e-05, + "loss": 0.1268, + "step": 2043 + }, + { + "epoch": 2.3693399985510397, + "grad_norm": 0.30594077706336975, + "learning_rate": 3.82528735632184e-05, + "loss": 0.1206, + "step": 2044 + }, + { + "epoch": 2.370499166847787, + "grad_norm": 0.3137442469596863, + "learning_rate": 3.824712643678161e-05, + "loss": 0.1346, + "step": 2045 + }, + { + "epoch": 2.371658335144534, + "grad_norm": 0.25371280312538147, + "learning_rate": 3.824137931034483e-05, + "loss": 0.121, + "step": 2046 + }, + { + "epoch": 2.372817503441281, + "grad_norm": 0.24225755035877228, + "learning_rate": 3.823563218390805e-05, + "loss": 0.1124, + "step": 2047 + }, + { + "epoch": 2.3739766717380277, + "grad_norm": 0.2342030107975006, + "learning_rate": 3.8229885057471264e-05, + "loss": 0.1014, + "step": 2048 + }, + { + "epoch": 2.3751358400347753, + "grad_norm": 0.3285464942455292, + "learning_rate": 3.8224137931034485e-05, + "loss": 0.122, + "step": 2049 + }, + { + "epoch": 2.376295008331522, + "grad_norm": 0.2837320864200592, + "learning_rate": 3.82183908045977e-05, + "loss": 0.1265, + "step": 2050 + }, + { + "epoch": 2.377454176628269, + "grad_norm": 0.30253443121910095, + "learning_rate": 3.821264367816092e-05, + "loss": 0.1191, + "step": 2051 + }, + { + "epoch": 2.378613344925016, + "grad_norm": 0.24019919335842133, + "learning_rate": 3.8206896551724144e-05, + "loss": 0.1035, + "step": 2052 + }, + { + "epoch": 2.3797725132217633, + "grad_norm": 0.2411479502916336, + "learning_rate": 3.820114942528736e-05, + "loss": 0.1238, + "step": 2053 + }, + { + "epoch": 2.3809316815185104, + "grad_norm": 0.2749464213848114, + "learning_rate": 3.8195402298850573e-05, + "loss": 0.1259, + "step": 2054 + }, + { + "epoch": 2.3820908498152575, + "grad_norm": 0.2516106963157654, + "learning_rate": 3.8189655172413795e-05, + "loss": 0.1173, + "step": 2055 + }, + { + "epoch": 2.3832500181120047, + "grad_norm": 0.2396615743637085, + "learning_rate": 3.818390804597701e-05, + "loss": 0.1057, + "step": 2056 + }, + { + "epoch": 2.3844091864087518, + "grad_norm": 0.23807305097579956, + "learning_rate": 3.817816091954023e-05, + "loss": 0.1216, + "step": 2057 + }, + { + "epoch": 2.385568354705499, + "grad_norm": 0.24773019552230835, + "learning_rate": 3.8172413793103453e-05, + "loss": 0.1243, + "step": 2058 + }, + { + "epoch": 2.386727523002246, + "grad_norm": 0.21717150509357452, + "learning_rate": 3.816666666666667e-05, + "loss": 0.1101, + "step": 2059 + }, + { + "epoch": 2.387886691298993, + "grad_norm": 0.28333330154418945, + "learning_rate": 3.816091954022989e-05, + "loss": 0.1183, + "step": 2060 + }, + { + "epoch": 2.3890458595957402, + "grad_norm": 0.22554628551006317, + "learning_rate": 3.8155172413793105e-05, + "loss": 0.1276, + "step": 2061 + }, + { + "epoch": 2.3902050278924873, + "grad_norm": 0.2866622507572174, + "learning_rate": 3.814942528735632e-05, + "loss": 0.1378, + "step": 2062 + }, + { + "epoch": 2.391364196189234, + "grad_norm": 0.25763562321662903, + "learning_rate": 3.814367816091954e-05, + "loss": 0.1174, + "step": 2063 + }, + { + "epoch": 2.392523364485981, + "grad_norm": 0.24066491425037384, + "learning_rate": 3.813793103448276e-05, + "loss": 0.1193, + "step": 2064 + }, + { + "epoch": 2.3936825327827282, + "grad_norm": 0.27270743250846863, + "learning_rate": 3.813218390804598e-05, + "loss": 0.1219, + "step": 2065 + }, + { + "epoch": 2.3948417010794754, + "grad_norm": 0.28559228777885437, + "learning_rate": 3.81264367816092e-05, + "loss": 0.1345, + "step": 2066 + }, + { + "epoch": 2.3960008693762225, + "grad_norm": 0.27294817566871643, + "learning_rate": 3.8120689655172415e-05, + "loss": 0.1181, + "step": 2067 + }, + { + "epoch": 2.3971600376729696, + "grad_norm": 0.2862413227558136, + "learning_rate": 3.8114942528735636e-05, + "loss": 0.1141, + "step": 2068 + }, + { + "epoch": 2.3983192059697167, + "grad_norm": 0.26329490542411804, + "learning_rate": 3.810919540229885e-05, + "loss": 0.1233, + "step": 2069 + }, + { + "epoch": 2.399478374266464, + "grad_norm": 0.25338056683540344, + "learning_rate": 3.8103448275862066e-05, + "loss": 0.1157, + "step": 2070 + }, + { + "epoch": 2.400637542563211, + "grad_norm": 0.251355916261673, + "learning_rate": 3.809770114942529e-05, + "loss": 0.123, + "step": 2071 + }, + { + "epoch": 2.401796710859958, + "grad_norm": 0.32914426922798157, + "learning_rate": 3.809195402298851e-05, + "loss": 0.1366, + "step": 2072 + }, + { + "epoch": 2.402955879156705, + "grad_norm": 0.3044224977493286, + "learning_rate": 3.8086206896551724e-05, + "loss": 0.1273, + "step": 2073 + }, + { + "epoch": 2.4041150474534523, + "grad_norm": 0.2599372863769531, + "learning_rate": 3.8080459770114946e-05, + "loss": 0.1143, + "step": 2074 + }, + { + "epoch": 2.4052742157501994, + "grad_norm": 0.26074209809303284, + "learning_rate": 3.807471264367816e-05, + "loss": 0.121, + "step": 2075 + }, + { + "epoch": 2.406433384046946, + "grad_norm": 0.2514488399028778, + "learning_rate": 3.8068965517241376e-05, + "loss": 0.123, + "step": 2076 + }, + { + "epoch": 2.4075925523436936, + "grad_norm": 0.26624441146850586, + "learning_rate": 3.80632183908046e-05, + "loss": 0.1219, + "step": 2077 + }, + { + "epoch": 2.4087517206404403, + "grad_norm": 0.2705528140068054, + "learning_rate": 3.805747126436782e-05, + "loss": 0.1155, + "step": 2078 + }, + { + "epoch": 2.4099108889371874, + "grad_norm": 0.2879337668418884, + "learning_rate": 3.805172413793104e-05, + "loss": 0.1241, + "step": 2079 + }, + { + "epoch": 2.4110700572339345, + "grad_norm": 0.2518971264362335, + "learning_rate": 3.8045977011494256e-05, + "loss": 0.1157, + "step": 2080 + }, + { + "epoch": 2.4122292255306816, + "grad_norm": 0.3543863296508789, + "learning_rate": 3.804022988505747e-05, + "loss": 0.1295, + "step": 2081 + }, + { + "epoch": 2.4133883938274288, + "grad_norm": 0.2539769113063812, + "learning_rate": 3.803448275862069e-05, + "loss": 0.1139, + "step": 2082 + }, + { + "epoch": 2.414547562124176, + "grad_norm": 0.271513432264328, + "learning_rate": 3.802873563218391e-05, + "loss": 0.1255, + "step": 2083 + }, + { + "epoch": 2.415706730420923, + "grad_norm": 0.3522006869316101, + "learning_rate": 3.802298850574713e-05, + "loss": 0.1294, + "step": 2084 + }, + { + "epoch": 2.41686589871767, + "grad_norm": 0.27951931953430176, + "learning_rate": 3.801724137931035e-05, + "loss": 0.1331, + "step": 2085 + }, + { + "epoch": 2.418025067014417, + "grad_norm": 0.2659072279930115, + "learning_rate": 3.8011494252873566e-05, + "loss": 0.1187, + "step": 2086 + }, + { + "epoch": 2.4191842353111643, + "grad_norm": 0.23229147493839264, + "learning_rate": 3.800574712643679e-05, + "loss": 0.1146, + "step": 2087 + }, + { + "epoch": 2.4203434036079114, + "grad_norm": 0.28749972581863403, + "learning_rate": 3.8e-05, + "loss": 0.123, + "step": 2088 + }, + { + "epoch": 2.4215025719046586, + "grad_norm": 0.3175642788410187, + "learning_rate": 3.799425287356322e-05, + "loss": 0.1264, + "step": 2089 + }, + { + "epoch": 2.4226617402014057, + "grad_norm": 0.25972867012023926, + "learning_rate": 3.798850574712644e-05, + "loss": 0.1193, + "step": 2090 + }, + { + "epoch": 2.4238209084981523, + "grad_norm": 0.26062315702438354, + "learning_rate": 3.7982758620689654e-05, + "loss": 0.121, + "step": 2091 + }, + { + "epoch": 2.4249800767949, + "grad_norm": 0.25162971019744873, + "learning_rate": 3.7977011494252875e-05, + "loss": 0.1115, + "step": 2092 + }, + { + "epoch": 2.4261392450916466, + "grad_norm": 0.2526095509529114, + "learning_rate": 3.79712643678161e-05, + "loss": 0.1135, + "step": 2093 + }, + { + "epoch": 2.4272984133883937, + "grad_norm": 0.25769245624542236, + "learning_rate": 3.796551724137931e-05, + "loss": 0.116, + "step": 2094 + }, + { + "epoch": 2.428457581685141, + "grad_norm": 0.2587513029575348, + "learning_rate": 3.7959770114942534e-05, + "loss": 0.1046, + "step": 2095 + }, + { + "epoch": 2.429616749981888, + "grad_norm": 0.2877984941005707, + "learning_rate": 3.795402298850575e-05, + "loss": 0.1147, + "step": 2096 + }, + { + "epoch": 2.430775918278635, + "grad_norm": 0.28424739837646484, + "learning_rate": 3.794827586206896e-05, + "loss": 0.1188, + "step": 2097 + }, + { + "epoch": 2.431935086575382, + "grad_norm": 0.38208964467048645, + "learning_rate": 3.7942528735632185e-05, + "loss": 0.1128, + "step": 2098 + }, + { + "epoch": 2.4330942548721293, + "grad_norm": 0.27642908692359924, + "learning_rate": 3.793678160919541e-05, + "loss": 0.1336, + "step": 2099 + }, + { + "epoch": 2.4342534231688764, + "grad_norm": 0.2712388336658478, + "learning_rate": 3.793103448275862e-05, + "loss": 0.1335, + "step": 2100 + }, + { + "epoch": 2.4354125914656235, + "grad_norm": 0.3354480564594269, + "learning_rate": 3.792528735632184e-05, + "loss": 0.1292, + "step": 2101 + }, + { + "epoch": 2.4365717597623706, + "grad_norm": 0.2718063294887543, + "learning_rate": 3.791954022988506e-05, + "loss": 0.1318, + "step": 2102 + }, + { + "epoch": 2.4377309280591177, + "grad_norm": 0.2873290777206421, + "learning_rate": 3.791379310344827e-05, + "loss": 0.1184, + "step": 2103 + }, + { + "epoch": 2.438890096355865, + "grad_norm": 0.2832820415496826, + "learning_rate": 3.7908045977011495e-05, + "loss": 0.1181, + "step": 2104 + }, + { + "epoch": 2.440049264652612, + "grad_norm": 0.27394813299179077, + "learning_rate": 3.7902298850574716e-05, + "loss": 0.1117, + "step": 2105 + }, + { + "epoch": 2.4412084329493586, + "grad_norm": 0.2313212752342224, + "learning_rate": 3.789655172413794e-05, + "loss": 0.1154, + "step": 2106 + }, + { + "epoch": 2.4423676012461057, + "grad_norm": 0.2213948369026184, + "learning_rate": 3.789080459770115e-05, + "loss": 0.1214, + "step": 2107 + }, + { + "epoch": 2.443526769542853, + "grad_norm": 0.2309620976448059, + "learning_rate": 3.788505747126437e-05, + "loss": 0.1104, + "step": 2108 + }, + { + "epoch": 2.4446859378396, + "grad_norm": 0.24604758620262146, + "learning_rate": 3.787931034482759e-05, + "loss": 0.1153, + "step": 2109 + }, + { + "epoch": 2.445845106136347, + "grad_norm": 0.21898497641086578, + "learning_rate": 3.7873563218390804e-05, + "loss": 0.1093, + "step": 2110 + }, + { + "epoch": 2.447004274433094, + "grad_norm": 0.24178272485733032, + "learning_rate": 3.786781609195402e-05, + "loss": 0.1178, + "step": 2111 + }, + { + "epoch": 2.4481634427298413, + "grad_norm": 0.28789666295051575, + "learning_rate": 3.786206896551725e-05, + "loss": 0.1074, + "step": 2112 + }, + { + "epoch": 2.4493226110265884, + "grad_norm": 0.22450456023216248, + "learning_rate": 3.785632183908046e-05, + "loss": 0.1111, + "step": 2113 + }, + { + "epoch": 2.4504817793233356, + "grad_norm": 0.23580452799797058, + "learning_rate": 3.7850574712643684e-05, + "loss": 0.1088, + "step": 2114 + }, + { + "epoch": 2.4516409476200827, + "grad_norm": 0.26990747451782227, + "learning_rate": 3.78448275862069e-05, + "loss": 0.1343, + "step": 2115 + }, + { + "epoch": 2.45280011591683, + "grad_norm": 0.3045923709869385, + "learning_rate": 3.7839080459770114e-05, + "loss": 0.1226, + "step": 2116 + }, + { + "epoch": 2.453959284213577, + "grad_norm": 0.24597370624542236, + "learning_rate": 3.7833333333333336e-05, + "loss": 0.1329, + "step": 2117 + }, + { + "epoch": 2.455118452510324, + "grad_norm": 0.32915717363357544, + "learning_rate": 3.782758620689655e-05, + "loss": 0.1257, + "step": 2118 + }, + { + "epoch": 2.4562776208070707, + "grad_norm": 0.2542544901371002, + "learning_rate": 3.782183908045977e-05, + "loss": 0.1222, + "step": 2119 + }, + { + "epoch": 2.4574367891038182, + "grad_norm": 0.28636133670806885, + "learning_rate": 3.7816091954022994e-05, + "loss": 0.1311, + "step": 2120 + }, + { + "epoch": 2.458595957400565, + "grad_norm": 0.2496955841779709, + "learning_rate": 3.781034482758621e-05, + "loss": 0.1221, + "step": 2121 + }, + { + "epoch": 2.459755125697312, + "grad_norm": 0.31861692667007446, + "learning_rate": 3.7804597701149424e-05, + "loss": 0.115, + "step": 2122 + }, + { + "epoch": 2.460914293994059, + "grad_norm": 0.27043020725250244, + "learning_rate": 3.7798850574712646e-05, + "loss": 0.1132, + "step": 2123 + }, + { + "epoch": 2.4620734622908063, + "grad_norm": 0.22890667617321014, + "learning_rate": 3.779310344827586e-05, + "loss": 0.1082, + "step": 2124 + }, + { + "epoch": 2.4632326305875534, + "grad_norm": 0.27371424436569214, + "learning_rate": 3.778735632183908e-05, + "loss": 0.1231, + "step": 2125 + }, + { + "epoch": 2.4643917988843005, + "grad_norm": 0.2712050974369049, + "learning_rate": 3.7781609195402304e-05, + "loss": 0.141, + "step": 2126 + }, + { + "epoch": 2.4655509671810476, + "grad_norm": 0.26597630977630615, + "learning_rate": 3.777586206896552e-05, + "loss": 0.1193, + "step": 2127 + }, + { + "epoch": 2.4667101354777947, + "grad_norm": 0.28506308794021606, + "learning_rate": 3.777011494252874e-05, + "loss": 0.1172, + "step": 2128 + }, + { + "epoch": 2.467869303774542, + "grad_norm": 0.2775918245315552, + "learning_rate": 3.7764367816091955e-05, + "loss": 0.1131, + "step": 2129 + }, + { + "epoch": 2.469028472071289, + "grad_norm": 0.2832791209220886, + "learning_rate": 3.775862068965517e-05, + "loss": 0.13, + "step": 2130 + }, + { + "epoch": 2.470187640368036, + "grad_norm": 0.2610412836074829, + "learning_rate": 3.775287356321839e-05, + "loss": 0.1195, + "step": 2131 + }, + { + "epoch": 2.471346808664783, + "grad_norm": 0.2954767346382141, + "learning_rate": 3.774712643678161e-05, + "loss": 0.1197, + "step": 2132 + }, + { + "epoch": 2.4725059769615303, + "grad_norm": 0.30459877848625183, + "learning_rate": 3.7741379310344835e-05, + "loss": 0.1233, + "step": 2133 + }, + { + "epoch": 2.473665145258277, + "grad_norm": 0.2544924318790436, + "learning_rate": 3.773563218390805e-05, + "loss": 0.1124, + "step": 2134 + }, + { + "epoch": 2.474824313555024, + "grad_norm": 0.3522142469882965, + "learning_rate": 3.7729885057471265e-05, + "loss": 0.1125, + "step": 2135 + }, + { + "epoch": 2.475983481851771, + "grad_norm": 0.2694511115550995, + "learning_rate": 3.772413793103449e-05, + "loss": 0.117, + "step": 2136 + }, + { + "epoch": 2.4771426501485183, + "grad_norm": 0.261785626411438, + "learning_rate": 3.77183908045977e-05, + "loss": 0.112, + "step": 2137 + }, + { + "epoch": 2.4783018184452654, + "grad_norm": 0.24989941716194153, + "learning_rate": 3.7712643678160917e-05, + "loss": 0.1075, + "step": 2138 + }, + { + "epoch": 2.4794609867420125, + "grad_norm": 0.3909321427345276, + "learning_rate": 3.770689655172414e-05, + "loss": 0.1222, + "step": 2139 + }, + { + "epoch": 2.4806201550387597, + "grad_norm": 0.2468433380126953, + "learning_rate": 3.770114942528736e-05, + "loss": 0.1193, + "step": 2140 + }, + { + "epoch": 2.4817793233355068, + "grad_norm": 0.27097734808921814, + "learning_rate": 3.7695402298850575e-05, + "loss": 0.1379, + "step": 2141 + }, + { + "epoch": 2.482938491632254, + "grad_norm": 0.26425981521606445, + "learning_rate": 3.7689655172413797e-05, + "loss": 0.1156, + "step": 2142 + }, + { + "epoch": 2.484097659929001, + "grad_norm": 0.23942025005817413, + "learning_rate": 3.768390804597701e-05, + "loss": 0.1217, + "step": 2143 + }, + { + "epoch": 2.485256828225748, + "grad_norm": 0.27993398904800415, + "learning_rate": 3.767816091954023e-05, + "loss": 0.1239, + "step": 2144 + }, + { + "epoch": 2.4864159965224952, + "grad_norm": 0.2943548262119293, + "learning_rate": 3.767241379310345e-05, + "loss": 0.1265, + "step": 2145 + }, + { + "epoch": 2.4875751648192423, + "grad_norm": 0.33951184153556824, + "learning_rate": 3.766666666666667e-05, + "loss": 0.1267, + "step": 2146 + }, + { + "epoch": 2.4887343331159895, + "grad_norm": 0.2834022641181946, + "learning_rate": 3.766091954022989e-05, + "loss": 0.1261, + "step": 2147 + }, + { + "epoch": 2.4898935014127366, + "grad_norm": 0.2666892111301422, + "learning_rate": 3.7655172413793106e-05, + "loss": 0.1148, + "step": 2148 + }, + { + "epoch": 2.4910526697094832, + "grad_norm": 0.22419312596321106, + "learning_rate": 3.764942528735632e-05, + "loss": 0.1095, + "step": 2149 + }, + { + "epoch": 2.4922118380062304, + "grad_norm": 0.2352280616760254, + "learning_rate": 3.764367816091954e-05, + "loss": 0.1183, + "step": 2150 + }, + { + "epoch": 2.4933710063029775, + "grad_norm": 0.2600967288017273, + "learning_rate": 3.763793103448276e-05, + "loss": 0.1197, + "step": 2151 + }, + { + "epoch": 2.4945301745997246, + "grad_norm": 0.25782260298728943, + "learning_rate": 3.763218390804598e-05, + "loss": 0.1201, + "step": 2152 + }, + { + "epoch": 2.4956893428964717, + "grad_norm": 0.2279644012451172, + "learning_rate": 3.76264367816092e-05, + "loss": 0.1141, + "step": 2153 + }, + { + "epoch": 2.496848511193219, + "grad_norm": 0.3026202917098999, + "learning_rate": 3.7620689655172416e-05, + "loss": 0.1371, + "step": 2154 + }, + { + "epoch": 2.498007679489966, + "grad_norm": 0.18554648756980896, + "learning_rate": 3.761494252873564e-05, + "loss": 0.0954, + "step": 2155 + }, + { + "epoch": 2.499166847786713, + "grad_norm": 0.22495919466018677, + "learning_rate": 3.760919540229885e-05, + "loss": 0.1215, + "step": 2156 + }, + { + "epoch": 2.50032601608346, + "grad_norm": 0.26901552081108093, + "learning_rate": 3.760344827586207e-05, + "loss": 0.1227, + "step": 2157 + }, + { + "epoch": 2.5014851843802073, + "grad_norm": 0.2286049723625183, + "learning_rate": 3.759770114942529e-05, + "loss": 0.1266, + "step": 2158 + }, + { + "epoch": 2.5026443526769544, + "grad_norm": 0.2322368025779724, + "learning_rate": 3.7591954022988504e-05, + "loss": 0.1073, + "step": 2159 + }, + { + "epoch": 2.5038035209737015, + "grad_norm": 0.24827679991722107, + "learning_rate": 3.7586206896551726e-05, + "loss": 0.1278, + "step": 2160 + }, + { + "epoch": 2.5049626892704486, + "grad_norm": 0.24690257012844086, + "learning_rate": 3.758045977011495e-05, + "loss": 0.1126, + "step": 2161 + }, + { + "epoch": 2.5061218575671953, + "grad_norm": 0.2480195313692093, + "learning_rate": 3.757471264367816e-05, + "loss": 0.119, + "step": 2162 + }, + { + "epoch": 2.507281025863943, + "grad_norm": 0.21804195642471313, + "learning_rate": 3.7568965517241384e-05, + "loss": 0.1119, + "step": 2163 + }, + { + "epoch": 2.5084401941606895, + "grad_norm": 0.25382915139198303, + "learning_rate": 3.75632183908046e-05, + "loss": 0.1198, + "step": 2164 + }, + { + "epoch": 2.5095993624574366, + "grad_norm": 0.24101321399211884, + "learning_rate": 3.7557471264367814e-05, + "loss": 0.1187, + "step": 2165 + }, + { + "epoch": 2.5107585307541838, + "grad_norm": 0.28679129481315613, + "learning_rate": 3.7551724137931035e-05, + "loss": 0.127, + "step": 2166 + }, + { + "epoch": 2.511917699050931, + "grad_norm": 0.2896977365016937, + "learning_rate": 3.754597701149426e-05, + "loss": 0.1246, + "step": 2167 + }, + { + "epoch": 2.513076867347678, + "grad_norm": 0.31614774465560913, + "learning_rate": 3.754022988505747e-05, + "loss": 0.1283, + "step": 2168 + }, + { + "epoch": 2.514236035644425, + "grad_norm": 0.29304152727127075, + "learning_rate": 3.7534482758620694e-05, + "loss": 0.1263, + "step": 2169 + }, + { + "epoch": 2.515395203941172, + "grad_norm": 0.29353946447372437, + "learning_rate": 3.752873563218391e-05, + "loss": 0.1148, + "step": 2170 + }, + { + "epoch": 2.5165543722379193, + "grad_norm": 0.28586602210998535, + "learning_rate": 3.752298850574713e-05, + "loss": 0.1202, + "step": 2171 + }, + { + "epoch": 2.5177135405346664, + "grad_norm": 0.29421526193618774, + "learning_rate": 3.7517241379310345e-05, + "loss": 0.1329, + "step": 2172 + }, + { + "epoch": 2.5188727088314136, + "grad_norm": 0.23773986101150513, + "learning_rate": 3.751149425287357e-05, + "loss": 0.1212, + "step": 2173 + }, + { + "epoch": 2.5200318771281607, + "grad_norm": 0.2569189965724945, + "learning_rate": 3.750574712643679e-05, + "loss": 0.1142, + "step": 2174 + }, + { + "epoch": 2.521191045424908, + "grad_norm": 0.2963407337665558, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.1409, + "step": 2175 + }, + { + "epoch": 2.522350213721655, + "grad_norm": 0.24243849515914917, + "learning_rate": 3.749425287356322e-05, + "loss": 0.1304, + "step": 2176 + }, + { + "epoch": 2.5235093820184016, + "grad_norm": 0.27366575598716736, + "learning_rate": 3.748850574712644e-05, + "loss": 0.1381, + "step": 2177 + }, + { + "epoch": 2.524668550315149, + "grad_norm": 0.2454218864440918, + "learning_rate": 3.7482758620689655e-05, + "loss": 0.1145, + "step": 2178 + }, + { + "epoch": 2.525827718611896, + "grad_norm": 0.23390009999275208, + "learning_rate": 3.747701149425287e-05, + "loss": 0.1131, + "step": 2179 + }, + { + "epoch": 2.526986886908643, + "grad_norm": 0.24678659439086914, + "learning_rate": 3.747126436781609e-05, + "loss": 0.1215, + "step": 2180 + }, + { + "epoch": 2.52814605520539, + "grad_norm": 0.2842147648334503, + "learning_rate": 3.746551724137931e-05, + "loss": 0.1298, + "step": 2181 + }, + { + "epoch": 2.529305223502137, + "grad_norm": 0.3027459979057312, + "learning_rate": 3.7459770114942535e-05, + "loss": 0.1212, + "step": 2182 + }, + { + "epoch": 2.5304643917988843, + "grad_norm": 0.31403622031211853, + "learning_rate": 3.745402298850575e-05, + "loss": 0.1231, + "step": 2183 + }, + { + "epoch": 2.5316235600956314, + "grad_norm": 0.2930856943130493, + "learning_rate": 3.7448275862068965e-05, + "loss": 0.13, + "step": 2184 + }, + { + "epoch": 2.5327827283923785, + "grad_norm": 0.2218947857618332, + "learning_rate": 3.7442528735632186e-05, + "loss": 0.1079, + "step": 2185 + }, + { + "epoch": 2.5339418966891256, + "grad_norm": 0.2198045253753662, + "learning_rate": 3.74367816091954e-05, + "loss": 0.1077, + "step": 2186 + }, + { + "epoch": 2.5351010649858727, + "grad_norm": 0.198323592543602, + "learning_rate": 3.743103448275862e-05, + "loss": 0.11, + "step": 2187 + }, + { + "epoch": 2.53626023328262, + "grad_norm": 0.258024662733078, + "learning_rate": 3.7425287356321845e-05, + "loss": 0.1162, + "step": 2188 + }, + { + "epoch": 2.537419401579367, + "grad_norm": 0.3027409315109253, + "learning_rate": 3.741954022988506e-05, + "loss": 0.1383, + "step": 2189 + }, + { + "epoch": 2.5385785698761136, + "grad_norm": 0.27792859077453613, + "learning_rate": 3.741379310344828e-05, + "loss": 0.1138, + "step": 2190 + }, + { + "epoch": 2.539737738172861, + "grad_norm": 0.2331332266330719, + "learning_rate": 3.7408045977011496e-05, + "loss": 0.1065, + "step": 2191 + }, + { + "epoch": 2.540896906469608, + "grad_norm": 0.21128974854946136, + "learning_rate": 3.740229885057471e-05, + "loss": 0.1197, + "step": 2192 + }, + { + "epoch": 2.542056074766355, + "grad_norm": 0.2796282172203064, + "learning_rate": 3.739655172413793e-05, + "loss": 0.1131, + "step": 2193 + }, + { + "epoch": 2.543215243063102, + "grad_norm": 0.24139229953289032, + "learning_rate": 3.7390804597701154e-05, + "loss": 0.1165, + "step": 2194 + }, + { + "epoch": 2.544374411359849, + "grad_norm": 0.22860391438007355, + "learning_rate": 3.738505747126437e-05, + "loss": 0.1176, + "step": 2195 + }, + { + "epoch": 2.5455335796565963, + "grad_norm": 0.2661796510219574, + "learning_rate": 3.737931034482759e-05, + "loss": 0.1295, + "step": 2196 + }, + { + "epoch": 2.5466927479533434, + "grad_norm": 0.28988024592399597, + "learning_rate": 3.7373563218390806e-05, + "loss": 0.1363, + "step": 2197 + }, + { + "epoch": 2.5478519162500906, + "grad_norm": 0.26467469334602356, + "learning_rate": 3.736781609195402e-05, + "loss": 0.1078, + "step": 2198 + }, + { + "epoch": 2.5490110845468377, + "grad_norm": 0.3318430483341217, + "learning_rate": 3.736206896551724e-05, + "loss": 0.1299, + "step": 2199 + }, + { + "epoch": 2.550170252843585, + "grad_norm": 0.24909447133541107, + "learning_rate": 3.735632183908046e-05, + "loss": 0.1019, + "step": 2200 + }, + { + "epoch": 2.551329421140332, + "grad_norm": 0.3391667604446411, + "learning_rate": 3.735057471264368e-05, + "loss": 0.1336, + "step": 2201 + }, + { + "epoch": 2.552488589437079, + "grad_norm": 0.2599574327468872, + "learning_rate": 3.73448275862069e-05, + "loss": 0.1154, + "step": 2202 + }, + { + "epoch": 2.553647757733826, + "grad_norm": 0.22761644423007965, + "learning_rate": 3.7339080459770116e-05, + "loss": 0.0995, + "step": 2203 + }, + { + "epoch": 2.5548069260305732, + "grad_norm": 0.2874188721179962, + "learning_rate": 3.733333333333334e-05, + "loss": 0.1172, + "step": 2204 + }, + { + "epoch": 2.55596609432732, + "grad_norm": 0.2598547041416168, + "learning_rate": 3.732758620689655e-05, + "loss": 0.1315, + "step": 2205 + }, + { + "epoch": 2.5571252626240675, + "grad_norm": 0.2864150106906891, + "learning_rate": 3.732183908045977e-05, + "loss": 0.1163, + "step": 2206 + }, + { + "epoch": 2.558284430920814, + "grad_norm": 0.3307492434978485, + "learning_rate": 3.731609195402299e-05, + "loss": 0.1267, + "step": 2207 + }, + { + "epoch": 2.5594435992175613, + "grad_norm": 0.2648185193538666, + "learning_rate": 3.731034482758621e-05, + "loss": 0.1218, + "step": 2208 + }, + { + "epoch": 2.5606027675143084, + "grad_norm": 0.21011538803577423, + "learning_rate": 3.730459770114943e-05, + "loss": 0.1069, + "step": 2209 + }, + { + "epoch": 2.5617619358110555, + "grad_norm": 0.31918808817863464, + "learning_rate": 3.729885057471265e-05, + "loss": 0.1256, + "step": 2210 + }, + { + "epoch": 2.5629211041078026, + "grad_norm": 0.28522172570228577, + "learning_rate": 3.729310344827586e-05, + "loss": 0.1294, + "step": 2211 + }, + { + "epoch": 2.5640802724045497, + "grad_norm": 0.21739451587200165, + "learning_rate": 3.7287356321839084e-05, + "loss": 0.1001, + "step": 2212 + }, + { + "epoch": 2.565239440701297, + "grad_norm": 0.23575380444526672, + "learning_rate": 3.72816091954023e-05, + "loss": 0.1192, + "step": 2213 + }, + { + "epoch": 2.566398608998044, + "grad_norm": 0.20924952626228333, + "learning_rate": 3.727586206896552e-05, + "loss": 0.1079, + "step": 2214 + }, + { + "epoch": 2.567557777294791, + "grad_norm": 0.31391969323158264, + "learning_rate": 3.727011494252874e-05, + "loss": 0.1236, + "step": 2215 + }, + { + "epoch": 2.568716945591538, + "grad_norm": 0.2599903643131256, + "learning_rate": 3.726436781609196e-05, + "loss": 0.1175, + "step": 2216 + }, + { + "epoch": 2.5698761138882853, + "grad_norm": 0.2094232141971588, + "learning_rate": 3.725862068965517e-05, + "loss": 0.1147, + "step": 2217 + }, + { + "epoch": 2.571035282185032, + "grad_norm": 0.2692113518714905, + "learning_rate": 3.725287356321839e-05, + "loss": 0.125, + "step": 2218 + }, + { + "epoch": 2.5721944504817795, + "grad_norm": 0.28311142325401306, + "learning_rate": 3.724712643678161e-05, + "loss": 0.1233, + "step": 2219 + }, + { + "epoch": 2.573353618778526, + "grad_norm": 0.31285127997398376, + "learning_rate": 3.724137931034483e-05, + "loss": 0.1282, + "step": 2220 + }, + { + "epoch": 2.5745127870752738, + "grad_norm": 0.2766589820384979, + "learning_rate": 3.7235632183908045e-05, + "loss": 0.1163, + "step": 2221 + }, + { + "epoch": 2.5756719553720204, + "grad_norm": 0.26953697204589844, + "learning_rate": 3.7229885057471267e-05, + "loss": 0.1296, + "step": 2222 + }, + { + "epoch": 2.5768311236687675, + "grad_norm": 0.23072504997253418, + "learning_rate": 3.722413793103449e-05, + "loss": 0.1202, + "step": 2223 + }, + { + "epoch": 2.5779902919655147, + "grad_norm": 0.22109727561473846, + "learning_rate": 3.72183908045977e-05, + "loss": 0.1187, + "step": 2224 + }, + { + "epoch": 2.5791494602622618, + "grad_norm": 0.28811052441596985, + "learning_rate": 3.721264367816092e-05, + "loss": 0.1201, + "step": 2225 + }, + { + "epoch": 2.580308628559009, + "grad_norm": 0.31841617822647095, + "learning_rate": 3.720689655172414e-05, + "loss": 0.1092, + "step": 2226 + }, + { + "epoch": 2.581467796855756, + "grad_norm": 0.27956047654151917, + "learning_rate": 3.7201149425287355e-05, + "loss": 0.1238, + "step": 2227 + }, + { + "epoch": 2.582626965152503, + "grad_norm": 0.28410202264785767, + "learning_rate": 3.7195402298850576e-05, + "loss": 0.122, + "step": 2228 + }, + { + "epoch": 2.5837861334492502, + "grad_norm": 0.23132778704166412, + "learning_rate": 3.71896551724138e-05, + "loss": 0.1221, + "step": 2229 + }, + { + "epoch": 2.5849453017459973, + "grad_norm": 0.27684658765792847, + "learning_rate": 3.718390804597701e-05, + "loss": 0.1099, + "step": 2230 + }, + { + "epoch": 2.5861044700427445, + "grad_norm": 0.241298645734787, + "learning_rate": 3.7178160919540235e-05, + "loss": 0.11, + "step": 2231 + }, + { + "epoch": 2.5872636383394916, + "grad_norm": 0.23437266051769257, + "learning_rate": 3.717241379310345e-05, + "loss": 0.1234, + "step": 2232 + }, + { + "epoch": 2.5884228066362382, + "grad_norm": 0.27701497077941895, + "learning_rate": 3.7166666666666664e-05, + "loss": 0.1105, + "step": 2233 + }, + { + "epoch": 2.589581974932986, + "grad_norm": 0.24334724247455597, + "learning_rate": 3.7160919540229886e-05, + "loss": 0.1087, + "step": 2234 + }, + { + "epoch": 2.5907411432297325, + "grad_norm": 0.25163525342941284, + "learning_rate": 3.715517241379311e-05, + "loss": 0.1084, + "step": 2235 + }, + { + "epoch": 2.5919003115264796, + "grad_norm": 0.313559353351593, + "learning_rate": 3.714942528735632e-05, + "loss": 0.1397, + "step": 2236 + }, + { + "epoch": 2.5930594798232267, + "grad_norm": 0.2449546456336975, + "learning_rate": 3.7143678160919544e-05, + "loss": 0.1127, + "step": 2237 + }, + { + "epoch": 2.594218648119974, + "grad_norm": 0.2583812475204468, + "learning_rate": 3.713793103448276e-05, + "loss": 0.1089, + "step": 2238 + }, + { + "epoch": 2.595377816416721, + "grad_norm": 0.2844179570674896, + "learning_rate": 3.713218390804598e-05, + "loss": 0.1166, + "step": 2239 + }, + { + "epoch": 2.596536984713468, + "grad_norm": 0.2738819718360901, + "learning_rate": 3.7126436781609196e-05, + "loss": 0.1199, + "step": 2240 + }, + { + "epoch": 2.597696153010215, + "grad_norm": 0.253930002450943, + "learning_rate": 3.712068965517241e-05, + "loss": 0.1134, + "step": 2241 + }, + { + "epoch": 2.5988553213069623, + "grad_norm": 0.2768709659576416, + "learning_rate": 3.711494252873563e-05, + "loss": 0.1121, + "step": 2242 + }, + { + "epoch": 2.6000144896037094, + "grad_norm": 0.2729703187942505, + "learning_rate": 3.7109195402298854e-05, + "loss": 0.1197, + "step": 2243 + }, + { + "epoch": 2.6011736579004565, + "grad_norm": 0.3085278570652008, + "learning_rate": 3.710344827586207e-05, + "loss": 0.1363, + "step": 2244 + }, + { + "epoch": 2.6023328261972036, + "grad_norm": 0.2976699471473694, + "learning_rate": 3.709770114942529e-05, + "loss": 0.1215, + "step": 2245 + }, + { + "epoch": 2.6034919944939507, + "grad_norm": 0.24314960837364197, + "learning_rate": 3.7091954022988505e-05, + "loss": 0.1124, + "step": 2246 + }, + { + "epoch": 2.604651162790698, + "grad_norm": 0.20558379590511322, + "learning_rate": 3.708620689655173e-05, + "loss": 0.1064, + "step": 2247 + }, + { + "epoch": 2.6058103310874445, + "grad_norm": 0.2385900467634201, + "learning_rate": 3.708045977011494e-05, + "loss": 0.1125, + "step": 2248 + }, + { + "epoch": 2.606969499384192, + "grad_norm": 0.3381904363632202, + "learning_rate": 3.7074712643678164e-05, + "loss": 0.1222, + "step": 2249 + }, + { + "epoch": 2.6081286676809388, + "grad_norm": 0.24625416100025177, + "learning_rate": 3.7068965517241385e-05, + "loss": 0.1172, + "step": 2250 + }, + { + "epoch": 2.609287835977686, + "grad_norm": 0.2692759931087494, + "learning_rate": 3.70632183908046e-05, + "loss": 0.1275, + "step": 2251 + }, + { + "epoch": 2.610447004274433, + "grad_norm": 0.2261049598455429, + "learning_rate": 3.7057471264367815e-05, + "loss": 0.1132, + "step": 2252 + }, + { + "epoch": 2.61160617257118, + "grad_norm": 0.2826228141784668, + "learning_rate": 3.705172413793104e-05, + "loss": 0.1276, + "step": 2253 + }, + { + "epoch": 2.612765340867927, + "grad_norm": 0.24635249376296997, + "learning_rate": 3.704597701149425e-05, + "loss": 0.1231, + "step": 2254 + }, + { + "epoch": 2.6139245091646743, + "grad_norm": 0.2234896570444107, + "learning_rate": 3.7040229885057473e-05, + "loss": 0.1048, + "step": 2255 + }, + { + "epoch": 2.6150836774614215, + "grad_norm": 0.2619444727897644, + "learning_rate": 3.7034482758620695e-05, + "loss": 0.1306, + "step": 2256 + }, + { + "epoch": 2.6162428457581686, + "grad_norm": 0.3317265808582306, + "learning_rate": 3.702873563218391e-05, + "loss": 0.1275, + "step": 2257 + }, + { + "epoch": 2.6174020140549157, + "grad_norm": 0.2708195447921753, + "learning_rate": 3.702298850574713e-05, + "loss": 0.1133, + "step": 2258 + }, + { + "epoch": 2.618561182351663, + "grad_norm": 0.2827620208263397, + "learning_rate": 3.701724137931035e-05, + "loss": 0.1307, + "step": 2259 + }, + { + "epoch": 2.61972035064841, + "grad_norm": 0.24215327203273773, + "learning_rate": 3.701149425287356e-05, + "loss": 0.1187, + "step": 2260 + }, + { + "epoch": 2.6208795189451566, + "grad_norm": 0.2541255056858063, + "learning_rate": 3.700574712643678e-05, + "loss": 0.1215, + "step": 2261 + }, + { + "epoch": 2.622038687241904, + "grad_norm": 0.2253810614347458, + "learning_rate": 3.7e-05, + "loss": 0.1169, + "step": 2262 + }, + { + "epoch": 2.623197855538651, + "grad_norm": 0.27330759167671204, + "learning_rate": 3.699425287356322e-05, + "loss": 0.1285, + "step": 2263 + }, + { + "epoch": 2.6243570238353984, + "grad_norm": 0.2241210639476776, + "learning_rate": 3.698850574712644e-05, + "loss": 0.1143, + "step": 2264 + }, + { + "epoch": 2.625516192132145, + "grad_norm": 0.28012943267822266, + "learning_rate": 3.6982758620689656e-05, + "loss": 0.1303, + "step": 2265 + }, + { + "epoch": 2.626675360428892, + "grad_norm": 0.2770480215549469, + "learning_rate": 3.697701149425288e-05, + "loss": 0.1294, + "step": 2266 + }, + { + "epoch": 2.6278345287256393, + "grad_norm": 0.22335301339626312, + "learning_rate": 3.697126436781609e-05, + "loss": 0.1223, + "step": 2267 + }, + { + "epoch": 2.6289936970223864, + "grad_norm": 0.2822023332118988, + "learning_rate": 3.696551724137931e-05, + "loss": 0.123, + "step": 2268 + }, + { + "epoch": 2.6301528653191335, + "grad_norm": 0.2653454840183258, + "learning_rate": 3.695977011494253e-05, + "loss": 0.1153, + "step": 2269 + }, + { + "epoch": 2.6313120336158806, + "grad_norm": 0.2543680965900421, + "learning_rate": 3.695402298850575e-05, + "loss": 0.1211, + "step": 2270 + }, + { + "epoch": 2.6324712019126277, + "grad_norm": 0.3247624635696411, + "learning_rate": 3.6948275862068966e-05, + "loss": 0.1232, + "step": 2271 + }, + { + "epoch": 2.633630370209375, + "grad_norm": 0.30818063020706177, + "learning_rate": 3.694252873563219e-05, + "loss": 0.1255, + "step": 2272 + }, + { + "epoch": 2.634789538506122, + "grad_norm": 0.2480584979057312, + "learning_rate": 3.69367816091954e-05, + "loss": 0.1266, + "step": 2273 + }, + { + "epoch": 2.635948706802869, + "grad_norm": 0.2562277615070343, + "learning_rate": 3.6931034482758624e-05, + "loss": 0.1245, + "step": 2274 + }, + { + "epoch": 2.637107875099616, + "grad_norm": 0.21452297270298004, + "learning_rate": 3.692528735632184e-05, + "loss": 0.112, + "step": 2275 + }, + { + "epoch": 2.638267043396363, + "grad_norm": 0.20973417162895203, + "learning_rate": 3.691954022988506e-05, + "loss": 0.1148, + "step": 2276 + }, + { + "epoch": 2.6394262116931104, + "grad_norm": 0.2719014286994934, + "learning_rate": 3.691379310344828e-05, + "loss": 0.1165, + "step": 2277 + }, + { + "epoch": 2.640585379989857, + "grad_norm": 0.2603360414505005, + "learning_rate": 3.69080459770115e-05, + "loss": 0.1217, + "step": 2278 + }, + { + "epoch": 2.641744548286604, + "grad_norm": 0.2589358985424042, + "learning_rate": 3.690229885057471e-05, + "loss": 0.1261, + "step": 2279 + }, + { + "epoch": 2.6429037165833513, + "grad_norm": 0.2366466522216797, + "learning_rate": 3.6896551724137934e-05, + "loss": 0.112, + "step": 2280 + }, + { + "epoch": 2.6440628848800984, + "grad_norm": 0.3076455891132355, + "learning_rate": 3.689080459770115e-05, + "loss": 0.1337, + "step": 2281 + }, + { + "epoch": 2.6452220531768456, + "grad_norm": 0.24393005669116974, + "learning_rate": 3.6885057471264364e-05, + "loss": 0.1198, + "step": 2282 + }, + { + "epoch": 2.6463812214735927, + "grad_norm": 0.23066796362400055, + "learning_rate": 3.6879310344827586e-05, + "loss": 0.1197, + "step": 2283 + }, + { + "epoch": 2.64754038977034, + "grad_norm": 0.22284121811389923, + "learning_rate": 3.687356321839081e-05, + "loss": 0.1076, + "step": 2284 + }, + { + "epoch": 2.648699558067087, + "grad_norm": 0.2574465870857239, + "learning_rate": 3.686781609195403e-05, + "loss": 0.1185, + "step": 2285 + }, + { + "epoch": 2.649858726363834, + "grad_norm": 0.2645076811313629, + "learning_rate": 3.6862068965517244e-05, + "loss": 0.1175, + "step": 2286 + }, + { + "epoch": 2.651017894660581, + "grad_norm": 0.25155895948410034, + "learning_rate": 3.685632183908046e-05, + "loss": 0.1228, + "step": 2287 + }, + { + "epoch": 2.6521770629573282, + "grad_norm": 0.26526379585266113, + "learning_rate": 3.685057471264368e-05, + "loss": 0.1196, + "step": 2288 + }, + { + "epoch": 2.6533362312540754, + "grad_norm": 0.25388583540916443, + "learning_rate": 3.6844827586206895e-05, + "loss": 0.1181, + "step": 2289 + }, + { + "epoch": 2.6544953995508225, + "grad_norm": 0.31535226106643677, + "learning_rate": 3.683908045977012e-05, + "loss": 0.1282, + "step": 2290 + }, + { + "epoch": 2.655654567847569, + "grad_norm": 0.2896163761615753, + "learning_rate": 3.683333333333334e-05, + "loss": 0.1271, + "step": 2291 + }, + { + "epoch": 2.6568137361443167, + "grad_norm": 0.26265934109687805, + "learning_rate": 3.6827586206896554e-05, + "loss": 0.1289, + "step": 2292 + }, + { + "epoch": 2.6579729044410634, + "grad_norm": 0.25363272428512573, + "learning_rate": 3.6821839080459775e-05, + "loss": 0.1212, + "step": 2293 + }, + { + "epoch": 2.6591320727378105, + "grad_norm": 0.304082453250885, + "learning_rate": 3.681609195402299e-05, + "loss": 0.1121, + "step": 2294 + }, + { + "epoch": 2.6602912410345576, + "grad_norm": 0.21810661256313324, + "learning_rate": 3.6810344827586205e-05, + "loss": 0.1177, + "step": 2295 + }, + { + "epoch": 2.6614504093313047, + "grad_norm": 0.24036329984664917, + "learning_rate": 3.680459770114943e-05, + "loss": 0.109, + "step": 2296 + }, + { + "epoch": 2.662609577628052, + "grad_norm": 0.2967372238636017, + "learning_rate": 3.679885057471265e-05, + "loss": 0.1218, + "step": 2297 + }, + { + "epoch": 2.663768745924799, + "grad_norm": 0.3007439374923706, + "learning_rate": 3.679310344827586e-05, + "loss": 0.1286, + "step": 2298 + }, + { + "epoch": 2.664927914221546, + "grad_norm": 0.26587292551994324, + "learning_rate": 3.6787356321839085e-05, + "loss": 0.1242, + "step": 2299 + }, + { + "epoch": 2.666087082518293, + "grad_norm": 0.2876141369342804, + "learning_rate": 3.67816091954023e-05, + "loss": 0.1322, + "step": 2300 + }, + { + "epoch": 2.6672462508150403, + "grad_norm": 0.27488911151885986, + "learning_rate": 3.6775862068965515e-05, + "loss": 0.1218, + "step": 2301 + }, + { + "epoch": 2.6684054191117874, + "grad_norm": 0.2519826292991638, + "learning_rate": 3.6770114942528736e-05, + "loss": 0.1138, + "step": 2302 + }, + { + "epoch": 2.6695645874085345, + "grad_norm": 0.26412853598594666, + "learning_rate": 3.676436781609195e-05, + "loss": 0.1184, + "step": 2303 + }, + { + "epoch": 2.670723755705281, + "grad_norm": 0.24107958376407623, + "learning_rate": 3.675862068965518e-05, + "loss": 0.1159, + "step": 2304 + }, + { + "epoch": 2.6718829240020288, + "grad_norm": 0.25105899572372437, + "learning_rate": 3.6752873563218395e-05, + "loss": 0.1075, + "step": 2305 + }, + { + "epoch": 2.6730420922987754, + "grad_norm": 0.23559515178203583, + "learning_rate": 3.674712643678161e-05, + "loss": 0.1225, + "step": 2306 + }, + { + "epoch": 2.674201260595523, + "grad_norm": 0.23702873289585114, + "learning_rate": 3.674137931034483e-05, + "loss": 0.1221, + "step": 2307 + }, + { + "epoch": 2.6753604288922697, + "grad_norm": 0.21996693313121796, + "learning_rate": 3.6735632183908046e-05, + "loss": 0.1152, + "step": 2308 + }, + { + "epoch": 2.6765195971890168, + "grad_norm": 0.2361491173505783, + "learning_rate": 3.672988505747126e-05, + "loss": 0.115, + "step": 2309 + }, + { + "epoch": 2.677678765485764, + "grad_norm": 0.24125604331493378, + "learning_rate": 3.672413793103448e-05, + "loss": 0.1197, + "step": 2310 + }, + { + "epoch": 2.678837933782511, + "grad_norm": 0.23669910430908203, + "learning_rate": 3.6718390804597704e-05, + "loss": 0.1091, + "step": 2311 + }, + { + "epoch": 2.679997102079258, + "grad_norm": 0.26078295707702637, + "learning_rate": 3.6712643678160926e-05, + "loss": 0.1059, + "step": 2312 + }, + { + "epoch": 2.6811562703760052, + "grad_norm": 0.30441415309906006, + "learning_rate": 3.670689655172414e-05, + "loss": 0.1154, + "step": 2313 + }, + { + "epoch": 2.6823154386727524, + "grad_norm": 0.22218701243400574, + "learning_rate": 3.6701149425287356e-05, + "loss": 0.1235, + "step": 2314 + }, + { + "epoch": 2.6834746069694995, + "grad_norm": 0.25640806555747986, + "learning_rate": 3.669540229885058e-05, + "loss": 0.1218, + "step": 2315 + }, + { + "epoch": 2.6846337752662466, + "grad_norm": 0.2934137284755707, + "learning_rate": 3.668965517241379e-05, + "loss": 0.1164, + "step": 2316 + }, + { + "epoch": 2.6857929435629937, + "grad_norm": 0.27650073170661926, + "learning_rate": 3.6683908045977014e-05, + "loss": 0.1183, + "step": 2317 + }, + { + "epoch": 2.686952111859741, + "grad_norm": 0.24524487555027008, + "learning_rate": 3.6678160919540236e-05, + "loss": 0.1197, + "step": 2318 + }, + { + "epoch": 2.6881112801564875, + "grad_norm": 0.3019874095916748, + "learning_rate": 3.667241379310345e-05, + "loss": 0.1294, + "step": 2319 + }, + { + "epoch": 2.689270448453235, + "grad_norm": 0.321593314409256, + "learning_rate": 3.6666666666666666e-05, + "loss": 0.1279, + "step": 2320 + }, + { + "epoch": 2.6904296167499817, + "grad_norm": 0.26887890696525574, + "learning_rate": 3.666091954022989e-05, + "loss": 0.1225, + "step": 2321 + }, + { + "epoch": 2.691588785046729, + "grad_norm": 0.24992501735687256, + "learning_rate": 3.66551724137931e-05, + "loss": 0.1136, + "step": 2322 + }, + { + "epoch": 2.692747953343476, + "grad_norm": 0.21639297902584076, + "learning_rate": 3.6649425287356324e-05, + "loss": 0.1074, + "step": 2323 + }, + { + "epoch": 2.693907121640223, + "grad_norm": 0.3038938045501709, + "learning_rate": 3.6643678160919546e-05, + "loss": 0.1307, + "step": 2324 + }, + { + "epoch": 2.69506628993697, + "grad_norm": 0.31261420249938965, + "learning_rate": 3.663793103448276e-05, + "loss": 0.1392, + "step": 2325 + }, + { + "epoch": 2.6962254582337173, + "grad_norm": 0.22381141781806946, + "learning_rate": 3.663218390804598e-05, + "loss": 0.1043, + "step": 2326 + }, + { + "epoch": 2.6973846265304644, + "grad_norm": 0.28085264563560486, + "learning_rate": 3.66264367816092e-05, + "loss": 0.1145, + "step": 2327 + }, + { + "epoch": 2.6985437948272115, + "grad_norm": 0.2581823468208313, + "learning_rate": 3.662068965517241e-05, + "loss": 0.1333, + "step": 2328 + }, + { + "epoch": 2.6997029631239586, + "grad_norm": 0.25244686007499695, + "learning_rate": 3.6614942528735634e-05, + "loss": 0.1249, + "step": 2329 + }, + { + "epoch": 2.7008621314207057, + "grad_norm": 0.24646331369876862, + "learning_rate": 3.660919540229885e-05, + "loss": 0.1142, + "step": 2330 + }, + { + "epoch": 2.702021299717453, + "grad_norm": 0.2165023535490036, + "learning_rate": 3.660344827586207e-05, + "loss": 0.1225, + "step": 2331 + }, + { + "epoch": 2.7031804680142, + "grad_norm": 0.25640419125556946, + "learning_rate": 3.659770114942529e-05, + "loss": 0.1242, + "step": 2332 + }, + { + "epoch": 2.704339636310947, + "grad_norm": 0.33699721097946167, + "learning_rate": 3.659195402298851e-05, + "loss": 0.1284, + "step": 2333 + }, + { + "epoch": 2.7054988046076938, + "grad_norm": 0.2463027685880661, + "learning_rate": 3.658620689655173e-05, + "loss": 0.1252, + "step": 2334 + }, + { + "epoch": 2.7066579729044413, + "grad_norm": 0.2819848954677582, + "learning_rate": 3.6580459770114943e-05, + "loss": 0.1097, + "step": 2335 + }, + { + "epoch": 2.707817141201188, + "grad_norm": 0.2654036283493042, + "learning_rate": 3.657471264367816e-05, + "loss": 0.1187, + "step": 2336 + }, + { + "epoch": 2.708976309497935, + "grad_norm": 0.2798532545566559, + "learning_rate": 3.656896551724138e-05, + "loss": 0.1169, + "step": 2337 + }, + { + "epoch": 2.7101354777946822, + "grad_norm": 0.24664916098117828, + "learning_rate": 3.65632183908046e-05, + "loss": 0.1337, + "step": 2338 + }, + { + "epoch": 2.7112946460914293, + "grad_norm": 0.2289317101240158, + "learning_rate": 3.655747126436782e-05, + "loss": 0.1255, + "step": 2339 + }, + { + "epoch": 2.7124538143881765, + "grad_norm": 0.28109264373779297, + "learning_rate": 3.655172413793104e-05, + "loss": 0.1219, + "step": 2340 + }, + { + "epoch": 2.7136129826849236, + "grad_norm": 0.2645872235298157, + "learning_rate": 3.654597701149425e-05, + "loss": 0.1365, + "step": 2341 + }, + { + "epoch": 2.7147721509816707, + "grad_norm": 0.2895812690258026, + "learning_rate": 3.6540229885057475e-05, + "loss": 0.1297, + "step": 2342 + }, + { + "epoch": 2.715931319278418, + "grad_norm": 0.2900197207927704, + "learning_rate": 3.653448275862069e-05, + "loss": 0.1319, + "step": 2343 + }, + { + "epoch": 2.717090487575165, + "grad_norm": 0.22161021828651428, + "learning_rate": 3.6528735632183905e-05, + "loss": 0.1153, + "step": 2344 + }, + { + "epoch": 2.718249655871912, + "grad_norm": 0.28364619612693787, + "learning_rate": 3.652298850574713e-05, + "loss": 0.1261, + "step": 2345 + }, + { + "epoch": 2.719408824168659, + "grad_norm": 0.2544122636318207, + "learning_rate": 3.651724137931035e-05, + "loss": 0.1175, + "step": 2346 + }, + { + "epoch": 2.720567992465406, + "grad_norm": 0.21084167063236237, + "learning_rate": 3.651149425287356e-05, + "loss": 0.1011, + "step": 2347 + }, + { + "epoch": 2.7217271607621534, + "grad_norm": 0.24158698320388794, + "learning_rate": 3.6505747126436785e-05, + "loss": 0.1125, + "step": 2348 + }, + { + "epoch": 2.7228863290589, + "grad_norm": 0.2733836770057678, + "learning_rate": 3.65e-05, + "loss": 0.1187, + "step": 2349 + }, + { + "epoch": 2.724045497355647, + "grad_norm": 0.32197657227516174, + "learning_rate": 3.649425287356322e-05, + "loss": 0.1252, + "step": 2350 + }, + { + "epoch": 2.7252046656523943, + "grad_norm": 0.2622753381729126, + "learning_rate": 3.6488505747126436e-05, + "loss": 0.1251, + "step": 2351 + }, + { + "epoch": 2.7263638339491414, + "grad_norm": 0.2564488351345062, + "learning_rate": 3.648275862068966e-05, + "loss": 0.1175, + "step": 2352 + }, + { + "epoch": 2.7275230022458885, + "grad_norm": 0.29768943786621094, + "learning_rate": 3.647701149425288e-05, + "loss": 0.1133, + "step": 2353 + }, + { + "epoch": 2.7286821705426356, + "grad_norm": 0.297397643327713, + "learning_rate": 3.6471264367816094e-05, + "loss": 0.121, + "step": 2354 + }, + { + "epoch": 2.7298413388393827, + "grad_norm": 0.29082873463630676, + "learning_rate": 3.646551724137931e-05, + "loss": 0.1182, + "step": 2355 + }, + { + "epoch": 2.73100050713613, + "grad_norm": 0.22852347791194916, + "learning_rate": 3.645977011494253e-05, + "loss": 0.1106, + "step": 2356 + }, + { + "epoch": 2.732159675432877, + "grad_norm": 0.24832133948802948, + "learning_rate": 3.6454022988505746e-05, + "loss": 0.1103, + "step": 2357 + }, + { + "epoch": 2.733318843729624, + "grad_norm": 0.2785024046897888, + "learning_rate": 3.644827586206897e-05, + "loss": 0.1094, + "step": 2358 + }, + { + "epoch": 2.734478012026371, + "grad_norm": 0.25696471333503723, + "learning_rate": 3.644252873563219e-05, + "loss": 0.1202, + "step": 2359 + }, + { + "epoch": 2.7356371803231183, + "grad_norm": 0.264342337846756, + "learning_rate": 3.6436781609195404e-05, + "loss": 0.1151, + "step": 2360 + }, + { + "epoch": 2.7367963486198654, + "grad_norm": 0.21554605662822723, + "learning_rate": 3.6431034482758626e-05, + "loss": 0.1109, + "step": 2361 + }, + { + "epoch": 2.737955516916612, + "grad_norm": 0.26814398169517517, + "learning_rate": 3.642528735632184e-05, + "loss": 0.1184, + "step": 2362 + }, + { + "epoch": 2.7391146852133597, + "grad_norm": 0.24080701172351837, + "learning_rate": 3.6419540229885056e-05, + "loss": 0.1151, + "step": 2363 + }, + { + "epoch": 2.7402738535101063, + "grad_norm": 0.2593724727630615, + "learning_rate": 3.641379310344828e-05, + "loss": 0.1201, + "step": 2364 + }, + { + "epoch": 2.7414330218068534, + "grad_norm": 0.21561947464942932, + "learning_rate": 3.64080459770115e-05, + "loss": 0.1213, + "step": 2365 + }, + { + "epoch": 2.7425921901036006, + "grad_norm": 0.3276143968105316, + "learning_rate": 3.6402298850574714e-05, + "loss": 0.1139, + "step": 2366 + }, + { + "epoch": 2.7437513584003477, + "grad_norm": 0.26314690709114075, + "learning_rate": 3.6396551724137936e-05, + "loss": 0.1104, + "step": 2367 + }, + { + "epoch": 2.744910526697095, + "grad_norm": 0.2540189027786255, + "learning_rate": 3.639080459770115e-05, + "loss": 0.1218, + "step": 2368 + }, + { + "epoch": 2.746069694993842, + "grad_norm": 0.23746971786022186, + "learning_rate": 3.638505747126437e-05, + "loss": 0.1109, + "step": 2369 + }, + { + "epoch": 2.747228863290589, + "grad_norm": 0.25298699736595154, + "learning_rate": 3.637931034482759e-05, + "loss": 0.1159, + "step": 2370 + }, + { + "epoch": 2.748388031587336, + "grad_norm": 0.25313448905944824, + "learning_rate": 3.63735632183908e-05, + "loss": 0.1115, + "step": 2371 + }, + { + "epoch": 2.7495471998840832, + "grad_norm": 0.2032177448272705, + "learning_rate": 3.6367816091954024e-05, + "loss": 0.1203, + "step": 2372 + }, + { + "epoch": 2.7507063681808304, + "grad_norm": 0.3549204468727112, + "learning_rate": 3.6362068965517245e-05, + "loss": 0.1234, + "step": 2373 + }, + { + "epoch": 2.7518655364775775, + "grad_norm": 0.2937655746936798, + "learning_rate": 3.635632183908046e-05, + "loss": 0.1222, + "step": 2374 + }, + { + "epoch": 2.753024704774324, + "grad_norm": 0.23844723403453827, + "learning_rate": 3.635057471264368e-05, + "loss": 0.1168, + "step": 2375 + }, + { + "epoch": 2.7541838730710717, + "grad_norm": 0.3607582747936249, + "learning_rate": 3.63448275862069e-05, + "loss": 0.1224, + "step": 2376 + }, + { + "epoch": 2.7553430413678184, + "grad_norm": 0.29361268877983093, + "learning_rate": 3.633908045977011e-05, + "loss": 0.1209, + "step": 2377 + }, + { + "epoch": 2.756502209664566, + "grad_norm": 0.2490907460451126, + "learning_rate": 3.633333333333333e-05, + "loss": 0.1157, + "step": 2378 + }, + { + "epoch": 2.7576613779613126, + "grad_norm": 0.24479351937770844, + "learning_rate": 3.6327586206896555e-05, + "loss": 0.1194, + "step": 2379 + }, + { + "epoch": 2.7588205462580597, + "grad_norm": 0.33146244287490845, + "learning_rate": 3.632183908045978e-05, + "loss": 0.1374, + "step": 2380 + }, + { + "epoch": 2.759979714554807, + "grad_norm": 0.2993434965610504, + "learning_rate": 3.631609195402299e-05, + "loss": 0.1125, + "step": 2381 + }, + { + "epoch": 2.761138882851554, + "grad_norm": 0.25163406133651733, + "learning_rate": 3.6310344827586206e-05, + "loss": 0.1168, + "step": 2382 + }, + { + "epoch": 2.762298051148301, + "grad_norm": 0.2083575427532196, + "learning_rate": 3.630459770114943e-05, + "loss": 0.1033, + "step": 2383 + }, + { + "epoch": 2.763457219445048, + "grad_norm": 0.3714704215526581, + "learning_rate": 3.629885057471264e-05, + "loss": 0.1262, + "step": 2384 + }, + { + "epoch": 2.7646163877417953, + "grad_norm": 0.3778088688850403, + "learning_rate": 3.6293103448275865e-05, + "loss": 0.1243, + "step": 2385 + }, + { + "epoch": 2.7657755560385424, + "grad_norm": 0.28580304980278015, + "learning_rate": 3.6287356321839086e-05, + "loss": 0.1188, + "step": 2386 + }, + { + "epoch": 2.7669347243352895, + "grad_norm": 0.3345694839954376, + "learning_rate": 3.62816091954023e-05, + "loss": 0.1078, + "step": 2387 + }, + { + "epoch": 2.7680938926320366, + "grad_norm": 0.2846716642379761, + "learning_rate": 3.627586206896552e-05, + "loss": 0.1142, + "step": 2388 + }, + { + "epoch": 2.7692530609287838, + "grad_norm": 0.30462977290153503, + "learning_rate": 3.627011494252874e-05, + "loss": 0.1327, + "step": 2389 + }, + { + "epoch": 2.7704122292255304, + "grad_norm": 0.2605188190937042, + "learning_rate": 3.626436781609195e-05, + "loss": 0.1099, + "step": 2390 + }, + { + "epoch": 2.771571397522278, + "grad_norm": 0.2525080740451813, + "learning_rate": 3.6258620689655174e-05, + "loss": 0.1149, + "step": 2391 + }, + { + "epoch": 2.7727305658190247, + "grad_norm": 0.2309272438287735, + "learning_rate": 3.625287356321839e-05, + "loss": 0.1232, + "step": 2392 + }, + { + "epoch": 2.7738897341157718, + "grad_norm": 0.25411146879196167, + "learning_rate": 3.624712643678161e-05, + "loss": 0.138, + "step": 2393 + }, + { + "epoch": 2.775048902412519, + "grad_norm": 0.2844621241092682, + "learning_rate": 3.624137931034483e-05, + "loss": 0.1282, + "step": 2394 + }, + { + "epoch": 2.776208070709266, + "grad_norm": 0.29295527935028076, + "learning_rate": 3.623563218390805e-05, + "loss": 0.1213, + "step": 2395 + }, + { + "epoch": 2.777367239006013, + "grad_norm": 0.3641514778137207, + "learning_rate": 3.622988505747126e-05, + "loss": 0.1313, + "step": 2396 + }, + { + "epoch": 2.7785264073027602, + "grad_norm": 0.24175724387168884, + "learning_rate": 3.6224137931034484e-05, + "loss": 0.1199, + "step": 2397 + }, + { + "epoch": 2.7796855755995074, + "grad_norm": 0.3517894744873047, + "learning_rate": 3.62183908045977e-05, + "loss": 0.1308, + "step": 2398 + }, + { + "epoch": 2.7808447438962545, + "grad_norm": 0.24343693256378174, + "learning_rate": 3.621264367816092e-05, + "loss": 0.1162, + "step": 2399 + }, + { + "epoch": 2.7820039121930016, + "grad_norm": 0.2250448763370514, + "learning_rate": 3.620689655172414e-05, + "loss": 0.112, + "step": 2400 + }, + { + "epoch": 2.7831630804897487, + "grad_norm": 0.23824208974838257, + "learning_rate": 3.620114942528736e-05, + "loss": 0.1222, + "step": 2401 + }, + { + "epoch": 2.784322248786496, + "grad_norm": 0.22008417546749115, + "learning_rate": 3.619540229885058e-05, + "loss": 0.1213, + "step": 2402 + }, + { + "epoch": 2.785481417083243, + "grad_norm": 0.3020162582397461, + "learning_rate": 3.6189655172413794e-05, + "loss": 0.1329, + "step": 2403 + }, + { + "epoch": 2.78664058537999, + "grad_norm": 0.21677182614803314, + "learning_rate": 3.618390804597701e-05, + "loss": 0.1186, + "step": 2404 + }, + { + "epoch": 2.7877997536767367, + "grad_norm": 0.25215545296669006, + "learning_rate": 3.617816091954023e-05, + "loss": 0.117, + "step": 2405 + }, + { + "epoch": 2.7889589219734843, + "grad_norm": 0.24186648428440094, + "learning_rate": 3.617241379310345e-05, + "loss": 0.1181, + "step": 2406 + }, + { + "epoch": 2.790118090270231, + "grad_norm": 0.2735014259815216, + "learning_rate": 3.6166666666666674e-05, + "loss": 0.1276, + "step": 2407 + }, + { + "epoch": 2.791277258566978, + "grad_norm": 0.20679566264152527, + "learning_rate": 3.616091954022989e-05, + "loss": 0.1146, + "step": 2408 + }, + { + "epoch": 2.792436426863725, + "grad_norm": 0.25650107860565186, + "learning_rate": 3.6155172413793104e-05, + "loss": 0.1223, + "step": 2409 + }, + { + "epoch": 2.7935955951604723, + "grad_norm": 0.22591929137706757, + "learning_rate": 3.6149425287356325e-05, + "loss": 0.1167, + "step": 2410 + }, + { + "epoch": 2.7947547634572194, + "grad_norm": 0.2470989227294922, + "learning_rate": 3.614367816091954e-05, + "loss": 0.1116, + "step": 2411 + }, + { + "epoch": 2.7959139317539665, + "grad_norm": 0.24131318926811218, + "learning_rate": 3.6137931034482755e-05, + "loss": 0.117, + "step": 2412 + }, + { + "epoch": 2.7970731000507136, + "grad_norm": 0.2714328467845917, + "learning_rate": 3.613218390804598e-05, + "loss": 0.1246, + "step": 2413 + }, + { + "epoch": 2.7982322683474607, + "grad_norm": 0.27446985244750977, + "learning_rate": 3.61264367816092e-05, + "loss": 0.1134, + "step": 2414 + }, + { + "epoch": 2.799391436644208, + "grad_norm": 0.23050987720489502, + "learning_rate": 3.6120689655172413e-05, + "loss": 0.1099, + "step": 2415 + }, + { + "epoch": 2.800550604940955, + "grad_norm": 0.27243009209632874, + "learning_rate": 3.6114942528735635e-05, + "loss": 0.1351, + "step": 2416 + }, + { + "epoch": 2.801709773237702, + "grad_norm": 0.2836304306983948, + "learning_rate": 3.610919540229885e-05, + "loss": 0.1248, + "step": 2417 + }, + { + "epoch": 2.8028689415344488, + "grad_norm": 0.27356666326522827, + "learning_rate": 3.610344827586207e-05, + "loss": 0.1193, + "step": 2418 + }, + { + "epoch": 2.8040281098311963, + "grad_norm": 0.2828332185745239, + "learning_rate": 3.6097701149425287e-05, + "loss": 0.1253, + "step": 2419 + }, + { + "epoch": 2.805187278127943, + "grad_norm": 0.2918018102645874, + "learning_rate": 3.609195402298851e-05, + "loss": 0.1309, + "step": 2420 + }, + { + "epoch": 2.8063464464246906, + "grad_norm": 0.22664931416511536, + "learning_rate": 3.608620689655173e-05, + "loss": 0.1153, + "step": 2421 + }, + { + "epoch": 2.8075056147214372, + "grad_norm": 0.2359132617712021, + "learning_rate": 3.6080459770114945e-05, + "loss": 0.1218, + "step": 2422 + }, + { + "epoch": 2.8086647830181843, + "grad_norm": 0.26460394263267517, + "learning_rate": 3.607471264367816e-05, + "loss": 0.1309, + "step": 2423 + }, + { + "epoch": 2.8098239513149315, + "grad_norm": 0.25006482005119324, + "learning_rate": 3.606896551724138e-05, + "loss": 0.1078, + "step": 2424 + }, + { + "epoch": 2.8109831196116786, + "grad_norm": 0.25657373666763306, + "learning_rate": 3.6063218390804596e-05, + "loss": 0.1084, + "step": 2425 + }, + { + "epoch": 2.8121422879084257, + "grad_norm": 0.3210379183292389, + "learning_rate": 3.605747126436782e-05, + "loss": 0.1272, + "step": 2426 + }, + { + "epoch": 2.813301456205173, + "grad_norm": 0.23152370750904083, + "learning_rate": 3.605172413793104e-05, + "loss": 0.1124, + "step": 2427 + }, + { + "epoch": 2.81446062450192, + "grad_norm": 0.24926093220710754, + "learning_rate": 3.6045977011494255e-05, + "loss": 0.1181, + "step": 2428 + }, + { + "epoch": 2.815619792798667, + "grad_norm": 0.25474682450294495, + "learning_rate": 3.6040229885057476e-05, + "loss": 0.1287, + "step": 2429 + }, + { + "epoch": 2.816778961095414, + "grad_norm": 0.27318352460861206, + "learning_rate": 3.603448275862069e-05, + "loss": 0.1203, + "step": 2430 + }, + { + "epoch": 2.8179381293921613, + "grad_norm": 0.2800019383430481, + "learning_rate": 3.6028735632183906e-05, + "loss": 0.1196, + "step": 2431 + }, + { + "epoch": 2.8190972976889084, + "grad_norm": 0.20935480296611786, + "learning_rate": 3.602298850574713e-05, + "loss": 0.1105, + "step": 2432 + }, + { + "epoch": 2.820256465985655, + "grad_norm": 0.24666768312454224, + "learning_rate": 3.601724137931034e-05, + "loss": 0.1182, + "step": 2433 + }, + { + "epoch": 2.8214156342824026, + "grad_norm": 0.2466108649969101, + "learning_rate": 3.6011494252873564e-05, + "loss": 0.1183, + "step": 2434 + }, + { + "epoch": 2.8225748025791493, + "grad_norm": 0.2528558671474457, + "learning_rate": 3.6005747126436786e-05, + "loss": 0.1188, + "step": 2435 + }, + { + "epoch": 2.8237339708758964, + "grad_norm": 0.29567837715148926, + "learning_rate": 3.6e-05, + "loss": 0.1443, + "step": 2436 + }, + { + "epoch": 2.8248931391726435, + "grad_norm": 0.31635522842407227, + "learning_rate": 3.599425287356322e-05, + "loss": 0.107, + "step": 2437 + }, + { + "epoch": 2.8260523074693906, + "grad_norm": 0.23318928480148315, + "learning_rate": 3.598850574712644e-05, + "loss": 0.1231, + "step": 2438 + }, + { + "epoch": 2.8272114757661377, + "grad_norm": 0.29607391357421875, + "learning_rate": 3.598275862068965e-05, + "loss": 0.1257, + "step": 2439 + }, + { + "epoch": 2.828370644062885, + "grad_norm": 0.2570621371269226, + "learning_rate": 3.5977011494252874e-05, + "loss": 0.1128, + "step": 2440 + }, + { + "epoch": 2.829529812359632, + "grad_norm": 0.24534910917282104, + "learning_rate": 3.5971264367816096e-05, + "loss": 0.111, + "step": 2441 + }, + { + "epoch": 2.830688980656379, + "grad_norm": 0.26997894048690796, + "learning_rate": 3.596551724137931e-05, + "loss": 0.1114, + "step": 2442 + }, + { + "epoch": 2.831848148953126, + "grad_norm": 0.29635146260261536, + "learning_rate": 3.595977011494253e-05, + "loss": 0.1228, + "step": 2443 + }, + { + "epoch": 2.8330073172498733, + "grad_norm": 0.2879303991794586, + "learning_rate": 3.595402298850575e-05, + "loss": 0.1248, + "step": 2444 + }, + { + "epoch": 2.8341664855466204, + "grad_norm": 0.2170044481754303, + "learning_rate": 3.594827586206897e-05, + "loss": 0.11, + "step": 2445 + }, + { + "epoch": 2.8353256538433675, + "grad_norm": 0.2590021789073944, + "learning_rate": 3.5942528735632184e-05, + "loss": 0.1257, + "step": 2446 + }, + { + "epoch": 2.8364848221401147, + "grad_norm": 0.25677573680877686, + "learning_rate": 3.5936781609195405e-05, + "loss": 0.1266, + "step": 2447 + }, + { + "epoch": 2.8376439904368613, + "grad_norm": 0.2618117928504944, + "learning_rate": 3.593103448275863e-05, + "loss": 0.1143, + "step": 2448 + }, + { + "epoch": 2.838803158733609, + "grad_norm": 0.23307304084300995, + "learning_rate": 3.592528735632184e-05, + "loss": 0.1126, + "step": 2449 + }, + { + "epoch": 2.8399623270303556, + "grad_norm": 0.2070130705833435, + "learning_rate": 3.591954022988506e-05, + "loss": 0.1141, + "step": 2450 + }, + { + "epoch": 2.8411214953271027, + "grad_norm": 0.25860536098480225, + "learning_rate": 3.591379310344828e-05, + "loss": 0.1102, + "step": 2451 + }, + { + "epoch": 2.84228066362385, + "grad_norm": 0.22545821964740753, + "learning_rate": 3.5908045977011494e-05, + "loss": 0.1037, + "step": 2452 + }, + { + "epoch": 2.843439831920597, + "grad_norm": 0.27270445227622986, + "learning_rate": 3.5902298850574715e-05, + "loss": 0.1171, + "step": 2453 + }, + { + "epoch": 2.844599000217344, + "grad_norm": 0.23583996295928955, + "learning_rate": 3.589655172413793e-05, + "loss": 0.099, + "step": 2454 + }, + { + "epoch": 2.845758168514091, + "grad_norm": 0.27526620030403137, + "learning_rate": 3.589080459770115e-05, + "loss": 0.1124, + "step": 2455 + }, + { + "epoch": 2.8469173368108383, + "grad_norm": 0.23859523236751556, + "learning_rate": 3.5885057471264373e-05, + "loss": 0.1142, + "step": 2456 + }, + { + "epoch": 2.8480765051075854, + "grad_norm": 0.3173846900463104, + "learning_rate": 3.587931034482759e-05, + "loss": 0.1249, + "step": 2457 + }, + { + "epoch": 2.8492356734043325, + "grad_norm": 0.2413249909877777, + "learning_rate": 3.58735632183908e-05, + "loss": 0.1165, + "step": 2458 + }, + { + "epoch": 2.8503948417010796, + "grad_norm": 0.2488858848810196, + "learning_rate": 3.5867816091954025e-05, + "loss": 0.1176, + "step": 2459 + }, + { + "epoch": 2.8515540099978267, + "grad_norm": 0.272587388753891, + "learning_rate": 3.586206896551724e-05, + "loss": 0.1212, + "step": 2460 + }, + { + "epoch": 2.8527131782945734, + "grad_norm": 0.29471564292907715, + "learning_rate": 3.585632183908046e-05, + "loss": 0.1298, + "step": 2461 + }, + { + "epoch": 2.853872346591321, + "grad_norm": 0.25901687145233154, + "learning_rate": 3.585057471264368e-05, + "loss": 0.1231, + "step": 2462 + }, + { + "epoch": 2.8550315148880676, + "grad_norm": 0.2517445981502533, + "learning_rate": 3.58448275862069e-05, + "loss": 0.1169, + "step": 2463 + }, + { + "epoch": 2.856190683184815, + "grad_norm": 0.23075416684150696, + "learning_rate": 3.583908045977012e-05, + "loss": 0.1025, + "step": 2464 + }, + { + "epoch": 2.857349851481562, + "grad_norm": 0.2388884723186493, + "learning_rate": 3.5833333333333335e-05, + "loss": 0.1107, + "step": 2465 + }, + { + "epoch": 2.858509019778309, + "grad_norm": 0.3148244619369507, + "learning_rate": 3.582758620689655e-05, + "loss": 0.1179, + "step": 2466 + }, + { + "epoch": 2.859668188075056, + "grad_norm": 0.28545230627059937, + "learning_rate": 3.582183908045977e-05, + "loss": 0.1237, + "step": 2467 + }, + { + "epoch": 2.860827356371803, + "grad_norm": 0.2898615002632141, + "learning_rate": 3.581609195402299e-05, + "loss": 0.1331, + "step": 2468 + }, + { + "epoch": 2.8619865246685503, + "grad_norm": 0.2582353353500366, + "learning_rate": 3.581034482758621e-05, + "loss": 0.1204, + "step": 2469 + }, + { + "epoch": 2.8631456929652974, + "grad_norm": 0.2813541293144226, + "learning_rate": 3.580459770114943e-05, + "loss": 0.1232, + "step": 2470 + }, + { + "epoch": 2.8643048612620445, + "grad_norm": 0.2520640194416046, + "learning_rate": 3.5798850574712644e-05, + "loss": 0.1137, + "step": 2471 + }, + { + "epoch": 2.8654640295587916, + "grad_norm": 0.29542282223701477, + "learning_rate": 3.5793103448275866e-05, + "loss": 0.121, + "step": 2472 + }, + { + "epoch": 2.8666231978555388, + "grad_norm": 0.21729327738285065, + "learning_rate": 3.578735632183908e-05, + "loss": 0.1108, + "step": 2473 + }, + { + "epoch": 2.867782366152286, + "grad_norm": 0.285705029964447, + "learning_rate": 3.5781609195402296e-05, + "loss": 0.1234, + "step": 2474 + }, + { + "epoch": 2.868941534449033, + "grad_norm": 0.2638833224773407, + "learning_rate": 3.5775862068965524e-05, + "loss": 0.1361, + "step": 2475 + }, + { + "epoch": 2.8701007027457797, + "grad_norm": 0.25807875394821167, + "learning_rate": 3.577011494252874e-05, + "loss": 0.1067, + "step": 2476 + }, + { + "epoch": 2.8712598710425272, + "grad_norm": 0.20497660338878632, + "learning_rate": 3.5764367816091954e-05, + "loss": 0.1072, + "step": 2477 + }, + { + "epoch": 2.872419039339274, + "grad_norm": 0.23752939701080322, + "learning_rate": 3.5758620689655176e-05, + "loss": 0.1231, + "step": 2478 + }, + { + "epoch": 2.873578207636021, + "grad_norm": 0.2198176383972168, + "learning_rate": 3.575287356321839e-05, + "loss": 0.1212, + "step": 2479 + }, + { + "epoch": 2.874737375932768, + "grad_norm": 0.28689807653427124, + "learning_rate": 3.5747126436781606e-05, + "loss": 0.1292, + "step": 2480 + }, + { + "epoch": 2.8758965442295152, + "grad_norm": 0.21287685632705688, + "learning_rate": 3.574137931034483e-05, + "loss": 0.106, + "step": 2481 + }, + { + "epoch": 2.8770557125262624, + "grad_norm": 0.3098215162754059, + "learning_rate": 3.573563218390805e-05, + "loss": 0.1212, + "step": 2482 + }, + { + "epoch": 2.8782148808230095, + "grad_norm": 0.1953476518392563, + "learning_rate": 3.572988505747127e-05, + "loss": 0.1001, + "step": 2483 + }, + { + "epoch": 2.8793740491197566, + "grad_norm": 0.22873975336551666, + "learning_rate": 3.5724137931034486e-05, + "loss": 0.1115, + "step": 2484 + }, + { + "epoch": 2.8805332174165037, + "grad_norm": 0.21951954066753387, + "learning_rate": 3.57183908045977e-05, + "loss": 0.1197, + "step": 2485 + }, + { + "epoch": 2.881692385713251, + "grad_norm": 0.25903722643852234, + "learning_rate": 3.571264367816092e-05, + "loss": 0.1082, + "step": 2486 + }, + { + "epoch": 2.882851554009998, + "grad_norm": 0.2247413992881775, + "learning_rate": 3.570689655172414e-05, + "loss": 0.1138, + "step": 2487 + }, + { + "epoch": 2.884010722306745, + "grad_norm": 0.2930486798286438, + "learning_rate": 3.570114942528736e-05, + "loss": 0.1327, + "step": 2488 + }, + { + "epoch": 2.885169890603492, + "grad_norm": 0.32218900322914124, + "learning_rate": 3.569540229885058e-05, + "loss": 0.1261, + "step": 2489 + }, + { + "epoch": 2.8863290589002393, + "grad_norm": 0.2783204913139343, + "learning_rate": 3.5689655172413795e-05, + "loss": 0.1132, + "step": 2490 + }, + { + "epoch": 2.887488227196986, + "grad_norm": 0.214439257979393, + "learning_rate": 3.568390804597702e-05, + "loss": 0.1179, + "step": 2491 + }, + { + "epoch": 2.8886473954937335, + "grad_norm": 0.25692418217658997, + "learning_rate": 3.567816091954023e-05, + "loss": 0.1147, + "step": 2492 + }, + { + "epoch": 2.88980656379048, + "grad_norm": 0.27473074197769165, + "learning_rate": 3.567241379310345e-05, + "loss": 0.1351, + "step": 2493 + }, + { + "epoch": 2.8909657320872273, + "grad_norm": 0.27782875299453735, + "learning_rate": 3.566666666666667e-05, + "loss": 0.1246, + "step": 2494 + }, + { + "epoch": 2.8921249003839744, + "grad_norm": 0.22847121953964233, + "learning_rate": 3.5660919540229883e-05, + "loss": 0.1118, + "step": 2495 + }, + { + "epoch": 2.8932840686807215, + "grad_norm": 0.22717076539993286, + "learning_rate": 3.5655172413793105e-05, + "loss": 0.1233, + "step": 2496 + }, + { + "epoch": 2.8944432369774686, + "grad_norm": 0.2825745642185211, + "learning_rate": 3.564942528735633e-05, + "loss": 0.1282, + "step": 2497 + }, + { + "epoch": 2.8956024052742158, + "grad_norm": 0.25072160363197327, + "learning_rate": 3.564367816091954e-05, + "loss": 0.1239, + "step": 2498 + }, + { + "epoch": 2.896761573570963, + "grad_norm": 0.2573794424533844, + "learning_rate": 3.5637931034482757e-05, + "loss": 0.1083, + "step": 2499 + }, + { + "epoch": 2.89792074186771, + "grad_norm": 0.29869526624679565, + "learning_rate": 3.563218390804598e-05, + "loss": 0.132, + "step": 2500 + }, + { + "epoch": 2.899079910164457, + "grad_norm": 0.24335411190986633, + "learning_rate": 3.562643678160919e-05, + "loss": 0.1187, + "step": 2501 + }, + { + "epoch": 2.900239078461204, + "grad_norm": 0.25249624252319336, + "learning_rate": 3.5620689655172415e-05, + "loss": 0.1157, + "step": 2502 + }, + { + "epoch": 2.9013982467579513, + "grad_norm": 0.2802266776561737, + "learning_rate": 3.5614942528735637e-05, + "loss": 0.1267, + "step": 2503 + }, + { + "epoch": 2.902557415054698, + "grad_norm": 0.3055930435657501, + "learning_rate": 3.560919540229885e-05, + "loss": 0.1287, + "step": 2504 + }, + { + "epoch": 2.9037165833514456, + "grad_norm": 0.2905328571796417, + "learning_rate": 3.560344827586207e-05, + "loss": 0.1317, + "step": 2505 + }, + { + "epoch": 2.9048757516481922, + "grad_norm": 0.21209411323070526, + "learning_rate": 3.559770114942529e-05, + "loss": 0.1131, + "step": 2506 + }, + { + "epoch": 2.9060349199449393, + "grad_norm": 0.2763073444366455, + "learning_rate": 3.55919540229885e-05, + "loss": 0.1205, + "step": 2507 + }, + { + "epoch": 2.9071940882416865, + "grad_norm": 0.23870214819908142, + "learning_rate": 3.5586206896551725e-05, + "loss": 0.1237, + "step": 2508 + }, + { + "epoch": 2.9083532565384336, + "grad_norm": 0.2398124486207962, + "learning_rate": 3.5580459770114946e-05, + "loss": 0.1161, + "step": 2509 + }, + { + "epoch": 2.9095124248351807, + "grad_norm": 0.23274962604045868, + "learning_rate": 3.557471264367817e-05, + "loss": 0.1094, + "step": 2510 + }, + { + "epoch": 2.910671593131928, + "grad_norm": 0.2622717022895813, + "learning_rate": 3.556896551724138e-05, + "loss": 0.1189, + "step": 2511 + }, + { + "epoch": 2.911830761428675, + "grad_norm": 0.335947185754776, + "learning_rate": 3.55632183908046e-05, + "loss": 0.1237, + "step": 2512 + }, + { + "epoch": 2.912989929725422, + "grad_norm": 0.2339511215686798, + "learning_rate": 3.555747126436782e-05, + "loss": 0.1195, + "step": 2513 + }, + { + "epoch": 2.914149098022169, + "grad_norm": 0.2983540892601013, + "learning_rate": 3.5551724137931034e-05, + "loss": 0.1184, + "step": 2514 + }, + { + "epoch": 2.9153082663189163, + "grad_norm": 0.29137369990348816, + "learning_rate": 3.554597701149425e-05, + "loss": 0.1074, + "step": 2515 + }, + { + "epoch": 2.9164674346156634, + "grad_norm": 0.3541145622730255, + "learning_rate": 3.554022988505748e-05, + "loss": 0.1187, + "step": 2516 + }, + { + "epoch": 2.9176266029124105, + "grad_norm": 0.2662143111228943, + "learning_rate": 3.553448275862069e-05, + "loss": 0.1227, + "step": 2517 + }, + { + "epoch": 2.9187857712091576, + "grad_norm": 0.2711283266544342, + "learning_rate": 3.552873563218391e-05, + "loss": 0.1304, + "step": 2518 + }, + { + "epoch": 2.9199449395059043, + "grad_norm": 0.2492939978837967, + "learning_rate": 3.552298850574713e-05, + "loss": 0.1191, + "step": 2519 + }, + { + "epoch": 2.921104107802652, + "grad_norm": 0.25983232259750366, + "learning_rate": 3.5517241379310344e-05, + "loss": 0.1129, + "step": 2520 + }, + { + "epoch": 2.9222632760993985, + "grad_norm": 0.24803534150123596, + "learning_rate": 3.5511494252873566e-05, + "loss": 0.116, + "step": 2521 + }, + { + "epoch": 2.9234224443961456, + "grad_norm": 0.2651211619377136, + "learning_rate": 3.550574712643678e-05, + "loss": 0.1361, + "step": 2522 + }, + { + "epoch": 2.9245816126928927, + "grad_norm": 0.2923348844051361, + "learning_rate": 3.55e-05, + "loss": 0.135, + "step": 2523 + }, + { + "epoch": 2.92574078098964, + "grad_norm": 0.26953768730163574, + "learning_rate": 3.5494252873563224e-05, + "loss": 0.1183, + "step": 2524 + }, + { + "epoch": 2.926899949286387, + "grad_norm": 0.235351100564003, + "learning_rate": 3.548850574712644e-05, + "loss": 0.1216, + "step": 2525 + }, + { + "epoch": 2.928059117583134, + "grad_norm": 0.2557763457298279, + "learning_rate": 3.5482758620689654e-05, + "loss": 0.1138, + "step": 2526 + }, + { + "epoch": 2.929218285879881, + "grad_norm": 0.2516639232635498, + "learning_rate": 3.5477011494252875e-05, + "loss": 0.144, + "step": 2527 + }, + { + "epoch": 2.9303774541766283, + "grad_norm": 0.19669529795646667, + "learning_rate": 3.547126436781609e-05, + "loss": 0.1112, + "step": 2528 + }, + { + "epoch": 2.9315366224733754, + "grad_norm": 0.21730567514896393, + "learning_rate": 3.546551724137931e-05, + "loss": 0.1113, + "step": 2529 + }, + { + "epoch": 2.9326957907701225, + "grad_norm": 0.20566688477993011, + "learning_rate": 3.5459770114942534e-05, + "loss": 0.1146, + "step": 2530 + }, + { + "epoch": 2.9338549590668697, + "grad_norm": 0.26135411858558655, + "learning_rate": 3.545402298850575e-05, + "loss": 0.1083, + "step": 2531 + }, + { + "epoch": 2.9350141273636163, + "grad_norm": 0.23038803040981293, + "learning_rate": 3.544827586206897e-05, + "loss": 0.1212, + "step": 2532 + }, + { + "epoch": 2.936173295660364, + "grad_norm": 0.2405983805656433, + "learning_rate": 3.5442528735632185e-05, + "loss": 0.1085, + "step": 2533 + }, + { + "epoch": 2.9373324639571106, + "grad_norm": 0.2417573630809784, + "learning_rate": 3.54367816091954e-05, + "loss": 0.1173, + "step": 2534 + }, + { + "epoch": 2.938491632253858, + "grad_norm": 0.21391847729682922, + "learning_rate": 3.543103448275862e-05, + "loss": 0.1161, + "step": 2535 + }, + { + "epoch": 2.939650800550605, + "grad_norm": 0.2825719714164734, + "learning_rate": 3.5425287356321843e-05, + "loss": 0.1332, + "step": 2536 + }, + { + "epoch": 2.940809968847352, + "grad_norm": 0.23542939126491547, + "learning_rate": 3.541954022988506e-05, + "loss": 0.1184, + "step": 2537 + }, + { + "epoch": 2.941969137144099, + "grad_norm": 0.2510528862476349, + "learning_rate": 3.541379310344828e-05, + "loss": 0.1238, + "step": 2538 + }, + { + "epoch": 2.943128305440846, + "grad_norm": 0.2628154456615448, + "learning_rate": 3.5408045977011495e-05, + "loss": 0.1224, + "step": 2539 + }, + { + "epoch": 2.9442874737375933, + "grad_norm": 0.18918563425540924, + "learning_rate": 3.540229885057472e-05, + "loss": 0.1026, + "step": 2540 + }, + { + "epoch": 2.9454466420343404, + "grad_norm": 0.299092561006546, + "learning_rate": 3.539655172413793e-05, + "loss": 0.1332, + "step": 2541 + }, + { + "epoch": 2.9466058103310875, + "grad_norm": 0.22174027562141418, + "learning_rate": 3.5390804597701146e-05, + "loss": 0.1071, + "step": 2542 + }, + { + "epoch": 2.9477649786278346, + "grad_norm": 0.30895039439201355, + "learning_rate": 3.538505747126437e-05, + "loss": 0.1243, + "step": 2543 + }, + { + "epoch": 2.9489241469245817, + "grad_norm": 0.30053114891052246, + "learning_rate": 3.537931034482759e-05, + "loss": 0.1197, + "step": 2544 + }, + { + "epoch": 2.950083315221329, + "grad_norm": 0.276027113199234, + "learning_rate": 3.5373563218390805e-05, + "loss": 0.1166, + "step": 2545 + }, + { + "epoch": 2.951242483518076, + "grad_norm": 0.2735234797000885, + "learning_rate": 3.5367816091954026e-05, + "loss": 0.1244, + "step": 2546 + }, + { + "epoch": 2.9524016518148226, + "grad_norm": 0.3124696612358093, + "learning_rate": 3.536206896551724e-05, + "loss": 0.1115, + "step": 2547 + }, + { + "epoch": 2.95356082011157, + "grad_norm": 0.21694187819957733, + "learning_rate": 3.535632183908046e-05, + "loss": 0.1146, + "step": 2548 + }, + { + "epoch": 2.954719988408317, + "grad_norm": 0.28036484122276306, + "learning_rate": 3.535057471264368e-05, + "loss": 0.1027, + "step": 2549 + }, + { + "epoch": 2.955879156705064, + "grad_norm": 0.31232985854148865, + "learning_rate": 3.53448275862069e-05, + "loss": 0.1165, + "step": 2550 + }, + { + "epoch": 2.957038325001811, + "grad_norm": 0.28552737832069397, + "learning_rate": 3.533908045977012e-05, + "loss": 0.1225, + "step": 2551 + }, + { + "epoch": 2.958197493298558, + "grad_norm": 0.27900782227516174, + "learning_rate": 3.5333333333333336e-05, + "loss": 0.1191, + "step": 2552 + }, + { + "epoch": 2.9593566615953053, + "grad_norm": 0.25439637899398804, + "learning_rate": 3.532758620689655e-05, + "loss": 0.1324, + "step": 2553 + }, + { + "epoch": 2.9605158298920524, + "grad_norm": 0.24893079698085785, + "learning_rate": 3.532183908045977e-05, + "loss": 0.1161, + "step": 2554 + }, + { + "epoch": 2.9616749981887995, + "grad_norm": 0.2921384274959564, + "learning_rate": 3.531609195402299e-05, + "loss": 0.1209, + "step": 2555 + }, + { + "epoch": 2.9628341664855466, + "grad_norm": 0.21801045536994934, + "learning_rate": 3.53103448275862e-05, + "loss": 0.118, + "step": 2556 + }, + { + "epoch": 2.9639933347822938, + "grad_norm": 0.26922228932380676, + "learning_rate": 3.530459770114943e-05, + "loss": 0.1315, + "step": 2557 + }, + { + "epoch": 2.965152503079041, + "grad_norm": 0.2805965840816498, + "learning_rate": 3.5298850574712646e-05, + "loss": 0.1291, + "step": 2558 + }, + { + "epoch": 2.966311671375788, + "grad_norm": 0.3655463457107544, + "learning_rate": 3.529310344827587e-05, + "loss": 0.1303, + "step": 2559 + }, + { + "epoch": 2.967470839672535, + "grad_norm": 0.2937909960746765, + "learning_rate": 3.528735632183908e-05, + "loss": 0.1259, + "step": 2560 + }, + { + "epoch": 2.9686300079692822, + "grad_norm": 0.3068729043006897, + "learning_rate": 3.52816091954023e-05, + "loss": 0.1348, + "step": 2561 + }, + { + "epoch": 2.969789176266029, + "grad_norm": 0.2570815980434418, + "learning_rate": 3.527586206896552e-05, + "loss": 0.1261, + "step": 2562 + }, + { + "epoch": 2.9709483445627765, + "grad_norm": 0.22781704366207123, + "learning_rate": 3.5270114942528734e-05, + "loss": 0.1177, + "step": 2563 + }, + { + "epoch": 2.972107512859523, + "grad_norm": 0.2892128527164459, + "learning_rate": 3.5264367816091956e-05, + "loss": 0.1181, + "step": 2564 + }, + { + "epoch": 2.9732666811562702, + "grad_norm": 0.24574293196201324, + "learning_rate": 3.525862068965518e-05, + "loss": 0.1309, + "step": 2565 + }, + { + "epoch": 2.9744258494530174, + "grad_norm": 0.2564846873283386, + "learning_rate": 3.525287356321839e-05, + "loss": 0.1201, + "step": 2566 + }, + { + "epoch": 2.9755850177497645, + "grad_norm": 0.3438556492328644, + "learning_rate": 3.5247126436781614e-05, + "loss": 0.1292, + "step": 2567 + }, + { + "epoch": 2.9767441860465116, + "grad_norm": 0.25281572341918945, + "learning_rate": 3.524137931034483e-05, + "loss": 0.1202, + "step": 2568 + }, + { + "epoch": 2.9779033543432587, + "grad_norm": 0.20820648968219757, + "learning_rate": 3.5235632183908044e-05, + "loss": 0.1221, + "step": 2569 + }, + { + "epoch": 2.979062522640006, + "grad_norm": 0.23150451481342316, + "learning_rate": 3.5229885057471265e-05, + "loss": 0.1269, + "step": 2570 + }, + { + "epoch": 2.980221690936753, + "grad_norm": 0.25065869092941284, + "learning_rate": 3.522413793103449e-05, + "loss": 0.1268, + "step": 2571 + }, + { + "epoch": 2.9813808592335, + "grad_norm": 0.25164109468460083, + "learning_rate": 3.52183908045977e-05, + "loss": 0.1156, + "step": 2572 + }, + { + "epoch": 2.982540027530247, + "grad_norm": 0.2479361742734909, + "learning_rate": 3.5212643678160924e-05, + "loss": 0.1204, + "step": 2573 + }, + { + "epoch": 2.9836991958269943, + "grad_norm": 0.21308045089244843, + "learning_rate": 3.520689655172414e-05, + "loss": 0.1161, + "step": 2574 + }, + { + "epoch": 2.984858364123741, + "grad_norm": 0.2212274670600891, + "learning_rate": 3.5201149425287353e-05, + "loss": 0.1138, + "step": 2575 + }, + { + "epoch": 2.9860175324204885, + "grad_norm": 0.26298844814300537, + "learning_rate": 3.5195402298850575e-05, + "loss": 0.1218, + "step": 2576 + }, + { + "epoch": 2.987176700717235, + "grad_norm": 0.20742398500442505, + "learning_rate": 3.51896551724138e-05, + "loss": 0.1156, + "step": 2577 + }, + { + "epoch": 2.9883358690139827, + "grad_norm": 0.23608221113681793, + "learning_rate": 3.518390804597702e-05, + "loss": 0.1058, + "step": 2578 + }, + { + "epoch": 2.9894950373107294, + "grad_norm": 0.25990620255470276, + "learning_rate": 3.517816091954023e-05, + "loss": 0.1249, + "step": 2579 + }, + { + "epoch": 2.9906542056074765, + "grad_norm": 0.2265261709690094, + "learning_rate": 3.517241379310345e-05, + "loss": 0.1177, + "step": 2580 + }, + { + "epoch": 2.9918133739042236, + "grad_norm": 0.21493545174598694, + "learning_rate": 3.516666666666667e-05, + "loss": 0.1042, + "step": 2581 + }, + { + "epoch": 2.9929725422009708, + "grad_norm": 0.2955082058906555, + "learning_rate": 3.5160919540229885e-05, + "loss": 0.1213, + "step": 2582 + }, + { + "epoch": 2.994131710497718, + "grad_norm": 0.25260546803474426, + "learning_rate": 3.51551724137931e-05, + "loss": 0.1231, + "step": 2583 + }, + { + "epoch": 2.995290878794465, + "grad_norm": 0.22875334322452545, + "learning_rate": 3.514942528735632e-05, + "loss": 0.1313, + "step": 2584 + }, + { + "epoch": 2.996450047091212, + "grad_norm": 0.24003781378269196, + "learning_rate": 3.514367816091954e-05, + "loss": 0.1311, + "step": 2585 + }, + { + "epoch": 2.997609215387959, + "grad_norm": 0.23002997040748596, + "learning_rate": 3.5137931034482765e-05, + "loss": 0.1153, + "step": 2586 + }, + { + "epoch": 2.9987683836847063, + "grad_norm": 0.28721874952316284, + "learning_rate": 3.513218390804598e-05, + "loss": 0.1178, + "step": 2587 + }, + { + "epoch": 2.9999275519814534, + "grad_norm": 0.2430833876132965, + "learning_rate": 3.5126436781609195e-05, + "loss": 0.1155, + "step": 2588 + }, + { + "epoch": 2.9999275519814534, + "eval_loss": 0.13075338304042816, + "eval_runtime": 279.8691, + "eval_samples_per_second": 5.481, + "eval_steps_per_second": 5.481, + "step": 2588 + }, + { + "epoch": 3.0010867202782006, + "grad_norm": 0.23414799571037292, + "learning_rate": 3.5120689655172416e-05, + "loss": 0.1143, + "step": 2589 + }, + { + "epoch": 3.0022458885749477, + "grad_norm": 0.25894778966903687, + "learning_rate": 3.511494252873563e-05, + "loss": 0.1089, + "step": 2590 + }, + { + "epoch": 3.0034050568716943, + "grad_norm": 0.23415379226207733, + "learning_rate": 3.510919540229885e-05, + "loss": 0.1097, + "step": 2591 + }, + { + "epoch": 3.0045642251684415, + "grad_norm": 0.21930260956287384, + "learning_rate": 3.5103448275862074e-05, + "loss": 0.1111, + "step": 2592 + }, + { + "epoch": 3.0057233934651886, + "grad_norm": 0.21448297798633575, + "learning_rate": 3.509770114942529e-05, + "loss": 0.1116, + "step": 2593 + }, + { + "epoch": 3.0068825617619357, + "grad_norm": 0.23125120997428894, + "learning_rate": 3.509195402298851e-05, + "loss": 0.116, + "step": 2594 + }, + { + "epoch": 3.008041730058683, + "grad_norm": 0.25538870692253113, + "learning_rate": 3.5086206896551726e-05, + "loss": 0.1146, + "step": 2595 + }, + { + "epoch": 3.00920089835543, + "grad_norm": 0.2790210247039795, + "learning_rate": 3.508045977011494e-05, + "loss": 0.1138, + "step": 2596 + }, + { + "epoch": 3.010360066652177, + "grad_norm": 0.22470445930957794, + "learning_rate": 3.507471264367816e-05, + "loss": 0.119, + "step": 2597 + }, + { + "epoch": 3.011519234948924, + "grad_norm": 0.2440544217824936, + "learning_rate": 3.5068965517241384e-05, + "loss": 0.1027, + "step": 2598 + }, + { + "epoch": 3.0126784032456713, + "grad_norm": 0.3405747413635254, + "learning_rate": 3.50632183908046e-05, + "loss": 0.113, + "step": 2599 + }, + { + "epoch": 3.0138375715424184, + "grad_norm": 0.22075963020324707, + "learning_rate": 3.505747126436782e-05, + "loss": 0.1043, + "step": 2600 + }, + { + "epoch": 3.0149967398391655, + "grad_norm": 0.28365811705589294, + "learning_rate": 3.5051724137931036e-05, + "loss": 0.1105, + "step": 2601 + }, + { + "epoch": 3.0161559081359126, + "grad_norm": 0.38459697365760803, + "learning_rate": 3.504597701149425e-05, + "loss": 0.1043, + "step": 2602 + }, + { + "epoch": 3.0173150764326597, + "grad_norm": 0.3330821394920349, + "learning_rate": 3.504022988505747e-05, + "loss": 0.1052, + "step": 2603 + }, + { + "epoch": 3.018474244729407, + "grad_norm": 0.25718963146209717, + "learning_rate": 3.503448275862069e-05, + "loss": 0.1091, + "step": 2604 + }, + { + "epoch": 3.0196334130261535, + "grad_norm": 0.2592739462852478, + "learning_rate": 3.502873563218391e-05, + "loss": 0.0919, + "step": 2605 + }, + { + "epoch": 3.0207925813229006, + "grad_norm": 0.24682362377643585, + "learning_rate": 3.502298850574713e-05, + "loss": 0.1094, + "step": 2606 + }, + { + "epoch": 3.0219517496196477, + "grad_norm": 0.4260392487049103, + "learning_rate": 3.5017241379310345e-05, + "loss": 0.1122, + "step": 2607 + }, + { + "epoch": 3.023110917916395, + "grad_norm": 0.35808852314949036, + "learning_rate": 3.501149425287357e-05, + "loss": 0.1215, + "step": 2608 + }, + { + "epoch": 3.024270086213142, + "grad_norm": 0.31842851638793945, + "learning_rate": 3.500574712643678e-05, + "loss": 0.1083, + "step": 2609 + }, + { + "epoch": 3.025429254509889, + "grad_norm": 0.29208144545555115, + "learning_rate": 3.5e-05, + "loss": 0.1146, + "step": 2610 + }, + { + "epoch": 3.026588422806636, + "grad_norm": 0.2514886260032654, + "learning_rate": 3.499425287356322e-05, + "loss": 0.0972, + "step": 2611 + }, + { + "epoch": 3.0277475911033833, + "grad_norm": 0.3120005428791046, + "learning_rate": 3.498850574712644e-05, + "loss": 0.1072, + "step": 2612 + }, + { + "epoch": 3.0289067594001304, + "grad_norm": 0.30786648392677307, + "learning_rate": 3.498275862068966e-05, + "loss": 0.1142, + "step": 2613 + }, + { + "epoch": 3.0300659276968775, + "grad_norm": 0.3202109634876251, + "learning_rate": 3.497701149425288e-05, + "loss": 0.1153, + "step": 2614 + }, + { + "epoch": 3.0312250959936247, + "grad_norm": 0.3748902976512909, + "learning_rate": 3.497126436781609e-05, + "loss": 0.1064, + "step": 2615 + }, + { + "epoch": 3.032384264290372, + "grad_norm": 0.3179676830768585, + "learning_rate": 3.4965517241379313e-05, + "loss": 0.1193, + "step": 2616 + }, + { + "epoch": 3.033543432587119, + "grad_norm": 0.31394878029823303, + "learning_rate": 3.495977011494253e-05, + "loss": 0.1158, + "step": 2617 + }, + { + "epoch": 3.034702600883866, + "grad_norm": 0.2936527132987976, + "learning_rate": 3.495402298850575e-05, + "loss": 0.1094, + "step": 2618 + }, + { + "epoch": 3.035861769180613, + "grad_norm": 0.252445787191391, + "learning_rate": 3.494827586206897e-05, + "loss": 0.1038, + "step": 2619 + }, + { + "epoch": 3.03702093747736, + "grad_norm": 0.29907506704330444, + "learning_rate": 3.4942528735632187e-05, + "loss": 0.112, + "step": 2620 + }, + { + "epoch": 3.038180105774107, + "grad_norm": 0.24381442368030548, + "learning_rate": 3.49367816091954e-05, + "loss": 0.1026, + "step": 2621 + }, + { + "epoch": 3.039339274070854, + "grad_norm": 0.3066432476043701, + "learning_rate": 3.493103448275862e-05, + "loss": 0.1018, + "step": 2622 + }, + { + "epoch": 3.040498442367601, + "grad_norm": 0.2872762680053711, + "learning_rate": 3.492528735632184e-05, + "loss": 0.1105, + "step": 2623 + }, + { + "epoch": 3.0416576106643483, + "grad_norm": 0.2535305917263031, + "learning_rate": 3.491954022988506e-05, + "loss": 0.1013, + "step": 2624 + }, + { + "epoch": 3.0428167789610954, + "grad_norm": 0.22548238933086395, + "learning_rate": 3.4913793103448275e-05, + "loss": 0.1034, + "step": 2625 + }, + { + "epoch": 3.0439759472578425, + "grad_norm": 0.3591626286506653, + "learning_rate": 3.4908045977011496e-05, + "loss": 0.1124, + "step": 2626 + }, + { + "epoch": 3.0451351155545896, + "grad_norm": 0.3023275136947632, + "learning_rate": 3.490229885057472e-05, + "loss": 0.1029, + "step": 2627 + }, + { + "epoch": 3.0462942838513367, + "grad_norm": 0.25562551617622375, + "learning_rate": 3.489655172413793e-05, + "loss": 0.1056, + "step": 2628 + }, + { + "epoch": 3.047453452148084, + "grad_norm": 0.32490625977516174, + "learning_rate": 3.489080459770115e-05, + "loss": 0.1061, + "step": 2629 + }, + { + "epoch": 3.048612620444831, + "grad_norm": 0.30418357253074646, + "learning_rate": 3.488505747126437e-05, + "loss": 0.1139, + "step": 2630 + }, + { + "epoch": 3.049771788741578, + "grad_norm": 0.3882814645767212, + "learning_rate": 3.4879310344827584e-05, + "loss": 0.1037, + "step": 2631 + }, + { + "epoch": 3.050930957038325, + "grad_norm": 0.4713079631328583, + "learning_rate": 3.4873563218390806e-05, + "loss": 0.1157, + "step": 2632 + }, + { + "epoch": 3.0520901253350723, + "grad_norm": 0.32217907905578613, + "learning_rate": 3.486781609195403e-05, + "loss": 0.1135, + "step": 2633 + }, + { + "epoch": 3.053249293631819, + "grad_norm": 0.24278047680854797, + "learning_rate": 3.486206896551724e-05, + "loss": 0.1121, + "step": 2634 + }, + { + "epoch": 3.054408461928566, + "grad_norm": 0.32192277908325195, + "learning_rate": 3.4856321839080464e-05, + "loss": 0.1019, + "step": 2635 + }, + { + "epoch": 3.055567630225313, + "grad_norm": 0.317410945892334, + "learning_rate": 3.485057471264368e-05, + "loss": 0.1081, + "step": 2636 + }, + { + "epoch": 3.0567267985220603, + "grad_norm": 0.30316248536109924, + "learning_rate": 3.4844827586206894e-05, + "loss": 0.0999, + "step": 2637 + }, + { + "epoch": 3.0578859668188074, + "grad_norm": 0.4302930235862732, + "learning_rate": 3.4839080459770116e-05, + "loss": 0.113, + "step": 2638 + }, + { + "epoch": 3.0590451351155545, + "grad_norm": 0.29626715183258057, + "learning_rate": 3.483333333333334e-05, + "loss": 0.1057, + "step": 2639 + }, + { + "epoch": 3.0602043034123017, + "grad_norm": 0.38166263699531555, + "learning_rate": 3.482758620689655e-05, + "loss": 0.1216, + "step": 2640 + }, + { + "epoch": 3.0613634717090488, + "grad_norm": 0.3923417031764984, + "learning_rate": 3.4821839080459774e-05, + "loss": 0.1105, + "step": 2641 + }, + { + "epoch": 3.062522640005796, + "grad_norm": 0.27287542819976807, + "learning_rate": 3.481609195402299e-05, + "loss": 0.1096, + "step": 2642 + }, + { + "epoch": 3.063681808302543, + "grad_norm": 0.2712195813655853, + "learning_rate": 3.481034482758621e-05, + "loss": 0.1141, + "step": 2643 + }, + { + "epoch": 3.06484097659929, + "grad_norm": 0.2444847673177719, + "learning_rate": 3.4804597701149426e-05, + "loss": 0.1045, + "step": 2644 + }, + { + "epoch": 3.0660001448960372, + "grad_norm": 0.26770317554473877, + "learning_rate": 3.479885057471264e-05, + "loss": 0.1044, + "step": 2645 + }, + { + "epoch": 3.0671593131927843, + "grad_norm": 0.27558213472366333, + "learning_rate": 3.479310344827587e-05, + "loss": 0.1089, + "step": 2646 + }, + { + "epoch": 3.0683184814895315, + "grad_norm": 0.26137158274650574, + "learning_rate": 3.4787356321839084e-05, + "loss": 0.1059, + "step": 2647 + }, + { + "epoch": 3.069477649786278, + "grad_norm": 0.3349991738796234, + "learning_rate": 3.47816091954023e-05, + "loss": 0.1185, + "step": 2648 + }, + { + "epoch": 3.0706368180830252, + "grad_norm": 0.32622119784355164, + "learning_rate": 3.477586206896552e-05, + "loss": 0.1101, + "step": 2649 + }, + { + "epoch": 3.0717959863797724, + "grad_norm": 0.3372982442378998, + "learning_rate": 3.4770114942528735e-05, + "loss": 0.113, + "step": 2650 + }, + { + "epoch": 3.0729551546765195, + "grad_norm": 0.31593719124794006, + "learning_rate": 3.476436781609196e-05, + "loss": 0.1188, + "step": 2651 + }, + { + "epoch": 3.0741143229732666, + "grad_norm": 0.26254966855049133, + "learning_rate": 3.475862068965517e-05, + "loss": 0.1185, + "step": 2652 + }, + { + "epoch": 3.0752734912700137, + "grad_norm": 0.22806406021118164, + "learning_rate": 3.4752873563218394e-05, + "loss": 0.0998, + "step": 2653 + }, + { + "epoch": 3.076432659566761, + "grad_norm": 0.26122182607650757, + "learning_rate": 3.4747126436781615e-05, + "loss": 0.1111, + "step": 2654 + }, + { + "epoch": 3.077591827863508, + "grad_norm": 0.2905748188495636, + "learning_rate": 3.474137931034483e-05, + "loss": 0.1127, + "step": 2655 + }, + { + "epoch": 3.078750996160255, + "grad_norm": 0.32908645272254944, + "learning_rate": 3.4735632183908045e-05, + "loss": 0.1165, + "step": 2656 + }, + { + "epoch": 3.079910164457002, + "grad_norm": 0.32499000430107117, + "learning_rate": 3.472988505747127e-05, + "loss": 0.1151, + "step": 2657 + }, + { + "epoch": 3.0810693327537493, + "grad_norm": 0.30107995867729187, + "learning_rate": 3.472413793103448e-05, + "loss": 0.1142, + "step": 2658 + }, + { + "epoch": 3.0822285010504964, + "grad_norm": 0.29092562198638916, + "learning_rate": 3.47183908045977e-05, + "loss": 0.1015, + "step": 2659 + }, + { + "epoch": 3.0833876693472435, + "grad_norm": 0.23830758035182953, + "learning_rate": 3.4712643678160925e-05, + "loss": 0.1084, + "step": 2660 + }, + { + "epoch": 3.0845468376439906, + "grad_norm": 0.254711776971817, + "learning_rate": 3.470689655172414e-05, + "loss": 0.106, + "step": 2661 + }, + { + "epoch": 3.0857060059407377, + "grad_norm": 0.24054522812366486, + "learning_rate": 3.470114942528736e-05, + "loss": 0.0995, + "step": 2662 + }, + { + "epoch": 3.0868651742374844, + "grad_norm": 0.2799350917339325, + "learning_rate": 3.4695402298850576e-05, + "loss": 0.1114, + "step": 2663 + }, + { + "epoch": 3.0880243425342315, + "grad_norm": 0.3153596818447113, + "learning_rate": 3.468965517241379e-05, + "loss": 0.1107, + "step": 2664 + }, + { + "epoch": 3.0891835108309786, + "grad_norm": 0.2980448603630066, + "learning_rate": 3.468390804597701e-05, + "loss": 0.1169, + "step": 2665 + }, + { + "epoch": 3.0903426791277258, + "grad_norm": 0.26500749588012695, + "learning_rate": 3.467816091954023e-05, + "loss": 0.1071, + "step": 2666 + }, + { + "epoch": 3.091501847424473, + "grad_norm": 0.308677077293396, + "learning_rate": 3.467241379310345e-05, + "loss": 0.112, + "step": 2667 + }, + { + "epoch": 3.09266101572122, + "grad_norm": 0.3179532289505005, + "learning_rate": 3.466666666666667e-05, + "loss": 0.1219, + "step": 2668 + }, + { + "epoch": 3.093820184017967, + "grad_norm": 0.28046876192092896, + "learning_rate": 3.4660919540229886e-05, + "loss": 0.1197, + "step": 2669 + }, + { + "epoch": 3.094979352314714, + "grad_norm": 0.3597003221511841, + "learning_rate": 3.465517241379311e-05, + "loss": 0.0995, + "step": 2670 + }, + { + "epoch": 3.0961385206114613, + "grad_norm": 0.2430323362350464, + "learning_rate": 3.464942528735632e-05, + "loss": 0.1078, + "step": 2671 + }, + { + "epoch": 3.0972976889082084, + "grad_norm": 0.3916395902633667, + "learning_rate": 3.464367816091954e-05, + "loss": 0.1289, + "step": 2672 + }, + { + "epoch": 3.0984568572049556, + "grad_norm": 0.28007376194000244, + "learning_rate": 3.463793103448276e-05, + "loss": 0.1082, + "step": 2673 + }, + { + "epoch": 3.0996160255017027, + "grad_norm": 0.370697945356369, + "learning_rate": 3.463218390804598e-05, + "loss": 0.1157, + "step": 2674 + }, + { + "epoch": 3.10077519379845, + "grad_norm": 0.27986010909080505, + "learning_rate": 3.4626436781609196e-05, + "loss": 0.1067, + "step": 2675 + }, + { + "epoch": 3.101934362095197, + "grad_norm": 0.34155765175819397, + "learning_rate": 3.462068965517242e-05, + "loss": 0.1124, + "step": 2676 + }, + { + "epoch": 3.1030935303919436, + "grad_norm": 0.27783599495887756, + "learning_rate": 3.461494252873563e-05, + "loss": 0.1084, + "step": 2677 + }, + { + "epoch": 3.1042526986886907, + "grad_norm": 0.32850217819213867, + "learning_rate": 3.460919540229885e-05, + "loss": 0.1029, + "step": 2678 + }, + { + "epoch": 3.105411866985438, + "grad_norm": 0.21651075780391693, + "learning_rate": 3.460344827586207e-05, + "loss": 0.0985, + "step": 2679 + }, + { + "epoch": 3.106571035282185, + "grad_norm": 0.29915618896484375, + "learning_rate": 3.459770114942529e-05, + "loss": 0.1114, + "step": 2680 + }, + { + "epoch": 3.107730203578932, + "grad_norm": 0.2699195146560669, + "learning_rate": 3.459195402298851e-05, + "loss": 0.108, + "step": 2681 + }, + { + "epoch": 3.108889371875679, + "grad_norm": 0.33292192220687866, + "learning_rate": 3.458620689655173e-05, + "loss": 0.121, + "step": 2682 + }, + { + "epoch": 3.1100485401724263, + "grad_norm": 0.48338431119918823, + "learning_rate": 3.458045977011494e-05, + "loss": 0.1148, + "step": 2683 + }, + { + "epoch": 3.1112077084691734, + "grad_norm": 0.5552323460578918, + "learning_rate": 3.4574712643678164e-05, + "loss": 0.1119, + "step": 2684 + }, + { + "epoch": 3.1123668767659205, + "grad_norm": 0.26004117727279663, + "learning_rate": 3.456896551724138e-05, + "loss": 0.1115, + "step": 2685 + }, + { + "epoch": 3.1135260450626676, + "grad_norm": 0.35093238949775696, + "learning_rate": 3.4563218390804594e-05, + "loss": 0.1196, + "step": 2686 + }, + { + "epoch": 3.1146852133594147, + "grad_norm": 0.26992592215538025, + "learning_rate": 3.455747126436782e-05, + "loss": 0.1114, + "step": 2687 + }, + { + "epoch": 3.115844381656162, + "grad_norm": 0.23782843351364136, + "learning_rate": 3.455172413793104e-05, + "loss": 0.112, + "step": 2688 + }, + { + "epoch": 3.117003549952909, + "grad_norm": 0.37826716899871826, + "learning_rate": 3.454597701149426e-05, + "loss": 0.1154, + "step": 2689 + }, + { + "epoch": 3.118162718249656, + "grad_norm": 0.4076231122016907, + "learning_rate": 3.4540229885057474e-05, + "loss": 0.1181, + "step": 2690 + }, + { + "epoch": 3.1193218865464027, + "grad_norm": 0.27670371532440186, + "learning_rate": 3.453448275862069e-05, + "loss": 0.1192, + "step": 2691 + }, + { + "epoch": 3.12048105484315, + "grad_norm": 0.3018391728401184, + "learning_rate": 3.452873563218391e-05, + "loss": 0.112, + "step": 2692 + }, + { + "epoch": 3.121640223139897, + "grad_norm": 0.3222883641719818, + "learning_rate": 3.4522988505747125e-05, + "loss": 0.1108, + "step": 2693 + }, + { + "epoch": 3.122799391436644, + "grad_norm": 0.32334110140800476, + "learning_rate": 3.451724137931035e-05, + "loss": 0.1059, + "step": 2694 + }, + { + "epoch": 3.123958559733391, + "grad_norm": 0.3832739293575287, + "learning_rate": 3.451149425287357e-05, + "loss": 0.1176, + "step": 2695 + }, + { + "epoch": 3.1251177280301383, + "grad_norm": 0.30942925810813904, + "learning_rate": 3.4505747126436783e-05, + "loss": 0.1249, + "step": 2696 + }, + { + "epoch": 3.1262768963268854, + "grad_norm": 0.25625792145729065, + "learning_rate": 3.45e-05, + "loss": 0.1054, + "step": 2697 + }, + { + "epoch": 3.1274360646236326, + "grad_norm": 0.28217336535453796, + "learning_rate": 3.449425287356322e-05, + "loss": 0.1095, + "step": 2698 + }, + { + "epoch": 3.1285952329203797, + "grad_norm": 0.2541414499282837, + "learning_rate": 3.4488505747126435e-05, + "loss": 0.108, + "step": 2699 + }, + { + "epoch": 3.129754401217127, + "grad_norm": 0.25402045249938965, + "learning_rate": 3.4482758620689657e-05, + "loss": 0.1133, + "step": 2700 + }, + { + "epoch": 3.130913569513874, + "grad_norm": 0.23192809522151947, + "learning_rate": 3.447701149425288e-05, + "loss": 0.1102, + "step": 2701 + }, + { + "epoch": 3.132072737810621, + "grad_norm": 0.3315061926841736, + "learning_rate": 3.447126436781609e-05, + "loss": 0.1214, + "step": 2702 + }, + { + "epoch": 3.133231906107368, + "grad_norm": 0.2749609053134918, + "learning_rate": 3.4465517241379315e-05, + "loss": 0.1005, + "step": 2703 + }, + { + "epoch": 3.1343910744041152, + "grad_norm": 0.36684563755989075, + "learning_rate": 3.445977011494253e-05, + "loss": 0.1055, + "step": 2704 + }, + { + "epoch": 3.1355502427008624, + "grad_norm": 0.30152618885040283, + "learning_rate": 3.4454022988505745e-05, + "loss": 0.1053, + "step": 2705 + }, + { + "epoch": 3.136709410997609, + "grad_norm": 0.24934984743595123, + "learning_rate": 3.4448275862068966e-05, + "loss": 0.1231, + "step": 2706 + }, + { + "epoch": 3.137868579294356, + "grad_norm": 0.3130822777748108, + "learning_rate": 3.444252873563219e-05, + "loss": 0.1075, + "step": 2707 + }, + { + "epoch": 3.1390277475911033, + "grad_norm": 0.28072500228881836, + "learning_rate": 3.443678160919541e-05, + "loss": 0.1016, + "step": 2708 + }, + { + "epoch": 3.1401869158878504, + "grad_norm": 0.36836108565330505, + "learning_rate": 3.4431034482758625e-05, + "loss": 0.1132, + "step": 2709 + }, + { + "epoch": 3.1413460841845975, + "grad_norm": 0.40246373414993286, + "learning_rate": 3.442528735632184e-05, + "loss": 0.1265, + "step": 2710 + }, + { + "epoch": 3.1425052524813446, + "grad_norm": 0.3051183223724365, + "learning_rate": 3.441954022988506e-05, + "loss": 0.0992, + "step": 2711 + }, + { + "epoch": 3.1436644207780917, + "grad_norm": 0.2942332625389099, + "learning_rate": 3.4413793103448276e-05, + "loss": 0.1132, + "step": 2712 + }, + { + "epoch": 3.144823589074839, + "grad_norm": 0.34289440512657166, + "learning_rate": 3.440804597701149e-05, + "loss": 0.112, + "step": 2713 + }, + { + "epoch": 3.145982757371586, + "grad_norm": 0.273400217294693, + "learning_rate": 3.440229885057471e-05, + "loss": 0.0977, + "step": 2714 + }, + { + "epoch": 3.147141925668333, + "grad_norm": 0.3162487745285034, + "learning_rate": 3.4396551724137934e-05, + "loss": 0.1127, + "step": 2715 + }, + { + "epoch": 3.14830109396508, + "grad_norm": 0.33499714732170105, + "learning_rate": 3.439080459770115e-05, + "loss": 0.112, + "step": 2716 + }, + { + "epoch": 3.1494602622618273, + "grad_norm": 0.28966301679611206, + "learning_rate": 3.438505747126437e-05, + "loss": 0.1062, + "step": 2717 + }, + { + "epoch": 3.1506194305585744, + "grad_norm": 0.2868296205997467, + "learning_rate": 3.4379310344827586e-05, + "loss": 0.1149, + "step": 2718 + }, + { + "epoch": 3.151778598855321, + "grad_norm": 0.4284828007221222, + "learning_rate": 3.437356321839081e-05, + "loss": 0.1244, + "step": 2719 + }, + { + "epoch": 3.152937767152068, + "grad_norm": 0.3205833435058594, + "learning_rate": 3.436781609195402e-05, + "loss": 0.0985, + "step": 2720 + }, + { + "epoch": 3.1540969354488153, + "grad_norm": 0.29444050788879395, + "learning_rate": 3.4362068965517244e-05, + "loss": 0.1103, + "step": 2721 + }, + { + "epoch": 3.1552561037455624, + "grad_norm": 0.2642725706100464, + "learning_rate": 3.4356321839080466e-05, + "loss": 0.1047, + "step": 2722 + }, + { + "epoch": 3.1564152720423095, + "grad_norm": 0.34533828496932983, + "learning_rate": 3.435057471264368e-05, + "loss": 0.1131, + "step": 2723 + }, + { + "epoch": 3.1575744403390567, + "grad_norm": 0.31518617272377014, + "learning_rate": 3.4344827586206896e-05, + "loss": 0.1065, + "step": 2724 + }, + { + "epoch": 3.1587336086358038, + "grad_norm": 0.3125956058502197, + "learning_rate": 3.433908045977012e-05, + "loss": 0.1039, + "step": 2725 + }, + { + "epoch": 3.159892776932551, + "grad_norm": 0.24645163118839264, + "learning_rate": 3.433333333333333e-05, + "loss": 0.1016, + "step": 2726 + }, + { + "epoch": 3.161051945229298, + "grad_norm": 0.42728784680366516, + "learning_rate": 3.4327586206896554e-05, + "loss": 0.101, + "step": 2727 + }, + { + "epoch": 3.162211113526045, + "grad_norm": 0.5982040166854858, + "learning_rate": 3.4321839080459775e-05, + "loss": 0.1286, + "step": 2728 + }, + { + "epoch": 3.1633702818227922, + "grad_norm": 0.3353968560695648, + "learning_rate": 3.431609195402299e-05, + "loss": 0.1232, + "step": 2729 + }, + { + "epoch": 3.1645294501195393, + "grad_norm": 0.3435153663158417, + "learning_rate": 3.431034482758621e-05, + "loss": 0.1188, + "step": 2730 + }, + { + "epoch": 3.1656886184162865, + "grad_norm": 0.39836829900741577, + "learning_rate": 3.430459770114943e-05, + "loss": 0.121, + "step": 2731 + }, + { + "epoch": 3.1668477867130336, + "grad_norm": 0.30429473519325256, + "learning_rate": 3.429885057471264e-05, + "loss": 0.1089, + "step": 2732 + }, + { + "epoch": 3.1680069550097807, + "grad_norm": 0.3069549798965454, + "learning_rate": 3.4293103448275864e-05, + "loss": 0.1196, + "step": 2733 + }, + { + "epoch": 3.1691661233065274, + "grad_norm": 0.2685747444629669, + "learning_rate": 3.428735632183908e-05, + "loss": 0.1202, + "step": 2734 + }, + { + "epoch": 3.1703252916032745, + "grad_norm": 0.2461479902267456, + "learning_rate": 3.42816091954023e-05, + "loss": 0.1145, + "step": 2735 + }, + { + "epoch": 3.1714844599000216, + "grad_norm": 0.29422634840011597, + "learning_rate": 3.427586206896552e-05, + "loss": 0.118, + "step": 2736 + }, + { + "epoch": 3.1726436281967687, + "grad_norm": 0.3010058104991913, + "learning_rate": 3.427011494252874e-05, + "loss": 0.1168, + "step": 2737 + }, + { + "epoch": 3.173802796493516, + "grad_norm": 0.2945718765258789, + "learning_rate": 3.426436781609196e-05, + "loss": 0.109, + "step": 2738 + }, + { + "epoch": 3.174961964790263, + "grad_norm": 0.2734999358654022, + "learning_rate": 3.425862068965517e-05, + "loss": 0.1117, + "step": 2739 + }, + { + "epoch": 3.17612113308701, + "grad_norm": 0.25881072878837585, + "learning_rate": 3.425287356321839e-05, + "loss": 0.1062, + "step": 2740 + }, + { + "epoch": 3.177280301383757, + "grad_norm": 0.2748000919818878, + "learning_rate": 3.424712643678161e-05, + "loss": 0.1093, + "step": 2741 + }, + { + "epoch": 3.1784394696805043, + "grad_norm": 0.37148886919021606, + "learning_rate": 3.424137931034483e-05, + "loss": 0.1227, + "step": 2742 + }, + { + "epoch": 3.1795986379772514, + "grad_norm": 0.32973724603652954, + "learning_rate": 3.4235632183908046e-05, + "loss": 0.1057, + "step": 2743 + }, + { + "epoch": 3.1807578062739985, + "grad_norm": 0.27131950855255127, + "learning_rate": 3.422988505747127e-05, + "loss": 0.106, + "step": 2744 + }, + { + "epoch": 3.1819169745707456, + "grad_norm": 0.25479647517204285, + "learning_rate": 3.422413793103448e-05, + "loss": 0.1154, + "step": 2745 + }, + { + "epoch": 3.1830761428674927, + "grad_norm": 0.331696480512619, + "learning_rate": 3.4218390804597705e-05, + "loss": 0.1166, + "step": 2746 + }, + { + "epoch": 3.18423531116424, + "grad_norm": 0.30211329460144043, + "learning_rate": 3.421264367816092e-05, + "loss": 0.1158, + "step": 2747 + }, + { + "epoch": 3.185394479460987, + "grad_norm": 0.265440434217453, + "learning_rate": 3.420689655172414e-05, + "loss": 0.0981, + "step": 2748 + }, + { + "epoch": 3.1865536477577336, + "grad_norm": 0.34413856267929077, + "learning_rate": 3.420114942528736e-05, + "loss": 0.1058, + "step": 2749 + }, + { + "epoch": 3.1877128160544808, + "grad_norm": 0.3049376308917999, + "learning_rate": 3.419540229885058e-05, + "loss": 0.107, + "step": 2750 + }, + { + "epoch": 3.188871984351228, + "grad_norm": 0.24063755571842194, + "learning_rate": 3.418965517241379e-05, + "loss": 0.1063, + "step": 2751 + }, + { + "epoch": 3.190031152647975, + "grad_norm": 0.40382859110832214, + "learning_rate": 3.4183908045977014e-05, + "loss": 0.1112, + "step": 2752 + }, + { + "epoch": 3.191190320944722, + "grad_norm": 0.3165836036205292, + "learning_rate": 3.417816091954023e-05, + "loss": 0.1113, + "step": 2753 + }, + { + "epoch": 3.192349489241469, + "grad_norm": 0.3423636555671692, + "learning_rate": 3.4172413793103444e-05, + "loss": 0.1036, + "step": 2754 + }, + { + "epoch": 3.1935086575382163, + "grad_norm": 0.3611285388469696, + "learning_rate": 3.4166666666666666e-05, + "loss": 0.1057, + "step": 2755 + }, + { + "epoch": 3.1946678258349634, + "grad_norm": 0.2877699136734009, + "learning_rate": 3.416091954022989e-05, + "loss": 0.1124, + "step": 2756 + }, + { + "epoch": 3.1958269941317106, + "grad_norm": 0.32133352756500244, + "learning_rate": 3.415517241379311e-05, + "loss": 0.1136, + "step": 2757 + }, + { + "epoch": 3.1969861624284577, + "grad_norm": 0.28176063299179077, + "learning_rate": 3.4149425287356324e-05, + "loss": 0.1036, + "step": 2758 + }, + { + "epoch": 3.198145330725205, + "grad_norm": 0.35929253697395325, + "learning_rate": 3.414367816091954e-05, + "loss": 0.1226, + "step": 2759 + }, + { + "epoch": 3.199304499021952, + "grad_norm": 0.37938886880874634, + "learning_rate": 3.413793103448276e-05, + "loss": 0.1133, + "step": 2760 + }, + { + "epoch": 3.200463667318699, + "grad_norm": 0.27585509419441223, + "learning_rate": 3.4132183908045976e-05, + "loss": 0.0993, + "step": 2761 + }, + { + "epoch": 3.2016228356154457, + "grad_norm": 0.33102548122406006, + "learning_rate": 3.41264367816092e-05, + "loss": 0.1055, + "step": 2762 + }, + { + "epoch": 3.202782003912193, + "grad_norm": 0.4190862774848938, + "learning_rate": 3.412068965517242e-05, + "loss": 0.1232, + "step": 2763 + }, + { + "epoch": 3.20394117220894, + "grad_norm": 0.3287636935710907, + "learning_rate": 3.4114942528735634e-05, + "loss": 0.1189, + "step": 2764 + }, + { + "epoch": 3.205100340505687, + "grad_norm": 0.27761226892471313, + "learning_rate": 3.4109195402298856e-05, + "loss": 0.1102, + "step": 2765 + }, + { + "epoch": 3.206259508802434, + "grad_norm": 0.271768718957901, + "learning_rate": 3.410344827586207e-05, + "loss": 0.1042, + "step": 2766 + }, + { + "epoch": 3.2074186770991813, + "grad_norm": 0.44091716408729553, + "learning_rate": 3.4097701149425285e-05, + "loss": 0.1098, + "step": 2767 + }, + { + "epoch": 3.2085778453959284, + "grad_norm": 0.32181066274642944, + "learning_rate": 3.409195402298851e-05, + "loss": 0.1077, + "step": 2768 + }, + { + "epoch": 3.2097370136926755, + "grad_norm": 0.33620011806488037, + "learning_rate": 3.408620689655173e-05, + "loss": 0.1113, + "step": 2769 + }, + { + "epoch": 3.2108961819894226, + "grad_norm": 0.3269021511077881, + "learning_rate": 3.4080459770114944e-05, + "loss": 0.1266, + "step": 2770 + }, + { + "epoch": 3.2120553502861697, + "grad_norm": 0.3237850069999695, + "learning_rate": 3.4074712643678165e-05, + "loss": 0.1132, + "step": 2771 + }, + { + "epoch": 3.213214518582917, + "grad_norm": 0.37990424036979675, + "learning_rate": 3.406896551724138e-05, + "loss": 0.1028, + "step": 2772 + }, + { + "epoch": 3.214373686879664, + "grad_norm": 0.28017759323120117, + "learning_rate": 3.40632183908046e-05, + "loss": 0.108, + "step": 2773 + }, + { + "epoch": 3.215532855176411, + "grad_norm": 0.3556600511074066, + "learning_rate": 3.405747126436782e-05, + "loss": 0.121, + "step": 2774 + }, + { + "epoch": 3.216692023473158, + "grad_norm": 0.26597216725349426, + "learning_rate": 3.405172413793103e-05, + "loss": 0.1024, + "step": 2775 + }, + { + "epoch": 3.2178511917699053, + "grad_norm": 0.35249412059783936, + "learning_rate": 3.4045977011494253e-05, + "loss": 0.1202, + "step": 2776 + }, + { + "epoch": 3.219010360066652, + "grad_norm": 0.35307571291923523, + "learning_rate": 3.4040229885057475e-05, + "loss": 0.1139, + "step": 2777 + }, + { + "epoch": 3.220169528363399, + "grad_norm": 0.25899243354797363, + "learning_rate": 3.403448275862069e-05, + "loss": 0.1079, + "step": 2778 + }, + { + "epoch": 3.221328696660146, + "grad_norm": 0.3125319480895996, + "learning_rate": 3.402873563218391e-05, + "loss": 0.1137, + "step": 2779 + }, + { + "epoch": 3.2224878649568933, + "grad_norm": 0.3008607029914856, + "learning_rate": 3.4022988505747127e-05, + "loss": 0.1013, + "step": 2780 + }, + { + "epoch": 3.2236470332536404, + "grad_norm": 0.2770404517650604, + "learning_rate": 3.401724137931034e-05, + "loss": 0.1137, + "step": 2781 + }, + { + "epoch": 3.2248062015503876, + "grad_norm": 0.35955601930618286, + "learning_rate": 3.401149425287356e-05, + "loss": 0.1108, + "step": 2782 + }, + { + "epoch": 3.2259653698471347, + "grad_norm": 0.2946624159812927, + "learning_rate": 3.4005747126436785e-05, + "loss": 0.1023, + "step": 2783 + }, + { + "epoch": 3.227124538143882, + "grad_norm": 0.34001263976097107, + "learning_rate": 3.4000000000000007e-05, + "loss": 0.1049, + "step": 2784 + }, + { + "epoch": 3.228283706440629, + "grad_norm": 0.2924330234527588, + "learning_rate": 3.399425287356322e-05, + "loss": 0.1097, + "step": 2785 + }, + { + "epoch": 3.229442874737376, + "grad_norm": 0.3034241199493408, + "learning_rate": 3.3988505747126436e-05, + "loss": 0.1148, + "step": 2786 + }, + { + "epoch": 3.230602043034123, + "grad_norm": 0.3855332136154175, + "learning_rate": 3.398275862068966e-05, + "loss": 0.0945, + "step": 2787 + }, + { + "epoch": 3.2317612113308702, + "grad_norm": 0.24068623781204224, + "learning_rate": 3.397701149425287e-05, + "loss": 0.1092, + "step": 2788 + }, + { + "epoch": 3.2329203796276174, + "grad_norm": 0.2632269859313965, + "learning_rate": 3.3971264367816095e-05, + "loss": 0.1123, + "step": 2789 + }, + { + "epoch": 3.2340795479243645, + "grad_norm": 0.2876719832420349, + "learning_rate": 3.3965517241379316e-05, + "loss": 0.1158, + "step": 2790 + }, + { + "epoch": 3.2352387162211116, + "grad_norm": 0.3035225570201874, + "learning_rate": 3.395977011494253e-05, + "loss": 0.1187, + "step": 2791 + }, + { + "epoch": 3.2363978845178583, + "grad_norm": 0.27335041761398315, + "learning_rate": 3.395402298850575e-05, + "loss": 0.1005, + "step": 2792 + }, + { + "epoch": 3.2375570528146054, + "grad_norm": 0.2715912461280823, + "learning_rate": 3.394827586206897e-05, + "loss": 0.1102, + "step": 2793 + }, + { + "epoch": 3.2387162211113525, + "grad_norm": 0.2653818726539612, + "learning_rate": 3.394252873563218e-05, + "loss": 0.1119, + "step": 2794 + }, + { + "epoch": 3.2398753894080996, + "grad_norm": 0.3413951098918915, + "learning_rate": 3.3936781609195404e-05, + "loss": 0.1209, + "step": 2795 + }, + { + "epoch": 3.2410345577048467, + "grad_norm": 0.3429463505744934, + "learning_rate": 3.393103448275862e-05, + "loss": 0.1285, + "step": 2796 + }, + { + "epoch": 3.242193726001594, + "grad_norm": 0.3493320345878601, + "learning_rate": 3.392528735632184e-05, + "loss": 0.1169, + "step": 2797 + }, + { + "epoch": 3.243352894298341, + "grad_norm": 0.31924867630004883, + "learning_rate": 3.391954022988506e-05, + "loss": 0.1119, + "step": 2798 + }, + { + "epoch": 3.244512062595088, + "grad_norm": 0.42040061950683594, + "learning_rate": 3.391379310344828e-05, + "loss": 0.1106, + "step": 2799 + }, + { + "epoch": 3.245671230891835, + "grad_norm": 0.3470522463321686, + "learning_rate": 3.390804597701149e-05, + "loss": 0.1122, + "step": 2800 + }, + { + "epoch": 3.2468303991885823, + "grad_norm": 0.356770783662796, + "learning_rate": 3.3902298850574714e-05, + "loss": 0.1191, + "step": 2801 + }, + { + "epoch": 3.2479895674853294, + "grad_norm": 0.40925726294517517, + "learning_rate": 3.389655172413793e-05, + "loss": 0.1073, + "step": 2802 + }, + { + "epoch": 3.2491487357820765, + "grad_norm": 0.34631669521331787, + "learning_rate": 3.389080459770115e-05, + "loss": 0.1188, + "step": 2803 + }, + { + "epoch": 3.2503079040788236, + "grad_norm": 0.3311026394367218, + "learning_rate": 3.388505747126437e-05, + "loss": 0.1099, + "step": 2804 + }, + { + "epoch": 3.2514670723755703, + "grad_norm": 0.34615299105644226, + "learning_rate": 3.387931034482759e-05, + "loss": 0.1197, + "step": 2805 + }, + { + "epoch": 3.2526262406723174, + "grad_norm": 0.33618980646133423, + "learning_rate": 3.387356321839081e-05, + "loss": 0.1158, + "step": 2806 + }, + { + "epoch": 3.2537854089690645, + "grad_norm": 0.30485883355140686, + "learning_rate": 3.3867816091954024e-05, + "loss": 0.1141, + "step": 2807 + }, + { + "epoch": 3.2549445772658117, + "grad_norm": 0.35726383328437805, + "learning_rate": 3.386206896551724e-05, + "loss": 0.1059, + "step": 2808 + }, + { + "epoch": 3.2561037455625588, + "grad_norm": 0.45298418402671814, + "learning_rate": 3.385632183908046e-05, + "loss": 0.1142, + "step": 2809 + }, + { + "epoch": 3.257262913859306, + "grad_norm": 0.2797136902809143, + "learning_rate": 3.385057471264368e-05, + "loss": 0.104, + "step": 2810 + }, + { + "epoch": 3.258422082156053, + "grad_norm": 0.2695922553539276, + "learning_rate": 3.3844827586206904e-05, + "loss": 0.1047, + "step": 2811 + }, + { + "epoch": 3.2595812504528, + "grad_norm": 0.2859964668750763, + "learning_rate": 3.383908045977012e-05, + "loss": 0.1057, + "step": 2812 + }, + { + "epoch": 3.2607404187495472, + "grad_norm": 0.26754042506217957, + "learning_rate": 3.3833333333333334e-05, + "loss": 0.1138, + "step": 2813 + }, + { + "epoch": 3.2618995870462943, + "grad_norm": 0.3192819058895111, + "learning_rate": 3.3827586206896555e-05, + "loss": 0.1159, + "step": 2814 + }, + { + "epoch": 3.2630587553430415, + "grad_norm": 0.33752667903900146, + "learning_rate": 3.382183908045977e-05, + "loss": 0.1124, + "step": 2815 + }, + { + "epoch": 3.2642179236397886, + "grad_norm": 0.2708832025527954, + "learning_rate": 3.3816091954022985e-05, + "loss": 0.1181, + "step": 2816 + }, + { + "epoch": 3.2653770919365357, + "grad_norm": 0.3345796465873718, + "learning_rate": 3.381034482758621e-05, + "loss": 0.1123, + "step": 2817 + }, + { + "epoch": 3.266536260233283, + "grad_norm": 0.33695465326309204, + "learning_rate": 3.380459770114943e-05, + "loss": 0.0986, + "step": 2818 + }, + { + "epoch": 3.26769542853003, + "grad_norm": 0.2809793949127197, + "learning_rate": 3.379885057471264e-05, + "loss": 0.1151, + "step": 2819 + }, + { + "epoch": 3.2688545968267766, + "grad_norm": 0.35505884885787964, + "learning_rate": 3.3793103448275865e-05, + "loss": 0.1203, + "step": 2820 + }, + { + "epoch": 3.2700137651235237, + "grad_norm": 0.2393438071012497, + "learning_rate": 3.378735632183908e-05, + "loss": 0.1109, + "step": 2821 + }, + { + "epoch": 3.271172933420271, + "grad_norm": 0.35280153155326843, + "learning_rate": 3.37816091954023e-05, + "loss": 0.1217, + "step": 2822 + }, + { + "epoch": 3.272332101717018, + "grad_norm": 0.3676847815513611, + "learning_rate": 3.3775862068965516e-05, + "loss": 0.1182, + "step": 2823 + }, + { + "epoch": 3.273491270013765, + "grad_norm": 0.26780447363853455, + "learning_rate": 3.377011494252874e-05, + "loss": 0.1093, + "step": 2824 + }, + { + "epoch": 3.274650438310512, + "grad_norm": 0.3136596381664276, + "learning_rate": 3.376436781609196e-05, + "loss": 0.1126, + "step": 2825 + }, + { + "epoch": 3.2758096066072593, + "grad_norm": 0.26697683334350586, + "learning_rate": 3.3758620689655175e-05, + "loss": 0.1163, + "step": 2826 + }, + { + "epoch": 3.2769687749040064, + "grad_norm": 0.32463008165359497, + "learning_rate": 3.375287356321839e-05, + "loss": 0.1113, + "step": 2827 + }, + { + "epoch": 3.2781279432007535, + "grad_norm": 0.3022761344909668, + "learning_rate": 3.374712643678161e-05, + "loss": 0.113, + "step": 2828 + }, + { + "epoch": 3.2792871114975006, + "grad_norm": 0.2690412700176239, + "learning_rate": 3.3741379310344826e-05, + "loss": 0.1018, + "step": 2829 + }, + { + "epoch": 3.2804462797942477, + "grad_norm": 0.2957005798816681, + "learning_rate": 3.373563218390805e-05, + "loss": 0.1163, + "step": 2830 + }, + { + "epoch": 3.281605448090995, + "grad_norm": 0.3116534352302551, + "learning_rate": 3.372988505747127e-05, + "loss": 0.1263, + "step": 2831 + }, + { + "epoch": 3.282764616387742, + "grad_norm": 0.2315097600221634, + "learning_rate": 3.3724137931034484e-05, + "loss": 0.1036, + "step": 2832 + }, + { + "epoch": 3.2839237846844886, + "grad_norm": 0.5004782676696777, + "learning_rate": 3.3718390804597706e-05, + "loss": 0.1086, + "step": 2833 + }, + { + "epoch": 3.285082952981236, + "grad_norm": 0.3481523096561432, + "learning_rate": 3.371264367816092e-05, + "loss": 0.1112, + "step": 2834 + }, + { + "epoch": 3.286242121277983, + "grad_norm": 0.27010461688041687, + "learning_rate": 3.3706896551724136e-05, + "loss": 0.1145, + "step": 2835 + }, + { + "epoch": 3.28740128957473, + "grad_norm": 0.30692821741104126, + "learning_rate": 3.370114942528736e-05, + "loss": 0.1043, + "step": 2836 + }, + { + "epoch": 3.288560457871477, + "grad_norm": 0.33265191316604614, + "learning_rate": 3.369540229885057e-05, + "loss": 0.1151, + "step": 2837 + }, + { + "epoch": 3.289719626168224, + "grad_norm": 0.328311562538147, + "learning_rate": 3.3689655172413794e-05, + "loss": 0.1136, + "step": 2838 + }, + { + "epoch": 3.2908787944649713, + "grad_norm": 0.2806066870689392, + "learning_rate": 3.3683908045977016e-05, + "loss": 0.114, + "step": 2839 + }, + { + "epoch": 3.2920379627617185, + "grad_norm": 0.30125707387924194, + "learning_rate": 3.367816091954023e-05, + "loss": 0.128, + "step": 2840 + }, + { + "epoch": 3.2931971310584656, + "grad_norm": 0.31410592794418335, + "learning_rate": 3.367241379310345e-05, + "loss": 0.1133, + "step": 2841 + }, + { + "epoch": 3.2943562993552127, + "grad_norm": 0.24413692951202393, + "learning_rate": 3.366666666666667e-05, + "loss": 0.1056, + "step": 2842 + }, + { + "epoch": 3.29551546765196, + "grad_norm": 0.3842843472957611, + "learning_rate": 3.366091954022988e-05, + "loss": 0.1107, + "step": 2843 + }, + { + "epoch": 3.296674635948707, + "grad_norm": 0.2507694661617279, + "learning_rate": 3.3655172413793104e-05, + "loss": 0.108, + "step": 2844 + }, + { + "epoch": 3.297833804245454, + "grad_norm": 0.28499695658683777, + "learning_rate": 3.3649425287356326e-05, + "loss": 0.1169, + "step": 2845 + }, + { + "epoch": 3.298992972542201, + "grad_norm": 0.2902618646621704, + "learning_rate": 3.364367816091954e-05, + "loss": 0.118, + "step": 2846 + }, + { + "epoch": 3.3001521408389483, + "grad_norm": 0.26456037163734436, + "learning_rate": 3.363793103448276e-05, + "loss": 0.1105, + "step": 2847 + }, + { + "epoch": 3.301311309135695, + "grad_norm": 0.3139413595199585, + "learning_rate": 3.363218390804598e-05, + "loss": 0.1272, + "step": 2848 + }, + { + "epoch": 3.302470477432442, + "grad_norm": 0.282294899225235, + "learning_rate": 3.36264367816092e-05, + "loss": 0.1098, + "step": 2849 + }, + { + "epoch": 3.303629645729189, + "grad_norm": 0.2812632918357849, + "learning_rate": 3.3620689655172414e-05, + "loss": 0.1124, + "step": 2850 + }, + { + "epoch": 3.3047888140259363, + "grad_norm": 0.27748167514801025, + "learning_rate": 3.3614942528735635e-05, + "loss": 0.1028, + "step": 2851 + }, + { + "epoch": 3.3059479823226834, + "grad_norm": 0.39685526490211487, + "learning_rate": 3.360919540229886e-05, + "loss": 0.1191, + "step": 2852 + }, + { + "epoch": 3.3071071506194305, + "grad_norm": 0.33615151047706604, + "learning_rate": 3.360344827586207e-05, + "loss": 0.1085, + "step": 2853 + }, + { + "epoch": 3.3082663189161776, + "grad_norm": 0.2919673025608063, + "learning_rate": 3.359770114942529e-05, + "loss": 0.1045, + "step": 2854 + }, + { + "epoch": 3.3094254872129247, + "grad_norm": 0.2599186301231384, + "learning_rate": 3.359195402298851e-05, + "loss": 0.1106, + "step": 2855 + }, + { + "epoch": 3.310584655509672, + "grad_norm": 0.3396240174770355, + "learning_rate": 3.3586206896551723e-05, + "loss": 0.1063, + "step": 2856 + }, + { + "epoch": 3.311743823806419, + "grad_norm": 0.2739236354827881, + "learning_rate": 3.358045977011494e-05, + "loss": 0.1033, + "step": 2857 + }, + { + "epoch": 3.312902992103166, + "grad_norm": 0.5088075399398804, + "learning_rate": 3.357471264367817e-05, + "loss": 0.1295, + "step": 2858 + }, + { + "epoch": 3.314062160399913, + "grad_norm": 0.3902330994606018, + "learning_rate": 3.356896551724138e-05, + "loss": 0.1233, + "step": 2859 + }, + { + "epoch": 3.3152213286966603, + "grad_norm": 0.33790311217308044, + "learning_rate": 3.35632183908046e-05, + "loss": 0.1204, + "step": 2860 + }, + { + "epoch": 3.3163804969934074, + "grad_norm": 0.2987493872642517, + "learning_rate": 3.355747126436782e-05, + "loss": 0.1068, + "step": 2861 + }, + { + "epoch": 3.3175396652901545, + "grad_norm": 0.3023374080657959, + "learning_rate": 3.355172413793103e-05, + "loss": 0.1109, + "step": 2862 + }, + { + "epoch": 3.318698833586901, + "grad_norm": 0.26257169246673584, + "learning_rate": 3.3545977011494255e-05, + "loss": 0.1092, + "step": 2863 + }, + { + "epoch": 3.3198580018836483, + "grad_norm": 0.22975148260593414, + "learning_rate": 3.354022988505747e-05, + "loss": 0.1036, + "step": 2864 + }, + { + "epoch": 3.3210171701803954, + "grad_norm": 0.29056644439697266, + "learning_rate": 3.353448275862069e-05, + "loss": 0.1094, + "step": 2865 + }, + { + "epoch": 3.3221763384771426, + "grad_norm": 0.3132513761520386, + "learning_rate": 3.352873563218391e-05, + "loss": 0.1257, + "step": 2866 + }, + { + "epoch": 3.3233355067738897, + "grad_norm": 0.23093485832214355, + "learning_rate": 3.352298850574713e-05, + "loss": 0.1044, + "step": 2867 + }, + { + "epoch": 3.324494675070637, + "grad_norm": 0.3373148441314697, + "learning_rate": 3.351724137931035e-05, + "loss": 0.1174, + "step": 2868 + }, + { + "epoch": 3.325653843367384, + "grad_norm": 0.2774425148963928, + "learning_rate": 3.3511494252873565e-05, + "loss": 0.1137, + "step": 2869 + }, + { + "epoch": 3.326813011664131, + "grad_norm": 0.2680792808532715, + "learning_rate": 3.350574712643678e-05, + "loss": 0.1045, + "step": 2870 + }, + { + "epoch": 3.327972179960878, + "grad_norm": 0.39170289039611816, + "learning_rate": 3.35e-05, + "loss": 0.1121, + "step": 2871 + }, + { + "epoch": 3.3291313482576252, + "grad_norm": 0.2982361614704132, + "learning_rate": 3.349425287356322e-05, + "loss": 0.1169, + "step": 2872 + }, + { + "epoch": 3.3302905165543724, + "grad_norm": 0.2685178816318512, + "learning_rate": 3.348850574712644e-05, + "loss": 0.1001, + "step": 2873 + }, + { + "epoch": 3.3314496848511195, + "grad_norm": 0.35294216871261597, + "learning_rate": 3.348275862068966e-05, + "loss": 0.1087, + "step": 2874 + }, + { + "epoch": 3.3326088531478666, + "grad_norm": 0.3147522509098053, + "learning_rate": 3.3477011494252874e-05, + "loss": 0.1244, + "step": 2875 + }, + { + "epoch": 3.3337680214446133, + "grad_norm": 0.2905179560184479, + "learning_rate": 3.347126436781609e-05, + "loss": 0.1017, + "step": 2876 + }, + { + "epoch": 3.334927189741361, + "grad_norm": 0.28454288840293884, + "learning_rate": 3.346551724137931e-05, + "loss": 0.117, + "step": 2877 + }, + { + "epoch": 3.3360863580381075, + "grad_norm": 0.24592730402946472, + "learning_rate": 3.3459770114942526e-05, + "loss": 0.105, + "step": 2878 + }, + { + "epoch": 3.3372455263348546, + "grad_norm": 0.2803703248500824, + "learning_rate": 3.3454022988505754e-05, + "loss": 0.1251, + "step": 2879 + }, + { + "epoch": 3.3384046946316017, + "grad_norm": 0.2251860499382019, + "learning_rate": 3.344827586206897e-05, + "loss": 0.0974, + "step": 2880 + }, + { + "epoch": 3.339563862928349, + "grad_norm": 0.30766502022743225, + "learning_rate": 3.3442528735632184e-05, + "loss": 0.1055, + "step": 2881 + }, + { + "epoch": 3.340723031225096, + "grad_norm": 0.31499797105789185, + "learning_rate": 3.3436781609195406e-05, + "loss": 0.1112, + "step": 2882 + }, + { + "epoch": 3.341882199521843, + "grad_norm": 0.3749004900455475, + "learning_rate": 3.343103448275862e-05, + "loss": 0.1056, + "step": 2883 + }, + { + "epoch": 3.34304136781859, + "grad_norm": 0.3857807517051697, + "learning_rate": 3.3425287356321836e-05, + "loss": 0.1111, + "step": 2884 + }, + { + "epoch": 3.3442005361153373, + "grad_norm": 0.31204038858413696, + "learning_rate": 3.341954022988506e-05, + "loss": 0.1077, + "step": 2885 + }, + { + "epoch": 3.3453597044120844, + "grad_norm": 0.3444509506225586, + "learning_rate": 3.341379310344828e-05, + "loss": 0.1068, + "step": 2886 + }, + { + "epoch": 3.3465188727088315, + "grad_norm": 0.34033796191215515, + "learning_rate": 3.34080459770115e-05, + "loss": 0.1187, + "step": 2887 + }, + { + "epoch": 3.3476780410055786, + "grad_norm": 0.3054271936416626, + "learning_rate": 3.3402298850574715e-05, + "loss": 0.118, + "step": 2888 + }, + { + "epoch": 3.3488372093023258, + "grad_norm": 0.2640574276447296, + "learning_rate": 3.339655172413793e-05, + "loss": 0.1113, + "step": 2889 + }, + { + "epoch": 3.349996377599073, + "grad_norm": 0.9027268886566162, + "learning_rate": 3.339080459770115e-05, + "loss": 0.1232, + "step": 2890 + }, + { + "epoch": 3.3511555458958195, + "grad_norm": 0.2914552390575409, + "learning_rate": 3.338505747126437e-05, + "loss": 0.1053, + "step": 2891 + }, + { + "epoch": 3.3523147141925667, + "grad_norm": 0.2577556371688843, + "learning_rate": 3.337931034482759e-05, + "loss": 0.1057, + "step": 2892 + }, + { + "epoch": 3.3534738824893138, + "grad_norm": 0.2784647047519684, + "learning_rate": 3.337356321839081e-05, + "loss": 0.1012, + "step": 2893 + }, + { + "epoch": 3.354633050786061, + "grad_norm": 0.2601286470890045, + "learning_rate": 3.3367816091954025e-05, + "loss": 0.1066, + "step": 2894 + }, + { + "epoch": 3.355792219082808, + "grad_norm": 0.25884032249450684, + "learning_rate": 3.336206896551724e-05, + "loss": 0.1121, + "step": 2895 + }, + { + "epoch": 3.356951387379555, + "grad_norm": 0.3100210428237915, + "learning_rate": 3.335632183908046e-05, + "loss": 0.1129, + "step": 2896 + }, + { + "epoch": 3.3581105556763022, + "grad_norm": 0.3463558256626129, + "learning_rate": 3.335057471264368e-05, + "loss": 0.1178, + "step": 2897 + }, + { + "epoch": 3.3592697239730493, + "grad_norm": 0.3152965009212494, + "learning_rate": 3.33448275862069e-05, + "loss": 0.1214, + "step": 2898 + }, + { + "epoch": 3.3604288922697965, + "grad_norm": 0.32749250531196594, + "learning_rate": 3.333908045977012e-05, + "loss": 0.1165, + "step": 2899 + }, + { + "epoch": 3.3615880605665436, + "grad_norm": 0.3287135660648346, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.124, + "step": 2900 + }, + { + "epoch": 3.3627472288632907, + "grad_norm": 0.2673903703689575, + "learning_rate": 3.3327586206896557e-05, + "loss": 0.1162, + "step": 2901 + }, + { + "epoch": 3.363906397160038, + "grad_norm": 0.29948368668556213, + "learning_rate": 3.332183908045977e-05, + "loss": 0.1116, + "step": 2902 + }, + { + "epoch": 3.365065565456785, + "grad_norm": 0.4442507028579712, + "learning_rate": 3.3316091954022986e-05, + "loss": 0.1093, + "step": 2903 + }, + { + "epoch": 3.3662247337535316, + "grad_norm": 0.2337656021118164, + "learning_rate": 3.331034482758621e-05, + "loss": 0.0952, + "step": 2904 + }, + { + "epoch": 3.367383902050279, + "grad_norm": 0.2944662272930145, + "learning_rate": 3.330459770114942e-05, + "loss": 0.1011, + "step": 2905 + }, + { + "epoch": 3.368543070347026, + "grad_norm": 0.24339404702186584, + "learning_rate": 3.3298850574712645e-05, + "loss": 0.098, + "step": 2906 + }, + { + "epoch": 3.369702238643773, + "grad_norm": 0.33986908197402954, + "learning_rate": 3.3293103448275866e-05, + "loss": 0.1111, + "step": 2907 + }, + { + "epoch": 3.37086140694052, + "grad_norm": 0.333913117647171, + "learning_rate": 3.328735632183908e-05, + "loss": 0.1163, + "step": 2908 + }, + { + "epoch": 3.372020575237267, + "grad_norm": 0.2759074568748474, + "learning_rate": 3.32816091954023e-05, + "loss": 0.1083, + "step": 2909 + }, + { + "epoch": 3.3731797435340143, + "grad_norm": 0.264443039894104, + "learning_rate": 3.327586206896552e-05, + "loss": 0.1069, + "step": 2910 + }, + { + "epoch": 3.3743389118307614, + "grad_norm": 0.33492133021354675, + "learning_rate": 3.327011494252873e-05, + "loss": 0.1081, + "step": 2911 + }, + { + "epoch": 3.3754980801275085, + "grad_norm": 0.37320229411125183, + "learning_rate": 3.3264367816091954e-05, + "loss": 0.1177, + "step": 2912 + }, + { + "epoch": 3.3766572484242556, + "grad_norm": 0.3110753297805786, + "learning_rate": 3.3258620689655176e-05, + "loss": 0.1062, + "step": 2913 + }, + { + "epoch": 3.3778164167210027, + "grad_norm": 0.35673680901527405, + "learning_rate": 3.325287356321839e-05, + "loss": 0.1095, + "step": 2914 + }, + { + "epoch": 3.37897558501775, + "grad_norm": 0.3656083643436432, + "learning_rate": 3.324712643678161e-05, + "loss": 0.1075, + "step": 2915 + }, + { + "epoch": 3.380134753314497, + "grad_norm": 0.29794302582740784, + "learning_rate": 3.324137931034483e-05, + "loss": 0.1062, + "step": 2916 + }, + { + "epoch": 3.381293921611244, + "grad_norm": 0.3165527284145355, + "learning_rate": 3.323563218390805e-05, + "loss": 0.1035, + "step": 2917 + }, + { + "epoch": 3.382453089907991, + "grad_norm": 0.44483765959739685, + "learning_rate": 3.3229885057471264e-05, + "loss": 0.1039, + "step": 2918 + }, + { + "epoch": 3.383612258204738, + "grad_norm": 0.3703659474849701, + "learning_rate": 3.3224137931034486e-05, + "loss": 0.1176, + "step": 2919 + }, + { + "epoch": 3.384771426501485, + "grad_norm": 0.2532442510128021, + "learning_rate": 3.321839080459771e-05, + "loss": 0.1028, + "step": 2920 + }, + { + "epoch": 3.385930594798232, + "grad_norm": 0.2925904393196106, + "learning_rate": 3.321264367816092e-05, + "loss": 0.1037, + "step": 2921 + }, + { + "epoch": 3.3870897630949792, + "grad_norm": 0.45509782433509827, + "learning_rate": 3.320689655172414e-05, + "loss": 0.1159, + "step": 2922 + }, + { + "epoch": 3.3882489313917263, + "grad_norm": 0.36248907446861267, + "learning_rate": 3.320114942528736e-05, + "loss": 0.114, + "step": 2923 + }, + { + "epoch": 3.3894080996884735, + "grad_norm": 0.3373396098613739, + "learning_rate": 3.3195402298850574e-05, + "loss": 0.1131, + "step": 2924 + }, + { + "epoch": 3.3905672679852206, + "grad_norm": 0.44164901971817017, + "learning_rate": 3.3189655172413796e-05, + "loss": 0.125, + "step": 2925 + }, + { + "epoch": 3.3917264362819677, + "grad_norm": 0.32218971848487854, + "learning_rate": 3.318390804597701e-05, + "loss": 0.1103, + "step": 2926 + }, + { + "epoch": 3.392885604578715, + "grad_norm": 0.25145384669303894, + "learning_rate": 3.317816091954023e-05, + "loss": 0.1172, + "step": 2927 + }, + { + "epoch": 3.394044772875462, + "grad_norm": 0.3067503571510315, + "learning_rate": 3.3172413793103454e-05, + "loss": 0.1042, + "step": 2928 + }, + { + "epoch": 3.395203941172209, + "grad_norm": 0.31007614731788635, + "learning_rate": 3.316666666666667e-05, + "loss": 0.1022, + "step": 2929 + }, + { + "epoch": 3.396363109468956, + "grad_norm": 0.398503839969635, + "learning_rate": 3.3160919540229884e-05, + "loss": 0.1158, + "step": 2930 + }, + { + "epoch": 3.3975222777657033, + "grad_norm": 0.32212406396865845, + "learning_rate": 3.3155172413793105e-05, + "loss": 0.1178, + "step": 2931 + }, + { + "epoch": 3.3986814460624504, + "grad_norm": 0.40230342745780945, + "learning_rate": 3.314942528735632e-05, + "loss": 0.1165, + "step": 2932 + }, + { + "epoch": 3.3998406143591975, + "grad_norm": 0.46653199195861816, + "learning_rate": 3.314367816091954e-05, + "loss": 0.1123, + "step": 2933 + }, + { + "epoch": 3.400999782655944, + "grad_norm": 0.217983677983284, + "learning_rate": 3.3137931034482764e-05, + "loss": 0.0943, + "step": 2934 + }, + { + "epoch": 3.4021589509526913, + "grad_norm": 0.3591748774051666, + "learning_rate": 3.313218390804598e-05, + "loss": 0.1121, + "step": 2935 + }, + { + "epoch": 3.4033181192494384, + "grad_norm": 0.28786206245422363, + "learning_rate": 3.31264367816092e-05, + "loss": 0.104, + "step": 2936 + }, + { + "epoch": 3.4044772875461855, + "grad_norm": 0.23771996796131134, + "learning_rate": 3.3120689655172415e-05, + "loss": 0.1036, + "step": 2937 + }, + { + "epoch": 3.4056364558429326, + "grad_norm": 0.2682191729545593, + "learning_rate": 3.311494252873563e-05, + "loss": 0.1108, + "step": 2938 + }, + { + "epoch": 3.4067956241396797, + "grad_norm": 0.2601390480995178, + "learning_rate": 3.310919540229885e-05, + "loss": 0.1215, + "step": 2939 + }, + { + "epoch": 3.407954792436427, + "grad_norm": 0.3103736639022827, + "learning_rate": 3.310344827586207e-05, + "loss": 0.1083, + "step": 2940 + }, + { + "epoch": 3.409113960733174, + "grad_norm": 0.2568131983280182, + "learning_rate": 3.309770114942529e-05, + "loss": 0.107, + "step": 2941 + }, + { + "epoch": 3.410273129029921, + "grad_norm": 0.29044461250305176, + "learning_rate": 3.309195402298851e-05, + "loss": 0.1074, + "step": 2942 + }, + { + "epoch": 3.411432297326668, + "grad_norm": 0.30530881881713867, + "learning_rate": 3.3086206896551725e-05, + "loss": 0.1095, + "step": 2943 + }, + { + "epoch": 3.4125914656234153, + "grad_norm": 0.30023837089538574, + "learning_rate": 3.3080459770114946e-05, + "loss": 0.1057, + "step": 2944 + }, + { + "epoch": 3.4137506339201624, + "grad_norm": 0.3304160237312317, + "learning_rate": 3.307471264367816e-05, + "loss": 0.1015, + "step": 2945 + }, + { + "epoch": 3.4149098022169095, + "grad_norm": 0.3712501525878906, + "learning_rate": 3.3068965517241376e-05, + "loss": 0.1215, + "step": 2946 + }, + { + "epoch": 3.416068970513656, + "grad_norm": 0.36417120695114136, + "learning_rate": 3.30632183908046e-05, + "loss": 0.1185, + "step": 2947 + }, + { + "epoch": 3.4172281388104038, + "grad_norm": 0.401915580034256, + "learning_rate": 3.305747126436782e-05, + "loss": 0.1118, + "step": 2948 + }, + { + "epoch": 3.4183873071071504, + "grad_norm": 0.27606403827667236, + "learning_rate": 3.3051724137931035e-05, + "loss": 0.1035, + "step": 2949 + }, + { + "epoch": 3.4195464754038976, + "grad_norm": 0.28905513882637024, + "learning_rate": 3.3045977011494256e-05, + "loss": 0.1059, + "step": 2950 + }, + { + "epoch": 3.4207056437006447, + "grad_norm": 0.30320367217063904, + "learning_rate": 3.304022988505747e-05, + "loss": 0.1162, + "step": 2951 + }, + { + "epoch": 3.421864811997392, + "grad_norm": 0.36751824617385864, + "learning_rate": 3.303448275862069e-05, + "loss": 0.1222, + "step": 2952 + }, + { + "epoch": 3.423023980294139, + "grad_norm": 0.5594837665557861, + "learning_rate": 3.302873563218391e-05, + "loss": 0.0988, + "step": 2953 + }, + { + "epoch": 3.424183148590886, + "grad_norm": 0.2814250588417053, + "learning_rate": 3.302298850574713e-05, + "loss": 0.1089, + "step": 2954 + }, + { + "epoch": 3.425342316887633, + "grad_norm": 0.30620330572128296, + "learning_rate": 3.301724137931035e-05, + "loss": 0.1238, + "step": 2955 + }, + { + "epoch": 3.4265014851843802, + "grad_norm": 0.3259787857532501, + "learning_rate": 3.3011494252873566e-05, + "loss": 0.1061, + "step": 2956 + }, + { + "epoch": 3.4276606534811274, + "grad_norm": 0.3204762041568756, + "learning_rate": 3.300574712643678e-05, + "loss": 0.1081, + "step": 2957 + }, + { + "epoch": 3.4288198217778745, + "grad_norm": 0.32642972469329834, + "learning_rate": 3.3e-05, + "loss": 0.1097, + "step": 2958 + }, + { + "epoch": 3.4299789900746216, + "grad_norm": 0.3611741065979004, + "learning_rate": 3.299425287356322e-05, + "loss": 0.1153, + "step": 2959 + }, + { + "epoch": 3.4311381583713687, + "grad_norm": 0.24560561776161194, + "learning_rate": 3.298850574712644e-05, + "loss": 0.1122, + "step": 2960 + }, + { + "epoch": 3.432297326668116, + "grad_norm": 0.2226334512233734, + "learning_rate": 3.298275862068966e-05, + "loss": 0.1046, + "step": 2961 + }, + { + "epoch": 3.4334564949648625, + "grad_norm": 0.3255598545074463, + "learning_rate": 3.2977011494252876e-05, + "loss": 0.1003, + "step": 2962 + }, + { + "epoch": 3.4346156632616096, + "grad_norm": 0.2790594696998596, + "learning_rate": 3.29712643678161e-05, + "loss": 0.1162, + "step": 2963 + }, + { + "epoch": 3.4357748315583567, + "grad_norm": 0.3118756413459778, + "learning_rate": 3.296551724137931e-05, + "loss": 0.1025, + "step": 2964 + }, + { + "epoch": 3.436933999855104, + "grad_norm": 0.3635786473751068, + "learning_rate": 3.295977011494253e-05, + "loss": 0.1311, + "step": 2965 + }, + { + "epoch": 3.438093168151851, + "grad_norm": 0.2933517098426819, + "learning_rate": 3.295402298850575e-05, + "loss": 0.1107, + "step": 2966 + }, + { + "epoch": 3.439252336448598, + "grad_norm": 0.37896665930747986, + "learning_rate": 3.2948275862068964e-05, + "loss": 0.1058, + "step": 2967 + }, + { + "epoch": 3.440411504745345, + "grad_norm": 0.23080338537693024, + "learning_rate": 3.2942528735632185e-05, + "loss": 0.0987, + "step": 2968 + }, + { + "epoch": 3.4415706730420923, + "grad_norm": 0.262114554643631, + "learning_rate": 3.293678160919541e-05, + "loss": 0.1152, + "step": 2969 + }, + { + "epoch": 3.4427298413388394, + "grad_norm": 0.30394595861434937, + "learning_rate": 3.293103448275862e-05, + "loss": 0.1076, + "step": 2970 + }, + { + "epoch": 3.4438890096355865, + "grad_norm": 0.30665647983551025, + "learning_rate": 3.2925287356321844e-05, + "loss": 0.1169, + "step": 2971 + }, + { + "epoch": 3.4450481779323336, + "grad_norm": 0.35701873898506165, + "learning_rate": 3.291954022988506e-05, + "loss": 0.1186, + "step": 2972 + }, + { + "epoch": 3.4462073462290808, + "grad_norm": 0.3494682013988495, + "learning_rate": 3.2913793103448273e-05, + "loss": 0.1188, + "step": 2973 + }, + { + "epoch": 3.447366514525828, + "grad_norm": 0.28734609484672546, + "learning_rate": 3.2908045977011495e-05, + "loss": 0.1164, + "step": 2974 + }, + { + "epoch": 3.448525682822575, + "grad_norm": 0.25234439969062805, + "learning_rate": 3.290229885057472e-05, + "loss": 0.107, + "step": 2975 + }, + { + "epoch": 3.449684851119322, + "grad_norm": 0.3193553388118744, + "learning_rate": 3.289655172413793e-05, + "loss": 0.1101, + "step": 2976 + }, + { + "epoch": 3.4508440194160688, + "grad_norm": 0.32543477416038513, + "learning_rate": 3.2890804597701153e-05, + "loss": 0.1204, + "step": 2977 + }, + { + "epoch": 3.452003187712816, + "grad_norm": 0.34769558906555176, + "learning_rate": 3.288505747126437e-05, + "loss": 0.1161, + "step": 2978 + }, + { + "epoch": 3.453162356009563, + "grad_norm": 0.315849244594574, + "learning_rate": 3.287931034482758e-05, + "loss": 0.1074, + "step": 2979 + }, + { + "epoch": 3.45432152430631, + "grad_norm": 0.3821331262588501, + "learning_rate": 3.2873563218390805e-05, + "loss": 0.1193, + "step": 2980 + }, + { + "epoch": 3.4554806926030572, + "grad_norm": 0.2808358669281006, + "learning_rate": 3.2867816091954027e-05, + "loss": 0.1031, + "step": 2981 + }, + { + "epoch": 3.4566398608998044, + "grad_norm": 0.5840047001838684, + "learning_rate": 3.286206896551725e-05, + "loss": 0.1185, + "step": 2982 + }, + { + "epoch": 3.4577990291965515, + "grad_norm": 0.3200035095214844, + "learning_rate": 3.285632183908046e-05, + "loss": 0.1226, + "step": 2983 + }, + { + "epoch": 3.4589581974932986, + "grad_norm": 0.3244805335998535, + "learning_rate": 3.285057471264368e-05, + "loss": 0.1222, + "step": 2984 + }, + { + "epoch": 3.4601173657900457, + "grad_norm": 0.3481799364089966, + "learning_rate": 3.28448275862069e-05, + "loss": 0.1145, + "step": 2985 + }, + { + "epoch": 3.461276534086793, + "grad_norm": 0.40199366211891174, + "learning_rate": 3.2839080459770115e-05, + "loss": 0.1198, + "step": 2986 + }, + { + "epoch": 3.46243570238354, + "grad_norm": 0.297984778881073, + "learning_rate": 3.283333333333333e-05, + "loss": 0.1072, + "step": 2987 + }, + { + "epoch": 3.463594870680287, + "grad_norm": 0.272242933511734, + "learning_rate": 3.282758620689655e-05, + "loss": 0.1262, + "step": 2988 + }, + { + "epoch": 3.464754038977034, + "grad_norm": 0.5346304178237915, + "learning_rate": 3.282183908045977e-05, + "loss": 0.1111, + "step": 2989 + }, + { + "epoch": 3.465913207273781, + "grad_norm": 0.25313276052474976, + "learning_rate": 3.2816091954022995e-05, + "loss": 0.0966, + "step": 2990 + }, + { + "epoch": 3.4670723755705284, + "grad_norm": 0.2953258156776428, + "learning_rate": 3.281034482758621e-05, + "loss": 0.11, + "step": 2991 + }, + { + "epoch": 3.468231543867275, + "grad_norm": 0.2898904085159302, + "learning_rate": 3.2804597701149424e-05, + "loss": 0.1163, + "step": 2992 + }, + { + "epoch": 3.469390712164022, + "grad_norm": 0.2814153730869293, + "learning_rate": 3.2798850574712646e-05, + "loss": 0.1005, + "step": 2993 + }, + { + "epoch": 3.4705498804607693, + "grad_norm": 0.25759944319725037, + "learning_rate": 3.279310344827586e-05, + "loss": 0.1051, + "step": 2994 + }, + { + "epoch": 3.4717090487575164, + "grad_norm": 0.2664312720298767, + "learning_rate": 3.278735632183908e-05, + "loss": 0.1084, + "step": 2995 + }, + { + "epoch": 3.4728682170542635, + "grad_norm": 0.26857104897499084, + "learning_rate": 3.2781609195402304e-05, + "loss": 0.1059, + "step": 2996 + }, + { + "epoch": 3.4740273853510106, + "grad_norm": 0.6132874488830566, + "learning_rate": 3.277586206896552e-05, + "loss": 0.1036, + "step": 2997 + }, + { + "epoch": 3.4751865536477577, + "grad_norm": 0.2552327513694763, + "learning_rate": 3.2770114942528734e-05, + "loss": 0.0961, + "step": 2998 + }, + { + "epoch": 3.476345721944505, + "grad_norm": 0.3144066035747528, + "learning_rate": 3.2764367816091956e-05, + "loss": 0.1203, + "step": 2999 + }, + { + "epoch": 3.477504890241252, + "grad_norm": 0.2916877567768097, + "learning_rate": 3.275862068965517e-05, + "loss": 0.1169, + "step": 3000 + }, + { + "epoch": 3.478664058537999, + "grad_norm": 0.2755250036716461, + "learning_rate": 3.275287356321839e-05, + "loss": 0.1092, + "step": 3001 + }, + { + "epoch": 3.479823226834746, + "grad_norm": 0.30885928869247437, + "learning_rate": 3.2747126436781614e-05, + "loss": 0.1072, + "step": 3002 + }, + { + "epoch": 3.4809823951314933, + "grad_norm": 0.26637914776802063, + "learning_rate": 3.274137931034483e-05, + "loss": 0.1115, + "step": 3003 + }, + { + "epoch": 3.4821415634282404, + "grad_norm": 0.25272154808044434, + "learning_rate": 3.273563218390805e-05, + "loss": 0.1062, + "step": 3004 + }, + { + "epoch": 3.483300731724987, + "grad_norm": 0.24959425628185272, + "learning_rate": 3.2729885057471266e-05, + "loss": 0.1034, + "step": 3005 + }, + { + "epoch": 3.4844599000217342, + "grad_norm": 0.29696011543273926, + "learning_rate": 3.272413793103448e-05, + "loss": 0.129, + "step": 3006 + }, + { + "epoch": 3.4856190683184813, + "grad_norm": 0.24814733862876892, + "learning_rate": 3.27183908045977e-05, + "loss": 0.1057, + "step": 3007 + }, + { + "epoch": 3.4867782366152285, + "grad_norm": 0.2551887035369873, + "learning_rate": 3.271264367816092e-05, + "loss": 0.1072, + "step": 3008 + }, + { + "epoch": 3.4879374049119756, + "grad_norm": 0.2791741192340851, + "learning_rate": 3.2706896551724145e-05, + "loss": 0.1059, + "step": 3009 + }, + { + "epoch": 3.4890965732087227, + "grad_norm": 0.3654879033565521, + "learning_rate": 3.270114942528736e-05, + "loss": 0.1077, + "step": 3010 + }, + { + "epoch": 3.49025574150547, + "grad_norm": 0.21658289432525635, + "learning_rate": 3.2695402298850575e-05, + "loss": 0.0997, + "step": 3011 + }, + { + "epoch": 3.491414909802217, + "grad_norm": 0.2808610796928406, + "learning_rate": 3.26896551724138e-05, + "loss": 0.1127, + "step": 3012 + }, + { + "epoch": 3.492574078098964, + "grad_norm": 0.327570378780365, + "learning_rate": 3.268390804597701e-05, + "loss": 0.1062, + "step": 3013 + }, + { + "epoch": 3.493733246395711, + "grad_norm": 0.24436767399311066, + "learning_rate": 3.267816091954023e-05, + "loss": 0.1024, + "step": 3014 + }, + { + "epoch": 3.4948924146924583, + "grad_norm": 0.33249321579933167, + "learning_rate": 3.267241379310345e-05, + "loss": 0.1109, + "step": 3015 + }, + { + "epoch": 3.4960515829892054, + "grad_norm": 0.31921130418777466, + "learning_rate": 3.266666666666667e-05, + "loss": 0.1264, + "step": 3016 + }, + { + "epoch": 3.4972107512859525, + "grad_norm": 0.2958783805370331, + "learning_rate": 3.2660919540229885e-05, + "loss": 0.1125, + "step": 3017 + }, + { + "epoch": 3.4983699195826996, + "grad_norm": 0.2842378318309784, + "learning_rate": 3.265517241379311e-05, + "loss": 0.1187, + "step": 3018 + }, + { + "epoch": 3.4995290878794467, + "grad_norm": 0.37194153666496277, + "learning_rate": 3.264942528735632e-05, + "loss": 0.1131, + "step": 3019 + }, + { + "epoch": 3.5006882561761934, + "grad_norm": 0.342540979385376, + "learning_rate": 3.264367816091954e-05, + "loss": 0.1059, + "step": 3020 + }, + { + "epoch": 3.5018474244729405, + "grad_norm": 0.517421305179596, + "learning_rate": 3.263793103448276e-05, + "loss": 0.1226, + "step": 3021 + }, + { + "epoch": 3.5030065927696876, + "grad_norm": 0.2997574508190155, + "learning_rate": 3.263218390804598e-05, + "loss": 0.0962, + "step": 3022 + }, + { + "epoch": 3.5041657610664347, + "grad_norm": 0.3273774981498718, + "learning_rate": 3.26264367816092e-05, + "loss": 0.1205, + "step": 3023 + }, + { + "epoch": 3.505324929363182, + "grad_norm": 0.25956934690475464, + "learning_rate": 3.2620689655172416e-05, + "loss": 0.1186, + "step": 3024 + }, + { + "epoch": 3.506484097659929, + "grad_norm": 0.306949645280838, + "learning_rate": 3.261494252873563e-05, + "loss": 0.1115, + "step": 3025 + }, + { + "epoch": 3.507643265956676, + "grad_norm": 0.3232230842113495, + "learning_rate": 3.260919540229885e-05, + "loss": 0.1155, + "step": 3026 + }, + { + "epoch": 3.508802434253423, + "grad_norm": 0.2607809007167816, + "learning_rate": 3.260344827586207e-05, + "loss": 0.1142, + "step": 3027 + }, + { + "epoch": 3.5099616025501703, + "grad_norm": 0.31110328435897827, + "learning_rate": 3.259770114942529e-05, + "loss": 0.1102, + "step": 3028 + }, + { + "epoch": 3.5111207708469174, + "grad_norm": 0.24277092516422272, + "learning_rate": 3.2591954022988505e-05, + "loss": 0.1149, + "step": 3029 + }, + { + "epoch": 3.5122799391436645, + "grad_norm": 0.29219481348991394, + "learning_rate": 3.2586206896551726e-05, + "loss": 0.1141, + "step": 3030 + }, + { + "epoch": 3.5134391074404117, + "grad_norm": 0.24297522008419037, + "learning_rate": 3.258045977011495e-05, + "loss": 0.1015, + "step": 3031 + }, + { + "epoch": 3.5145982757371588, + "grad_norm": 0.2821162939071655, + "learning_rate": 3.257471264367816e-05, + "loss": 0.1153, + "step": 3032 + }, + { + "epoch": 3.5157574440339054, + "grad_norm": 0.5800468325614929, + "learning_rate": 3.256896551724138e-05, + "loss": 0.1193, + "step": 3033 + }, + { + "epoch": 3.516916612330653, + "grad_norm": 0.3560374677181244, + "learning_rate": 3.25632183908046e-05, + "loss": 0.1076, + "step": 3034 + }, + { + "epoch": 3.5180757806273997, + "grad_norm": 0.4053660035133362, + "learning_rate": 3.2557471264367814e-05, + "loss": 0.1098, + "step": 3035 + }, + { + "epoch": 3.519234948924147, + "grad_norm": 0.36452099680900574, + "learning_rate": 3.2551724137931036e-05, + "loss": 0.1074, + "step": 3036 + }, + { + "epoch": 3.520394117220894, + "grad_norm": 0.2924714982509613, + "learning_rate": 3.254597701149426e-05, + "loss": 0.1131, + "step": 3037 + }, + { + "epoch": 3.521553285517641, + "grad_norm": 0.30950403213500977, + "learning_rate": 3.254022988505747e-05, + "loss": 0.1121, + "step": 3038 + }, + { + "epoch": 3.522712453814388, + "grad_norm": 0.3289961814880371, + "learning_rate": 3.2534482758620694e-05, + "loss": 0.1153, + "step": 3039 + }, + { + "epoch": 3.5238716221111352, + "grad_norm": 0.2730542719364166, + "learning_rate": 3.252873563218391e-05, + "loss": 0.1181, + "step": 3040 + }, + { + "epoch": 3.5250307904078824, + "grad_norm": 0.4356801211833954, + "learning_rate": 3.2522988505747124e-05, + "loss": 0.1049, + "step": 3041 + }, + { + "epoch": 3.5261899587046295, + "grad_norm": 0.2651293873786926, + "learning_rate": 3.2517241379310346e-05, + "loss": 0.1194, + "step": 3042 + }, + { + "epoch": 3.5273491270013766, + "grad_norm": 0.3155745267868042, + "learning_rate": 3.251149425287357e-05, + "loss": 0.1106, + "step": 3043 + }, + { + "epoch": 3.5285082952981237, + "grad_norm": 0.2664744257926941, + "learning_rate": 3.250574712643678e-05, + "loss": 0.1166, + "step": 3044 + }, + { + "epoch": 3.529667463594871, + "grad_norm": 0.3852986693382263, + "learning_rate": 3.2500000000000004e-05, + "loss": 0.1129, + "step": 3045 + }, + { + "epoch": 3.5308266318916175, + "grad_norm": 0.31656765937805176, + "learning_rate": 3.249425287356322e-05, + "loss": 0.1202, + "step": 3046 + }, + { + "epoch": 3.531985800188365, + "grad_norm": 0.2873196303844452, + "learning_rate": 3.248850574712644e-05, + "loss": 0.1228, + "step": 3047 + }, + { + "epoch": 3.5331449684851117, + "grad_norm": 0.29903656244277954, + "learning_rate": 3.2482758620689655e-05, + "loss": 0.1052, + "step": 3048 + }, + { + "epoch": 3.5343041367818593, + "grad_norm": 0.2554726302623749, + "learning_rate": 3.247701149425287e-05, + "loss": 0.1036, + "step": 3049 + }, + { + "epoch": 3.535463305078606, + "grad_norm": 0.33585837483406067, + "learning_rate": 3.24712643678161e-05, + "loss": 0.117, + "step": 3050 + }, + { + "epoch": 3.536622473375353, + "grad_norm": 0.28238433599472046, + "learning_rate": 3.2465517241379314e-05, + "loss": 0.1106, + "step": 3051 + }, + { + "epoch": 3.5377816416721, + "grad_norm": 0.33771008253097534, + "learning_rate": 3.245977011494253e-05, + "loss": 0.1117, + "step": 3052 + }, + { + "epoch": 3.5389408099688473, + "grad_norm": 0.34234920144081116, + "learning_rate": 3.245402298850575e-05, + "loss": 0.1119, + "step": 3053 + }, + { + "epoch": 3.5400999782655944, + "grad_norm": 0.29577329754829407, + "learning_rate": 3.2448275862068965e-05, + "loss": 0.1119, + "step": 3054 + }, + { + "epoch": 3.5412591465623415, + "grad_norm": 0.5385765433311462, + "learning_rate": 3.244252873563218e-05, + "loss": 0.1013, + "step": 3055 + }, + { + "epoch": 3.5424183148590886, + "grad_norm": 0.31815600395202637, + "learning_rate": 3.24367816091954e-05, + "loss": 0.1136, + "step": 3056 + }, + { + "epoch": 3.5435774831558358, + "grad_norm": 0.3204317092895508, + "learning_rate": 3.2431034482758623e-05, + "loss": 0.1209, + "step": 3057 + }, + { + "epoch": 3.544736651452583, + "grad_norm": 0.34729644656181335, + "learning_rate": 3.2425287356321845e-05, + "loss": 0.1186, + "step": 3058 + }, + { + "epoch": 3.54589581974933, + "grad_norm": 0.35338544845581055, + "learning_rate": 3.241954022988506e-05, + "loss": 0.1165, + "step": 3059 + }, + { + "epoch": 3.547054988046077, + "grad_norm": 0.36400482058525085, + "learning_rate": 3.2413793103448275e-05, + "loss": 0.1285, + "step": 3060 + }, + { + "epoch": 3.5482141563428238, + "grad_norm": 0.2854449450969696, + "learning_rate": 3.2408045977011497e-05, + "loss": 0.1064, + "step": 3061 + }, + { + "epoch": 3.5493733246395713, + "grad_norm": 0.2662312090396881, + "learning_rate": 3.240229885057471e-05, + "loss": 0.105, + "step": 3062 + }, + { + "epoch": 3.550532492936318, + "grad_norm": 0.30098652839660645, + "learning_rate": 3.239655172413793e-05, + "loss": 0.1025, + "step": 3063 + }, + { + "epoch": 3.551691661233065, + "grad_norm": 0.35262545943260193, + "learning_rate": 3.2390804597701155e-05, + "loss": 0.1148, + "step": 3064 + }, + { + "epoch": 3.5528508295298122, + "grad_norm": 0.2994409501552582, + "learning_rate": 3.238505747126437e-05, + "loss": 0.1072, + "step": 3065 + }, + { + "epoch": 3.5540099978265594, + "grad_norm": 0.3354470133781433, + "learning_rate": 3.237931034482759e-05, + "loss": 0.1112, + "step": 3066 + }, + { + "epoch": 3.5551691661233065, + "grad_norm": 0.30493679642677307, + "learning_rate": 3.2373563218390806e-05, + "loss": 0.1071, + "step": 3067 + }, + { + "epoch": 3.5563283344200536, + "grad_norm": 0.24388879537582397, + "learning_rate": 3.236781609195402e-05, + "loss": 0.1135, + "step": 3068 + }, + { + "epoch": 3.5574875027168007, + "grad_norm": 0.37016189098358154, + "learning_rate": 3.236206896551724e-05, + "loss": 0.1184, + "step": 3069 + }, + { + "epoch": 3.558646671013548, + "grad_norm": 0.3760676681995392, + "learning_rate": 3.2356321839080465e-05, + "loss": 0.1099, + "step": 3070 + }, + { + "epoch": 3.559805839310295, + "grad_norm": 0.25357261300086975, + "learning_rate": 3.235057471264368e-05, + "loss": 0.1016, + "step": 3071 + }, + { + "epoch": 3.560965007607042, + "grad_norm": 0.3364717960357666, + "learning_rate": 3.23448275862069e-05, + "loss": 0.1095, + "step": 3072 + }, + { + "epoch": 3.562124175903789, + "grad_norm": 0.316201776266098, + "learning_rate": 3.2339080459770116e-05, + "loss": 0.1217, + "step": 3073 + }, + { + "epoch": 3.5632833442005363, + "grad_norm": 0.2470405250787735, + "learning_rate": 3.233333333333333e-05, + "loss": 0.1078, + "step": 3074 + }, + { + "epoch": 3.5644425124972834, + "grad_norm": 0.25586774945259094, + "learning_rate": 3.232758620689655e-05, + "loss": 0.1138, + "step": 3075 + }, + { + "epoch": 3.56560168079403, + "grad_norm": 0.37907102704048157, + "learning_rate": 3.232183908045977e-05, + "loss": 0.1062, + "step": 3076 + }, + { + "epoch": 3.5667608490907776, + "grad_norm": 0.36059078574180603, + "learning_rate": 3.231609195402299e-05, + "loss": 0.1178, + "step": 3077 + }, + { + "epoch": 3.5679200173875243, + "grad_norm": 0.24348652362823486, + "learning_rate": 3.231034482758621e-05, + "loss": 0.1101, + "step": 3078 + }, + { + "epoch": 3.5690791856842714, + "grad_norm": 0.31023815274238586, + "learning_rate": 3.2304597701149426e-05, + "loss": 0.1072, + "step": 3079 + }, + { + "epoch": 3.5702383539810185, + "grad_norm": 0.2670823931694031, + "learning_rate": 3.229885057471265e-05, + "loss": 0.1137, + "step": 3080 + }, + { + "epoch": 3.5713975222777656, + "grad_norm": 0.4615989029407501, + "learning_rate": 3.229310344827586e-05, + "loss": 0.1186, + "step": 3081 + }, + { + "epoch": 3.5725566905745127, + "grad_norm": 0.2701643407344818, + "learning_rate": 3.228735632183908e-05, + "loss": 0.1039, + "step": 3082 + }, + { + "epoch": 3.57371585887126, + "grad_norm": 0.28233602643013, + "learning_rate": 3.22816091954023e-05, + "loss": 0.1227, + "step": 3083 + }, + { + "epoch": 3.574875027168007, + "grad_norm": 0.3075142204761505, + "learning_rate": 3.227586206896552e-05, + "loss": 0.115, + "step": 3084 + }, + { + "epoch": 3.576034195464754, + "grad_norm": 0.31267890334129333, + "learning_rate": 3.227011494252874e-05, + "loss": 0.1172, + "step": 3085 + }, + { + "epoch": 3.577193363761501, + "grad_norm": 0.27376192808151245, + "learning_rate": 3.226436781609196e-05, + "loss": 0.1111, + "step": 3086 + }, + { + "epoch": 3.5783525320582483, + "grad_norm": 0.2703624963760376, + "learning_rate": 3.225862068965517e-05, + "loss": 0.1122, + "step": 3087 + }, + { + "epoch": 3.5795117003549954, + "grad_norm": 0.32042473554611206, + "learning_rate": 3.2252873563218394e-05, + "loss": 0.1001, + "step": 3088 + }, + { + "epoch": 3.580670868651742, + "grad_norm": 0.28498318791389465, + "learning_rate": 3.224712643678161e-05, + "loss": 0.1022, + "step": 3089 + }, + { + "epoch": 3.5818300369484897, + "grad_norm": 0.2931499183177948, + "learning_rate": 3.2241379310344824e-05, + "loss": 0.1109, + "step": 3090 + }, + { + "epoch": 3.5829892052452363, + "grad_norm": 0.34564685821533203, + "learning_rate": 3.223563218390805e-05, + "loss": 0.1222, + "step": 3091 + }, + { + "epoch": 3.584148373541984, + "grad_norm": 0.40452006459236145, + "learning_rate": 3.222988505747127e-05, + "loss": 0.1101, + "step": 3092 + }, + { + "epoch": 3.5853075418387306, + "grad_norm": 0.40536507964134216, + "learning_rate": 3.222413793103448e-05, + "loss": 0.1166, + "step": 3093 + }, + { + "epoch": 3.5864667101354777, + "grad_norm": 0.30855557322502136, + "learning_rate": 3.2218390804597704e-05, + "loss": 0.1273, + "step": 3094 + }, + { + "epoch": 3.587625878432225, + "grad_norm": 0.2754945755004883, + "learning_rate": 3.221264367816092e-05, + "loss": 0.1053, + "step": 3095 + }, + { + "epoch": 3.588785046728972, + "grad_norm": 0.3107283413410187, + "learning_rate": 3.220689655172414e-05, + "loss": 0.1072, + "step": 3096 + }, + { + "epoch": 3.589944215025719, + "grad_norm": 0.30340340733528137, + "learning_rate": 3.2201149425287355e-05, + "loss": 0.1079, + "step": 3097 + }, + { + "epoch": 3.591103383322466, + "grad_norm": 0.32089850306510925, + "learning_rate": 3.219540229885058e-05, + "loss": 0.1076, + "step": 3098 + }, + { + "epoch": 3.5922625516192133, + "grad_norm": 0.3080294728279114, + "learning_rate": 3.21896551724138e-05, + "loss": 0.1093, + "step": 3099 + }, + { + "epoch": 3.5934217199159604, + "grad_norm": 0.28277286887168884, + "learning_rate": 3.218390804597701e-05, + "loss": 0.1045, + "step": 3100 + }, + { + "epoch": 3.5945808882127075, + "grad_norm": 0.30949509143829346, + "learning_rate": 3.217816091954023e-05, + "loss": 0.1089, + "step": 3101 + }, + { + "epoch": 3.5957400565094546, + "grad_norm": 0.24771907925605774, + "learning_rate": 3.217241379310345e-05, + "loss": 0.1061, + "step": 3102 + }, + { + "epoch": 3.5968992248062017, + "grad_norm": 0.37252581119537354, + "learning_rate": 3.2166666666666665e-05, + "loss": 0.12, + "step": 3103 + }, + { + "epoch": 3.5980583931029484, + "grad_norm": 0.3569035232067108, + "learning_rate": 3.2160919540229886e-05, + "loss": 0.1188, + "step": 3104 + }, + { + "epoch": 3.599217561399696, + "grad_norm": 0.46223220229148865, + "learning_rate": 3.215517241379311e-05, + "loss": 0.1251, + "step": 3105 + }, + { + "epoch": 3.6003767296964426, + "grad_norm": 0.42109113931655884, + "learning_rate": 3.214942528735632e-05, + "loss": 0.1117, + "step": 3106 + }, + { + "epoch": 3.6015358979931897, + "grad_norm": 0.3445476293563843, + "learning_rate": 3.2143678160919545e-05, + "loss": 0.1015, + "step": 3107 + }, + { + "epoch": 3.602695066289937, + "grad_norm": 0.3141631484031677, + "learning_rate": 3.213793103448276e-05, + "loss": 0.1086, + "step": 3108 + }, + { + "epoch": 3.603854234586684, + "grad_norm": 0.4695309102535248, + "learning_rate": 3.2132183908045974e-05, + "loss": 0.1293, + "step": 3109 + }, + { + "epoch": 3.605013402883431, + "grad_norm": 0.2591305077075958, + "learning_rate": 3.2126436781609196e-05, + "loss": 0.1013, + "step": 3110 + }, + { + "epoch": 3.606172571180178, + "grad_norm": 0.26712268590927124, + "learning_rate": 3.212068965517242e-05, + "loss": 0.1071, + "step": 3111 + }, + { + "epoch": 3.6073317394769253, + "grad_norm": 0.30240774154663086, + "learning_rate": 3.211494252873564e-05, + "loss": 0.1096, + "step": 3112 + }, + { + "epoch": 3.6084909077736724, + "grad_norm": 0.3110927641391754, + "learning_rate": 3.2109195402298854e-05, + "loss": 0.1147, + "step": 3113 + }, + { + "epoch": 3.6096500760704195, + "grad_norm": 0.3136695325374603, + "learning_rate": 3.210344827586207e-05, + "loss": 0.1206, + "step": 3114 + }, + { + "epoch": 3.6108092443671667, + "grad_norm": 0.23492947220802307, + "learning_rate": 3.209770114942529e-05, + "loss": 0.1091, + "step": 3115 + }, + { + "epoch": 3.6119684126639138, + "grad_norm": 0.29082900285720825, + "learning_rate": 3.2091954022988506e-05, + "loss": 0.1052, + "step": 3116 + }, + { + "epoch": 3.613127580960661, + "grad_norm": 0.2898150086402893, + "learning_rate": 3.208620689655172e-05, + "loss": 0.1239, + "step": 3117 + }, + { + "epoch": 3.614286749257408, + "grad_norm": 0.25189781188964844, + "learning_rate": 3.208045977011494e-05, + "loss": 0.1027, + "step": 3118 + }, + { + "epoch": 3.6154459175541547, + "grad_norm": 0.2974126636981964, + "learning_rate": 3.2074712643678164e-05, + "loss": 0.0982, + "step": 3119 + }, + { + "epoch": 3.6166050858509022, + "grad_norm": 0.2558945119380951, + "learning_rate": 3.206896551724138e-05, + "loss": 0.1072, + "step": 3120 + }, + { + "epoch": 3.617764254147649, + "grad_norm": 0.2936224937438965, + "learning_rate": 3.20632183908046e-05, + "loss": 0.1153, + "step": 3121 + }, + { + "epoch": 3.618923422444396, + "grad_norm": 0.24101215600967407, + "learning_rate": 3.2057471264367816e-05, + "loss": 0.1005, + "step": 3122 + }, + { + "epoch": 3.620082590741143, + "grad_norm": 0.3117283582687378, + "learning_rate": 3.205172413793104e-05, + "loss": 0.1159, + "step": 3123 + }, + { + "epoch": 3.6212417590378903, + "grad_norm": 0.3123790919780731, + "learning_rate": 3.204597701149425e-05, + "loss": 0.1102, + "step": 3124 + }, + { + "epoch": 3.6224009273346374, + "grad_norm": 0.32129356265068054, + "learning_rate": 3.2040229885057474e-05, + "loss": 0.1247, + "step": 3125 + }, + { + "epoch": 3.6235600956313845, + "grad_norm": 0.287790983915329, + "learning_rate": 3.2034482758620696e-05, + "loss": 0.1225, + "step": 3126 + }, + { + "epoch": 3.6247192639281316, + "grad_norm": 0.33491361141204834, + "learning_rate": 3.202873563218391e-05, + "loss": 0.1189, + "step": 3127 + }, + { + "epoch": 3.6258784322248787, + "grad_norm": 0.3275797963142395, + "learning_rate": 3.2022988505747125e-05, + "loss": 0.1246, + "step": 3128 + }, + { + "epoch": 3.627037600521626, + "grad_norm": 0.30787503719329834, + "learning_rate": 3.201724137931035e-05, + "loss": 0.1204, + "step": 3129 + }, + { + "epoch": 3.628196768818373, + "grad_norm": 0.254406601190567, + "learning_rate": 3.201149425287356e-05, + "loss": 0.1033, + "step": 3130 + }, + { + "epoch": 3.62935593711512, + "grad_norm": 0.2660934329032898, + "learning_rate": 3.2005747126436784e-05, + "loss": 0.1096, + "step": 3131 + }, + { + "epoch": 3.6305151054118667, + "grad_norm": 0.30187851190567017, + "learning_rate": 3.2000000000000005e-05, + "loss": 0.1064, + "step": 3132 + }, + { + "epoch": 3.6316742737086143, + "grad_norm": 0.2814703583717346, + "learning_rate": 3.199425287356322e-05, + "loss": 0.1209, + "step": 3133 + }, + { + "epoch": 3.632833442005361, + "grad_norm": 0.31418412923812866, + "learning_rate": 3.198850574712644e-05, + "loss": 0.1147, + "step": 3134 + }, + { + "epoch": 3.633992610302108, + "grad_norm": 0.25230860710144043, + "learning_rate": 3.198275862068966e-05, + "loss": 0.1094, + "step": 3135 + }, + { + "epoch": 3.635151778598855, + "grad_norm": 0.2896537482738495, + "learning_rate": 3.197701149425287e-05, + "loss": 0.1166, + "step": 3136 + }, + { + "epoch": 3.6363109468956023, + "grad_norm": 0.2596437931060791, + "learning_rate": 3.1971264367816093e-05, + "loss": 0.1141, + "step": 3137 + }, + { + "epoch": 3.6374701151923494, + "grad_norm": 0.29989323019981384, + "learning_rate": 3.196551724137931e-05, + "loss": 0.1154, + "step": 3138 + }, + { + "epoch": 3.6386292834890965, + "grad_norm": 0.303406685590744, + "learning_rate": 3.195977011494253e-05, + "loss": 0.1147, + "step": 3139 + }, + { + "epoch": 3.6397884517858436, + "grad_norm": 0.2839892506599426, + "learning_rate": 3.195402298850575e-05, + "loss": 0.1164, + "step": 3140 + }, + { + "epoch": 3.6409476200825908, + "grad_norm": 0.29430249333381653, + "learning_rate": 3.1948275862068967e-05, + "loss": 0.1083, + "step": 3141 + }, + { + "epoch": 3.642106788379338, + "grad_norm": 0.24778763949871063, + "learning_rate": 3.194252873563219e-05, + "loss": 0.1037, + "step": 3142 + }, + { + "epoch": 3.643265956676085, + "grad_norm": 0.3377165198326111, + "learning_rate": 3.19367816091954e-05, + "loss": 0.1178, + "step": 3143 + }, + { + "epoch": 3.644425124972832, + "grad_norm": 0.38875332474708557, + "learning_rate": 3.193103448275862e-05, + "loss": 0.1159, + "step": 3144 + }, + { + "epoch": 3.6455842932695792, + "grad_norm": 0.3728177547454834, + "learning_rate": 3.192528735632184e-05, + "loss": 0.1088, + "step": 3145 + }, + { + "epoch": 3.6467434615663263, + "grad_norm": 0.31668561697006226, + "learning_rate": 3.191954022988506e-05, + "loss": 0.1055, + "step": 3146 + }, + { + "epoch": 3.647902629863073, + "grad_norm": 0.3226422965526581, + "learning_rate": 3.1913793103448276e-05, + "loss": 0.1252, + "step": 3147 + }, + { + "epoch": 3.6490617981598206, + "grad_norm": 0.31225472688674927, + "learning_rate": 3.19080459770115e-05, + "loss": 0.1131, + "step": 3148 + }, + { + "epoch": 3.6502209664565672, + "grad_norm": 0.3561670482158661, + "learning_rate": 3.190229885057471e-05, + "loss": 0.1341, + "step": 3149 + }, + { + "epoch": 3.6513801347533144, + "grad_norm": 0.29987701773643494, + "learning_rate": 3.1896551724137935e-05, + "loss": 0.1263, + "step": 3150 + }, + { + "epoch": 3.6525393030500615, + "grad_norm": 0.2811523675918579, + "learning_rate": 3.189080459770115e-05, + "loss": 0.1225, + "step": 3151 + }, + { + "epoch": 3.6536984713468086, + "grad_norm": 0.31684601306915283, + "learning_rate": 3.188505747126437e-05, + "loss": 0.1086, + "step": 3152 + }, + { + "epoch": 3.6548576396435557, + "grad_norm": 0.28780969977378845, + "learning_rate": 3.187931034482759e-05, + "loss": 0.1085, + "step": 3153 + }, + { + "epoch": 3.656016807940303, + "grad_norm": 0.31450673937797546, + "learning_rate": 3.187356321839081e-05, + "loss": 0.109, + "step": 3154 + }, + { + "epoch": 3.65717597623705, + "grad_norm": 0.2759770452976227, + "learning_rate": 3.186781609195402e-05, + "loss": 0.1049, + "step": 3155 + }, + { + "epoch": 3.658335144533797, + "grad_norm": 0.3448818624019623, + "learning_rate": 3.1862068965517244e-05, + "loss": 0.1283, + "step": 3156 + }, + { + "epoch": 3.659494312830544, + "grad_norm": 0.2501785457134247, + "learning_rate": 3.185632183908046e-05, + "loss": 0.1056, + "step": 3157 + }, + { + "epoch": 3.6606534811272913, + "grad_norm": 0.288875550031662, + "learning_rate": 3.1850574712643674e-05, + "loss": 0.1018, + "step": 3158 + }, + { + "epoch": 3.6618126494240384, + "grad_norm": 0.29204368591308594, + "learning_rate": 3.1844827586206896e-05, + "loss": 0.1082, + "step": 3159 + }, + { + "epoch": 3.662971817720785, + "grad_norm": 0.3856884837150574, + "learning_rate": 3.183908045977012e-05, + "loss": 0.0979, + "step": 3160 + }, + { + "epoch": 3.6641309860175326, + "grad_norm": 0.30575424432754517, + "learning_rate": 3.183333333333334e-05, + "loss": 0.1151, + "step": 3161 + }, + { + "epoch": 3.6652901543142793, + "grad_norm": 0.3039374053478241, + "learning_rate": 3.1827586206896554e-05, + "loss": 0.1203, + "step": 3162 + }, + { + "epoch": 3.666449322611027, + "grad_norm": 0.2760923206806183, + "learning_rate": 3.182183908045977e-05, + "loss": 0.1114, + "step": 3163 + }, + { + "epoch": 3.6676084909077735, + "grad_norm": 0.3372345566749573, + "learning_rate": 3.181609195402299e-05, + "loss": 0.1204, + "step": 3164 + }, + { + "epoch": 3.6687676592045206, + "grad_norm": 0.34602412581443787, + "learning_rate": 3.1810344827586206e-05, + "loss": 0.1149, + "step": 3165 + }, + { + "epoch": 3.6699268275012678, + "grad_norm": 0.3075660467147827, + "learning_rate": 3.180459770114943e-05, + "loss": 0.109, + "step": 3166 + }, + { + "epoch": 3.671085995798015, + "grad_norm": 0.4555627405643463, + "learning_rate": 3.179885057471265e-05, + "loss": 0.1119, + "step": 3167 + }, + { + "epoch": 3.672245164094762, + "grad_norm": 0.3053813576698303, + "learning_rate": 3.1793103448275864e-05, + "loss": 0.117, + "step": 3168 + }, + { + "epoch": 3.673404332391509, + "grad_norm": 0.34098416566848755, + "learning_rate": 3.1787356321839085e-05, + "loss": 0.1131, + "step": 3169 + }, + { + "epoch": 3.674563500688256, + "grad_norm": 0.2684711813926697, + "learning_rate": 3.17816091954023e-05, + "loss": 0.1205, + "step": 3170 + }, + { + "epoch": 3.6757226689850033, + "grad_norm": 0.2564740777015686, + "learning_rate": 3.1775862068965515e-05, + "loss": 0.1063, + "step": 3171 + }, + { + "epoch": 3.6768818372817504, + "grad_norm": 0.27508318424224854, + "learning_rate": 3.177011494252874e-05, + "loss": 0.1106, + "step": 3172 + }, + { + "epoch": 3.6780410055784976, + "grad_norm": 0.23721154034137726, + "learning_rate": 3.176436781609196e-05, + "loss": 0.107, + "step": 3173 + }, + { + "epoch": 3.6792001738752447, + "grad_norm": 0.2993345260620117, + "learning_rate": 3.1758620689655174e-05, + "loss": 0.1075, + "step": 3174 + }, + { + "epoch": 3.6803593421719913, + "grad_norm": 0.3475472927093506, + "learning_rate": 3.1752873563218395e-05, + "loss": 0.1034, + "step": 3175 + }, + { + "epoch": 3.681518510468739, + "grad_norm": 0.25363993644714355, + "learning_rate": 3.174712643678161e-05, + "loss": 0.113, + "step": 3176 + }, + { + "epoch": 3.6826776787654856, + "grad_norm": 0.27115947008132935, + "learning_rate": 3.1741379310344825e-05, + "loss": 0.1231, + "step": 3177 + }, + { + "epoch": 3.6838368470622327, + "grad_norm": 0.2857735753059387, + "learning_rate": 3.173563218390805e-05, + "loss": 0.1171, + "step": 3178 + }, + { + "epoch": 3.68499601535898, + "grad_norm": 0.2827852666378021, + "learning_rate": 3.172988505747126e-05, + "loss": 0.1089, + "step": 3179 + }, + { + "epoch": 3.686155183655727, + "grad_norm": 0.31064677238464355, + "learning_rate": 3.172413793103448e-05, + "loss": 0.1281, + "step": 3180 + }, + { + "epoch": 3.687314351952474, + "grad_norm": 0.38487324118614197, + "learning_rate": 3.1718390804597705e-05, + "loss": 0.1147, + "step": 3181 + }, + { + "epoch": 3.688473520249221, + "grad_norm": 0.2798865735530853, + "learning_rate": 3.171264367816092e-05, + "loss": 0.1065, + "step": 3182 + }, + { + "epoch": 3.6896326885459683, + "grad_norm": 0.3226175308227539, + "learning_rate": 3.170689655172414e-05, + "loss": 0.1131, + "step": 3183 + }, + { + "epoch": 3.6907918568427154, + "grad_norm": 0.2735946774482727, + "learning_rate": 3.1701149425287356e-05, + "loss": 0.1149, + "step": 3184 + }, + { + "epoch": 3.6919510251394625, + "grad_norm": 0.3574925661087036, + "learning_rate": 3.169540229885057e-05, + "loss": 0.1081, + "step": 3185 + }, + { + "epoch": 3.6931101934362096, + "grad_norm": 0.24798470735549927, + "learning_rate": 3.168965517241379e-05, + "loss": 0.1078, + "step": 3186 + }, + { + "epoch": 3.6942693617329567, + "grad_norm": 0.27632611989974976, + "learning_rate": 3.1683908045977015e-05, + "loss": 0.1247, + "step": 3187 + }, + { + "epoch": 3.695428530029704, + "grad_norm": 0.21540036797523499, + "learning_rate": 3.1678160919540236e-05, + "loss": 0.0948, + "step": 3188 + }, + { + "epoch": 3.696587698326451, + "grad_norm": 0.34586644172668457, + "learning_rate": 3.167241379310345e-05, + "loss": 0.1299, + "step": 3189 + }, + { + "epoch": 3.6977468666231976, + "grad_norm": 0.3047373592853546, + "learning_rate": 3.1666666666666666e-05, + "loss": 0.1185, + "step": 3190 + }, + { + "epoch": 3.698906034919945, + "grad_norm": 0.30293378233909607, + "learning_rate": 3.166091954022989e-05, + "loss": 0.114, + "step": 3191 + }, + { + "epoch": 3.700065203216692, + "grad_norm": 0.3187353014945984, + "learning_rate": 3.16551724137931e-05, + "loss": 0.1255, + "step": 3192 + }, + { + "epoch": 3.701224371513439, + "grad_norm": 0.25981852412223816, + "learning_rate": 3.1649425287356324e-05, + "loss": 0.114, + "step": 3193 + }, + { + "epoch": 3.702383539810186, + "grad_norm": 0.2848886549472809, + "learning_rate": 3.1643678160919546e-05, + "loss": 0.104, + "step": 3194 + }, + { + "epoch": 3.703542708106933, + "grad_norm": 0.2802824079990387, + "learning_rate": 3.163793103448276e-05, + "loss": 0.0887, + "step": 3195 + }, + { + "epoch": 3.7047018764036803, + "grad_norm": 0.2874505817890167, + "learning_rate": 3.1632183908045976e-05, + "loss": 0.1067, + "step": 3196 + }, + { + "epoch": 3.7058610447004274, + "grad_norm": 0.4384983479976654, + "learning_rate": 3.16264367816092e-05, + "loss": 0.1229, + "step": 3197 + }, + { + "epoch": 3.7070202129971745, + "grad_norm": 0.3324441909790039, + "learning_rate": 3.162068965517241e-05, + "loss": 0.1089, + "step": 3198 + }, + { + "epoch": 3.7081793812939217, + "grad_norm": 0.311897337436676, + "learning_rate": 3.1614942528735634e-05, + "loss": 0.1182, + "step": 3199 + }, + { + "epoch": 3.7093385495906688, + "grad_norm": 0.28386661410331726, + "learning_rate": 3.160919540229885e-05, + "loss": 0.1092, + "step": 3200 + }, + { + "epoch": 3.710497717887416, + "grad_norm": 0.27510008215904236, + "learning_rate": 3.160344827586207e-05, + "loss": 0.1098, + "step": 3201 + }, + { + "epoch": 3.711656886184163, + "grad_norm": 0.25964272022247314, + "learning_rate": 3.159770114942529e-05, + "loss": 0.0969, + "step": 3202 + }, + { + "epoch": 3.7128160544809097, + "grad_norm": 0.31421464681625366, + "learning_rate": 3.159195402298851e-05, + "loss": 0.1123, + "step": 3203 + }, + { + "epoch": 3.7139752227776572, + "grad_norm": 0.2845355272293091, + "learning_rate": 3.158620689655172e-05, + "loss": 0.1089, + "step": 3204 + }, + { + "epoch": 3.715134391074404, + "grad_norm": 0.5368481874465942, + "learning_rate": 3.1580459770114944e-05, + "loss": 0.1038, + "step": 3205 + }, + { + "epoch": 3.7162935593711515, + "grad_norm": 0.5967825651168823, + "learning_rate": 3.157471264367816e-05, + "loss": 0.105, + "step": 3206 + }, + { + "epoch": 3.717452727667898, + "grad_norm": 0.29082855582237244, + "learning_rate": 3.156896551724138e-05, + "loss": 0.1157, + "step": 3207 + }, + { + "epoch": 3.7186118959646453, + "grad_norm": 0.337176114320755, + "learning_rate": 3.15632183908046e-05, + "loss": 0.1105, + "step": 3208 + }, + { + "epoch": 3.7197710642613924, + "grad_norm": 0.4318813979625702, + "learning_rate": 3.155747126436782e-05, + "loss": 0.1172, + "step": 3209 + }, + { + "epoch": 3.7209302325581395, + "grad_norm": 0.2882601320743561, + "learning_rate": 3.155172413793104e-05, + "loss": 0.1016, + "step": 3210 + }, + { + "epoch": 3.7220894008548866, + "grad_norm": 0.2961920499801636, + "learning_rate": 3.1545977011494254e-05, + "loss": 0.1317, + "step": 3211 + }, + { + "epoch": 3.7232485691516337, + "grad_norm": 0.2166760265827179, + "learning_rate": 3.154022988505747e-05, + "loss": 0.1111, + "step": 3212 + }, + { + "epoch": 3.724407737448381, + "grad_norm": 0.28654518723487854, + "learning_rate": 3.153448275862069e-05, + "loss": 0.1187, + "step": 3213 + }, + { + "epoch": 3.725566905745128, + "grad_norm": 0.34299778938293457, + "learning_rate": 3.152873563218391e-05, + "loss": 0.1278, + "step": 3214 + }, + { + "epoch": 3.726726074041875, + "grad_norm": 0.25071898102760315, + "learning_rate": 3.152298850574713e-05, + "loss": 0.1067, + "step": 3215 + }, + { + "epoch": 3.727885242338622, + "grad_norm": 0.2629477083683014, + "learning_rate": 3.151724137931035e-05, + "loss": 0.1121, + "step": 3216 + }, + { + "epoch": 3.7290444106353693, + "grad_norm": 0.37242335081100464, + "learning_rate": 3.151149425287356e-05, + "loss": 0.1083, + "step": 3217 + }, + { + "epoch": 3.730203578932116, + "grad_norm": 0.27490779757499695, + "learning_rate": 3.1505747126436785e-05, + "loss": 0.1174, + "step": 3218 + }, + { + "epoch": 3.7313627472288635, + "grad_norm": 0.2524685263633728, + "learning_rate": 3.15e-05, + "loss": 0.118, + "step": 3219 + }, + { + "epoch": 3.73252191552561, + "grad_norm": 0.27024635672569275, + "learning_rate": 3.1494252873563215e-05, + "loss": 0.1198, + "step": 3220 + }, + { + "epoch": 3.7336810838223573, + "grad_norm": 0.2741260528564453, + "learning_rate": 3.148850574712644e-05, + "loss": 0.1144, + "step": 3221 + }, + { + "epoch": 3.7348402521191044, + "grad_norm": 0.321684867143631, + "learning_rate": 3.148275862068966e-05, + "loss": 0.0993, + "step": 3222 + }, + { + "epoch": 3.7359994204158515, + "grad_norm": 0.19403128325939178, + "learning_rate": 3.147701149425287e-05, + "loss": 0.0992, + "step": 3223 + }, + { + "epoch": 3.7371585887125987, + "grad_norm": 0.26611411571502686, + "learning_rate": 3.1471264367816095e-05, + "loss": 0.1176, + "step": 3224 + }, + { + "epoch": 3.7383177570093458, + "grad_norm": 0.36536505818367004, + "learning_rate": 3.146551724137931e-05, + "loss": 0.1099, + "step": 3225 + }, + { + "epoch": 3.739476925306093, + "grad_norm": 0.2902340590953827, + "learning_rate": 3.145977011494253e-05, + "loss": 0.1109, + "step": 3226 + }, + { + "epoch": 3.74063609360284, + "grad_norm": 0.24061763286590576, + "learning_rate": 3.1454022988505746e-05, + "loss": 0.111, + "step": 3227 + }, + { + "epoch": 3.741795261899587, + "grad_norm": 0.32074034214019775, + "learning_rate": 3.144827586206897e-05, + "loss": 0.1056, + "step": 3228 + }, + { + "epoch": 3.7429544301963342, + "grad_norm": 0.2975504994392395, + "learning_rate": 3.144252873563219e-05, + "loss": 0.1241, + "step": 3229 + }, + { + "epoch": 3.7441135984930813, + "grad_norm": 0.33813774585723877, + "learning_rate": 3.1436781609195405e-05, + "loss": 0.11, + "step": 3230 + }, + { + "epoch": 3.7452727667898285, + "grad_norm": 0.29403892159461975, + "learning_rate": 3.143103448275862e-05, + "loss": 0.1118, + "step": 3231 + }, + { + "epoch": 3.7464319350865756, + "grad_norm": 0.2610683739185333, + "learning_rate": 3.142528735632184e-05, + "loss": 0.1055, + "step": 3232 + }, + { + "epoch": 3.7475911033833222, + "grad_norm": 0.2696630358695984, + "learning_rate": 3.1419540229885056e-05, + "loss": 0.1062, + "step": 3233 + }, + { + "epoch": 3.74875027168007, + "grad_norm": 0.3874567747116089, + "learning_rate": 3.141379310344828e-05, + "loss": 0.1243, + "step": 3234 + }, + { + "epoch": 3.7499094399768165, + "grad_norm": 0.39800602197647095, + "learning_rate": 3.14080459770115e-05, + "loss": 0.124, + "step": 3235 + }, + { + "epoch": 3.7510686082735636, + "grad_norm": 0.3324180245399475, + "learning_rate": 3.1402298850574714e-05, + "loss": 0.1292, + "step": 3236 + }, + { + "epoch": 3.7522277765703107, + "grad_norm": 0.3287622630596161, + "learning_rate": 3.1396551724137936e-05, + "loss": 0.1152, + "step": 3237 + }, + { + "epoch": 3.753386944867058, + "grad_norm": 0.22335346043109894, + "learning_rate": 3.139080459770115e-05, + "loss": 0.1006, + "step": 3238 + }, + { + "epoch": 3.754546113163805, + "grad_norm": 0.32788801193237305, + "learning_rate": 3.1385057471264366e-05, + "loss": 0.1072, + "step": 3239 + }, + { + "epoch": 3.755705281460552, + "grad_norm": 0.3683432340621948, + "learning_rate": 3.137931034482759e-05, + "loss": 0.1085, + "step": 3240 + }, + { + "epoch": 3.756864449757299, + "grad_norm": 0.33938443660736084, + "learning_rate": 3.13735632183908e-05, + "loss": 0.1115, + "step": 3241 + }, + { + "epoch": 3.7580236180540463, + "grad_norm": 0.27710798382759094, + "learning_rate": 3.1367816091954024e-05, + "loss": 0.1152, + "step": 3242 + }, + { + "epoch": 3.7591827863507934, + "grad_norm": 0.4090668261051178, + "learning_rate": 3.1362068965517246e-05, + "loss": 0.1064, + "step": 3243 + }, + { + "epoch": 3.7603419546475405, + "grad_norm": 0.33840107917785645, + "learning_rate": 3.135632183908046e-05, + "loss": 0.1124, + "step": 3244 + }, + { + "epoch": 3.7615011229442876, + "grad_norm": 0.2494470775127411, + "learning_rate": 3.135057471264368e-05, + "loss": 0.1, + "step": 3245 + }, + { + "epoch": 3.7626602912410343, + "grad_norm": 0.2987157106399536, + "learning_rate": 3.13448275862069e-05, + "loss": 0.1159, + "step": 3246 + }, + { + "epoch": 3.763819459537782, + "grad_norm": 0.3439898192882538, + "learning_rate": 3.133908045977011e-05, + "loss": 0.1171, + "step": 3247 + }, + { + "epoch": 3.7649786278345285, + "grad_norm": 0.4079365134239197, + "learning_rate": 3.1333333333333334e-05, + "loss": 0.1184, + "step": 3248 + }, + { + "epoch": 3.766137796131276, + "grad_norm": 0.25229400396347046, + "learning_rate": 3.1327586206896555e-05, + "loss": 0.107, + "step": 3249 + }, + { + "epoch": 3.7672969644280228, + "grad_norm": 0.28538668155670166, + "learning_rate": 3.132183908045977e-05, + "loss": 0.1143, + "step": 3250 + }, + { + "epoch": 3.76845613272477, + "grad_norm": 0.3976428806781769, + "learning_rate": 3.131609195402299e-05, + "loss": 0.1152, + "step": 3251 + }, + { + "epoch": 3.769615301021517, + "grad_norm": 0.3206281363964081, + "learning_rate": 3.131034482758621e-05, + "loss": 0.1219, + "step": 3252 + }, + { + "epoch": 3.770774469318264, + "grad_norm": 0.34073176980018616, + "learning_rate": 3.130459770114942e-05, + "loss": 0.1111, + "step": 3253 + }, + { + "epoch": 3.771933637615011, + "grad_norm": 0.2973473370075226, + "learning_rate": 3.1298850574712643e-05, + "loss": 0.1155, + "step": 3254 + }, + { + "epoch": 3.7730928059117583, + "grad_norm": 0.268359512090683, + "learning_rate": 3.1293103448275865e-05, + "loss": 0.1207, + "step": 3255 + }, + { + "epoch": 3.7742519742085054, + "grad_norm": 0.24991478025913239, + "learning_rate": 3.128735632183909e-05, + "loss": 0.1108, + "step": 3256 + }, + { + "epoch": 3.7754111425052526, + "grad_norm": 0.28493621945381165, + "learning_rate": 3.12816091954023e-05, + "loss": 0.1153, + "step": 3257 + }, + { + "epoch": 3.7765703108019997, + "grad_norm": 0.25810399651527405, + "learning_rate": 3.127586206896552e-05, + "loss": 0.1134, + "step": 3258 + }, + { + "epoch": 3.777729479098747, + "grad_norm": 0.33807915449142456, + "learning_rate": 3.127011494252874e-05, + "loss": 0.1156, + "step": 3259 + }, + { + "epoch": 3.778888647395494, + "grad_norm": 0.37843331694602966, + "learning_rate": 3.126436781609195e-05, + "loss": 0.1112, + "step": 3260 + }, + { + "epoch": 3.7800478156922406, + "grad_norm": 0.3349437713623047, + "learning_rate": 3.125862068965517e-05, + "loss": 0.1004, + "step": 3261 + }, + { + "epoch": 3.781206983988988, + "grad_norm": 0.24173638224601746, + "learning_rate": 3.1252873563218397e-05, + "loss": 0.1016, + "step": 3262 + }, + { + "epoch": 3.782366152285735, + "grad_norm": 0.27250927686691284, + "learning_rate": 3.124712643678161e-05, + "loss": 0.1076, + "step": 3263 + }, + { + "epoch": 3.783525320582482, + "grad_norm": 0.3555189073085785, + "learning_rate": 3.124137931034483e-05, + "loss": 0.1234, + "step": 3264 + }, + { + "epoch": 3.784684488879229, + "grad_norm": 0.3194526433944702, + "learning_rate": 3.123563218390805e-05, + "loss": 0.1168, + "step": 3265 + }, + { + "epoch": 3.785843657175976, + "grad_norm": 0.3455429673194885, + "learning_rate": 3.122988505747126e-05, + "loss": 0.1243, + "step": 3266 + }, + { + "epoch": 3.7870028254727233, + "grad_norm": 0.31268778443336487, + "learning_rate": 3.1224137931034485e-05, + "loss": 0.1013, + "step": 3267 + }, + { + "epoch": 3.7881619937694704, + "grad_norm": 0.3976519703865051, + "learning_rate": 3.12183908045977e-05, + "loss": 0.1106, + "step": 3268 + }, + { + "epoch": 3.7893211620662175, + "grad_norm": 0.37077805399894714, + "learning_rate": 3.121264367816092e-05, + "loss": 0.115, + "step": 3269 + }, + { + "epoch": 3.7904803303629646, + "grad_norm": 0.26564523577690125, + "learning_rate": 3.120689655172414e-05, + "loss": 0.1025, + "step": 3270 + }, + { + "epoch": 3.7916394986597117, + "grad_norm": 0.30875927209854126, + "learning_rate": 3.120114942528736e-05, + "loss": 0.1061, + "step": 3271 + }, + { + "epoch": 3.792798666956459, + "grad_norm": 0.2600082755088806, + "learning_rate": 3.119540229885058e-05, + "loss": 0.0994, + "step": 3272 + }, + { + "epoch": 3.793957835253206, + "grad_norm": 0.2653372585773468, + "learning_rate": 3.1189655172413794e-05, + "loss": 0.1006, + "step": 3273 + }, + { + "epoch": 3.795117003549953, + "grad_norm": 0.23784595727920532, + "learning_rate": 3.118390804597701e-05, + "loss": 0.1064, + "step": 3274 + }, + { + "epoch": 3.7962761718467, + "grad_norm": 0.3346762955188751, + "learning_rate": 3.117816091954023e-05, + "loss": 0.1174, + "step": 3275 + }, + { + "epoch": 3.797435340143447, + "grad_norm": 0.28256985545158386, + "learning_rate": 3.117241379310345e-05, + "loss": 0.1233, + "step": 3276 + }, + { + "epoch": 3.7985945084401944, + "grad_norm": 0.3030017614364624, + "learning_rate": 3.116666666666667e-05, + "loss": 0.1022, + "step": 3277 + }, + { + "epoch": 3.799753676736941, + "grad_norm": 0.28801146149635315, + "learning_rate": 3.116091954022989e-05, + "loss": 0.1084, + "step": 3278 + }, + { + "epoch": 3.800912845033688, + "grad_norm": 0.3173084259033203, + "learning_rate": 3.1155172413793104e-05, + "loss": 0.1118, + "step": 3279 + }, + { + "epoch": 3.8020720133304353, + "grad_norm": 0.22037595510482788, + "learning_rate": 3.114942528735632e-05, + "loss": 0.1013, + "step": 3280 + }, + { + "epoch": 3.8032311816271824, + "grad_norm": 0.4173763394355774, + "learning_rate": 3.114367816091954e-05, + "loss": 0.1147, + "step": 3281 + }, + { + "epoch": 3.8043903499239295, + "grad_norm": 0.38189518451690674, + "learning_rate": 3.113793103448276e-05, + "loss": 0.1191, + "step": 3282 + }, + { + "epoch": 3.8055495182206767, + "grad_norm": 0.28138333559036255, + "learning_rate": 3.1132183908045984e-05, + "loss": 0.118, + "step": 3283 + }, + { + "epoch": 3.806708686517424, + "grad_norm": 0.28683844208717346, + "learning_rate": 3.11264367816092e-05, + "loss": 0.1156, + "step": 3284 + }, + { + "epoch": 3.807867854814171, + "grad_norm": 0.28898361325263977, + "learning_rate": 3.1120689655172414e-05, + "loss": 0.1054, + "step": 3285 + }, + { + "epoch": 3.809027023110918, + "grad_norm": 0.2948157787322998, + "learning_rate": 3.1114942528735636e-05, + "loss": 0.0977, + "step": 3286 + }, + { + "epoch": 3.810186191407665, + "grad_norm": 0.27834632992744446, + "learning_rate": 3.110919540229885e-05, + "loss": 0.1206, + "step": 3287 + }, + { + "epoch": 3.8113453597044122, + "grad_norm": 0.2628282606601715, + "learning_rate": 3.1103448275862065e-05, + "loss": 0.1112, + "step": 3288 + }, + { + "epoch": 3.812504528001159, + "grad_norm": 0.2561165988445282, + "learning_rate": 3.109770114942529e-05, + "loss": 0.1138, + "step": 3289 + }, + { + "epoch": 3.8136636962979065, + "grad_norm": 0.23604638874530792, + "learning_rate": 3.109195402298851e-05, + "loss": 0.1081, + "step": 3290 + }, + { + "epoch": 3.814822864594653, + "grad_norm": 0.2508888840675354, + "learning_rate": 3.108620689655173e-05, + "loss": 0.1002, + "step": 3291 + }, + { + "epoch": 3.8159820328914003, + "grad_norm": 0.3218640089035034, + "learning_rate": 3.1080459770114945e-05, + "loss": 0.1064, + "step": 3292 + }, + { + "epoch": 3.8171412011881474, + "grad_norm": 0.22713243961334229, + "learning_rate": 3.107471264367816e-05, + "loss": 0.1074, + "step": 3293 + }, + { + "epoch": 3.8183003694848945, + "grad_norm": 0.2802816927433014, + "learning_rate": 3.106896551724138e-05, + "loss": 0.1089, + "step": 3294 + }, + { + "epoch": 3.8194595377816416, + "grad_norm": 0.25849002599716187, + "learning_rate": 3.10632183908046e-05, + "loss": 0.1132, + "step": 3295 + }, + { + "epoch": 3.8206187060783887, + "grad_norm": 0.3340066373348236, + "learning_rate": 3.105747126436782e-05, + "loss": 0.1287, + "step": 3296 + }, + { + "epoch": 3.821777874375136, + "grad_norm": 0.3333747684955597, + "learning_rate": 3.105172413793104e-05, + "loss": 0.1157, + "step": 3297 + }, + { + "epoch": 3.822937042671883, + "grad_norm": 0.26639991998672485, + "learning_rate": 3.1045977011494255e-05, + "loss": 0.1089, + "step": 3298 + }, + { + "epoch": 3.82409621096863, + "grad_norm": 0.2500338852405548, + "learning_rate": 3.104022988505747e-05, + "loss": 0.1146, + "step": 3299 + }, + { + "epoch": 3.825255379265377, + "grad_norm": 0.3188420534133911, + "learning_rate": 3.103448275862069e-05, + "loss": 0.1142, + "step": 3300 + }, + { + "epoch": 3.8264145475621243, + "grad_norm": 0.22074191272258759, + "learning_rate": 3.1028735632183907e-05, + "loss": 0.1022, + "step": 3301 + }, + { + "epoch": 3.8275737158588714, + "grad_norm": 0.32169198989868164, + "learning_rate": 3.102298850574713e-05, + "loss": 0.1234, + "step": 3302 + }, + { + "epoch": 3.8287328841556185, + "grad_norm": 0.23068860173225403, + "learning_rate": 3.101724137931035e-05, + "loss": 0.1052, + "step": 3303 + }, + { + "epoch": 3.829892052452365, + "grad_norm": 0.4056016206741333, + "learning_rate": 3.1011494252873565e-05, + "loss": 0.1122, + "step": 3304 + }, + { + "epoch": 3.8310512207491128, + "grad_norm": 0.7055277228355408, + "learning_rate": 3.1005747126436786e-05, + "loss": 0.1154, + "step": 3305 + }, + { + "epoch": 3.8322103890458594, + "grad_norm": 0.3167892098426819, + "learning_rate": 3.1e-05, + "loss": 0.1151, + "step": 3306 + }, + { + "epoch": 3.8333695573426065, + "grad_norm": 0.3068336546421051, + "learning_rate": 3.0994252873563216e-05, + "loss": 0.1093, + "step": 3307 + }, + { + "epoch": 3.8345287256393537, + "grad_norm": 0.28874102234840393, + "learning_rate": 3.098850574712644e-05, + "loss": 0.101, + "step": 3308 + }, + { + "epoch": 3.8356878939361008, + "grad_norm": 0.30974501371383667, + "learning_rate": 3.098275862068965e-05, + "loss": 0.1292, + "step": 3309 + }, + { + "epoch": 3.836847062232848, + "grad_norm": 0.2519693672657013, + "learning_rate": 3.0977011494252875e-05, + "loss": 0.0984, + "step": 3310 + }, + { + "epoch": 3.838006230529595, + "grad_norm": 0.3312533497810364, + "learning_rate": 3.0971264367816096e-05, + "loss": 0.1092, + "step": 3311 + }, + { + "epoch": 3.839165398826342, + "grad_norm": 0.33562129735946655, + "learning_rate": 3.096551724137931e-05, + "loss": 0.1045, + "step": 3312 + }, + { + "epoch": 3.8403245671230892, + "grad_norm": 0.22888796031475067, + "learning_rate": 3.095977011494253e-05, + "loss": 0.1107, + "step": 3313 + }, + { + "epoch": 3.8414837354198363, + "grad_norm": 0.338222861289978, + "learning_rate": 3.095402298850575e-05, + "loss": 0.1227, + "step": 3314 + }, + { + "epoch": 3.8426429037165835, + "grad_norm": 0.3241053521633148, + "learning_rate": 3.094827586206896e-05, + "loss": 0.1243, + "step": 3315 + }, + { + "epoch": 3.8438020720133306, + "grad_norm": 0.2689477503299713, + "learning_rate": 3.0942528735632184e-05, + "loss": 0.1091, + "step": 3316 + }, + { + "epoch": 3.8449612403100772, + "grad_norm": 0.2917823791503906, + "learning_rate": 3.0936781609195406e-05, + "loss": 0.099, + "step": 3317 + }, + { + "epoch": 3.846120408606825, + "grad_norm": 0.2831590473651886, + "learning_rate": 3.093103448275862e-05, + "loss": 0.0987, + "step": 3318 + }, + { + "epoch": 3.8472795769035715, + "grad_norm": 0.2956966161727905, + "learning_rate": 3.092528735632184e-05, + "loss": 0.1038, + "step": 3319 + }, + { + "epoch": 3.848438745200319, + "grad_norm": 0.41253408789634705, + "learning_rate": 3.091954022988506e-05, + "loss": 0.1173, + "step": 3320 + }, + { + "epoch": 3.8495979134970657, + "grad_norm": 0.37665021419525146, + "learning_rate": 3.091379310344828e-05, + "loss": 0.1211, + "step": 3321 + }, + { + "epoch": 3.850757081793813, + "grad_norm": 0.29172924160957336, + "learning_rate": 3.0908045977011494e-05, + "loss": 0.1087, + "step": 3322 + }, + { + "epoch": 3.85191625009056, + "grad_norm": 0.4337025284767151, + "learning_rate": 3.0902298850574716e-05, + "loss": 0.1089, + "step": 3323 + }, + { + "epoch": 3.853075418387307, + "grad_norm": 0.36892715096473694, + "learning_rate": 3.089655172413794e-05, + "loss": 0.108, + "step": 3324 + }, + { + "epoch": 3.854234586684054, + "grad_norm": 0.34563952684402466, + "learning_rate": 3.089080459770115e-05, + "loss": 0.116, + "step": 3325 + }, + { + "epoch": 3.8553937549808013, + "grad_norm": 0.3011876344680786, + "learning_rate": 3.088505747126437e-05, + "loss": 0.1105, + "step": 3326 + }, + { + "epoch": 3.8565529232775484, + "grad_norm": 0.36122533679008484, + "learning_rate": 3.087931034482759e-05, + "loss": 0.1171, + "step": 3327 + }, + { + "epoch": 3.8577120915742955, + "grad_norm": 0.31418511271476746, + "learning_rate": 3.0873563218390804e-05, + "loss": 0.1189, + "step": 3328 + }, + { + "epoch": 3.8588712598710426, + "grad_norm": 0.2561214566230774, + "learning_rate": 3.0867816091954025e-05, + "loss": 0.1061, + "step": 3329 + }, + { + "epoch": 3.8600304281677897, + "grad_norm": 0.28758811950683594, + "learning_rate": 3.086206896551724e-05, + "loss": 0.1142, + "step": 3330 + }, + { + "epoch": 3.861189596464537, + "grad_norm": 0.2774584889411926, + "learning_rate": 3.085632183908046e-05, + "loss": 0.1097, + "step": 3331 + }, + { + "epoch": 3.8623487647612835, + "grad_norm": 0.3186490535736084, + "learning_rate": 3.0850574712643684e-05, + "loss": 0.117, + "step": 3332 + }, + { + "epoch": 3.863507933058031, + "grad_norm": 0.3841571807861328, + "learning_rate": 3.08448275862069e-05, + "loss": 0.1224, + "step": 3333 + }, + { + "epoch": 3.8646671013547778, + "grad_norm": 0.2925528585910797, + "learning_rate": 3.0839080459770113e-05, + "loss": 0.1049, + "step": 3334 + }, + { + "epoch": 3.865826269651525, + "grad_norm": 0.35688725113868713, + "learning_rate": 3.0833333333333335e-05, + "loss": 0.1102, + "step": 3335 + }, + { + "epoch": 3.866985437948272, + "grad_norm": 0.27751150727272034, + "learning_rate": 3.082758620689655e-05, + "loss": 0.1114, + "step": 3336 + }, + { + "epoch": 3.868144606245019, + "grad_norm": 0.25139451026916504, + "learning_rate": 3.082183908045977e-05, + "loss": 0.1159, + "step": 3337 + }, + { + "epoch": 3.869303774541766, + "grad_norm": 0.4053381085395813, + "learning_rate": 3.0816091954022993e-05, + "loss": 0.1268, + "step": 3338 + }, + { + "epoch": 3.8704629428385133, + "grad_norm": 0.22363197803497314, + "learning_rate": 3.081034482758621e-05, + "loss": 0.106, + "step": 3339 + }, + { + "epoch": 3.8716221111352604, + "grad_norm": 0.310592919588089, + "learning_rate": 3.080459770114943e-05, + "loss": 0.1168, + "step": 3340 + }, + { + "epoch": 3.8727812794320076, + "grad_norm": 0.29735296964645386, + "learning_rate": 3.0798850574712645e-05, + "loss": 0.1257, + "step": 3341 + }, + { + "epoch": 3.8739404477287547, + "grad_norm": 0.2444659024477005, + "learning_rate": 3.079310344827586e-05, + "loss": 0.109, + "step": 3342 + }, + { + "epoch": 3.875099616025502, + "grad_norm": 0.32433512806892395, + "learning_rate": 3.078735632183908e-05, + "loss": 0.1177, + "step": 3343 + }, + { + "epoch": 3.876258784322249, + "grad_norm": 0.2731485366821289, + "learning_rate": 3.07816091954023e-05, + "loss": 0.1176, + "step": 3344 + }, + { + "epoch": 3.877417952618996, + "grad_norm": 0.25933563709259033, + "learning_rate": 3.077586206896552e-05, + "loss": 0.0964, + "step": 3345 + }, + { + "epoch": 3.878577120915743, + "grad_norm": 0.34049245715141296, + "learning_rate": 3.077011494252874e-05, + "loss": 0.1237, + "step": 3346 + }, + { + "epoch": 3.87973628921249, + "grad_norm": 0.2941095232963562, + "learning_rate": 3.0764367816091955e-05, + "loss": 0.1159, + "step": 3347 + }, + { + "epoch": 3.8808954575092374, + "grad_norm": 0.2893029451370239, + "learning_rate": 3.0758620689655176e-05, + "loss": 0.1072, + "step": 3348 + }, + { + "epoch": 3.882054625805984, + "grad_norm": 0.2857878506183624, + "learning_rate": 3.075287356321839e-05, + "loss": 0.1117, + "step": 3349 + }, + { + "epoch": 3.883213794102731, + "grad_norm": 0.3128105103969574, + "learning_rate": 3.0747126436781606e-05, + "loss": 0.1154, + "step": 3350 + }, + { + "epoch": 3.8843729623994783, + "grad_norm": 0.34108099341392517, + "learning_rate": 3.074137931034483e-05, + "loss": 0.1083, + "step": 3351 + }, + { + "epoch": 3.8855321306962254, + "grad_norm": 0.29815927147865295, + "learning_rate": 3.073563218390805e-05, + "loss": 0.1092, + "step": 3352 + }, + { + "epoch": 3.8866912989929725, + "grad_norm": 0.32374799251556396, + "learning_rate": 3.0729885057471264e-05, + "loss": 0.1138, + "step": 3353 + }, + { + "epoch": 3.8878504672897196, + "grad_norm": 0.30145663022994995, + "learning_rate": 3.0724137931034486e-05, + "loss": 0.1105, + "step": 3354 + }, + { + "epoch": 3.8890096355864667, + "grad_norm": 0.27380499243736267, + "learning_rate": 3.07183908045977e-05, + "loss": 0.1038, + "step": 3355 + }, + { + "epoch": 3.890168803883214, + "grad_norm": 0.33210647106170654, + "learning_rate": 3.0712643678160916e-05, + "loss": 0.1312, + "step": 3356 + }, + { + "epoch": 3.891327972179961, + "grad_norm": 0.27347707748413086, + "learning_rate": 3.070689655172414e-05, + "loss": 0.104, + "step": 3357 + }, + { + "epoch": 3.892487140476708, + "grad_norm": 0.31707629561424255, + "learning_rate": 3.070114942528736e-05, + "loss": 0.1097, + "step": 3358 + }, + { + "epoch": 3.893646308773455, + "grad_norm": 0.3601592481136322, + "learning_rate": 3.069540229885058e-05, + "loss": 0.1209, + "step": 3359 + }, + { + "epoch": 3.894805477070202, + "grad_norm": 0.29122745990753174, + "learning_rate": 3.0689655172413796e-05, + "loss": 0.1059, + "step": 3360 + }, + { + "epoch": 3.8959646453669494, + "grad_norm": 0.3259906470775604, + "learning_rate": 3.068390804597701e-05, + "loss": 0.1168, + "step": 3361 + }, + { + "epoch": 3.897123813663696, + "grad_norm": 0.31396666169166565, + "learning_rate": 3.067816091954023e-05, + "loss": 0.1048, + "step": 3362 + }, + { + "epoch": 3.8982829819604436, + "grad_norm": 0.28336969017982483, + "learning_rate": 3.067241379310345e-05, + "loss": 0.1083, + "step": 3363 + }, + { + "epoch": 3.8994421502571903, + "grad_norm": 0.29647666215896606, + "learning_rate": 3.066666666666667e-05, + "loss": 0.1163, + "step": 3364 + }, + { + "epoch": 3.9006013185539374, + "grad_norm": 0.4049275815486908, + "learning_rate": 3.066091954022989e-05, + "loss": 0.1132, + "step": 3365 + }, + { + "epoch": 3.9017604868506846, + "grad_norm": 0.2655840516090393, + "learning_rate": 3.0655172413793106e-05, + "loss": 0.0983, + "step": 3366 + }, + { + "epoch": 3.9029196551474317, + "grad_norm": 0.5942867994308472, + "learning_rate": 3.064942528735633e-05, + "loss": 0.1326, + "step": 3367 + }, + { + "epoch": 3.904078823444179, + "grad_norm": 0.24042394757270813, + "learning_rate": 3.064367816091954e-05, + "loss": 0.1083, + "step": 3368 + }, + { + "epoch": 3.905237991740926, + "grad_norm": 0.24177686870098114, + "learning_rate": 3.063793103448276e-05, + "loss": 0.1151, + "step": 3369 + }, + { + "epoch": 3.906397160037673, + "grad_norm": 0.33689844608306885, + "learning_rate": 3.063218390804598e-05, + "loss": 0.1128, + "step": 3370 + }, + { + "epoch": 3.90755632833442, + "grad_norm": 0.3097141683101654, + "learning_rate": 3.0626436781609194e-05, + "loss": 0.1209, + "step": 3371 + }, + { + "epoch": 3.9087154966311672, + "grad_norm": 0.2580316364765167, + "learning_rate": 3.0620689655172415e-05, + "loss": 0.1079, + "step": 3372 + }, + { + "epoch": 3.9098746649279144, + "grad_norm": 0.2943916320800781, + "learning_rate": 3.061494252873564e-05, + "loss": 0.124, + "step": 3373 + }, + { + "epoch": 3.9110338332246615, + "grad_norm": 0.27771899104118347, + "learning_rate": 3.060919540229885e-05, + "loss": 0.1084, + "step": 3374 + }, + { + "epoch": 3.912193001521408, + "grad_norm": 0.30981943011283875, + "learning_rate": 3.060344827586207e-05, + "loss": 0.1076, + "step": 3375 + }, + { + "epoch": 3.9133521698181557, + "grad_norm": 0.3062245547771454, + "learning_rate": 3.059770114942529e-05, + "loss": 0.1273, + "step": 3376 + }, + { + "epoch": 3.9145113381149024, + "grad_norm": 0.25687843561172485, + "learning_rate": 3.05919540229885e-05, + "loss": 0.1053, + "step": 3377 + }, + { + "epoch": 3.9156705064116495, + "grad_norm": 0.25983986258506775, + "learning_rate": 3.0586206896551725e-05, + "loss": 0.1062, + "step": 3378 + }, + { + "epoch": 3.9168296747083966, + "grad_norm": 0.7442601323127747, + "learning_rate": 3.058045977011495e-05, + "loss": 0.1203, + "step": 3379 + }, + { + "epoch": 3.9179888430051437, + "grad_norm": 0.289230078458786, + "learning_rate": 3.057471264367816e-05, + "loss": 0.1178, + "step": 3380 + }, + { + "epoch": 3.919148011301891, + "grad_norm": 0.25480005145072937, + "learning_rate": 3.056896551724138e-05, + "loss": 0.1119, + "step": 3381 + }, + { + "epoch": 3.920307179598638, + "grad_norm": 0.2439074069261551, + "learning_rate": 3.05632183908046e-05, + "loss": 0.103, + "step": 3382 + }, + { + "epoch": 3.921466347895385, + "grad_norm": 0.32492339611053467, + "learning_rate": 3.055747126436781e-05, + "loss": 0.1203, + "step": 3383 + }, + { + "epoch": 3.922625516192132, + "grad_norm": 0.2885267436504364, + "learning_rate": 3.0551724137931035e-05, + "loss": 0.1092, + "step": 3384 + }, + { + "epoch": 3.9237846844888793, + "grad_norm": 0.5572437644004822, + "learning_rate": 3.0545977011494256e-05, + "loss": 0.1181, + "step": 3385 + }, + { + "epoch": 3.9249438527856264, + "grad_norm": 0.3531005382537842, + "learning_rate": 3.054022988505748e-05, + "loss": 0.1129, + "step": 3386 + }, + { + "epoch": 3.9261030210823735, + "grad_norm": 0.35553064942359924, + "learning_rate": 3.053448275862069e-05, + "loss": 0.1238, + "step": 3387 + }, + { + "epoch": 3.9272621893791206, + "grad_norm": 0.31617647409439087, + "learning_rate": 3.052873563218391e-05, + "loss": 0.0986, + "step": 3388 + }, + { + "epoch": 3.9284213576758678, + "grad_norm": 0.3103896379470825, + "learning_rate": 3.052298850574713e-05, + "loss": 0.1315, + "step": 3389 + }, + { + "epoch": 3.9295805259726144, + "grad_norm": 0.31774574518203735, + "learning_rate": 3.0517241379310348e-05, + "loss": 0.1116, + "step": 3390 + }, + { + "epoch": 3.930739694269362, + "grad_norm": 0.2169465720653534, + "learning_rate": 3.0511494252873563e-05, + "loss": 0.0987, + "step": 3391 + }, + { + "epoch": 3.9318988625661087, + "grad_norm": 0.27531975507736206, + "learning_rate": 3.0505747126436784e-05, + "loss": 0.1173, + "step": 3392 + }, + { + "epoch": 3.9330580308628558, + "grad_norm": 0.34132128953933716, + "learning_rate": 3.05e-05, + "loss": 0.1246, + "step": 3393 + }, + { + "epoch": 3.934217199159603, + "grad_norm": 0.3186652362346649, + "learning_rate": 3.0494252873563218e-05, + "loss": 0.1182, + "step": 3394 + }, + { + "epoch": 3.93537636745635, + "grad_norm": 0.2878797948360443, + "learning_rate": 3.048850574712644e-05, + "loss": 0.1155, + "step": 3395 + }, + { + "epoch": 3.936535535753097, + "grad_norm": 0.27938562631607056, + "learning_rate": 3.0482758620689654e-05, + "loss": 0.1024, + "step": 3396 + }, + { + "epoch": 3.9376947040498442, + "grad_norm": 0.3184402883052826, + "learning_rate": 3.0477011494252876e-05, + "loss": 0.1094, + "step": 3397 + }, + { + "epoch": 3.9388538723465913, + "grad_norm": 0.2606091797351837, + "learning_rate": 3.0471264367816094e-05, + "loss": 0.1115, + "step": 3398 + }, + { + "epoch": 3.9400130406433385, + "grad_norm": 0.3425968885421753, + "learning_rate": 3.046551724137931e-05, + "loss": 0.1085, + "step": 3399 + }, + { + "epoch": 3.9411722089400856, + "grad_norm": 0.30267462134361267, + "learning_rate": 3.045977011494253e-05, + "loss": 0.1239, + "step": 3400 + }, + { + "epoch": 3.9423313772368327, + "grad_norm": 0.31355491280555725, + "learning_rate": 3.045402298850575e-05, + "loss": 0.1179, + "step": 3401 + }, + { + "epoch": 3.94349054553358, + "grad_norm": 0.2975825071334839, + "learning_rate": 3.0448275862068964e-05, + "loss": 0.1146, + "step": 3402 + }, + { + "epoch": 3.9446497138303265, + "grad_norm": 0.3157767653465271, + "learning_rate": 3.0442528735632186e-05, + "loss": 0.1192, + "step": 3403 + }, + { + "epoch": 3.945808882127074, + "grad_norm": 0.2541486620903015, + "learning_rate": 3.0436781609195404e-05, + "loss": 0.0994, + "step": 3404 + }, + { + "epoch": 3.9469680504238207, + "grad_norm": 0.42458853125572205, + "learning_rate": 3.0431034482758626e-05, + "loss": 0.1295, + "step": 3405 + }, + { + "epoch": 3.9481272187205683, + "grad_norm": 0.28080934286117554, + "learning_rate": 3.042528735632184e-05, + "loss": 0.1086, + "step": 3406 + }, + { + "epoch": 3.949286387017315, + "grad_norm": 0.26001524925231934, + "learning_rate": 3.041954022988506e-05, + "loss": 0.1099, + "step": 3407 + }, + { + "epoch": 3.950445555314062, + "grad_norm": 0.2862829267978668, + "learning_rate": 3.041379310344828e-05, + "loss": 0.1153, + "step": 3408 + }, + { + "epoch": 3.951604723610809, + "grad_norm": 0.2606809735298157, + "learning_rate": 3.0408045977011495e-05, + "loss": 0.1151, + "step": 3409 + }, + { + "epoch": 3.9527638919075563, + "grad_norm": 0.3059811294078827, + "learning_rate": 3.040229885057471e-05, + "loss": 0.1151, + "step": 3410 + }, + { + "epoch": 3.9539230602043034, + "grad_norm": 0.30108916759490967, + "learning_rate": 3.0396551724137935e-05, + "loss": 0.1125, + "step": 3411 + }, + { + "epoch": 3.9550822285010505, + "grad_norm": 0.21490095555782318, + "learning_rate": 3.039080459770115e-05, + "loss": 0.0883, + "step": 3412 + }, + { + "epoch": 3.9562413967977976, + "grad_norm": 0.39761850237846375, + "learning_rate": 3.0385057471264365e-05, + "loss": 0.1313, + "step": 3413 + }, + { + "epoch": 3.9574005650945447, + "grad_norm": 0.33936530351638794, + "learning_rate": 3.0379310344827587e-05, + "loss": 0.1119, + "step": 3414 + }, + { + "epoch": 3.958559733391292, + "grad_norm": 0.33767202496528625, + "learning_rate": 3.0373563218390805e-05, + "loss": 0.1272, + "step": 3415 + }, + { + "epoch": 3.959718901688039, + "grad_norm": 0.31474098563194275, + "learning_rate": 3.0367816091954027e-05, + "loss": 0.1093, + "step": 3416 + }, + { + "epoch": 3.960878069984786, + "grad_norm": 0.394424170255661, + "learning_rate": 3.0362068965517242e-05, + "loss": 0.1262, + "step": 3417 + }, + { + "epoch": 3.9620372382815328, + "grad_norm": 0.3407604992389679, + "learning_rate": 3.035632183908046e-05, + "loss": 0.1184, + "step": 3418 + }, + { + "epoch": 3.9631964065782803, + "grad_norm": 0.25149932503700256, + "learning_rate": 3.035057471264368e-05, + "loss": 0.1003, + "step": 3419 + }, + { + "epoch": 3.964355574875027, + "grad_norm": 0.22654342651367188, + "learning_rate": 3.0344827586206897e-05, + "loss": 0.1024, + "step": 3420 + }, + { + "epoch": 3.965514743171774, + "grad_norm": 0.3020685911178589, + "learning_rate": 3.0339080459770115e-05, + "loss": 0.1096, + "step": 3421 + }, + { + "epoch": 3.966673911468521, + "grad_norm": 0.25613948702812195, + "learning_rate": 3.0333333333333337e-05, + "loss": 0.1232, + "step": 3422 + }, + { + "epoch": 3.9678330797652683, + "grad_norm": 0.3310185670852661, + "learning_rate": 3.032758620689655e-05, + "loss": 0.1222, + "step": 3423 + }, + { + "epoch": 3.9689922480620154, + "grad_norm": 0.26435863971710205, + "learning_rate": 3.0321839080459773e-05, + "loss": 0.1013, + "step": 3424 + }, + { + "epoch": 3.9701514163587626, + "grad_norm": 0.3062216341495514, + "learning_rate": 3.031609195402299e-05, + "loss": 0.1156, + "step": 3425 + }, + { + "epoch": 3.9713105846555097, + "grad_norm": 0.3526940643787384, + "learning_rate": 3.0310344827586206e-05, + "loss": 0.1171, + "step": 3426 + }, + { + "epoch": 3.972469752952257, + "grad_norm": 0.2813412547111511, + "learning_rate": 3.0304597701149428e-05, + "loss": 0.1031, + "step": 3427 + }, + { + "epoch": 3.973628921249004, + "grad_norm": 0.3743179738521576, + "learning_rate": 3.0298850574712646e-05, + "loss": 0.1138, + "step": 3428 + }, + { + "epoch": 3.974788089545751, + "grad_norm": 0.30746176838874817, + "learning_rate": 3.029310344827586e-05, + "loss": 0.1214, + "step": 3429 + }, + { + "epoch": 3.975947257842498, + "grad_norm": 0.3143618106842041, + "learning_rate": 3.0287356321839083e-05, + "loss": 0.1114, + "step": 3430 + }, + { + "epoch": 3.9771064261392453, + "grad_norm": 0.33030736446380615, + "learning_rate": 3.02816091954023e-05, + "loss": 0.1038, + "step": 3431 + }, + { + "epoch": 3.9782655944359924, + "grad_norm": 0.40963098406791687, + "learning_rate": 3.0275862068965523e-05, + "loss": 0.1182, + "step": 3432 + }, + { + "epoch": 3.979424762732739, + "grad_norm": 0.31964415311813354, + "learning_rate": 3.0270114942528738e-05, + "loss": 0.1074, + "step": 3433 + }, + { + "epoch": 3.9805839310294866, + "grad_norm": 0.24795931577682495, + "learning_rate": 3.0264367816091953e-05, + "loss": 0.1048, + "step": 3434 + }, + { + "epoch": 3.9817430993262333, + "grad_norm": 0.25050121545791626, + "learning_rate": 3.0258620689655178e-05, + "loss": 0.1026, + "step": 3435 + }, + { + "epoch": 3.9829022676229804, + "grad_norm": 0.27471792697906494, + "learning_rate": 3.0252873563218393e-05, + "loss": 0.1081, + "step": 3436 + }, + { + "epoch": 3.9840614359197275, + "grad_norm": 0.24730391800403595, + "learning_rate": 3.0247126436781608e-05, + "loss": 0.1041, + "step": 3437 + }, + { + "epoch": 3.9852206042164746, + "grad_norm": 0.24231289327144623, + "learning_rate": 3.024137931034483e-05, + "loss": 0.1031, + "step": 3438 + }, + { + "epoch": 3.9863797725132217, + "grad_norm": 0.3932971656322479, + "learning_rate": 3.0235632183908047e-05, + "loss": 0.1118, + "step": 3439 + }, + { + "epoch": 3.987538940809969, + "grad_norm": 0.28917720913887024, + "learning_rate": 3.0229885057471262e-05, + "loss": 0.1139, + "step": 3440 + }, + { + "epoch": 3.988698109106716, + "grad_norm": 0.2533641457557678, + "learning_rate": 3.0224137931034484e-05, + "loss": 0.0963, + "step": 3441 + }, + { + "epoch": 3.989857277403463, + "grad_norm": 0.303200364112854, + "learning_rate": 3.0218390804597702e-05, + "loss": 0.1199, + "step": 3442 + }, + { + "epoch": 3.99101644570021, + "grad_norm": 0.2749335765838623, + "learning_rate": 3.0212643678160924e-05, + "loss": 0.1025, + "step": 3443 + }, + { + "epoch": 3.9921756139969573, + "grad_norm": 0.2617204785346985, + "learning_rate": 3.020689655172414e-05, + "loss": 0.1123, + "step": 3444 + }, + { + "epoch": 3.9933347822937044, + "grad_norm": 0.2848406434059143, + "learning_rate": 3.0201149425287357e-05, + "loss": 0.1098, + "step": 3445 + }, + { + "epoch": 3.994493950590451, + "grad_norm": 0.3450290262699127, + "learning_rate": 3.019540229885058e-05, + "loss": 0.1087, + "step": 3446 + }, + { + "epoch": 3.9956531188871987, + "grad_norm": 0.3187498450279236, + "learning_rate": 3.0189655172413794e-05, + "loss": 0.1144, + "step": 3447 + }, + { + "epoch": 3.9968122871839453, + "grad_norm": 0.3266300857067108, + "learning_rate": 3.0183908045977012e-05, + "loss": 0.1161, + "step": 3448 + }, + { + "epoch": 3.9979714554806924, + "grad_norm": 0.2566640079021454, + "learning_rate": 3.0178160919540234e-05, + "loss": 0.1034, + "step": 3449 + }, + { + "epoch": 3.9991306237774396, + "grad_norm": 0.31494787335395813, + "learning_rate": 3.017241379310345e-05, + "loss": 0.1086, + "step": 3450 + }, + { + "epoch": 3.9991306237774396, + "eval_loss": 0.1329651027917862, + "eval_runtime": 265.5812, + "eval_samples_per_second": 5.776, + "eval_steps_per_second": 5.776, + "step": 3450 + }, + { + "epoch": 4.000289792074187, + "grad_norm": 0.3459082245826721, + "learning_rate": 3.016666666666667e-05, + "loss": 0.1203, + "step": 3451 + }, + { + "epoch": 4.001448960370934, + "grad_norm": 0.25177237391471863, + "learning_rate": 3.016091954022989e-05, + "loss": 0.1, + "step": 3452 + }, + { + "epoch": 4.002608128667681, + "grad_norm": 0.2541625201702118, + "learning_rate": 3.0155172413793104e-05, + "loss": 0.1011, + "step": 3453 + }, + { + "epoch": 4.003767296964428, + "grad_norm": 0.23528751730918884, + "learning_rate": 3.0149425287356325e-05, + "loss": 0.1055, + "step": 3454 + }, + { + "epoch": 4.004926465261175, + "grad_norm": 0.25770679116249084, + "learning_rate": 3.014367816091954e-05, + "loss": 0.0924, + "step": 3455 + }, + { + "epoch": 4.006085633557922, + "grad_norm": 0.24531324207782745, + "learning_rate": 3.013793103448276e-05, + "loss": 0.1031, + "step": 3456 + }, + { + "epoch": 4.007244801854669, + "grad_norm": 0.2368677854537964, + "learning_rate": 3.013218390804598e-05, + "loss": 0.1022, + "step": 3457 + }, + { + "epoch": 4.0084039701514165, + "grad_norm": 0.2580307722091675, + "learning_rate": 3.0126436781609195e-05, + "loss": 0.0883, + "step": 3458 + }, + { + "epoch": 4.009563138448163, + "grad_norm": 0.2701059579849243, + "learning_rate": 3.0120689655172413e-05, + "loss": 0.0988, + "step": 3459 + }, + { + "epoch": 4.010722306744911, + "grad_norm": 0.3617304563522339, + "learning_rate": 3.0114942528735635e-05, + "loss": 0.1142, + "step": 3460 + }, + { + "epoch": 4.011881475041657, + "grad_norm": 0.300418883562088, + "learning_rate": 3.010919540229885e-05, + "loss": 0.0961, + "step": 3461 + }, + { + "epoch": 4.013040643338405, + "grad_norm": 0.34037333726882935, + "learning_rate": 3.010344827586207e-05, + "loss": 0.1101, + "step": 3462 + }, + { + "epoch": 4.014199811635152, + "grad_norm": 0.30946558713912964, + "learning_rate": 3.009770114942529e-05, + "loss": 0.1017, + "step": 3463 + }, + { + "epoch": 4.015358979931899, + "grad_norm": 0.2514745891094208, + "learning_rate": 3.0091954022988505e-05, + "loss": 0.0895, + "step": 3464 + }, + { + "epoch": 4.016518148228646, + "grad_norm": 0.3249182403087616, + "learning_rate": 3.0086206896551726e-05, + "loss": 0.0983, + "step": 3465 + }, + { + "epoch": 4.017677316525393, + "grad_norm": 0.30697742104530334, + "learning_rate": 3.0080459770114945e-05, + "loss": 0.1012, + "step": 3466 + }, + { + "epoch": 4.01883648482214, + "grad_norm": 0.5410303473472595, + "learning_rate": 3.007471264367816e-05, + "loss": 0.101, + "step": 3467 + }, + { + "epoch": 4.019995653118888, + "grad_norm": 0.367801308631897, + "learning_rate": 3.006896551724138e-05, + "loss": 0.0972, + "step": 3468 + }, + { + "epoch": 4.021154821415634, + "grad_norm": 0.6262068152427673, + "learning_rate": 3.00632183908046e-05, + "loss": 0.1085, + "step": 3469 + }, + { + "epoch": 4.022313989712381, + "grad_norm": 0.39368757605552673, + "learning_rate": 3.005747126436782e-05, + "loss": 0.1074, + "step": 3470 + }, + { + "epoch": 4.0234731580091285, + "grad_norm": 0.3936510682106018, + "learning_rate": 3.0051724137931036e-05, + "loss": 0.0996, + "step": 3471 + }, + { + "epoch": 4.024632326305875, + "grad_norm": 0.5199762582778931, + "learning_rate": 3.0045977011494254e-05, + "loss": 0.1011, + "step": 3472 + }, + { + "epoch": 4.025791494602623, + "grad_norm": 0.3035762906074524, + "learning_rate": 3.0040229885057476e-05, + "loss": 0.103, + "step": 3473 + }, + { + "epoch": 4.026950662899369, + "grad_norm": 0.34868428111076355, + "learning_rate": 3.003448275862069e-05, + "loss": 0.1066, + "step": 3474 + }, + { + "epoch": 4.028109831196117, + "grad_norm": 0.4163806438446045, + "learning_rate": 3.0028735632183906e-05, + "loss": 0.103, + "step": 3475 + }, + { + "epoch": 4.029268999492864, + "grad_norm": 0.4000799059867859, + "learning_rate": 3.002298850574713e-05, + "loss": 0.0984, + "step": 3476 + }, + { + "epoch": 4.030428167789611, + "grad_norm": 0.3437565267086029, + "learning_rate": 3.0017241379310346e-05, + "loss": 0.0968, + "step": 3477 + }, + { + "epoch": 4.031587336086358, + "grad_norm": 0.2822440564632416, + "learning_rate": 3.001149425287356e-05, + "loss": 0.1045, + "step": 3478 + }, + { + "epoch": 4.0327465043831054, + "grad_norm": 0.44937509298324585, + "learning_rate": 3.0005747126436782e-05, + "loss": 0.1031, + "step": 3479 + }, + { + "epoch": 4.033905672679852, + "grad_norm": 0.667371392250061, + "learning_rate": 3e-05, + "loss": 0.112, + "step": 3480 + }, + { + "epoch": 4.0350648409766, + "grad_norm": 0.44788849353790283, + "learning_rate": 2.9994252873563222e-05, + "loss": 0.1159, + "step": 3481 + }, + { + "epoch": 4.036224009273346, + "grad_norm": 0.3105098009109497, + "learning_rate": 2.9988505747126437e-05, + "loss": 0.1164, + "step": 3482 + }, + { + "epoch": 4.037383177570093, + "grad_norm": 0.3047453463077545, + "learning_rate": 2.9982758620689656e-05, + "loss": 0.098, + "step": 3483 + }, + { + "epoch": 4.038542345866841, + "grad_norm": 0.39169877767562866, + "learning_rate": 2.9977011494252877e-05, + "loss": 0.0997, + "step": 3484 + }, + { + "epoch": 4.039701514163587, + "grad_norm": 0.31317397952079773, + "learning_rate": 2.9971264367816092e-05, + "loss": 0.1013, + "step": 3485 + }, + { + "epoch": 4.040860682460335, + "grad_norm": 0.396670937538147, + "learning_rate": 2.996551724137931e-05, + "loss": 0.1137, + "step": 3486 + }, + { + "epoch": 4.0420198507570815, + "grad_norm": 0.30599015951156616, + "learning_rate": 2.9959770114942532e-05, + "loss": 0.1096, + "step": 3487 + }, + { + "epoch": 4.043179019053829, + "grad_norm": 0.35810816287994385, + "learning_rate": 2.9954022988505747e-05, + "loss": 0.0991, + "step": 3488 + }, + { + "epoch": 4.044338187350576, + "grad_norm": 0.36599722504615784, + "learning_rate": 2.994827586206897e-05, + "loss": 0.1015, + "step": 3489 + }, + { + "epoch": 4.045497355647323, + "grad_norm": 0.30756571888923645, + "learning_rate": 2.9942528735632187e-05, + "loss": 0.0986, + "step": 3490 + }, + { + "epoch": 4.04665652394407, + "grad_norm": 0.42771488428115845, + "learning_rate": 2.9936781609195402e-05, + "loss": 0.1141, + "step": 3491 + }, + { + "epoch": 4.0478156922408175, + "grad_norm": 0.24768565595149994, + "learning_rate": 2.9931034482758624e-05, + "loss": 0.1063, + "step": 3492 + }, + { + "epoch": 4.048974860537564, + "grad_norm": 0.3938906490802765, + "learning_rate": 2.9925287356321842e-05, + "loss": 0.096, + "step": 3493 + }, + { + "epoch": 4.050134028834312, + "grad_norm": 0.3523421883583069, + "learning_rate": 2.9919540229885057e-05, + "loss": 0.1066, + "step": 3494 + }, + { + "epoch": 4.051293197131058, + "grad_norm": 0.32402533292770386, + "learning_rate": 2.991379310344828e-05, + "loss": 0.106, + "step": 3495 + }, + { + "epoch": 4.052452365427806, + "grad_norm": 0.3420311510562897, + "learning_rate": 2.9908045977011497e-05, + "loss": 0.0934, + "step": 3496 + }, + { + "epoch": 4.053611533724553, + "grad_norm": 0.5918967723846436, + "learning_rate": 2.990229885057471e-05, + "loss": 0.105, + "step": 3497 + }, + { + "epoch": 4.054770702021299, + "grad_norm": 0.2808108627796173, + "learning_rate": 2.9896551724137933e-05, + "loss": 0.1022, + "step": 3498 + }, + { + "epoch": 4.055929870318047, + "grad_norm": 0.43918702006340027, + "learning_rate": 2.9890804597701148e-05, + "loss": 0.1052, + "step": 3499 + }, + { + "epoch": 4.0570890386147935, + "grad_norm": 0.32599446177482605, + "learning_rate": 2.988505747126437e-05, + "loss": 0.0984, + "step": 3500 + }, + { + "epoch": 4.058248206911541, + "grad_norm": 0.3194628655910492, + "learning_rate": 2.9879310344827588e-05, + "loss": 0.0974, + "step": 3501 + }, + { + "epoch": 4.059407375208288, + "grad_norm": 0.358102947473526, + "learning_rate": 2.9873563218390803e-05, + "loss": 0.1023, + "step": 3502 + }, + { + "epoch": 4.060566543505035, + "grad_norm": 0.30983057618141174, + "learning_rate": 2.9867816091954025e-05, + "loss": 0.0888, + "step": 3503 + }, + { + "epoch": 4.061725711801782, + "grad_norm": 0.38468021154403687, + "learning_rate": 2.9862068965517243e-05, + "loss": 0.0959, + "step": 3504 + }, + { + "epoch": 4.0628848800985295, + "grad_norm": 0.4226651191711426, + "learning_rate": 2.9856321839080458e-05, + "loss": 0.1022, + "step": 3505 + }, + { + "epoch": 4.064044048395276, + "grad_norm": 0.5442459583282471, + "learning_rate": 2.985057471264368e-05, + "loss": 0.1106, + "step": 3506 + }, + { + "epoch": 4.065203216692024, + "grad_norm": 0.3217725455760956, + "learning_rate": 2.9844827586206898e-05, + "loss": 0.1021, + "step": 3507 + }, + { + "epoch": 4.0663623849887705, + "grad_norm": 0.30229008197784424, + "learning_rate": 2.983908045977012e-05, + "loss": 0.1031, + "step": 3508 + }, + { + "epoch": 4.067521553285518, + "grad_norm": 0.3460632562637329, + "learning_rate": 2.9833333333333335e-05, + "loss": 0.111, + "step": 3509 + }, + { + "epoch": 4.068680721582265, + "grad_norm": 0.38224828243255615, + "learning_rate": 2.9827586206896553e-05, + "loss": 0.1013, + "step": 3510 + }, + { + "epoch": 4.069839889879012, + "grad_norm": 0.35836493968963623, + "learning_rate": 2.9821839080459775e-05, + "loss": 0.105, + "step": 3511 + }, + { + "epoch": 4.070999058175759, + "grad_norm": 0.293371319770813, + "learning_rate": 2.981609195402299e-05, + "loss": 0.093, + "step": 3512 + }, + { + "epoch": 4.072158226472506, + "grad_norm": 0.37686097621917725, + "learning_rate": 2.9810344827586208e-05, + "loss": 0.1018, + "step": 3513 + }, + { + "epoch": 4.073317394769253, + "grad_norm": 0.30317679047584534, + "learning_rate": 2.980459770114943e-05, + "loss": 0.1095, + "step": 3514 + }, + { + "epoch": 4.074476563066, + "grad_norm": 0.3032994866371155, + "learning_rate": 2.9798850574712644e-05, + "loss": 0.1063, + "step": 3515 + }, + { + "epoch": 4.075635731362747, + "grad_norm": 0.29661819338798523, + "learning_rate": 2.979310344827586e-05, + "loss": 0.1068, + "step": 3516 + }, + { + "epoch": 4.076794899659494, + "grad_norm": 0.34293368458747864, + "learning_rate": 2.9787356321839084e-05, + "loss": 0.1063, + "step": 3517 + }, + { + "epoch": 4.077954067956242, + "grad_norm": 0.3552305996417999, + "learning_rate": 2.97816091954023e-05, + "loss": 0.1002, + "step": 3518 + }, + { + "epoch": 4.079113236252988, + "grad_norm": 0.3097629249095917, + "learning_rate": 2.977586206896552e-05, + "loss": 0.0955, + "step": 3519 + }, + { + "epoch": 4.080272404549736, + "grad_norm": 0.35458773374557495, + "learning_rate": 2.9770114942528736e-05, + "loss": 0.098, + "step": 3520 + }, + { + "epoch": 4.0814315728464825, + "grad_norm": 0.30128052830696106, + "learning_rate": 2.9764367816091954e-05, + "loss": 0.1087, + "step": 3521 + }, + { + "epoch": 4.08259074114323, + "grad_norm": 0.407092422246933, + "learning_rate": 2.9758620689655176e-05, + "loss": 0.0927, + "step": 3522 + }, + { + "epoch": 4.083749909439977, + "grad_norm": 0.43123817443847656, + "learning_rate": 2.975287356321839e-05, + "loss": 0.1045, + "step": 3523 + }, + { + "epoch": 4.084909077736724, + "grad_norm": 0.47122377157211304, + "learning_rate": 2.974712643678161e-05, + "loss": 0.1043, + "step": 3524 + }, + { + "epoch": 4.086068246033471, + "grad_norm": 0.4324159324169159, + "learning_rate": 2.974137931034483e-05, + "loss": 0.1095, + "step": 3525 + }, + { + "epoch": 4.087227414330218, + "grad_norm": 0.3203006982803345, + "learning_rate": 2.9735632183908045e-05, + "loss": 0.0967, + "step": 3526 + }, + { + "epoch": 4.088386582626965, + "grad_norm": 0.45111727714538574, + "learning_rate": 2.9729885057471267e-05, + "loss": 0.0979, + "step": 3527 + }, + { + "epoch": 4.089545750923712, + "grad_norm": 0.3182992935180664, + "learning_rate": 2.9724137931034485e-05, + "loss": 0.0957, + "step": 3528 + }, + { + "epoch": 4.090704919220459, + "grad_norm": 0.392347127199173, + "learning_rate": 2.97183908045977e-05, + "loss": 0.0982, + "step": 3529 + }, + { + "epoch": 4.091864087517206, + "grad_norm": 0.41814011335372925, + "learning_rate": 2.9712643678160922e-05, + "loss": 0.1017, + "step": 3530 + }, + { + "epoch": 4.093023255813954, + "grad_norm": 0.3611200749874115, + "learning_rate": 2.970689655172414e-05, + "loss": 0.0953, + "step": 3531 + }, + { + "epoch": 4.0941824241107, + "grad_norm": 0.3499966859817505, + "learning_rate": 2.9701149425287355e-05, + "loss": 0.1012, + "step": 3532 + }, + { + "epoch": 4.095341592407448, + "grad_norm": 0.33209457993507385, + "learning_rate": 2.9695402298850577e-05, + "loss": 0.0989, + "step": 3533 + }, + { + "epoch": 4.0965007607041946, + "grad_norm": 0.421478807926178, + "learning_rate": 2.9689655172413795e-05, + "loss": 0.1052, + "step": 3534 + }, + { + "epoch": 4.097659929000942, + "grad_norm": 0.3154635727405548, + "learning_rate": 2.968390804597701e-05, + "loss": 0.0984, + "step": 3535 + }, + { + "epoch": 4.098819097297689, + "grad_norm": 0.7844545841217041, + "learning_rate": 2.9678160919540232e-05, + "loss": 0.1113, + "step": 3536 + }, + { + "epoch": 4.099978265594436, + "grad_norm": 0.3256261646747589, + "learning_rate": 2.967241379310345e-05, + "loss": 0.0994, + "step": 3537 + }, + { + "epoch": 4.101137433891183, + "grad_norm": 0.4824637770652771, + "learning_rate": 2.9666666666666672e-05, + "loss": 0.1117, + "step": 3538 + }, + { + "epoch": 4.102296602187931, + "grad_norm": 0.46031931042671204, + "learning_rate": 2.9660919540229887e-05, + "loss": 0.1103, + "step": 3539 + }, + { + "epoch": 4.103455770484677, + "grad_norm": 0.33370453119277954, + "learning_rate": 2.96551724137931e-05, + "loss": 0.1121, + "step": 3540 + }, + { + "epoch": 4.104614938781424, + "grad_norm": 0.4630499482154846, + "learning_rate": 2.9649425287356327e-05, + "loss": 0.1128, + "step": 3541 + }, + { + "epoch": 4.1057741070781715, + "grad_norm": 0.47765010595321655, + "learning_rate": 2.964367816091954e-05, + "loss": 0.0989, + "step": 3542 + }, + { + "epoch": 4.106933275374918, + "grad_norm": 0.2912958562374115, + "learning_rate": 2.9637931034482756e-05, + "loss": 0.0968, + "step": 3543 + }, + { + "epoch": 4.108092443671666, + "grad_norm": 0.4124128520488739, + "learning_rate": 2.9632183908045978e-05, + "loss": 0.1004, + "step": 3544 + }, + { + "epoch": 4.109251611968412, + "grad_norm": 0.38116535544395447, + "learning_rate": 2.9626436781609196e-05, + "loss": 0.1079, + "step": 3545 + }, + { + "epoch": 4.11041078026516, + "grad_norm": 0.35159099102020264, + "learning_rate": 2.9620689655172418e-05, + "loss": 0.105, + "step": 3546 + }, + { + "epoch": 4.111569948561907, + "grad_norm": 0.38807129859924316, + "learning_rate": 2.9614942528735633e-05, + "loss": 0.1012, + "step": 3547 + }, + { + "epoch": 4.112729116858654, + "grad_norm": 0.33173301815986633, + "learning_rate": 2.960919540229885e-05, + "loss": 0.1213, + "step": 3548 + }, + { + "epoch": 4.113888285155401, + "grad_norm": 0.2943272888660431, + "learning_rate": 2.9603448275862073e-05, + "loss": 0.1063, + "step": 3549 + }, + { + "epoch": 4.115047453452148, + "grad_norm": 0.4288482666015625, + "learning_rate": 2.9597701149425288e-05, + "loss": 0.1047, + "step": 3550 + }, + { + "epoch": 4.116206621748895, + "grad_norm": 0.3704341948032379, + "learning_rate": 2.9591954022988506e-05, + "loss": 0.1166, + "step": 3551 + }, + { + "epoch": 4.117365790045643, + "grad_norm": 0.34703579545021057, + "learning_rate": 2.9586206896551728e-05, + "loss": 0.1068, + "step": 3552 + }, + { + "epoch": 4.118524958342389, + "grad_norm": 0.4057508111000061, + "learning_rate": 2.9580459770114943e-05, + "loss": 0.1048, + "step": 3553 + }, + { + "epoch": 4.119684126639137, + "grad_norm": 0.30694398283958435, + "learning_rate": 2.957471264367816e-05, + "loss": 0.1057, + "step": 3554 + }, + { + "epoch": 4.1208432949358835, + "grad_norm": 0.42865851521492004, + "learning_rate": 2.9568965517241383e-05, + "loss": 0.1085, + "step": 3555 + }, + { + "epoch": 4.12200246323263, + "grad_norm": 0.3558603525161743, + "learning_rate": 2.9563218390804598e-05, + "loss": 0.1029, + "step": 3556 + }, + { + "epoch": 4.123161631529378, + "grad_norm": 0.40978485345840454, + "learning_rate": 2.955747126436782e-05, + "loss": 0.095, + "step": 3557 + }, + { + "epoch": 4.124320799826124, + "grad_norm": 0.3812257647514343, + "learning_rate": 2.9551724137931038e-05, + "loss": 0.1052, + "step": 3558 + }, + { + "epoch": 4.125479968122872, + "grad_norm": 0.3055720925331116, + "learning_rate": 2.9545977011494252e-05, + "loss": 0.1113, + "step": 3559 + }, + { + "epoch": 4.126639136419619, + "grad_norm": 0.2221103310585022, + "learning_rate": 2.9540229885057474e-05, + "loss": 0.0886, + "step": 3560 + }, + { + "epoch": 4.127798304716366, + "grad_norm": 0.3389085531234741, + "learning_rate": 2.953448275862069e-05, + "loss": 0.113, + "step": 3561 + }, + { + "epoch": 4.128957473013113, + "grad_norm": 0.3829638659954071, + "learning_rate": 2.9528735632183907e-05, + "loss": 0.1048, + "step": 3562 + }, + { + "epoch": 4.1301166413098604, + "grad_norm": 0.5124924182891846, + "learning_rate": 2.952298850574713e-05, + "loss": 0.0942, + "step": 3563 + }, + { + "epoch": 4.131275809606607, + "grad_norm": 0.2896119952201843, + "learning_rate": 2.9517241379310344e-05, + "loss": 0.1041, + "step": 3564 + }, + { + "epoch": 4.132434977903355, + "grad_norm": 0.39416196942329407, + "learning_rate": 2.9511494252873566e-05, + "loss": 0.1046, + "step": 3565 + }, + { + "epoch": 4.133594146200101, + "grad_norm": 0.4558121860027313, + "learning_rate": 2.9505747126436784e-05, + "loss": 0.1077, + "step": 3566 + }, + { + "epoch": 4.134753314496849, + "grad_norm": 0.5235021114349365, + "learning_rate": 2.95e-05, + "loss": 0.0916, + "step": 3567 + }, + { + "epoch": 4.135912482793596, + "grad_norm": 0.4148746728897095, + "learning_rate": 2.949425287356322e-05, + "loss": 0.1049, + "step": 3568 + }, + { + "epoch": 4.137071651090342, + "grad_norm": 0.3960752785205841, + "learning_rate": 2.948850574712644e-05, + "loss": 0.0955, + "step": 3569 + }, + { + "epoch": 4.13823081938709, + "grad_norm": 0.2922149896621704, + "learning_rate": 2.9482758620689654e-05, + "loss": 0.1046, + "step": 3570 + }, + { + "epoch": 4.1393899876838365, + "grad_norm": 0.461208313703537, + "learning_rate": 2.9477011494252875e-05, + "loss": 0.1062, + "step": 3571 + }, + { + "epoch": 4.140549155980584, + "grad_norm": 0.35071003437042236, + "learning_rate": 2.9471264367816094e-05, + "loss": 0.1028, + "step": 3572 + }, + { + "epoch": 4.141708324277331, + "grad_norm": 0.3092256486415863, + "learning_rate": 2.946551724137931e-05, + "loss": 0.0979, + "step": 3573 + }, + { + "epoch": 4.142867492574078, + "grad_norm": 0.36416730284690857, + "learning_rate": 2.945977011494253e-05, + "loss": 0.1096, + "step": 3574 + }, + { + "epoch": 4.144026660870825, + "grad_norm": 0.4897642135620117, + "learning_rate": 2.945402298850575e-05, + "loss": 0.0942, + "step": 3575 + }, + { + "epoch": 4.1451858291675725, + "grad_norm": 0.3243999481201172, + "learning_rate": 2.944827586206897e-05, + "loss": 0.1, + "step": 3576 + }, + { + "epoch": 4.146344997464319, + "grad_norm": 0.3881404399871826, + "learning_rate": 2.9442528735632185e-05, + "loss": 0.1136, + "step": 3577 + }, + { + "epoch": 4.147504165761067, + "grad_norm": 0.3916066884994507, + "learning_rate": 2.9436781609195403e-05, + "loss": 0.1066, + "step": 3578 + }, + { + "epoch": 4.148663334057813, + "grad_norm": 0.3146432638168335, + "learning_rate": 2.9431034482758625e-05, + "loss": 0.0874, + "step": 3579 + }, + { + "epoch": 4.149822502354561, + "grad_norm": 0.3404291272163391, + "learning_rate": 2.942528735632184e-05, + "loss": 0.095, + "step": 3580 + }, + { + "epoch": 4.150981670651308, + "grad_norm": 0.4161677658557892, + "learning_rate": 2.9419540229885055e-05, + "loss": 0.0965, + "step": 3581 + }, + { + "epoch": 4.152140838948055, + "grad_norm": 0.38748228549957275, + "learning_rate": 2.941379310344828e-05, + "loss": 0.1148, + "step": 3582 + }, + { + "epoch": 4.153300007244802, + "grad_norm": 0.3862541615962982, + "learning_rate": 2.9408045977011495e-05, + "loss": 0.0994, + "step": 3583 + }, + { + "epoch": 4.1544591755415485, + "grad_norm": 0.4990377724170685, + "learning_rate": 2.9402298850574716e-05, + "loss": 0.1023, + "step": 3584 + }, + { + "epoch": 4.155618343838296, + "grad_norm": 0.3841322362422943, + "learning_rate": 2.939655172413793e-05, + "loss": 0.1137, + "step": 3585 + }, + { + "epoch": 4.156777512135043, + "grad_norm": 0.2818942368030548, + "learning_rate": 2.939080459770115e-05, + "loss": 0.0971, + "step": 3586 + }, + { + "epoch": 4.15793668043179, + "grad_norm": 0.4144044518470764, + "learning_rate": 2.938505747126437e-05, + "loss": 0.1, + "step": 3587 + }, + { + "epoch": 4.159095848728537, + "grad_norm": 0.265598326921463, + "learning_rate": 2.9379310344827586e-05, + "loss": 0.0911, + "step": 3588 + }, + { + "epoch": 4.1602550170252846, + "grad_norm": 0.40067845582962036, + "learning_rate": 2.9373563218390805e-05, + "loss": 0.1078, + "step": 3589 + }, + { + "epoch": 4.161414185322031, + "grad_norm": 0.3143431842327118, + "learning_rate": 2.9367816091954026e-05, + "loss": 0.0986, + "step": 3590 + }, + { + "epoch": 4.162573353618779, + "grad_norm": 0.2772219181060791, + "learning_rate": 2.936206896551724e-05, + "loss": 0.0944, + "step": 3591 + }, + { + "epoch": 4.1637325219155255, + "grad_norm": 0.4990723729133606, + "learning_rate": 2.935632183908046e-05, + "loss": 0.0989, + "step": 3592 + }, + { + "epoch": 4.164891690212273, + "grad_norm": 0.3033507168292999, + "learning_rate": 2.935057471264368e-05, + "loss": 0.1002, + "step": 3593 + }, + { + "epoch": 4.16605085850902, + "grad_norm": 0.28623032569885254, + "learning_rate": 2.9344827586206896e-05, + "loss": 0.1109, + "step": 3594 + }, + { + "epoch": 4.167210026805767, + "grad_norm": 0.28263723850250244, + "learning_rate": 2.9339080459770118e-05, + "loss": 0.1128, + "step": 3595 + }, + { + "epoch": 4.168369195102514, + "grad_norm": 0.7730621099472046, + "learning_rate": 2.9333333333333336e-05, + "loss": 0.1082, + "step": 3596 + }, + { + "epoch": 4.169528363399261, + "grad_norm": 0.306794673204422, + "learning_rate": 2.932758620689655e-05, + "loss": 0.1018, + "step": 3597 + }, + { + "epoch": 4.170687531696008, + "grad_norm": 0.3806617558002472, + "learning_rate": 2.9321839080459773e-05, + "loss": 0.1206, + "step": 3598 + }, + { + "epoch": 4.171846699992755, + "grad_norm": 0.3783591687679291, + "learning_rate": 2.931609195402299e-05, + "loss": 0.105, + "step": 3599 + }, + { + "epoch": 4.173005868289502, + "grad_norm": 0.49170562624931335, + "learning_rate": 2.9310344827586206e-05, + "loss": 0.1053, + "step": 3600 + }, + { + "epoch": 4.174165036586249, + "grad_norm": 0.36278071999549866, + "learning_rate": 2.9304597701149427e-05, + "loss": 0.1006, + "step": 3601 + }, + { + "epoch": 4.175324204882997, + "grad_norm": 0.4412696957588196, + "learning_rate": 2.9298850574712646e-05, + "loss": 0.1136, + "step": 3602 + }, + { + "epoch": 4.176483373179743, + "grad_norm": 0.33460313081741333, + "learning_rate": 2.9293103448275867e-05, + "loss": 0.1022, + "step": 3603 + }, + { + "epoch": 4.177642541476491, + "grad_norm": 0.32124656438827515, + "learning_rate": 2.9287356321839082e-05, + "loss": 0.1033, + "step": 3604 + }, + { + "epoch": 4.1788017097732375, + "grad_norm": 0.3389214873313904, + "learning_rate": 2.9281609195402297e-05, + "loss": 0.1074, + "step": 3605 + }, + { + "epoch": 4.179960878069985, + "grad_norm": 0.3551189601421356, + "learning_rate": 2.927586206896552e-05, + "loss": 0.0946, + "step": 3606 + }, + { + "epoch": 4.181120046366732, + "grad_norm": 0.3766753673553467, + "learning_rate": 2.9270114942528737e-05, + "loss": 0.0939, + "step": 3607 + }, + { + "epoch": 4.182279214663479, + "grad_norm": 0.362870454788208, + "learning_rate": 2.9264367816091952e-05, + "loss": 0.1079, + "step": 3608 + }, + { + "epoch": 4.183438382960226, + "grad_norm": 0.44273027777671814, + "learning_rate": 2.9258620689655174e-05, + "loss": 0.0969, + "step": 3609 + }, + { + "epoch": 4.1845975512569735, + "grad_norm": 0.3742962181568146, + "learning_rate": 2.9252873563218392e-05, + "loss": 0.1129, + "step": 3610 + }, + { + "epoch": 4.18575671955372, + "grad_norm": 0.4322942793369293, + "learning_rate": 2.9247126436781614e-05, + "loss": 0.1058, + "step": 3611 + }, + { + "epoch": 4.186915887850467, + "grad_norm": 0.44302043318748474, + "learning_rate": 2.924137931034483e-05, + "loss": 0.1062, + "step": 3612 + }, + { + "epoch": 4.188075056147214, + "grad_norm": 0.32489603757858276, + "learning_rate": 2.9235632183908047e-05, + "loss": 0.0974, + "step": 3613 + }, + { + "epoch": 4.189234224443961, + "grad_norm": 0.282850444316864, + "learning_rate": 2.922988505747127e-05, + "loss": 0.0941, + "step": 3614 + }, + { + "epoch": 4.190393392740709, + "grad_norm": 0.27374568581581116, + "learning_rate": 2.9224137931034483e-05, + "loss": 0.0963, + "step": 3615 + }, + { + "epoch": 4.191552561037455, + "grad_norm": 0.31600022315979004, + "learning_rate": 2.9218390804597702e-05, + "loss": 0.1027, + "step": 3616 + }, + { + "epoch": 4.192711729334203, + "grad_norm": 0.24971164762973785, + "learning_rate": 2.9212643678160923e-05, + "loss": 0.1103, + "step": 3617 + }, + { + "epoch": 4.19387089763095, + "grad_norm": 0.3458114564418793, + "learning_rate": 2.920689655172414e-05, + "loss": 0.0996, + "step": 3618 + }, + { + "epoch": 4.195030065927697, + "grad_norm": 0.3708142340183258, + "learning_rate": 2.9201149425287357e-05, + "loss": 0.0953, + "step": 3619 + }, + { + "epoch": 4.196189234224444, + "grad_norm": 0.3767480254173279, + "learning_rate": 2.919540229885058e-05, + "loss": 0.1081, + "step": 3620 + }, + { + "epoch": 4.197348402521191, + "grad_norm": 0.34636861085891724, + "learning_rate": 2.9189655172413793e-05, + "loss": 0.0996, + "step": 3621 + }, + { + "epoch": 4.198507570817938, + "grad_norm": 0.4181584119796753, + "learning_rate": 2.9183908045977015e-05, + "loss": 0.0951, + "step": 3622 + }, + { + "epoch": 4.199666739114686, + "grad_norm": 0.4052351117134094, + "learning_rate": 2.9178160919540233e-05, + "loss": 0.1059, + "step": 3623 + }, + { + "epoch": 4.200825907411432, + "grad_norm": 0.28023508191108704, + "learning_rate": 2.9172413793103448e-05, + "loss": 0.0992, + "step": 3624 + }, + { + "epoch": 4.20198507570818, + "grad_norm": 0.3520253896713257, + "learning_rate": 2.916666666666667e-05, + "loss": 0.107, + "step": 3625 + }, + { + "epoch": 4.2031442440049265, + "grad_norm": 0.3649381697177887, + "learning_rate": 2.9160919540229885e-05, + "loss": 0.114, + "step": 3626 + }, + { + "epoch": 4.204303412301673, + "grad_norm": 0.36334317922592163, + "learning_rate": 2.9155172413793103e-05, + "loss": 0.1012, + "step": 3627 + }, + { + "epoch": 4.205462580598421, + "grad_norm": 0.3084128797054291, + "learning_rate": 2.9149425287356325e-05, + "loss": 0.1035, + "step": 3628 + }, + { + "epoch": 4.206621748895167, + "grad_norm": 0.31549155712127686, + "learning_rate": 2.914367816091954e-05, + "loss": 0.1022, + "step": 3629 + }, + { + "epoch": 4.207780917191915, + "grad_norm": 0.33115145564079285, + "learning_rate": 2.913793103448276e-05, + "loss": 0.1034, + "step": 3630 + }, + { + "epoch": 4.208940085488662, + "grad_norm": 0.321877121925354, + "learning_rate": 2.913218390804598e-05, + "loss": 0.0948, + "step": 3631 + }, + { + "epoch": 4.210099253785409, + "grad_norm": 0.34342601895332336, + "learning_rate": 2.9126436781609194e-05, + "loss": 0.0987, + "step": 3632 + }, + { + "epoch": 4.211258422082156, + "grad_norm": 0.3793211579322815, + "learning_rate": 2.9120689655172416e-05, + "loss": 0.1032, + "step": 3633 + }, + { + "epoch": 4.212417590378903, + "grad_norm": 0.46431970596313477, + "learning_rate": 2.9114942528735634e-05, + "loss": 0.0997, + "step": 3634 + }, + { + "epoch": 4.21357675867565, + "grad_norm": 0.27705472707748413, + "learning_rate": 2.910919540229885e-05, + "loss": 0.0957, + "step": 3635 + }, + { + "epoch": 4.214735926972398, + "grad_norm": 0.5416485667228699, + "learning_rate": 2.910344827586207e-05, + "loss": 0.1021, + "step": 3636 + }, + { + "epoch": 4.215895095269144, + "grad_norm": 0.3653283715248108, + "learning_rate": 2.909770114942529e-05, + "loss": 0.0992, + "step": 3637 + }, + { + "epoch": 4.217054263565892, + "grad_norm": 0.34338125586509705, + "learning_rate": 2.9091954022988504e-05, + "loss": 0.1045, + "step": 3638 + }, + { + "epoch": 4.2182134318626385, + "grad_norm": 0.39666372537612915, + "learning_rate": 2.9086206896551726e-05, + "loss": 0.1029, + "step": 3639 + }, + { + "epoch": 4.219372600159385, + "grad_norm": 0.46274009346961975, + "learning_rate": 2.9080459770114944e-05, + "loss": 0.1097, + "step": 3640 + }, + { + "epoch": 4.220531768456133, + "grad_norm": 0.3507300615310669, + "learning_rate": 2.9074712643678166e-05, + "loss": 0.1001, + "step": 3641 + }, + { + "epoch": 4.221690936752879, + "grad_norm": 0.5658450126647949, + "learning_rate": 2.906896551724138e-05, + "loss": 0.1062, + "step": 3642 + }, + { + "epoch": 4.222850105049627, + "grad_norm": 0.33815431594848633, + "learning_rate": 2.90632183908046e-05, + "loss": 0.1002, + "step": 3643 + }, + { + "epoch": 4.224009273346374, + "grad_norm": 0.3951635956764221, + "learning_rate": 2.905747126436782e-05, + "loss": 0.0966, + "step": 3644 + }, + { + "epoch": 4.225168441643121, + "grad_norm": 0.4805266261100769, + "learning_rate": 2.9051724137931036e-05, + "loss": 0.1122, + "step": 3645 + }, + { + "epoch": 4.226327609939868, + "grad_norm": 0.3607132136821747, + "learning_rate": 2.904597701149425e-05, + "loss": 0.0924, + "step": 3646 + }, + { + "epoch": 4.2274867782366155, + "grad_norm": 0.3858696520328522, + "learning_rate": 2.9040229885057476e-05, + "loss": 0.0954, + "step": 3647 + }, + { + "epoch": 4.228645946533362, + "grad_norm": 0.35056713223457336, + "learning_rate": 2.903448275862069e-05, + "loss": 0.1034, + "step": 3648 + }, + { + "epoch": 4.22980511483011, + "grad_norm": 0.45169034600257874, + "learning_rate": 2.9028735632183912e-05, + "loss": 0.1089, + "step": 3649 + }, + { + "epoch": 4.230964283126856, + "grad_norm": 0.33204302191734314, + "learning_rate": 2.9022988505747127e-05, + "loss": 0.1078, + "step": 3650 + }, + { + "epoch": 4.232123451423604, + "grad_norm": 0.4049871563911438, + "learning_rate": 2.9017241379310345e-05, + "loss": 0.1022, + "step": 3651 + }, + { + "epoch": 4.233282619720351, + "grad_norm": 0.37815043330192566, + "learning_rate": 2.9011494252873567e-05, + "loss": 0.1025, + "step": 3652 + }, + { + "epoch": 4.234441788017098, + "grad_norm": 0.6017569899559021, + "learning_rate": 2.9005747126436782e-05, + "loss": 0.1095, + "step": 3653 + }, + { + "epoch": 4.235600956313845, + "grad_norm": 0.44660142064094543, + "learning_rate": 2.9e-05, + "loss": 0.1027, + "step": 3654 + }, + { + "epoch": 4.2367601246105915, + "grad_norm": 0.22865894436836243, + "learning_rate": 2.8994252873563222e-05, + "loss": 0.0953, + "step": 3655 + }, + { + "epoch": 4.237919292907339, + "grad_norm": 0.28317615389823914, + "learning_rate": 2.8988505747126437e-05, + "loss": 0.1009, + "step": 3656 + }, + { + "epoch": 4.239078461204086, + "grad_norm": 0.30559006333351135, + "learning_rate": 2.8982758620689655e-05, + "loss": 0.1079, + "step": 3657 + }, + { + "epoch": 4.240237629500833, + "grad_norm": 0.3804382383823395, + "learning_rate": 2.8977011494252877e-05, + "loss": 0.115, + "step": 3658 + }, + { + "epoch": 4.24139679779758, + "grad_norm": 0.4205724000930786, + "learning_rate": 2.897126436781609e-05, + "loss": 0.1013, + "step": 3659 + }, + { + "epoch": 4.2425559660943275, + "grad_norm": 0.3758813142776489, + "learning_rate": 2.8965517241379313e-05, + "loss": 0.103, + "step": 3660 + }, + { + "epoch": 4.243715134391074, + "grad_norm": 0.6086719632148743, + "learning_rate": 2.895977011494253e-05, + "loss": 0.1231, + "step": 3661 + }, + { + "epoch": 4.244874302687822, + "grad_norm": 0.4006671607494354, + "learning_rate": 2.8954022988505746e-05, + "loss": 0.0924, + "step": 3662 + }, + { + "epoch": 4.246033470984568, + "grad_norm": 0.41011175513267517, + "learning_rate": 2.8948275862068968e-05, + "loss": 0.0992, + "step": 3663 + }, + { + "epoch": 4.247192639281316, + "grad_norm": 0.4147280156612396, + "learning_rate": 2.8942528735632186e-05, + "loss": 0.1034, + "step": 3664 + }, + { + "epoch": 4.248351807578063, + "grad_norm": 0.32018283009529114, + "learning_rate": 2.89367816091954e-05, + "loss": 0.0927, + "step": 3665 + }, + { + "epoch": 4.24951097587481, + "grad_norm": 0.3739756643772125, + "learning_rate": 2.8931034482758623e-05, + "loss": 0.11, + "step": 3666 + }, + { + "epoch": 4.250670144171557, + "grad_norm": 0.3220639228820801, + "learning_rate": 2.8925287356321838e-05, + "loss": 0.0968, + "step": 3667 + }, + { + "epoch": 4.251829312468304, + "grad_norm": 0.3207123279571533, + "learning_rate": 2.8919540229885063e-05, + "loss": 0.1002, + "step": 3668 + }, + { + "epoch": 4.252988480765051, + "grad_norm": 0.38051706552505493, + "learning_rate": 2.8913793103448278e-05, + "loss": 0.1089, + "step": 3669 + }, + { + "epoch": 4.254147649061798, + "grad_norm": 0.3652991056442261, + "learning_rate": 2.8908045977011493e-05, + "loss": 0.1044, + "step": 3670 + }, + { + "epoch": 4.255306817358545, + "grad_norm": 0.4439450204372406, + "learning_rate": 2.8902298850574714e-05, + "loss": 0.1028, + "step": 3671 + }, + { + "epoch": 4.256465985655292, + "grad_norm": 0.2599686086177826, + "learning_rate": 2.8896551724137933e-05, + "loss": 0.0919, + "step": 3672 + }, + { + "epoch": 4.2576251539520396, + "grad_norm": 0.33243459463119507, + "learning_rate": 2.8890804597701148e-05, + "loss": 0.1044, + "step": 3673 + }, + { + "epoch": 4.258784322248786, + "grad_norm": 0.305882066488266, + "learning_rate": 2.888505747126437e-05, + "loss": 0.0968, + "step": 3674 + }, + { + "epoch": 4.259943490545534, + "grad_norm": 0.403209388256073, + "learning_rate": 2.8879310344827588e-05, + "loss": 0.1124, + "step": 3675 + }, + { + "epoch": 4.2611026588422805, + "grad_norm": 0.32304686307907104, + "learning_rate": 2.8873563218390803e-05, + "loss": 0.0908, + "step": 3676 + }, + { + "epoch": 4.262261827139028, + "grad_norm": 0.3692949116230011, + "learning_rate": 2.8867816091954024e-05, + "loss": 0.1067, + "step": 3677 + }, + { + "epoch": 4.263420995435775, + "grad_norm": 0.2813093066215515, + "learning_rate": 2.8862068965517243e-05, + "loss": 0.1046, + "step": 3678 + }, + { + "epoch": 4.264580163732522, + "grad_norm": 0.28762635588645935, + "learning_rate": 2.8856321839080464e-05, + "loss": 0.0968, + "step": 3679 + }, + { + "epoch": 4.265739332029269, + "grad_norm": 0.447125643491745, + "learning_rate": 2.885057471264368e-05, + "loss": 0.115, + "step": 3680 + }, + { + "epoch": 4.2668985003260165, + "grad_norm": 0.32157212495803833, + "learning_rate": 2.8844827586206897e-05, + "loss": 0.0963, + "step": 3681 + }, + { + "epoch": 4.268057668622763, + "grad_norm": 0.5032519698143005, + "learning_rate": 2.883908045977012e-05, + "loss": 0.107, + "step": 3682 + }, + { + "epoch": 4.26921683691951, + "grad_norm": 0.2923990786075592, + "learning_rate": 2.8833333333333334e-05, + "loss": 0.0884, + "step": 3683 + }, + { + "epoch": 4.270376005216257, + "grad_norm": 0.4205268621444702, + "learning_rate": 2.8827586206896552e-05, + "loss": 0.1035, + "step": 3684 + }, + { + "epoch": 4.271535173513004, + "grad_norm": 0.343766987323761, + "learning_rate": 2.8821839080459774e-05, + "loss": 0.1016, + "step": 3685 + }, + { + "epoch": 4.272694341809752, + "grad_norm": 0.39418303966522217, + "learning_rate": 2.881609195402299e-05, + "loss": 0.1057, + "step": 3686 + }, + { + "epoch": 4.273853510106498, + "grad_norm": 0.41105031967163086, + "learning_rate": 2.881034482758621e-05, + "loss": 0.0932, + "step": 3687 + }, + { + "epoch": 4.275012678403246, + "grad_norm": 0.32232189178466797, + "learning_rate": 2.880459770114943e-05, + "loss": 0.1043, + "step": 3688 + }, + { + "epoch": 4.2761718466999925, + "grad_norm": 0.49274659156799316, + "learning_rate": 2.8798850574712644e-05, + "loss": 0.1039, + "step": 3689 + }, + { + "epoch": 4.27733101499674, + "grad_norm": 0.5565973520278931, + "learning_rate": 2.8793103448275865e-05, + "loss": 0.1054, + "step": 3690 + }, + { + "epoch": 4.278490183293487, + "grad_norm": 0.366921603679657, + "learning_rate": 2.878735632183908e-05, + "loss": 0.1087, + "step": 3691 + }, + { + "epoch": 4.279649351590234, + "grad_norm": 0.3326165974140167, + "learning_rate": 2.87816091954023e-05, + "loss": 0.0971, + "step": 3692 + }, + { + "epoch": 4.280808519886981, + "grad_norm": 0.46312418580055237, + "learning_rate": 2.877586206896552e-05, + "loss": 0.1038, + "step": 3693 + }, + { + "epoch": 4.2819676881837285, + "grad_norm": 0.2820906639099121, + "learning_rate": 2.8770114942528735e-05, + "loss": 0.0958, + "step": 3694 + }, + { + "epoch": 4.283126856480475, + "grad_norm": 0.3850758969783783, + "learning_rate": 2.8764367816091953e-05, + "loss": 0.1098, + "step": 3695 + }, + { + "epoch": 4.284286024777223, + "grad_norm": 0.4358065128326416, + "learning_rate": 2.8758620689655175e-05, + "loss": 0.1004, + "step": 3696 + }, + { + "epoch": 4.285445193073969, + "grad_norm": 0.3014400601387024, + "learning_rate": 2.875287356321839e-05, + "loss": 0.1031, + "step": 3697 + }, + { + "epoch": 4.286604361370716, + "grad_norm": 0.30815666913986206, + "learning_rate": 2.8747126436781612e-05, + "loss": 0.1043, + "step": 3698 + }, + { + "epoch": 4.287763529667464, + "grad_norm": 0.39481833577156067, + "learning_rate": 2.874137931034483e-05, + "loss": 0.1068, + "step": 3699 + }, + { + "epoch": 4.28892269796421, + "grad_norm": 0.28407761454582214, + "learning_rate": 2.8735632183908045e-05, + "loss": 0.0992, + "step": 3700 + }, + { + "epoch": 4.290081866260958, + "grad_norm": 0.3631652891635895, + "learning_rate": 2.8729885057471267e-05, + "loss": 0.1069, + "step": 3701 + }, + { + "epoch": 4.291241034557705, + "grad_norm": 0.3379894495010376, + "learning_rate": 2.8724137931034485e-05, + "loss": 0.1074, + "step": 3702 + }, + { + "epoch": 4.292400202854452, + "grad_norm": 0.29723644256591797, + "learning_rate": 2.87183908045977e-05, + "loss": 0.1051, + "step": 3703 + }, + { + "epoch": 4.293559371151199, + "grad_norm": 0.4353717863559723, + "learning_rate": 2.871264367816092e-05, + "loss": 0.1067, + "step": 3704 + }, + { + "epoch": 4.294718539447946, + "grad_norm": 0.30414873361587524, + "learning_rate": 2.870689655172414e-05, + "loss": 0.0915, + "step": 3705 + }, + { + "epoch": 4.295877707744693, + "grad_norm": 0.5142664909362793, + "learning_rate": 2.870114942528736e-05, + "loss": 0.1019, + "step": 3706 + }, + { + "epoch": 4.297036876041441, + "grad_norm": 0.47813358902931213, + "learning_rate": 2.8695402298850576e-05, + "loss": 0.0917, + "step": 3707 + }, + { + "epoch": 4.298196044338187, + "grad_norm": 0.418399840593338, + "learning_rate": 2.8689655172413795e-05, + "loss": 0.1045, + "step": 3708 + }, + { + "epoch": 4.299355212634935, + "grad_norm": 0.36528536677360535, + "learning_rate": 2.8683908045977016e-05, + "loss": 0.112, + "step": 3709 + }, + { + "epoch": 4.3005143809316815, + "grad_norm": 0.41568946838378906, + "learning_rate": 2.867816091954023e-05, + "loss": 0.0988, + "step": 3710 + }, + { + "epoch": 4.301673549228429, + "grad_norm": 0.3442577123641968, + "learning_rate": 2.8672413793103446e-05, + "loss": 0.0949, + "step": 3711 + }, + { + "epoch": 4.302832717525176, + "grad_norm": 0.4584302604198456, + "learning_rate": 2.8666666666666668e-05, + "loss": 0.1037, + "step": 3712 + }, + { + "epoch": 4.303991885821922, + "grad_norm": 0.36319372057914734, + "learning_rate": 2.8660919540229886e-05, + "loss": 0.1082, + "step": 3713 + }, + { + "epoch": 4.30515105411867, + "grad_norm": 0.2930583953857422, + "learning_rate": 2.86551724137931e-05, + "loss": 0.0955, + "step": 3714 + }, + { + "epoch": 4.306310222415417, + "grad_norm": 0.4411957859992981, + "learning_rate": 2.8649425287356323e-05, + "loss": 0.1108, + "step": 3715 + }, + { + "epoch": 4.307469390712164, + "grad_norm": 0.3296014070510864, + "learning_rate": 2.864367816091954e-05, + "loss": 0.1025, + "step": 3716 + }, + { + "epoch": 4.308628559008911, + "grad_norm": 0.3820302188396454, + "learning_rate": 2.8637931034482763e-05, + "loss": 0.1105, + "step": 3717 + }, + { + "epoch": 4.309787727305658, + "grad_norm": 0.30717596411705017, + "learning_rate": 2.8632183908045978e-05, + "loss": 0.1037, + "step": 3718 + }, + { + "epoch": 4.310946895602405, + "grad_norm": 0.4110417664051056, + "learning_rate": 2.8626436781609196e-05, + "loss": 0.1076, + "step": 3719 + }, + { + "epoch": 4.312106063899153, + "grad_norm": 0.3443779945373535, + "learning_rate": 2.8620689655172417e-05, + "loss": 0.0982, + "step": 3720 + }, + { + "epoch": 4.313265232195899, + "grad_norm": 0.2677842080593109, + "learning_rate": 2.8614942528735632e-05, + "loss": 0.1009, + "step": 3721 + }, + { + "epoch": 4.314424400492647, + "grad_norm": 0.28737112879753113, + "learning_rate": 2.860919540229885e-05, + "loss": 0.0953, + "step": 3722 + }, + { + "epoch": 4.3155835687893935, + "grad_norm": 0.3234851658344269, + "learning_rate": 2.8603448275862072e-05, + "loss": 0.0992, + "step": 3723 + }, + { + "epoch": 4.316742737086141, + "grad_norm": 0.2924320101737976, + "learning_rate": 2.8597701149425287e-05, + "loss": 0.1053, + "step": 3724 + }, + { + "epoch": 4.317901905382888, + "grad_norm": 0.42933475971221924, + "learning_rate": 2.859195402298851e-05, + "loss": 0.1063, + "step": 3725 + }, + { + "epoch": 4.319061073679634, + "grad_norm": 0.34258314967155457, + "learning_rate": 2.8586206896551727e-05, + "loss": 0.1059, + "step": 3726 + }, + { + "epoch": 4.320220241976382, + "grad_norm": 0.3477523624897003, + "learning_rate": 2.8580459770114942e-05, + "loss": 0.0947, + "step": 3727 + }, + { + "epoch": 4.321379410273129, + "grad_norm": 0.5333256125450134, + "learning_rate": 2.8574712643678164e-05, + "loss": 0.1111, + "step": 3728 + }, + { + "epoch": 4.322538578569876, + "grad_norm": 0.3772802948951721, + "learning_rate": 2.8568965517241382e-05, + "loss": 0.0991, + "step": 3729 + }, + { + "epoch": 4.323697746866623, + "grad_norm": 0.415400892496109, + "learning_rate": 2.8563218390804597e-05, + "loss": 0.108, + "step": 3730 + }, + { + "epoch": 4.3248569151633705, + "grad_norm": 0.34160271286964417, + "learning_rate": 2.855747126436782e-05, + "loss": 0.0992, + "step": 3731 + }, + { + "epoch": 4.326016083460117, + "grad_norm": 0.29234248399734497, + "learning_rate": 2.8551724137931034e-05, + "loss": 0.1012, + "step": 3732 + }, + { + "epoch": 4.327175251756865, + "grad_norm": 0.36780115962028503, + "learning_rate": 2.8545977011494252e-05, + "loss": 0.1029, + "step": 3733 + }, + { + "epoch": 4.328334420053611, + "grad_norm": 0.3559337556362152, + "learning_rate": 2.8540229885057474e-05, + "loss": 0.116, + "step": 3734 + }, + { + "epoch": 4.329493588350359, + "grad_norm": 0.35530203580856323, + "learning_rate": 2.853448275862069e-05, + "loss": 0.1117, + "step": 3735 + }, + { + "epoch": 4.330652756647106, + "grad_norm": 0.39250123500823975, + "learning_rate": 2.852873563218391e-05, + "loss": 0.1021, + "step": 3736 + }, + { + "epoch": 4.331811924943853, + "grad_norm": 0.3159255087375641, + "learning_rate": 2.852298850574713e-05, + "loss": 0.1061, + "step": 3737 + }, + { + "epoch": 4.3329710932406, + "grad_norm": 0.28667929768562317, + "learning_rate": 2.8517241379310343e-05, + "loss": 0.0963, + "step": 3738 + }, + { + "epoch": 4.3341302615373465, + "grad_norm": 0.49704504013061523, + "learning_rate": 2.8511494252873565e-05, + "loss": 0.1079, + "step": 3739 + }, + { + "epoch": 4.335289429834094, + "grad_norm": 0.383167564868927, + "learning_rate": 2.8505747126436783e-05, + "loss": 0.1031, + "step": 3740 + }, + { + "epoch": 4.336448598130841, + "grad_norm": 0.28818759322166443, + "learning_rate": 2.8499999999999998e-05, + "loss": 0.1103, + "step": 3741 + }, + { + "epoch": 4.337607766427588, + "grad_norm": 0.3614335358142853, + "learning_rate": 2.849425287356322e-05, + "loss": 0.1032, + "step": 3742 + }, + { + "epoch": 4.338766934724335, + "grad_norm": 0.3309556841850281, + "learning_rate": 2.8488505747126438e-05, + "loss": 0.1053, + "step": 3743 + }, + { + "epoch": 4.3399261030210825, + "grad_norm": 0.4003108739852905, + "learning_rate": 2.848275862068966e-05, + "loss": 0.1038, + "step": 3744 + }, + { + "epoch": 4.341085271317829, + "grad_norm": 0.30309104919433594, + "learning_rate": 2.8477011494252875e-05, + "loss": 0.1019, + "step": 3745 + }, + { + "epoch": 4.342244439614577, + "grad_norm": 0.3004578649997711, + "learning_rate": 2.8471264367816093e-05, + "loss": 0.0911, + "step": 3746 + }, + { + "epoch": 4.343403607911323, + "grad_norm": 0.3777802586555481, + "learning_rate": 2.8465517241379315e-05, + "loss": 0.0995, + "step": 3747 + }, + { + "epoch": 4.344562776208071, + "grad_norm": 0.29196664690971375, + "learning_rate": 2.845977011494253e-05, + "loss": 0.101, + "step": 3748 + }, + { + "epoch": 4.345721944504818, + "grad_norm": 0.4231297969818115, + "learning_rate": 2.8454022988505748e-05, + "loss": 0.1033, + "step": 3749 + }, + { + "epoch": 4.346881112801565, + "grad_norm": 0.38380828499794006, + "learning_rate": 2.844827586206897e-05, + "loss": 0.1193, + "step": 3750 + }, + { + "epoch": 4.348040281098312, + "grad_norm": 0.37438878417015076, + "learning_rate": 2.8442528735632184e-05, + "loss": 0.0972, + "step": 3751 + }, + { + "epoch": 4.349199449395059, + "grad_norm": 0.29754364490509033, + "learning_rate": 2.84367816091954e-05, + "loss": 0.0936, + "step": 3752 + }, + { + "epoch": 4.350358617691806, + "grad_norm": 0.2982840836048126, + "learning_rate": 2.8431034482758624e-05, + "loss": 0.0892, + "step": 3753 + }, + { + "epoch": 4.351517785988554, + "grad_norm": 0.4285561740398407, + "learning_rate": 2.842528735632184e-05, + "loss": 0.0987, + "step": 3754 + }, + { + "epoch": 4.3526769542853, + "grad_norm": 0.356584370136261, + "learning_rate": 2.841954022988506e-05, + "loss": 0.1078, + "step": 3755 + }, + { + "epoch": 4.353836122582047, + "grad_norm": 0.4146307408809662, + "learning_rate": 2.8413793103448276e-05, + "loss": 0.1098, + "step": 3756 + }, + { + "epoch": 4.354995290878795, + "grad_norm": 0.39184170961380005, + "learning_rate": 2.8408045977011494e-05, + "loss": 0.0973, + "step": 3757 + }, + { + "epoch": 4.356154459175541, + "grad_norm": 0.38053470849990845, + "learning_rate": 2.8402298850574716e-05, + "loss": 0.106, + "step": 3758 + }, + { + "epoch": 4.357313627472289, + "grad_norm": 0.4440837800502777, + "learning_rate": 2.839655172413793e-05, + "loss": 0.1195, + "step": 3759 + }, + { + "epoch": 4.3584727957690355, + "grad_norm": 0.4450792670249939, + "learning_rate": 2.839080459770115e-05, + "loss": 0.1102, + "step": 3760 + }, + { + "epoch": 4.359631964065783, + "grad_norm": 0.29842203855514526, + "learning_rate": 2.838505747126437e-05, + "loss": 0.1029, + "step": 3761 + }, + { + "epoch": 4.36079113236253, + "grad_norm": 0.35125017166137695, + "learning_rate": 2.8379310344827586e-05, + "loss": 0.1055, + "step": 3762 + }, + { + "epoch": 4.361950300659277, + "grad_norm": 0.43805792927742004, + "learning_rate": 2.8373563218390807e-05, + "loss": 0.1018, + "step": 3763 + }, + { + "epoch": 4.363109468956024, + "grad_norm": 0.4038124978542328, + "learning_rate": 2.8367816091954026e-05, + "loss": 0.1019, + "step": 3764 + }, + { + "epoch": 4.3642686372527715, + "grad_norm": 0.3258083462715149, + "learning_rate": 2.836206896551724e-05, + "loss": 0.0994, + "step": 3765 + }, + { + "epoch": 4.365427805549518, + "grad_norm": 0.3961776793003082, + "learning_rate": 2.8356321839080462e-05, + "loss": 0.1168, + "step": 3766 + }, + { + "epoch": 4.366586973846266, + "grad_norm": 0.39625585079193115, + "learning_rate": 2.835057471264368e-05, + "loss": 0.1075, + "step": 3767 + }, + { + "epoch": 4.367746142143012, + "grad_norm": 0.3018212914466858, + "learning_rate": 2.8344827586206895e-05, + "loss": 0.1109, + "step": 3768 + }, + { + "epoch": 4.368905310439759, + "grad_norm": 0.32641318440437317, + "learning_rate": 2.8339080459770117e-05, + "loss": 0.1058, + "step": 3769 + }, + { + "epoch": 4.370064478736507, + "grad_norm": 0.28830480575561523, + "learning_rate": 2.8333333333333335e-05, + "loss": 0.0888, + "step": 3770 + }, + { + "epoch": 4.371223647033253, + "grad_norm": 0.299063116312027, + "learning_rate": 2.8327586206896557e-05, + "loss": 0.0994, + "step": 3771 + }, + { + "epoch": 4.372382815330001, + "grad_norm": 0.3458064794540405, + "learning_rate": 2.8321839080459772e-05, + "loss": 0.099, + "step": 3772 + }, + { + "epoch": 4.3735419836267475, + "grad_norm": 0.44503387808799744, + "learning_rate": 2.8316091954022987e-05, + "loss": 0.1047, + "step": 3773 + }, + { + "epoch": 4.374701151923495, + "grad_norm": 0.34976091980934143, + "learning_rate": 2.8310344827586212e-05, + "loss": 0.1, + "step": 3774 + }, + { + "epoch": 4.375860320220242, + "grad_norm": 0.6220278739929199, + "learning_rate": 2.8304597701149427e-05, + "loss": 0.1055, + "step": 3775 + }, + { + "epoch": 4.377019488516989, + "grad_norm": 0.5796971917152405, + "learning_rate": 2.8298850574712642e-05, + "loss": 0.1164, + "step": 3776 + }, + { + "epoch": 4.378178656813736, + "grad_norm": 0.38796380162239075, + "learning_rate": 2.8293103448275863e-05, + "loss": 0.0984, + "step": 3777 + }, + { + "epoch": 4.3793378251104835, + "grad_norm": 0.33590441942214966, + "learning_rate": 2.828735632183908e-05, + "loss": 0.1074, + "step": 3778 + }, + { + "epoch": 4.38049699340723, + "grad_norm": 0.3619459569454193, + "learning_rate": 2.8281609195402297e-05, + "loss": 0.1002, + "step": 3779 + }, + { + "epoch": 4.381656161703978, + "grad_norm": 0.27559658885002136, + "learning_rate": 2.8275862068965518e-05, + "loss": 0.098, + "step": 3780 + }, + { + "epoch": 4.382815330000724, + "grad_norm": 0.25252318382263184, + "learning_rate": 2.8270114942528737e-05, + "loss": 0.0882, + "step": 3781 + }, + { + "epoch": 4.383974498297471, + "grad_norm": 0.35869985818862915, + "learning_rate": 2.8264367816091958e-05, + "loss": 0.106, + "step": 3782 + }, + { + "epoch": 4.385133666594219, + "grad_norm": 0.4135657250881195, + "learning_rate": 2.8258620689655173e-05, + "loss": 0.1053, + "step": 3783 + }, + { + "epoch": 4.386292834890965, + "grad_norm": 0.3181520104408264, + "learning_rate": 2.825287356321839e-05, + "loss": 0.1071, + "step": 3784 + }, + { + "epoch": 4.387452003187713, + "grad_norm": 0.5326215624809265, + "learning_rate": 2.8247126436781613e-05, + "loss": 0.1095, + "step": 3785 + }, + { + "epoch": 4.38861117148446, + "grad_norm": 0.278712660074234, + "learning_rate": 2.8241379310344828e-05, + "loss": 0.0979, + "step": 3786 + }, + { + "epoch": 4.389770339781207, + "grad_norm": 0.30752044916152954, + "learning_rate": 2.8235632183908046e-05, + "loss": 0.0938, + "step": 3787 + }, + { + "epoch": 4.390929508077954, + "grad_norm": 0.3980220854282379, + "learning_rate": 2.8229885057471268e-05, + "loss": 0.101, + "step": 3788 + }, + { + "epoch": 4.392088676374701, + "grad_norm": 0.3621883690357208, + "learning_rate": 2.8224137931034483e-05, + "loss": 0.1064, + "step": 3789 + }, + { + "epoch": 4.393247844671448, + "grad_norm": 0.3474581837654114, + "learning_rate": 2.8218390804597705e-05, + "loss": 0.1024, + "step": 3790 + }, + { + "epoch": 4.394407012968196, + "grad_norm": 0.33540961146354675, + "learning_rate": 2.8212643678160923e-05, + "loss": 0.1032, + "step": 3791 + }, + { + "epoch": 4.395566181264942, + "grad_norm": 0.39876723289489746, + "learning_rate": 2.8206896551724138e-05, + "loss": 0.0977, + "step": 3792 + }, + { + "epoch": 4.39672534956169, + "grad_norm": 0.31449171900749207, + "learning_rate": 2.820114942528736e-05, + "loss": 0.0971, + "step": 3793 + }, + { + "epoch": 4.3978845178584365, + "grad_norm": 0.30794376134872437, + "learning_rate": 2.8195402298850578e-05, + "loss": 0.11, + "step": 3794 + }, + { + "epoch": 4.399043686155184, + "grad_norm": 0.30705249309539795, + "learning_rate": 2.8189655172413793e-05, + "loss": 0.1023, + "step": 3795 + }, + { + "epoch": 4.400202854451931, + "grad_norm": 0.37450698018074036, + "learning_rate": 2.8183908045977014e-05, + "loss": 0.1003, + "step": 3796 + }, + { + "epoch": 4.401362022748678, + "grad_norm": 0.4203515350818634, + "learning_rate": 2.817816091954023e-05, + "loss": 0.1178, + "step": 3797 + }, + { + "epoch": 4.402521191045425, + "grad_norm": 0.31087568402290344, + "learning_rate": 2.8172413793103447e-05, + "loss": 0.115, + "step": 3798 + }, + { + "epoch": 4.403680359342172, + "grad_norm": 0.32186147570610046, + "learning_rate": 2.816666666666667e-05, + "loss": 0.0939, + "step": 3799 + }, + { + "epoch": 4.404839527638919, + "grad_norm": 0.38315415382385254, + "learning_rate": 2.8160919540229884e-05, + "loss": 0.0979, + "step": 3800 + }, + { + "epoch": 4.405998695935666, + "grad_norm": 0.5825996398925781, + "learning_rate": 2.8155172413793106e-05, + "loss": 0.109, + "step": 3801 + }, + { + "epoch": 4.407157864232413, + "grad_norm": 0.42345044016838074, + "learning_rate": 2.8149425287356324e-05, + "loss": 0.106, + "step": 3802 + }, + { + "epoch": 4.40831703252916, + "grad_norm": 0.2959122657775879, + "learning_rate": 2.814367816091954e-05, + "loss": 0.0996, + "step": 3803 + }, + { + "epoch": 4.409476200825908, + "grad_norm": 0.479018896818161, + "learning_rate": 2.813793103448276e-05, + "loss": 0.0952, + "step": 3804 + }, + { + "epoch": 4.410635369122654, + "grad_norm": 0.3577905297279358, + "learning_rate": 2.813218390804598e-05, + "loss": 0.1047, + "step": 3805 + }, + { + "epoch": 4.411794537419402, + "grad_norm": 0.5896329283714294, + "learning_rate": 2.8126436781609194e-05, + "loss": 0.1145, + "step": 3806 + }, + { + "epoch": 4.4129537057161485, + "grad_norm": 0.31682226061820984, + "learning_rate": 2.8120689655172415e-05, + "loss": 0.0913, + "step": 3807 + }, + { + "epoch": 4.414112874012896, + "grad_norm": 0.38672780990600586, + "learning_rate": 2.8114942528735634e-05, + "loss": 0.1036, + "step": 3808 + }, + { + "epoch": 4.415272042309643, + "grad_norm": 0.3999164402484894, + "learning_rate": 2.8109195402298855e-05, + "loss": 0.0992, + "step": 3809 + }, + { + "epoch": 4.41643121060639, + "grad_norm": 0.32970115542411804, + "learning_rate": 2.810344827586207e-05, + "loss": 0.0957, + "step": 3810 + }, + { + "epoch": 4.417590378903137, + "grad_norm": 0.4063851535320282, + "learning_rate": 2.809770114942529e-05, + "loss": 0.1041, + "step": 3811 + }, + { + "epoch": 4.418749547199884, + "grad_norm": 0.4128503203392029, + "learning_rate": 2.809195402298851e-05, + "loss": 0.1006, + "step": 3812 + }, + { + "epoch": 4.419908715496631, + "grad_norm": 0.3817613124847412, + "learning_rate": 2.8086206896551725e-05, + "loss": 0.1138, + "step": 3813 + }, + { + "epoch": 4.421067883793378, + "grad_norm": 0.4013262391090393, + "learning_rate": 2.8080459770114944e-05, + "loss": 0.1078, + "step": 3814 + }, + { + "epoch": 4.4222270520901255, + "grad_norm": 0.4090496599674225, + "learning_rate": 2.8074712643678165e-05, + "loss": 0.1052, + "step": 3815 + }, + { + "epoch": 4.423386220386872, + "grad_norm": 0.307972252368927, + "learning_rate": 2.806896551724138e-05, + "loss": 0.1008, + "step": 3816 + }, + { + "epoch": 4.42454538868362, + "grad_norm": 0.4433940052986145, + "learning_rate": 2.8063218390804595e-05, + "loss": 0.1126, + "step": 3817 + }, + { + "epoch": 4.425704556980366, + "grad_norm": 0.34657689929008484, + "learning_rate": 2.8057471264367817e-05, + "loss": 0.085, + "step": 3818 + }, + { + "epoch": 4.426863725277114, + "grad_norm": 0.4442926049232483, + "learning_rate": 2.8051724137931035e-05, + "loss": 0.1089, + "step": 3819 + }, + { + "epoch": 4.428022893573861, + "grad_norm": 0.5043272376060486, + "learning_rate": 2.8045977011494257e-05, + "loss": 0.1284, + "step": 3820 + }, + { + "epoch": 4.429182061870608, + "grad_norm": 0.34867578744888306, + "learning_rate": 2.804022988505747e-05, + "loss": 0.101, + "step": 3821 + }, + { + "epoch": 4.430341230167355, + "grad_norm": 0.3213669955730438, + "learning_rate": 2.803448275862069e-05, + "loss": 0.0922, + "step": 3822 + }, + { + "epoch": 4.431500398464102, + "grad_norm": 1.2875010967254639, + "learning_rate": 2.802873563218391e-05, + "loss": 0.1089, + "step": 3823 + }, + { + "epoch": 4.432659566760849, + "grad_norm": 0.3360012173652649, + "learning_rate": 2.8022988505747126e-05, + "loss": 0.1059, + "step": 3824 + }, + { + "epoch": 4.433818735057596, + "grad_norm": 0.31028974056243896, + "learning_rate": 2.8017241379310345e-05, + "loss": 0.1003, + "step": 3825 + }, + { + "epoch": 4.434977903354343, + "grad_norm": 0.45090678334236145, + "learning_rate": 2.8011494252873566e-05, + "loss": 0.1056, + "step": 3826 + }, + { + "epoch": 4.43613707165109, + "grad_norm": 0.29718369245529175, + "learning_rate": 2.800574712643678e-05, + "loss": 0.1012, + "step": 3827 + }, + { + "epoch": 4.4372962399478375, + "grad_norm": 0.3447516858577728, + "learning_rate": 2.8000000000000003e-05, + "loss": 0.0984, + "step": 3828 + }, + { + "epoch": 4.438455408244584, + "grad_norm": 0.3421769440174103, + "learning_rate": 2.799425287356322e-05, + "loss": 0.1022, + "step": 3829 + }, + { + "epoch": 4.439614576541332, + "grad_norm": 0.4500944912433624, + "learning_rate": 2.7988505747126436e-05, + "loss": 0.1007, + "step": 3830 + }, + { + "epoch": 4.440773744838078, + "grad_norm": 0.434008926153183, + "learning_rate": 2.7982758620689658e-05, + "loss": 0.1144, + "step": 3831 + }, + { + "epoch": 4.441932913134826, + "grad_norm": 0.3270074129104614, + "learning_rate": 2.7977011494252876e-05, + "loss": 0.1051, + "step": 3832 + }, + { + "epoch": 4.443092081431573, + "grad_norm": 0.37319257855415344, + "learning_rate": 2.797126436781609e-05, + "loss": 0.1048, + "step": 3833 + }, + { + "epoch": 4.44425124972832, + "grad_norm": 0.3999885320663452, + "learning_rate": 2.7965517241379313e-05, + "loss": 0.1119, + "step": 3834 + }, + { + "epoch": 4.445410418025067, + "grad_norm": 0.3219684362411499, + "learning_rate": 2.795977011494253e-05, + "loss": 0.1173, + "step": 3835 + }, + { + "epoch": 4.446569586321814, + "grad_norm": 0.3253685534000397, + "learning_rate": 2.7954022988505746e-05, + "loss": 0.1012, + "step": 3836 + }, + { + "epoch": 4.447728754618561, + "grad_norm": 0.3479521870613098, + "learning_rate": 2.7948275862068968e-05, + "loss": 0.0977, + "step": 3837 + }, + { + "epoch": 4.448887922915309, + "grad_norm": 0.33220139145851135, + "learning_rate": 2.7942528735632182e-05, + "loss": 0.1052, + "step": 3838 + }, + { + "epoch": 4.450047091212055, + "grad_norm": 0.3189479410648346, + "learning_rate": 2.7936781609195408e-05, + "loss": 0.0937, + "step": 3839 + }, + { + "epoch": 4.451206259508803, + "grad_norm": 0.34778445959091187, + "learning_rate": 2.7931034482758622e-05, + "loss": 0.1142, + "step": 3840 + }, + { + "epoch": 4.45236542780555, + "grad_norm": 0.4988318979740143, + "learning_rate": 2.7925287356321837e-05, + "loss": 0.1025, + "step": 3841 + }, + { + "epoch": 4.453524596102296, + "grad_norm": 0.41545069217681885, + "learning_rate": 2.791954022988506e-05, + "loss": 0.1075, + "step": 3842 + }, + { + "epoch": 4.454683764399044, + "grad_norm": 0.38228297233581543, + "learning_rate": 2.7913793103448277e-05, + "loss": 0.1112, + "step": 3843 + }, + { + "epoch": 4.4558429326957905, + "grad_norm": 0.34965816140174866, + "learning_rate": 2.7908045977011492e-05, + "loss": 0.1088, + "step": 3844 + }, + { + "epoch": 4.457002100992538, + "grad_norm": 0.32655954360961914, + "learning_rate": 2.7902298850574714e-05, + "loss": 0.1071, + "step": 3845 + }, + { + "epoch": 4.458161269289285, + "grad_norm": 0.35616791248321533, + "learning_rate": 2.7896551724137932e-05, + "loss": 0.1095, + "step": 3846 + }, + { + "epoch": 4.459320437586032, + "grad_norm": 0.4080839157104492, + "learning_rate": 2.7890804597701154e-05, + "loss": 0.1029, + "step": 3847 + }, + { + "epoch": 4.460479605882779, + "grad_norm": 0.36598852276802063, + "learning_rate": 2.788505747126437e-05, + "loss": 0.0919, + "step": 3848 + }, + { + "epoch": 4.4616387741795265, + "grad_norm": 0.3768613040447235, + "learning_rate": 2.7879310344827587e-05, + "loss": 0.1065, + "step": 3849 + }, + { + "epoch": 4.462797942476273, + "grad_norm": 0.3174971044063568, + "learning_rate": 2.787356321839081e-05, + "loss": 0.1003, + "step": 3850 + }, + { + "epoch": 4.463957110773021, + "grad_norm": 0.34315162897109985, + "learning_rate": 2.7867816091954024e-05, + "loss": 0.1042, + "step": 3851 + }, + { + "epoch": 4.465116279069767, + "grad_norm": 0.29958537220954895, + "learning_rate": 2.7862068965517242e-05, + "loss": 0.0927, + "step": 3852 + }, + { + "epoch": 4.466275447366515, + "grad_norm": 0.40852096676826477, + "learning_rate": 2.7856321839080464e-05, + "loss": 0.1002, + "step": 3853 + }, + { + "epoch": 4.467434615663262, + "grad_norm": 0.32836470007896423, + "learning_rate": 2.785057471264368e-05, + "loss": 0.1069, + "step": 3854 + }, + { + "epoch": 4.468593783960008, + "grad_norm": 0.37109801173210144, + "learning_rate": 2.7844827586206897e-05, + "loss": 0.1069, + "step": 3855 + }, + { + "epoch": 4.469752952256756, + "grad_norm": 0.4152434170246124, + "learning_rate": 2.783908045977012e-05, + "loss": 0.1026, + "step": 3856 + }, + { + "epoch": 4.4709121205535025, + "grad_norm": 0.31486672163009644, + "learning_rate": 2.7833333333333333e-05, + "loss": 0.0983, + "step": 3857 + }, + { + "epoch": 4.47207128885025, + "grad_norm": 0.2974405288696289, + "learning_rate": 2.7827586206896555e-05, + "loss": 0.0975, + "step": 3858 + }, + { + "epoch": 4.473230457146997, + "grad_norm": 0.3772009015083313, + "learning_rate": 2.7821839080459773e-05, + "loss": 0.1035, + "step": 3859 + }, + { + "epoch": 4.474389625443744, + "grad_norm": 0.3580301105976105, + "learning_rate": 2.7816091954022988e-05, + "loss": 0.1001, + "step": 3860 + }, + { + "epoch": 4.475548793740491, + "grad_norm": 0.31176498532295227, + "learning_rate": 2.781034482758621e-05, + "loss": 0.0991, + "step": 3861 + }, + { + "epoch": 4.4767079620372385, + "grad_norm": 0.36399632692337036, + "learning_rate": 2.7804597701149425e-05, + "loss": 0.1106, + "step": 3862 + }, + { + "epoch": 4.477867130333985, + "grad_norm": 0.36921966075897217, + "learning_rate": 2.7798850574712643e-05, + "loss": 0.1045, + "step": 3863 + }, + { + "epoch": 4.479026298630733, + "grad_norm": 0.28541839122772217, + "learning_rate": 2.7793103448275865e-05, + "loss": 0.0879, + "step": 3864 + }, + { + "epoch": 4.480185466927479, + "grad_norm": 0.5838915109634399, + "learning_rate": 2.778735632183908e-05, + "loss": 0.1098, + "step": 3865 + }, + { + "epoch": 4.481344635224227, + "grad_norm": 0.32612380385398865, + "learning_rate": 2.77816091954023e-05, + "loss": 0.107, + "step": 3866 + }, + { + "epoch": 4.482503803520974, + "grad_norm": 0.33638033270835876, + "learning_rate": 2.777586206896552e-05, + "loss": 0.101, + "step": 3867 + }, + { + "epoch": 4.48366297181772, + "grad_norm": 0.40769678354263306, + "learning_rate": 2.7770114942528735e-05, + "loss": 0.107, + "step": 3868 + }, + { + "epoch": 4.484822140114468, + "grad_norm": 0.3535826504230499, + "learning_rate": 2.7764367816091956e-05, + "loss": 0.1087, + "step": 3869 + }, + { + "epoch": 4.485981308411215, + "grad_norm": 0.3710591495037079, + "learning_rate": 2.7758620689655175e-05, + "loss": 0.1045, + "step": 3870 + }, + { + "epoch": 4.487140476707962, + "grad_norm": 0.3905164301395416, + "learning_rate": 2.775287356321839e-05, + "loss": 0.1059, + "step": 3871 + }, + { + "epoch": 4.488299645004709, + "grad_norm": 0.27448105812072754, + "learning_rate": 2.774712643678161e-05, + "loss": 0.1021, + "step": 3872 + }, + { + "epoch": 4.489458813301456, + "grad_norm": 0.3772237002849579, + "learning_rate": 2.774137931034483e-05, + "loss": 0.1088, + "step": 3873 + }, + { + "epoch": 4.490617981598203, + "grad_norm": 0.4715537130832672, + "learning_rate": 2.7735632183908044e-05, + "loss": 0.1001, + "step": 3874 + }, + { + "epoch": 4.491777149894951, + "grad_norm": 0.4609696865081787, + "learning_rate": 2.7729885057471266e-05, + "loss": 0.1155, + "step": 3875 + }, + { + "epoch": 4.492936318191697, + "grad_norm": 0.368083119392395, + "learning_rate": 2.7724137931034484e-05, + "loss": 0.1018, + "step": 3876 + }, + { + "epoch": 4.494095486488445, + "grad_norm": 0.40971893072128296, + "learning_rate": 2.7718390804597706e-05, + "loss": 0.0994, + "step": 3877 + }, + { + "epoch": 4.4952546547851915, + "grad_norm": 0.40606266260147095, + "learning_rate": 2.771264367816092e-05, + "loss": 0.1162, + "step": 3878 + }, + { + "epoch": 4.496413823081939, + "grad_norm": 0.3284454047679901, + "learning_rate": 2.7706896551724136e-05, + "loss": 0.1001, + "step": 3879 + }, + { + "epoch": 4.497572991378686, + "grad_norm": 0.3405894935131073, + "learning_rate": 2.770114942528736e-05, + "loss": 0.089, + "step": 3880 + }, + { + "epoch": 4.498732159675433, + "grad_norm": 0.44441795349121094, + "learning_rate": 2.7695402298850576e-05, + "loss": 0.1161, + "step": 3881 + }, + { + "epoch": 4.49989132797218, + "grad_norm": 0.3282816410064697, + "learning_rate": 2.768965517241379e-05, + "loss": 0.1014, + "step": 3882 + }, + { + "epoch": 4.5010504962689275, + "grad_norm": 0.31686845421791077, + "learning_rate": 2.7683908045977012e-05, + "loss": 0.1002, + "step": 3883 + }, + { + "epoch": 4.502209664565674, + "grad_norm": 0.33207643032073975, + "learning_rate": 2.767816091954023e-05, + "loss": 0.0948, + "step": 3884 + }, + { + "epoch": 4.503368832862421, + "grad_norm": 0.3429032564163208, + "learning_rate": 2.7672413793103452e-05, + "loss": 0.1001, + "step": 3885 + }, + { + "epoch": 4.504528001159168, + "grad_norm": 0.333465576171875, + "learning_rate": 2.7666666666666667e-05, + "loss": 0.1043, + "step": 3886 + }, + { + "epoch": 4.505687169455915, + "grad_norm": 0.3548075556755066, + "learning_rate": 2.7660919540229885e-05, + "loss": 0.1056, + "step": 3887 + }, + { + "epoch": 4.506846337752663, + "grad_norm": 0.41525760293006897, + "learning_rate": 2.7655172413793107e-05, + "loss": 0.1086, + "step": 3888 + }, + { + "epoch": 4.508005506049409, + "grad_norm": 0.30972954630851746, + "learning_rate": 2.7649425287356322e-05, + "loss": 0.0972, + "step": 3889 + }, + { + "epoch": 4.509164674346157, + "grad_norm": 0.46984899044036865, + "learning_rate": 2.764367816091954e-05, + "loss": 0.1082, + "step": 3890 + }, + { + "epoch": 4.5103238426429035, + "grad_norm": 0.3709583580493927, + "learning_rate": 2.7637931034482762e-05, + "loss": 0.1007, + "step": 3891 + }, + { + "epoch": 4.511483010939651, + "grad_norm": 0.45104119181632996, + "learning_rate": 2.7632183908045977e-05, + "loss": 0.1131, + "step": 3892 + }, + { + "epoch": 4.512642179236398, + "grad_norm": 0.311952143907547, + "learning_rate": 2.7626436781609195e-05, + "loss": 0.1021, + "step": 3893 + }, + { + "epoch": 4.513801347533145, + "grad_norm": 0.3778269588947296, + "learning_rate": 2.7620689655172417e-05, + "loss": 0.1046, + "step": 3894 + }, + { + "epoch": 4.514960515829892, + "grad_norm": 0.3810199797153473, + "learning_rate": 2.7614942528735632e-05, + "loss": 0.1044, + "step": 3895 + }, + { + "epoch": 4.5161196841266396, + "grad_norm": 0.3130066394805908, + "learning_rate": 2.7609195402298853e-05, + "loss": 0.1037, + "step": 3896 + }, + { + "epoch": 4.517278852423386, + "grad_norm": 0.32273080945014954, + "learning_rate": 2.7603448275862072e-05, + "loss": 0.0984, + "step": 3897 + }, + { + "epoch": 4.518438020720133, + "grad_norm": 0.4851730763912201, + "learning_rate": 2.7597701149425287e-05, + "loss": 0.1088, + "step": 3898 + }, + { + "epoch": 4.5195971890168805, + "grad_norm": 0.30688047409057617, + "learning_rate": 2.759195402298851e-05, + "loss": 0.0997, + "step": 3899 + }, + { + "epoch": 4.520756357313627, + "grad_norm": 0.29143083095550537, + "learning_rate": 2.7586206896551727e-05, + "loss": 0.1066, + "step": 3900 + }, + { + "epoch": 4.521915525610375, + "grad_norm": 0.4881395697593689, + "learning_rate": 2.758045977011494e-05, + "loss": 0.103, + "step": 3901 + }, + { + "epoch": 4.523074693907121, + "grad_norm": 0.2735028862953186, + "learning_rate": 2.7574712643678163e-05, + "loss": 0.093, + "step": 3902 + }, + { + "epoch": 4.524233862203869, + "grad_norm": 0.3494533598423004, + "learning_rate": 2.7568965517241378e-05, + "loss": 0.1089, + "step": 3903 + }, + { + "epoch": 4.525393030500616, + "grad_norm": 0.36798563599586487, + "learning_rate": 2.7563218390804603e-05, + "loss": 0.1049, + "step": 3904 + }, + { + "epoch": 4.526552198797363, + "grad_norm": 0.3005774915218353, + "learning_rate": 2.7557471264367818e-05, + "loss": 0.1141, + "step": 3905 + }, + { + "epoch": 4.52771136709411, + "grad_norm": 0.3312096893787384, + "learning_rate": 2.7551724137931033e-05, + "loss": 0.1006, + "step": 3906 + }, + { + "epoch": 4.528870535390857, + "grad_norm": 0.3487379848957062, + "learning_rate": 2.7545977011494255e-05, + "loss": 0.1005, + "step": 3907 + }, + { + "epoch": 4.530029703687604, + "grad_norm": 0.36595210433006287, + "learning_rate": 2.7540229885057473e-05, + "loss": 0.1016, + "step": 3908 + }, + { + "epoch": 4.531188871984352, + "grad_norm": 0.2543398439884186, + "learning_rate": 2.7534482758620688e-05, + "loss": 0.0935, + "step": 3909 + }, + { + "epoch": 4.532348040281098, + "grad_norm": 0.38063111901283264, + "learning_rate": 2.752873563218391e-05, + "loss": 0.106, + "step": 3910 + }, + { + "epoch": 4.533507208577845, + "grad_norm": 0.37747347354888916, + "learning_rate": 2.7522988505747128e-05, + "loss": 0.1045, + "step": 3911 + }, + { + "epoch": 4.5346663768745925, + "grad_norm": 0.4080142676830292, + "learning_rate": 2.7517241379310343e-05, + "loss": 0.1065, + "step": 3912 + }, + { + "epoch": 4.535825545171339, + "grad_norm": 0.3274177014827728, + "learning_rate": 2.7511494252873564e-05, + "loss": 0.1012, + "step": 3913 + }, + { + "epoch": 4.536984713468087, + "grad_norm": 0.2830631136894226, + "learning_rate": 2.7505747126436783e-05, + "loss": 0.1, + "step": 3914 + }, + { + "epoch": 4.538143881764833, + "grad_norm": 0.27470821142196655, + "learning_rate": 2.7500000000000004e-05, + "loss": 0.0975, + "step": 3915 + }, + { + "epoch": 4.539303050061581, + "grad_norm": 0.37115851044654846, + "learning_rate": 2.749425287356322e-05, + "loss": 0.1079, + "step": 3916 + }, + { + "epoch": 4.540462218358328, + "grad_norm": 0.3276093900203705, + "learning_rate": 2.7488505747126438e-05, + "loss": 0.1043, + "step": 3917 + }, + { + "epoch": 4.541621386655075, + "grad_norm": 0.3176075518131256, + "learning_rate": 2.748275862068966e-05, + "loss": 0.1021, + "step": 3918 + }, + { + "epoch": 4.542780554951822, + "grad_norm": 0.32996928691864014, + "learning_rate": 2.7477011494252874e-05, + "loss": 0.095, + "step": 3919 + }, + { + "epoch": 4.543939723248569, + "grad_norm": 0.3963630497455597, + "learning_rate": 2.7471264367816092e-05, + "loss": 0.1069, + "step": 3920 + }, + { + "epoch": 4.545098891545316, + "grad_norm": 0.9190638065338135, + "learning_rate": 2.7465517241379314e-05, + "loss": 0.1153, + "step": 3921 + }, + { + "epoch": 4.546258059842064, + "grad_norm": 0.3529224991798401, + "learning_rate": 2.745977011494253e-05, + "loss": 0.092, + "step": 3922 + }, + { + "epoch": 4.54741722813881, + "grad_norm": 0.4127110540866852, + "learning_rate": 2.745402298850575e-05, + "loss": 0.1058, + "step": 3923 + }, + { + "epoch": 4.548576396435557, + "grad_norm": 0.4567547142505646, + "learning_rate": 2.744827586206897e-05, + "loss": 0.1041, + "step": 3924 + }, + { + "epoch": 4.549735564732305, + "grad_norm": 0.30119210481643677, + "learning_rate": 2.7442528735632184e-05, + "loss": 0.0998, + "step": 3925 + }, + { + "epoch": 4.550894733029052, + "grad_norm": 0.3091825842857361, + "learning_rate": 2.7436781609195406e-05, + "loss": 0.0967, + "step": 3926 + }, + { + "epoch": 4.552053901325799, + "grad_norm": 0.38511085510253906, + "learning_rate": 2.743103448275862e-05, + "loss": 0.1076, + "step": 3927 + }, + { + "epoch": 4.5532130696225455, + "grad_norm": 0.33152782917022705, + "learning_rate": 2.742528735632184e-05, + "loss": 0.0974, + "step": 3928 + }, + { + "epoch": 4.554372237919293, + "grad_norm": 0.35295701026916504, + "learning_rate": 2.741954022988506e-05, + "loss": 0.1044, + "step": 3929 + }, + { + "epoch": 4.55553140621604, + "grad_norm": 0.40102672576904297, + "learning_rate": 2.7413793103448275e-05, + "loss": 0.1007, + "step": 3930 + }, + { + "epoch": 4.556690574512787, + "grad_norm": 0.4231402575969696, + "learning_rate": 2.7408045977011494e-05, + "loss": 0.1146, + "step": 3931 + }, + { + "epoch": 4.557849742809534, + "grad_norm": 0.43630826473236084, + "learning_rate": 2.7402298850574715e-05, + "loss": 0.1009, + "step": 3932 + }, + { + "epoch": 4.5590089111062815, + "grad_norm": 0.2806771993637085, + "learning_rate": 2.739655172413793e-05, + "loss": 0.1037, + "step": 3933 + }, + { + "epoch": 4.560168079403028, + "grad_norm": 0.3072310984134674, + "learning_rate": 2.7390804597701152e-05, + "loss": 0.0999, + "step": 3934 + }, + { + "epoch": 4.561327247699776, + "grad_norm": 0.3378421366214752, + "learning_rate": 2.738505747126437e-05, + "loss": 0.1061, + "step": 3935 + }, + { + "epoch": 4.562486415996522, + "grad_norm": 0.25815141201019287, + "learning_rate": 2.7379310344827585e-05, + "loss": 0.0916, + "step": 3936 + }, + { + "epoch": 4.56364558429327, + "grad_norm": 0.3428052067756653, + "learning_rate": 2.7373563218390807e-05, + "loss": 0.1086, + "step": 3937 + }, + { + "epoch": 4.564804752590017, + "grad_norm": 0.3696090579032898, + "learning_rate": 2.7367816091954025e-05, + "loss": 0.1022, + "step": 3938 + }, + { + "epoch": 4.565963920886764, + "grad_norm": 0.31813839077949524, + "learning_rate": 2.736206896551724e-05, + "loss": 0.0936, + "step": 3939 + }, + { + "epoch": 4.567123089183511, + "grad_norm": 0.40487226843833923, + "learning_rate": 2.735632183908046e-05, + "loss": 0.103, + "step": 3940 + }, + { + "epoch": 4.5682822574802575, + "grad_norm": 0.3482483923435211, + "learning_rate": 2.735057471264368e-05, + "loss": 0.1005, + "step": 3941 + }, + { + "epoch": 4.569441425777005, + "grad_norm": 0.7645699381828308, + "learning_rate": 2.73448275862069e-05, + "loss": 0.1048, + "step": 3942 + }, + { + "epoch": 4.570600594073752, + "grad_norm": 0.2931617796421051, + "learning_rate": 2.7339080459770116e-05, + "loss": 0.0907, + "step": 3943 + }, + { + "epoch": 4.571759762370499, + "grad_norm": 0.491020530462265, + "learning_rate": 2.733333333333333e-05, + "loss": 0.1099, + "step": 3944 + }, + { + "epoch": 4.572918930667246, + "grad_norm": 0.28609660267829895, + "learning_rate": 2.7327586206896556e-05, + "loss": 0.104, + "step": 3945 + }, + { + "epoch": 4.5740780989639935, + "grad_norm": 0.4021299183368683, + "learning_rate": 2.732183908045977e-05, + "loss": 0.1071, + "step": 3946 + }, + { + "epoch": 4.57523726726074, + "grad_norm": 0.39661794900894165, + "learning_rate": 2.7316091954022986e-05, + "loss": 0.1019, + "step": 3947 + }, + { + "epoch": 4.576396435557488, + "grad_norm": 0.3372730314731598, + "learning_rate": 2.7310344827586208e-05, + "loss": 0.1109, + "step": 3948 + }, + { + "epoch": 4.577555603854234, + "grad_norm": 0.33836305141448975, + "learning_rate": 2.7304597701149426e-05, + "loss": 0.0944, + "step": 3949 + }, + { + "epoch": 4.578714772150982, + "grad_norm": 0.3433241844177246, + "learning_rate": 2.7298850574712648e-05, + "loss": 0.1087, + "step": 3950 + }, + { + "epoch": 4.579873940447729, + "grad_norm": 0.3279787600040436, + "learning_rate": 2.7293103448275863e-05, + "loss": 0.1091, + "step": 3951 + }, + { + "epoch": 4.581033108744476, + "grad_norm": 0.3588603734970093, + "learning_rate": 2.728735632183908e-05, + "loss": 0.0953, + "step": 3952 + }, + { + "epoch": 4.582192277041223, + "grad_norm": 0.2686539590358734, + "learning_rate": 2.7281609195402303e-05, + "loss": 0.099, + "step": 3953 + }, + { + "epoch": 4.58335144533797, + "grad_norm": 0.42638182640075684, + "learning_rate": 2.7275862068965518e-05, + "loss": 0.1131, + "step": 3954 + }, + { + "epoch": 4.584510613634717, + "grad_norm": 0.34486687183380127, + "learning_rate": 2.7270114942528736e-05, + "loss": 0.0993, + "step": 3955 + }, + { + "epoch": 4.585669781931464, + "grad_norm": 0.3565883934497833, + "learning_rate": 2.7264367816091958e-05, + "loss": 0.1047, + "step": 3956 + }, + { + "epoch": 4.586828950228211, + "grad_norm": 0.2582899332046509, + "learning_rate": 2.7258620689655173e-05, + "loss": 0.0902, + "step": 3957 + }, + { + "epoch": 4.587988118524958, + "grad_norm": 0.3092981278896332, + "learning_rate": 2.725287356321839e-05, + "loss": 0.1102, + "step": 3958 + }, + { + "epoch": 4.589147286821706, + "grad_norm": 0.41385966539382935, + "learning_rate": 2.7247126436781613e-05, + "loss": 0.0956, + "step": 3959 + }, + { + "epoch": 4.590306455118452, + "grad_norm": 0.4211784303188324, + "learning_rate": 2.7241379310344827e-05, + "loss": 0.1025, + "step": 3960 + }, + { + "epoch": 4.5914656234152, + "grad_norm": 0.3663957417011261, + "learning_rate": 2.723563218390805e-05, + "loss": 0.1041, + "step": 3961 + }, + { + "epoch": 4.5926247917119465, + "grad_norm": 0.30467289686203003, + "learning_rate": 2.7229885057471267e-05, + "loss": 0.1013, + "step": 3962 + }, + { + "epoch": 4.593783960008694, + "grad_norm": 0.41379502415657043, + "learning_rate": 2.7224137931034482e-05, + "loss": 0.1, + "step": 3963 + }, + { + "epoch": 4.594943128305441, + "grad_norm": 0.31340333819389343, + "learning_rate": 2.7218390804597704e-05, + "loss": 0.0935, + "step": 3964 + }, + { + "epoch": 4.596102296602188, + "grad_norm": 0.3276791274547577, + "learning_rate": 2.7212643678160922e-05, + "loss": 0.1058, + "step": 3965 + }, + { + "epoch": 4.597261464898935, + "grad_norm": 0.4072125554084778, + "learning_rate": 2.7206896551724137e-05, + "loss": 0.0965, + "step": 3966 + }, + { + "epoch": 4.598420633195682, + "grad_norm": 0.32723650336265564, + "learning_rate": 2.720114942528736e-05, + "loss": 0.1108, + "step": 3967 + }, + { + "epoch": 4.599579801492429, + "grad_norm": 0.3264493942260742, + "learning_rate": 2.7195402298850574e-05, + "loss": 0.119, + "step": 3968 + }, + { + "epoch": 4.600738969789177, + "grad_norm": 0.32702627778053284, + "learning_rate": 2.71896551724138e-05, + "loss": 0.1014, + "step": 3969 + }, + { + "epoch": 4.601898138085923, + "grad_norm": 0.5226812958717346, + "learning_rate": 2.7183908045977014e-05, + "loss": 0.1009, + "step": 3970 + }, + { + "epoch": 4.60305730638267, + "grad_norm": 0.3024195730686188, + "learning_rate": 2.717816091954023e-05, + "loss": 0.0955, + "step": 3971 + }, + { + "epoch": 4.604216474679418, + "grad_norm": 0.3582437038421631, + "learning_rate": 2.717241379310345e-05, + "loss": 0.1102, + "step": 3972 + }, + { + "epoch": 4.605375642976164, + "grad_norm": 0.38571134209632874, + "learning_rate": 2.716666666666667e-05, + "loss": 0.1012, + "step": 3973 + }, + { + "epoch": 4.606534811272912, + "grad_norm": 0.30533286929130554, + "learning_rate": 2.7160919540229883e-05, + "loss": 0.1025, + "step": 3974 + }, + { + "epoch": 4.6076939795696585, + "grad_norm": 0.38881656527519226, + "learning_rate": 2.7155172413793105e-05, + "loss": 0.0957, + "step": 3975 + }, + { + "epoch": 4.608853147866406, + "grad_norm": 0.40366286039352417, + "learning_rate": 2.7149425287356323e-05, + "loss": 0.1178, + "step": 3976 + }, + { + "epoch": 4.610012316163153, + "grad_norm": 0.3555791974067688, + "learning_rate": 2.714367816091954e-05, + "loss": 0.1039, + "step": 3977 + }, + { + "epoch": 4.6111714844599, + "grad_norm": 0.405254602432251, + "learning_rate": 2.713793103448276e-05, + "loss": 0.1036, + "step": 3978 + }, + { + "epoch": 4.612330652756647, + "grad_norm": 0.42055705189704895, + "learning_rate": 2.713218390804598e-05, + "loss": 0.1063, + "step": 3979 + }, + { + "epoch": 4.613489821053395, + "grad_norm": 0.38437414169311523, + "learning_rate": 2.71264367816092e-05, + "loss": 0.1002, + "step": 3980 + }, + { + "epoch": 4.614648989350141, + "grad_norm": 0.37615966796875, + "learning_rate": 2.7120689655172415e-05, + "loss": 0.101, + "step": 3981 + }, + { + "epoch": 4.615808157646889, + "grad_norm": 0.363473117351532, + "learning_rate": 2.7114942528735633e-05, + "loss": 0.1121, + "step": 3982 + }, + { + "epoch": 4.6169673259436355, + "grad_norm": 0.4437958896160126, + "learning_rate": 2.7109195402298855e-05, + "loss": 0.1137, + "step": 3983 + }, + { + "epoch": 4.618126494240382, + "grad_norm": 0.2698152959346771, + "learning_rate": 2.710344827586207e-05, + "loss": 0.1011, + "step": 3984 + }, + { + "epoch": 4.61928566253713, + "grad_norm": 0.360612154006958, + "learning_rate": 2.7097701149425288e-05, + "loss": 0.1123, + "step": 3985 + }, + { + "epoch": 4.620444830833876, + "grad_norm": 0.43149617314338684, + "learning_rate": 2.709195402298851e-05, + "loss": 0.1087, + "step": 3986 + }, + { + "epoch": 4.621603999130624, + "grad_norm": 0.37150686979293823, + "learning_rate": 2.7086206896551725e-05, + "loss": 0.0995, + "step": 3987 + }, + { + "epoch": 4.622763167427371, + "grad_norm": 0.3497457802295685, + "learning_rate": 2.7080459770114946e-05, + "loss": 0.1014, + "step": 3988 + }, + { + "epoch": 4.623922335724118, + "grad_norm": 0.33245381712913513, + "learning_rate": 2.707471264367816e-05, + "loss": 0.1003, + "step": 3989 + }, + { + "epoch": 4.625081504020865, + "grad_norm": 0.44494423270225525, + "learning_rate": 2.706896551724138e-05, + "loss": 0.102, + "step": 3990 + }, + { + "epoch": 4.626240672317612, + "grad_norm": 0.38262462615966797, + "learning_rate": 2.70632183908046e-05, + "loss": 0.1171, + "step": 3991 + }, + { + "epoch": 4.627399840614359, + "grad_norm": 0.4167878329753876, + "learning_rate": 2.7057471264367816e-05, + "loss": 0.1033, + "step": 3992 + }, + { + "epoch": 4.628559008911107, + "grad_norm": 0.39638566970825195, + "learning_rate": 2.7051724137931034e-05, + "loss": 0.0931, + "step": 3993 + }, + { + "epoch": 4.629718177207853, + "grad_norm": 0.45487165451049805, + "learning_rate": 2.7045977011494256e-05, + "loss": 0.1104, + "step": 3994 + }, + { + "epoch": 4.630877345504601, + "grad_norm": 0.42852312326431274, + "learning_rate": 2.704022988505747e-05, + "loss": 0.0995, + "step": 3995 + }, + { + "epoch": 4.6320365138013475, + "grad_norm": 0.388959139585495, + "learning_rate": 2.703448275862069e-05, + "loss": 0.1152, + "step": 3996 + }, + { + "epoch": 4.633195682098094, + "grad_norm": 0.33843693137168884, + "learning_rate": 2.702873563218391e-05, + "loss": 0.1018, + "step": 3997 + }, + { + "epoch": 4.634354850394842, + "grad_norm": 0.3359590768814087, + "learning_rate": 2.7022988505747126e-05, + "loss": 0.1009, + "step": 3998 + }, + { + "epoch": 4.635514018691588, + "grad_norm": 0.29807549715042114, + "learning_rate": 2.7017241379310348e-05, + "loss": 0.0949, + "step": 3999 + }, + { + "epoch": 4.636673186988336, + "grad_norm": 0.3521331250667572, + "learning_rate": 2.7011494252873566e-05, + "loss": 0.0987, + "step": 4000 + }, + { + "epoch": 4.637832355285083, + "grad_norm": 0.32744914293289185, + "learning_rate": 2.700574712643678e-05, + "loss": 0.1068, + "step": 4001 + }, + { + "epoch": 4.63899152358183, + "grad_norm": 0.33140355348587036, + "learning_rate": 2.7000000000000002e-05, + "loss": 0.0941, + "step": 4002 + }, + { + "epoch": 4.640150691878577, + "grad_norm": 0.35607364773750305, + "learning_rate": 2.699425287356322e-05, + "loss": 0.1128, + "step": 4003 + }, + { + "epoch": 4.641309860175324, + "grad_norm": 0.28557470440864563, + "learning_rate": 2.6988505747126436e-05, + "loss": 0.1044, + "step": 4004 + }, + { + "epoch": 4.642469028472071, + "grad_norm": 0.4250648617744446, + "learning_rate": 2.6982758620689657e-05, + "loss": 0.1022, + "step": 4005 + }, + { + "epoch": 4.643628196768819, + "grad_norm": 0.2685200273990631, + "learning_rate": 2.6977011494252876e-05, + "loss": 0.0965, + "step": 4006 + }, + { + "epoch": 4.644787365065565, + "grad_norm": 0.3097551465034485, + "learning_rate": 2.6971264367816097e-05, + "loss": 0.11, + "step": 4007 + }, + { + "epoch": 4.645946533362313, + "grad_norm": 0.3279826045036316, + "learning_rate": 2.6965517241379312e-05, + "loss": 0.1003, + "step": 4008 + }, + { + "epoch": 4.64710570165906, + "grad_norm": 0.3129337430000305, + "learning_rate": 2.6959770114942527e-05, + "loss": 0.1062, + "step": 4009 + }, + { + "epoch": 4.648264869955806, + "grad_norm": 0.27965065836906433, + "learning_rate": 2.6954022988505752e-05, + "loss": 0.1017, + "step": 4010 + }, + { + "epoch": 4.649424038252554, + "grad_norm": 0.3252604007720947, + "learning_rate": 2.6948275862068967e-05, + "loss": 0.0935, + "step": 4011 + }, + { + "epoch": 4.650583206549301, + "grad_norm": 0.32502391934394836, + "learning_rate": 2.6942528735632182e-05, + "loss": 0.1059, + "step": 4012 + }, + { + "epoch": 4.651742374846048, + "grad_norm": 0.3370577394962311, + "learning_rate": 2.6936781609195404e-05, + "loss": 0.0959, + "step": 4013 + }, + { + "epoch": 4.652901543142795, + "grad_norm": 0.3003638684749603, + "learning_rate": 2.6931034482758622e-05, + "loss": 0.0969, + "step": 4014 + }, + { + "epoch": 4.654060711439542, + "grad_norm": 0.35102033615112305, + "learning_rate": 2.6925287356321837e-05, + "loss": 0.104, + "step": 4015 + }, + { + "epoch": 4.655219879736289, + "grad_norm": 0.355948269367218, + "learning_rate": 2.691954022988506e-05, + "loss": 0.1125, + "step": 4016 + }, + { + "epoch": 4.6563790480330365, + "grad_norm": 0.526878833770752, + "learning_rate": 2.6913793103448277e-05, + "loss": 0.1195, + "step": 4017 + }, + { + "epoch": 4.657538216329783, + "grad_norm": 0.3807942867279053, + "learning_rate": 2.69080459770115e-05, + "loss": 0.1092, + "step": 4018 + }, + { + "epoch": 4.658697384626531, + "grad_norm": 0.36606287956237793, + "learning_rate": 2.6902298850574713e-05, + "loss": 0.1061, + "step": 4019 + }, + { + "epoch": 4.659856552923277, + "grad_norm": 0.4438146650791168, + "learning_rate": 2.689655172413793e-05, + "loss": 0.1059, + "step": 4020 + }, + { + "epoch": 4.661015721220025, + "grad_norm": 0.3968319296836853, + "learning_rate": 2.6890804597701153e-05, + "loss": 0.0976, + "step": 4021 + }, + { + "epoch": 4.662174889516772, + "grad_norm": 0.34883856773376465, + "learning_rate": 2.6885057471264368e-05, + "loss": 0.1019, + "step": 4022 + }, + { + "epoch": 4.663334057813519, + "grad_norm": 0.5616782903671265, + "learning_rate": 2.6879310344827586e-05, + "loss": 0.1028, + "step": 4023 + }, + { + "epoch": 4.664493226110266, + "grad_norm": 0.3588404953479767, + "learning_rate": 2.6873563218390808e-05, + "loss": 0.0995, + "step": 4024 + }, + { + "epoch": 4.665652394407013, + "grad_norm": 0.3863297998905182, + "learning_rate": 2.6867816091954023e-05, + "loss": 0.0863, + "step": 4025 + }, + { + "epoch": 4.66681156270376, + "grad_norm": 0.2935449481010437, + "learning_rate": 2.6862068965517245e-05, + "loss": 0.0876, + "step": 4026 + }, + { + "epoch": 4.667970731000507, + "grad_norm": 0.3793972432613373, + "learning_rate": 2.6856321839080463e-05, + "loss": 0.095, + "step": 4027 + }, + { + "epoch": 4.669129899297254, + "grad_norm": 1.2369050979614258, + "learning_rate": 2.6850574712643678e-05, + "loss": 0.1036, + "step": 4028 + }, + { + "epoch": 4.670289067594001, + "grad_norm": 0.4123905599117279, + "learning_rate": 2.68448275862069e-05, + "loss": 0.0965, + "step": 4029 + }, + { + "epoch": 4.6714482358907485, + "grad_norm": 0.4004489481449127, + "learning_rate": 2.6839080459770118e-05, + "loss": 0.1136, + "step": 4030 + }, + { + "epoch": 4.672607404187495, + "grad_norm": 0.37782585620880127, + "learning_rate": 2.6833333333333333e-05, + "loss": 0.1215, + "step": 4031 + }, + { + "epoch": 4.673766572484243, + "grad_norm": 0.49208974838256836, + "learning_rate": 2.6827586206896554e-05, + "loss": 0.1135, + "step": 4032 + }, + { + "epoch": 4.674925740780989, + "grad_norm": 0.38141676783561707, + "learning_rate": 2.682183908045977e-05, + "loss": 0.0966, + "step": 4033 + }, + { + "epoch": 4.676084909077737, + "grad_norm": 0.3110532760620117, + "learning_rate": 2.6816091954022988e-05, + "loss": 0.1001, + "step": 4034 + }, + { + "epoch": 4.677244077374484, + "grad_norm": 0.38810020685195923, + "learning_rate": 2.681034482758621e-05, + "loss": 0.1021, + "step": 4035 + }, + { + "epoch": 4.678403245671231, + "grad_norm": 0.6681183576583862, + "learning_rate": 2.6804597701149424e-05, + "loss": 0.1168, + "step": 4036 + }, + { + "epoch": 4.679562413967978, + "grad_norm": 0.28798437118530273, + "learning_rate": 2.6798850574712646e-05, + "loss": 0.1115, + "step": 4037 + }, + { + "epoch": 4.6807215822647255, + "grad_norm": 0.43260279297828674, + "learning_rate": 2.6793103448275864e-05, + "loss": 0.1064, + "step": 4038 + }, + { + "epoch": 4.681880750561472, + "grad_norm": 0.3560652732849121, + "learning_rate": 2.678735632183908e-05, + "loss": 0.0963, + "step": 4039 + }, + { + "epoch": 4.683039918858219, + "grad_norm": 0.38259580731391907, + "learning_rate": 2.67816091954023e-05, + "loss": 0.1062, + "step": 4040 + }, + { + "epoch": 4.684199087154966, + "grad_norm": 0.26703354716300964, + "learning_rate": 2.677586206896552e-05, + "loss": 0.0953, + "step": 4041 + }, + { + "epoch": 4.685358255451713, + "grad_norm": 0.3569488227367401, + "learning_rate": 2.6770114942528734e-05, + "loss": 0.1066, + "step": 4042 + }, + { + "epoch": 4.686517423748461, + "grad_norm": 0.3421025574207306, + "learning_rate": 2.6764367816091956e-05, + "loss": 0.104, + "step": 4043 + }, + { + "epoch": 4.687676592045207, + "grad_norm": 0.31309759616851807, + "learning_rate": 2.6758620689655174e-05, + "loss": 0.1047, + "step": 4044 + }, + { + "epoch": 4.688835760341955, + "grad_norm": 0.6280311346054077, + "learning_rate": 2.6752873563218396e-05, + "loss": 0.1096, + "step": 4045 + }, + { + "epoch": 4.6899949286387015, + "grad_norm": 0.36928072571754456, + "learning_rate": 2.674712643678161e-05, + "loss": 0.1067, + "step": 4046 + }, + { + "epoch": 4.691154096935449, + "grad_norm": 0.2873797118663788, + "learning_rate": 2.674137931034483e-05, + "loss": 0.102, + "step": 4047 + }, + { + "epoch": 4.692313265232196, + "grad_norm": 0.3734436631202698, + "learning_rate": 2.673563218390805e-05, + "loss": 0.116, + "step": 4048 + }, + { + "epoch": 4.693472433528943, + "grad_norm": 0.33200186491012573, + "learning_rate": 2.6729885057471265e-05, + "loss": 0.0983, + "step": 4049 + }, + { + "epoch": 4.69463160182569, + "grad_norm": 0.3504514992237091, + "learning_rate": 2.672413793103448e-05, + "loss": 0.1072, + "step": 4050 + }, + { + "epoch": 4.6957907701224375, + "grad_norm": 0.4115615487098694, + "learning_rate": 2.6718390804597705e-05, + "loss": 0.1073, + "step": 4051 + }, + { + "epoch": 4.696949938419184, + "grad_norm": 0.35126233100891113, + "learning_rate": 2.671264367816092e-05, + "loss": 0.1048, + "step": 4052 + }, + { + "epoch": 4.698109106715931, + "grad_norm": 0.2556185722351074, + "learning_rate": 2.6706896551724135e-05, + "loss": 0.1022, + "step": 4053 + }, + { + "epoch": 4.699268275012678, + "grad_norm": 0.38733574748039246, + "learning_rate": 2.6701149425287357e-05, + "loss": 0.1198, + "step": 4054 + }, + { + "epoch": 4.700427443309426, + "grad_norm": 0.45527157187461853, + "learning_rate": 2.6695402298850575e-05, + "loss": 0.1059, + "step": 4055 + }, + { + "epoch": 4.701586611606173, + "grad_norm": 0.5728080868721008, + "learning_rate": 2.6689655172413797e-05, + "loss": 0.0996, + "step": 4056 + }, + { + "epoch": 4.702745779902919, + "grad_norm": 0.3340456187725067, + "learning_rate": 2.6683908045977012e-05, + "loss": 0.108, + "step": 4057 + }, + { + "epoch": 4.703904948199667, + "grad_norm": 0.35559847950935364, + "learning_rate": 2.667816091954023e-05, + "loss": 0.1056, + "step": 4058 + }, + { + "epoch": 4.7050641164964135, + "grad_norm": 0.2684958279132843, + "learning_rate": 2.667241379310345e-05, + "loss": 0.0996, + "step": 4059 + }, + { + "epoch": 4.706223284793161, + "grad_norm": 0.31825652718544006, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.1091, + "step": 4060 + }, + { + "epoch": 4.707382453089908, + "grad_norm": 0.3788098990917206, + "learning_rate": 2.6660919540229885e-05, + "loss": 0.0985, + "step": 4061 + }, + { + "epoch": 4.708541621386655, + "grad_norm": 0.31583863496780396, + "learning_rate": 2.6655172413793107e-05, + "loss": 0.1064, + "step": 4062 + }, + { + "epoch": 4.709700789683402, + "grad_norm": 0.5462097525596619, + "learning_rate": 2.664942528735632e-05, + "loss": 0.0976, + "step": 4063 + }, + { + "epoch": 4.71085995798015, + "grad_norm": 0.4478006660938263, + "learning_rate": 2.6643678160919543e-05, + "loss": 0.1024, + "step": 4064 + }, + { + "epoch": 4.712019126276896, + "grad_norm": 0.41845348477363586, + "learning_rate": 2.663793103448276e-05, + "loss": 0.1003, + "step": 4065 + }, + { + "epoch": 4.713178294573644, + "grad_norm": 0.3364732265472412, + "learning_rate": 2.6632183908045976e-05, + "loss": 0.0996, + "step": 4066 + }, + { + "epoch": 4.7143374628703905, + "grad_norm": 0.3141407370567322, + "learning_rate": 2.6626436781609198e-05, + "loss": 0.0995, + "step": 4067 + }, + { + "epoch": 4.715496631167138, + "grad_norm": 0.2963089942932129, + "learning_rate": 2.6620689655172416e-05, + "loss": 0.1043, + "step": 4068 + }, + { + "epoch": 4.716655799463885, + "grad_norm": 0.3780442774295807, + "learning_rate": 2.661494252873563e-05, + "loss": 0.1161, + "step": 4069 + }, + { + "epoch": 4.717814967760631, + "grad_norm": 0.4881404936313629, + "learning_rate": 2.6609195402298853e-05, + "loss": 0.1049, + "step": 4070 + }, + { + "epoch": 4.718974136057379, + "grad_norm": 0.39198312163352966, + "learning_rate": 2.660344827586207e-05, + "loss": 0.1134, + "step": 4071 + }, + { + "epoch": 4.720133304354126, + "grad_norm": 0.35181596875190735, + "learning_rate": 2.6597701149425286e-05, + "loss": 0.1013, + "step": 4072 + }, + { + "epoch": 4.721292472650873, + "grad_norm": 0.3244793713092804, + "learning_rate": 2.6591954022988508e-05, + "loss": 0.098, + "step": 4073 + }, + { + "epoch": 4.72245164094762, + "grad_norm": 0.3857901692390442, + "learning_rate": 2.6586206896551723e-05, + "loss": 0.1198, + "step": 4074 + }, + { + "epoch": 4.723610809244367, + "grad_norm": 0.3005591034889221, + "learning_rate": 2.6580459770114948e-05, + "loss": 0.0984, + "step": 4075 + }, + { + "epoch": 4.724769977541114, + "grad_norm": 0.34256380796432495, + "learning_rate": 2.6574712643678163e-05, + "loss": 0.106, + "step": 4076 + }, + { + "epoch": 4.725929145837862, + "grad_norm": 0.31626880168914795, + "learning_rate": 2.6568965517241378e-05, + "loss": 0.1079, + "step": 4077 + }, + { + "epoch": 4.727088314134608, + "grad_norm": 0.3006041944026947, + "learning_rate": 2.65632183908046e-05, + "loss": 0.1084, + "step": 4078 + }, + { + "epoch": 4.728247482431356, + "grad_norm": 0.31316351890563965, + "learning_rate": 2.6557471264367817e-05, + "loss": 0.1048, + "step": 4079 + }, + { + "epoch": 4.7294066507281025, + "grad_norm": 0.30015018582344055, + "learning_rate": 2.6551724137931032e-05, + "loss": 0.1159, + "step": 4080 + }, + { + "epoch": 4.73056581902485, + "grad_norm": 0.3358341455459595, + "learning_rate": 2.6545977011494254e-05, + "loss": 0.1077, + "step": 4081 + }, + { + "epoch": 4.731724987321597, + "grad_norm": 0.3661000728607178, + "learning_rate": 2.6540229885057472e-05, + "loss": 0.1164, + "step": 4082 + }, + { + "epoch": 4.732884155618343, + "grad_norm": 0.30234283208847046, + "learning_rate": 2.6534482758620694e-05, + "loss": 0.1018, + "step": 4083 + }, + { + "epoch": 4.734043323915091, + "grad_norm": 0.29934728145599365, + "learning_rate": 2.652873563218391e-05, + "loss": 0.1044, + "step": 4084 + }, + { + "epoch": 4.735202492211838, + "grad_norm": 0.3741641640663147, + "learning_rate": 2.6522988505747127e-05, + "loss": 0.1084, + "step": 4085 + }, + { + "epoch": 4.736361660508585, + "grad_norm": 0.35596325993537903, + "learning_rate": 2.651724137931035e-05, + "loss": 0.1069, + "step": 4086 + }, + { + "epoch": 4.737520828805332, + "grad_norm": 0.3338968753814697, + "learning_rate": 2.6511494252873564e-05, + "loss": 0.1055, + "step": 4087 + }, + { + "epoch": 4.738679997102079, + "grad_norm": 0.290626585483551, + "learning_rate": 2.6505747126436782e-05, + "loss": 0.0997, + "step": 4088 + }, + { + "epoch": 4.739839165398826, + "grad_norm": 0.6464786529541016, + "learning_rate": 2.6500000000000004e-05, + "loss": 0.1179, + "step": 4089 + }, + { + "epoch": 4.740998333695574, + "grad_norm": 0.5353714227676392, + "learning_rate": 2.649425287356322e-05, + "loss": 0.1145, + "step": 4090 + }, + { + "epoch": 4.74215750199232, + "grad_norm": 0.39538970589637756, + "learning_rate": 2.6488505747126437e-05, + "loss": 0.0998, + "step": 4091 + }, + { + "epoch": 4.743316670289068, + "grad_norm": 0.36018985509872437, + "learning_rate": 2.648275862068966e-05, + "loss": 0.1087, + "step": 4092 + }, + { + "epoch": 4.744475838585815, + "grad_norm": 0.4141652286052704, + "learning_rate": 2.6477011494252874e-05, + "loss": 0.1129, + "step": 4093 + }, + { + "epoch": 4.745635006882562, + "grad_norm": 0.4767494201660156, + "learning_rate": 2.6471264367816095e-05, + "loss": 0.116, + "step": 4094 + }, + { + "epoch": 4.746794175179309, + "grad_norm": 0.37823235988616943, + "learning_rate": 2.646551724137931e-05, + "loss": 0.1107, + "step": 4095 + }, + { + "epoch": 4.7479533434760555, + "grad_norm": 0.3563480079174042, + "learning_rate": 2.645977011494253e-05, + "loss": 0.1083, + "step": 4096 + }, + { + "epoch": 4.749112511772803, + "grad_norm": 0.27948468923568726, + "learning_rate": 2.645402298850575e-05, + "loss": 0.1067, + "step": 4097 + }, + { + "epoch": 4.750271680069551, + "grad_norm": 0.311713844537735, + "learning_rate": 2.6448275862068965e-05, + "loss": 0.1077, + "step": 4098 + }, + { + "epoch": 4.751430848366297, + "grad_norm": 0.2752847969532013, + "learning_rate": 2.6442528735632183e-05, + "loss": 0.0956, + "step": 4099 + }, + { + "epoch": 4.752590016663044, + "grad_norm": 0.3526310622692108, + "learning_rate": 2.6436781609195405e-05, + "loss": 0.1152, + "step": 4100 + }, + { + "epoch": 4.7537491849597915, + "grad_norm": 0.38243040442466736, + "learning_rate": 2.643103448275862e-05, + "loss": 0.1043, + "step": 4101 + }, + { + "epoch": 4.754908353256538, + "grad_norm": 0.32026922702789307, + "learning_rate": 2.642528735632184e-05, + "loss": 0.1129, + "step": 4102 + }, + { + "epoch": 4.756067521553286, + "grad_norm": 0.2896019220352173, + "learning_rate": 2.641954022988506e-05, + "loss": 0.0993, + "step": 4103 + }, + { + "epoch": 4.757226689850032, + "grad_norm": 0.33304235339164734, + "learning_rate": 2.6413793103448275e-05, + "loss": 0.1097, + "step": 4104 + }, + { + "epoch": 4.75838585814678, + "grad_norm": 0.2931871712207794, + "learning_rate": 2.6408045977011496e-05, + "loss": 0.0938, + "step": 4105 + }, + { + "epoch": 4.759545026443527, + "grad_norm": 0.39289239048957825, + "learning_rate": 2.6402298850574715e-05, + "loss": 0.116, + "step": 4106 + }, + { + "epoch": 4.760704194740274, + "grad_norm": 0.45948904752731323, + "learning_rate": 2.639655172413793e-05, + "loss": 0.1125, + "step": 4107 + }, + { + "epoch": 4.761863363037021, + "grad_norm": 0.3527555465698242, + "learning_rate": 2.639080459770115e-05, + "loss": 0.0992, + "step": 4108 + }, + { + "epoch": 4.763022531333768, + "grad_norm": 0.3574606478214264, + "learning_rate": 2.638505747126437e-05, + "loss": 0.1097, + "step": 4109 + }, + { + "epoch": 4.764181699630515, + "grad_norm": 0.41160982847213745, + "learning_rate": 2.637931034482759e-05, + "loss": 0.0972, + "step": 4110 + }, + { + "epoch": 4.765340867927263, + "grad_norm": 0.5347371697425842, + "learning_rate": 2.6373563218390806e-05, + "loss": 0.101, + "step": 4111 + }, + { + "epoch": 4.766500036224009, + "grad_norm": 0.3305839002132416, + "learning_rate": 2.6367816091954024e-05, + "loss": 0.0983, + "step": 4112 + }, + { + "epoch": 4.767659204520756, + "grad_norm": 0.4040708541870117, + "learning_rate": 2.6362068965517246e-05, + "loss": 0.1072, + "step": 4113 + }, + { + "epoch": 4.7688183728175035, + "grad_norm": 0.3733755648136139, + "learning_rate": 2.635632183908046e-05, + "loss": 0.0923, + "step": 4114 + }, + { + "epoch": 4.76997754111425, + "grad_norm": 0.4366565942764282, + "learning_rate": 2.6350574712643676e-05, + "loss": 0.0999, + "step": 4115 + }, + { + "epoch": 4.771136709410998, + "grad_norm": 0.3389378786087036, + "learning_rate": 2.63448275862069e-05, + "loss": 0.1016, + "step": 4116 + }, + { + "epoch": 4.772295877707744, + "grad_norm": 0.2860844135284424, + "learning_rate": 2.6339080459770116e-05, + "loss": 0.0976, + "step": 4117 + }, + { + "epoch": 4.773455046004492, + "grad_norm": 0.4258897304534912, + "learning_rate": 2.633333333333333e-05, + "loss": 0.105, + "step": 4118 + }, + { + "epoch": 4.774614214301239, + "grad_norm": 0.36527156829833984, + "learning_rate": 2.6327586206896552e-05, + "loss": 0.1089, + "step": 4119 + }, + { + "epoch": 4.775773382597986, + "grad_norm": 0.4984228312969208, + "learning_rate": 2.632183908045977e-05, + "loss": 0.1178, + "step": 4120 + }, + { + "epoch": 4.776932550894733, + "grad_norm": 0.3134072721004486, + "learning_rate": 2.6316091954022992e-05, + "loss": 0.0994, + "step": 4121 + }, + { + "epoch": 4.7780917191914805, + "grad_norm": 0.28975674510002136, + "learning_rate": 2.6310344827586207e-05, + "loss": 0.112, + "step": 4122 + }, + { + "epoch": 4.779250887488227, + "grad_norm": 0.4431288242340088, + "learning_rate": 2.6304597701149426e-05, + "loss": 0.1108, + "step": 4123 + }, + { + "epoch": 4.780410055784975, + "grad_norm": 0.3906228542327881, + "learning_rate": 2.6298850574712647e-05, + "loss": 0.1187, + "step": 4124 + }, + { + "epoch": 4.781569224081721, + "grad_norm": 0.32078051567077637, + "learning_rate": 2.6293103448275862e-05, + "loss": 0.1081, + "step": 4125 + }, + { + "epoch": 4.782728392378468, + "grad_norm": 0.34437334537506104, + "learning_rate": 2.628735632183908e-05, + "loss": 0.1045, + "step": 4126 + }, + { + "epoch": 4.783887560675216, + "grad_norm": 0.37716683745384216, + "learning_rate": 2.6281609195402302e-05, + "loss": 0.1112, + "step": 4127 + }, + { + "epoch": 4.785046728971962, + "grad_norm": 0.3816121220588684, + "learning_rate": 2.6275862068965517e-05, + "loss": 0.1015, + "step": 4128 + }, + { + "epoch": 4.78620589726871, + "grad_norm": 0.30392956733703613, + "learning_rate": 2.627011494252874e-05, + "loss": 0.096, + "step": 4129 + }, + { + "epoch": 4.7873650655654565, + "grad_norm": 0.37450024485588074, + "learning_rate": 2.6264367816091957e-05, + "loss": 0.1037, + "step": 4130 + }, + { + "epoch": 4.788524233862204, + "grad_norm": 0.3201805055141449, + "learning_rate": 2.6258620689655172e-05, + "loss": 0.0916, + "step": 4131 + }, + { + "epoch": 4.789683402158951, + "grad_norm": 0.35114505887031555, + "learning_rate": 2.6252873563218394e-05, + "loss": 0.0957, + "step": 4132 + }, + { + "epoch": 4.790842570455698, + "grad_norm": 0.3193678855895996, + "learning_rate": 2.6247126436781612e-05, + "loss": 0.0976, + "step": 4133 + }, + { + "epoch": 4.792001738752445, + "grad_norm": 0.3565538227558136, + "learning_rate": 2.6241379310344827e-05, + "loss": 0.0974, + "step": 4134 + }, + { + "epoch": 4.7931609070491925, + "grad_norm": 0.425341933965683, + "learning_rate": 2.623563218390805e-05, + "loss": 0.1083, + "step": 4135 + }, + { + "epoch": 4.794320075345939, + "grad_norm": 0.3331052362918854, + "learning_rate": 2.6229885057471267e-05, + "loss": 0.1, + "step": 4136 + }, + { + "epoch": 4.795479243642687, + "grad_norm": 0.3157724142074585, + "learning_rate": 2.6224137931034482e-05, + "loss": 0.103, + "step": 4137 + }, + { + "epoch": 4.796638411939433, + "grad_norm": 0.3534112572669983, + "learning_rate": 2.6218390804597703e-05, + "loss": 0.1032, + "step": 4138 + }, + { + "epoch": 4.79779758023618, + "grad_norm": 0.2811379134654999, + "learning_rate": 2.6212643678160918e-05, + "loss": 0.0926, + "step": 4139 + }, + { + "epoch": 4.798956748532928, + "grad_norm": 0.3642650246620178, + "learning_rate": 2.620689655172414e-05, + "loss": 0.1109, + "step": 4140 + }, + { + "epoch": 4.800115916829675, + "grad_norm": 0.4630174934864044, + "learning_rate": 2.6201149425287358e-05, + "loss": 0.1029, + "step": 4141 + }, + { + "epoch": 4.801275085126422, + "grad_norm": 0.48048609495162964, + "learning_rate": 2.6195402298850573e-05, + "loss": 0.1111, + "step": 4142 + }, + { + "epoch": 4.8024342534231685, + "grad_norm": 0.47064340114593506, + "learning_rate": 2.6189655172413795e-05, + "loss": 0.1048, + "step": 4143 + }, + { + "epoch": 4.803593421719916, + "grad_norm": 0.30796119570732117, + "learning_rate": 2.6183908045977013e-05, + "loss": 0.1083, + "step": 4144 + }, + { + "epoch": 4.804752590016663, + "grad_norm": 0.325536847114563, + "learning_rate": 2.6178160919540228e-05, + "loss": 0.1034, + "step": 4145 + }, + { + "epoch": 4.80591175831341, + "grad_norm": 0.40425774455070496, + "learning_rate": 2.617241379310345e-05, + "loss": 0.106, + "step": 4146 + }, + { + "epoch": 4.807070926610157, + "grad_norm": 0.33187299966812134, + "learning_rate": 2.6166666666666668e-05, + "loss": 0.1018, + "step": 4147 + }, + { + "epoch": 4.808230094906905, + "grad_norm": 0.35282886028289795, + "learning_rate": 2.616091954022989e-05, + "loss": 0.1052, + "step": 4148 + }, + { + "epoch": 4.809389263203651, + "grad_norm": 0.3740447163581848, + "learning_rate": 2.6155172413793105e-05, + "loss": 0.106, + "step": 4149 + }, + { + "epoch": 4.810548431500399, + "grad_norm": 0.30205562710762024, + "learning_rate": 2.6149425287356323e-05, + "loss": 0.0938, + "step": 4150 + }, + { + "epoch": 4.8117075997971455, + "grad_norm": 0.2860054671764374, + "learning_rate": 2.6143678160919545e-05, + "loss": 0.0934, + "step": 4151 + }, + { + "epoch": 4.812866768093892, + "grad_norm": 0.465196430683136, + "learning_rate": 2.613793103448276e-05, + "loss": 0.098, + "step": 4152 + }, + { + "epoch": 4.81402593639064, + "grad_norm": 0.3192795217037201, + "learning_rate": 2.6132183908045978e-05, + "loss": 0.0972, + "step": 4153 + }, + { + "epoch": 4.815185104687387, + "grad_norm": 0.42049890756607056, + "learning_rate": 2.61264367816092e-05, + "loss": 0.1133, + "step": 4154 + }, + { + "epoch": 4.816344272984134, + "grad_norm": 0.38218989968299866, + "learning_rate": 2.6120689655172414e-05, + "loss": 0.1014, + "step": 4155 + }, + { + "epoch": 4.817503441280881, + "grad_norm": 0.46369317173957825, + "learning_rate": 2.611494252873563e-05, + "loss": 0.1097, + "step": 4156 + }, + { + "epoch": 4.818662609577628, + "grad_norm": 0.3875451982021332, + "learning_rate": 2.6109195402298854e-05, + "loss": 0.1063, + "step": 4157 + }, + { + "epoch": 4.819821777874375, + "grad_norm": 0.582574725151062, + "learning_rate": 2.610344827586207e-05, + "loss": 0.0954, + "step": 4158 + }, + { + "epoch": 4.820980946171122, + "grad_norm": 0.2862607538700104, + "learning_rate": 2.609770114942529e-05, + "loss": 0.0948, + "step": 4159 + }, + { + "epoch": 4.822140114467869, + "grad_norm": 0.47772908210754395, + "learning_rate": 2.6091954022988506e-05, + "loss": 0.0984, + "step": 4160 + }, + { + "epoch": 4.823299282764617, + "grad_norm": 0.4275151789188385, + "learning_rate": 2.6086206896551724e-05, + "loss": 0.103, + "step": 4161 + }, + { + "epoch": 4.824458451061363, + "grad_norm": 0.3913339674472809, + "learning_rate": 2.6080459770114946e-05, + "loss": 0.1175, + "step": 4162 + }, + { + "epoch": 4.825617619358111, + "grad_norm": 0.37874650955200195, + "learning_rate": 2.607471264367816e-05, + "loss": 0.1093, + "step": 4163 + }, + { + "epoch": 4.8267767876548575, + "grad_norm": 0.33609622716903687, + "learning_rate": 2.606896551724138e-05, + "loss": 0.1113, + "step": 4164 + }, + { + "epoch": 4.827935955951605, + "grad_norm": 0.32703521847724915, + "learning_rate": 2.60632183908046e-05, + "loss": 0.1167, + "step": 4165 + }, + { + "epoch": 4.829095124248352, + "grad_norm": 0.45330101251602173, + "learning_rate": 2.6057471264367815e-05, + "loss": 0.0996, + "step": 4166 + }, + { + "epoch": 4.830254292545099, + "grad_norm": 0.3833322823047638, + "learning_rate": 2.6051724137931037e-05, + "loss": 0.1106, + "step": 4167 + }, + { + "epoch": 4.831413460841846, + "grad_norm": 0.416790634393692, + "learning_rate": 2.6045977011494255e-05, + "loss": 0.1077, + "step": 4168 + }, + { + "epoch": 4.832572629138593, + "grad_norm": 0.40256428718566895, + "learning_rate": 2.604022988505747e-05, + "loss": 0.0983, + "step": 4169 + }, + { + "epoch": 4.83373179743534, + "grad_norm": 0.3929414451122284, + "learning_rate": 2.6034482758620692e-05, + "loss": 0.1143, + "step": 4170 + }, + { + "epoch": 4.834890965732087, + "grad_norm": 0.3878033757209778, + "learning_rate": 2.602873563218391e-05, + "loss": 0.0966, + "step": 4171 + }, + { + "epoch": 4.836050134028834, + "grad_norm": 0.40418297052383423, + "learning_rate": 2.6022988505747125e-05, + "loss": 0.1041, + "step": 4172 + }, + { + "epoch": 4.837209302325581, + "grad_norm": 0.3119466006755829, + "learning_rate": 2.6017241379310347e-05, + "loss": 0.104, + "step": 4173 + }, + { + "epoch": 4.838368470622329, + "grad_norm": 0.31403571367263794, + "learning_rate": 2.6011494252873565e-05, + "loss": 0.1008, + "step": 4174 + }, + { + "epoch": 4.839527638919075, + "grad_norm": 0.32932233810424805, + "learning_rate": 2.600574712643678e-05, + "loss": 0.1069, + "step": 4175 + }, + { + "epoch": 4.840686807215823, + "grad_norm": 0.3228423595428467, + "learning_rate": 2.6000000000000002e-05, + "loss": 0.1007, + "step": 4176 + }, + { + "epoch": 4.84184597551257, + "grad_norm": 0.3488909900188446, + "learning_rate": 2.599425287356322e-05, + "loss": 0.1019, + "step": 4177 + }, + { + "epoch": 4.843005143809317, + "grad_norm": 0.4107312858104706, + "learning_rate": 2.5988505747126442e-05, + "loss": 0.1235, + "step": 4178 + }, + { + "epoch": 4.844164312106064, + "grad_norm": 0.2800889015197754, + "learning_rate": 2.5982758620689657e-05, + "loss": 0.1017, + "step": 4179 + }, + { + "epoch": 4.845323480402811, + "grad_norm": 0.2643517851829529, + "learning_rate": 2.597701149425287e-05, + "loss": 0.1084, + "step": 4180 + }, + { + "epoch": 4.846482648699558, + "grad_norm": 0.40645238757133484, + "learning_rate": 2.5971264367816097e-05, + "loss": 0.1026, + "step": 4181 + }, + { + "epoch": 4.847641816996305, + "grad_norm": 0.5874878168106079, + "learning_rate": 2.596551724137931e-05, + "loss": 0.0999, + "step": 4182 + }, + { + "epoch": 4.848800985293052, + "grad_norm": 0.3134619891643524, + "learning_rate": 2.5959770114942526e-05, + "loss": 0.1144, + "step": 4183 + }, + { + "epoch": 4.8499601535898, + "grad_norm": 0.33643344044685364, + "learning_rate": 2.5954022988505748e-05, + "loss": 0.1035, + "step": 4184 + }, + { + "epoch": 4.8511193218865465, + "grad_norm": 0.39385950565338135, + "learning_rate": 2.5948275862068966e-05, + "loss": 0.1082, + "step": 4185 + }, + { + "epoch": 4.852278490183293, + "grad_norm": 0.32700175046920776, + "learning_rate": 2.5942528735632188e-05, + "loss": 0.1048, + "step": 4186 + }, + { + "epoch": 4.853437658480041, + "grad_norm": 0.4382705092430115, + "learning_rate": 2.5936781609195403e-05, + "loss": 0.0977, + "step": 4187 + }, + { + "epoch": 4.854596826776787, + "grad_norm": 0.4250156879425049, + "learning_rate": 2.593103448275862e-05, + "loss": 0.1147, + "step": 4188 + }, + { + "epoch": 4.855755995073535, + "grad_norm": 0.29872676730155945, + "learning_rate": 2.5925287356321843e-05, + "loss": 0.0983, + "step": 4189 + }, + { + "epoch": 4.856915163370282, + "grad_norm": 0.33435744047164917, + "learning_rate": 2.5919540229885058e-05, + "loss": 0.099, + "step": 4190 + }, + { + "epoch": 4.858074331667029, + "grad_norm": 0.49045172333717346, + "learning_rate": 2.5913793103448276e-05, + "loss": 0.0921, + "step": 4191 + }, + { + "epoch": 4.859233499963776, + "grad_norm": 0.3408951759338379, + "learning_rate": 2.5908045977011498e-05, + "loss": 0.0991, + "step": 4192 + }, + { + "epoch": 4.860392668260523, + "grad_norm": 0.2995651960372925, + "learning_rate": 2.5902298850574713e-05, + "loss": 0.0979, + "step": 4193 + }, + { + "epoch": 4.86155183655727, + "grad_norm": 0.462162584066391, + "learning_rate": 2.589655172413793e-05, + "loss": 0.1083, + "step": 4194 + }, + { + "epoch": 4.862711004854017, + "grad_norm": 0.4822317659854889, + "learning_rate": 2.5890804597701153e-05, + "loss": 0.0976, + "step": 4195 + }, + { + "epoch": 4.863870173150764, + "grad_norm": 0.34353649616241455, + "learning_rate": 2.5885057471264368e-05, + "loss": 0.1036, + "step": 4196 + }, + { + "epoch": 4.865029341447512, + "grad_norm": 0.32048463821411133, + "learning_rate": 2.587931034482759e-05, + "loss": 0.1164, + "step": 4197 + }, + { + "epoch": 4.8661885097442585, + "grad_norm": 0.37083297967910767, + "learning_rate": 2.5873563218390808e-05, + "loss": 0.107, + "step": 4198 + }, + { + "epoch": 4.867347678041005, + "grad_norm": 0.2568010091781616, + "learning_rate": 2.5867816091954022e-05, + "loss": 0.0924, + "step": 4199 + }, + { + "epoch": 4.868506846337753, + "grad_norm": 0.3095264136791229, + "learning_rate": 2.5862068965517244e-05, + "loss": 0.0984, + "step": 4200 + }, + { + "epoch": 4.869666014634499, + "grad_norm": 0.34810638427734375, + "learning_rate": 2.585632183908046e-05, + "loss": 0.1055, + "step": 4201 + }, + { + "epoch": 4.870825182931247, + "grad_norm": 0.3801257014274597, + "learning_rate": 2.5850574712643677e-05, + "loss": 0.1107, + "step": 4202 + }, + { + "epoch": 4.871984351227994, + "grad_norm": 0.31729915738105774, + "learning_rate": 2.58448275862069e-05, + "loss": 0.1158, + "step": 4203 + }, + { + "epoch": 4.873143519524741, + "grad_norm": 0.4292013347148895, + "learning_rate": 2.5839080459770114e-05, + "loss": 0.1062, + "step": 4204 + }, + { + "epoch": 4.874302687821488, + "grad_norm": 0.35838496685028076, + "learning_rate": 2.5833333333333336e-05, + "loss": 0.1062, + "step": 4205 + }, + { + "epoch": 4.8754618561182355, + "grad_norm": 0.479670912027359, + "learning_rate": 2.5827586206896554e-05, + "loss": 0.0975, + "step": 4206 + }, + { + "epoch": 4.876621024414982, + "grad_norm": 0.3806205689907074, + "learning_rate": 2.582183908045977e-05, + "loss": 0.1128, + "step": 4207 + }, + { + "epoch": 4.87778019271173, + "grad_norm": 0.2937345802783966, + "learning_rate": 2.581609195402299e-05, + "loss": 0.1024, + "step": 4208 + }, + { + "epoch": 4.878939361008476, + "grad_norm": 0.28283917903900146, + "learning_rate": 2.581034482758621e-05, + "loss": 0.0977, + "step": 4209 + }, + { + "epoch": 4.880098529305224, + "grad_norm": 0.4149821400642395, + "learning_rate": 2.5804597701149424e-05, + "loss": 0.1125, + "step": 4210 + }, + { + "epoch": 4.881257697601971, + "grad_norm": 0.41813716292381287, + "learning_rate": 2.5798850574712645e-05, + "loss": 0.1096, + "step": 4211 + }, + { + "epoch": 4.882416865898717, + "grad_norm": 0.40485164523124695, + "learning_rate": 2.5793103448275864e-05, + "loss": 0.1119, + "step": 4212 + }, + { + "epoch": 4.883576034195465, + "grad_norm": 0.36861032247543335, + "learning_rate": 2.578735632183908e-05, + "loss": 0.0978, + "step": 4213 + }, + { + "epoch": 4.8847352024922115, + "grad_norm": 0.4352358281612396, + "learning_rate": 2.57816091954023e-05, + "loss": 0.1026, + "step": 4214 + }, + { + "epoch": 4.885894370788959, + "grad_norm": 0.3257162272930145, + "learning_rate": 2.577586206896552e-05, + "loss": 0.1098, + "step": 4215 + }, + { + "epoch": 4.887053539085706, + "grad_norm": 0.3483160734176636, + "learning_rate": 2.577011494252874e-05, + "loss": 0.1115, + "step": 4216 + }, + { + "epoch": 4.888212707382453, + "grad_norm": 0.30593305826187134, + "learning_rate": 2.5764367816091955e-05, + "loss": 0.1081, + "step": 4217 + }, + { + "epoch": 4.8893718756792, + "grad_norm": 0.30475834012031555, + "learning_rate": 2.5758620689655173e-05, + "loss": 0.0919, + "step": 4218 + }, + { + "epoch": 4.8905310439759475, + "grad_norm": 0.296648770570755, + "learning_rate": 2.5752873563218395e-05, + "loss": 0.0985, + "step": 4219 + }, + { + "epoch": 4.891690212272694, + "grad_norm": 0.3274528980255127, + "learning_rate": 2.574712643678161e-05, + "loss": 0.1018, + "step": 4220 + }, + { + "epoch": 4.892849380569442, + "grad_norm": 0.3239176571369171, + "learning_rate": 2.5741379310344825e-05, + "loss": 0.1053, + "step": 4221 + }, + { + "epoch": 4.894008548866188, + "grad_norm": 0.3093695342540741, + "learning_rate": 2.573563218390805e-05, + "loss": 0.1006, + "step": 4222 + }, + { + "epoch": 4.895167717162936, + "grad_norm": 0.29413002729415894, + "learning_rate": 2.5729885057471265e-05, + "loss": 0.1035, + "step": 4223 + }, + { + "epoch": 4.896326885459683, + "grad_norm": 0.3807078003883362, + "learning_rate": 2.5724137931034486e-05, + "loss": 0.1111, + "step": 4224 + }, + { + "epoch": 4.897486053756429, + "grad_norm": 0.357345312833786, + "learning_rate": 2.57183908045977e-05, + "loss": 0.1102, + "step": 4225 + }, + { + "epoch": 4.898645222053177, + "grad_norm": 0.45739462971687317, + "learning_rate": 2.571264367816092e-05, + "loss": 0.0942, + "step": 4226 + }, + { + "epoch": 4.8998043903499235, + "grad_norm": 0.325125515460968, + "learning_rate": 2.570689655172414e-05, + "loss": 0.0932, + "step": 4227 + }, + { + "epoch": 4.900963558646671, + "grad_norm": 0.39498814940452576, + "learning_rate": 2.5701149425287356e-05, + "loss": 0.1001, + "step": 4228 + }, + { + "epoch": 4.902122726943418, + "grad_norm": 0.3661857843399048, + "learning_rate": 2.5695402298850575e-05, + "loss": 0.1013, + "step": 4229 + }, + { + "epoch": 4.903281895240165, + "grad_norm": 0.4122169315814972, + "learning_rate": 2.5689655172413796e-05, + "loss": 0.1106, + "step": 4230 + }, + { + "epoch": 4.904441063536912, + "grad_norm": 0.6558185815811157, + "learning_rate": 2.568390804597701e-05, + "loss": 0.1162, + "step": 4231 + }, + { + "epoch": 4.90560023183366, + "grad_norm": 0.2946203052997589, + "learning_rate": 2.567816091954023e-05, + "loss": 0.1005, + "step": 4232 + }, + { + "epoch": 4.906759400130406, + "grad_norm": 0.3813852071762085, + "learning_rate": 2.567241379310345e-05, + "loss": 0.1039, + "step": 4233 + }, + { + "epoch": 4.907918568427154, + "grad_norm": 0.33533862233161926, + "learning_rate": 2.5666666666666666e-05, + "loss": 0.0955, + "step": 4234 + }, + { + "epoch": 4.9090777367239005, + "grad_norm": 0.3679860830307007, + "learning_rate": 2.5660919540229888e-05, + "loss": 0.1084, + "step": 4235 + }, + { + "epoch": 4.910236905020648, + "grad_norm": 0.32838982343673706, + "learning_rate": 2.5655172413793106e-05, + "loss": 0.1006, + "step": 4236 + }, + { + "epoch": 4.911396073317395, + "grad_norm": 0.40485119819641113, + "learning_rate": 2.564942528735632e-05, + "loss": 0.1078, + "step": 4237 + }, + { + "epoch": 4.912555241614141, + "grad_norm": 0.43411320447921753, + "learning_rate": 2.5643678160919543e-05, + "loss": 0.1015, + "step": 4238 + }, + { + "epoch": 4.913714409910889, + "grad_norm": 0.622022807598114, + "learning_rate": 2.563793103448276e-05, + "loss": 0.1006, + "step": 4239 + }, + { + "epoch": 4.9148735782076365, + "grad_norm": 0.37312477827072144, + "learning_rate": 2.5632183908045976e-05, + "loss": 0.0984, + "step": 4240 + }, + { + "epoch": 4.916032746504383, + "grad_norm": 0.4039802849292755, + "learning_rate": 2.5626436781609197e-05, + "loss": 0.1012, + "step": 4241 + }, + { + "epoch": 4.91719191480113, + "grad_norm": 0.28577470779418945, + "learning_rate": 2.5620689655172416e-05, + "loss": 0.1062, + "step": 4242 + }, + { + "epoch": 4.918351083097877, + "grad_norm": 0.4363808333873749, + "learning_rate": 2.5614942528735637e-05, + "loss": 0.1079, + "step": 4243 + }, + { + "epoch": 4.919510251394624, + "grad_norm": 0.31524115800857544, + "learning_rate": 2.5609195402298852e-05, + "loss": 0.1027, + "step": 4244 + }, + { + "epoch": 4.920669419691372, + "grad_norm": 0.324258416891098, + "learning_rate": 2.5603448275862067e-05, + "loss": 0.1011, + "step": 4245 + }, + { + "epoch": 4.921828587988118, + "grad_norm": 0.3237408697605133, + "learning_rate": 2.559770114942529e-05, + "loss": 0.0999, + "step": 4246 + }, + { + "epoch": 4.922987756284866, + "grad_norm": 0.3803741931915283, + "learning_rate": 2.5591954022988507e-05, + "loss": 0.1069, + "step": 4247 + }, + { + "epoch": 4.9241469245816125, + "grad_norm": 0.36005181074142456, + "learning_rate": 2.5586206896551722e-05, + "loss": 0.1102, + "step": 4248 + }, + { + "epoch": 4.92530609287836, + "grad_norm": 0.3981182873249054, + "learning_rate": 2.5580459770114944e-05, + "loss": 0.1076, + "step": 4249 + }, + { + "epoch": 4.926465261175107, + "grad_norm": 0.37391379475593567, + "learning_rate": 2.5574712643678162e-05, + "loss": 0.0985, + "step": 4250 + }, + { + "epoch": 4.927624429471854, + "grad_norm": 0.38138946890830994, + "learning_rate": 2.5568965517241377e-05, + "loss": 0.0987, + "step": 4251 + }, + { + "epoch": 4.928783597768601, + "grad_norm": 0.5195475816726685, + "learning_rate": 2.55632183908046e-05, + "loss": 0.0944, + "step": 4252 + }, + { + "epoch": 4.9299427660653485, + "grad_norm": 0.4090215265750885, + "learning_rate": 2.5557471264367817e-05, + "loss": 0.1082, + "step": 4253 + }, + { + "epoch": 4.931101934362095, + "grad_norm": 0.2851259410381317, + "learning_rate": 2.555172413793104e-05, + "loss": 0.0993, + "step": 4254 + }, + { + "epoch": 4.932261102658842, + "grad_norm": 0.37689948081970215, + "learning_rate": 2.5545977011494253e-05, + "loss": 0.108, + "step": 4255 + }, + { + "epoch": 4.933420270955589, + "grad_norm": 0.3453246057033539, + "learning_rate": 2.5540229885057472e-05, + "loss": 0.1001, + "step": 4256 + }, + { + "epoch": 4.934579439252336, + "grad_norm": 0.4019702970981598, + "learning_rate": 2.5534482758620693e-05, + "loss": 0.1084, + "step": 4257 + }, + { + "epoch": 4.935738607549084, + "grad_norm": 0.30191949009895325, + "learning_rate": 2.552873563218391e-05, + "loss": 0.0991, + "step": 4258 + }, + { + "epoch": 4.93689777584583, + "grad_norm": 0.34570759534835815, + "learning_rate": 2.5522988505747127e-05, + "loss": 0.0976, + "step": 4259 + }, + { + "epoch": 4.938056944142578, + "grad_norm": 0.32790854573249817, + "learning_rate": 2.551724137931035e-05, + "loss": 0.088, + "step": 4260 + }, + { + "epoch": 4.939216112439325, + "grad_norm": 0.32591426372528076, + "learning_rate": 2.5511494252873563e-05, + "loss": 0.0989, + "step": 4261 + }, + { + "epoch": 4.940375280736072, + "grad_norm": 0.3843526542186737, + "learning_rate": 2.5505747126436785e-05, + "loss": 0.103, + "step": 4262 + }, + { + "epoch": 4.941534449032819, + "grad_norm": 0.38071221113204956, + "learning_rate": 2.5500000000000003e-05, + "loss": 0.1045, + "step": 4263 + }, + { + "epoch": 4.942693617329566, + "grad_norm": 0.3825156092643738, + "learning_rate": 2.5494252873563218e-05, + "loss": 0.0969, + "step": 4264 + }, + { + "epoch": 4.943852785626313, + "grad_norm": 0.3230441212654114, + "learning_rate": 2.548850574712644e-05, + "loss": 0.1042, + "step": 4265 + }, + { + "epoch": 4.945011953923061, + "grad_norm": 0.35447943210601807, + "learning_rate": 2.5482758620689655e-05, + "loss": 0.1133, + "step": 4266 + }, + { + "epoch": 4.946171122219807, + "grad_norm": 0.2828787863254547, + "learning_rate": 2.5477011494252873e-05, + "loss": 0.0917, + "step": 4267 + }, + { + "epoch": 4.947330290516554, + "grad_norm": 0.2748185694217682, + "learning_rate": 2.5471264367816095e-05, + "loss": 0.0973, + "step": 4268 + }, + { + "epoch": 4.9484894588133015, + "grad_norm": 0.321551114320755, + "learning_rate": 2.546551724137931e-05, + "loss": 0.0973, + "step": 4269 + }, + { + "epoch": 4.949648627110048, + "grad_norm": 0.3038749396800995, + "learning_rate": 2.545977011494253e-05, + "loss": 0.0888, + "step": 4270 + }, + { + "epoch": 4.950807795406796, + "grad_norm": 0.4207051396369934, + "learning_rate": 2.545402298850575e-05, + "loss": 0.0991, + "step": 4271 + }, + { + "epoch": 4.951966963703542, + "grad_norm": 0.44856876134872437, + "learning_rate": 2.5448275862068964e-05, + "loss": 0.1067, + "step": 4272 + }, + { + "epoch": 4.95312613200029, + "grad_norm": 0.3357430398464203, + "learning_rate": 2.5442528735632186e-05, + "loss": 0.0877, + "step": 4273 + }, + { + "epoch": 4.954285300297037, + "grad_norm": 0.33568447828292847, + "learning_rate": 2.5436781609195404e-05, + "loss": 0.0988, + "step": 4274 + }, + { + "epoch": 4.955444468593784, + "grad_norm": 0.3613547384738922, + "learning_rate": 2.543103448275862e-05, + "loss": 0.1052, + "step": 4275 + }, + { + "epoch": 4.956603636890531, + "grad_norm": 0.3529345393180847, + "learning_rate": 2.542528735632184e-05, + "loss": 0.1049, + "step": 4276 + }, + { + "epoch": 4.957762805187278, + "grad_norm": 0.35080334544181824, + "learning_rate": 2.541954022988506e-05, + "loss": 0.1034, + "step": 4277 + }, + { + "epoch": 4.958921973484025, + "grad_norm": 0.68172287940979, + "learning_rate": 2.5413793103448274e-05, + "loss": 0.1033, + "step": 4278 + }, + { + "epoch": 4.960081141780773, + "grad_norm": 0.3911987543106079, + "learning_rate": 2.5408045977011496e-05, + "loss": 0.1144, + "step": 4279 + }, + { + "epoch": 4.961240310077519, + "grad_norm": 0.3486397862434387, + "learning_rate": 2.5402298850574714e-05, + "loss": 0.1069, + "step": 4280 + }, + { + "epoch": 4.962399478374266, + "grad_norm": 0.4205693006515503, + "learning_rate": 2.5396551724137936e-05, + "loss": 0.1115, + "step": 4281 + }, + { + "epoch": 4.9635586466710135, + "grad_norm": 0.4175969660282135, + "learning_rate": 2.539080459770115e-05, + "loss": 0.106, + "step": 4282 + }, + { + "epoch": 4.964717814967761, + "grad_norm": 0.3387162685394287, + "learning_rate": 2.538505747126437e-05, + "loss": 0.1113, + "step": 4283 + }, + { + "epoch": 4.965876983264508, + "grad_norm": 0.3226720988750458, + "learning_rate": 2.537931034482759e-05, + "loss": 0.0975, + "step": 4284 + }, + { + "epoch": 4.967036151561254, + "grad_norm": 0.3947923183441162, + "learning_rate": 2.5373563218390806e-05, + "loss": 0.0999, + "step": 4285 + }, + { + "epoch": 4.968195319858002, + "grad_norm": 0.40375250577926636, + "learning_rate": 2.536781609195402e-05, + "loss": 0.1104, + "step": 4286 + }, + { + "epoch": 4.969354488154749, + "grad_norm": 0.44277888536453247, + "learning_rate": 2.5362068965517246e-05, + "loss": 0.1052, + "step": 4287 + }, + { + "epoch": 4.970513656451496, + "grad_norm": 0.38605281710624695, + "learning_rate": 2.535632183908046e-05, + "loss": 0.1062, + "step": 4288 + }, + { + "epoch": 4.971672824748243, + "grad_norm": 0.3479140102863312, + "learning_rate": 2.5350574712643682e-05, + "loss": 0.1085, + "step": 4289 + }, + { + "epoch": 4.9728319930449905, + "grad_norm": 0.38351091742515564, + "learning_rate": 2.5344827586206897e-05, + "loss": 0.1046, + "step": 4290 + }, + { + "epoch": 4.973991161341737, + "grad_norm": 0.3506777882575989, + "learning_rate": 2.5339080459770115e-05, + "loss": 0.0964, + "step": 4291 + }, + { + "epoch": 4.975150329638485, + "grad_norm": 0.4924359619617462, + "learning_rate": 2.5333333333333337e-05, + "loss": 0.116, + "step": 4292 + }, + { + "epoch": 4.976309497935231, + "grad_norm": 0.31600138545036316, + "learning_rate": 2.5327586206896552e-05, + "loss": 0.1058, + "step": 4293 + }, + { + "epoch": 4.977468666231979, + "grad_norm": 0.3318912386894226, + "learning_rate": 2.532183908045977e-05, + "loss": 0.0918, + "step": 4294 + }, + { + "epoch": 4.978627834528726, + "grad_norm": 0.361688494682312, + "learning_rate": 2.5316091954022992e-05, + "loss": 0.1061, + "step": 4295 + }, + { + "epoch": 4.979787002825473, + "grad_norm": 0.37395092844963074, + "learning_rate": 2.5310344827586207e-05, + "loss": 0.1067, + "step": 4296 + }, + { + "epoch": 4.98094617112222, + "grad_norm": 0.48089030385017395, + "learning_rate": 2.5304597701149425e-05, + "loss": 0.1015, + "step": 4297 + }, + { + "epoch": 4.9821053394189665, + "grad_norm": 0.305130273103714, + "learning_rate": 2.5298850574712647e-05, + "loss": 0.0966, + "step": 4298 + }, + { + "epoch": 4.983264507715714, + "grad_norm": 0.30088353157043457, + "learning_rate": 2.529310344827586e-05, + "loss": 0.1099, + "step": 4299 + }, + { + "epoch": 4.984423676012461, + "grad_norm": 0.4635865092277527, + "learning_rate": 2.5287356321839083e-05, + "loss": 0.1118, + "step": 4300 + }, + { + "epoch": 4.985582844309208, + "grad_norm": 0.34878647327423096, + "learning_rate": 2.52816091954023e-05, + "loss": 0.1004, + "step": 4301 + }, + { + "epoch": 4.986742012605955, + "grad_norm": 0.31814780831336975, + "learning_rate": 2.5275862068965516e-05, + "loss": 0.1032, + "step": 4302 + }, + { + "epoch": 4.9879011809027025, + "grad_norm": 0.311321884393692, + "learning_rate": 2.5270114942528738e-05, + "loss": 0.0982, + "step": 4303 + }, + { + "epoch": 4.989060349199449, + "grad_norm": 0.3587198853492737, + "learning_rate": 2.5264367816091956e-05, + "loss": 0.0981, + "step": 4304 + }, + { + "epoch": 4.990219517496197, + "grad_norm": 0.33750030398368835, + "learning_rate": 2.525862068965517e-05, + "loss": 0.0996, + "step": 4305 + }, + { + "epoch": 4.991378685792943, + "grad_norm": 0.3702969253063202, + "learning_rate": 2.5252873563218393e-05, + "loss": 0.1126, + "step": 4306 + }, + { + "epoch": 4.992537854089691, + "grad_norm": 0.33090633153915405, + "learning_rate": 2.5247126436781608e-05, + "loss": 0.1072, + "step": 4307 + }, + { + "epoch": 4.993697022386438, + "grad_norm": 0.4805144965648651, + "learning_rate": 2.5241379310344833e-05, + "loss": 0.0977, + "step": 4308 + }, + { + "epoch": 4.994856190683185, + "grad_norm": 0.3834288418292999, + "learning_rate": 2.5235632183908048e-05, + "loss": 0.0978, + "step": 4309 + }, + { + "epoch": 4.996015358979932, + "grad_norm": 0.39906617999076843, + "learning_rate": 2.5229885057471263e-05, + "loss": 0.1049, + "step": 4310 + }, + { + "epoch": 4.9971745272766785, + "grad_norm": 0.2854554355144501, + "learning_rate": 2.5224137931034484e-05, + "loss": 0.1047, + "step": 4311 + }, + { + "epoch": 4.998333695573426, + "grad_norm": 0.37985438108444214, + "learning_rate": 2.5218390804597703e-05, + "loss": 0.101, + "step": 4312 + }, + { + "epoch": 4.999492863870173, + "grad_norm": 0.4384147822856903, + "learning_rate": 2.5212643678160918e-05, + "loss": 0.1011, + "step": 4313 + }, + { + "epoch": 4.999492863870173, + "eval_loss": 0.1395179033279419, + "eval_runtime": 265.5301, + "eval_samples_per_second": 5.777, + "eval_steps_per_second": 5.777, + "step": 4313 + }, + { + "epoch": 5.00065203216692, + "grad_norm": 0.49006325006484985, + "learning_rate": 2.520689655172414e-05, + "loss": 0.1007, + "step": 4314 + }, + { + "epoch": 5.001811200463667, + "grad_norm": 0.2520461082458496, + "learning_rate": 2.5201149425287358e-05, + "loss": 0.0905, + "step": 4315 + }, + { + "epoch": 5.002970368760415, + "grad_norm": 0.2542027235031128, + "learning_rate": 2.5195402298850573e-05, + "loss": 0.0858, + "step": 4316 + }, + { + "epoch": 5.004129537057161, + "grad_norm": 0.3190763592720032, + "learning_rate": 2.5189655172413794e-05, + "loss": 0.0818, + "step": 4317 + }, + { + "epoch": 5.005288705353909, + "grad_norm": 0.2673119008541107, + "learning_rate": 2.5183908045977013e-05, + "loss": 0.0943, + "step": 4318 + }, + { + "epoch": 5.0064478736506555, + "grad_norm": 0.31075116991996765, + "learning_rate": 2.5178160919540234e-05, + "loss": 0.0966, + "step": 4319 + }, + { + "epoch": 5.007607041947403, + "grad_norm": 0.3015635311603546, + "learning_rate": 2.517241379310345e-05, + "loss": 0.0971, + "step": 4320 + }, + { + "epoch": 5.00876621024415, + "grad_norm": 0.43081480264663696, + "learning_rate": 2.5166666666666667e-05, + "loss": 0.0955, + "step": 4321 + }, + { + "epoch": 5.009925378540897, + "grad_norm": 0.3676944971084595, + "learning_rate": 2.516091954022989e-05, + "loss": 0.0966, + "step": 4322 + }, + { + "epoch": 5.011084546837644, + "grad_norm": 0.34852761030197144, + "learning_rate": 2.5155172413793104e-05, + "loss": 0.0881, + "step": 4323 + }, + { + "epoch": 5.0122437151343915, + "grad_norm": 0.3189998269081116, + "learning_rate": 2.5149425287356322e-05, + "loss": 0.0981, + "step": 4324 + }, + { + "epoch": 5.013402883431138, + "grad_norm": 0.5241860151290894, + "learning_rate": 2.5143678160919544e-05, + "loss": 0.0923, + "step": 4325 + }, + { + "epoch": 5.014562051727885, + "grad_norm": 0.4675620198249817, + "learning_rate": 2.513793103448276e-05, + "loss": 0.0899, + "step": 4326 + }, + { + "epoch": 5.015721220024632, + "grad_norm": 0.37763670086860657, + "learning_rate": 2.513218390804598e-05, + "loss": 0.084, + "step": 4327 + }, + { + "epoch": 5.016880388321379, + "grad_norm": 0.4259863495826721, + "learning_rate": 2.51264367816092e-05, + "loss": 0.1091, + "step": 4328 + }, + { + "epoch": 5.018039556618127, + "grad_norm": 0.6199847459793091, + "learning_rate": 2.5120689655172414e-05, + "loss": 0.0982, + "step": 4329 + }, + { + "epoch": 5.019198724914873, + "grad_norm": 0.4678066670894623, + "learning_rate": 2.5114942528735635e-05, + "loss": 0.0968, + "step": 4330 + }, + { + "epoch": 5.020357893211621, + "grad_norm": 0.46299999952316284, + "learning_rate": 2.510919540229885e-05, + "loss": 0.0903, + "step": 4331 + }, + { + "epoch": 5.0215170615083675, + "grad_norm": 0.3906437158584595, + "learning_rate": 2.510344827586207e-05, + "loss": 0.0929, + "step": 4332 + }, + { + "epoch": 5.022676229805115, + "grad_norm": 0.4142506420612335, + "learning_rate": 2.509770114942529e-05, + "loss": 0.1009, + "step": 4333 + }, + { + "epoch": 5.023835398101862, + "grad_norm": 0.3510461151599884, + "learning_rate": 2.5091954022988505e-05, + "loss": 0.0879, + "step": 4334 + }, + { + "epoch": 5.024994566398609, + "grad_norm": 0.3772026598453522, + "learning_rate": 2.5086206896551723e-05, + "loss": 0.0947, + "step": 4335 + }, + { + "epoch": 5.026153734695356, + "grad_norm": 0.3540421426296234, + "learning_rate": 2.5080459770114945e-05, + "loss": 0.0897, + "step": 4336 + }, + { + "epoch": 5.0273129029921035, + "grad_norm": 0.39619699120521545, + "learning_rate": 2.507471264367816e-05, + "loss": 0.0932, + "step": 4337 + }, + { + "epoch": 5.02847207128885, + "grad_norm": 0.4173181354999542, + "learning_rate": 2.5068965517241382e-05, + "loss": 0.0857, + "step": 4338 + }, + { + "epoch": 5.029631239585598, + "grad_norm": 0.4821605682373047, + "learning_rate": 2.50632183908046e-05, + "loss": 0.0936, + "step": 4339 + }, + { + "epoch": 5.030790407882344, + "grad_norm": 0.3579258918762207, + "learning_rate": 2.5057471264367815e-05, + "loss": 0.0945, + "step": 4340 + }, + { + "epoch": 5.031949576179091, + "grad_norm": 0.3280121982097626, + "learning_rate": 2.5051724137931037e-05, + "loss": 0.0958, + "step": 4341 + }, + { + "epoch": 5.033108744475839, + "grad_norm": 0.4641885757446289, + "learning_rate": 2.5045977011494255e-05, + "loss": 0.0982, + "step": 4342 + }, + { + "epoch": 5.034267912772585, + "grad_norm": 0.4874429404735565, + "learning_rate": 2.504022988505747e-05, + "loss": 0.0996, + "step": 4343 + }, + { + "epoch": 5.035427081069333, + "grad_norm": 0.45620620250701904, + "learning_rate": 2.503448275862069e-05, + "loss": 0.0894, + "step": 4344 + }, + { + "epoch": 5.03658624936608, + "grad_norm": 0.42721596360206604, + "learning_rate": 2.502873563218391e-05, + "loss": 0.1, + "step": 4345 + }, + { + "epoch": 5.037745417662827, + "grad_norm": 0.5702162981033325, + "learning_rate": 2.502298850574713e-05, + "loss": 0.0954, + "step": 4346 + }, + { + "epoch": 5.038904585959574, + "grad_norm": 0.2864901125431061, + "learning_rate": 2.5017241379310346e-05, + "loss": 0.0902, + "step": 4347 + }, + { + "epoch": 5.040063754256321, + "grad_norm": 0.31084322929382324, + "learning_rate": 2.5011494252873565e-05, + "loss": 0.0924, + "step": 4348 + }, + { + "epoch": 5.041222922553068, + "grad_norm": 0.5150142312049866, + "learning_rate": 2.5005747126436786e-05, + "loss": 0.0936, + "step": 4349 + }, + { + "epoch": 5.042382090849816, + "grad_norm": 0.572697103023529, + "learning_rate": 2.5e-05, + "loss": 0.0926, + "step": 4350 + }, + { + "epoch": 5.043541259146562, + "grad_norm": 0.4093453288078308, + "learning_rate": 2.499425287356322e-05, + "loss": 0.0913, + "step": 4351 + }, + { + "epoch": 5.04470042744331, + "grad_norm": 0.3991166949272156, + "learning_rate": 2.4988505747126438e-05, + "loss": 0.0965, + "step": 4352 + }, + { + "epoch": 5.0458595957400565, + "grad_norm": 0.6385246515274048, + "learning_rate": 2.4982758620689656e-05, + "loss": 0.0879, + "step": 4353 + }, + { + "epoch": 5.047018764036803, + "grad_norm": 0.4218897223472595, + "learning_rate": 2.4977011494252874e-05, + "loss": 0.0874, + "step": 4354 + }, + { + "epoch": 5.048177932333551, + "grad_norm": 0.3741064667701721, + "learning_rate": 2.4971264367816093e-05, + "loss": 0.0987, + "step": 4355 + }, + { + "epoch": 5.049337100630297, + "grad_norm": 0.4053811728954315, + "learning_rate": 2.496551724137931e-05, + "loss": 0.0972, + "step": 4356 + }, + { + "epoch": 5.050496268927045, + "grad_norm": 0.521132230758667, + "learning_rate": 2.495977011494253e-05, + "loss": 0.0903, + "step": 4357 + }, + { + "epoch": 5.051655437223792, + "grad_norm": 0.3791821002960205, + "learning_rate": 2.4954022988505748e-05, + "loss": 0.0827, + "step": 4358 + }, + { + "epoch": 5.052814605520539, + "grad_norm": 0.5845307111740112, + "learning_rate": 2.494827586206897e-05, + "loss": 0.1025, + "step": 4359 + }, + { + "epoch": 5.053973773817286, + "grad_norm": 0.5024420022964478, + "learning_rate": 2.4942528735632184e-05, + "loss": 0.0903, + "step": 4360 + }, + { + "epoch": 5.055132942114033, + "grad_norm": 0.3732399046421051, + "learning_rate": 2.4936781609195402e-05, + "loss": 0.0925, + "step": 4361 + }, + { + "epoch": 5.05629211041078, + "grad_norm": 0.3285994827747345, + "learning_rate": 2.493103448275862e-05, + "loss": 0.0922, + "step": 4362 + }, + { + "epoch": 5.057451278707528, + "grad_norm": 0.4758022129535675, + "learning_rate": 2.4925287356321842e-05, + "loss": 0.0955, + "step": 4363 + }, + { + "epoch": 5.058610447004274, + "grad_norm": 0.558384358882904, + "learning_rate": 2.4919540229885057e-05, + "loss": 0.0947, + "step": 4364 + }, + { + "epoch": 5.059769615301022, + "grad_norm": 0.7286369800567627, + "learning_rate": 2.4913793103448276e-05, + "loss": 0.0965, + "step": 4365 + }, + { + "epoch": 5.0609287835977685, + "grad_norm": 0.35651078820228577, + "learning_rate": 2.4908045977011497e-05, + "loss": 0.0874, + "step": 4366 + }, + { + "epoch": 5.062087951894516, + "grad_norm": 0.5121673345565796, + "learning_rate": 2.4902298850574716e-05, + "loss": 0.0997, + "step": 4367 + }, + { + "epoch": 5.063247120191263, + "grad_norm": 0.506806492805481, + "learning_rate": 2.489655172413793e-05, + "loss": 0.0869, + "step": 4368 + }, + { + "epoch": 5.0644062884880094, + "grad_norm": 1.1122589111328125, + "learning_rate": 2.4890804597701152e-05, + "loss": 0.1026, + "step": 4369 + }, + { + "epoch": 5.065565456784757, + "grad_norm": 0.42941662669181824, + "learning_rate": 2.488505747126437e-05, + "loss": 0.0868, + "step": 4370 + }, + { + "epoch": 5.066724625081504, + "grad_norm": 0.4219612181186676, + "learning_rate": 2.4879310344827585e-05, + "loss": 0.0924, + "step": 4371 + }, + { + "epoch": 5.067883793378251, + "grad_norm": 0.3930709660053253, + "learning_rate": 2.4873563218390804e-05, + "loss": 0.0868, + "step": 4372 + }, + { + "epoch": 5.069042961674998, + "grad_norm": 0.3940015137195587, + "learning_rate": 2.4867816091954025e-05, + "loss": 0.0815, + "step": 4373 + }, + { + "epoch": 5.0702021299717455, + "grad_norm": 0.496271014213562, + "learning_rate": 2.4862068965517244e-05, + "loss": 0.0987, + "step": 4374 + }, + { + "epoch": 5.071361298268492, + "grad_norm": 0.3629682958126068, + "learning_rate": 2.485632183908046e-05, + "loss": 0.0875, + "step": 4375 + }, + { + "epoch": 5.07252046656524, + "grad_norm": 0.5022166967391968, + "learning_rate": 2.485057471264368e-05, + "loss": 0.0989, + "step": 4376 + }, + { + "epoch": 5.073679634861986, + "grad_norm": 0.7410409450531006, + "learning_rate": 2.48448275862069e-05, + "loss": 0.0991, + "step": 4377 + }, + { + "epoch": 5.074838803158734, + "grad_norm": 0.30620214343070984, + "learning_rate": 2.4839080459770117e-05, + "loss": 0.0834, + "step": 4378 + }, + { + "epoch": 5.075997971455481, + "grad_norm": 0.5494264960289001, + "learning_rate": 2.4833333333333335e-05, + "loss": 0.0888, + "step": 4379 + }, + { + "epoch": 5.077157139752228, + "grad_norm": 0.45978859066963196, + "learning_rate": 2.4827586206896553e-05, + "loss": 0.1035, + "step": 4380 + }, + { + "epoch": 5.078316308048975, + "grad_norm": 0.4293060004711151, + "learning_rate": 2.482183908045977e-05, + "loss": 0.0898, + "step": 4381 + }, + { + "epoch": 5.079475476345722, + "grad_norm": 0.3662157654762268, + "learning_rate": 2.481609195402299e-05, + "loss": 0.1001, + "step": 4382 + }, + { + "epoch": 5.080634644642469, + "grad_norm": 0.3850663900375366, + "learning_rate": 2.4810344827586208e-05, + "loss": 0.0901, + "step": 4383 + }, + { + "epoch": 5.081793812939216, + "grad_norm": 0.32764530181884766, + "learning_rate": 2.4804597701149426e-05, + "loss": 0.0922, + "step": 4384 + }, + { + "epoch": 5.082952981235963, + "grad_norm": 0.8065647482872009, + "learning_rate": 2.4798850574712645e-05, + "loss": 0.0949, + "step": 4385 + }, + { + "epoch": 5.08411214953271, + "grad_norm": 0.4645923376083374, + "learning_rate": 2.4793103448275863e-05, + "loss": 0.0899, + "step": 4386 + }, + { + "epoch": 5.0852713178294575, + "grad_norm": 0.3961159288883209, + "learning_rate": 2.478735632183908e-05, + "loss": 0.0918, + "step": 4387 + }, + { + "epoch": 5.086430486126204, + "grad_norm": 0.3519403636455536, + "learning_rate": 2.47816091954023e-05, + "loss": 0.0942, + "step": 4388 + }, + { + "epoch": 5.087589654422952, + "grad_norm": 0.45820310711860657, + "learning_rate": 2.4775862068965518e-05, + "loss": 0.0977, + "step": 4389 + }, + { + "epoch": 5.088748822719698, + "grad_norm": 0.810590386390686, + "learning_rate": 2.4770114942528736e-05, + "loss": 0.0983, + "step": 4390 + }, + { + "epoch": 5.089907991016446, + "grad_norm": 0.4400874972343445, + "learning_rate": 2.4764367816091954e-05, + "loss": 0.0962, + "step": 4391 + }, + { + "epoch": 5.091067159313193, + "grad_norm": 0.3362007737159729, + "learning_rate": 2.4758620689655173e-05, + "loss": 0.0957, + "step": 4392 + }, + { + "epoch": 5.09222632760994, + "grad_norm": 0.5532455444335938, + "learning_rate": 2.4752873563218394e-05, + "loss": 0.1131, + "step": 4393 + }, + { + "epoch": 5.093385495906687, + "grad_norm": 0.3802591860294342, + "learning_rate": 2.474712643678161e-05, + "loss": 0.091, + "step": 4394 + }, + { + "epoch": 5.094544664203434, + "grad_norm": 0.45833706855773926, + "learning_rate": 2.4741379310344828e-05, + "loss": 0.0942, + "step": 4395 + }, + { + "epoch": 5.095703832500181, + "grad_norm": 0.3089923858642578, + "learning_rate": 2.4735632183908046e-05, + "loss": 0.0918, + "step": 4396 + }, + { + "epoch": 5.096863000796928, + "grad_norm": 0.43163448572158813, + "learning_rate": 2.4729885057471268e-05, + "loss": 0.0963, + "step": 4397 + }, + { + "epoch": 5.098022169093675, + "grad_norm": 0.388334721326828, + "learning_rate": 2.4724137931034483e-05, + "loss": 0.0901, + "step": 4398 + }, + { + "epoch": 5.099181337390422, + "grad_norm": 0.5277796387672424, + "learning_rate": 2.47183908045977e-05, + "loss": 0.0934, + "step": 4399 + }, + { + "epoch": 5.10034050568717, + "grad_norm": 0.42713913321495056, + "learning_rate": 2.4712643678160922e-05, + "loss": 0.1081, + "step": 4400 + }, + { + "epoch": 5.101499673983916, + "grad_norm": 0.4533218741416931, + "learning_rate": 2.470689655172414e-05, + "loss": 0.088, + "step": 4401 + }, + { + "epoch": 5.102658842280664, + "grad_norm": 0.4103233516216278, + "learning_rate": 2.4701149425287356e-05, + "loss": 0.1027, + "step": 4402 + }, + { + "epoch": 5.1038180105774105, + "grad_norm": 0.5257367491722107, + "learning_rate": 2.4695402298850577e-05, + "loss": 0.0954, + "step": 4403 + }, + { + "epoch": 5.104977178874158, + "grad_norm": 0.5127593874931335, + "learning_rate": 2.4689655172413796e-05, + "loss": 0.0903, + "step": 4404 + }, + { + "epoch": 5.106136347170905, + "grad_norm": 0.4334225058555603, + "learning_rate": 2.4683908045977014e-05, + "loss": 0.0976, + "step": 4405 + }, + { + "epoch": 5.107295515467652, + "grad_norm": 0.5956823229789734, + "learning_rate": 2.467816091954023e-05, + "loss": 0.1057, + "step": 4406 + }, + { + "epoch": 5.108454683764399, + "grad_norm": 0.448223352432251, + "learning_rate": 2.467241379310345e-05, + "loss": 0.0955, + "step": 4407 + }, + { + "epoch": 5.1096138520611465, + "grad_norm": 0.5333186984062195, + "learning_rate": 2.466666666666667e-05, + "loss": 0.0998, + "step": 4408 + }, + { + "epoch": 5.110773020357893, + "grad_norm": 0.5879203081130981, + "learning_rate": 2.4660919540229887e-05, + "loss": 0.0954, + "step": 4409 + }, + { + "epoch": 5.111932188654641, + "grad_norm": 0.4079529047012329, + "learning_rate": 2.4655172413793105e-05, + "loss": 0.0925, + "step": 4410 + }, + { + "epoch": 5.113091356951387, + "grad_norm": 0.3445991277694702, + "learning_rate": 2.4649425287356324e-05, + "loss": 0.0894, + "step": 4411 + }, + { + "epoch": 5.114250525248134, + "grad_norm": 0.371430903673172, + "learning_rate": 2.4643678160919542e-05, + "loss": 0.0904, + "step": 4412 + }, + { + "epoch": 5.115409693544882, + "grad_norm": 0.4925328195095062, + "learning_rate": 2.4637931034482757e-05, + "loss": 0.0954, + "step": 4413 + }, + { + "epoch": 5.116568861841628, + "grad_norm": 0.4289555847644806, + "learning_rate": 2.463218390804598e-05, + "loss": 0.0909, + "step": 4414 + }, + { + "epoch": 5.117728030138376, + "grad_norm": 0.6553305983543396, + "learning_rate": 2.4626436781609197e-05, + "loss": 0.1009, + "step": 4415 + }, + { + "epoch": 5.1188871984351225, + "grad_norm": 0.4029340445995331, + "learning_rate": 2.4620689655172415e-05, + "loss": 0.0964, + "step": 4416 + }, + { + "epoch": 5.12004636673187, + "grad_norm": 0.48213991522789, + "learning_rate": 2.4614942528735633e-05, + "loss": 0.099, + "step": 4417 + }, + { + "epoch": 5.121205535028617, + "grad_norm": 0.5332958698272705, + "learning_rate": 2.4609195402298852e-05, + "loss": 0.0959, + "step": 4418 + }, + { + "epoch": 5.122364703325364, + "grad_norm": 0.48427054286003113, + "learning_rate": 2.460344827586207e-05, + "loss": 0.0936, + "step": 4419 + }, + { + "epoch": 5.123523871622111, + "grad_norm": 0.32984548807144165, + "learning_rate": 2.4597701149425288e-05, + "loss": 0.0927, + "step": 4420 + }, + { + "epoch": 5.1246830399188585, + "grad_norm": 0.6007506847381592, + "learning_rate": 2.4591954022988507e-05, + "loss": 0.0944, + "step": 4421 + }, + { + "epoch": 5.125842208215605, + "grad_norm": 0.4519351124763489, + "learning_rate": 2.4586206896551725e-05, + "loss": 0.0904, + "step": 4422 + }, + { + "epoch": 5.127001376512353, + "grad_norm": 0.44407084584236145, + "learning_rate": 2.4580459770114943e-05, + "loss": 0.0938, + "step": 4423 + }, + { + "epoch": 5.128160544809099, + "grad_norm": 0.4065387547016144, + "learning_rate": 2.4574712643678165e-05, + "loss": 0.0949, + "step": 4424 + }, + { + "epoch": 5.129319713105847, + "grad_norm": 0.42463329434394836, + "learning_rate": 2.456896551724138e-05, + "loss": 0.0859, + "step": 4425 + }, + { + "epoch": 5.130478881402594, + "grad_norm": 0.5019070506095886, + "learning_rate": 2.4563218390804598e-05, + "loss": 0.1039, + "step": 4426 + }, + { + "epoch": 5.13163804969934, + "grad_norm": 0.463072806596756, + "learning_rate": 2.4557471264367816e-05, + "loss": 0.1004, + "step": 4427 + }, + { + "epoch": 5.132797217996088, + "grad_norm": 0.32865774631500244, + "learning_rate": 2.4551724137931038e-05, + "loss": 0.0915, + "step": 4428 + }, + { + "epoch": 5.133956386292835, + "grad_norm": 0.26548025012016296, + "learning_rate": 2.4545977011494253e-05, + "loss": 0.0837, + "step": 4429 + }, + { + "epoch": 5.135115554589582, + "grad_norm": 0.3461511731147766, + "learning_rate": 2.454022988505747e-05, + "loss": 0.0882, + "step": 4430 + }, + { + "epoch": 5.136274722886329, + "grad_norm": 0.816858172416687, + "learning_rate": 2.4534482758620693e-05, + "loss": 0.0924, + "step": 4431 + }, + { + "epoch": 5.137433891183076, + "grad_norm": 0.3309417963027954, + "learning_rate": 2.4528735632183908e-05, + "loss": 0.0926, + "step": 4432 + }, + { + "epoch": 5.138593059479823, + "grad_norm": 0.6483179926872253, + "learning_rate": 2.4522988505747126e-05, + "loss": 0.0908, + "step": 4433 + }, + { + "epoch": 5.139752227776571, + "grad_norm": 0.3910187780857086, + "learning_rate": 2.4517241379310348e-05, + "loss": 0.0956, + "step": 4434 + }, + { + "epoch": 5.140911396073317, + "grad_norm": 0.4869095981121063, + "learning_rate": 2.4511494252873566e-05, + "loss": 0.0881, + "step": 4435 + }, + { + "epoch": 5.142070564370065, + "grad_norm": 0.6011160612106323, + "learning_rate": 2.450574712643678e-05, + "loss": 0.1055, + "step": 4436 + }, + { + "epoch": 5.1432297326668115, + "grad_norm": 0.38875725865364075, + "learning_rate": 2.45e-05, + "loss": 0.0919, + "step": 4437 + }, + { + "epoch": 5.144388900963559, + "grad_norm": 0.28228649497032166, + "learning_rate": 2.449425287356322e-05, + "loss": 0.0942, + "step": 4438 + }, + { + "epoch": 5.145548069260306, + "grad_norm": 0.3258858919143677, + "learning_rate": 2.448850574712644e-05, + "loss": 0.0845, + "step": 4439 + }, + { + "epoch": 5.146707237557052, + "grad_norm": 0.3680334985256195, + "learning_rate": 2.4482758620689654e-05, + "loss": 0.0922, + "step": 4440 + }, + { + "epoch": 5.1478664058538, + "grad_norm": 0.38462620973587036, + "learning_rate": 2.4477011494252876e-05, + "loss": 0.1046, + "step": 4441 + }, + { + "epoch": 5.149025574150547, + "grad_norm": 0.48732998967170715, + "learning_rate": 2.4471264367816094e-05, + "loss": 0.1, + "step": 4442 + }, + { + "epoch": 5.150184742447294, + "grad_norm": 0.45189419388771057, + "learning_rate": 2.4465517241379312e-05, + "loss": 0.0962, + "step": 4443 + }, + { + "epoch": 5.151343910744041, + "grad_norm": 0.4464831054210663, + "learning_rate": 2.445977011494253e-05, + "loss": 0.0986, + "step": 4444 + }, + { + "epoch": 5.152503079040788, + "grad_norm": 0.3762010931968689, + "learning_rate": 2.445402298850575e-05, + "loss": 0.1022, + "step": 4445 + }, + { + "epoch": 5.153662247337535, + "grad_norm": 0.3228253424167633, + "learning_rate": 2.4448275862068967e-05, + "loss": 0.0903, + "step": 4446 + }, + { + "epoch": 5.154821415634283, + "grad_norm": 0.40066200494766235, + "learning_rate": 2.4442528735632185e-05, + "loss": 0.103, + "step": 4447 + }, + { + "epoch": 5.155980583931029, + "grad_norm": 0.42761850357055664, + "learning_rate": 2.4436781609195404e-05, + "loss": 0.0986, + "step": 4448 + }, + { + "epoch": 5.157139752227777, + "grad_norm": 0.3788778483867645, + "learning_rate": 2.4431034482758622e-05, + "loss": 0.0954, + "step": 4449 + }, + { + "epoch": 5.1582989205245235, + "grad_norm": 0.35833922028541565, + "learning_rate": 2.442528735632184e-05, + "loss": 0.0897, + "step": 4450 + }, + { + "epoch": 5.159458088821271, + "grad_norm": 0.522954523563385, + "learning_rate": 2.441954022988506e-05, + "loss": 0.0864, + "step": 4451 + }, + { + "epoch": 5.160617257118018, + "grad_norm": 0.43595683574676514, + "learning_rate": 2.4413793103448277e-05, + "loss": 0.0897, + "step": 4452 + }, + { + "epoch": 5.161776425414765, + "grad_norm": 0.4088694155216217, + "learning_rate": 2.4408045977011495e-05, + "loss": 0.1002, + "step": 4453 + }, + { + "epoch": 5.162935593711512, + "grad_norm": 0.4870903491973877, + "learning_rate": 2.4402298850574714e-05, + "loss": 0.0898, + "step": 4454 + }, + { + "epoch": 5.164094762008259, + "grad_norm": 0.3365190029144287, + "learning_rate": 2.4396551724137932e-05, + "loss": 0.0888, + "step": 4455 + }, + { + "epoch": 5.165253930305006, + "grad_norm": 0.5863637328147888, + "learning_rate": 2.439080459770115e-05, + "loss": 0.1003, + "step": 4456 + }, + { + "epoch": 5.166413098601753, + "grad_norm": 0.3489425778388977, + "learning_rate": 2.438505747126437e-05, + "loss": 0.0891, + "step": 4457 + }, + { + "epoch": 5.1675722668985005, + "grad_norm": 0.42141303420066833, + "learning_rate": 2.4379310344827587e-05, + "loss": 0.0868, + "step": 4458 + }, + { + "epoch": 5.168731435195247, + "grad_norm": 0.5264760255813599, + "learning_rate": 2.4373563218390805e-05, + "loss": 0.1003, + "step": 4459 + }, + { + "epoch": 5.169890603491995, + "grad_norm": 0.4587344825267792, + "learning_rate": 2.4367816091954023e-05, + "loss": 0.0844, + "step": 4460 + }, + { + "epoch": 5.171049771788741, + "grad_norm": 0.5122568607330322, + "learning_rate": 2.436206896551724e-05, + "loss": 0.0842, + "step": 4461 + }, + { + "epoch": 5.172208940085489, + "grad_norm": 0.38229185342788696, + "learning_rate": 2.4356321839080463e-05, + "loss": 0.0903, + "step": 4462 + }, + { + "epoch": 5.173368108382236, + "grad_norm": 0.5392225980758667, + "learning_rate": 2.4350574712643678e-05, + "loss": 0.0971, + "step": 4463 + }, + { + "epoch": 5.174527276678983, + "grad_norm": 0.4981425106525421, + "learning_rate": 2.4344827586206896e-05, + "loss": 0.0872, + "step": 4464 + }, + { + "epoch": 5.17568644497573, + "grad_norm": 0.4396236538887024, + "learning_rate": 2.4339080459770118e-05, + "loss": 0.0926, + "step": 4465 + }, + { + "epoch": 5.176845613272477, + "grad_norm": 0.42154166102409363, + "learning_rate": 2.4333333333333336e-05, + "loss": 0.0882, + "step": 4466 + }, + { + "epoch": 5.178004781569224, + "grad_norm": 0.4124399721622467, + "learning_rate": 2.432758620689655e-05, + "loss": 0.092, + "step": 4467 + }, + { + "epoch": 5.179163949865972, + "grad_norm": 0.378948450088501, + "learning_rate": 2.432183908045977e-05, + "loss": 0.0951, + "step": 4468 + }, + { + "epoch": 5.180323118162718, + "grad_norm": 0.3714349567890167, + "learning_rate": 2.431609195402299e-05, + "loss": 0.0981, + "step": 4469 + }, + { + "epoch": 5.181482286459465, + "grad_norm": 0.3504725396633148, + "learning_rate": 2.4310344827586206e-05, + "loss": 0.0973, + "step": 4470 + }, + { + "epoch": 5.1826414547562125, + "grad_norm": 0.4103260636329651, + "learning_rate": 2.4304597701149424e-05, + "loss": 0.0957, + "step": 4471 + }, + { + "epoch": 5.183800623052959, + "grad_norm": 0.5066686868667603, + "learning_rate": 2.4298850574712646e-05, + "loss": 0.0986, + "step": 4472 + }, + { + "epoch": 5.184959791349707, + "grad_norm": 0.6200160980224609, + "learning_rate": 2.4293103448275864e-05, + "loss": 0.1011, + "step": 4473 + }, + { + "epoch": 5.186118959646453, + "grad_norm": 0.4090159833431244, + "learning_rate": 2.428735632183908e-05, + "loss": 0.0977, + "step": 4474 + }, + { + "epoch": 5.187278127943201, + "grad_norm": 0.45171108841896057, + "learning_rate": 2.42816091954023e-05, + "loss": 0.0855, + "step": 4475 + }, + { + "epoch": 5.188437296239948, + "grad_norm": 0.3407220244407654, + "learning_rate": 2.427586206896552e-05, + "loss": 0.0917, + "step": 4476 + }, + { + "epoch": 5.189596464536695, + "grad_norm": 0.3243277370929718, + "learning_rate": 2.4270114942528738e-05, + "loss": 0.0855, + "step": 4477 + }, + { + "epoch": 5.190755632833442, + "grad_norm": 0.5616956949234009, + "learning_rate": 2.4264367816091952e-05, + "loss": 0.0855, + "step": 4478 + }, + { + "epoch": 5.191914801130189, + "grad_norm": 0.369253009557724, + "learning_rate": 2.4258620689655174e-05, + "loss": 0.0832, + "step": 4479 + }, + { + "epoch": 5.193073969426936, + "grad_norm": 0.3325676918029785, + "learning_rate": 2.4252873563218392e-05, + "loss": 0.0883, + "step": 4480 + }, + { + "epoch": 5.194233137723684, + "grad_norm": 0.4389870762825012, + "learning_rate": 2.424712643678161e-05, + "loss": 0.1078, + "step": 4481 + }, + { + "epoch": 5.19539230602043, + "grad_norm": 0.3416706919670105, + "learning_rate": 2.424137931034483e-05, + "loss": 0.0885, + "step": 4482 + }, + { + "epoch": 5.196551474317177, + "grad_norm": 0.3958418369293213, + "learning_rate": 2.4235632183908047e-05, + "loss": 0.0959, + "step": 4483 + }, + { + "epoch": 5.197710642613925, + "grad_norm": 0.41249221563339233, + "learning_rate": 2.4229885057471266e-05, + "loss": 0.0972, + "step": 4484 + }, + { + "epoch": 5.198869810910671, + "grad_norm": 0.3895527720451355, + "learning_rate": 2.4224137931034484e-05, + "loss": 0.0847, + "step": 4485 + }, + { + "epoch": 5.200028979207419, + "grad_norm": 0.3692401349544525, + "learning_rate": 2.4218390804597702e-05, + "loss": 0.095, + "step": 4486 + }, + { + "epoch": 5.2011881475041655, + "grad_norm": 0.5421669483184814, + "learning_rate": 2.421264367816092e-05, + "loss": 0.0923, + "step": 4487 + }, + { + "epoch": 5.202347315800913, + "grad_norm": 0.40526431798934937, + "learning_rate": 2.420689655172414e-05, + "loss": 0.0903, + "step": 4488 + }, + { + "epoch": 5.20350648409766, + "grad_norm": 0.35065868496894836, + "learning_rate": 2.420114942528736e-05, + "loss": 0.0945, + "step": 4489 + }, + { + "epoch": 5.204665652394407, + "grad_norm": 0.34505459666252136, + "learning_rate": 2.4195402298850575e-05, + "loss": 0.0952, + "step": 4490 + }, + { + "epoch": 5.205824820691154, + "grad_norm": 0.43336015939712524, + "learning_rate": 2.4189655172413794e-05, + "loss": 0.0989, + "step": 4491 + }, + { + "epoch": 5.2069839889879015, + "grad_norm": 0.3147088885307312, + "learning_rate": 2.4183908045977012e-05, + "loss": 0.089, + "step": 4492 + }, + { + "epoch": 5.208143157284648, + "grad_norm": 0.567955493927002, + "learning_rate": 2.417816091954023e-05, + "loss": 0.088, + "step": 4493 + }, + { + "epoch": 5.209302325581396, + "grad_norm": 0.4204098582267761, + "learning_rate": 2.417241379310345e-05, + "loss": 0.0927, + "step": 4494 + }, + { + "epoch": 5.210461493878142, + "grad_norm": 0.4277987778186798, + "learning_rate": 2.4166666666666667e-05, + "loss": 0.0955, + "step": 4495 + }, + { + "epoch": 5.211620662174889, + "grad_norm": 0.44593873620033264, + "learning_rate": 2.416091954022989e-05, + "loss": 0.1008, + "step": 4496 + }, + { + "epoch": 5.212779830471637, + "grad_norm": 0.39049631357192993, + "learning_rate": 2.4155172413793103e-05, + "loss": 0.0957, + "step": 4497 + }, + { + "epoch": 5.213938998768383, + "grad_norm": 0.3604249954223633, + "learning_rate": 2.414942528735632e-05, + "loss": 0.1004, + "step": 4498 + }, + { + "epoch": 5.215098167065131, + "grad_norm": 0.3670724630355835, + "learning_rate": 2.4143678160919543e-05, + "loss": 0.0904, + "step": 4499 + }, + { + "epoch": 5.2162573353618775, + "grad_norm": 0.34518417716026306, + "learning_rate": 2.413793103448276e-05, + "loss": 0.0893, + "step": 4500 + }, + { + "epoch": 5.217416503658625, + "grad_norm": 0.39651983976364136, + "learning_rate": 2.4132183908045977e-05, + "loss": 0.0948, + "step": 4501 + }, + { + "epoch": 5.218575671955372, + "grad_norm": 0.4444870948791504, + "learning_rate": 2.4126436781609195e-05, + "loss": 0.0863, + "step": 4502 + }, + { + "epoch": 5.219734840252119, + "grad_norm": 0.5257106423377991, + "learning_rate": 2.4120689655172417e-05, + "loss": 0.0919, + "step": 4503 + }, + { + "epoch": 5.220894008548866, + "grad_norm": 0.45813608169555664, + "learning_rate": 2.4114942528735635e-05, + "loss": 0.1027, + "step": 4504 + }, + { + "epoch": 5.2220531768456135, + "grad_norm": 0.4071040749549866, + "learning_rate": 2.410919540229885e-05, + "loss": 0.0958, + "step": 4505 + }, + { + "epoch": 5.22321234514236, + "grad_norm": 0.4606774151325226, + "learning_rate": 2.410344827586207e-05, + "loss": 0.0926, + "step": 4506 + }, + { + "epoch": 5.224371513439108, + "grad_norm": 0.47253385186195374, + "learning_rate": 2.409770114942529e-05, + "loss": 0.1084, + "step": 4507 + }, + { + "epoch": 5.225530681735854, + "grad_norm": 0.4298096299171448, + "learning_rate": 2.4091954022988508e-05, + "loss": 0.0866, + "step": 4508 + }, + { + "epoch": 5.226689850032602, + "grad_norm": 0.843309760093689, + "learning_rate": 2.4086206896551726e-05, + "loss": 0.104, + "step": 4509 + }, + { + "epoch": 5.227849018329349, + "grad_norm": 0.5684033632278442, + "learning_rate": 2.4080459770114945e-05, + "loss": 0.0958, + "step": 4510 + }, + { + "epoch": 5.229008186626096, + "grad_norm": 0.6145815849304199, + "learning_rate": 2.4074712643678163e-05, + "loss": 0.0957, + "step": 4511 + }, + { + "epoch": 5.230167354922843, + "grad_norm": 0.5087504386901855, + "learning_rate": 2.4068965517241378e-05, + "loss": 0.0846, + "step": 4512 + }, + { + "epoch": 5.23132652321959, + "grad_norm": 0.49333977699279785, + "learning_rate": 2.40632183908046e-05, + "loss": 0.088, + "step": 4513 + }, + { + "epoch": 5.232485691516337, + "grad_norm": 0.29817384481430054, + "learning_rate": 2.4057471264367818e-05, + "loss": 0.086, + "step": 4514 + }, + { + "epoch": 5.233644859813084, + "grad_norm": 0.3775709867477417, + "learning_rate": 2.4051724137931036e-05, + "loss": 0.0947, + "step": 4515 + }, + { + "epoch": 5.234804028109831, + "grad_norm": 0.31703516840934753, + "learning_rate": 2.4045977011494254e-05, + "loss": 0.0883, + "step": 4516 + }, + { + "epoch": 5.235963196406578, + "grad_norm": 0.42791876196861267, + "learning_rate": 2.4040229885057473e-05, + "loss": 0.0933, + "step": 4517 + }, + { + "epoch": 5.237122364703326, + "grad_norm": 0.5185449123382568, + "learning_rate": 2.403448275862069e-05, + "loss": 0.1011, + "step": 4518 + }, + { + "epoch": 5.238281533000072, + "grad_norm": 0.4122971296310425, + "learning_rate": 2.402873563218391e-05, + "loss": 0.1013, + "step": 4519 + }, + { + "epoch": 5.23944070129682, + "grad_norm": 0.3461478054523468, + "learning_rate": 2.4022988505747127e-05, + "loss": 0.0909, + "step": 4520 + }, + { + "epoch": 5.2405998695935665, + "grad_norm": 0.38688674569129944, + "learning_rate": 2.4017241379310346e-05, + "loss": 0.0908, + "step": 4521 + }, + { + "epoch": 5.241759037890314, + "grad_norm": 0.3087342381477356, + "learning_rate": 2.4011494252873564e-05, + "loss": 0.0818, + "step": 4522 + }, + { + "epoch": 5.242918206187061, + "grad_norm": 0.46722832322120667, + "learning_rate": 2.4005747126436782e-05, + "loss": 0.09, + "step": 4523 + }, + { + "epoch": 5.244077374483808, + "grad_norm": 0.4601225256919861, + "learning_rate": 2.4e-05, + "loss": 0.1053, + "step": 4524 + }, + { + "epoch": 5.245236542780555, + "grad_norm": 0.3325468599796295, + "learning_rate": 2.399425287356322e-05, + "loss": 0.0875, + "step": 4525 + }, + { + "epoch": 5.246395711077302, + "grad_norm": 0.4317132830619812, + "learning_rate": 2.3988505747126437e-05, + "loss": 0.0961, + "step": 4526 + }, + { + "epoch": 5.247554879374049, + "grad_norm": 0.3786173462867737, + "learning_rate": 2.398275862068966e-05, + "loss": 0.088, + "step": 4527 + }, + { + "epoch": 5.248714047670796, + "grad_norm": 0.34360384941101074, + "learning_rate": 2.3977011494252874e-05, + "loss": 0.0882, + "step": 4528 + }, + { + "epoch": 5.249873215967543, + "grad_norm": 0.3245190382003784, + "learning_rate": 2.3971264367816092e-05, + "loss": 0.0888, + "step": 4529 + }, + { + "epoch": 5.25103238426429, + "grad_norm": 0.3508089780807495, + "learning_rate": 2.3965517241379314e-05, + "loss": 0.0838, + "step": 4530 + }, + { + "epoch": 5.252191552561038, + "grad_norm": 0.7918804287910461, + "learning_rate": 2.395977011494253e-05, + "loss": 0.1023, + "step": 4531 + }, + { + "epoch": 5.253350720857784, + "grad_norm": 0.5230761766433716, + "learning_rate": 2.3954022988505747e-05, + "loss": 0.0951, + "step": 4532 + }, + { + "epoch": 5.254509889154532, + "grad_norm": 0.3835771381855011, + "learning_rate": 2.3948275862068965e-05, + "loss": 0.0938, + "step": 4533 + }, + { + "epoch": 5.2556690574512785, + "grad_norm": 0.28895124793052673, + "learning_rate": 2.3942528735632187e-05, + "loss": 0.081, + "step": 4534 + }, + { + "epoch": 5.256828225748026, + "grad_norm": 0.36572563648223877, + "learning_rate": 2.3936781609195402e-05, + "loss": 0.0896, + "step": 4535 + }, + { + "epoch": 5.257987394044773, + "grad_norm": 0.4853745996952057, + "learning_rate": 2.393103448275862e-05, + "loss": 0.0952, + "step": 4536 + }, + { + "epoch": 5.25914656234152, + "grad_norm": 0.37866243720054626, + "learning_rate": 2.3925287356321842e-05, + "loss": 0.0936, + "step": 4537 + }, + { + "epoch": 5.260305730638267, + "grad_norm": 0.42627018690109253, + "learning_rate": 2.391954022988506e-05, + "loss": 0.0868, + "step": 4538 + }, + { + "epoch": 5.261464898935014, + "grad_norm": 0.3555832803249359, + "learning_rate": 2.3913793103448275e-05, + "loss": 0.0904, + "step": 4539 + }, + { + "epoch": 5.262624067231761, + "grad_norm": 0.4143363833427429, + "learning_rate": 2.3908045977011497e-05, + "loss": 0.0939, + "step": 4540 + }, + { + "epoch": 5.263783235528508, + "grad_norm": 0.3686874806880951, + "learning_rate": 2.3902298850574715e-05, + "loss": 0.0899, + "step": 4541 + }, + { + "epoch": 5.2649424038252555, + "grad_norm": 0.34536075592041016, + "learning_rate": 2.3896551724137933e-05, + "loss": 0.0869, + "step": 4542 + }, + { + "epoch": 5.266101572122002, + "grad_norm": 0.6285473704338074, + "learning_rate": 2.3890804597701148e-05, + "loss": 0.1043, + "step": 4543 + }, + { + "epoch": 5.26726074041875, + "grad_norm": 0.4580352306365967, + "learning_rate": 2.388505747126437e-05, + "loss": 0.0931, + "step": 4544 + }, + { + "epoch": 5.268419908715496, + "grad_norm": 0.3974003493785858, + "learning_rate": 2.3879310344827588e-05, + "loss": 0.0933, + "step": 4545 + }, + { + "epoch": 5.269579077012244, + "grad_norm": 0.4980361759662628, + "learning_rate": 2.3873563218390806e-05, + "loss": 0.095, + "step": 4546 + }, + { + "epoch": 5.270738245308991, + "grad_norm": 0.3423739969730377, + "learning_rate": 2.3867816091954025e-05, + "loss": 0.0922, + "step": 4547 + }, + { + "epoch": 5.271897413605738, + "grad_norm": 0.47561824321746826, + "learning_rate": 2.3862068965517243e-05, + "loss": 0.0896, + "step": 4548 + }, + { + "epoch": 5.273056581902485, + "grad_norm": 0.36675018072128296, + "learning_rate": 2.385632183908046e-05, + "loss": 0.0984, + "step": 4549 + }, + { + "epoch": 5.274215750199232, + "grad_norm": 0.507520854473114, + "learning_rate": 2.385057471264368e-05, + "loss": 0.092, + "step": 4550 + }, + { + "epoch": 5.275374918495979, + "grad_norm": 0.43633562326431274, + "learning_rate": 2.3844827586206898e-05, + "loss": 0.0979, + "step": 4551 + }, + { + "epoch": 5.276534086792727, + "grad_norm": 0.41792401671409607, + "learning_rate": 2.3839080459770116e-05, + "loss": 0.0897, + "step": 4552 + }, + { + "epoch": 5.277693255089473, + "grad_norm": 0.44738197326660156, + "learning_rate": 2.3833333333333334e-05, + "loss": 0.0915, + "step": 4553 + }, + { + "epoch": 5.278852423386221, + "grad_norm": 0.49772951006889343, + "learning_rate": 2.3827586206896553e-05, + "loss": 0.0982, + "step": 4554 + }, + { + "epoch": 5.2800115916829675, + "grad_norm": 0.5207464098930359, + "learning_rate": 2.382183908045977e-05, + "loss": 0.0929, + "step": 4555 + }, + { + "epoch": 5.281170759979714, + "grad_norm": 0.8150321841239929, + "learning_rate": 2.381609195402299e-05, + "loss": 0.1011, + "step": 4556 + }, + { + "epoch": 5.282329928276462, + "grad_norm": 0.4668427109718323, + "learning_rate": 2.3810344827586208e-05, + "loss": 0.0894, + "step": 4557 + }, + { + "epoch": 5.283489096573208, + "grad_norm": 0.42127272486686707, + "learning_rate": 2.3804597701149426e-05, + "loss": 0.0844, + "step": 4558 + }, + { + "epoch": 5.284648264869956, + "grad_norm": 0.6911650896072388, + "learning_rate": 2.3798850574712644e-05, + "loss": 0.1004, + "step": 4559 + }, + { + "epoch": 5.285807433166703, + "grad_norm": 0.37736475467681885, + "learning_rate": 2.3793103448275862e-05, + "loss": 0.0964, + "step": 4560 + }, + { + "epoch": 5.28696660146345, + "grad_norm": 0.4046756327152252, + "learning_rate": 2.3787356321839084e-05, + "loss": 0.0899, + "step": 4561 + }, + { + "epoch": 5.288125769760197, + "grad_norm": 0.3828308582305908, + "learning_rate": 2.37816091954023e-05, + "loss": 0.1046, + "step": 4562 + }, + { + "epoch": 5.289284938056944, + "grad_norm": 0.3368671238422394, + "learning_rate": 2.3775862068965517e-05, + "loss": 0.0895, + "step": 4563 + }, + { + "epoch": 5.290444106353691, + "grad_norm": 0.3442758321762085, + "learning_rate": 2.3770114942528736e-05, + "loss": 0.0895, + "step": 4564 + }, + { + "epoch": 5.291603274650439, + "grad_norm": 0.4719181954860687, + "learning_rate": 2.3764367816091957e-05, + "loss": 0.1, + "step": 4565 + }, + { + "epoch": 5.292762442947185, + "grad_norm": 0.37224331498146057, + "learning_rate": 2.3758620689655172e-05, + "loss": 0.1004, + "step": 4566 + }, + { + "epoch": 5.293921611243933, + "grad_norm": 0.4549705982208252, + "learning_rate": 2.375287356321839e-05, + "loss": 0.0958, + "step": 4567 + }, + { + "epoch": 5.29508077954068, + "grad_norm": 0.3962538242340088, + "learning_rate": 2.3747126436781612e-05, + "loss": 0.0984, + "step": 4568 + }, + { + "epoch": 5.296239947837426, + "grad_norm": 0.42686334252357483, + "learning_rate": 2.374137931034483e-05, + "loss": 0.0917, + "step": 4569 + }, + { + "epoch": 5.297399116134174, + "grad_norm": 0.4089353382587433, + "learning_rate": 2.3735632183908045e-05, + "loss": 0.0938, + "step": 4570 + }, + { + "epoch": 5.2985582844309205, + "grad_norm": 0.43393051624298096, + "learning_rate": 2.3729885057471267e-05, + "loss": 0.0894, + "step": 4571 + }, + { + "epoch": 5.299717452727668, + "grad_norm": 0.4832133650779724, + "learning_rate": 2.3724137931034485e-05, + "loss": 0.1016, + "step": 4572 + }, + { + "epoch": 5.300876621024415, + "grad_norm": 0.4066667854785919, + "learning_rate": 2.37183908045977e-05, + "loss": 0.0954, + "step": 4573 + }, + { + "epoch": 5.302035789321162, + "grad_norm": 0.40719154477119446, + "learning_rate": 2.371264367816092e-05, + "loss": 0.0924, + "step": 4574 + }, + { + "epoch": 5.303194957617909, + "grad_norm": 0.38603299856185913, + "learning_rate": 2.370689655172414e-05, + "loss": 0.0946, + "step": 4575 + }, + { + "epoch": 5.3043541259146565, + "grad_norm": 0.45713695883750916, + "learning_rate": 2.370114942528736e-05, + "loss": 0.0946, + "step": 4576 + }, + { + "epoch": 5.305513294211403, + "grad_norm": 0.43153274059295654, + "learning_rate": 2.3695402298850573e-05, + "loss": 0.0941, + "step": 4577 + }, + { + "epoch": 5.306672462508151, + "grad_norm": 0.5558722019195557, + "learning_rate": 2.3689655172413795e-05, + "loss": 0.0991, + "step": 4578 + }, + { + "epoch": 5.307831630804897, + "grad_norm": 0.3364887833595276, + "learning_rate": 2.3683908045977013e-05, + "loss": 0.0903, + "step": 4579 + }, + { + "epoch": 5.308990799101645, + "grad_norm": 0.39872419834136963, + "learning_rate": 2.367816091954023e-05, + "loss": 0.1015, + "step": 4580 + }, + { + "epoch": 5.310149967398392, + "grad_norm": 0.4545663297176361, + "learning_rate": 2.367241379310345e-05, + "loss": 0.1017, + "step": 4581 + }, + { + "epoch": 5.311309135695138, + "grad_norm": 0.4072364866733551, + "learning_rate": 2.3666666666666668e-05, + "loss": 0.0888, + "step": 4582 + }, + { + "epoch": 5.312468303991886, + "grad_norm": 0.373927503824234, + "learning_rate": 2.3660919540229886e-05, + "loss": 0.0973, + "step": 4583 + }, + { + "epoch": 5.3136274722886325, + "grad_norm": 0.35121285915374756, + "learning_rate": 2.3655172413793105e-05, + "loss": 0.0938, + "step": 4584 + }, + { + "epoch": 5.31478664058538, + "grad_norm": 0.4112243950366974, + "learning_rate": 2.3649425287356323e-05, + "loss": 0.1031, + "step": 4585 + }, + { + "epoch": 5.315945808882127, + "grad_norm": 0.4238256812095642, + "learning_rate": 2.364367816091954e-05, + "loss": 0.0929, + "step": 4586 + }, + { + "epoch": 5.317104977178874, + "grad_norm": 0.531015157699585, + "learning_rate": 2.363793103448276e-05, + "loss": 0.0988, + "step": 4587 + }, + { + "epoch": 5.318264145475621, + "grad_norm": 0.4540468752384186, + "learning_rate": 2.3632183908045978e-05, + "loss": 0.1005, + "step": 4588 + }, + { + "epoch": 5.3194233137723685, + "grad_norm": 0.30282753705978394, + "learning_rate": 2.3626436781609196e-05, + "loss": 0.0845, + "step": 4589 + }, + { + "epoch": 5.320582482069115, + "grad_norm": 0.3676490783691406, + "learning_rate": 2.3620689655172415e-05, + "loss": 0.0846, + "step": 4590 + }, + { + "epoch": 5.321741650365863, + "grad_norm": 0.4564397931098938, + "learning_rate": 2.3614942528735633e-05, + "loss": 0.086, + "step": 4591 + }, + { + "epoch": 5.3229008186626094, + "grad_norm": 0.3975692689418793, + "learning_rate": 2.360919540229885e-05, + "loss": 0.0934, + "step": 4592 + }, + { + "epoch": 5.324059986959357, + "grad_norm": 0.35405778884887695, + "learning_rate": 2.360344827586207e-05, + "loss": 0.0997, + "step": 4593 + }, + { + "epoch": 5.325219155256104, + "grad_norm": 0.3428707718849182, + "learning_rate": 2.3597701149425288e-05, + "loss": 0.0859, + "step": 4594 + }, + { + "epoch": 5.326378323552851, + "grad_norm": 0.41354939341545105, + "learning_rate": 2.359195402298851e-05, + "loss": 0.0982, + "step": 4595 + }, + { + "epoch": 5.327537491849598, + "grad_norm": 0.5002196431159973, + "learning_rate": 2.3586206896551724e-05, + "loss": 0.0968, + "step": 4596 + }, + { + "epoch": 5.3286966601463455, + "grad_norm": 0.40941765904426575, + "learning_rate": 2.3580459770114943e-05, + "loss": 0.0923, + "step": 4597 + }, + { + "epoch": 5.329855828443092, + "grad_norm": 0.37814703583717346, + "learning_rate": 2.357471264367816e-05, + "loss": 0.0916, + "step": 4598 + }, + { + "epoch": 5.331014996739839, + "grad_norm": 0.3758867681026459, + "learning_rate": 2.3568965517241383e-05, + "loss": 0.0851, + "step": 4599 + }, + { + "epoch": 5.332174165036586, + "grad_norm": 0.48879507184028625, + "learning_rate": 2.3563218390804597e-05, + "loss": 0.1009, + "step": 4600 + }, + { + "epoch": 5.333333333333333, + "grad_norm": 0.6215201616287231, + "learning_rate": 2.3557471264367816e-05, + "loss": 0.098, + "step": 4601 + }, + { + "epoch": 5.334492501630081, + "grad_norm": 0.48262956738471985, + "learning_rate": 2.3551724137931037e-05, + "loss": 0.0904, + "step": 4602 + }, + { + "epoch": 5.335651669926827, + "grad_norm": 0.40028294920921326, + "learning_rate": 2.3545977011494256e-05, + "loss": 0.0851, + "step": 4603 + }, + { + "epoch": 5.336810838223575, + "grad_norm": 0.47819679975509644, + "learning_rate": 2.354022988505747e-05, + "loss": 0.096, + "step": 4604 + }, + { + "epoch": 5.3379700065203215, + "grad_norm": 0.36950936913490295, + "learning_rate": 2.3534482758620692e-05, + "loss": 0.0909, + "step": 4605 + }, + { + "epoch": 5.339129174817069, + "grad_norm": 0.42461180686950684, + "learning_rate": 2.352873563218391e-05, + "loss": 0.0992, + "step": 4606 + }, + { + "epoch": 5.340288343113816, + "grad_norm": 0.3869633674621582, + "learning_rate": 2.352298850574713e-05, + "loss": 0.0915, + "step": 4607 + }, + { + "epoch": 5.341447511410563, + "grad_norm": 0.36021578311920166, + "learning_rate": 2.3517241379310344e-05, + "loss": 0.0952, + "step": 4608 + }, + { + "epoch": 5.34260667970731, + "grad_norm": 0.3799790143966675, + "learning_rate": 2.3511494252873565e-05, + "loss": 0.0881, + "step": 4609 + }, + { + "epoch": 5.3437658480040575, + "grad_norm": 0.4257470667362213, + "learning_rate": 2.3505747126436784e-05, + "loss": 0.1011, + "step": 4610 + }, + { + "epoch": 5.344925016300804, + "grad_norm": 0.5010349154472351, + "learning_rate": 2.35e-05, + "loss": 0.0897, + "step": 4611 + }, + { + "epoch": 5.346084184597551, + "grad_norm": 0.3810718357563019, + "learning_rate": 2.349425287356322e-05, + "loss": 0.0969, + "step": 4612 + }, + { + "epoch": 5.347243352894298, + "grad_norm": 0.4444500505924225, + "learning_rate": 2.348850574712644e-05, + "loss": 0.0941, + "step": 4613 + }, + { + "epoch": 5.348402521191045, + "grad_norm": 0.5119409561157227, + "learning_rate": 2.3482758620689657e-05, + "loss": 0.1, + "step": 4614 + }, + { + "epoch": 5.349561689487793, + "grad_norm": 0.31901487708091736, + "learning_rate": 2.3477011494252875e-05, + "loss": 0.0996, + "step": 4615 + }, + { + "epoch": 5.350720857784539, + "grad_norm": 0.33912405371665955, + "learning_rate": 2.3471264367816093e-05, + "loss": 0.0938, + "step": 4616 + }, + { + "epoch": 5.351880026081287, + "grad_norm": 0.4147634208202362, + "learning_rate": 2.3465517241379312e-05, + "loss": 0.0975, + "step": 4617 + }, + { + "epoch": 5.3530391943780335, + "grad_norm": 0.33564382791519165, + "learning_rate": 2.345977011494253e-05, + "loss": 0.0827, + "step": 4618 + }, + { + "epoch": 5.354198362674781, + "grad_norm": 0.47823843359947205, + "learning_rate": 2.345402298850575e-05, + "loss": 0.1004, + "step": 4619 + }, + { + "epoch": 5.355357530971528, + "grad_norm": 0.43068960309028625, + "learning_rate": 2.3448275862068967e-05, + "loss": 0.0898, + "step": 4620 + }, + { + "epoch": 5.356516699268275, + "grad_norm": 0.4093489646911621, + "learning_rate": 2.3442528735632185e-05, + "loss": 0.099, + "step": 4621 + }, + { + "epoch": 5.357675867565022, + "grad_norm": 0.4601415991783142, + "learning_rate": 2.3436781609195403e-05, + "loss": 0.0959, + "step": 4622 + }, + { + "epoch": 5.35883503586177, + "grad_norm": 0.42657336592674255, + "learning_rate": 2.343103448275862e-05, + "loss": 0.0862, + "step": 4623 + }, + { + "epoch": 5.359994204158516, + "grad_norm": 0.41843754053115845, + "learning_rate": 2.342528735632184e-05, + "loss": 0.0896, + "step": 4624 + }, + { + "epoch": 5.361153372455263, + "grad_norm": 0.4212128818035126, + "learning_rate": 2.3419540229885058e-05, + "loss": 0.0981, + "step": 4625 + }, + { + "epoch": 5.3623125407520105, + "grad_norm": 0.3441298305988312, + "learning_rate": 2.341379310344828e-05, + "loss": 0.0958, + "step": 4626 + }, + { + "epoch": 5.363471709048757, + "grad_norm": 0.398131787776947, + "learning_rate": 2.3408045977011495e-05, + "loss": 0.0944, + "step": 4627 + }, + { + "epoch": 5.364630877345505, + "grad_norm": 0.5075594186782837, + "learning_rate": 2.3402298850574713e-05, + "loss": 0.0958, + "step": 4628 + }, + { + "epoch": 5.365790045642251, + "grad_norm": 0.3779784142971039, + "learning_rate": 2.339655172413793e-05, + "loss": 0.1002, + "step": 4629 + }, + { + "epoch": 5.366949213938999, + "grad_norm": 0.39865589141845703, + "learning_rate": 2.339080459770115e-05, + "loss": 0.0853, + "step": 4630 + }, + { + "epoch": 5.368108382235746, + "grad_norm": 0.7066544890403748, + "learning_rate": 2.3385057471264368e-05, + "loss": 0.0903, + "step": 4631 + }, + { + "epoch": 5.369267550532493, + "grad_norm": 0.3441803455352783, + "learning_rate": 2.3379310344827586e-05, + "loss": 0.084, + "step": 4632 + }, + { + "epoch": 5.37042671882924, + "grad_norm": 0.4163109362125397, + "learning_rate": 2.3373563218390808e-05, + "loss": 0.095, + "step": 4633 + }, + { + "epoch": 5.371585887125987, + "grad_norm": 0.43912622332572937, + "learning_rate": 2.3367816091954023e-05, + "loss": 0.0922, + "step": 4634 + }, + { + "epoch": 5.372745055422734, + "grad_norm": 0.304113507270813, + "learning_rate": 2.336206896551724e-05, + "loss": 0.0875, + "step": 4635 + }, + { + "epoch": 5.373904223719482, + "grad_norm": 0.3352859318256378, + "learning_rate": 2.3356321839080463e-05, + "loss": 0.0966, + "step": 4636 + }, + { + "epoch": 5.375063392016228, + "grad_norm": 0.4194989800453186, + "learning_rate": 2.335057471264368e-05, + "loss": 0.1005, + "step": 4637 + }, + { + "epoch": 5.376222560312976, + "grad_norm": 0.3426196873188019, + "learning_rate": 2.3344827586206896e-05, + "loss": 0.0988, + "step": 4638 + }, + { + "epoch": 5.3773817286097225, + "grad_norm": 0.35888779163360596, + "learning_rate": 2.3339080459770114e-05, + "loss": 0.0897, + "step": 4639 + }, + { + "epoch": 5.378540896906469, + "grad_norm": 0.35968539118766785, + "learning_rate": 2.3333333333333336e-05, + "loss": 0.0873, + "step": 4640 + }, + { + "epoch": 5.379700065203217, + "grad_norm": 0.4251978099346161, + "learning_rate": 2.3327586206896554e-05, + "loss": 0.0904, + "step": 4641 + }, + { + "epoch": 5.380859233499963, + "grad_norm": 0.5857329964637756, + "learning_rate": 2.332183908045977e-05, + "loss": 0.0879, + "step": 4642 + }, + { + "epoch": 5.382018401796711, + "grad_norm": 0.5313498377799988, + "learning_rate": 2.331609195402299e-05, + "loss": 0.0975, + "step": 4643 + }, + { + "epoch": 5.383177570093458, + "grad_norm": 0.4486619234085083, + "learning_rate": 2.331034482758621e-05, + "loss": 0.09, + "step": 4644 + }, + { + "epoch": 5.384336738390205, + "grad_norm": 0.40548866987228394, + "learning_rate": 2.3304597701149427e-05, + "loss": 0.0929, + "step": 4645 + }, + { + "epoch": 5.385495906686952, + "grad_norm": 0.5880759954452515, + "learning_rate": 2.3298850574712646e-05, + "loss": 0.0945, + "step": 4646 + }, + { + "epoch": 5.386655074983699, + "grad_norm": 0.5098415017127991, + "learning_rate": 2.3293103448275864e-05, + "loss": 0.0874, + "step": 4647 + }, + { + "epoch": 5.387814243280446, + "grad_norm": 0.42802801728248596, + "learning_rate": 2.3287356321839082e-05, + "loss": 0.0853, + "step": 4648 + }, + { + "epoch": 5.388973411577194, + "grad_norm": 0.5465481281280518, + "learning_rate": 2.32816091954023e-05, + "loss": 0.0944, + "step": 4649 + }, + { + "epoch": 5.39013257987394, + "grad_norm": 0.4456057548522949, + "learning_rate": 2.327586206896552e-05, + "loss": 0.1031, + "step": 4650 + }, + { + "epoch": 5.391291748170688, + "grad_norm": 0.36883285641670227, + "learning_rate": 2.3270114942528737e-05, + "loss": 0.0992, + "step": 4651 + }, + { + "epoch": 5.392450916467435, + "grad_norm": 0.39192837476730347, + "learning_rate": 2.3264367816091955e-05, + "loss": 0.0943, + "step": 4652 + }, + { + "epoch": 5.393610084764182, + "grad_norm": 0.36094704270362854, + "learning_rate": 2.3258620689655174e-05, + "loss": 0.0896, + "step": 4653 + }, + { + "epoch": 5.394769253060929, + "grad_norm": 0.8563114404678345, + "learning_rate": 2.3252873563218392e-05, + "loss": 0.0834, + "step": 4654 + }, + { + "epoch": 5.3959284213576755, + "grad_norm": 0.4293108880519867, + "learning_rate": 2.324712643678161e-05, + "loss": 0.1027, + "step": 4655 + }, + { + "epoch": 5.397087589654423, + "grad_norm": 0.5573574900627136, + "learning_rate": 2.324137931034483e-05, + "loss": 0.0978, + "step": 4656 + }, + { + "epoch": 5.39824675795117, + "grad_norm": 0.3921107351779938, + "learning_rate": 2.3235632183908047e-05, + "loss": 0.09, + "step": 4657 + }, + { + "epoch": 5.399405926247917, + "grad_norm": 0.41322219371795654, + "learning_rate": 2.3229885057471265e-05, + "loss": 0.0815, + "step": 4658 + }, + { + "epoch": 5.400565094544664, + "grad_norm": 0.41785478591918945, + "learning_rate": 2.3224137931034483e-05, + "loss": 0.0907, + "step": 4659 + }, + { + "epoch": 5.4017242628414115, + "grad_norm": 0.39730241894721985, + "learning_rate": 2.3218390804597705e-05, + "loss": 0.098, + "step": 4660 + }, + { + "epoch": 5.402883431138158, + "grad_norm": 0.46797794103622437, + "learning_rate": 2.321264367816092e-05, + "loss": 0.0967, + "step": 4661 + }, + { + "epoch": 5.404042599434906, + "grad_norm": 0.42924076318740845, + "learning_rate": 2.3206896551724138e-05, + "loss": 0.0872, + "step": 4662 + }, + { + "epoch": 5.405201767731652, + "grad_norm": 0.4479629695415497, + "learning_rate": 2.3201149425287356e-05, + "loss": 0.0882, + "step": 4663 + }, + { + "epoch": 5.4063609360284, + "grad_norm": 0.32181426882743835, + "learning_rate": 2.3195402298850578e-05, + "loss": 0.0851, + "step": 4664 + }, + { + "epoch": 5.407520104325147, + "grad_norm": 0.44183018803596497, + "learning_rate": 2.3189655172413793e-05, + "loss": 0.0861, + "step": 4665 + }, + { + "epoch": 5.408679272621894, + "grad_norm": 0.4371998608112335, + "learning_rate": 2.318390804597701e-05, + "loss": 0.0945, + "step": 4666 + }, + { + "epoch": 5.409838440918641, + "grad_norm": 0.3353933095932007, + "learning_rate": 2.3178160919540233e-05, + "loss": 0.0867, + "step": 4667 + }, + { + "epoch": 5.4109976092153875, + "grad_norm": 0.35332971811294556, + "learning_rate": 2.317241379310345e-05, + "loss": 0.0795, + "step": 4668 + }, + { + "epoch": 5.412156777512135, + "grad_norm": 0.3787877857685089, + "learning_rate": 2.3166666666666666e-05, + "loss": 0.0977, + "step": 4669 + }, + { + "epoch": 5.413315945808882, + "grad_norm": 0.3937995135784149, + "learning_rate": 2.3160919540229885e-05, + "loss": 0.101, + "step": 4670 + }, + { + "epoch": 5.414475114105629, + "grad_norm": 0.3958158493041992, + "learning_rate": 2.3155172413793106e-05, + "loss": 0.0962, + "step": 4671 + }, + { + "epoch": 5.415634282402376, + "grad_norm": 0.5695372819900513, + "learning_rate": 2.314942528735632e-05, + "loss": 0.1024, + "step": 4672 + }, + { + "epoch": 5.4167934506991235, + "grad_norm": 0.4417628049850464, + "learning_rate": 2.314367816091954e-05, + "loss": 0.0931, + "step": 4673 + }, + { + "epoch": 5.41795261899587, + "grad_norm": 0.44972625374794006, + "learning_rate": 2.313793103448276e-05, + "loss": 0.1041, + "step": 4674 + }, + { + "epoch": 5.419111787292618, + "grad_norm": 0.375545471906662, + "learning_rate": 2.313218390804598e-05, + "loss": 0.0948, + "step": 4675 + }, + { + "epoch": 5.4202709555893644, + "grad_norm": 0.4629718065261841, + "learning_rate": 2.3126436781609194e-05, + "loss": 0.09, + "step": 4676 + }, + { + "epoch": 5.421430123886112, + "grad_norm": 0.462091326713562, + "learning_rate": 2.3120689655172416e-05, + "loss": 0.0955, + "step": 4677 + }, + { + "epoch": 5.422589292182859, + "grad_norm": 0.43810778856277466, + "learning_rate": 2.3114942528735634e-05, + "loss": 0.0946, + "step": 4678 + }, + { + "epoch": 5.423748460479606, + "grad_norm": 0.48332980275154114, + "learning_rate": 2.3109195402298853e-05, + "loss": 0.097, + "step": 4679 + }, + { + "epoch": 5.424907628776353, + "grad_norm": 0.4389806091785431, + "learning_rate": 2.3103448275862067e-05, + "loss": 0.0881, + "step": 4680 + }, + { + "epoch": 5.4260667970731, + "grad_norm": 0.4335387349128723, + "learning_rate": 2.309770114942529e-05, + "loss": 0.0946, + "step": 4681 + }, + { + "epoch": 5.427225965369847, + "grad_norm": 0.4705444872379303, + "learning_rate": 2.3091954022988507e-05, + "loss": 0.0989, + "step": 4682 + }, + { + "epoch": 5.428385133666594, + "grad_norm": 0.4291810989379883, + "learning_rate": 2.3086206896551726e-05, + "loss": 0.0942, + "step": 4683 + }, + { + "epoch": 5.429544301963341, + "grad_norm": 0.6216928958892822, + "learning_rate": 2.3080459770114944e-05, + "loss": 0.096, + "step": 4684 + }, + { + "epoch": 5.430703470260088, + "grad_norm": 0.3836531937122345, + "learning_rate": 2.3074712643678162e-05, + "loss": 0.088, + "step": 4685 + }, + { + "epoch": 5.431862638556836, + "grad_norm": 0.46026742458343506, + "learning_rate": 2.306896551724138e-05, + "loss": 0.0907, + "step": 4686 + }, + { + "epoch": 5.433021806853582, + "grad_norm": 0.45022183656692505, + "learning_rate": 2.30632183908046e-05, + "loss": 0.0981, + "step": 4687 + }, + { + "epoch": 5.43418097515033, + "grad_norm": 0.3708306550979614, + "learning_rate": 2.3057471264367817e-05, + "loss": 0.0984, + "step": 4688 + }, + { + "epoch": 5.4353401434470765, + "grad_norm": 0.3784964084625244, + "learning_rate": 2.3051724137931035e-05, + "loss": 0.0918, + "step": 4689 + }, + { + "epoch": 5.436499311743824, + "grad_norm": 0.6409464478492737, + "learning_rate": 2.3045977011494254e-05, + "loss": 0.0958, + "step": 4690 + }, + { + "epoch": 5.437658480040571, + "grad_norm": 0.5641512274742126, + "learning_rate": 2.3040229885057472e-05, + "loss": 0.0927, + "step": 4691 + }, + { + "epoch": 5.438817648337318, + "grad_norm": 0.42215609550476074, + "learning_rate": 2.303448275862069e-05, + "loss": 0.0903, + "step": 4692 + }, + { + "epoch": 5.439976816634065, + "grad_norm": 0.5272184610366821, + "learning_rate": 2.302873563218391e-05, + "loss": 0.0919, + "step": 4693 + }, + { + "epoch": 5.4411359849308125, + "grad_norm": 0.37865811586380005, + "learning_rate": 2.3022988505747127e-05, + "loss": 0.0928, + "step": 4694 + }, + { + "epoch": 5.442295153227559, + "grad_norm": 0.48235297203063965, + "learning_rate": 2.3017241379310345e-05, + "loss": 0.0885, + "step": 4695 + }, + { + "epoch": 5.443454321524307, + "grad_norm": 0.3501775562763214, + "learning_rate": 2.3011494252873563e-05, + "loss": 0.0847, + "step": 4696 + }, + { + "epoch": 5.444613489821053, + "grad_norm": 0.5579177141189575, + "learning_rate": 2.3005747126436782e-05, + "loss": 0.1053, + "step": 4697 + }, + { + "epoch": 5.4457726581178, + "grad_norm": 0.423701673746109, + "learning_rate": 2.3000000000000003e-05, + "loss": 0.0932, + "step": 4698 + }, + { + "epoch": 5.446931826414548, + "grad_norm": 0.4396893084049225, + "learning_rate": 2.299425287356322e-05, + "loss": 0.0968, + "step": 4699 + }, + { + "epoch": 5.448090994711294, + "grad_norm": 0.3700418472290039, + "learning_rate": 2.2988505747126437e-05, + "loss": 0.0908, + "step": 4700 + }, + { + "epoch": 5.449250163008042, + "grad_norm": 0.3896975815296173, + "learning_rate": 2.2982758620689658e-05, + "loss": 0.0947, + "step": 4701 + }, + { + "epoch": 5.4504093313047886, + "grad_norm": 0.3217882812023163, + "learning_rate": 2.2977011494252877e-05, + "loss": 0.0881, + "step": 4702 + }, + { + "epoch": 5.451568499601536, + "grad_norm": 0.47387391328811646, + "learning_rate": 2.297126436781609e-05, + "loss": 0.1008, + "step": 4703 + }, + { + "epoch": 5.452727667898283, + "grad_norm": 0.2585291862487793, + "learning_rate": 2.296551724137931e-05, + "loss": 0.0952, + "step": 4704 + }, + { + "epoch": 5.45388683619503, + "grad_norm": 0.4016747772693634, + "learning_rate": 2.295977011494253e-05, + "loss": 0.0852, + "step": 4705 + }, + { + "epoch": 5.455046004491777, + "grad_norm": 0.29344117641448975, + "learning_rate": 2.295402298850575e-05, + "loss": 0.0946, + "step": 4706 + }, + { + "epoch": 5.456205172788525, + "grad_norm": 0.334484338760376, + "learning_rate": 2.2948275862068965e-05, + "loss": 0.088, + "step": 4707 + }, + { + "epoch": 5.457364341085271, + "grad_norm": 0.5377522706985474, + "learning_rate": 2.2942528735632186e-05, + "loss": 0.0911, + "step": 4708 + }, + { + "epoch": 5.458523509382019, + "grad_norm": 0.3403340280056, + "learning_rate": 2.2936781609195405e-05, + "loss": 0.0961, + "step": 4709 + }, + { + "epoch": 5.4596826776787655, + "grad_norm": 0.370523601770401, + "learning_rate": 2.293103448275862e-05, + "loss": 0.0951, + "step": 4710 + }, + { + "epoch": 5.460841845975512, + "grad_norm": 0.3723559081554413, + "learning_rate": 2.292528735632184e-05, + "loss": 0.0967, + "step": 4711 + }, + { + "epoch": 5.46200101427226, + "grad_norm": 0.4718587398529053, + "learning_rate": 2.291954022988506e-05, + "loss": 0.0917, + "step": 4712 + }, + { + "epoch": 5.463160182569006, + "grad_norm": 0.3738349974155426, + "learning_rate": 2.2913793103448278e-05, + "loss": 0.0973, + "step": 4713 + }, + { + "epoch": 5.464319350865754, + "grad_norm": 0.4696074426174164, + "learning_rate": 2.2908045977011493e-05, + "loss": 0.095, + "step": 4714 + }, + { + "epoch": 5.465478519162501, + "grad_norm": 0.39924389123916626, + "learning_rate": 2.2902298850574714e-05, + "loss": 0.0899, + "step": 4715 + }, + { + "epoch": 5.466637687459248, + "grad_norm": 0.4714980125427246, + "learning_rate": 2.2896551724137933e-05, + "loss": 0.0892, + "step": 4716 + }, + { + "epoch": 5.467796855755995, + "grad_norm": 0.3208000361919403, + "learning_rate": 2.289080459770115e-05, + "loss": 0.0917, + "step": 4717 + }, + { + "epoch": 5.468956024052742, + "grad_norm": 0.5692875385284424, + "learning_rate": 2.288505747126437e-05, + "loss": 0.0917, + "step": 4718 + }, + { + "epoch": 5.470115192349489, + "grad_norm": 0.5145659446716309, + "learning_rate": 2.2879310344827587e-05, + "loss": 0.0994, + "step": 4719 + }, + { + "epoch": 5.471274360646237, + "grad_norm": 0.4095035791397095, + "learning_rate": 2.2873563218390806e-05, + "loss": 0.0857, + "step": 4720 + }, + { + "epoch": 5.472433528942983, + "grad_norm": 0.38376545906066895, + "learning_rate": 2.2867816091954024e-05, + "loss": 0.0959, + "step": 4721 + }, + { + "epoch": 5.473592697239731, + "grad_norm": 0.5485108494758606, + "learning_rate": 2.2862068965517242e-05, + "loss": 0.0857, + "step": 4722 + }, + { + "epoch": 5.4747518655364775, + "grad_norm": 0.6178170442581177, + "learning_rate": 2.285632183908046e-05, + "loss": 0.1075, + "step": 4723 + }, + { + "epoch": 5.475911033833224, + "grad_norm": 0.44804081320762634, + "learning_rate": 2.285057471264368e-05, + "loss": 0.0933, + "step": 4724 + }, + { + "epoch": 5.477070202129972, + "grad_norm": 0.7503616213798523, + "learning_rate": 2.2844827586206897e-05, + "loss": 0.0994, + "step": 4725 + }, + { + "epoch": 5.478229370426718, + "grad_norm": 0.4127942621707916, + "learning_rate": 2.2839080459770116e-05, + "loss": 0.0947, + "step": 4726 + }, + { + "epoch": 5.479388538723466, + "grad_norm": 0.4280507266521454, + "learning_rate": 2.2833333333333334e-05, + "loss": 0.0837, + "step": 4727 + }, + { + "epoch": 5.480547707020213, + "grad_norm": 0.41026708483695984, + "learning_rate": 2.2827586206896552e-05, + "loss": 0.0915, + "step": 4728 + }, + { + "epoch": 5.48170687531696, + "grad_norm": 0.3558439016342163, + "learning_rate": 2.2821839080459774e-05, + "loss": 0.0843, + "step": 4729 + }, + { + "epoch": 5.482866043613707, + "grad_norm": 0.3199481964111328, + "learning_rate": 2.281609195402299e-05, + "loss": 0.088, + "step": 4730 + }, + { + "epoch": 5.4840252119104544, + "grad_norm": 0.3802203834056854, + "learning_rate": 2.2810344827586207e-05, + "loss": 0.0955, + "step": 4731 + }, + { + "epoch": 5.485184380207201, + "grad_norm": 0.6358181238174438, + "learning_rate": 2.280459770114943e-05, + "loss": 0.1024, + "step": 4732 + }, + { + "epoch": 5.486343548503949, + "grad_norm": 0.3913860619068146, + "learning_rate": 2.2798850574712644e-05, + "loss": 0.0922, + "step": 4733 + }, + { + "epoch": 5.487502716800695, + "grad_norm": 0.4372271001338959, + "learning_rate": 2.2793103448275862e-05, + "loss": 0.0905, + "step": 4734 + }, + { + "epoch": 5.488661885097443, + "grad_norm": 0.4437418282032013, + "learning_rate": 2.278735632183908e-05, + "loss": 0.0815, + "step": 4735 + }, + { + "epoch": 5.48982105339419, + "grad_norm": 0.38980501890182495, + "learning_rate": 2.2781609195402302e-05, + "loss": 0.0888, + "step": 4736 + }, + { + "epoch": 5.490980221690937, + "grad_norm": 0.6082010865211487, + "learning_rate": 2.2775862068965517e-05, + "loss": 0.1109, + "step": 4737 + }, + { + "epoch": 5.492139389987684, + "grad_norm": 0.4929609000682831, + "learning_rate": 2.2770114942528735e-05, + "loss": 0.1048, + "step": 4738 + }, + { + "epoch": 5.493298558284431, + "grad_norm": 0.3931887745857239, + "learning_rate": 2.2764367816091957e-05, + "loss": 0.087, + "step": 4739 + }, + { + "epoch": 5.494457726581178, + "grad_norm": 0.4002040922641754, + "learning_rate": 2.2758620689655175e-05, + "loss": 0.0938, + "step": 4740 + }, + { + "epoch": 5.495616894877925, + "grad_norm": 0.6055549383163452, + "learning_rate": 2.275287356321839e-05, + "loss": 0.1004, + "step": 4741 + }, + { + "epoch": 5.496776063174672, + "grad_norm": 0.4150696098804474, + "learning_rate": 2.274712643678161e-05, + "loss": 0.0971, + "step": 4742 + }, + { + "epoch": 5.497935231471419, + "grad_norm": 0.43481725454330444, + "learning_rate": 2.274137931034483e-05, + "loss": 0.0936, + "step": 4743 + }, + { + "epoch": 5.4990943997681665, + "grad_norm": 0.4212779998779297, + "learning_rate": 2.2735632183908048e-05, + "loss": 0.0863, + "step": 4744 + }, + { + "epoch": 5.500253568064913, + "grad_norm": 0.3982880711555481, + "learning_rate": 2.2729885057471263e-05, + "loss": 0.092, + "step": 4745 + }, + { + "epoch": 5.501412736361661, + "grad_norm": 0.7657057642936707, + "learning_rate": 2.2724137931034485e-05, + "loss": 0.0915, + "step": 4746 + }, + { + "epoch": 5.502571904658407, + "grad_norm": 0.47100207209587097, + "learning_rate": 2.2718390804597703e-05, + "loss": 0.0896, + "step": 4747 + }, + { + "epoch": 5.503731072955155, + "grad_norm": 0.41695326566696167, + "learning_rate": 2.271264367816092e-05, + "loss": 0.1028, + "step": 4748 + }, + { + "epoch": 5.504890241251902, + "grad_norm": 0.38724851608276367, + "learning_rate": 2.270689655172414e-05, + "loss": 0.0922, + "step": 4749 + }, + { + "epoch": 5.506049409548649, + "grad_norm": 0.32382097840309143, + "learning_rate": 2.2701149425287358e-05, + "loss": 0.0894, + "step": 4750 + }, + { + "epoch": 5.507208577845396, + "grad_norm": 0.7724230289459229, + "learning_rate": 2.2695402298850576e-05, + "loss": 0.0947, + "step": 4751 + }, + { + "epoch": 5.508367746142143, + "grad_norm": 0.35658082365989685, + "learning_rate": 2.2689655172413794e-05, + "loss": 0.086, + "step": 4752 + }, + { + "epoch": 5.50952691443889, + "grad_norm": 0.38609573245048523, + "learning_rate": 2.2683908045977013e-05, + "loss": 0.0961, + "step": 4753 + }, + { + "epoch": 5.510686082735637, + "grad_norm": 0.37133869528770447, + "learning_rate": 2.267816091954023e-05, + "loss": 0.0916, + "step": 4754 + }, + { + "epoch": 5.511845251032384, + "grad_norm": 0.34857529401779175, + "learning_rate": 2.267241379310345e-05, + "loss": 0.0952, + "step": 4755 + }, + { + "epoch": 5.513004419329131, + "grad_norm": 0.3999149799346924, + "learning_rate": 2.2666666666666668e-05, + "loss": 0.1025, + "step": 4756 + }, + { + "epoch": 5.5141635876258785, + "grad_norm": 0.38613250851631165, + "learning_rate": 2.2660919540229886e-05, + "loss": 0.0912, + "step": 4757 + }, + { + "epoch": 5.515322755922625, + "grad_norm": 0.4043678939342499, + "learning_rate": 2.2655172413793104e-05, + "loss": 0.0991, + "step": 4758 + }, + { + "epoch": 5.516481924219373, + "grad_norm": 0.460321843624115, + "learning_rate": 2.2649425287356322e-05, + "loss": 0.0861, + "step": 4759 + }, + { + "epoch": 5.5176410925161194, + "grad_norm": 0.5290979146957397, + "learning_rate": 2.264367816091954e-05, + "loss": 0.0946, + "step": 4760 + }, + { + "epoch": 5.518800260812867, + "grad_norm": 0.4141453802585602, + "learning_rate": 2.263793103448276e-05, + "loss": 0.0918, + "step": 4761 + }, + { + "epoch": 5.519959429109614, + "grad_norm": 0.5237475633621216, + "learning_rate": 2.2632183908045977e-05, + "loss": 0.0966, + "step": 4762 + }, + { + "epoch": 5.521118597406361, + "grad_norm": 0.392376571893692, + "learning_rate": 2.26264367816092e-05, + "loss": 0.1033, + "step": 4763 + }, + { + "epoch": 5.522277765703108, + "grad_norm": 0.43914276361465454, + "learning_rate": 2.2620689655172414e-05, + "loss": 0.09, + "step": 4764 + }, + { + "epoch": 5.5234369339998555, + "grad_norm": 0.5471196174621582, + "learning_rate": 2.2614942528735632e-05, + "loss": 0.0959, + "step": 4765 + }, + { + "epoch": 5.524596102296602, + "grad_norm": 0.47647860646247864, + "learning_rate": 2.2609195402298854e-05, + "loss": 0.0975, + "step": 4766 + }, + { + "epoch": 5.525755270593349, + "grad_norm": 0.5217294096946716, + "learning_rate": 2.2603448275862072e-05, + "loss": 0.1029, + "step": 4767 + }, + { + "epoch": 5.526914438890096, + "grad_norm": 0.3615458309650421, + "learning_rate": 2.2597701149425287e-05, + "loss": 0.0837, + "step": 4768 + }, + { + "epoch": 5.528073607186844, + "grad_norm": 0.3933470547199249, + "learning_rate": 2.2591954022988505e-05, + "loss": 0.0959, + "step": 4769 + }, + { + "epoch": 5.529232775483591, + "grad_norm": 0.32372987270355225, + "learning_rate": 2.2586206896551727e-05, + "loss": 0.0963, + "step": 4770 + }, + { + "epoch": 5.530391943780337, + "grad_norm": 0.3662286102771759, + "learning_rate": 2.2580459770114942e-05, + "loss": 0.103, + "step": 4771 + }, + { + "epoch": 5.531551112077085, + "grad_norm": 0.5479937791824341, + "learning_rate": 2.257471264367816e-05, + "loss": 0.1051, + "step": 4772 + }, + { + "epoch": 5.5327102803738315, + "grad_norm": 0.43857720494270325, + "learning_rate": 2.2568965517241382e-05, + "loss": 0.0945, + "step": 4773 + }, + { + "epoch": 5.533869448670579, + "grad_norm": 0.45895788073539734, + "learning_rate": 2.25632183908046e-05, + "loss": 0.0961, + "step": 4774 + }, + { + "epoch": 5.535028616967326, + "grad_norm": 0.3341083526611328, + "learning_rate": 2.2557471264367815e-05, + "loss": 0.0899, + "step": 4775 + }, + { + "epoch": 5.536187785264073, + "grad_norm": 0.45365577936172485, + "learning_rate": 2.2551724137931033e-05, + "loss": 0.0899, + "step": 4776 + }, + { + "epoch": 5.53734695356082, + "grad_norm": 0.3847201466560364, + "learning_rate": 2.2545977011494255e-05, + "loss": 0.1055, + "step": 4777 + }, + { + "epoch": 5.5385061218575675, + "grad_norm": 0.570408821105957, + "learning_rate": 2.2540229885057473e-05, + "loss": 0.0844, + "step": 4778 + }, + { + "epoch": 5.539665290154314, + "grad_norm": 0.4045121669769287, + "learning_rate": 2.2534482758620688e-05, + "loss": 0.0953, + "step": 4779 + }, + { + "epoch": 5.540824458451062, + "grad_norm": 0.321380615234375, + "learning_rate": 2.252873563218391e-05, + "loss": 0.0908, + "step": 4780 + }, + { + "epoch": 5.541983626747808, + "grad_norm": 0.40816688537597656, + "learning_rate": 2.2522988505747128e-05, + "loss": 0.0952, + "step": 4781 + }, + { + "epoch": 5.543142795044556, + "grad_norm": 0.7444669604301453, + "learning_rate": 2.2517241379310347e-05, + "loss": 0.1046, + "step": 4782 + }, + { + "epoch": 5.544301963341303, + "grad_norm": 0.41582420468330383, + "learning_rate": 2.2511494252873565e-05, + "loss": 0.094, + "step": 4783 + }, + { + "epoch": 5.545461131638049, + "grad_norm": 0.2793687880039215, + "learning_rate": 2.2505747126436783e-05, + "loss": 0.0919, + "step": 4784 + }, + { + "epoch": 5.546620299934797, + "grad_norm": 0.530275821685791, + "learning_rate": 2.25e-05, + "loss": 0.0953, + "step": 4785 + }, + { + "epoch": 5.5477794682315436, + "grad_norm": 0.484336793422699, + "learning_rate": 2.249425287356322e-05, + "loss": 0.0929, + "step": 4786 + }, + { + "epoch": 5.548938636528291, + "grad_norm": 0.4025559723377228, + "learning_rate": 2.2488505747126438e-05, + "loss": 0.0982, + "step": 4787 + }, + { + "epoch": 5.550097804825038, + "grad_norm": 0.639001727104187, + "learning_rate": 2.2482758620689656e-05, + "loss": 0.0926, + "step": 4788 + }, + { + "epoch": 5.551256973121785, + "grad_norm": 0.4068092107772827, + "learning_rate": 2.2477011494252875e-05, + "loss": 0.0983, + "step": 4789 + }, + { + "epoch": 5.552416141418532, + "grad_norm": 0.4974704384803772, + "learning_rate": 2.2471264367816093e-05, + "loss": 0.0971, + "step": 4790 + }, + { + "epoch": 5.55357530971528, + "grad_norm": 0.5406764149665833, + "learning_rate": 2.246551724137931e-05, + "loss": 0.0927, + "step": 4791 + }, + { + "epoch": 5.554734478012026, + "grad_norm": 0.3397584855556488, + "learning_rate": 2.245977011494253e-05, + "loss": 0.0869, + "step": 4792 + }, + { + "epoch": 5.555893646308774, + "grad_norm": 0.3097866475582123, + "learning_rate": 2.2454022988505748e-05, + "loss": 0.0877, + "step": 4793 + }, + { + "epoch": 5.5570528146055205, + "grad_norm": 0.4206595718860626, + "learning_rate": 2.2448275862068966e-05, + "loss": 0.0911, + "step": 4794 + }, + { + "epoch": 5.558211982902268, + "grad_norm": 0.2879236042499542, + "learning_rate": 2.2442528735632184e-05, + "loss": 0.1, + "step": 4795 + }, + { + "epoch": 5.559371151199015, + "grad_norm": 0.3888731300830841, + "learning_rate": 2.2436781609195403e-05, + "loss": 0.0943, + "step": 4796 + }, + { + "epoch": 5.560530319495761, + "grad_norm": 0.3849053382873535, + "learning_rate": 2.2431034482758624e-05, + "loss": 0.0925, + "step": 4797 + }, + { + "epoch": 5.561689487792509, + "grad_norm": 0.464656263589859, + "learning_rate": 2.242528735632184e-05, + "loss": 0.1005, + "step": 4798 + }, + { + "epoch": 5.562848656089256, + "grad_norm": 0.4027196764945984, + "learning_rate": 2.2419540229885057e-05, + "loss": 0.1009, + "step": 4799 + }, + { + "epoch": 5.564007824386003, + "grad_norm": 0.47038623690605164, + "learning_rate": 2.2413793103448276e-05, + "loss": 0.0972, + "step": 4800 + }, + { + "epoch": 5.56516699268275, + "grad_norm": 0.35763296484947205, + "learning_rate": 2.2408045977011497e-05, + "loss": 0.0836, + "step": 4801 + }, + { + "epoch": 5.566326160979497, + "grad_norm": 0.4369265139102936, + "learning_rate": 2.2402298850574712e-05, + "loss": 0.0853, + "step": 4802 + }, + { + "epoch": 5.567485329276244, + "grad_norm": 0.43006211519241333, + "learning_rate": 2.239655172413793e-05, + "loss": 0.0973, + "step": 4803 + }, + { + "epoch": 5.568644497572992, + "grad_norm": 0.4107172191143036, + "learning_rate": 2.2390804597701152e-05, + "loss": 0.0965, + "step": 4804 + }, + { + "epoch": 5.569803665869738, + "grad_norm": 0.3978167176246643, + "learning_rate": 2.238505747126437e-05, + "loss": 0.0923, + "step": 4805 + }, + { + "epoch": 5.570962834166486, + "grad_norm": 0.49095186591148376, + "learning_rate": 2.2379310344827586e-05, + "loss": 0.0946, + "step": 4806 + }, + { + "epoch": 5.5721220024632325, + "grad_norm": 0.32563725113868713, + "learning_rate": 2.2373563218390807e-05, + "loss": 0.092, + "step": 4807 + }, + { + "epoch": 5.57328117075998, + "grad_norm": 0.42100727558135986, + "learning_rate": 2.2367816091954025e-05, + "loss": 0.0942, + "step": 4808 + }, + { + "epoch": 5.574440339056727, + "grad_norm": 0.39792951941490173, + "learning_rate": 2.236206896551724e-05, + "loss": 0.09, + "step": 4809 + }, + { + "epoch": 5.575599507353473, + "grad_norm": 0.5242913365364075, + "learning_rate": 2.235632183908046e-05, + "loss": 0.0943, + "step": 4810 + }, + { + "epoch": 5.576758675650221, + "grad_norm": 0.5006229281425476, + "learning_rate": 2.235057471264368e-05, + "loss": 0.0987, + "step": 4811 + }, + { + "epoch": 5.5779178439469685, + "grad_norm": 0.37062880396842957, + "learning_rate": 2.23448275862069e-05, + "loss": 0.0929, + "step": 4812 + }, + { + "epoch": 5.579077012243715, + "grad_norm": 0.5827232003211975, + "learning_rate": 2.2339080459770114e-05, + "loss": 0.1032, + "step": 4813 + }, + { + "epoch": 5.580236180540462, + "grad_norm": 0.30347374081611633, + "learning_rate": 2.2333333333333335e-05, + "loss": 0.0965, + "step": 4814 + }, + { + "epoch": 5.5813953488372094, + "grad_norm": 0.5079871416091919, + "learning_rate": 2.2327586206896554e-05, + "loss": 0.0942, + "step": 4815 + }, + { + "epoch": 5.582554517133956, + "grad_norm": 0.3671127259731293, + "learning_rate": 2.2321839080459772e-05, + "loss": 0.0885, + "step": 4816 + }, + { + "epoch": 5.583713685430704, + "grad_norm": 0.4037294089794159, + "learning_rate": 2.231609195402299e-05, + "loss": 0.0965, + "step": 4817 + }, + { + "epoch": 5.58487285372745, + "grad_norm": 0.44176700711250305, + "learning_rate": 2.231034482758621e-05, + "loss": 0.0977, + "step": 4818 + }, + { + "epoch": 5.586032022024198, + "grad_norm": 0.5387149453163147, + "learning_rate": 2.2304597701149427e-05, + "loss": 0.0961, + "step": 4819 + }, + { + "epoch": 5.587191190320945, + "grad_norm": 0.4820229411125183, + "learning_rate": 2.2298850574712645e-05, + "loss": 0.1048, + "step": 4820 + }, + { + "epoch": 5.588350358617692, + "grad_norm": 0.46356868743896484, + "learning_rate": 2.2293103448275863e-05, + "loss": 0.0917, + "step": 4821 + }, + { + "epoch": 5.589509526914439, + "grad_norm": 0.3790029287338257, + "learning_rate": 2.228735632183908e-05, + "loss": 0.0897, + "step": 4822 + }, + { + "epoch": 5.5906686952111855, + "grad_norm": 0.4080239534378052, + "learning_rate": 2.22816091954023e-05, + "loss": 0.0932, + "step": 4823 + }, + { + "epoch": 5.591827863507933, + "grad_norm": 0.5119190216064453, + "learning_rate": 2.2275862068965518e-05, + "loss": 0.099, + "step": 4824 + }, + { + "epoch": 5.592987031804681, + "grad_norm": 0.40316665172576904, + "learning_rate": 2.2270114942528736e-05, + "loss": 0.1001, + "step": 4825 + }, + { + "epoch": 5.594146200101427, + "grad_norm": 0.3797335922718048, + "learning_rate": 2.2264367816091955e-05, + "loss": 0.0963, + "step": 4826 + }, + { + "epoch": 5.595305368398174, + "grad_norm": 0.4197828471660614, + "learning_rate": 2.2258620689655173e-05, + "loss": 0.0921, + "step": 4827 + }, + { + "epoch": 5.5964645366949215, + "grad_norm": 0.41356906294822693, + "learning_rate": 2.2252873563218395e-05, + "loss": 0.0948, + "step": 4828 + }, + { + "epoch": 5.597623704991668, + "grad_norm": 0.5795599818229675, + "learning_rate": 2.224712643678161e-05, + "loss": 0.0966, + "step": 4829 + }, + { + "epoch": 5.598782873288416, + "grad_norm": 0.3532811999320984, + "learning_rate": 2.2241379310344828e-05, + "loss": 0.0926, + "step": 4830 + }, + { + "epoch": 5.599942041585162, + "grad_norm": 0.32101044058799744, + "learning_rate": 2.2235632183908046e-05, + "loss": 0.0955, + "step": 4831 + }, + { + "epoch": 5.60110120988191, + "grad_norm": 0.3435097634792328, + "learning_rate": 2.2229885057471264e-05, + "loss": 0.1026, + "step": 4832 + }, + { + "epoch": 5.602260378178657, + "grad_norm": 0.42557913064956665, + "learning_rate": 2.2224137931034483e-05, + "loss": 0.0857, + "step": 4833 + }, + { + "epoch": 5.603419546475404, + "grad_norm": 0.36306434869766235, + "learning_rate": 2.22183908045977e-05, + "loss": 0.0925, + "step": 4834 + }, + { + "epoch": 5.604578714772151, + "grad_norm": 0.45969343185424805, + "learning_rate": 2.2212643678160923e-05, + "loss": 0.0987, + "step": 4835 + }, + { + "epoch": 5.605737883068898, + "grad_norm": 0.39553216099739075, + "learning_rate": 2.2206896551724138e-05, + "loss": 0.0932, + "step": 4836 + }, + { + "epoch": 5.606897051365645, + "grad_norm": 0.6770229339599609, + "learning_rate": 2.2201149425287356e-05, + "loss": 0.0989, + "step": 4837 + }, + { + "epoch": 5.608056219662393, + "grad_norm": 0.5006446838378906, + "learning_rate": 2.2195402298850578e-05, + "loss": 0.0959, + "step": 4838 + }, + { + "epoch": 5.609215387959139, + "grad_norm": 0.33049294352531433, + "learning_rate": 2.2189655172413796e-05, + "loss": 0.0861, + "step": 4839 + }, + { + "epoch": 5.610374556255886, + "grad_norm": 0.5273866653442383, + "learning_rate": 2.218390804597701e-05, + "loss": 0.1108, + "step": 4840 + }, + { + "epoch": 5.6115337245526336, + "grad_norm": 0.45553502440452576, + "learning_rate": 2.217816091954023e-05, + "loss": 0.0959, + "step": 4841 + }, + { + "epoch": 5.61269289284938, + "grad_norm": 0.3138861358165741, + "learning_rate": 2.217241379310345e-05, + "loss": 0.086, + "step": 4842 + }, + { + "epoch": 5.613852061146128, + "grad_norm": 0.5423036217689514, + "learning_rate": 2.216666666666667e-05, + "loss": 0.102, + "step": 4843 + }, + { + "epoch": 5.6150112294428745, + "grad_norm": 0.5598729848861694, + "learning_rate": 2.2160919540229884e-05, + "loss": 0.0991, + "step": 4844 + }, + { + "epoch": 5.616170397739622, + "grad_norm": 0.5099043846130371, + "learning_rate": 2.2155172413793106e-05, + "loss": 0.0908, + "step": 4845 + }, + { + "epoch": 5.617329566036369, + "grad_norm": 0.40814903378486633, + "learning_rate": 2.2149425287356324e-05, + "loss": 0.0911, + "step": 4846 + }, + { + "epoch": 5.618488734333116, + "grad_norm": 0.3231344223022461, + "learning_rate": 2.2143678160919542e-05, + "loss": 0.0867, + "step": 4847 + }, + { + "epoch": 5.619647902629863, + "grad_norm": 0.3704584538936615, + "learning_rate": 2.213793103448276e-05, + "loss": 0.0857, + "step": 4848 + }, + { + "epoch": 5.6208070709266105, + "grad_norm": 0.49821195006370544, + "learning_rate": 2.213218390804598e-05, + "loss": 0.1002, + "step": 4849 + }, + { + "epoch": 5.621966239223357, + "grad_norm": 0.35286957025527954, + "learning_rate": 2.2126436781609197e-05, + "loss": 0.0895, + "step": 4850 + }, + { + "epoch": 5.623125407520105, + "grad_norm": 0.46417170763015747, + "learning_rate": 2.2120689655172412e-05, + "loss": 0.0919, + "step": 4851 + }, + { + "epoch": 5.624284575816851, + "grad_norm": 0.4302671253681183, + "learning_rate": 2.2114942528735634e-05, + "loss": 0.0954, + "step": 4852 + }, + { + "epoch": 5.625443744113598, + "grad_norm": 0.3605821132659912, + "learning_rate": 2.2109195402298852e-05, + "loss": 0.0913, + "step": 4853 + }, + { + "epoch": 5.626602912410346, + "grad_norm": 0.4404606223106384, + "learning_rate": 2.210344827586207e-05, + "loss": 0.0899, + "step": 4854 + }, + { + "epoch": 5.627762080707092, + "grad_norm": 0.4587152898311615, + "learning_rate": 2.209770114942529e-05, + "loss": 0.0897, + "step": 4855 + }, + { + "epoch": 5.62892124900384, + "grad_norm": 0.4051382839679718, + "learning_rate": 2.2091954022988507e-05, + "loss": 0.1017, + "step": 4856 + }, + { + "epoch": 5.6300804173005865, + "grad_norm": 0.4511359930038452, + "learning_rate": 2.2086206896551725e-05, + "loss": 0.0966, + "step": 4857 + }, + { + "epoch": 5.631239585597334, + "grad_norm": 0.43057432770729065, + "learning_rate": 2.2080459770114943e-05, + "loss": 0.1004, + "step": 4858 + }, + { + "epoch": 5.632398753894081, + "grad_norm": 0.41975724697113037, + "learning_rate": 2.207471264367816e-05, + "loss": 0.1025, + "step": 4859 + }, + { + "epoch": 5.633557922190828, + "grad_norm": 0.4029335677623749, + "learning_rate": 2.206896551724138e-05, + "loss": 0.0968, + "step": 4860 + }, + { + "epoch": 5.634717090487575, + "grad_norm": 0.296252965927124, + "learning_rate": 2.2063218390804598e-05, + "loss": 0.0929, + "step": 4861 + }, + { + "epoch": 5.6358762587843225, + "grad_norm": 0.3576434552669525, + "learning_rate": 2.205747126436782e-05, + "loss": 0.0906, + "step": 4862 + }, + { + "epoch": 5.637035427081069, + "grad_norm": 0.37971681356430054, + "learning_rate": 2.2051724137931035e-05, + "loss": 0.0954, + "step": 4863 + }, + { + "epoch": 5.638194595377817, + "grad_norm": 0.4097626805305481, + "learning_rate": 2.2045977011494253e-05, + "loss": 0.0987, + "step": 4864 + }, + { + "epoch": 5.639353763674563, + "grad_norm": 0.34203460812568665, + "learning_rate": 2.204022988505747e-05, + "loss": 0.0954, + "step": 4865 + }, + { + "epoch": 5.64051293197131, + "grad_norm": 0.4253942370414734, + "learning_rate": 2.2034482758620693e-05, + "loss": 0.0997, + "step": 4866 + }, + { + "epoch": 5.641672100268058, + "grad_norm": 0.5293704271316528, + "learning_rate": 2.2028735632183908e-05, + "loss": 0.0933, + "step": 4867 + }, + { + "epoch": 5.642831268564805, + "grad_norm": 0.36057615280151367, + "learning_rate": 2.2022988505747126e-05, + "loss": 0.0883, + "step": 4868 + }, + { + "epoch": 5.643990436861552, + "grad_norm": 0.44225749373435974, + "learning_rate": 2.2017241379310348e-05, + "loss": 0.0927, + "step": 4869 + }, + { + "epoch": 5.6451496051582986, + "grad_norm": 0.34227925539016724, + "learning_rate": 2.2011494252873563e-05, + "loss": 0.0883, + "step": 4870 + }, + { + "epoch": 5.646308773455046, + "grad_norm": 0.44135892391204834, + "learning_rate": 2.200574712643678e-05, + "loss": 0.094, + "step": 4871 + }, + { + "epoch": 5.647467941751793, + "grad_norm": 0.5417119860649109, + "learning_rate": 2.2000000000000003e-05, + "loss": 0.0943, + "step": 4872 + }, + { + "epoch": 5.64862711004854, + "grad_norm": 0.47763824462890625, + "learning_rate": 2.199425287356322e-05, + "loss": 0.1042, + "step": 4873 + }, + { + "epoch": 5.649786278345287, + "grad_norm": 0.4434187710285187, + "learning_rate": 2.1988505747126436e-05, + "loss": 0.098, + "step": 4874 + }, + { + "epoch": 5.650945446642035, + "grad_norm": 0.466353178024292, + "learning_rate": 2.1982758620689654e-05, + "loss": 0.109, + "step": 4875 + }, + { + "epoch": 5.652104614938781, + "grad_norm": 0.5113919377326965, + "learning_rate": 2.1977011494252876e-05, + "loss": 0.0935, + "step": 4876 + }, + { + "epoch": 5.653263783235529, + "grad_norm": 0.43153828382492065, + "learning_rate": 2.1971264367816094e-05, + "loss": 0.1003, + "step": 4877 + }, + { + "epoch": 5.6544229515322755, + "grad_norm": 0.40823087096214294, + "learning_rate": 2.196551724137931e-05, + "loss": 0.0954, + "step": 4878 + }, + { + "epoch": 5.655582119829023, + "grad_norm": 0.4650181531906128, + "learning_rate": 2.195977011494253e-05, + "loss": 0.0951, + "step": 4879 + }, + { + "epoch": 5.65674128812577, + "grad_norm": 0.41202208399772644, + "learning_rate": 2.195402298850575e-05, + "loss": 0.1001, + "step": 4880 + }, + { + "epoch": 5.657900456422517, + "grad_norm": 0.5160025954246521, + "learning_rate": 2.1948275862068967e-05, + "loss": 0.0914, + "step": 4881 + }, + { + "epoch": 5.659059624719264, + "grad_norm": 0.5344350337982178, + "learning_rate": 2.1942528735632186e-05, + "loss": 0.0995, + "step": 4882 + }, + { + "epoch": 5.660218793016011, + "grad_norm": 1.5580607652664185, + "learning_rate": 2.1936781609195404e-05, + "loss": 0.1032, + "step": 4883 + }, + { + "epoch": 5.661377961312758, + "grad_norm": 0.43727943301200867, + "learning_rate": 2.1931034482758622e-05, + "loss": 0.1002, + "step": 4884 + }, + { + "epoch": 5.662537129609505, + "grad_norm": 0.35894694924354553, + "learning_rate": 2.192528735632184e-05, + "loss": 0.1022, + "step": 4885 + }, + { + "epoch": 5.663696297906252, + "grad_norm": 0.46932268142700195, + "learning_rate": 2.191954022988506e-05, + "loss": 0.0913, + "step": 4886 + }, + { + "epoch": 5.664855466202999, + "grad_norm": 0.5814934372901917, + "learning_rate": 2.1913793103448277e-05, + "loss": 0.1021, + "step": 4887 + }, + { + "epoch": 5.666014634499747, + "grad_norm": 0.41701653599739075, + "learning_rate": 2.1908045977011495e-05, + "loss": 0.0984, + "step": 4888 + }, + { + "epoch": 5.667173802796493, + "grad_norm": 0.564414381980896, + "learning_rate": 2.1902298850574714e-05, + "loss": 0.1031, + "step": 4889 + }, + { + "epoch": 5.668332971093241, + "grad_norm": 0.40304380655288696, + "learning_rate": 2.1896551724137932e-05, + "loss": 0.0933, + "step": 4890 + }, + { + "epoch": 5.6694921393899875, + "grad_norm": 0.40032997727394104, + "learning_rate": 2.189080459770115e-05, + "loss": 0.0899, + "step": 4891 + }, + { + "epoch": 5.670651307686735, + "grad_norm": 0.48019522428512573, + "learning_rate": 2.188505747126437e-05, + "loss": 0.0989, + "step": 4892 + }, + { + "epoch": 5.671810475983482, + "grad_norm": 0.5299116969108582, + "learning_rate": 2.1879310344827587e-05, + "loss": 0.1001, + "step": 4893 + }, + { + "epoch": 5.672969644280229, + "grad_norm": 0.3729042112827301, + "learning_rate": 2.1873563218390805e-05, + "loss": 0.0886, + "step": 4894 + }, + { + "epoch": 5.674128812576976, + "grad_norm": 0.4367959201335907, + "learning_rate": 2.1867816091954023e-05, + "loss": 0.0877, + "step": 4895 + }, + { + "epoch": 5.675287980873723, + "grad_norm": 0.36830776929855347, + "learning_rate": 2.1862068965517242e-05, + "loss": 0.0926, + "step": 4896 + }, + { + "epoch": 5.67644714917047, + "grad_norm": 0.37894847989082336, + "learning_rate": 2.185632183908046e-05, + "loss": 0.0922, + "step": 4897 + }, + { + "epoch": 5.677606317467217, + "grad_norm": 0.4589332640171051, + "learning_rate": 2.185057471264368e-05, + "loss": 0.0984, + "step": 4898 + }, + { + "epoch": 5.6787654857639644, + "grad_norm": 0.3483442962169647, + "learning_rate": 2.1844827586206897e-05, + "loss": 0.0949, + "step": 4899 + }, + { + "epoch": 5.679924654060711, + "grad_norm": 0.4337550103664398, + "learning_rate": 2.183908045977012e-05, + "loss": 0.1002, + "step": 4900 + }, + { + "epoch": 5.681083822357459, + "grad_norm": 0.42401912808418274, + "learning_rate": 2.1833333333333333e-05, + "loss": 0.087, + "step": 4901 + }, + { + "epoch": 5.682242990654205, + "grad_norm": 0.40039029717445374, + "learning_rate": 2.182758620689655e-05, + "loss": 0.1024, + "step": 4902 + }, + { + "epoch": 5.683402158950953, + "grad_norm": 0.3798970878124237, + "learning_rate": 2.1821839080459773e-05, + "loss": 0.0913, + "step": 4903 + }, + { + "epoch": 5.6845613272477, + "grad_norm": 0.41023609042167664, + "learning_rate": 2.181609195402299e-05, + "loss": 0.0864, + "step": 4904 + }, + { + "epoch": 5.685720495544447, + "grad_norm": 0.5003272891044617, + "learning_rate": 2.1810344827586206e-05, + "loss": 0.096, + "step": 4905 + }, + { + "epoch": 5.686879663841194, + "grad_norm": 0.673970103263855, + "learning_rate": 2.1804597701149425e-05, + "loss": 0.0967, + "step": 4906 + }, + { + "epoch": 5.688038832137941, + "grad_norm": 0.4125644564628601, + "learning_rate": 2.1798850574712646e-05, + "loss": 0.097, + "step": 4907 + }, + { + "epoch": 5.689198000434688, + "grad_norm": 0.4496941566467285, + "learning_rate": 2.1793103448275865e-05, + "loss": 0.0917, + "step": 4908 + }, + { + "epoch": 5.690357168731435, + "grad_norm": 0.37504181265830994, + "learning_rate": 2.178735632183908e-05, + "loss": 0.0965, + "step": 4909 + }, + { + "epoch": 5.691516337028182, + "grad_norm": 0.4096089005470276, + "learning_rate": 2.17816091954023e-05, + "loss": 0.0958, + "step": 4910 + }, + { + "epoch": 5.69267550532493, + "grad_norm": 0.43474438786506653, + "learning_rate": 2.177586206896552e-05, + "loss": 0.0888, + "step": 4911 + }, + { + "epoch": 5.6938346736216765, + "grad_norm": 0.39973193407058716, + "learning_rate": 2.1770114942528734e-05, + "loss": 0.0946, + "step": 4912 + }, + { + "epoch": 5.694993841918423, + "grad_norm": 0.5031002163887024, + "learning_rate": 2.1764367816091956e-05, + "loss": 0.0978, + "step": 4913 + }, + { + "epoch": 5.696153010215171, + "grad_norm": 0.5082476139068604, + "learning_rate": 2.1758620689655174e-05, + "loss": 0.0917, + "step": 4914 + }, + { + "epoch": 5.697312178511917, + "grad_norm": 0.48969969153404236, + "learning_rate": 2.1752873563218393e-05, + "loss": 0.1036, + "step": 4915 + }, + { + "epoch": 5.698471346808665, + "grad_norm": 0.41776567697525024, + "learning_rate": 2.1747126436781608e-05, + "loss": 0.0936, + "step": 4916 + }, + { + "epoch": 5.699630515105412, + "grad_norm": 0.43276071548461914, + "learning_rate": 2.174137931034483e-05, + "loss": 0.0969, + "step": 4917 + }, + { + "epoch": 5.700789683402159, + "grad_norm": 0.44773709774017334, + "learning_rate": 2.1735632183908048e-05, + "loss": 0.0923, + "step": 4918 + }, + { + "epoch": 5.701948851698906, + "grad_norm": 0.3338507115840912, + "learning_rate": 2.1729885057471266e-05, + "loss": 0.0912, + "step": 4919 + }, + { + "epoch": 5.703108019995653, + "grad_norm": 0.6209862232208252, + "learning_rate": 2.1724137931034484e-05, + "loss": 0.096, + "step": 4920 + }, + { + "epoch": 5.7042671882924, + "grad_norm": 0.482950896024704, + "learning_rate": 2.1718390804597702e-05, + "loss": 0.0981, + "step": 4921 + }, + { + "epoch": 5.705426356589148, + "grad_norm": 0.4542859196662903, + "learning_rate": 2.171264367816092e-05, + "loss": 0.0949, + "step": 4922 + }, + { + "epoch": 5.706585524885894, + "grad_norm": 0.4882400333881378, + "learning_rate": 2.170689655172414e-05, + "loss": 0.0961, + "step": 4923 + }, + { + "epoch": 5.707744693182642, + "grad_norm": 0.4771440923213959, + "learning_rate": 2.1701149425287357e-05, + "loss": 0.1034, + "step": 4924 + }, + { + "epoch": 5.7089038614793886, + "grad_norm": 0.5598927140235901, + "learning_rate": 2.1695402298850576e-05, + "loss": 0.0996, + "step": 4925 + }, + { + "epoch": 5.710063029776135, + "grad_norm": 0.45021867752075195, + "learning_rate": 2.1689655172413794e-05, + "loss": 0.1006, + "step": 4926 + }, + { + "epoch": 5.711222198072883, + "grad_norm": 0.38733410835266113, + "learning_rate": 2.1683908045977016e-05, + "loss": 0.0896, + "step": 4927 + }, + { + "epoch": 5.7123813663696295, + "grad_norm": 0.3161242604255676, + "learning_rate": 2.167816091954023e-05, + "loss": 0.0883, + "step": 4928 + }, + { + "epoch": 5.713540534666377, + "grad_norm": 0.43492892384529114, + "learning_rate": 2.167241379310345e-05, + "loss": 0.0994, + "step": 4929 + }, + { + "epoch": 5.714699702963124, + "grad_norm": 0.34343111515045166, + "learning_rate": 2.1666666666666667e-05, + "loss": 0.0947, + "step": 4930 + }, + { + "epoch": 5.715858871259871, + "grad_norm": 1.0595885515213013, + "learning_rate": 2.1660919540229885e-05, + "loss": 0.1021, + "step": 4931 + }, + { + "epoch": 5.717018039556618, + "grad_norm": 0.4083106815814972, + "learning_rate": 2.1655172413793104e-05, + "loss": 0.0991, + "step": 4932 + }, + { + "epoch": 5.7181772078533655, + "grad_norm": 0.3929539918899536, + "learning_rate": 2.1649425287356322e-05, + "loss": 0.0956, + "step": 4933 + }, + { + "epoch": 5.719336376150112, + "grad_norm": 0.3941984176635742, + "learning_rate": 2.1643678160919544e-05, + "loss": 0.1015, + "step": 4934 + }, + { + "epoch": 5.72049554444686, + "grad_norm": 0.5016578435897827, + "learning_rate": 2.163793103448276e-05, + "loss": 0.1034, + "step": 4935 + }, + { + "epoch": 5.721654712743606, + "grad_norm": 0.41408535838127136, + "learning_rate": 2.1632183908045977e-05, + "loss": 0.0969, + "step": 4936 + }, + { + "epoch": 5.722813881040354, + "grad_norm": 0.3475428521633148, + "learning_rate": 2.1626436781609195e-05, + "loss": 0.0932, + "step": 4937 + }, + { + "epoch": 5.723973049337101, + "grad_norm": 0.315900057554245, + "learning_rate": 2.1620689655172417e-05, + "loss": 0.0907, + "step": 4938 + }, + { + "epoch": 5.725132217633847, + "grad_norm": 0.5354477763175964, + "learning_rate": 2.161494252873563e-05, + "loss": 0.0927, + "step": 4939 + }, + { + "epoch": 5.726291385930595, + "grad_norm": 0.40579307079315186, + "learning_rate": 2.160919540229885e-05, + "loss": 0.1103, + "step": 4940 + }, + { + "epoch": 5.7274505542273415, + "grad_norm": 0.3445356786251068, + "learning_rate": 2.160344827586207e-05, + "loss": 0.1003, + "step": 4941 + }, + { + "epoch": 5.728609722524089, + "grad_norm": 0.5686594247817993, + "learning_rate": 2.159770114942529e-05, + "loss": 0.1108, + "step": 4942 + }, + { + "epoch": 5.729768890820836, + "grad_norm": 0.47493574023246765, + "learning_rate": 2.1591954022988505e-05, + "loss": 0.1049, + "step": 4943 + }, + { + "epoch": 5.730928059117583, + "grad_norm": 0.3111097514629364, + "learning_rate": 2.1586206896551726e-05, + "loss": 0.0867, + "step": 4944 + }, + { + "epoch": 5.73208722741433, + "grad_norm": 0.4092866778373718, + "learning_rate": 2.1580459770114945e-05, + "loss": 0.0945, + "step": 4945 + }, + { + "epoch": 5.7332463957110775, + "grad_norm": 0.4263165593147278, + "learning_rate": 2.1574712643678163e-05, + "loss": 0.0818, + "step": 4946 + }, + { + "epoch": 5.734405564007824, + "grad_norm": 0.3667573928833008, + "learning_rate": 2.1568965517241378e-05, + "loss": 0.0882, + "step": 4947 + }, + { + "epoch": 5.735564732304572, + "grad_norm": 0.505990207195282, + "learning_rate": 2.15632183908046e-05, + "loss": 0.0843, + "step": 4948 + }, + { + "epoch": 5.736723900601318, + "grad_norm": 0.4376978278160095, + "learning_rate": 2.1557471264367818e-05, + "loss": 0.0911, + "step": 4949 + }, + { + "epoch": 5.737883068898066, + "grad_norm": 0.4979074001312256, + "learning_rate": 2.1551724137931033e-05, + "loss": 0.1033, + "step": 4950 + }, + { + "epoch": 5.739042237194813, + "grad_norm": 0.48788338899612427, + "learning_rate": 2.1545977011494255e-05, + "loss": 0.1081, + "step": 4951 + }, + { + "epoch": 5.740201405491559, + "grad_norm": 0.6209906935691833, + "learning_rate": 2.1540229885057473e-05, + "loss": 0.1015, + "step": 4952 + }, + { + "epoch": 5.741360573788307, + "grad_norm": 0.40411826968193054, + "learning_rate": 2.153448275862069e-05, + "loss": 0.0942, + "step": 4953 + }, + { + "epoch": 5.7425197420850544, + "grad_norm": 0.4680006206035614, + "learning_rate": 2.152873563218391e-05, + "loss": 0.0898, + "step": 4954 + }, + { + "epoch": 5.743678910381801, + "grad_norm": 0.4377489984035492, + "learning_rate": 2.1522988505747128e-05, + "loss": 0.0927, + "step": 4955 + }, + { + "epoch": 5.744838078678548, + "grad_norm": 0.6448618173599243, + "learning_rate": 2.1517241379310346e-05, + "loss": 0.0984, + "step": 4956 + }, + { + "epoch": 5.745997246975295, + "grad_norm": 0.44574832916259766, + "learning_rate": 2.1511494252873564e-05, + "loss": 0.1009, + "step": 4957 + }, + { + "epoch": 5.747156415272042, + "grad_norm": 0.5613508224487305, + "learning_rate": 2.1505747126436783e-05, + "loss": 0.0987, + "step": 4958 + }, + { + "epoch": 5.74831558356879, + "grad_norm": 0.5421482920646667, + "learning_rate": 2.15e-05, + "loss": 0.0975, + "step": 4959 + }, + { + "epoch": 5.749474751865536, + "grad_norm": 0.4239422082901001, + "learning_rate": 2.149425287356322e-05, + "loss": 0.0974, + "step": 4960 + }, + { + "epoch": 5.750633920162284, + "grad_norm": 0.7376547455787659, + "learning_rate": 2.1488505747126437e-05, + "loss": 0.0881, + "step": 4961 + }, + { + "epoch": 5.7517930884590305, + "grad_norm": 0.449830561876297, + "learning_rate": 2.1482758620689656e-05, + "loss": 0.098, + "step": 4962 + }, + { + "epoch": 5.752952256755778, + "grad_norm": 0.6881018280982971, + "learning_rate": 2.1477011494252874e-05, + "loss": 0.1013, + "step": 4963 + }, + { + "epoch": 5.754111425052525, + "grad_norm": 0.4860606789588928, + "learning_rate": 2.1471264367816092e-05, + "loss": 0.0979, + "step": 4964 + }, + { + "epoch": 5.755270593349272, + "grad_norm": 0.3773813247680664, + "learning_rate": 2.1465517241379314e-05, + "loss": 0.0878, + "step": 4965 + }, + { + "epoch": 5.756429761646019, + "grad_norm": 0.3880038857460022, + "learning_rate": 2.145977011494253e-05, + "loss": 0.0956, + "step": 4966 + }, + { + "epoch": 5.7575889299427665, + "grad_norm": 0.6223691701889038, + "learning_rate": 2.1454022988505747e-05, + "loss": 0.1005, + "step": 4967 + }, + { + "epoch": 5.758748098239513, + "grad_norm": 0.3865233361721039, + "learning_rate": 2.144827586206897e-05, + "loss": 0.1, + "step": 4968 + }, + { + "epoch": 5.75990726653626, + "grad_norm": 0.426666259765625, + "learning_rate": 2.1442528735632184e-05, + "loss": 0.0984, + "step": 4969 + }, + { + "epoch": 5.761066434833007, + "grad_norm": 0.3938763439655304, + "learning_rate": 2.1436781609195402e-05, + "loss": 0.096, + "step": 4970 + }, + { + "epoch": 5.762225603129754, + "grad_norm": 0.4771715998649597, + "learning_rate": 2.143103448275862e-05, + "loss": 0.0964, + "step": 4971 + }, + { + "epoch": 5.763384771426502, + "grad_norm": 0.3489798307418823, + "learning_rate": 2.1425287356321842e-05, + "loss": 0.0877, + "step": 4972 + }, + { + "epoch": 5.764543939723248, + "grad_norm": 0.36042097210884094, + "learning_rate": 2.1419540229885057e-05, + "loss": 0.0889, + "step": 4973 + }, + { + "epoch": 5.765703108019996, + "grad_norm": 0.41603612899780273, + "learning_rate": 2.1413793103448275e-05, + "loss": 0.095, + "step": 4974 + }, + { + "epoch": 5.7668622763167425, + "grad_norm": 0.4094390273094177, + "learning_rate": 2.1408045977011497e-05, + "loss": 0.0955, + "step": 4975 + }, + { + "epoch": 5.76802144461349, + "grad_norm": 0.4778774678707123, + "learning_rate": 2.1402298850574715e-05, + "loss": 0.1005, + "step": 4976 + }, + { + "epoch": 5.769180612910237, + "grad_norm": 0.4297574758529663, + "learning_rate": 2.139655172413793e-05, + "loss": 0.0954, + "step": 4977 + }, + { + "epoch": 5.770339781206984, + "grad_norm": 0.3127455711364746, + "learning_rate": 2.1390804597701152e-05, + "loss": 0.0896, + "step": 4978 + }, + { + "epoch": 5.771498949503731, + "grad_norm": 0.37766167521476746, + "learning_rate": 2.138505747126437e-05, + "loss": 0.0906, + "step": 4979 + }, + { + "epoch": 5.7726581178004785, + "grad_norm": 0.49125292897224426, + "learning_rate": 2.137931034482759e-05, + "loss": 0.0963, + "step": 4980 + }, + { + "epoch": 5.773817286097225, + "grad_norm": 0.4246441423892975, + "learning_rate": 2.1373563218390803e-05, + "loss": 0.0919, + "step": 4981 + }, + { + "epoch": 5.774976454393972, + "grad_norm": 0.366273432970047, + "learning_rate": 2.1367816091954025e-05, + "loss": 0.0896, + "step": 4982 + }, + { + "epoch": 5.7761356226907195, + "grad_norm": 0.4927791655063629, + "learning_rate": 2.1362068965517243e-05, + "loss": 0.1007, + "step": 4983 + }, + { + "epoch": 5.777294790987466, + "grad_norm": 0.29492560029029846, + "learning_rate": 2.135632183908046e-05, + "loss": 0.0959, + "step": 4984 + }, + { + "epoch": 5.778453959284214, + "grad_norm": 0.5132672786712646, + "learning_rate": 2.135057471264368e-05, + "loss": 0.0917, + "step": 4985 + }, + { + "epoch": 5.77961312758096, + "grad_norm": 0.3356097340583801, + "learning_rate": 2.1344827586206898e-05, + "loss": 0.083, + "step": 4986 + }, + { + "epoch": 5.780772295877708, + "grad_norm": 0.43617239594459534, + "learning_rate": 2.1339080459770116e-05, + "loss": 0.1005, + "step": 4987 + }, + { + "epoch": 5.781931464174455, + "grad_norm": 0.4654097259044647, + "learning_rate": 2.1333333333333335e-05, + "loss": 0.0922, + "step": 4988 + }, + { + "epoch": 5.783090632471202, + "grad_norm": 0.42662137746810913, + "learning_rate": 2.1327586206896553e-05, + "loss": 0.099, + "step": 4989 + }, + { + "epoch": 5.784249800767949, + "grad_norm": 0.4461085796356201, + "learning_rate": 2.132183908045977e-05, + "loss": 0.0907, + "step": 4990 + }, + { + "epoch": 5.785408969064696, + "grad_norm": 0.6144752502441406, + "learning_rate": 2.131609195402299e-05, + "loss": 0.0959, + "step": 4991 + }, + { + "epoch": 5.786568137361443, + "grad_norm": 0.4535113275051117, + "learning_rate": 2.1310344827586208e-05, + "loss": 0.0941, + "step": 4992 + }, + { + "epoch": 5.787727305658191, + "grad_norm": 0.556451141834259, + "learning_rate": 2.1304597701149426e-05, + "loss": 0.1009, + "step": 4993 + }, + { + "epoch": 5.788886473954937, + "grad_norm": 0.40173500776290894, + "learning_rate": 2.1298850574712644e-05, + "loss": 0.0952, + "step": 4994 + }, + { + "epoch": 5.790045642251684, + "grad_norm": 0.485877126455307, + "learning_rate": 2.1293103448275863e-05, + "loss": 0.0937, + "step": 4995 + }, + { + "epoch": 5.7912048105484315, + "grad_norm": 0.425545334815979, + "learning_rate": 2.128735632183908e-05, + "loss": 0.0954, + "step": 4996 + }, + { + "epoch": 5.792363978845179, + "grad_norm": 0.463670939207077, + "learning_rate": 2.12816091954023e-05, + "loss": 0.09, + "step": 4997 + }, + { + "epoch": 5.793523147141926, + "grad_norm": 0.3836846947669983, + "learning_rate": 2.1275862068965518e-05, + "loss": 0.0934, + "step": 4998 + }, + { + "epoch": 5.794682315438672, + "grad_norm": 0.7452934980392456, + "learning_rate": 2.127011494252874e-05, + "loss": 0.094, + "step": 4999 + }, + { + "epoch": 5.79584148373542, + "grad_norm": 0.364729106426239, + "learning_rate": 2.1264367816091954e-05, + "loss": 0.092, + "step": 5000 + }, + { + "epoch": 5.797000652032167, + "grad_norm": 0.560750424861908, + "learning_rate": 2.1258620689655172e-05, + "loss": 0.0987, + "step": 5001 + }, + { + "epoch": 5.798159820328914, + "grad_norm": 0.3720407485961914, + "learning_rate": 2.125287356321839e-05, + "loss": 0.0894, + "step": 5002 + }, + { + "epoch": 5.799318988625661, + "grad_norm": 0.3824548125267029, + "learning_rate": 2.1247126436781612e-05, + "loss": 0.0929, + "step": 5003 + }, + { + "epoch": 5.800478156922408, + "grad_norm": 0.3994615972042084, + "learning_rate": 2.1241379310344827e-05, + "loss": 0.1029, + "step": 5004 + }, + { + "epoch": 5.801637325219155, + "grad_norm": 0.4771389067173004, + "learning_rate": 2.1235632183908046e-05, + "loss": 0.0842, + "step": 5005 + }, + { + "epoch": 5.802796493515903, + "grad_norm": 0.4739455580711365, + "learning_rate": 2.1229885057471267e-05, + "loss": 0.1055, + "step": 5006 + }, + { + "epoch": 5.803955661812649, + "grad_norm": 0.5355760455131531, + "learning_rate": 2.1224137931034486e-05, + "loss": 0.0892, + "step": 5007 + }, + { + "epoch": 5.805114830109397, + "grad_norm": 0.5140407681465149, + "learning_rate": 2.12183908045977e-05, + "loss": 0.0986, + "step": 5008 + }, + { + "epoch": 5.8062739984061436, + "grad_norm": 0.3573043644428253, + "learning_rate": 2.1212643678160922e-05, + "loss": 0.0904, + "step": 5009 + }, + { + "epoch": 5.807433166702891, + "grad_norm": 0.3605383634567261, + "learning_rate": 2.120689655172414e-05, + "loss": 0.0984, + "step": 5010 + }, + { + "epoch": 5.808592334999638, + "grad_norm": 0.3857939541339874, + "learning_rate": 2.1201149425287355e-05, + "loss": 0.0911, + "step": 5011 + }, + { + "epoch": 5.8097515032963845, + "grad_norm": 0.3786557614803314, + "learning_rate": 2.1195402298850574e-05, + "loss": 0.0969, + "step": 5012 + }, + { + "epoch": 5.810910671593132, + "grad_norm": 0.4758943021297455, + "learning_rate": 2.1189655172413795e-05, + "loss": 0.1068, + "step": 5013 + }, + { + "epoch": 5.812069839889879, + "grad_norm": 0.5058685541152954, + "learning_rate": 2.1183908045977014e-05, + "loss": 0.0927, + "step": 5014 + }, + { + "epoch": 5.813229008186626, + "grad_norm": 0.31769004464149475, + "learning_rate": 2.117816091954023e-05, + "loss": 0.0904, + "step": 5015 + }, + { + "epoch": 5.814388176483373, + "grad_norm": 0.354096919298172, + "learning_rate": 2.117241379310345e-05, + "loss": 0.1018, + "step": 5016 + }, + { + "epoch": 5.8155473447801205, + "grad_norm": 0.41636335849761963, + "learning_rate": 2.116666666666667e-05, + "loss": 0.0969, + "step": 5017 + }, + { + "epoch": 5.816706513076867, + "grad_norm": 0.46119603514671326, + "learning_rate": 2.1160919540229887e-05, + "loss": 0.1032, + "step": 5018 + }, + { + "epoch": 5.817865681373615, + "grad_norm": 0.38647714257240295, + "learning_rate": 2.1155172413793105e-05, + "loss": 0.0972, + "step": 5019 + }, + { + "epoch": 5.819024849670361, + "grad_norm": 0.3750416338443756, + "learning_rate": 2.1149425287356323e-05, + "loss": 0.0897, + "step": 5020 + }, + { + "epoch": 5.820184017967109, + "grad_norm": 0.4357173442840576, + "learning_rate": 2.114367816091954e-05, + "loss": 0.0988, + "step": 5021 + }, + { + "epoch": 5.821343186263856, + "grad_norm": 0.2895772457122803, + "learning_rate": 2.113793103448276e-05, + "loss": 0.0955, + "step": 5022 + }, + { + "epoch": 5.822502354560603, + "grad_norm": 0.3872501850128174, + "learning_rate": 2.1132183908045978e-05, + "loss": 0.103, + "step": 5023 + }, + { + "epoch": 5.82366152285735, + "grad_norm": 0.40317487716674805, + "learning_rate": 2.1126436781609196e-05, + "loss": 0.0928, + "step": 5024 + }, + { + "epoch": 5.8248206911540965, + "grad_norm": 0.49441027641296387, + "learning_rate": 2.1120689655172415e-05, + "loss": 0.0944, + "step": 5025 + }, + { + "epoch": 5.825979859450844, + "grad_norm": 0.39836275577545166, + "learning_rate": 2.1114942528735633e-05, + "loss": 0.0926, + "step": 5026 + }, + { + "epoch": 5.827139027747591, + "grad_norm": 0.5158929824829102, + "learning_rate": 2.110919540229885e-05, + "loss": 0.0888, + "step": 5027 + }, + { + "epoch": 5.828298196044338, + "grad_norm": 0.4455549716949463, + "learning_rate": 2.110344827586207e-05, + "loss": 0.0907, + "step": 5028 + }, + { + "epoch": 5.829457364341085, + "grad_norm": 0.4579765498638153, + "learning_rate": 2.1097701149425288e-05, + "loss": 0.0903, + "step": 5029 + }, + { + "epoch": 5.8306165326378325, + "grad_norm": 0.6447916030883789, + "learning_rate": 2.1091954022988506e-05, + "loss": 0.0945, + "step": 5030 + }, + { + "epoch": 5.831775700934579, + "grad_norm": 0.44061580300331116, + "learning_rate": 2.1086206896551724e-05, + "loss": 0.1007, + "step": 5031 + }, + { + "epoch": 5.832934869231327, + "grad_norm": 0.46343931555747986, + "learning_rate": 2.1080459770114943e-05, + "loss": 0.1078, + "step": 5032 + }, + { + "epoch": 5.834094037528073, + "grad_norm": 0.4852432310581207, + "learning_rate": 2.1074712643678164e-05, + "loss": 0.1009, + "step": 5033 + }, + { + "epoch": 5.835253205824821, + "grad_norm": 0.2604130208492279, + "learning_rate": 2.106896551724138e-05, + "loss": 0.09, + "step": 5034 + }, + { + "epoch": 5.836412374121568, + "grad_norm": 0.6115668416023254, + "learning_rate": 2.1063218390804598e-05, + "loss": 0.1032, + "step": 5035 + }, + { + "epoch": 5.837571542418315, + "grad_norm": 0.4292698800563812, + "learning_rate": 2.1057471264367816e-05, + "loss": 0.0984, + "step": 5036 + }, + { + "epoch": 5.838730710715062, + "grad_norm": 0.42464151978492737, + "learning_rate": 2.1051724137931038e-05, + "loss": 0.0854, + "step": 5037 + }, + { + "epoch": 5.839889879011809, + "grad_norm": 0.36907604336738586, + "learning_rate": 2.1045977011494253e-05, + "loss": 0.0948, + "step": 5038 + }, + { + "epoch": 5.841049047308556, + "grad_norm": 0.43384525179862976, + "learning_rate": 2.104022988505747e-05, + "loss": 0.0955, + "step": 5039 + }, + { + "epoch": 5.842208215605304, + "grad_norm": 0.40040022134780884, + "learning_rate": 2.1034482758620692e-05, + "loss": 0.0971, + "step": 5040 + }, + { + "epoch": 5.84336738390205, + "grad_norm": 0.4272783100605011, + "learning_rate": 2.102873563218391e-05, + "loss": 0.0898, + "step": 5041 + }, + { + "epoch": 5.844526552198797, + "grad_norm": 0.3518434762954712, + "learning_rate": 2.1022988505747126e-05, + "loss": 0.0905, + "step": 5042 + }, + { + "epoch": 5.845685720495545, + "grad_norm": 0.39061328768730164, + "learning_rate": 2.1017241379310344e-05, + "loss": 0.0924, + "step": 5043 + }, + { + "epoch": 5.846844888792291, + "grad_norm": 0.4481010437011719, + "learning_rate": 2.1011494252873566e-05, + "loss": 0.0922, + "step": 5044 + }, + { + "epoch": 5.848004057089039, + "grad_norm": 0.5339946746826172, + "learning_rate": 2.1005747126436784e-05, + "loss": 0.1021, + "step": 5045 + }, + { + "epoch": 5.8491632253857855, + "grad_norm": 0.6023551821708679, + "learning_rate": 2.1e-05, + "loss": 0.0971, + "step": 5046 + }, + { + "epoch": 5.850322393682533, + "grad_norm": 0.41510429978370667, + "learning_rate": 2.099425287356322e-05, + "loss": 0.1031, + "step": 5047 + }, + { + "epoch": 5.85148156197928, + "grad_norm": 0.4894249439239502, + "learning_rate": 2.098850574712644e-05, + "loss": 0.0986, + "step": 5048 + }, + { + "epoch": 5.852640730276027, + "grad_norm": 0.3392792046070099, + "learning_rate": 2.0982758620689654e-05, + "loss": 0.0904, + "step": 5049 + }, + { + "epoch": 5.853799898572774, + "grad_norm": 0.43131858110427856, + "learning_rate": 2.0977011494252875e-05, + "loss": 0.0933, + "step": 5050 + }, + { + "epoch": 5.8549590668695215, + "grad_norm": 0.3799227476119995, + "learning_rate": 2.0971264367816094e-05, + "loss": 0.0892, + "step": 5051 + }, + { + "epoch": 5.856118235166268, + "grad_norm": 0.44021299481391907, + "learning_rate": 2.0965517241379312e-05, + "loss": 0.0978, + "step": 5052 + }, + { + "epoch": 5.857277403463016, + "grad_norm": 0.39485037326812744, + "learning_rate": 2.0959770114942527e-05, + "loss": 0.0918, + "step": 5053 + }, + { + "epoch": 5.858436571759762, + "grad_norm": 0.367180734872818, + "learning_rate": 2.095402298850575e-05, + "loss": 0.1015, + "step": 5054 + }, + { + "epoch": 5.859595740056509, + "grad_norm": 0.6080685257911682, + "learning_rate": 2.0948275862068967e-05, + "loss": 0.0916, + "step": 5055 + }, + { + "epoch": 5.860754908353257, + "grad_norm": 0.3705676198005676, + "learning_rate": 2.0942528735632185e-05, + "loss": 0.0935, + "step": 5056 + }, + { + "epoch": 5.861914076650003, + "grad_norm": 0.41616174578666687, + "learning_rate": 2.0936781609195403e-05, + "loss": 0.096, + "step": 5057 + }, + { + "epoch": 5.863073244946751, + "grad_norm": 0.3993401825428009, + "learning_rate": 2.0931034482758622e-05, + "loss": 0.0943, + "step": 5058 + }, + { + "epoch": 5.8642324132434975, + "grad_norm": 0.4943077564239502, + "learning_rate": 2.092528735632184e-05, + "loss": 0.096, + "step": 5059 + }, + { + "epoch": 5.865391581540245, + "grad_norm": 0.3290475010871887, + "learning_rate": 2.0919540229885058e-05, + "loss": 0.0948, + "step": 5060 + }, + { + "epoch": 5.866550749836992, + "grad_norm": 0.39300331473350525, + "learning_rate": 2.0913793103448277e-05, + "loss": 0.1004, + "step": 5061 + }, + { + "epoch": 5.867709918133739, + "grad_norm": 0.4835030138492584, + "learning_rate": 2.0908045977011495e-05, + "loss": 0.0884, + "step": 5062 + }, + { + "epoch": 5.868869086430486, + "grad_norm": 0.5800598859786987, + "learning_rate": 2.0902298850574713e-05, + "loss": 0.0988, + "step": 5063 + }, + { + "epoch": 5.8700282547272336, + "grad_norm": 0.4258168041706085, + "learning_rate": 2.0896551724137935e-05, + "loss": 0.1035, + "step": 5064 + }, + { + "epoch": 5.87118742302398, + "grad_norm": 0.4930982291698456, + "learning_rate": 2.089080459770115e-05, + "loss": 0.0923, + "step": 5065 + }, + { + "epoch": 5.872346591320728, + "grad_norm": 0.3994540870189667, + "learning_rate": 2.0885057471264368e-05, + "loss": 0.0916, + "step": 5066 + }, + { + "epoch": 5.8735057596174745, + "grad_norm": 0.35855406522750854, + "learning_rate": 2.0879310344827586e-05, + "loss": 0.0976, + "step": 5067 + }, + { + "epoch": 5.874664927914221, + "grad_norm": 0.37041381001472473, + "learning_rate": 2.0873563218390808e-05, + "loss": 0.0863, + "step": 5068 + }, + { + "epoch": 5.875824096210969, + "grad_norm": 0.44822248816490173, + "learning_rate": 2.0867816091954023e-05, + "loss": 0.0963, + "step": 5069 + }, + { + "epoch": 5.876983264507715, + "grad_norm": 0.3357725739479065, + "learning_rate": 2.086206896551724e-05, + "loss": 0.0877, + "step": 5070 + }, + { + "epoch": 5.878142432804463, + "grad_norm": 0.37162381410598755, + "learning_rate": 2.0856321839080463e-05, + "loss": 0.0874, + "step": 5071 + }, + { + "epoch": 5.87930160110121, + "grad_norm": 0.4773227274417877, + "learning_rate": 2.0850574712643678e-05, + "loss": 0.0965, + "step": 5072 + }, + { + "epoch": 5.880460769397957, + "grad_norm": 0.47884953022003174, + "learning_rate": 2.0844827586206896e-05, + "loss": 0.0946, + "step": 5073 + }, + { + "epoch": 5.881619937694704, + "grad_norm": 0.3373884856700897, + "learning_rate": 2.0839080459770118e-05, + "loss": 0.0949, + "step": 5074 + }, + { + "epoch": 5.882779105991451, + "grad_norm": 0.399009644985199, + "learning_rate": 2.0833333333333336e-05, + "loss": 0.0953, + "step": 5075 + }, + { + "epoch": 5.883938274288198, + "grad_norm": 0.44031986594200134, + "learning_rate": 2.082758620689655e-05, + "loss": 0.1024, + "step": 5076 + }, + { + "epoch": 5.885097442584946, + "grad_norm": 0.3464363217353821, + "learning_rate": 2.082183908045977e-05, + "loss": 0.1056, + "step": 5077 + }, + { + "epoch": 5.886256610881692, + "grad_norm": 0.4489310383796692, + "learning_rate": 2.081609195402299e-05, + "loss": 0.0995, + "step": 5078 + }, + { + "epoch": 5.88741577917844, + "grad_norm": 0.43739187717437744, + "learning_rate": 2.081034482758621e-05, + "loss": 0.0965, + "step": 5079 + }, + { + "epoch": 5.8885749474751865, + "grad_norm": 0.35651272535324097, + "learning_rate": 2.0804597701149424e-05, + "loss": 0.0944, + "step": 5080 + }, + { + "epoch": 5.889734115771933, + "grad_norm": 0.4365938603878021, + "learning_rate": 2.0798850574712646e-05, + "loss": 0.101, + "step": 5081 + }, + { + "epoch": 5.890893284068681, + "grad_norm": 0.4614748954772949, + "learning_rate": 2.0793103448275864e-05, + "loss": 0.0986, + "step": 5082 + }, + { + "epoch": 5.892052452365428, + "grad_norm": 0.3819468915462494, + "learning_rate": 2.0787356321839082e-05, + "loss": 0.0882, + "step": 5083 + }, + { + "epoch": 5.893211620662175, + "grad_norm": 0.5456834435462952, + "learning_rate": 2.07816091954023e-05, + "loss": 0.0931, + "step": 5084 + }, + { + "epoch": 5.894370788958922, + "grad_norm": 0.48817238211631775, + "learning_rate": 2.077586206896552e-05, + "loss": 0.0946, + "step": 5085 + }, + { + "epoch": 5.895529957255669, + "grad_norm": 0.5912837386131287, + "learning_rate": 2.0770114942528737e-05, + "loss": 0.0907, + "step": 5086 + }, + { + "epoch": 5.896689125552416, + "grad_norm": 0.38435614109039307, + "learning_rate": 2.0764367816091956e-05, + "loss": 0.0919, + "step": 5087 + }, + { + "epoch": 5.897848293849163, + "grad_norm": 0.3552255928516388, + "learning_rate": 2.0758620689655174e-05, + "loss": 0.0925, + "step": 5088 + }, + { + "epoch": 5.89900746214591, + "grad_norm": 0.3824387192726135, + "learning_rate": 2.0752873563218392e-05, + "loss": 0.1059, + "step": 5089 + }, + { + "epoch": 5.900166630442658, + "grad_norm": 0.43972963094711304, + "learning_rate": 2.074712643678161e-05, + "loss": 0.0966, + "step": 5090 + }, + { + "epoch": 5.901325798739404, + "grad_norm": 0.3910900354385376, + "learning_rate": 2.074137931034483e-05, + "loss": 0.0889, + "step": 5091 + }, + { + "epoch": 5.902484967036152, + "grad_norm": 0.45151054859161377, + "learning_rate": 2.0735632183908047e-05, + "loss": 0.1027, + "step": 5092 + }, + { + "epoch": 5.903644135332899, + "grad_norm": 0.3880411982536316, + "learning_rate": 2.0729885057471265e-05, + "loss": 0.0947, + "step": 5093 + }, + { + "epoch": 5.904803303629646, + "grad_norm": 0.40984082221984863, + "learning_rate": 2.0724137931034484e-05, + "loss": 0.0929, + "step": 5094 + }, + { + "epoch": 5.905962471926393, + "grad_norm": 0.32482361793518066, + "learning_rate": 2.0718390804597702e-05, + "loss": 0.0848, + "step": 5095 + }, + { + "epoch": 5.90712164022314, + "grad_norm": 0.438496470451355, + "learning_rate": 2.071264367816092e-05, + "loss": 0.088, + "step": 5096 + }, + { + "epoch": 5.908280808519887, + "grad_norm": 0.47613009810447693, + "learning_rate": 2.070689655172414e-05, + "loss": 0.0942, + "step": 5097 + }, + { + "epoch": 5.909439976816634, + "grad_norm": 0.3807111978530884, + "learning_rate": 2.0701149425287357e-05, + "loss": 0.096, + "step": 5098 + }, + { + "epoch": 5.910599145113381, + "grad_norm": 0.4180908799171448, + "learning_rate": 2.0695402298850575e-05, + "loss": 0.0892, + "step": 5099 + }, + { + "epoch": 5.911758313410128, + "grad_norm": 0.5334672927856445, + "learning_rate": 2.0689655172413793e-05, + "loss": 0.0952, + "step": 5100 + }, + { + "epoch": 5.9129174817068755, + "grad_norm": 0.4528830945491791, + "learning_rate": 2.068390804597701e-05, + "loss": 0.0967, + "step": 5101 + }, + { + "epoch": 5.914076650003622, + "grad_norm": 0.4010438323020935, + "learning_rate": 2.0678160919540233e-05, + "loss": 0.0969, + "step": 5102 + }, + { + "epoch": 5.91523581830037, + "grad_norm": 0.3919377624988556, + "learning_rate": 2.0672413793103448e-05, + "loss": 0.0836, + "step": 5103 + }, + { + "epoch": 5.916394986597116, + "grad_norm": 0.3746892511844635, + "learning_rate": 2.0666666666666666e-05, + "loss": 0.0957, + "step": 5104 + }, + { + "epoch": 5.917554154893864, + "grad_norm": 0.5136653780937195, + "learning_rate": 2.0660919540229888e-05, + "loss": 0.0989, + "step": 5105 + }, + { + "epoch": 5.918713323190611, + "grad_norm": 0.4311780631542206, + "learning_rate": 2.0655172413793106e-05, + "loss": 0.0998, + "step": 5106 + }, + { + "epoch": 5.919872491487358, + "grad_norm": 0.4143439829349518, + "learning_rate": 2.064942528735632e-05, + "loss": 0.0921, + "step": 5107 + }, + { + "epoch": 5.921031659784105, + "grad_norm": 0.857010543346405, + "learning_rate": 2.064367816091954e-05, + "loss": 0.0987, + "step": 5108 + }, + { + "epoch": 5.922190828080852, + "grad_norm": 0.5835907459259033, + "learning_rate": 2.063793103448276e-05, + "loss": 0.1125, + "step": 5109 + }, + { + "epoch": 5.923349996377599, + "grad_norm": 0.3759761154651642, + "learning_rate": 2.0632183908045976e-05, + "loss": 0.0884, + "step": 5110 + }, + { + "epoch": 5.924509164674346, + "grad_norm": 0.4615977704524994, + "learning_rate": 2.0626436781609194e-05, + "loss": 0.0969, + "step": 5111 + }, + { + "epoch": 5.925668332971093, + "grad_norm": 0.3750910758972168, + "learning_rate": 2.0620689655172416e-05, + "loss": 0.0927, + "step": 5112 + }, + { + "epoch": 5.92682750126784, + "grad_norm": 0.42511269450187683, + "learning_rate": 2.0614942528735634e-05, + "loss": 0.0899, + "step": 5113 + }, + { + "epoch": 5.9279866695645875, + "grad_norm": 0.5754655003547668, + "learning_rate": 2.060919540229885e-05, + "loss": 0.0914, + "step": 5114 + }, + { + "epoch": 5.929145837861334, + "grad_norm": 0.43884530663490295, + "learning_rate": 2.060344827586207e-05, + "loss": 0.0984, + "step": 5115 + }, + { + "epoch": 5.930305006158082, + "grad_norm": 0.3772994577884674, + "learning_rate": 2.059770114942529e-05, + "loss": 0.0964, + "step": 5116 + }, + { + "epoch": 5.931464174454828, + "grad_norm": 0.46012642979621887, + "learning_rate": 2.0591954022988508e-05, + "loss": 0.0927, + "step": 5117 + }, + { + "epoch": 5.932623342751576, + "grad_norm": 0.39248010516166687, + "learning_rate": 2.0586206896551722e-05, + "loss": 0.0925, + "step": 5118 + }, + { + "epoch": 5.933782511048323, + "grad_norm": 0.42726606130599976, + "learning_rate": 2.0580459770114944e-05, + "loss": 0.1009, + "step": 5119 + }, + { + "epoch": 5.93494167934507, + "grad_norm": 0.4554595947265625, + "learning_rate": 2.0574712643678162e-05, + "loss": 0.1023, + "step": 5120 + }, + { + "epoch": 5.936100847641817, + "grad_norm": 0.34201934933662415, + "learning_rate": 2.056896551724138e-05, + "loss": 0.095, + "step": 5121 + }, + { + "epoch": 5.9372600159385645, + "grad_norm": 0.4734445810317993, + "learning_rate": 2.05632183908046e-05, + "loss": 0.0948, + "step": 5122 + }, + { + "epoch": 5.938419184235311, + "grad_norm": 0.4069608449935913, + "learning_rate": 2.0557471264367817e-05, + "loss": 0.0912, + "step": 5123 + }, + { + "epoch": 5.939578352532058, + "grad_norm": 0.39140281081199646, + "learning_rate": 2.0551724137931036e-05, + "loss": 0.09, + "step": 5124 + }, + { + "epoch": 5.940737520828805, + "grad_norm": 0.547514796257019, + "learning_rate": 2.0545977011494254e-05, + "loss": 0.0972, + "step": 5125 + }, + { + "epoch": 5.941896689125553, + "grad_norm": 0.34587743878364563, + "learning_rate": 2.0540229885057472e-05, + "loss": 0.0912, + "step": 5126 + }, + { + "epoch": 5.9430558574223, + "grad_norm": 0.5674780607223511, + "learning_rate": 2.053448275862069e-05, + "loss": 0.1001, + "step": 5127 + }, + { + "epoch": 5.944215025719046, + "grad_norm": 0.3166492283344269, + "learning_rate": 2.052873563218391e-05, + "loss": 0.0838, + "step": 5128 + }, + { + "epoch": 5.945374194015794, + "grad_norm": 0.3704341948032379, + "learning_rate": 2.0522988505747127e-05, + "loss": 0.0883, + "step": 5129 + }, + { + "epoch": 5.9465333623125405, + "grad_norm": 0.3320915699005127, + "learning_rate": 2.0517241379310345e-05, + "loss": 0.0836, + "step": 5130 + }, + { + "epoch": 5.947692530609288, + "grad_norm": 0.40248364210128784, + "learning_rate": 2.0511494252873564e-05, + "loss": 0.0946, + "step": 5131 + }, + { + "epoch": 5.948851698906035, + "grad_norm": 0.523185670375824, + "learning_rate": 2.0505747126436782e-05, + "loss": 0.0928, + "step": 5132 + }, + { + "epoch": 5.950010867202782, + "grad_norm": 0.3942788243293762, + "learning_rate": 2.05e-05, + "loss": 0.0931, + "step": 5133 + }, + { + "epoch": 5.951170035499529, + "grad_norm": 0.39801275730133057, + "learning_rate": 2.049425287356322e-05, + "loss": 0.096, + "step": 5134 + }, + { + "epoch": 5.9523292037962765, + "grad_norm": 0.5294638276100159, + "learning_rate": 2.0488505747126437e-05, + "loss": 0.0982, + "step": 5135 + }, + { + "epoch": 5.953488372093023, + "grad_norm": 0.732921838760376, + "learning_rate": 2.048275862068966e-05, + "loss": 0.1096, + "step": 5136 + }, + { + "epoch": 5.95464754038977, + "grad_norm": 0.39925023913383484, + "learning_rate": 2.0477011494252873e-05, + "loss": 0.0964, + "step": 5137 + }, + { + "epoch": 5.955806708686517, + "grad_norm": 0.47731050848960876, + "learning_rate": 2.047126436781609e-05, + "loss": 0.0994, + "step": 5138 + }, + { + "epoch": 5.956965876983265, + "grad_norm": 0.48555245995521545, + "learning_rate": 2.0465517241379313e-05, + "loss": 0.0893, + "step": 5139 + }, + { + "epoch": 5.958125045280012, + "grad_norm": 0.33314958214759827, + "learning_rate": 2.045977011494253e-05, + "loss": 0.0874, + "step": 5140 + }, + { + "epoch": 5.959284213576758, + "grad_norm": 0.5869859457015991, + "learning_rate": 2.0454022988505747e-05, + "loss": 0.0995, + "step": 5141 + }, + { + "epoch": 5.960443381873506, + "grad_norm": 0.3285931348800659, + "learning_rate": 2.0448275862068965e-05, + "loss": 0.0937, + "step": 5142 + }, + { + "epoch": 5.9616025501702525, + "grad_norm": 0.3716854751110077, + "learning_rate": 2.0442528735632187e-05, + "loss": 0.0976, + "step": 5143 + }, + { + "epoch": 5.962761718467, + "grad_norm": 0.3806571960449219, + "learning_rate": 2.0436781609195405e-05, + "loss": 0.096, + "step": 5144 + }, + { + "epoch": 5.963920886763747, + "grad_norm": 0.4217926859855652, + "learning_rate": 2.043103448275862e-05, + "loss": 0.0966, + "step": 5145 + }, + { + "epoch": 5.965080055060494, + "grad_norm": 0.3270195424556732, + "learning_rate": 2.042528735632184e-05, + "loss": 0.0873, + "step": 5146 + }, + { + "epoch": 5.966239223357241, + "grad_norm": 0.4076954126358032, + "learning_rate": 2.041954022988506e-05, + "loss": 0.0909, + "step": 5147 + }, + { + "epoch": 5.9673983916539886, + "grad_norm": 0.491375595331192, + "learning_rate": 2.0413793103448278e-05, + "loss": 0.092, + "step": 5148 + }, + { + "epoch": 5.968557559950735, + "grad_norm": 0.42915698885917664, + "learning_rate": 2.0408045977011493e-05, + "loss": 0.0966, + "step": 5149 + }, + { + "epoch": 5.969716728247483, + "grad_norm": 0.38473838567733765, + "learning_rate": 2.0402298850574715e-05, + "loss": 0.0911, + "step": 5150 + }, + { + "epoch": 5.9708758965442295, + "grad_norm": 0.4680832028388977, + "learning_rate": 2.0396551724137933e-05, + "loss": 0.0904, + "step": 5151 + }, + { + "epoch": 5.972035064840977, + "grad_norm": 0.4412045180797577, + "learning_rate": 2.0390804597701148e-05, + "loss": 0.1048, + "step": 5152 + }, + { + "epoch": 5.973194233137724, + "grad_norm": 0.4312341809272766, + "learning_rate": 2.038505747126437e-05, + "loss": 0.096, + "step": 5153 + }, + { + "epoch": 5.97435340143447, + "grad_norm": 0.5846208333969116, + "learning_rate": 2.0379310344827588e-05, + "loss": 0.0986, + "step": 5154 + }, + { + "epoch": 5.975512569731218, + "grad_norm": 0.3806995749473572, + "learning_rate": 2.0373563218390806e-05, + "loss": 0.091, + "step": 5155 + }, + { + "epoch": 5.976671738027965, + "grad_norm": 0.3785600960254669, + "learning_rate": 2.0367816091954024e-05, + "loss": 0.0969, + "step": 5156 + }, + { + "epoch": 5.977830906324712, + "grad_norm": 0.3626193106174469, + "learning_rate": 2.0362068965517243e-05, + "loss": 0.09, + "step": 5157 + }, + { + "epoch": 5.978990074621459, + "grad_norm": 0.33364805579185486, + "learning_rate": 2.035632183908046e-05, + "loss": 0.0933, + "step": 5158 + }, + { + "epoch": 5.980149242918206, + "grad_norm": 0.4983973503112793, + "learning_rate": 2.035057471264368e-05, + "loss": 0.1003, + "step": 5159 + }, + { + "epoch": 5.981308411214953, + "grad_norm": 0.5828344225883484, + "learning_rate": 2.0344827586206897e-05, + "loss": 0.0986, + "step": 5160 + }, + { + "epoch": 5.982467579511701, + "grad_norm": 0.3898671269416809, + "learning_rate": 2.0339080459770116e-05, + "loss": 0.0839, + "step": 5161 + }, + { + "epoch": 5.983626747808447, + "grad_norm": 0.3521953523159027, + "learning_rate": 2.0333333333333334e-05, + "loss": 0.0837, + "step": 5162 + }, + { + "epoch": 5.984785916105195, + "grad_norm": 0.40425774455070496, + "learning_rate": 2.0327586206896552e-05, + "loss": 0.095, + "step": 5163 + }, + { + "epoch": 5.9859450844019415, + "grad_norm": 0.45948293805122375, + "learning_rate": 2.032183908045977e-05, + "loss": 0.0974, + "step": 5164 + }, + { + "epoch": 5.987104252698689, + "grad_norm": 0.40446847677230835, + "learning_rate": 2.031609195402299e-05, + "loss": 0.0932, + "step": 5165 + }, + { + "epoch": 5.988263420995436, + "grad_norm": 0.49766218662261963, + "learning_rate": 2.0310344827586207e-05, + "loss": 0.0998, + "step": 5166 + }, + { + "epoch": 5.989422589292182, + "grad_norm": 0.5645685195922852, + "learning_rate": 2.030459770114943e-05, + "loss": 0.1052, + "step": 5167 + }, + { + "epoch": 5.99058175758893, + "grad_norm": 0.28001391887664795, + "learning_rate": 2.0298850574712644e-05, + "loss": 0.0839, + "step": 5168 + }, + { + "epoch": 5.991740925885677, + "grad_norm": 0.3594200611114502, + "learning_rate": 2.0293103448275862e-05, + "loss": 0.0934, + "step": 5169 + }, + { + "epoch": 5.992900094182424, + "grad_norm": 0.34900590777397156, + "learning_rate": 2.0287356321839084e-05, + "loss": 0.0856, + "step": 5170 + }, + { + "epoch": 5.994059262479171, + "grad_norm": 0.4731251001358032, + "learning_rate": 2.02816091954023e-05, + "loss": 0.1025, + "step": 5171 + }, + { + "epoch": 5.995218430775918, + "grad_norm": 0.3666514754295349, + "learning_rate": 2.0275862068965517e-05, + "loss": 0.0929, + "step": 5172 + }, + { + "epoch": 5.996377599072665, + "grad_norm": 0.348015695810318, + "learning_rate": 2.0270114942528735e-05, + "loss": 0.0902, + "step": 5173 + }, + { + "epoch": 5.997536767369413, + "grad_norm": 0.3184606432914734, + "learning_rate": 2.0264367816091957e-05, + "loss": 0.0891, + "step": 5174 + }, + { + "epoch": 5.998695935666159, + "grad_norm": 0.7606669664382935, + "learning_rate": 2.0258620689655172e-05, + "loss": 0.1031, + "step": 5175 + }, + { + "epoch": 5.999855103962907, + "grad_norm": 0.6448565721511841, + "learning_rate": 2.025287356321839e-05, + "loss": 0.1023, + "step": 5176 + }, + { + "epoch": 5.999855103962907, + "eval_loss": 0.148734450340271, + "eval_runtime": 265.936, + "eval_samples_per_second": 5.768, + "eval_steps_per_second": 5.768, + "step": 5176 + } + ], + "logging_steps": 1, + "max_steps": 8620, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.3198416576181862e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}