diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7950 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 200, + "global_step": 1124, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017793594306049821, + "grad_norm": 2.3150479490250784, + "learning_rate": 9.99998046979289e-06, + "loss": 0.122, + "step": 1 + }, + { + "epoch": 0.0035587188612099642, + "grad_norm": 1.8939108981994268, + "learning_rate": 9.999921879324127e-06, + "loss": 0.0978, + "step": 2 + }, + { + "epoch": 0.005338078291814947, + "grad_norm": 2.2313302724289206, + "learning_rate": 9.999824229051425e-06, + "loss": 0.1139, + "step": 3 + }, + { + "epoch": 0.0071174377224199285, + "grad_norm": 2.147995036567701, + "learning_rate": 9.999687519737639e-06, + "loss": 0.124, + "step": 4 + }, + { + "epoch": 0.008896797153024912, + "grad_norm": 2.07600425424386, + "learning_rate": 9.99951175245075e-06, + "loss": 0.0982, + "step": 5 + }, + { + "epoch": 0.010676156583629894, + "grad_norm": 2.236609634956885, + "learning_rate": 9.999296928563868e-06, + "loss": 0.128, + "step": 6 + }, + { + "epoch": 0.012455516014234875, + "grad_norm": 2.46718595601003, + "learning_rate": 9.999043049755216e-06, + "loss": 0.1348, + "step": 7 + }, + { + "epoch": 0.014234875444839857, + "grad_norm": 1.5643536190598661, + "learning_rate": 9.998750118008117e-06, + "loss": 0.0829, + "step": 8 + }, + { + "epoch": 0.01601423487544484, + "grad_norm": 1.6749555322656917, + "learning_rate": 9.998418135610974e-06, + "loss": 0.0875, + "step": 9 + }, + { + "epoch": 0.017793594306049824, + "grad_norm": 2.102986051425079, + "learning_rate": 9.998047105157265e-06, + "loss": 0.1433, + "step": 10 + }, + { + "epoch": 0.019572953736654804, + "grad_norm": 1.6806961724301148, + "learning_rate": 9.997637029545509e-06, + "loss": 0.0864, + "step": 11 + }, + { + "epoch": 0.021352313167259787, + "grad_norm": 2.3501140885281218, + "learning_rate": 9.997187911979252e-06, + "loss": 0.1306, + "step": 12 + }, + { + "epoch": 0.023131672597864767, + "grad_norm": 2.2011534009760183, + "learning_rate": 9.996699755967035e-06, + "loss": 0.1418, + "step": 13 + }, + { + "epoch": 0.02491103202846975, + "grad_norm": 1.834656075860282, + "learning_rate": 9.996172565322375e-06, + "loss": 0.1054, + "step": 14 + }, + { + "epoch": 0.026690391459074734, + "grad_norm": 1.8702617116201332, + "learning_rate": 9.995606344163728e-06, + "loss": 0.1164, + "step": 15 + }, + { + "epoch": 0.028469750889679714, + "grad_norm": 1.4133556400197504, + "learning_rate": 9.995001096914462e-06, + "loss": 0.0937, + "step": 16 + }, + { + "epoch": 0.030249110320284697, + "grad_norm": 1.5517785688670929, + "learning_rate": 9.994356828302818e-06, + "loss": 0.1093, + "step": 17 + }, + { + "epoch": 0.03202846975088968, + "grad_norm": 1.3391448776499155, + "learning_rate": 9.993673543361874e-06, + "loss": 0.0953, + "step": 18 + }, + { + "epoch": 0.033807829181494664, + "grad_norm": 2.0203435862014003, + "learning_rate": 9.992951247429512e-06, + "loss": 0.1585, + "step": 19 + }, + { + "epoch": 0.03558718861209965, + "grad_norm": 1.415383314135429, + "learning_rate": 9.992189946148366e-06, + "loss": 0.0919, + "step": 20 + }, + { + "epoch": 0.037366548042704624, + "grad_norm": 1.1271360275786408, + "learning_rate": 9.991389645465786e-06, + "loss": 0.0756, + "step": 21 + }, + { + "epoch": 0.03914590747330961, + "grad_norm": 
1.7287049496025613, + "learning_rate": 9.990550351633784e-06, + "loss": 0.1279, + "step": 22 + }, + { + "epoch": 0.04092526690391459, + "grad_norm": 1.454809662757122, + "learning_rate": 9.989672071208993e-06, + "loss": 0.1177, + "step": 23 + }, + { + "epoch": 0.042704626334519574, + "grad_norm": 1.8333181806129868, + "learning_rate": 9.988754811052616e-06, + "loss": 0.1488, + "step": 24 + }, + { + "epoch": 0.04448398576512456, + "grad_norm": 1.821106896278666, + "learning_rate": 9.987798578330365e-06, + "loss": 0.1426, + "step": 25 + }, + { + "epoch": 0.046263345195729534, + "grad_norm": 1.586635078482126, + "learning_rate": 9.986803380512406e-06, + "loss": 0.1186, + "step": 26 + }, + { + "epoch": 0.04804270462633452, + "grad_norm": 1.5546514668832576, + "learning_rate": 9.98576922537331e-06, + "loss": 0.1179, + "step": 27 + }, + { + "epoch": 0.0498220640569395, + "grad_norm": 1.613267015140376, + "learning_rate": 9.984696120991979e-06, + "loss": 0.1139, + "step": 28 + }, + { + "epoch": 0.051601423487544484, + "grad_norm": 2.4908063051717044, + "learning_rate": 9.983584075751598e-06, + "loss": 0.1268, + "step": 29 + }, + { + "epoch": 0.05338078291814947, + "grad_norm": 1.5857689757989275, + "learning_rate": 9.982433098339553e-06, + "loss": 0.1195, + "step": 30 + }, + { + "epoch": 0.05516014234875445, + "grad_norm": 2.0410218934139643, + "learning_rate": 9.981243197747375e-06, + "loss": 0.1461, + "step": 31 + }, + { + "epoch": 0.05693950177935943, + "grad_norm": 2.6117564459399083, + "learning_rate": 9.980014383270668e-06, + "loss": 0.1701, + "step": 32 + }, + { + "epoch": 0.05871886120996441, + "grad_norm": 1.66690519128048, + "learning_rate": 9.978746664509032e-06, + "loss": 0.1373, + "step": 33 + }, + { + "epoch": 0.060498220640569395, + "grad_norm": 1.4077261838750643, + "learning_rate": 9.97744005136599e-06, + "loss": 0.0957, + "step": 34 + }, + { + "epoch": 0.06227758007117438, + "grad_norm": 1.4663691854524623, + "learning_rate": 9.976094554048912e-06, + "loss": 0.1129, + "step": 35 + }, + { + "epoch": 0.06405693950177936, + "grad_norm": 1.7723259431724423, + "learning_rate": 9.974710183068935e-06, + "loss": 0.1218, + "step": 36 + }, + { + "epoch": 0.06583629893238434, + "grad_norm": 1.4100491042718053, + "learning_rate": 9.97328694924088e-06, + "loss": 0.1026, + "step": 37 + }, + { + "epoch": 0.06761565836298933, + "grad_norm": 1.605871591007416, + "learning_rate": 9.971824863683168e-06, + "loss": 0.126, + "step": 38 + }, + { + "epoch": 0.0693950177935943, + "grad_norm": 1.6815314399777885, + "learning_rate": 9.970323937817732e-06, + "loss": 0.1195, + "step": 39 + }, + { + "epoch": 0.0711743772241993, + "grad_norm": 1.6559595335176767, + "learning_rate": 9.968784183369929e-06, + "loss": 0.1109, + "step": 40 + }, + { + "epoch": 0.07295373665480427, + "grad_norm": 1.665846243493975, + "learning_rate": 9.96720561236845e-06, + "loss": 0.127, + "step": 41 + }, + { + "epoch": 0.07473309608540925, + "grad_norm": 1.7460396014674455, + "learning_rate": 9.965588237145219e-06, + "loss": 0.1236, + "step": 42 + }, + { + "epoch": 0.07651245551601424, + "grad_norm": 1.5742329771492725, + "learning_rate": 9.963932070335307e-06, + "loss": 0.1195, + "step": 43 + }, + { + "epoch": 0.07829181494661921, + "grad_norm": 1.6055445888172124, + "learning_rate": 9.962237124876828e-06, + "loss": 0.1119, + "step": 44 + }, + { + "epoch": 0.0800711743772242, + "grad_norm": 1.5925472414645612, + "learning_rate": 9.960503414010833e-06, + "loss": 0.1229, + "step": 45 + }, + { + "epoch": 0.08185053380782918, + 
"grad_norm": 1.726125819903223, + "learning_rate": 9.958730951281218e-06, + "loss": 0.1299, + "step": 46 + }, + { + "epoch": 0.08362989323843416, + "grad_norm": 1.7872198307678588, + "learning_rate": 9.956919750534607e-06, + "loss": 0.1268, + "step": 47 + }, + { + "epoch": 0.08540925266903915, + "grad_norm": 1.7461267629761665, + "learning_rate": 9.955069825920249e-06, + "loss": 0.1354, + "step": 48 + }, + { + "epoch": 0.08718861209964412, + "grad_norm": 1.5052492583066552, + "learning_rate": 9.953181191889913e-06, + "loss": 0.1229, + "step": 49 + }, + { + "epoch": 0.08896797153024912, + "grad_norm": 1.3366981185871216, + "learning_rate": 9.95125386319776e-06, + "loss": 0.1006, + "step": 50 + }, + { + "epoch": 0.09074733096085409, + "grad_norm": 2.0856819842733745, + "learning_rate": 9.949287854900243e-06, + "loss": 0.1538, + "step": 51 + }, + { + "epoch": 0.09252669039145907, + "grad_norm": 1.77197422683052, + "learning_rate": 9.947283182355982e-06, + "loss": 0.1285, + "step": 52 + }, + { + "epoch": 0.09430604982206406, + "grad_norm": 1.535942883793826, + "learning_rate": 9.945239861225644e-06, + "loss": 0.1205, + "step": 53 + }, + { + "epoch": 0.09608540925266904, + "grad_norm": 1.4327747010188872, + "learning_rate": 9.943157907471825e-06, + "loss": 0.1143, + "step": 54 + }, + { + "epoch": 0.09786476868327403, + "grad_norm": 1.2169464338940064, + "learning_rate": 9.941037337358918e-06, + "loss": 0.0863, + "step": 55 + }, + { + "epoch": 0.099644128113879, + "grad_norm": 1.6994774690848435, + "learning_rate": 9.938878167452991e-06, + "loss": 0.1319, + "step": 56 + }, + { + "epoch": 0.10142348754448399, + "grad_norm": 1.851572606439269, + "learning_rate": 9.936680414621663e-06, + "loss": 0.1201, + "step": 57 + }, + { + "epoch": 0.10320284697508897, + "grad_norm": 1.3210638554640264, + "learning_rate": 9.934444096033958e-06, + "loss": 0.0966, + "step": 58 + }, + { + "epoch": 0.10498220640569395, + "grad_norm": 1.3982161782311981, + "learning_rate": 9.932169229160183e-06, + "loss": 0.1185, + "step": 59 + }, + { + "epoch": 0.10676156583629894, + "grad_norm": 1.6551177125917558, + "learning_rate": 9.929855831771787e-06, + "loss": 0.1222, + "step": 60 + }, + { + "epoch": 0.10854092526690391, + "grad_norm": 1.5820583104836863, + "learning_rate": 9.927503921941218e-06, + "loss": 0.1175, + "step": 61 + }, + { + "epoch": 0.1103202846975089, + "grad_norm": 1.6447037739593402, + "learning_rate": 9.925113518041796e-06, + "loss": 0.1457, + "step": 62 + }, + { + "epoch": 0.11209964412811388, + "grad_norm": 1.8341557719751869, + "learning_rate": 9.922684638747551e-06, + "loss": 0.1761, + "step": 63 + }, + { + "epoch": 0.11387900355871886, + "grad_norm": 1.4609850441682664, + "learning_rate": 9.920217303033091e-06, + "loss": 0.1239, + "step": 64 + }, + { + "epoch": 0.11565836298932385, + "grad_norm": 1.7373126826984115, + "learning_rate": 9.917711530173444e-06, + "loss": 0.1248, + "step": 65 + }, + { + "epoch": 0.11743772241992882, + "grad_norm": 1.8252103368149044, + "learning_rate": 9.91516733974392e-06, + "loss": 0.129, + "step": 66 + }, + { + "epoch": 0.11921708185053381, + "grad_norm": 1.466618243317653, + "learning_rate": 9.912584751619943e-06, + "loss": 0.134, + "step": 67 + }, + { + "epoch": 0.12099644128113879, + "grad_norm": 1.3574022691057386, + "learning_rate": 9.909963785976902e-06, + "loss": 0.114, + "step": 68 + }, + { + "epoch": 0.12277580071174377, + "grad_norm": 1.3820161583631567, + "learning_rate": 9.907304463290004e-06, + "loss": 0.1136, + "step": 69 + }, + { + "epoch": 
0.12455516014234876, + "grad_norm": 1.2947853990115923, + "learning_rate": 9.904606804334094e-06, + "loss": 0.1003, + "step": 70 + }, + { + "epoch": 0.12633451957295375, + "grad_norm": 1.4777665246391947, + "learning_rate": 9.901870830183506e-06, + "loss": 0.1301, + "step": 71 + }, + { + "epoch": 0.12811387900355872, + "grad_norm": 1.4865395343985397, + "learning_rate": 9.899096562211902e-06, + "loss": 0.128, + "step": 72 + }, + { + "epoch": 0.1298932384341637, + "grad_norm": 1.70322923018481, + "learning_rate": 9.896284022092088e-06, + "loss": 0.1537, + "step": 73 + }, + { + "epoch": 0.13167259786476868, + "grad_norm": 1.986776256607827, + "learning_rate": 9.893433231795864e-06, + "loss": 0.1749, + "step": 74 + }, + { + "epoch": 0.13345195729537365, + "grad_norm": 1.9833725532011965, + "learning_rate": 9.890544213593838e-06, + "loss": 0.1536, + "step": 75 + }, + { + "epoch": 0.13523131672597866, + "grad_norm": 1.6162836428207408, + "learning_rate": 9.887616990055262e-06, + "loss": 0.1361, + "step": 76 + }, + { + "epoch": 0.13701067615658363, + "grad_norm": 1.8798472208522492, + "learning_rate": 9.884651584047845e-06, + "loss": 0.1427, + "step": 77 + }, + { + "epoch": 0.1387900355871886, + "grad_norm": 1.6316750517814012, + "learning_rate": 9.881648018737587e-06, + "loss": 0.148, + "step": 78 + }, + { + "epoch": 0.14056939501779359, + "grad_norm": 1.4973899136660551, + "learning_rate": 9.878606317588588e-06, + "loss": 0.113, + "step": 79 + }, + { + "epoch": 0.1423487544483986, + "grad_norm": 2.085780240364436, + "learning_rate": 9.875526504362868e-06, + "loss": 0.1764, + "step": 80 + }, + { + "epoch": 0.14412811387900357, + "grad_norm": 1.7904131094985867, + "learning_rate": 9.872408603120187e-06, + "loss": 0.1559, + "step": 81 + }, + { + "epoch": 0.14590747330960854, + "grad_norm": 1.4759874941535067, + "learning_rate": 9.869252638217846e-06, + "loss": 0.115, + "step": 82 + }, + { + "epoch": 0.14768683274021352, + "grad_norm": 1.4156029940928385, + "learning_rate": 9.866058634310503e-06, + "loss": 0.1303, + "step": 83 + }, + { + "epoch": 0.1494661921708185, + "grad_norm": 3.022669330694427, + "learning_rate": 9.862826616349981e-06, + "loss": 0.1232, + "step": 84 + }, + { + "epoch": 0.1512455516014235, + "grad_norm": 1.6187399937545661, + "learning_rate": 9.859556609585075e-06, + "loss": 0.1416, + "step": 85 + }, + { + "epoch": 0.15302491103202848, + "grad_norm": 1.3579268088366394, + "learning_rate": 9.856248639561346e-06, + "loss": 0.1058, + "step": 86 + }, + { + "epoch": 0.15480427046263345, + "grad_norm": 1.656725551724757, + "learning_rate": 9.85290273212093e-06, + "loss": 0.1262, + "step": 87 + }, + { + "epoch": 0.15658362989323843, + "grad_norm": 1.649686322768587, + "learning_rate": 9.849518913402334e-06, + "loss": 0.1248, + "step": 88 + }, + { + "epoch": 0.1583629893238434, + "grad_norm": 1.4682326240875552, + "learning_rate": 9.84609720984023e-06, + "loss": 0.1228, + "step": 89 + }, + { + "epoch": 0.1601423487544484, + "grad_norm": 1.5211842011834333, + "learning_rate": 9.84263764816525e-06, + "loss": 0.1165, + "step": 90 + }, + { + "epoch": 0.1619217081850534, + "grad_norm": 1.374933263593323, + "learning_rate": 9.839140255403776e-06, + "loss": 0.1069, + "step": 91 + }, + { + "epoch": 0.16370106761565836, + "grad_norm": 1.4112889073693102, + "learning_rate": 9.83560505887773e-06, + "loss": 0.1174, + "step": 92 + }, + { + "epoch": 0.16548042704626334, + "grad_norm": 1.5338525971406016, + "learning_rate": 9.83203208620436e-06, + "loss": 0.1246, + "step": 93 + }, + { + "epoch": 
0.16725978647686832, + "grad_norm": 1.7319956110286392, + "learning_rate": 9.828421365296023e-06, + "loss": 0.1309, + "step": 94 + }, + { + "epoch": 0.16903914590747332, + "grad_norm": 1.6459346421565015, + "learning_rate": 9.824772924359974e-06, + "loss": 0.1303, + "step": 95 + }, + { + "epoch": 0.1708185053380783, + "grad_norm": 1.461599854566481, + "learning_rate": 9.821086791898133e-06, + "loss": 0.1146, + "step": 96 + }, + { + "epoch": 0.17259786476868327, + "grad_norm": 1.2898506418812963, + "learning_rate": 9.817362996706872e-06, + "loss": 0.1268, + "step": 97 + }, + { + "epoch": 0.17437722419928825, + "grad_norm": 1.2574991907771362, + "learning_rate": 9.81360156787679e-06, + "loss": 0.0982, + "step": 98 + }, + { + "epoch": 0.17615658362989323, + "grad_norm": 1.2818741796501145, + "learning_rate": 9.809802534792477e-06, + "loss": 0.1101, + "step": 99 + }, + { + "epoch": 0.17793594306049823, + "grad_norm": 1.5340131341968406, + "learning_rate": 9.805965927132294e-06, + "loss": 0.1417, + "step": 100 + }, + { + "epoch": 0.1797153024911032, + "grad_norm": 1.3301864585315522, + "learning_rate": 9.802091774868143e-06, + "loss": 0.1186, + "step": 101 + }, + { + "epoch": 0.18149466192170818, + "grad_norm": 1.2920359977860623, + "learning_rate": 9.798180108265218e-06, + "loss": 0.1269, + "step": 102 + }, + { + "epoch": 0.18327402135231316, + "grad_norm": 1.1755565875809384, + "learning_rate": 9.794230957881785e-06, + "loss": 0.0911, + "step": 103 + }, + { + "epoch": 0.18505338078291814, + "grad_norm": 1.6520857630595822, + "learning_rate": 9.79024435456893e-06, + "loss": 0.1298, + "step": 104 + }, + { + "epoch": 0.18683274021352314, + "grad_norm": 1.5760599162246887, + "learning_rate": 9.786220329470334e-06, + "loss": 0.1267, + "step": 105 + }, + { + "epoch": 0.18861209964412812, + "grad_norm": 1.3518661794431308, + "learning_rate": 9.782158914022011e-06, + "loss": 0.1171, + "step": 106 + }, + { + "epoch": 0.1903914590747331, + "grad_norm": 1.5440563375880543, + "learning_rate": 9.778060139952075e-06, + "loss": 0.1394, + "step": 107 + }, + { + "epoch": 0.19217081850533807, + "grad_norm": 1.4630462324701143, + "learning_rate": 9.773924039280488e-06, + "loss": 0.1268, + "step": 108 + }, + { + "epoch": 0.19395017793594305, + "grad_norm": 1.0316882466769741, + "learning_rate": 9.769750644318814e-06, + "loss": 0.0804, + "step": 109 + }, + { + "epoch": 0.19572953736654805, + "grad_norm": 1.6630328401543009, + "learning_rate": 9.765539987669956e-06, + "loss": 0.1238, + "step": 110 + }, + { + "epoch": 0.19750889679715303, + "grad_norm": 1.9423875980674232, + "learning_rate": 9.761292102227917e-06, + "loss": 0.1492, + "step": 111 + }, + { + "epoch": 0.199288256227758, + "grad_norm": 1.8260444289563744, + "learning_rate": 9.757007021177529e-06, + "loss": 0.162, + "step": 112 + }, + { + "epoch": 0.20106761565836298, + "grad_norm": 1.2266156380036612, + "learning_rate": 9.752684777994197e-06, + "loss": 0.1074, + "step": 113 + }, + { + "epoch": 0.20284697508896798, + "grad_norm": 1.7197909934835105, + "learning_rate": 9.748325406443647e-06, + "loss": 0.1435, + "step": 114 + }, + { + "epoch": 0.20462633451957296, + "grad_norm": 1.5438074853803372, + "learning_rate": 9.743928940581646e-06, + "loss": 0.1354, + "step": 115 + }, + { + "epoch": 0.20640569395017794, + "grad_norm": 1.787303992084634, + "learning_rate": 9.739495414753754e-06, + "loss": 0.1702, + "step": 116 + }, + { + "epoch": 0.20818505338078291, + "grad_norm": 1.3361322549878205, + "learning_rate": 9.73502486359504e-06, + "loss": 0.1204, + 
"step": 117 + }, + { + "epoch": 0.2099644128113879, + "grad_norm": 1.8634116877054288, + "learning_rate": 9.73051732202982e-06, + "loss": 0.1503, + "step": 118 + }, + { + "epoch": 0.2117437722419929, + "grad_norm": 1.3209685505485607, + "learning_rate": 9.725972825271381e-06, + "loss": 0.1243, + "step": 119 + }, + { + "epoch": 0.21352313167259787, + "grad_norm": 1.3706471704388294, + "learning_rate": 9.721391408821713e-06, + "loss": 0.1188, + "step": 120 + }, + { + "epoch": 0.21530249110320285, + "grad_norm": 1.4324014907533043, + "learning_rate": 9.716773108471213e-06, + "loss": 0.1407, + "step": 121 + }, + { + "epoch": 0.21708185053380782, + "grad_norm": 1.2654918357754823, + "learning_rate": 9.712117960298433e-06, + "loss": 0.1244, + "step": 122 + }, + { + "epoch": 0.2188612099644128, + "grad_norm": 1.442113627072885, + "learning_rate": 9.707426000669773e-06, + "loss": 0.1237, + "step": 123 + }, + { + "epoch": 0.2206405693950178, + "grad_norm": 1.4969482774761218, + "learning_rate": 9.702697266239211e-06, + "loss": 0.1321, + "step": 124 + }, + { + "epoch": 0.22241992882562278, + "grad_norm": 1.700958655143985, + "learning_rate": 9.697931793948012e-06, + "loss": 0.1601, + "step": 125 + }, + { + "epoch": 0.22419928825622776, + "grad_norm": 1.2124183722997646, + "learning_rate": 9.693129621024441e-06, + "loss": 0.1201, + "step": 126 + }, + { + "epoch": 0.22597864768683273, + "grad_norm": 1.373929114776768, + "learning_rate": 9.68829078498347e-06, + "loss": 0.126, + "step": 127 + }, + { + "epoch": 0.2277580071174377, + "grad_norm": 1.6101541653048022, + "learning_rate": 9.683415323626487e-06, + "loss": 0.1356, + "step": 128 + }, + { + "epoch": 0.22953736654804271, + "grad_norm": 1.6548250356852086, + "learning_rate": 9.678503275040997e-06, + "loss": 0.1363, + "step": 129 + }, + { + "epoch": 0.2313167259786477, + "grad_norm": 1.3341015541239039, + "learning_rate": 9.673554677600336e-06, + "loss": 0.1264, + "step": 130 + }, + { + "epoch": 0.23309608540925267, + "grad_norm": 1.4498445900696748, + "learning_rate": 9.668569569963355e-06, + "loss": 0.129, + "step": 131 + }, + { + "epoch": 0.23487544483985764, + "grad_norm": 1.0216967537453667, + "learning_rate": 9.663547991074129e-06, + "loss": 0.0887, + "step": 132 + }, + { + "epoch": 0.23665480427046262, + "grad_norm": 1.5556609913797053, + "learning_rate": 9.658489980161643e-06, + "loss": 0.1288, + "step": 133 + }, + { + "epoch": 0.23843416370106763, + "grad_norm": 1.4639954349834507, + "learning_rate": 9.653395576739504e-06, + "loss": 0.1348, + "step": 134 + }, + { + "epoch": 0.2402135231316726, + "grad_norm": 1.560363889533299, + "learning_rate": 9.648264820605611e-06, + "loss": 0.126, + "step": 135 + }, + { + "epoch": 0.24199288256227758, + "grad_norm": 1.4265445896347981, + "learning_rate": 9.643097751841854e-06, + "loss": 0.1614, + "step": 136 + }, + { + "epoch": 0.24377224199288255, + "grad_norm": 1.3866143965138966, + "learning_rate": 9.637894410813803e-06, + "loss": 0.1271, + "step": 137 + }, + { + "epoch": 0.24555160142348753, + "grad_norm": 1.7617394024609352, + "learning_rate": 9.632654838170393e-06, + "loss": 0.1389, + "step": 138 + }, + { + "epoch": 0.24733096085409254, + "grad_norm": 1.800709153860104, + "learning_rate": 9.627379074843595e-06, + "loss": 0.1463, + "step": 139 + }, + { + "epoch": 0.2491103202846975, + "grad_norm": 1.5087034875671088, + "learning_rate": 9.622067162048111e-06, + "loss": 0.1355, + "step": 140 + }, + { + "epoch": 0.2508896797153025, + "grad_norm": 1.3010872178165098, + "learning_rate": 
9.616719141281044e-06, + "loss": 0.1247, + "step": 141 + }, + { + "epoch": 0.2526690391459075, + "grad_norm": 1.693733824105587, + "learning_rate": 9.611335054321576e-06, + "loss": 0.1512, + "step": 142 + }, + { + "epoch": 0.25444839857651247, + "grad_norm": 1.7009155877264104, + "learning_rate": 9.605914943230637e-06, + "loss": 0.139, + "step": 143 + }, + { + "epoch": 0.25622775800711745, + "grad_norm": 1.2598989725965244, + "learning_rate": 9.600458850350588e-06, + "loss": 0.1015, + "step": 144 + }, + { + "epoch": 0.2580071174377224, + "grad_norm": 1.702225655277758, + "learning_rate": 9.594966818304875e-06, + "loss": 0.148, + "step": 145 + }, + { + "epoch": 0.2597864768683274, + "grad_norm": 1.5513490978514006, + "learning_rate": 9.589438889997712e-06, + "loss": 0.1128, + "step": 146 + }, + { + "epoch": 0.2615658362989324, + "grad_norm": 1.2744707776066644, + "learning_rate": 9.583875108613727e-06, + "loss": 0.1215, + "step": 147 + }, + { + "epoch": 0.26334519572953735, + "grad_norm": 1.3135762575973189, + "learning_rate": 9.578275517617646e-06, + "loss": 0.1236, + "step": 148 + }, + { + "epoch": 0.26512455516014233, + "grad_norm": 1.4288843773619255, + "learning_rate": 9.572640160753936e-06, + "loss": 0.125, + "step": 149 + }, + { + "epoch": 0.2669039145907473, + "grad_norm": 1.3282205635766462, + "learning_rate": 9.566969082046471e-06, + "loss": 0.1341, + "step": 150 + }, + { + "epoch": 0.26868327402135234, + "grad_norm": 1.1244630347090068, + "learning_rate": 9.561262325798188e-06, + "loss": 0.0983, + "step": 151 + }, + { + "epoch": 0.2704626334519573, + "grad_norm": 1.2707269972012794, + "learning_rate": 9.555519936590739e-06, + "loss": 0.1042, + "step": 152 + }, + { + "epoch": 0.2722419928825623, + "grad_norm": 1.186406870071293, + "learning_rate": 9.549741959284147e-06, + "loss": 0.107, + "step": 153 + }, + { + "epoch": 0.27402135231316727, + "grad_norm": 1.355372382394004, + "learning_rate": 9.543928439016445e-06, + "loss": 0.1293, + "step": 154 + }, + { + "epoch": 0.27580071174377224, + "grad_norm": 1.2854825846015958, + "learning_rate": 9.538079421203339e-06, + "loss": 0.1169, + "step": 155 + }, + { + "epoch": 0.2775800711743772, + "grad_norm": 1.342421664104859, + "learning_rate": 9.532194951537838e-06, + "loss": 0.119, + "step": 156 + }, + { + "epoch": 0.2793594306049822, + "grad_norm": 1.4275853921403812, + "learning_rate": 9.52627507598991e-06, + "loss": 0.1369, + "step": 157 + }, + { + "epoch": 0.28113879003558717, + "grad_norm": 1.1603688300775958, + "learning_rate": 9.52031984080611e-06, + "loss": 0.105, + "step": 158 + }, + { + "epoch": 0.28291814946619215, + "grad_norm": 1.2073570045612327, + "learning_rate": 9.514329292509227e-06, + "loss": 0.1002, + "step": 159 + }, + { + "epoch": 0.2846975088967972, + "grad_norm": 1.3272114836931819, + "learning_rate": 9.508303477897925e-06, + "loss": 0.1128, + "step": 160 + }, + { + "epoch": 0.28647686832740216, + "grad_norm": 1.5666821304561709, + "learning_rate": 9.502242444046365e-06, + "loss": 0.1309, + "step": 161 + }, + { + "epoch": 0.28825622775800713, + "grad_norm": 1.4794896082398679, + "learning_rate": 9.496146238303846e-06, + "loss": 0.1416, + "step": 162 + }, + { + "epoch": 0.2900355871886121, + "grad_norm": 1.5725933662263583, + "learning_rate": 9.49001490829443e-06, + "loss": 0.134, + "step": 163 + }, + { + "epoch": 0.2918149466192171, + "grad_norm": 1.7978657636477746, + "learning_rate": 9.483848501916578e-06, + "loss": 0.1656, + "step": 164 + }, + { + "epoch": 0.29359430604982206, + "grad_norm": 1.5385861975715274, 
+ "learning_rate": 9.477647067342766e-06, + "loss": 0.1445, + "step": 165 + }, + { + "epoch": 0.29537366548042704, + "grad_norm": 1.522698741848138, + "learning_rate": 9.471410653019115e-06, + "loss": 0.1312, + "step": 166 + }, + { + "epoch": 0.297153024911032, + "grad_norm": 1.410495428928871, + "learning_rate": 9.46513930766501e-06, + "loss": 0.134, + "step": 167 + }, + { + "epoch": 0.298932384341637, + "grad_norm": 1.4040568466318382, + "learning_rate": 9.458833080272723e-06, + "loss": 0.1155, + "step": 168 + }, + { + "epoch": 0.30071174377224197, + "grad_norm": 1.4830902237059056, + "learning_rate": 9.45249202010702e-06, + "loss": 0.1206, + "step": 169 + }, + { + "epoch": 0.302491103202847, + "grad_norm": 1.4974950883468803, + "learning_rate": 9.446116176704791e-06, + "loss": 0.1315, + "step": 170 + }, + { + "epoch": 0.304270462633452, + "grad_norm": 1.4862268443324185, + "learning_rate": 9.439705599874653e-06, + "loss": 0.1257, + "step": 171 + }, + { + "epoch": 0.30604982206405695, + "grad_norm": 1.4722719069791255, + "learning_rate": 9.433260339696564e-06, + "loss": 0.1502, + "step": 172 + }, + { + "epoch": 0.30782918149466193, + "grad_norm": 1.5787824797877084, + "learning_rate": 9.426780446521429e-06, + "loss": 0.1414, + "step": 173 + }, + { + "epoch": 0.3096085409252669, + "grad_norm": 1.49817799896776, + "learning_rate": 9.42026597097071e-06, + "loss": 0.1369, + "step": 174 + }, + { + "epoch": 0.3113879003558719, + "grad_norm": 1.270997812099595, + "learning_rate": 9.413716963936033e-06, + "loss": 0.1123, + "step": 175 + }, + { + "epoch": 0.31316725978647686, + "grad_norm": 1.3577743246112806, + "learning_rate": 9.407133476578778e-06, + "loss": 0.1268, + "step": 176 + }, + { + "epoch": 0.31494661921708184, + "grad_norm": 1.6811814689000206, + "learning_rate": 9.400515560329698e-06, + "loss": 0.1591, + "step": 177 + }, + { + "epoch": 0.3167259786476868, + "grad_norm": 1.62265306436807, + "learning_rate": 9.393863266888501e-06, + "loss": 0.1249, + "step": 178 + }, + { + "epoch": 0.3185053380782918, + "grad_norm": 1.6559844780644963, + "learning_rate": 9.387176648223457e-06, + "loss": 0.1351, + "step": 179 + }, + { + "epoch": 0.3202846975088968, + "grad_norm": 1.5052377798410623, + "learning_rate": 9.38045575657098e-06, + "loss": 0.1286, + "step": 180 + }, + { + "epoch": 0.3220640569395018, + "grad_norm": 1.6097855117146187, + "learning_rate": 9.37370064443524e-06, + "loss": 0.1309, + "step": 181 + }, + { + "epoch": 0.3238434163701068, + "grad_norm": 1.1440848783853197, + "learning_rate": 9.366911364587726e-06, + "loss": 0.1013, + "step": 182 + }, + { + "epoch": 0.32562277580071175, + "grad_norm": 1.3468165746555294, + "learning_rate": 9.360087970066854e-06, + "loss": 0.1135, + "step": 183 + }, + { + "epoch": 0.3274021352313167, + "grad_norm": 1.2737294113762112, + "learning_rate": 9.353230514177553e-06, + "loss": 0.1071, + "step": 184 + }, + { + "epoch": 0.3291814946619217, + "grad_norm": 1.2633648785188174, + "learning_rate": 9.346339050490832e-06, + "loss": 0.1124, + "step": 185 + }, + { + "epoch": 0.3309608540925267, + "grad_norm": 1.6848943476461669, + "learning_rate": 9.33941363284338e-06, + "loss": 0.1412, + "step": 186 + }, + { + "epoch": 0.33274021352313166, + "grad_norm": 1.4478447540009096, + "learning_rate": 9.332454315337129e-06, + "loss": 0.1237, + "step": 187 + }, + { + "epoch": 0.33451957295373663, + "grad_norm": 1.5163911183865801, + "learning_rate": 9.325461152338846e-06, + "loss": 0.119, + "step": 188 + }, + { + "epoch": 0.33629893238434166, + "grad_norm": 
1.4243691207294058, + "learning_rate": 9.3184341984797e-06, + "loss": 0.1313, + "step": 189 + }, + { + "epoch": 0.33807829181494664, + "grad_norm": 1.5754316661446959, + "learning_rate": 9.311373508654838e-06, + "loss": 0.143, + "step": 190 + }, + { + "epoch": 0.3398576512455516, + "grad_norm": 1.7595561412101806, + "learning_rate": 9.30427913802295e-06, + "loss": 0.1664, + "step": 191 + }, + { + "epoch": 0.3416370106761566, + "grad_norm": 1.5305055081563157, + "learning_rate": 9.297151142005852e-06, + "loss": 0.1441, + "step": 192 + }, + { + "epoch": 0.34341637010676157, + "grad_norm": 1.3894474006237214, + "learning_rate": 9.289989576288035e-06, + "loss": 0.1277, + "step": 193 + }, + { + "epoch": 0.34519572953736655, + "grad_norm": 1.6318584188727645, + "learning_rate": 9.282794496816244e-06, + "loss": 0.1396, + "step": 194 + }, + { + "epoch": 0.3469750889679715, + "grad_norm": 1.5744858909539523, + "learning_rate": 9.27556595979904e-06, + "loss": 0.1304, + "step": 195 + }, + { + "epoch": 0.3487544483985765, + "grad_norm": 1.445329684282647, + "learning_rate": 9.26830402170635e-06, + "loss": 0.1362, + "step": 196 + }, + { + "epoch": 0.3505338078291815, + "grad_norm": 1.270020121730852, + "learning_rate": 9.261008739269035e-06, + "loss": 0.1065, + "step": 197 + }, + { + "epoch": 0.35231316725978645, + "grad_norm": 1.4766701681456913, + "learning_rate": 9.253680169478448e-06, + "loss": 0.1328, + "step": 198 + }, + { + "epoch": 0.3540925266903915, + "grad_norm": 1.3450843767451717, + "learning_rate": 9.246318369585983e-06, + "loss": 0.12, + "step": 199 + }, + { + "epoch": 0.35587188612099646, + "grad_norm": 1.7259434960653937, + "learning_rate": 9.238923397102629e-06, + "loss": 0.1574, + "step": 200 + }, + { + "epoch": 0.35587188612099646, + "eval_loss": 0.14350858330726624, + "eval_runtime": 7.1569, + "eval_samples_per_second": 6.427, + "eval_steps_per_second": 1.677, + "step": 200 + }, + { + "epoch": 0.35765124555160144, + "grad_norm": 1.320682336442158, + "learning_rate": 9.231495309798525e-06, + "loss": 0.1144, + "step": 201 + }, + { + "epoch": 0.3594306049822064, + "grad_norm": 1.6706844104451077, + "learning_rate": 9.224034165702506e-06, + "loss": 0.1476, + "step": 202 + }, + { + "epoch": 0.3612099644128114, + "grad_norm": 1.2143790241578905, + "learning_rate": 9.216540023101646e-06, + "loss": 0.1125, + "step": 203 + }, + { + "epoch": 0.36298932384341637, + "grad_norm": 1.2796879356044104, + "learning_rate": 9.209012940540806e-06, + "loss": 0.1147, + "step": 204 + }, + { + "epoch": 0.36476868327402134, + "grad_norm": 1.1876329454738948, + "learning_rate": 9.20145297682218e-06, + "loss": 0.1237, + "step": 205 + }, + { + "epoch": 0.3665480427046263, + "grad_norm": 1.8113179178303425, + "learning_rate": 9.193860191004833e-06, + "loss": 0.1627, + "step": 206 + }, + { + "epoch": 0.3683274021352313, + "grad_norm": 1.3757046323563018, + "learning_rate": 9.186234642404234e-06, + "loss": 0.1425, + "step": 207 + }, + { + "epoch": 0.3701067615658363, + "grad_norm": 1.5977600885821075, + "learning_rate": 9.178576390591803e-06, + "loss": 0.143, + "step": 208 + }, + { + "epoch": 0.3718861209964413, + "grad_norm": 1.238765918591716, + "learning_rate": 9.170885495394435e-06, + "loss": 0.1114, + "step": 209 + }, + { + "epoch": 0.3736654804270463, + "grad_norm": 1.609455822309686, + "learning_rate": 9.16316201689404e-06, + "loss": 0.1301, + "step": 210 + }, + { + "epoch": 0.37544483985765126, + "grad_norm": 1.6349720466810955, + "learning_rate": 9.155406015427076e-06, + "loss": 0.1472, + "step": 211 + 
}, + { + "epoch": 0.37722419928825623, + "grad_norm": 1.5474843867777042, + "learning_rate": 9.147617551584066e-06, + "loss": 0.1233, + "step": 212 + }, + { + "epoch": 0.3790035587188612, + "grad_norm": 1.3427581892710871, + "learning_rate": 9.139796686209135e-06, + "loss": 0.1452, + "step": 213 + }, + { + "epoch": 0.3807829181494662, + "grad_norm": 1.2341773880550775, + "learning_rate": 9.131943480399531e-06, + "loss": 0.1161, + "step": 214 + }, + { + "epoch": 0.38256227758007116, + "grad_norm": 1.2218651328103987, + "learning_rate": 9.124057995505148e-06, + "loss": 0.1171, + "step": 215 + }, + { + "epoch": 0.38434163701067614, + "grad_norm": 2.0195390399472375, + "learning_rate": 9.11614029312805e-06, + "loss": 0.1916, + "step": 216 + }, + { + "epoch": 0.3861209964412811, + "grad_norm": 1.6276262849392316, + "learning_rate": 9.108190435121982e-06, + "loss": 0.1412, + "step": 217 + }, + { + "epoch": 0.3879003558718861, + "grad_norm": 1.5993031713110006, + "learning_rate": 9.100208483591892e-06, + "loss": 0.1371, + "step": 218 + }, + { + "epoch": 0.3896797153024911, + "grad_norm": 1.9861450557949942, + "learning_rate": 9.092194500893448e-06, + "loss": 0.2089, + "step": 219 + }, + { + "epoch": 0.3914590747330961, + "grad_norm": 1.1535216112521212, + "learning_rate": 9.084148549632547e-06, + "loss": 0.1138, + "step": 220 + }, + { + "epoch": 0.3932384341637011, + "grad_norm": 1.3959913981489842, + "learning_rate": 9.076070692664827e-06, + "loss": 0.1338, + "step": 221 + }, + { + "epoch": 0.39501779359430605, + "grad_norm": 1.4354143564652753, + "learning_rate": 9.067960993095176e-06, + "loss": 0.1239, + "step": 222 + }, + { + "epoch": 0.39679715302491103, + "grad_norm": 1.5853609477241963, + "learning_rate": 9.059819514277238e-06, + "loss": 0.1387, + "step": 223 + }, + { + "epoch": 0.398576512455516, + "grad_norm": 1.3681011687168392, + "learning_rate": 9.05164631981292e-06, + "loss": 0.1216, + "step": 224 + }, + { + "epoch": 0.400355871886121, + "grad_norm": 1.5374602485433884, + "learning_rate": 9.043441473551893e-06, + "loss": 0.1411, + "step": 225 + }, + { + "epoch": 0.40213523131672596, + "grad_norm": 1.4186030471717734, + "learning_rate": 9.035205039591099e-06, + "loss": 0.122, + "step": 226 + }, + { + "epoch": 0.40391459074733094, + "grad_norm": 1.6009811486025363, + "learning_rate": 9.02693708227424e-06, + "loss": 0.1353, + "step": 227 + }, + { + "epoch": 0.40569395017793597, + "grad_norm": 1.2788491645037854, + "learning_rate": 9.018637666191284e-06, + "loss": 0.1385, + "step": 228 + }, + { + "epoch": 0.40747330960854095, + "grad_norm": 1.5078579713611018, + "learning_rate": 9.010306856177958e-06, + "loss": 0.1513, + "step": 229 + }, + { + "epoch": 0.4092526690391459, + "grad_norm": 1.4697726498475028, + "learning_rate": 9.001944717315236e-06, + "loss": 0.1608, + "step": 230 + }, + { + "epoch": 0.4110320284697509, + "grad_norm": 1.5137805508042916, + "learning_rate": 8.993551314928846e-06, + "loss": 0.1453, + "step": 231 + }, + { + "epoch": 0.4128113879003559, + "grad_norm": 1.2388919480644867, + "learning_rate": 8.985126714588739e-06, + "loss": 0.1065, + "step": 232 + }, + { + "epoch": 0.41459074733096085, + "grad_norm": 1.306249760881592, + "learning_rate": 8.976670982108591e-06, + "loss": 0.1346, + "step": 233 + }, + { + "epoch": 0.41637010676156583, + "grad_norm": 1.5891863090400924, + "learning_rate": 8.968184183545285e-06, + "loss": 0.1597, + "step": 234 + }, + { + "epoch": 0.4181494661921708, + "grad_norm": 1.3973739341602045, + "learning_rate": 8.959666385198396e-06, + 
"loss": 0.1419, + "step": 235 + }, + { + "epoch": 0.4199288256227758, + "grad_norm": 1.2536983328151956, + "learning_rate": 8.951117653609666e-06, + "loss": 0.1144, + "step": 236 + }, + { + "epoch": 0.42170818505338076, + "grad_norm": 1.4199186973139797, + "learning_rate": 8.9425380555625e-06, + "loss": 0.1259, + "step": 237 + }, + { + "epoch": 0.4234875444839858, + "grad_norm": 0.9220821629683453, + "learning_rate": 8.933927658081423e-06, + "loss": 0.0888, + "step": 238 + }, + { + "epoch": 0.42526690391459077, + "grad_norm": 1.2827691741083203, + "learning_rate": 8.925286528431578e-06, + "loss": 0.1269, + "step": 239 + }, + { + "epoch": 0.42704626334519574, + "grad_norm": 1.3177385866979556, + "learning_rate": 8.916614734118184e-06, + "loss": 0.1095, + "step": 240 + }, + { + "epoch": 0.4288256227758007, + "grad_norm": 1.2056954035414371, + "learning_rate": 8.907912342886016e-06, + "loss": 0.1084, + "step": 241 + }, + { + "epoch": 0.4306049822064057, + "grad_norm": 1.3815430073888333, + "learning_rate": 8.899179422718877e-06, + "loss": 0.1219, + "step": 242 + }, + { + "epoch": 0.43238434163701067, + "grad_norm": 1.5443193700355489, + "learning_rate": 8.890416041839061e-06, + "loss": 0.1426, + "step": 243 + }, + { + "epoch": 0.43416370106761565, + "grad_norm": 1.1746397644812498, + "learning_rate": 8.881622268706825e-06, + "loss": 0.1065, + "step": 244 + }, + { + "epoch": 0.4359430604982206, + "grad_norm": 1.273586888709326, + "learning_rate": 8.872798172019856e-06, + "loss": 0.1096, + "step": 245 + }, + { + "epoch": 0.4377224199288256, + "grad_norm": 1.5330627946798503, + "learning_rate": 8.863943820712726e-06, + "loss": 0.148, + "step": 246 + }, + { + "epoch": 0.4395017793594306, + "grad_norm": 1.4473674740109983, + "learning_rate": 8.855059283956363e-06, + "loss": 0.1614, + "step": 247 + }, + { + "epoch": 0.4412811387900356, + "grad_norm": 1.3966876553974212, + "learning_rate": 8.8461446311575e-06, + "loss": 0.1188, + "step": 248 + }, + { + "epoch": 0.4430604982206406, + "grad_norm": 1.2451938008491792, + "learning_rate": 8.837199931958147e-06, + "loss": 0.1107, + "step": 249 + }, + { + "epoch": 0.44483985765124556, + "grad_norm": 1.4988031830504223, + "learning_rate": 8.828225256235035e-06, + "loss": 0.1432, + "step": 250 + }, + { + "epoch": 0.44661921708185054, + "grad_norm": 1.5912629518024164, + "learning_rate": 8.819220674099074e-06, + "loss": 0.1464, + "step": 251 + }, + { + "epoch": 0.4483985765124555, + "grad_norm": 1.2719995730790428, + "learning_rate": 8.810186255894804e-06, + "loss": 0.1157, + "step": 252 + }, + { + "epoch": 0.4501779359430605, + "grad_norm": 1.4898719921228123, + "learning_rate": 8.801122072199848e-06, + "loss": 0.1274, + "step": 253 + }, + { + "epoch": 0.45195729537366547, + "grad_norm": 1.142669526813139, + "learning_rate": 8.792028193824364e-06, + "loss": 0.0973, + "step": 254 + }, + { + "epoch": 0.45373665480427045, + "grad_norm": 1.1597871710395087, + "learning_rate": 8.782904691810478e-06, + "loss": 0.1037, + "step": 255 + }, + { + "epoch": 0.4555160142348754, + "grad_norm": 1.3046169397046161, + "learning_rate": 8.77375163743175e-06, + "loss": 0.12, + "step": 256 + }, + { + "epoch": 0.45729537366548045, + "grad_norm": 1.2590302458061433, + "learning_rate": 8.764569102192593e-06, + "loss": 0.1259, + "step": 257 + }, + { + "epoch": 0.45907473309608543, + "grad_norm": 1.0868758219614008, + "learning_rate": 8.755357157827735e-06, + "loss": 0.0808, + "step": 258 + }, + { + "epoch": 0.4608540925266904, + "grad_norm": 1.5140149116778991, + "learning_rate": 
8.746115876301651e-06, + "loss": 0.1428, + "step": 259 + }, + { + "epoch": 0.4626334519572954, + "grad_norm": 1.2551774116868955, + "learning_rate": 8.736845329807994e-06, + "loss": 0.1126, + "step": 260 + }, + { + "epoch": 0.46441281138790036, + "grad_norm": 1.3740429683605675, + "learning_rate": 8.727545590769044e-06, + "loss": 0.1298, + "step": 261 + }, + { + "epoch": 0.46619217081850534, + "grad_norm": 1.2698932128049305, + "learning_rate": 8.718216731835131e-06, + "loss": 0.124, + "step": 262 + }, + { + "epoch": 0.4679715302491103, + "grad_norm": 1.4467109733868337, + "learning_rate": 8.708858825884075e-06, + "loss": 0.1551, + "step": 263 + }, + { + "epoch": 0.4697508896797153, + "grad_norm": 1.0706620357753582, + "learning_rate": 8.699471946020612e-06, + "loss": 0.1037, + "step": 264 + }, + { + "epoch": 0.47153024911032027, + "grad_norm": 1.3240848753489833, + "learning_rate": 8.690056165575825e-06, + "loss": 0.1201, + "step": 265 + }, + { + "epoch": 0.47330960854092524, + "grad_norm": 1.248506179235332, + "learning_rate": 8.680611558106571e-06, + "loss": 0.1187, + "step": 266 + }, + { + "epoch": 0.4750889679715303, + "grad_norm": 1.1915657853085637, + "learning_rate": 8.671138197394907e-06, + "loss": 0.1087, + "step": 267 + }, + { + "epoch": 0.47686832740213525, + "grad_norm": 1.0739255287858955, + "learning_rate": 8.661636157447511e-06, + "loss": 0.1076, + "step": 268 + }, + { + "epoch": 0.4786476868327402, + "grad_norm": 2.128310705291851, + "learning_rate": 8.652105512495106e-06, + "loss": 0.1559, + "step": 269 + }, + { + "epoch": 0.4804270462633452, + "grad_norm": 1.3275428163972811, + "learning_rate": 8.64254633699188e-06, + "loss": 0.1087, + "step": 270 + }, + { + "epoch": 0.4822064056939502, + "grad_norm": 1.4535231875783747, + "learning_rate": 8.632958705614905e-06, + "loss": 0.1384, + "step": 271 + }, + { + "epoch": 0.48398576512455516, + "grad_norm": 1.3903926810560971, + "learning_rate": 8.623342693263549e-06, + "loss": 0.124, + "step": 272 + }, + { + "epoch": 0.48576512455516013, + "grad_norm": 1.5886251019429265, + "learning_rate": 8.6136983750589e-06, + "loss": 0.1284, + "step": 273 + }, + { + "epoch": 0.4875444839857651, + "grad_norm": 1.3457498653312756, + "learning_rate": 8.604025826343167e-06, + "loss": 0.1277, + "step": 274 + }, + { + "epoch": 0.4893238434163701, + "grad_norm": 1.6347007230276474, + "learning_rate": 8.594325122679107e-06, + "loss": 0.1473, + "step": 275 + }, + { + "epoch": 0.49110320284697506, + "grad_norm": 1.312202897258491, + "learning_rate": 8.584596339849419e-06, + "loss": 0.1259, + "step": 276 + }, + { + "epoch": 0.4928825622775801, + "grad_norm": 1.019172687491143, + "learning_rate": 8.574839553856157e-06, + "loss": 0.106, + "step": 277 + }, + { + "epoch": 0.49466192170818507, + "grad_norm": 1.641994091224545, + "learning_rate": 8.565054840920145e-06, + "loss": 0.1579, + "step": 278 + }, + { + "epoch": 0.49644128113879005, + "grad_norm": 1.091593308412249, + "learning_rate": 8.55524227748037e-06, + "loss": 0.0974, + "step": 279 + }, + { + "epoch": 0.498220640569395, + "grad_norm": 1.35864549898435, + "learning_rate": 8.545401940193392e-06, + "loss": 0.1169, + "step": 280 + }, + { + "epoch": 0.5, + "grad_norm": 1.2614152792231237, + "learning_rate": 8.535533905932739e-06, + "loss": 0.1181, + "step": 281 + }, + { + "epoch": 0.501779359430605, + "grad_norm": 1.8405880597862696, + "learning_rate": 8.525638251788312e-06, + "loss": 0.171, + "step": 282 + }, + { + "epoch": 0.50355871886121, + "grad_norm": 1.4629610482911204, + "learning_rate": 
8.515715055065783e-06, + "loss": 0.1373, + "step": 283 + }, + { + "epoch": 0.505338078291815, + "grad_norm": 1.4437157015001951, + "learning_rate": 8.505764393285985e-06, + "loss": 0.1523, + "step": 284 + }, + { + "epoch": 0.5071174377224199, + "grad_norm": 1.3335215309491937, + "learning_rate": 8.495786344184314e-06, + "loss": 0.1165, + "step": 285 + }, + { + "epoch": 0.5088967971530249, + "grad_norm": 1.5284154106030527, + "learning_rate": 8.485780985710113e-06, + "loss": 0.1409, + "step": 286 + }, + { + "epoch": 0.5106761565836299, + "grad_norm": 1.3810740242991313, + "learning_rate": 8.475748396026074e-06, + "loss": 0.1236, + "step": 287 + }, + { + "epoch": 0.5124555160142349, + "grad_norm": 1.3954563500504358, + "learning_rate": 8.46568865350762e-06, + "loss": 0.1352, + "step": 288 + }, + { + "epoch": 0.5142348754448398, + "grad_norm": 1.6775089275988135, + "learning_rate": 8.45560183674229e-06, + "loss": 0.1489, + "step": 289 + }, + { + "epoch": 0.5160142348754448, + "grad_norm": 1.5352255277145486, + "learning_rate": 8.445488024529133e-06, + "loss": 0.1369, + "step": 290 + }, + { + "epoch": 0.5177935943060499, + "grad_norm": 1.159621715420468, + "learning_rate": 8.435347295878087e-06, + "loss": 0.0929, + "step": 291 + }, + { + "epoch": 0.5195729537366548, + "grad_norm": 1.0512067858943523, + "learning_rate": 8.425179730009368e-06, + "loss": 0.0904, + "step": 292 + }, + { + "epoch": 0.5213523131672598, + "grad_norm": 1.344689361873844, + "learning_rate": 8.41498540635284e-06, + "loss": 0.1209, + "step": 293 + }, + { + "epoch": 0.5231316725978647, + "grad_norm": 1.523410468566144, + "learning_rate": 8.404764404547404e-06, + "loss": 0.1316, + "step": 294 + }, + { + "epoch": 0.5249110320284698, + "grad_norm": 1.5411238747103475, + "learning_rate": 8.394516804440374e-06, + "loss": 0.1216, + "step": 295 + }, + { + "epoch": 0.5266903914590747, + "grad_norm": 1.4807364157036627, + "learning_rate": 8.384242686086848e-06, + "loss": 0.1338, + "step": 296 + }, + { + "epoch": 0.5284697508896797, + "grad_norm": 1.5544335300649341, + "learning_rate": 8.373942129749094e-06, + "loss": 0.1408, + "step": 297 + }, + { + "epoch": 0.5302491103202847, + "grad_norm": 1.0951700655618808, + "learning_rate": 8.363615215895908e-06, + "loss": 0.1137, + "step": 298 + }, + { + "epoch": 0.5320284697508897, + "grad_norm": 1.6691744052416226, + "learning_rate": 8.353262025202e-06, + "loss": 0.1407, + "step": 299 + }, + { + "epoch": 0.5338078291814946, + "grad_norm": 1.2038366640700635, + "learning_rate": 8.342882638547351e-06, + "loss": 0.0999, + "step": 300 + }, + { + "epoch": 0.5355871886120996, + "grad_norm": 1.3228328594800742, + "learning_rate": 8.332477137016587e-06, + "loss": 0.1294, + "step": 301 + }, + { + "epoch": 0.5373665480427047, + "grad_norm": 1.2019400342779778, + "learning_rate": 8.322045601898354e-06, + "loss": 0.1132, + "step": 302 + }, + { + "epoch": 0.5391459074733096, + "grad_norm": 1.1050987223643727, + "learning_rate": 8.311588114684665e-06, + "loss": 0.0984, + "step": 303 + }, + { + "epoch": 0.5409252669039146, + "grad_norm": 1.5723866295190063, + "learning_rate": 8.301104757070276e-06, + "loss": 0.1798, + "step": 304 + }, + { + "epoch": 0.5427046263345195, + "grad_norm": 1.4608790946680539, + "learning_rate": 8.290595610952045e-06, + "loss": 0.1225, + "step": 305 + }, + { + "epoch": 0.5444839857651246, + "grad_norm": 1.6577147147163798, + "learning_rate": 8.280060758428294e-06, + "loss": 0.156, + "step": 306 + }, + { + "epoch": 0.5462633451957295, + "grad_norm": 1.050267994871322, + 
"learning_rate": 8.269500281798164e-06, + "loss": 0.1021, + "step": 307 + }, + { + "epoch": 0.5480427046263345, + "grad_norm": 1.4247760825283253, + "learning_rate": 8.258914263560971e-06, + "loss": 0.1308, + "step": 308 + }, + { + "epoch": 0.5498220640569395, + "grad_norm": 1.3828113803361126, + "learning_rate": 8.248302786415567e-06, + "loss": 0.1351, + "step": 309 + }, + { + "epoch": 0.5516014234875445, + "grad_norm": 1.5443946941737774, + "learning_rate": 8.237665933259693e-06, + "loss": 0.1364, + "step": 310 + }, + { + "epoch": 0.5533807829181495, + "grad_norm": 1.4421977265130892, + "learning_rate": 8.227003787189323e-06, + "loss": 0.137, + "step": 311 + }, + { + "epoch": 0.5551601423487544, + "grad_norm": 1.1201475328948052, + "learning_rate": 8.216316431498028e-06, + "loss": 0.0993, + "step": 312 + }, + { + "epoch": 0.5569395017793595, + "grad_norm": 1.3552493640066288, + "learning_rate": 8.205603949676317e-06, + "loss": 0.1407, + "step": 313 + }, + { + "epoch": 0.5587188612099644, + "grad_norm": 1.4907506738970107, + "learning_rate": 8.194866425410984e-06, + "loss": 0.1354, + "step": 314 + }, + { + "epoch": 0.5604982206405694, + "grad_norm": 1.5785749000090659, + "learning_rate": 8.184103942584456e-06, + "loss": 0.1315, + "step": 315 + }, + { + "epoch": 0.5622775800711743, + "grad_norm": 1.3054199528576296, + "learning_rate": 8.173316585274144e-06, + "loss": 0.1153, + "step": 316 + }, + { + "epoch": 0.5640569395017794, + "grad_norm": 1.116417188323314, + "learning_rate": 8.162504437751775e-06, + "loss": 0.1181, + "step": 317 + }, + { + "epoch": 0.5658362989323843, + "grad_norm": 1.178882361160108, + "learning_rate": 8.151667584482742e-06, + "loss": 0.1086, + "step": 318 + }, + { + "epoch": 0.5676156583629893, + "grad_norm": 1.6896395727044602, + "learning_rate": 8.140806110125442e-06, + "loss": 0.1513, + "step": 319 + }, + { + "epoch": 0.5693950177935944, + "grad_norm": 1.1945397628884609, + "learning_rate": 8.129920099530608e-06, + "loss": 0.1231, + "step": 320 + }, + { + "epoch": 0.5711743772241993, + "grad_norm": 0.9443054650652738, + "learning_rate": 8.119009637740663e-06, + "loss": 0.0796, + "step": 321 + }, + { + "epoch": 0.5729537366548043, + "grad_norm": 2.433644384258339, + "learning_rate": 8.108074809989032e-06, + "loss": 0.1251, + "step": 322 + }, + { + "epoch": 0.5747330960854092, + "grad_norm": 1.3155930644886165, + "learning_rate": 8.097115701699498e-06, + "loss": 0.1054, + "step": 323 + }, + { + "epoch": 0.5765124555160143, + "grad_norm": 1.5522925480478513, + "learning_rate": 8.086132398485525e-06, + "loss": 0.151, + "step": 324 + }, + { + "epoch": 0.5782918149466192, + "grad_norm": 1.0158520585816664, + "learning_rate": 8.075124986149583e-06, + "loss": 0.0975, + "step": 325 + }, + { + "epoch": 0.5800711743772242, + "grad_norm": 1.2548408212995026, + "learning_rate": 8.064093550682494e-06, + "loss": 0.1116, + "step": 326 + }, + { + "epoch": 0.5818505338078291, + "grad_norm": 1.2689643872591336, + "learning_rate": 8.053038178262742e-06, + "loss": 0.0964, + "step": 327 + }, + { + "epoch": 0.5836298932384342, + "grad_norm": 1.4848493760001857, + "learning_rate": 8.041958955255815e-06, + "loss": 0.1242, + "step": 328 + }, + { + "epoch": 0.5854092526690391, + "grad_norm": 1.3572943156840809, + "learning_rate": 8.030855968213518e-06, + "loss": 0.1227, + "step": 329 + }, + { + "epoch": 0.5871886120996441, + "grad_norm": 1.4470743444243865, + "learning_rate": 8.019729303873307e-06, + "loss": 0.1278, + "step": 330 + }, + { + "epoch": 0.5889679715302492, + "grad_norm": 
1.0928820401030448, + "learning_rate": 8.008579049157607e-06, + "loss": 0.0948, + "step": 331 + }, + { + "epoch": 0.5907473309608541, + "grad_norm": 1.0906885389154088, + "learning_rate": 7.99740529117313e-06, + "loss": 0.1069, + "step": 332 + }, + { + "epoch": 0.5925266903914591, + "grad_norm": 1.357566203031381, + "learning_rate": 7.986208117210198e-06, + "loss": 0.1207, + "step": 333 + }, + { + "epoch": 0.594306049822064, + "grad_norm": 1.4650886000197623, + "learning_rate": 7.974987614742066e-06, + "loss": 0.1259, + "step": 334 + }, + { + "epoch": 0.5960854092526691, + "grad_norm": 1.6868619854111764, + "learning_rate": 7.963743871424224e-06, + "loss": 0.1536, + "step": 335 + }, + { + "epoch": 0.597864768683274, + "grad_norm": 1.3284237693189664, + "learning_rate": 7.952476975093729e-06, + "loss": 0.115, + "step": 336 + }, + { + "epoch": 0.599644128113879, + "grad_norm": 1.631884894939644, + "learning_rate": 7.941187013768508e-06, + "loss": 0.1497, + "step": 337 + }, + { + "epoch": 0.6014234875444839, + "grad_norm": 1.312006614151387, + "learning_rate": 7.929874075646673e-06, + "loss": 0.1091, + "step": 338 + }, + { + "epoch": 0.603202846975089, + "grad_norm": 1.7002380223023112, + "learning_rate": 7.918538249105835e-06, + "loss": 0.1372, + "step": 339 + }, + { + "epoch": 0.604982206405694, + "grad_norm": 1.6089822890973315, + "learning_rate": 7.907179622702409e-06, + "loss": 0.1412, + "step": 340 + }, + { + "epoch": 0.6067615658362989, + "grad_norm": 1.2525073857339628, + "learning_rate": 7.895798285170927e-06, + "loss": 0.115, + "step": 341 + }, + { + "epoch": 0.608540925266904, + "grad_norm": 1.2059351463607746, + "learning_rate": 7.88439432542334e-06, + "loss": 0.0976, + "step": 342 + }, + { + "epoch": 0.6103202846975089, + "grad_norm": 1.426350585134919, + "learning_rate": 7.872967832548327e-06, + "loss": 0.1332, + "step": 343 + }, + { + "epoch": 0.6120996441281139, + "grad_norm": 1.4459007236750945, + "learning_rate": 7.861518895810597e-06, + "loss": 0.1335, + "step": 344 + }, + { + "epoch": 0.6138790035587188, + "grad_norm": 1.3438963369991237, + "learning_rate": 7.850047604650188e-06, + "loss": 0.1357, + "step": 345 + }, + { + "epoch": 0.6156583629893239, + "grad_norm": 1.529991571860058, + "learning_rate": 7.838554048681783e-06, + "loss": 0.1397, + "step": 346 + }, + { + "epoch": 0.6174377224199288, + "grad_norm": 1.5019186177345394, + "learning_rate": 7.827038317693988e-06, + "loss": 0.1598, + "step": 347 + }, + { + "epoch": 0.6192170818505338, + "grad_norm": 4.408268242820045, + "learning_rate": 7.815500501648654e-06, + "loss": 0.137, + "step": 348 + }, + { + "epoch": 0.6209964412811388, + "grad_norm": 1.3802358392887788, + "learning_rate": 7.80394069068015e-06, + "loss": 0.1216, + "step": 349 + }, + { + "epoch": 0.6227758007117438, + "grad_norm": 1.4649494442602975, + "learning_rate": 7.79235897509468e-06, + "loss": 0.1352, + "step": 350 + }, + { + "epoch": 0.6245551601423488, + "grad_norm": 1.5282423268361176, + "learning_rate": 7.780755445369563e-06, + "loss": 0.1342, + "step": 351 + }, + { + "epoch": 0.6263345195729537, + "grad_norm": 1.7436997548881208, + "learning_rate": 7.769130192152538e-06, + "loss": 0.1587, + "step": 352 + }, + { + "epoch": 0.6281138790035588, + "grad_norm": 1.4492973660104254, + "learning_rate": 7.757483306261042e-06, + "loss": 0.1399, + "step": 353 + }, + { + "epoch": 0.6298932384341637, + "grad_norm": 1.8920898279442842, + "learning_rate": 7.745814878681516e-06, + "loss": 0.1533, + "step": 354 + }, + { + "epoch": 0.6316725978647687, + 
"grad_norm": 1.3034923915124113, + "learning_rate": 7.734125000568684e-06, + "loss": 0.1276, + "step": 355 + }, + { + "epoch": 0.6334519572953736, + "grad_norm": 1.2614478867062202, + "learning_rate": 7.722413763244837e-06, + "loss": 0.1185, + "step": 356 + }, + { + "epoch": 0.6352313167259787, + "grad_norm": 1.012501012775422, + "learning_rate": 7.710681258199136e-06, + "loss": 0.0942, + "step": 357 + }, + { + "epoch": 0.6370106761565836, + "grad_norm": 1.5046475142828677, + "learning_rate": 7.69892757708688e-06, + "loss": 0.1216, + "step": 358 + }, + { + "epoch": 0.6387900355871886, + "grad_norm": 1.8296956207768407, + "learning_rate": 7.687152811728799e-06, + "loss": 0.1275, + "step": 359 + }, + { + "epoch": 0.6405693950177936, + "grad_norm": 1.355044693816592, + "learning_rate": 7.675357054110337e-06, + "loss": 0.1284, + "step": 360 + }, + { + "epoch": 0.6423487544483986, + "grad_norm": 1.2658221782024195, + "learning_rate": 7.663540396380931e-06, + "loss": 0.109, + "step": 361 + }, + { + "epoch": 0.6441281138790036, + "grad_norm": 1.200866490873271, + "learning_rate": 7.651702930853287e-06, + "loss": 0.1073, + "step": 362 + }, + { + "epoch": 0.6459074733096085, + "grad_norm": 1.3708154597557751, + "learning_rate": 7.639844750002668e-06, + "loss": 0.1172, + "step": 363 + }, + { + "epoch": 0.6476868327402135, + "grad_norm": 1.2611137615038404, + "learning_rate": 7.627965946466167e-06, + "loss": 0.1298, + "step": 364 + }, + { + "epoch": 0.6494661921708185, + "grad_norm": 1.1850326627852785, + "learning_rate": 7.616066613041977e-06, + "loss": 0.1122, + "step": 365 + }, + { + "epoch": 0.6512455516014235, + "grad_norm": 1.3551706805297659, + "learning_rate": 7.6041468426886785e-06, + "loss": 0.1236, + "step": 366 + }, + { + "epoch": 0.6530249110320284, + "grad_norm": 1.0051973136101258, + "learning_rate": 7.592206728524507e-06, + "loss": 0.0851, + "step": 367 + }, + { + "epoch": 0.6548042704626335, + "grad_norm": 1.320125483804524, + "learning_rate": 7.580246363826621e-06, + "loss": 0.1091, + "step": 368 + }, + { + "epoch": 0.6565836298932385, + "grad_norm": 1.4979758702352242, + "learning_rate": 7.568265842030381e-06, + "loss": 0.1356, + "step": 369 + }, + { + "epoch": 0.6583629893238434, + "grad_norm": 1.250878469807032, + "learning_rate": 7.556265256728618e-06, + "loss": 0.1226, + "step": 370 + }, + { + "epoch": 0.6601423487544484, + "grad_norm": 1.2610023856998045, + "learning_rate": 7.544244701670894e-06, + "loss": 0.118, + "step": 371 + }, + { + "epoch": 0.6619217081850534, + "grad_norm": 1.4123352756784129, + "learning_rate": 7.532204270762786e-06, + "loss": 0.1309, + "step": 372 + }, + { + "epoch": 0.6637010676156584, + "grad_norm": 1.198475856415187, + "learning_rate": 7.520144058065133e-06, + "loss": 0.0976, + "step": 373 + }, + { + "epoch": 0.6654804270462633, + "grad_norm": 1.2455669749293448, + "learning_rate": 7.50806415779332e-06, + "loss": 0.103, + "step": 374 + }, + { + "epoch": 0.6672597864768683, + "grad_norm": 1.40954667227436, + "learning_rate": 7.495964664316525e-06, + "loss": 0.1207, + "step": 375 + }, + { + "epoch": 0.6690391459074733, + "grad_norm": 1.3646583479133876, + "learning_rate": 7.4838456721569975e-06, + "loss": 0.1183, + "step": 376 + }, + { + "epoch": 0.6708185053380783, + "grad_norm": 1.1672216677261495, + "learning_rate": 7.471707275989304e-06, + "loss": 0.1117, + "step": 377 + }, + { + "epoch": 0.6725978647686833, + "grad_norm": 1.2458080943094203, + "learning_rate": 7.459549570639602e-06, + "loss": 0.1077, + "step": 378 + }, + { + "epoch": 
0.6743772241992882, + "grad_norm": 1.34992094298806, + "learning_rate": 7.447372651084896e-06, + "loss": 0.1187, + "step": 379 + }, + { + "epoch": 0.6761565836298933, + "grad_norm": 1.1392793075673076, + "learning_rate": 7.435176612452286e-06, + "loss": 0.1002, + "step": 380 + }, + { + "epoch": 0.6779359430604982, + "grad_norm": 1.509242478825323, + "learning_rate": 7.4229615500182396e-06, + "loss": 0.1332, + "step": 381 + }, + { + "epoch": 0.6797153024911032, + "grad_norm": 1.1082442571399327, + "learning_rate": 7.4107275592078345e-06, + "loss": 0.0994, + "step": 382 + }, + { + "epoch": 0.6814946619217082, + "grad_norm": 1.2190206115453381, + "learning_rate": 7.398474735594022e-06, + "loss": 0.1003, + "step": 383 + }, + { + "epoch": 0.6832740213523132, + "grad_norm": 1.4613581029411111, + "learning_rate": 7.386203174896872e-06, + "loss": 0.1334, + "step": 384 + }, + { + "epoch": 0.6850533807829181, + "grad_norm": 1.3880626311701247, + "learning_rate": 7.373912972982838e-06, + "loss": 0.1224, + "step": 385 + }, + { + "epoch": 0.6868327402135231, + "grad_norm": 1.20257242571399, + "learning_rate": 7.361604225863992e-06, + "loss": 0.1088, + "step": 386 + }, + { + "epoch": 0.6886120996441281, + "grad_norm": 1.5957397780322953, + "learning_rate": 7.349277029697287e-06, + "loss": 0.1374, + "step": 387 + }, + { + "epoch": 0.6903914590747331, + "grad_norm": 1.3620482394438482, + "learning_rate": 7.336931480783801e-06, + "loss": 0.1162, + "step": 388 + }, + { + "epoch": 0.6921708185053381, + "grad_norm": 1.5870093402282994, + "learning_rate": 7.3245676755679854e-06, + "loss": 0.1251, + "step": 389 + }, + { + "epoch": 0.693950177935943, + "grad_norm": 1.3841478989559841, + "learning_rate": 7.312185710636911e-06, + "loss": 0.1228, + "step": 390 + }, + { + "epoch": 0.6957295373665481, + "grad_norm": 1.2266321271403495, + "learning_rate": 7.299785682719512e-06, + "loss": 0.0946, + "step": 391 + }, + { + "epoch": 0.697508896797153, + "grad_norm": 1.3206399673108276, + "learning_rate": 7.287367688685835e-06, + "loss": 0.1178, + "step": 392 + }, + { + "epoch": 0.699288256227758, + "grad_norm": 1.7284010500339315, + "learning_rate": 7.274931825546279e-06, + "loss": 0.1377, + "step": 393 + }, + { + "epoch": 0.701067615658363, + "grad_norm": 1.237151195783981, + "learning_rate": 7.262478190450834e-06, + "loss": 0.1058, + "step": 394 + }, + { + "epoch": 0.702846975088968, + "grad_norm": 1.4359366891420557, + "learning_rate": 7.250006880688332e-06, + "loss": 0.143, + "step": 395 + }, + { + "epoch": 0.7046263345195729, + "grad_norm": 1.303179595056172, + "learning_rate": 7.2375179936856775e-06, + "loss": 0.1136, + "step": 396 + }, + { + "epoch": 0.7064056939501779, + "grad_norm": 1.1508347712381617, + "learning_rate": 7.22501162700709e-06, + "loss": 0.1176, + "step": 397 + }, + { + "epoch": 0.708185053380783, + "grad_norm": 1.2194552151231233, + "learning_rate": 7.21248787835334e-06, + "loss": 0.1085, + "step": 398 + }, + { + "epoch": 0.7099644128113879, + "grad_norm": 1.2550760299236767, + "learning_rate": 7.199946845560994e-06, + "loss": 0.108, + "step": 399 + }, + { + "epoch": 0.7117437722419929, + "grad_norm": 1.6680598620705607, + "learning_rate": 7.1873886266016365e-06, + "loss": 0.1605, + "step": 400 + }, + { + "epoch": 0.7117437722419929, + "eval_loss": 0.13593703508377075, + "eval_runtime": 7.1523, + "eval_samples_per_second": 6.431, + "eval_steps_per_second": 1.678, + "step": 400 + }, + { + "epoch": 0.7135231316725978, + "grad_norm": 1.3857617042325157, + "learning_rate": 7.174813319581115e-06, + 
"loss": 0.1307, + "step": 401 + }, + { + "epoch": 0.7153024911032029, + "grad_norm": 1.922593312155491, + "learning_rate": 7.162221022738768e-06, + "loss": 0.149, + "step": 402 + }, + { + "epoch": 0.7170818505338078, + "grad_norm": 1.4902600933078627, + "learning_rate": 7.149611834446664e-06, + "loss": 0.1505, + "step": 403 + }, + { + "epoch": 0.7188612099644128, + "grad_norm": 1.4168107428036665, + "learning_rate": 7.136985853208824e-06, + "loss": 0.1335, + "step": 404 + }, + { + "epoch": 0.7206405693950177, + "grad_norm": 1.2813571246783395, + "learning_rate": 7.124343177660462e-06, + "loss": 0.1123, + "step": 405 + }, + { + "epoch": 0.7224199288256228, + "grad_norm": 1.6025463929831045, + "learning_rate": 7.111683906567206e-06, + "loss": 0.1441, + "step": 406 + }, + { + "epoch": 0.7241992882562278, + "grad_norm": 1.3440377342055425, + "learning_rate": 7.099008138824329e-06, + "loss": 0.1126, + "step": 407 + }, + { + "epoch": 0.7259786476868327, + "grad_norm": 1.2724255186050077, + "learning_rate": 7.086315973455982e-06, + "loss": 0.1213, + "step": 408 + }, + { + "epoch": 0.7277580071174378, + "grad_norm": 1.4778145133673117, + "learning_rate": 7.0736075096144084e-06, + "loss": 0.1408, + "step": 409 + }, + { + "epoch": 0.7295373665480427, + "grad_norm": 1.0833048159494032, + "learning_rate": 7.060882846579182e-06, + "loss": 0.0987, + "step": 410 + }, + { + "epoch": 0.7313167259786477, + "grad_norm": 1.5143152577606858, + "learning_rate": 7.048142083756427e-06, + "loss": 0.1382, + "step": 411 + }, + { + "epoch": 0.7330960854092526, + "grad_norm": 1.3197221214599737, + "learning_rate": 7.035385320678035e-06, + "loss": 0.1258, + "step": 412 + }, + { + "epoch": 0.7348754448398577, + "grad_norm": 1.3755616798221522, + "learning_rate": 7.022612657000898e-06, + "loss": 0.1154, + "step": 413 + }, + { + "epoch": 0.7366548042704626, + "grad_norm": 1.3591609015544808, + "learning_rate": 7.0098241925061215e-06, + "loss": 0.1246, + "step": 414 + }, + { + "epoch": 0.7384341637010676, + "grad_norm": 1.7750632683995469, + "learning_rate": 6.997020027098249e-06, + "loss": 0.1326, + "step": 415 + }, + { + "epoch": 0.7402135231316725, + "grad_norm": 1.4083511956168764, + "learning_rate": 6.9842002608044844e-06, + "loss": 0.1382, + "step": 416 + }, + { + "epoch": 0.7419928825622776, + "grad_norm": 1.1617276411064767, + "learning_rate": 6.971364993773901e-06, + "loss": 0.1055, + "step": 417 + }, + { + "epoch": 0.7437722419928826, + "grad_norm": 1.4035719811413283, + "learning_rate": 6.958514326276669e-06, + "loss": 0.1101, + "step": 418 + }, + { + "epoch": 0.7455516014234875, + "grad_norm": 1.3293580992170293, + "learning_rate": 6.945648358703269e-06, + "loss": 0.1032, + "step": 419 + }, + { + "epoch": 0.7473309608540926, + "grad_norm": 1.3235544206067675, + "learning_rate": 6.932767191563703e-06, + "loss": 0.1407, + "step": 420 + }, + { + "epoch": 0.7491103202846975, + "grad_norm": 1.0915905612236356, + "learning_rate": 6.919870925486718e-06, + "loss": 0.0892, + "step": 421 + }, + { + "epoch": 0.7508896797153025, + "grad_norm": 1.0646199439982955, + "learning_rate": 6.906959661219011e-06, + "loss": 0.1003, + "step": 422 + }, + { + "epoch": 0.7526690391459074, + "grad_norm": 1.3132527820198336, + "learning_rate": 6.8940334996244505e-06, + "loss": 0.112, + "step": 423 + }, + { + "epoch": 0.7544483985765125, + "grad_norm": 1.3131228207975008, + "learning_rate": 6.881092541683279e-06, + "loss": 0.1218, + "step": 424 + }, + { + "epoch": 0.7562277580071174, + "grad_norm": 1.3253678432316216, + "learning_rate": 
6.8681368884913345e-06, + "loss": 0.1053, + "step": 425 + }, + { + "epoch": 0.7580071174377224, + "grad_norm": 1.1045518764985973, + "learning_rate": 6.855166641259252e-06, + "loss": 0.1004, + "step": 426 + }, + { + "epoch": 0.7597864768683275, + "grad_norm": 1.1916871119247132, + "learning_rate": 6.8421819013116766e-06, + "loss": 0.1165, + "step": 427 + }, + { + "epoch": 0.7615658362989324, + "grad_norm": 1.2355688910032794, + "learning_rate": 6.829182770086474e-06, + "loss": 0.1293, + "step": 428 + }, + { + "epoch": 0.7633451957295374, + "grad_norm": 1.361448938887964, + "learning_rate": 6.816169349133934e-06, + "loss": 0.116, + "step": 429 + }, + { + "epoch": 0.7651245551601423, + "grad_norm": 1.1992796748417411, + "learning_rate": 6.803141740115979e-06, + "loss": 0.0894, + "step": 430 + }, + { + "epoch": 0.7669039145907474, + "grad_norm": 2.1019875524018112, + "learning_rate": 6.7901000448053676e-06, + "loss": 0.177, + "step": 431 + }, + { + "epoch": 0.7686832740213523, + "grad_norm": 1.113172252528109, + "learning_rate": 6.777044365084907e-06, + "loss": 0.0992, + "step": 432 + }, + { + "epoch": 0.7704626334519573, + "grad_norm": 1.1910343388250089, + "learning_rate": 6.763974802946649e-06, + "loss": 0.1074, + "step": 433 + }, + { + "epoch": 0.7722419928825622, + "grad_norm": 1.4591817103826428, + "learning_rate": 6.750891460491093e-06, + "loss": 0.1499, + "step": 434 + }, + { + "epoch": 0.7740213523131673, + "grad_norm": 1.5086158043764135, + "learning_rate": 6.737794439926395e-06, + "loss": 0.1474, + "step": 435 + }, + { + "epoch": 0.7758007117437722, + "grad_norm": 1.3160884455859283, + "learning_rate": 6.724683843567567e-06, + "loss": 0.1284, + "step": 436 + }, + { + "epoch": 0.7775800711743772, + "grad_norm": 1.329340529761385, + "learning_rate": 6.711559773835672e-06, + "loss": 0.1131, + "step": 437 + }, + { + "epoch": 0.7793594306049823, + "grad_norm": 1.4861279018952749, + "learning_rate": 6.69842233325703e-06, + "loss": 0.1161, + "step": 438 + }, + { + "epoch": 0.7811387900355872, + "grad_norm": 1.7627552929537986, + "learning_rate": 6.685271624462416e-06, + "loss": 0.154, + "step": 439 + }, + { + "epoch": 0.7829181494661922, + "grad_norm": 1.2573572355395808, + "learning_rate": 6.672107750186255e-06, + "loss": 0.1107, + "step": 440 + }, + { + "epoch": 0.7846975088967971, + "grad_norm": 0.8993430850299913, + "learning_rate": 6.658930813265825e-06, + "loss": 0.0867, + "step": 441 + }, + { + "epoch": 0.7864768683274022, + "grad_norm": 1.1796545941704497, + "learning_rate": 6.645740916640449e-06, + "loss": 0.1047, + "step": 442 + }, + { + "epoch": 0.7882562277580071, + "grad_norm": 1.1062583377101722, + "learning_rate": 6.63253816335069e-06, + "loss": 0.0928, + "step": 443 + }, + { + "epoch": 0.7900355871886121, + "grad_norm": 1.217718954992437, + "learning_rate": 6.619322656537552e-06, + "loss": 0.1095, + "step": 444 + }, + { + "epoch": 0.791814946619217, + "grad_norm": 1.2126501930341467, + "learning_rate": 6.606094499441671e-06, + "loss": 0.104, + "step": 445 + }, + { + "epoch": 0.7935943060498221, + "grad_norm": 1.3605714831184592, + "learning_rate": 6.592853795402502e-06, + "loss": 0.1124, + "step": 446 + }, + { + "epoch": 0.7953736654804271, + "grad_norm": 1.3391599831782586, + "learning_rate": 6.579600647857525e-06, + "loss": 0.1213, + "step": 447 + }, + { + "epoch": 0.797153024911032, + "grad_norm": 1.5955458337165742, + "learning_rate": 6.566335160341425e-06, + "loss": 0.1457, + "step": 448 + }, + { + "epoch": 0.798932384341637, + "grad_norm": 1.2065156195943607, + 
"learning_rate": 6.553057436485289e-06, + "loss": 0.1062, + "step": 449 + }, + { + "epoch": 0.800711743772242, + "grad_norm": 1.2116479228176436, + "learning_rate": 6.539767580015799e-06, + "loss": 0.1233, + "step": 450 + }, + { + "epoch": 0.802491103202847, + "grad_norm": 1.6108974803910467, + "learning_rate": 6.52646569475441e-06, + "loss": 0.1277, + "step": 451 + }, + { + "epoch": 0.8042704626334519, + "grad_norm": 1.409122193803232, + "learning_rate": 6.513151884616556e-06, + "loss": 0.1302, + "step": 452 + }, + { + "epoch": 0.806049822064057, + "grad_norm": 2.761863084728338, + "learning_rate": 6.499826253610823e-06, + "loss": 0.1292, + "step": 453 + }, + { + "epoch": 0.8078291814946619, + "grad_norm": 1.1298348277268495, + "learning_rate": 6.486488905838143e-06, + "loss": 0.1077, + "step": 454 + }, + { + "epoch": 0.8096085409252669, + "grad_norm": 1.304266904791572, + "learning_rate": 6.473139945490984e-06, + "loss": 0.0973, + "step": 455 + }, + { + "epoch": 0.8113879003558719, + "grad_norm": 1.470403225267748, + "learning_rate": 6.459779476852528e-06, + "loss": 0.129, + "step": 456 + }, + { + "epoch": 0.8131672597864769, + "grad_norm": 1.2994618526756987, + "learning_rate": 6.446407604295863e-06, + "loss": 0.1212, + "step": 457 + }, + { + "epoch": 0.8149466192170819, + "grad_norm": 1.376936560412594, + "learning_rate": 6.433024432283169e-06, + "loss": 0.1163, + "step": 458 + }, + { + "epoch": 0.8167259786476868, + "grad_norm": 1.4692918244800937, + "learning_rate": 6.41963006536489e-06, + "loss": 0.1243, + "step": 459 + }, + { + "epoch": 0.8185053380782918, + "grad_norm": 1.3813366689041184, + "learning_rate": 6.4062246081789316e-06, + "loss": 0.1189, + "step": 460 + }, + { + "epoch": 0.8202846975088968, + "grad_norm": 1.0647073329181291, + "learning_rate": 6.392808165449836e-06, + "loss": 0.0845, + "step": 461 + }, + { + "epoch": 0.8220640569395018, + "grad_norm": 1.3277693379270958, + "learning_rate": 6.379380841987965e-06, + "loss": 0.1114, + "step": 462 + }, + { + "epoch": 0.8238434163701067, + "grad_norm": 1.1626084033223911, + "learning_rate": 6.365942742688684e-06, + "loss": 0.0964, + "step": 463 + }, + { + "epoch": 0.8256227758007118, + "grad_norm": 1.5328885016841056, + "learning_rate": 6.352493972531535e-06, + "loss": 0.1348, + "step": 464 + }, + { + "epoch": 0.8274021352313167, + "grad_norm": 1.6975023292719447, + "learning_rate": 6.339034636579425e-06, + "loss": 0.1473, + "step": 465 + }, + { + "epoch": 0.8291814946619217, + "grad_norm": 1.2556416031368975, + "learning_rate": 6.325564839977802e-06, + "loss": 0.0927, + "step": 466 + }, + { + "epoch": 0.8309608540925267, + "grad_norm": 1.074064032510885, + "learning_rate": 6.312084687953835e-06, + "loss": 0.0969, + "step": 467 + }, + { + "epoch": 0.8327402135231317, + "grad_norm": 1.6835238670487385, + "learning_rate": 6.298594285815585e-06, + "loss": 0.1511, + "step": 468 + }, + { + "epoch": 0.8345195729537367, + "grad_norm": 1.4839581608826677, + "learning_rate": 6.2850937389511936e-06, + "loss": 0.1471, + "step": 469 + }, + { + "epoch": 0.8362989323843416, + "grad_norm": 1.1445849946515074, + "learning_rate": 6.271583152828049e-06, + "loss": 0.0941, + "step": 470 + }, + { + "epoch": 0.8380782918149466, + "grad_norm": 1.2301203032248997, + "learning_rate": 6.258062632991972e-06, + "loss": 0.0938, + "step": 471 + }, + { + "epoch": 0.8398576512455516, + "grad_norm": 1.229861866409161, + "learning_rate": 6.244532285066382e-06, + "loss": 0.118, + "step": 472 + }, + { + "epoch": 0.8416370106761566, + "grad_norm": 
1.1644908033621832, + "learning_rate": 6.2309922147514775e-06, + "loss": 0.1081, + "step": 473 + }, + { + "epoch": 0.8434163701067615, + "grad_norm": 1.1517835145097681, + "learning_rate": 6.2174425278234115e-06, + "loss": 0.125, + "step": 474 + }, + { + "epoch": 0.8451957295373665, + "grad_norm": 1.277475692490733, + "learning_rate": 6.20388333013346e-06, + "loss": 0.0903, + "step": 475 + }, + { + "epoch": 0.8469750889679716, + "grad_norm": 1.4269406154560587, + "learning_rate": 6.190314727607196e-06, + "loss": 0.1368, + "step": 476 + }, + { + "epoch": 0.8487544483985765, + "grad_norm": 1.367125057436054, + "learning_rate": 6.176736826243671e-06, + "loss": 0.1166, + "step": 477 + }, + { + "epoch": 0.8505338078291815, + "grad_norm": 1.2159537541519017, + "learning_rate": 6.163149732114571e-06, + "loss": 0.1144, + "step": 478 + }, + { + "epoch": 0.8523131672597865, + "grad_norm": 1.3055103197822622, + "learning_rate": 6.149553551363404e-06, + "loss": 0.0998, + "step": 479 + }, + { + "epoch": 0.8540925266903915, + "grad_norm": 1.4481309925729284, + "learning_rate": 6.1359483902046605e-06, + "loss": 0.1303, + "step": 480 + }, + { + "epoch": 0.8558718861209964, + "grad_norm": 1.21028745853387, + "learning_rate": 6.122334354922984e-06, + "loss": 0.1058, + "step": 481 + }, + { + "epoch": 0.8576512455516014, + "grad_norm": 1.1426605648496042, + "learning_rate": 6.108711551872347e-06, + "loss": 0.1052, + "step": 482 + }, + { + "epoch": 0.8594306049822064, + "grad_norm": 1.4959349397230497, + "learning_rate": 6.095080087475218e-06, + "loss": 0.1219, + "step": 483 + }, + { + "epoch": 0.8612099644128114, + "grad_norm": 1.2524171813001315, + "learning_rate": 6.0814400682217236e-06, + "loss": 0.1206, + "step": 484 + }, + { + "epoch": 0.8629893238434164, + "grad_norm": 1.2423264837863424, + "learning_rate": 6.067791600668823e-06, + "loss": 0.0984, + "step": 485 + }, + { + "epoch": 0.8647686832740213, + "grad_norm": 1.0254265008644459, + "learning_rate": 6.054134791439479e-06, + "loss": 0.0761, + "step": 486 + }, + { + "epoch": 0.8665480427046264, + "grad_norm": 1.2467048105306586, + "learning_rate": 6.040469747221815e-06, + "loss": 0.0973, + "step": 487 + }, + { + "epoch": 0.8683274021352313, + "grad_norm": 1.3103708516140928, + "learning_rate": 6.026796574768288e-06, + "loss": 0.0972, + "step": 488 + }, + { + "epoch": 0.8701067615658363, + "grad_norm": 1.3063825459585219, + "learning_rate": 6.013115380894854e-06, + "loss": 0.1145, + "step": 489 + }, + { + "epoch": 0.8718861209964412, + "grad_norm": 1.1721813308111106, + "learning_rate": 5.999426272480133e-06, + "loss": 0.093, + "step": 490 + }, + { + "epoch": 0.8736654804270463, + "grad_norm": 1.2947034866641753, + "learning_rate": 5.985729356464575e-06, + "loss": 0.1156, + "step": 491 + }, + { + "epoch": 0.8754448398576512, + "grad_norm": 1.2238543229470935, + "learning_rate": 5.972024739849622e-06, + "loss": 0.1048, + "step": 492 + }, + { + "epoch": 0.8772241992882562, + "grad_norm": 0.899238492025567, + "learning_rate": 5.958312529696874e-06, + "loss": 0.0783, + "step": 493 + }, + { + "epoch": 0.8790035587188612, + "grad_norm": 1.3050605129441113, + "learning_rate": 5.944592833127253e-06, + "loss": 0.1346, + "step": 494 + }, + { + "epoch": 0.8807829181494662, + "grad_norm": 1.2659620119650414, + "learning_rate": 5.9308657573201645e-06, + "loss": 0.126, + "step": 495 + }, + { + "epoch": 0.8825622775800712, + "grad_norm": 1.053722658305837, + "learning_rate": 5.917131409512663e-06, + "loss": 0.0966, + "step": 496 + }, + { + "epoch": 
0.8843416370106761, + "grad_norm": 1.075322132871375, + "learning_rate": 5.903389896998611e-06, + "loss": 0.1028, + "step": 497 + }, + { + "epoch": 0.8861209964412812, + "grad_norm": 1.4027881729705716, + "learning_rate": 5.889641327127843e-06, + "loss": 0.1107, + "step": 498 + }, + { + "epoch": 0.8879003558718861, + "grad_norm": 1.2532074853235078, + "learning_rate": 5.875885807305326e-06, + "loss": 0.1137, + "step": 499 + }, + { + "epoch": 0.8896797153024911, + "grad_norm": 1.1427736844803538, + "learning_rate": 5.862123444990319e-06, + "loss": 0.1033, + "step": 500 + }, + { + "epoch": 0.891459074733096, + "grad_norm": 1.480776373418844, + "learning_rate": 5.848354347695537e-06, + "loss": 0.1605, + "step": 501 + }, + { + "epoch": 0.8932384341637011, + "grad_norm": 1.439908135605425, + "learning_rate": 5.83457862298631e-06, + "loss": 0.1332, + "step": 502 + }, + { + "epoch": 0.895017793594306, + "grad_norm": 1.5682242600786214, + "learning_rate": 5.8207963784797396e-06, + "loss": 0.1268, + "step": 503 + }, + { + "epoch": 0.896797153024911, + "grad_norm": 1.2659971998362078, + "learning_rate": 5.807007721843862e-06, + "loss": 0.1192, + "step": 504 + }, + { + "epoch": 0.8985765124555161, + "grad_norm": 1.6373734655317385, + "learning_rate": 5.793212760796804e-06, + "loss": 0.1549, + "step": 505 + }, + { + "epoch": 0.900355871886121, + "grad_norm": 1.3284478949173315, + "learning_rate": 5.779411603105947e-06, + "loss": 0.1261, + "step": 506 + }, + { + "epoch": 0.902135231316726, + "grad_norm": 1.2988901561655044, + "learning_rate": 5.765604356587076e-06, + "loss": 0.1169, + "step": 507 + }, + { + "epoch": 0.9039145907473309, + "grad_norm": 1.3015631068654006, + "learning_rate": 5.751791129103545e-06, + "loss": 0.1225, + "step": 508 + }, + { + "epoch": 0.905693950177936, + "grad_norm": 1.462346480774781, + "learning_rate": 5.737972028565431e-06, + "loss": 0.1408, + "step": 509 + }, + { + "epoch": 0.9074733096085409, + "grad_norm": 1.300985474327381, + "learning_rate": 5.7241471629286934e-06, + "loss": 0.1221, + "step": 510 + }, + { + "epoch": 0.9092526690391459, + "grad_norm": 1.532776752619801, + "learning_rate": 5.7103166401943276e-06, + "loss": 0.1277, + "step": 511 + }, + { + "epoch": 0.9110320284697508, + "grad_norm": 1.3764231355293768, + "learning_rate": 5.696480568407523e-06, + "loss": 0.1175, + "step": 512 + }, + { + "epoch": 0.9128113879003559, + "grad_norm": 1.62882350692066, + "learning_rate": 5.682639055656817e-06, + "loss": 0.1329, + "step": 513 + }, + { + "epoch": 0.9145907473309609, + "grad_norm": 1.390699574289703, + "learning_rate": 5.668792210073255e-06, + "loss": 0.1379, + "step": 514 + }, + { + "epoch": 0.9163701067615658, + "grad_norm": 1.505728133737634, + "learning_rate": 5.654940139829544e-06, + "loss": 0.1289, + "step": 515 + }, + { + "epoch": 0.9181494661921709, + "grad_norm": 1.2352069871978812, + "learning_rate": 5.641082953139201e-06, + "loss": 0.1086, + "step": 516 + }, + { + "epoch": 0.9199288256227758, + "grad_norm": 0.9535889029324193, + "learning_rate": 5.6272207582557195e-06, + "loss": 0.0757, + "step": 517 + }, + { + "epoch": 0.9217081850533808, + "grad_norm": 1.239827493884586, + "learning_rate": 5.61335366347171e-06, + "loss": 0.0923, + "step": 518 + }, + { + "epoch": 0.9234875444839857, + "grad_norm": 1.2687618628655917, + "learning_rate": 5.599481777118071e-06, + "loss": 0.1205, + "step": 519 + }, + { + "epoch": 0.9252669039145908, + "grad_norm": 1.3827338167233967, + "learning_rate": 5.585605207563124e-06, + "loss": 0.1032, + "step": 520 + }, + { + 
"epoch": 0.9270462633451957, + "grad_norm": 1.2943750345053857, + "learning_rate": 5.571724063211782e-06, + "loss": 0.1056, + "step": 521 + }, + { + "epoch": 0.9288256227758007, + "grad_norm": 1.154204506725661, + "learning_rate": 5.557838452504692e-06, + "loss": 0.0865, + "step": 522 + }, + { + "epoch": 0.9306049822064056, + "grad_norm": 1.5555347050342136, + "learning_rate": 5.5439484839173996e-06, + "loss": 0.1236, + "step": 523 + }, + { + "epoch": 0.9323843416370107, + "grad_norm": 1.2504207552639395, + "learning_rate": 5.530054265959486e-06, + "loss": 0.1081, + "step": 524 + }, + { + "epoch": 0.9341637010676157, + "grad_norm": 1.2083286597070473, + "learning_rate": 5.516155907173735e-06, + "loss": 0.1185, + "step": 525 + }, + { + "epoch": 0.9359430604982206, + "grad_norm": 1.0566655375547844, + "learning_rate": 5.5022535161352764e-06, + "loss": 0.0912, + "step": 526 + }, + { + "epoch": 0.9377224199288257, + "grad_norm": 1.4227842566270228, + "learning_rate": 5.488347201450741e-06, + "loss": 0.1137, + "step": 527 + }, + { + "epoch": 0.9395017793594306, + "grad_norm": 1.1589922153147618, + "learning_rate": 5.47443707175741e-06, + "loss": 0.11, + "step": 528 + }, + { + "epoch": 0.9412811387900356, + "grad_norm": 1.2884702592213582, + "learning_rate": 5.46052323572237e-06, + "loss": 0.111, + "step": 529 + }, + { + "epoch": 0.9430604982206405, + "grad_norm": 1.058974783820137, + "learning_rate": 5.446605802041662e-06, + "loss": 0.0882, + "step": 530 + }, + { + "epoch": 0.9448398576512456, + "grad_norm": 1.0573388209912145, + "learning_rate": 5.432684879439428e-06, + "loss": 0.0883, + "step": 531 + }, + { + "epoch": 0.9466192170818505, + "grad_norm": 1.0408230290222384, + "learning_rate": 5.418760576667071e-06, + "loss": 0.0807, + "step": 532 + }, + { + "epoch": 0.9483985765124555, + "grad_norm": 1.3449177340112355, + "learning_rate": 5.404833002502398e-06, + "loss": 0.133, + "step": 533 + }, + { + "epoch": 0.9501779359430605, + "grad_norm": 1.5343246666409203, + "learning_rate": 5.39090226574877e-06, + "loss": 0.1214, + "step": 534 + }, + { + "epoch": 0.9519572953736655, + "grad_norm": 1.7169322765401176, + "learning_rate": 5.376968475234258e-06, + "loss": 0.1528, + "step": 535 + }, + { + "epoch": 0.9537366548042705, + "grad_norm": 0.9441690695498636, + "learning_rate": 5.363031739810787e-06, + "loss": 0.0776, + "step": 536 + }, + { + "epoch": 0.9555160142348754, + "grad_norm": 1.2393233775737496, + "learning_rate": 5.349092168353291e-06, + "loss": 0.1199, + "step": 537 + }, + { + "epoch": 0.9572953736654805, + "grad_norm": 1.1501632012710579, + "learning_rate": 5.335149869758855e-06, + "loss": 0.0998, + "step": 538 + }, + { + "epoch": 0.9590747330960854, + "grad_norm": 1.345261486444856, + "learning_rate": 5.32120495294587e-06, + "loss": 0.109, + "step": 539 + }, + { + "epoch": 0.9608540925266904, + "grad_norm": 1.1582897990190524, + "learning_rate": 5.3072575268531835e-06, + "loss": 0.1234, + "step": 540 + }, + { + "epoch": 0.9626334519572953, + "grad_norm": 1.3216175000984673, + "learning_rate": 5.293307700439242e-06, + "loss": 0.1168, + "step": 541 + }, + { + "epoch": 0.9644128113879004, + "grad_norm": 1.7012563356884112, + "learning_rate": 5.2793555826812456e-06, + "loss": 0.1258, + "step": 542 + }, + { + "epoch": 0.9661921708185054, + "grad_norm": 1.1504863796193265, + "learning_rate": 5.265401282574294e-06, + "loss": 0.0894, + "step": 543 + }, + { + "epoch": 0.9679715302491103, + "grad_norm": 1.4774605956962692, + "learning_rate": 5.2514449091305375e-06, + "loss": 0.1171, + "step": 
544 + }, + { + "epoch": 0.9697508896797153, + "grad_norm": 1.5137108185784622, + "learning_rate": 5.237486571378317e-06, + "loss": 0.1267, + "step": 545 + }, + { + "epoch": 0.9715302491103203, + "grad_norm": 1.325817745776221, + "learning_rate": 5.22352637836133e-06, + "loss": 0.1328, + "step": 546 + }, + { + "epoch": 0.9733096085409253, + "grad_norm": 1.2365677758500844, + "learning_rate": 5.209564439137755e-06, + "loss": 0.1038, + "step": 547 + }, + { + "epoch": 0.9750889679715302, + "grad_norm": 1.4571885975836905, + "learning_rate": 5.195600862779421e-06, + "loss": 0.1502, + "step": 548 + }, + { + "epoch": 0.9768683274021353, + "grad_norm": 1.0593248552255834, + "learning_rate": 5.181635758370942e-06, + "loss": 0.0747, + "step": 549 + }, + { + "epoch": 0.9786476868327402, + "grad_norm": 1.4488899932240877, + "learning_rate": 5.167669235008871e-06, + "loss": 0.1304, + "step": 550 + }, + { + "epoch": 0.9804270462633452, + "grad_norm": 1.7408954349025696, + "learning_rate": 5.153701401800845e-06, + "loss": 0.1534, + "step": 551 + }, + { + "epoch": 0.9822064056939501, + "grad_norm": 1.2490011421157257, + "learning_rate": 5.139732367864736e-06, + "loss": 0.0977, + "step": 552 + }, + { + "epoch": 0.9839857651245552, + "grad_norm": 1.1591259360485227, + "learning_rate": 5.1257622423277934e-06, + "loss": 0.1078, + "step": 553 + }, + { + "epoch": 0.9857651245551602, + "grad_norm": 1.1133537193913574, + "learning_rate": 5.111791134325793e-06, + "loss": 0.1007, + "step": 554 + }, + { + "epoch": 0.9875444839857651, + "grad_norm": 1.071652029710098, + "learning_rate": 5.097819153002192e-06, + "loss": 0.0965, + "step": 555 + }, + { + "epoch": 0.9893238434163701, + "grad_norm": 1.128937600862167, + "learning_rate": 5.083846407507263e-06, + "loss": 0.0964, + "step": 556 + }, + { + "epoch": 0.9911032028469751, + "grad_norm": 1.5379767941730706, + "learning_rate": 5.0698730069972535e-06, + "loss": 0.1259, + "step": 557 + }, + { + "epoch": 0.9928825622775801, + "grad_norm": 1.0822578639544602, + "learning_rate": 5.055899060633524e-06, + "loss": 0.0888, + "step": 558 + }, + { + "epoch": 0.994661921708185, + "grad_norm": 1.1674467130438533, + "learning_rate": 5.041924677581702e-06, + "loss": 0.1125, + "step": 559 + }, + { + "epoch": 0.99644128113879, + "grad_norm": 1.301186922789244, + "learning_rate": 5.0279499670108245e-06, + "loss": 0.1121, + "step": 560 + }, + { + "epoch": 0.998220640569395, + "grad_norm": 1.2146433286202794, + "learning_rate": 5.013975038092491e-06, + "loss": 0.1064, + "step": 561 + }, + { + "epoch": 1.0, + "grad_norm": 1.233157977712758, + "learning_rate": 5e-06, + "loss": 0.0962, + "step": 562 + }, + { + "epoch": 1.001779359430605, + "grad_norm": 0.8547294512504904, + "learning_rate": 4.98602496190751e-06, + "loss": 0.0628, + "step": 563 + }, + { + "epoch": 1.00355871886121, + "grad_norm": 0.7027768071296912, + "learning_rate": 4.9720500329891755e-06, + "loss": 0.039, + "step": 564 + }, + { + "epoch": 1.0053380782918149, + "grad_norm": 0.9472521846024025, + "learning_rate": 4.9580753224183005e-06, + "loss": 0.0559, + "step": 565 + }, + { + "epoch": 1.00711743772242, + "grad_norm": 0.6979463772086726, + "learning_rate": 4.944100939366478e-06, + "loss": 0.0417, + "step": 566 + }, + { + "epoch": 1.008896797153025, + "grad_norm": 0.6600744135103447, + "learning_rate": 4.930126993002748e-06, + "loss": 0.0378, + "step": 567 + }, + { + "epoch": 1.01067615658363, + "grad_norm": 0.7943192303593823, + "learning_rate": 4.9161535924927375e-06, + "loss": 0.0453, + "step": 568 + }, + { + "epoch": 
1.0124555160142348, + "grad_norm": 0.8250567876182562, + "learning_rate": 4.90218084699781e-06, + "loss": 0.0424, + "step": 569 + }, + { + "epoch": 1.0142348754448398, + "grad_norm": 1.0521929319778944, + "learning_rate": 4.888208865674208e-06, + "loss": 0.0514, + "step": 570 + }, + { + "epoch": 1.0160142348754448, + "grad_norm": 0.8860257632064427, + "learning_rate": 4.874237757672209e-06, + "loss": 0.0591, + "step": 571 + }, + { + "epoch": 1.0177935943060499, + "grad_norm": 1.264726936124226, + "learning_rate": 4.8602676321352646e-06, + "loss": 0.0725, + "step": 572 + }, + { + "epoch": 1.019572953736655, + "grad_norm": 0.9515788781923418, + "learning_rate": 4.846298598199155e-06, + "loss": 0.048, + "step": 573 + }, + { + "epoch": 1.0213523131672597, + "grad_norm": 0.9037679408864575, + "learning_rate": 4.832330764991131e-06, + "loss": 0.0397, + "step": 574 + }, + { + "epoch": 1.0231316725978647, + "grad_norm": 1.0679481665835724, + "learning_rate": 4.81836424162906e-06, + "loss": 0.0549, + "step": 575 + }, + { + "epoch": 1.0249110320284698, + "grad_norm": 0.9017964978842363, + "learning_rate": 4.80439913722058e-06, + "loss": 0.039, + "step": 576 + }, + { + "epoch": 1.0266903914590748, + "grad_norm": 1.0862464474920541, + "learning_rate": 4.790435560862247e-06, + "loss": 0.042, + "step": 577 + }, + { + "epoch": 1.0284697508896796, + "grad_norm": 1.2170259844730513, + "learning_rate": 4.776473621638673e-06, + "loss": 0.0527, + "step": 578 + }, + { + "epoch": 1.0302491103202847, + "grad_norm": 1.143487399181918, + "learning_rate": 4.762513428621684e-06, + "loss": 0.0543, + "step": 579 + }, + { + "epoch": 1.0320284697508897, + "grad_norm": 1.0670331451405415, + "learning_rate": 4.748555090869464e-06, + "loss": 0.0483, + "step": 580 + }, + { + "epoch": 1.0338078291814947, + "grad_norm": 1.0920316345540422, + "learning_rate": 4.734598717425706e-06, + "loss": 0.0522, + "step": 581 + }, + { + "epoch": 1.0355871886120998, + "grad_norm": 1.198081617990545, + "learning_rate": 4.720644417318755e-06, + "loss": 0.0576, + "step": 582 + }, + { + "epoch": 1.0373665480427046, + "grad_norm": 1.5096660129964996, + "learning_rate": 4.70669229956076e-06, + "loss": 0.0563, + "step": 583 + }, + { + "epoch": 1.0391459074733096, + "grad_norm": 1.0003149054802263, + "learning_rate": 4.692742473146818e-06, + "loss": 0.0389, + "step": 584 + }, + { + "epoch": 1.0409252669039146, + "grad_norm": 1.1652024399424679, + "learning_rate": 4.678795047054131e-06, + "loss": 0.0418, + "step": 585 + }, + { + "epoch": 1.0427046263345197, + "grad_norm": 1.21728635385485, + "learning_rate": 4.664850130241146e-06, + "loss": 0.0362, + "step": 586 + }, + { + "epoch": 1.0444839857651245, + "grad_norm": 0.9163900719555346, + "learning_rate": 4.650907831646711e-06, + "loss": 0.0418, + "step": 587 + }, + { + "epoch": 1.0462633451957295, + "grad_norm": 1.0262119807351613, + "learning_rate": 4.636968260189214e-06, + "loss": 0.0455, + "step": 588 + }, + { + "epoch": 1.0480427046263345, + "grad_norm": 0.9971199994097065, + "learning_rate": 4.623031524765744e-06, + "loss": 0.0458, + "step": 589 + }, + { + "epoch": 1.0498220640569396, + "grad_norm": 0.9971575590605182, + "learning_rate": 4.609097734251231e-06, + "loss": 0.0482, + "step": 590 + }, + { + "epoch": 1.0516014234875444, + "grad_norm": 1.4272943496871313, + "learning_rate": 4.595166997497605e-06, + "loss": 0.0552, + "step": 591 + }, + { + "epoch": 1.0533807829181494, + "grad_norm": 1.0928724453101657, + "learning_rate": 4.58123942333293e-06, + "loss": 0.048, + "step": 592 + }, + { + 
"epoch": 1.0551601423487544, + "grad_norm": 0.8185600592969815, + "learning_rate": 4.567315120560573e-06, + "loss": 0.0328, + "step": 593 + }, + { + "epoch": 1.0569395017793595, + "grad_norm": 0.9298681360924248, + "learning_rate": 4.553394197958339e-06, + "loss": 0.036, + "step": 594 + }, + { + "epoch": 1.0587188612099645, + "grad_norm": 0.8798971670232553, + "learning_rate": 4.539476764277631e-06, + "loss": 0.0317, + "step": 595 + }, + { + "epoch": 1.0604982206405693, + "grad_norm": 0.9060028669002683, + "learning_rate": 4.525562928242592e-06, + "loss": 0.0399, + "step": 596 + }, + { + "epoch": 1.0622775800711743, + "grad_norm": 1.4072724773876861, + "learning_rate": 4.511652798549261e-06, + "loss": 0.0585, + "step": 597 + }, + { + "epoch": 1.0640569395017794, + "grad_norm": 1.0768155616192439, + "learning_rate": 4.497746483864725e-06, + "loss": 0.0517, + "step": 598 + }, + { + "epoch": 1.0658362989323844, + "grad_norm": 0.8546328966763028, + "learning_rate": 4.483844092826267e-06, + "loss": 0.0369, + "step": 599 + }, + { + "epoch": 1.0676156583629894, + "grad_norm": 0.8162459071101226, + "learning_rate": 4.469945734040516e-06, + "loss": 0.0396, + "step": 600 + }, + { + "epoch": 1.0676156583629894, + "eval_loss": 0.13828444480895996, + "eval_runtime": 7.1428, + "eval_samples_per_second": 6.44, + "eval_steps_per_second": 1.68, + "step": 600 + }, + { + "epoch": 1.0693950177935942, + "grad_norm": 0.9806946940979351, + "learning_rate": 4.456051516082603e-06, + "loss": 0.0423, + "step": 601 + }, + { + "epoch": 1.0711743772241993, + "grad_norm": 1.3793546329301072, + "learning_rate": 4.442161547495309e-06, + "loss": 0.0644, + "step": 602 + }, + { + "epoch": 1.0729537366548043, + "grad_norm": 1.2611862441529713, + "learning_rate": 4.42827593678822e-06, + "loss": 0.0542, + "step": 603 + }, + { + "epoch": 1.0747330960854093, + "grad_norm": 0.9902983507703184, + "learning_rate": 4.414394792436877e-06, + "loss": 0.0365, + "step": 604 + }, + { + "epoch": 1.0765124555160142, + "grad_norm": 0.9289547591588659, + "learning_rate": 4.400518222881931e-06, + "loss": 0.0421, + "step": 605 + }, + { + "epoch": 1.0782918149466192, + "grad_norm": 0.9463300231205996, + "learning_rate": 4.386646336528291e-06, + "loss": 0.043, + "step": 606 + }, + { + "epoch": 1.0800711743772242, + "grad_norm": 0.9259371009942426, + "learning_rate": 4.372779241744282e-06, + "loss": 0.039, + "step": 607 + }, + { + "epoch": 1.0818505338078293, + "grad_norm": 0.8516637033570938, + "learning_rate": 4.358917046860799e-06, + "loss": 0.0341, + "step": 608 + }, + { + "epoch": 1.083629893238434, + "grad_norm": 0.9071862571901869, + "learning_rate": 4.345059860170458e-06, + "loss": 0.0356, + "step": 609 + }, + { + "epoch": 1.085409252669039, + "grad_norm": 1.0635580735336114, + "learning_rate": 4.331207789926746e-06, + "loss": 0.0441, + "step": 610 + }, + { + "epoch": 1.0871886120996441, + "grad_norm": 0.8830506087183804, + "learning_rate": 4.317360944343184e-06, + "loss": 0.0391, + "step": 611 + }, + { + "epoch": 1.0889679715302492, + "grad_norm": 0.8557175692266639, + "learning_rate": 4.303519431592479e-06, + "loss": 0.0294, + "step": 612 + }, + { + "epoch": 1.0907473309608542, + "grad_norm": 1.2510738747884549, + "learning_rate": 4.289683359805673e-06, + "loss": 0.0626, + "step": 613 + }, + { + "epoch": 1.092526690391459, + "grad_norm": 1.1086025882199797, + "learning_rate": 4.275852837071309e-06, + "loss": 0.0386, + "step": 614 + }, + { + "epoch": 1.094306049822064, + "grad_norm": 0.8466625547422796, + "learning_rate": 
4.26202797143457e-06, + "loss": 0.0359, + "step": 615 + }, + { + "epoch": 1.096085409252669, + "grad_norm": 1.0320189897254877, + "learning_rate": 4.248208870896456e-06, + "loss": 0.0452, + "step": 616 + }, + { + "epoch": 1.097864768683274, + "grad_norm": 1.1139859554711828, + "learning_rate": 4.234395643412925e-06, + "loss": 0.0588, + "step": 617 + }, + { + "epoch": 1.099644128113879, + "grad_norm": 1.0833592021160374, + "learning_rate": 4.220588396894055e-06, + "loss": 0.0435, + "step": 618 + }, + { + "epoch": 1.101423487544484, + "grad_norm": 1.0747608563273376, + "learning_rate": 4.2067872392031965e-06, + "loss": 0.0439, + "step": 619 + }, + { + "epoch": 1.103202846975089, + "grad_norm": 1.21276524318406, + "learning_rate": 4.192992278156141e-06, + "loss": 0.0674, + "step": 620 + }, + { + "epoch": 1.104982206405694, + "grad_norm": 1.179304229566795, + "learning_rate": 4.179203621520262e-06, + "loss": 0.0538, + "step": 621 + }, + { + "epoch": 1.106761565836299, + "grad_norm": 0.99867215096944, + "learning_rate": 4.165421377013691e-06, + "loss": 0.0303, + "step": 622 + }, + { + "epoch": 1.1085409252669038, + "grad_norm": 0.92035257564448, + "learning_rate": 4.151645652304465e-06, + "loss": 0.0402, + "step": 623 + }, + { + "epoch": 1.1103202846975089, + "grad_norm": 1.0410983710015873, + "learning_rate": 4.137876555009684e-06, + "loss": 0.0493, + "step": 624 + }, + { + "epoch": 1.112099644128114, + "grad_norm": 1.1209427859065353, + "learning_rate": 4.124114192694676e-06, + "loss": 0.0457, + "step": 625 + }, + { + "epoch": 1.113879003558719, + "grad_norm": 0.8743076296872143, + "learning_rate": 4.110358672872158e-06, + "loss": 0.0422, + "step": 626 + }, + { + "epoch": 1.1156583629893237, + "grad_norm": 1.0728261185889085, + "learning_rate": 4.0966101030013915e-06, + "loss": 0.0476, + "step": 627 + }, + { + "epoch": 1.1174377224199288, + "grad_norm": 0.84062355751606, + "learning_rate": 4.082868590487339e-06, + "loss": 0.0373, + "step": 628 + }, + { + "epoch": 1.1192170818505338, + "grad_norm": 0.6922337708933334, + "learning_rate": 4.069134242679837e-06, + "loss": 0.0303, + "step": 629 + }, + { + "epoch": 1.1209964412811388, + "grad_norm": 1.094838178393759, + "learning_rate": 4.055407166872748e-06, + "loss": 0.0463, + "step": 630 + }, + { + "epoch": 1.1227758007117439, + "grad_norm": 0.9729388062399839, + "learning_rate": 4.041687470303127e-06, + "loss": 0.0427, + "step": 631 + }, + { + "epoch": 1.1245551601423487, + "grad_norm": 0.965017927999121, + "learning_rate": 4.02797526015038e-06, + "loss": 0.0502, + "step": 632 + }, + { + "epoch": 1.1263345195729537, + "grad_norm": 0.986790501677302, + "learning_rate": 4.014270643535427e-06, + "loss": 0.0407, + "step": 633 + }, + { + "epoch": 1.1281138790035588, + "grad_norm": 1.2417735052456353, + "learning_rate": 4.000573727519868e-06, + "loss": 0.0669, + "step": 634 + }, + { + "epoch": 1.1298932384341638, + "grad_norm": 0.9006479956694917, + "learning_rate": 3.9868846191051465e-06, + "loss": 0.0372, + "step": 635 + }, + { + "epoch": 1.1316725978647686, + "grad_norm": 1.1357157682949497, + "learning_rate": 3.973203425231715e-06, + "loss": 0.0542, + "step": 636 + }, + { + "epoch": 1.1334519572953736, + "grad_norm": 0.9210806832468151, + "learning_rate": 3.959530252778187e-06, + "loss": 0.0504, + "step": 637 + }, + { + "epoch": 1.1352313167259787, + "grad_norm": 1.2020387615526031, + "learning_rate": 3.945865208560522e-06, + "loss": 0.0668, + "step": 638 + }, + { + "epoch": 1.1370106761565837, + "grad_norm": 1.150306180182193, + "learning_rate": 
3.932208399331177e-06, + "loss": 0.0527, + "step": 639 + }, + { + "epoch": 1.1387900355871885, + "grad_norm": 1.2018719772753494, + "learning_rate": 3.918559931778277e-06, + "loss": 0.0645, + "step": 640 + }, + { + "epoch": 1.1405693950177935, + "grad_norm": 0.8606898186114491, + "learning_rate": 3.904919912524784e-06, + "loss": 0.0334, + "step": 641 + }, + { + "epoch": 1.1423487544483986, + "grad_norm": 0.9525507385343362, + "learning_rate": 3.891288448127654e-06, + "loss": 0.0453, + "step": 642 + }, + { + "epoch": 1.1441281138790036, + "grad_norm": 1.001256472466121, + "learning_rate": 3.877665645077017e-06, + "loss": 0.0472, + "step": 643 + }, + { + "epoch": 1.1459074733096086, + "grad_norm": 1.183003789633624, + "learning_rate": 3.86405160979534e-06, + "loss": 0.0519, + "step": 644 + }, + { + "epoch": 1.1476868327402134, + "grad_norm": 0.8695848757571557, + "learning_rate": 3.850446448636597e-06, + "loss": 0.0422, + "step": 645 + }, + { + "epoch": 1.1494661921708185, + "grad_norm": 0.7483879064998394, + "learning_rate": 3.8368502678854296e-06, + "loss": 0.0339, + "step": 646 + }, + { + "epoch": 1.1512455516014235, + "grad_norm": 0.9406953847372124, + "learning_rate": 3.8232631737563306e-06, + "loss": 0.0451, + "step": 647 + }, + { + "epoch": 1.1530249110320285, + "grad_norm": 1.103408418773824, + "learning_rate": 3.809685272392804e-06, + "loss": 0.0431, + "step": 648 + }, + { + "epoch": 1.1548042704626336, + "grad_norm": 1.1014544973242129, + "learning_rate": 3.796116669866543e-06, + "loss": 0.0497, + "step": 649 + }, + { + "epoch": 1.1565836298932384, + "grad_norm": 0.926236874246472, + "learning_rate": 3.78255747217659e-06, + "loss": 0.0457, + "step": 650 + }, + { + "epoch": 1.1583629893238434, + "grad_norm": 0.9874886298329539, + "learning_rate": 3.769007785248523e-06, + "loss": 0.042, + "step": 651 + }, + { + "epoch": 1.1601423487544484, + "grad_norm": 0.9433772642686659, + "learning_rate": 3.7554677149336186e-06, + "loss": 0.0414, + "step": 652 + }, + { + "epoch": 1.1619217081850535, + "grad_norm": 1.0777671762969125, + "learning_rate": 3.7419373670080284e-06, + "loss": 0.0476, + "step": 653 + }, + { + "epoch": 1.1637010676156583, + "grad_norm": 1.1304402592387923, + "learning_rate": 3.7284168471719527e-06, + "loss": 0.0529, + "step": 654 + }, + { + "epoch": 1.1654804270462633, + "grad_norm": 1.0441551907434734, + "learning_rate": 3.7149062610488085e-06, + "loss": 0.0522, + "step": 655 + }, + { + "epoch": 1.1672597864768683, + "grad_norm": 0.9601081313393668, + "learning_rate": 3.701405714184416e-06, + "loss": 0.0461, + "step": 656 + }, + { + "epoch": 1.1690391459074734, + "grad_norm": 0.9071924144253114, + "learning_rate": 3.687915312046166e-06, + "loss": 0.0369, + "step": 657 + }, + { + "epoch": 1.1708185053380782, + "grad_norm": 1.0181711239434297, + "learning_rate": 3.6744351600221994e-06, + "loss": 0.0338, + "step": 658 + }, + { + "epoch": 1.1725978647686832, + "grad_norm": 1.1482871630180345, + "learning_rate": 3.6609653634205773e-06, + "loss": 0.0595, + "step": 659 + }, + { + "epoch": 1.1743772241992882, + "grad_norm": 1.1824425297449868, + "learning_rate": 3.647506027468467e-06, + "loss": 0.0436, + "step": 660 + }, + { + "epoch": 1.1761565836298933, + "grad_norm": 0.8846905635199142, + "learning_rate": 3.6340572573113176e-06, + "loss": 0.0365, + "step": 661 + }, + { + "epoch": 1.1779359430604983, + "grad_norm": 1.0178369029720964, + "learning_rate": 3.6206191580120346e-06, + "loss": 0.0489, + "step": 662 + }, + { + "epoch": 1.1797153024911031, + "grad_norm": 
1.190092748578404, + "learning_rate": 3.6071918345501655e-06, + "loss": 0.0502, + "step": 663 + }, + { + "epoch": 1.1814946619217082, + "grad_norm": 0.8837883568681317, + "learning_rate": 3.5937753918210705e-06, + "loss": 0.0359, + "step": 664 + }, + { + "epoch": 1.1832740213523132, + "grad_norm": 0.925819103061139, + "learning_rate": 3.5803699346351117e-06, + "loss": 0.0365, + "step": 665 + }, + { + "epoch": 1.1850533807829182, + "grad_norm": 0.7996611763224647, + "learning_rate": 3.566975567716833e-06, + "loss": 0.0303, + "step": 666 + }, + { + "epoch": 1.1868327402135233, + "grad_norm": 1.0921199636245027, + "learning_rate": 3.5535923957041374e-06, + "loss": 0.0571, + "step": 667 + }, + { + "epoch": 1.188612099644128, + "grad_norm": 0.8005202972903217, + "learning_rate": 3.540220523147474e-06, + "loss": 0.0395, + "step": 668 + }, + { + "epoch": 1.190391459074733, + "grad_norm": 1.0764132328807485, + "learning_rate": 3.5268600545090183e-06, + "loss": 0.0396, + "step": 669 + }, + { + "epoch": 1.1921708185053381, + "grad_norm": 0.976616916449195, + "learning_rate": 3.513511094161858e-06, + "loss": 0.044, + "step": 670 + }, + { + "epoch": 1.193950177935943, + "grad_norm": 1.2329641087482588, + "learning_rate": 3.5001737463891793e-06, + "loss": 0.0458, + "step": 671 + }, + { + "epoch": 1.195729537366548, + "grad_norm": 0.9695655624227043, + "learning_rate": 3.4868481153834454e-06, + "loss": 0.0416, + "step": 672 + }, + { + "epoch": 1.197508896797153, + "grad_norm": 1.0587017444949576, + "learning_rate": 3.4735343052455905e-06, + "loss": 0.0432, + "step": 673 + }, + { + "epoch": 1.199288256227758, + "grad_norm": 1.1377978824885697, + "learning_rate": 3.4602324199842026e-06, + "loss": 0.0411, + "step": 674 + }, + { + "epoch": 1.201067615658363, + "grad_norm": 1.0649133398233424, + "learning_rate": 3.446942563514711e-06, + "loss": 0.043, + "step": 675 + }, + { + "epoch": 1.2028469750889679, + "grad_norm": 1.2154917888765664, + "learning_rate": 3.4336648396585777e-06, + "loss": 0.0422, + "step": 676 + }, + { + "epoch": 1.204626334519573, + "grad_norm": 1.0427405390699247, + "learning_rate": 3.4203993521424774e-06, + "loss": 0.0536, + "step": 677 + }, + { + "epoch": 1.206405693950178, + "grad_norm": 0.8919462337795426, + "learning_rate": 3.407146204597499e-06, + "loss": 0.0363, + "step": 678 + }, + { + "epoch": 1.208185053380783, + "grad_norm": 0.92588445633896, + "learning_rate": 3.3939055005583305e-06, + "loss": 0.0393, + "step": 679 + }, + { + "epoch": 1.209964412811388, + "grad_norm": 0.9646545633516691, + "learning_rate": 3.3806773434624475e-06, + "loss": 0.0438, + "step": 680 + }, + { + "epoch": 1.2117437722419928, + "grad_norm": 1.1776655307293824, + "learning_rate": 3.3674618366493117e-06, + "loss": 0.0534, + "step": 681 + }, + { + "epoch": 1.2135231316725978, + "grad_norm": 0.9296310818357466, + "learning_rate": 3.3542590833595533e-06, + "loss": 0.0414, + "step": 682 + }, + { + "epoch": 1.2153024911032029, + "grad_norm": 0.9130754996133634, + "learning_rate": 3.341069186734176e-06, + "loss": 0.0366, + "step": 683 + }, + { + "epoch": 1.217081850533808, + "grad_norm": 0.8398374744131705, + "learning_rate": 3.3278922498137455e-06, + "loss": 0.0408, + "step": 684 + }, + { + "epoch": 1.2188612099644127, + "grad_norm": 0.8194889930167317, + "learning_rate": 3.314728375537587e-06, + "loss": 0.0343, + "step": 685 + }, + { + "epoch": 1.2206405693950177, + "grad_norm": 1.0568307723228527, + "learning_rate": 3.3015776667429724e-06, + "loss": 0.0533, + "step": 686 + }, + { + "epoch": 
1.2224199288256228, + "grad_norm": 0.846464404831591, + "learning_rate": 3.2884402261643296e-06, + "loss": 0.0317, + "step": 687 + }, + { + "epoch": 1.2241992882562278, + "grad_norm": 1.0612818751427298, + "learning_rate": 3.2753161564324344e-06, + "loss": 0.051, + "step": 688 + }, + { + "epoch": 1.2259786476868326, + "grad_norm": 1.0921889784490464, + "learning_rate": 3.262205560073605e-06, + "loss": 0.0315, + "step": 689 + }, + { + "epoch": 1.2277580071174377, + "grad_norm": 0.7877871308400063, + "learning_rate": 3.249108539508909e-06, + "loss": 0.0281, + "step": 690 + }, + { + "epoch": 1.2295373665480427, + "grad_norm": 1.2396290130990737, + "learning_rate": 3.2360251970533527e-06, + "loss": 0.055, + "step": 691 + }, + { + "epoch": 1.2313167259786477, + "grad_norm": 1.0214709327506637, + "learning_rate": 3.2229556349150947e-06, + "loss": 0.0492, + "step": 692 + }, + { + "epoch": 1.2330960854092528, + "grad_norm": 0.8568501040637228, + "learning_rate": 3.2098999551946337e-06, + "loss": 0.0337, + "step": 693 + }, + { + "epoch": 1.2348754448398576, + "grad_norm": 1.2308619987987102, + "learning_rate": 3.1968582598840234e-06, + "loss": 0.0532, + "step": 694 + }, + { + "epoch": 1.2366548042704626, + "grad_norm": 1.087501995164755, + "learning_rate": 3.183830650866068e-06, + "loss": 0.0381, + "step": 695 + }, + { + "epoch": 1.2384341637010676, + "grad_norm": 0.8919790084939676, + "learning_rate": 3.1708172299135266e-06, + "loss": 0.0376, + "step": 696 + }, + { + "epoch": 1.2402135231316727, + "grad_norm": 1.4264095905198482, + "learning_rate": 3.1578180986883234e-06, + "loss": 0.062, + "step": 697 + }, + { + "epoch": 1.2419928825622777, + "grad_norm": 1.1571895955783398, + "learning_rate": 3.1448333587407486e-06, + "loss": 0.0563, + "step": 698 + }, + { + "epoch": 1.2437722419928825, + "grad_norm": 0.9980157300161984, + "learning_rate": 3.131863111508667e-06, + "loss": 0.0454, + "step": 699 + }, + { + "epoch": 1.2455516014234875, + "grad_norm": 0.9157461897439566, + "learning_rate": 3.118907458316722e-06, + "loss": 0.0369, + "step": 700 + }, + { + "epoch": 1.2473309608540926, + "grad_norm": 1.0326915297964894, + "learning_rate": 3.105966500375551e-06, + "loss": 0.0347, + "step": 701 + }, + { + "epoch": 1.2491103202846976, + "grad_norm": 1.0895553545223995, + "learning_rate": 3.0930403387809892e-06, + "loss": 0.0483, + "step": 702 + }, + { + "epoch": 1.2508896797153026, + "grad_norm": 1.2156705365027338, + "learning_rate": 3.080129074513285e-06, + "loss": 0.0558, + "step": 703 + }, + { + "epoch": 1.2526690391459074, + "grad_norm": 1.4083175940812185, + "learning_rate": 3.067232808436299e-06, + "loss": 0.058, + "step": 704 + }, + { + "epoch": 1.2544483985765125, + "grad_norm": 0.8911382799239523, + "learning_rate": 3.0543516412967327e-06, + "loss": 0.037, + "step": 705 + }, + { + "epoch": 1.2562277580071175, + "grad_norm": 0.7617173335791619, + "learning_rate": 3.041485673723331e-06, + "loss": 0.0286, + "step": 706 + }, + { + "epoch": 1.2580071174377223, + "grad_norm": 0.9580614242960381, + "learning_rate": 3.0286350062261017e-06, + "loss": 0.0381, + "step": 707 + }, + { + "epoch": 1.2597864768683273, + "grad_norm": 0.9048540935872091, + "learning_rate": 3.0157997391955172e-06, + "loss": 0.0378, + "step": 708 + }, + { + "epoch": 1.2615658362989324, + "grad_norm": 1.2822234133916406, + "learning_rate": 3.0029799729017518e-06, + "loss": 0.0572, + "step": 709 + }, + { + "epoch": 1.2633451957295374, + "grad_norm": 0.8668230724849544, + "learning_rate": 2.9901758074938797e-06, + "loss": 0.0404, + 
"step": 710 + }, + { + "epoch": 1.2651245551601424, + "grad_norm": 0.902622064775602, + "learning_rate": 2.977387342999103e-06, + "loss": 0.0343, + "step": 711 + }, + { + "epoch": 1.2669039145907472, + "grad_norm": 1.3487596797513532, + "learning_rate": 2.964614679321966e-06, + "loss": 0.0511, + "step": 712 + }, + { + "epoch": 1.2686832740213523, + "grad_norm": 0.9802953443926045, + "learning_rate": 2.951857916243574e-06, + "loss": 0.0431, + "step": 713 + }, + { + "epoch": 1.2704626334519573, + "grad_norm": 1.4332880642967385, + "learning_rate": 2.9391171534208185e-06, + "loss": 0.0544, + "step": 714 + }, + { + "epoch": 1.2722419928825623, + "grad_norm": 1.1081952360257095, + "learning_rate": 2.9263924903855932e-06, + "loss": 0.0416, + "step": 715 + }, + { + "epoch": 1.2740213523131674, + "grad_norm": 1.0333605663846648, + "learning_rate": 2.9136840265440213e-06, + "loss": 0.0469, + "step": 716 + }, + { + "epoch": 1.2758007117437722, + "grad_norm": 1.0641882073169253, + "learning_rate": 2.9009918611756732e-06, + "loss": 0.0448, + "step": 717 + }, + { + "epoch": 1.2775800711743772, + "grad_norm": 0.8806910046198404, + "learning_rate": 2.8883160934327968e-06, + "loss": 0.0392, + "step": 718 + }, + { + "epoch": 1.2793594306049823, + "grad_norm": 1.3217512808066976, + "learning_rate": 2.8756568223395396e-06, + "loss": 0.0525, + "step": 719 + }, + { + "epoch": 1.281138790035587, + "grad_norm": 1.1066401990396189, + "learning_rate": 2.8630141467911777e-06, + "loss": 0.0469, + "step": 720 + }, + { + "epoch": 1.282918149466192, + "grad_norm": 0.8964680090502098, + "learning_rate": 2.8503881655533395e-06, + "loss": 0.0301, + "step": 721 + }, + { + "epoch": 1.2846975088967971, + "grad_norm": 0.9790179316595002, + "learning_rate": 2.837778977261235e-06, + "loss": 0.0373, + "step": 722 + }, + { + "epoch": 1.2864768683274022, + "grad_norm": 1.0244740989723553, + "learning_rate": 2.8251866804188875e-06, + "loss": 0.0413, + "step": 723 + }, + { + "epoch": 1.2882562277580072, + "grad_norm": 0.907577382814479, + "learning_rate": 2.812611373398365e-06, + "loss": 0.0379, + "step": 724 + }, + { + "epoch": 1.290035587188612, + "grad_norm": 0.9877905229412578, + "learning_rate": 2.8000531544390064e-06, + "loss": 0.0365, + "step": 725 + }, + { + "epoch": 1.291814946619217, + "grad_norm": 0.9382307500323435, + "learning_rate": 2.7875121216466595e-06, + "loss": 0.038, + "step": 726 + }, + { + "epoch": 1.293594306049822, + "grad_norm": 1.2668204222978174, + "learning_rate": 2.7749883729929105e-06, + "loss": 0.053, + "step": 727 + }, + { + "epoch": 1.295373665480427, + "grad_norm": 0.8999055571713811, + "learning_rate": 2.762482006314324e-06, + "loss": 0.0384, + "step": 728 + }, + { + "epoch": 1.2971530249110321, + "grad_norm": 1.0154187759025894, + "learning_rate": 2.7499931193116692e-06, + "loss": 0.0327, + "step": 729 + }, + { + "epoch": 1.298932384341637, + "grad_norm": 1.0802920648004704, + "learning_rate": 2.737521809549167e-06, + "loss": 0.0417, + "step": 730 + }, + { + "epoch": 1.300711743772242, + "grad_norm": 1.0066519304730417, + "learning_rate": 2.725068174453722e-06, + "loss": 0.0482, + "step": 731 + }, + { + "epoch": 1.302491103202847, + "grad_norm": 1.0410669256725513, + "learning_rate": 2.712632311314165e-06, + "loss": 0.0386, + "step": 732 + }, + { + "epoch": 1.304270462633452, + "grad_norm": 1.1234658089663005, + "learning_rate": 2.7002143172804875e-06, + "loss": 0.043, + "step": 733 + }, + { + "epoch": 1.306049822064057, + "grad_norm": 1.192568202231757, + "learning_rate": 2.6878142893630904e-06, 
+ "loss": 0.0457, + "step": 734 + }, + { + "epoch": 1.3078291814946619, + "grad_norm": 0.9984922757453483, + "learning_rate": 2.6754323244320154e-06, + "loss": 0.0434, + "step": 735 + }, + { + "epoch": 1.309608540925267, + "grad_norm": 1.0029752401431848, + "learning_rate": 2.6630685192161995e-06, + "loss": 0.0351, + "step": 736 + }, + { + "epoch": 1.311387900355872, + "grad_norm": 1.039717365436853, + "learning_rate": 2.650722970302714e-06, + "loss": 0.0424, + "step": 737 + }, + { + "epoch": 1.3131672597864767, + "grad_norm": 1.3580396164284314, + "learning_rate": 2.638395774136009e-06, + "loss": 0.0567, + "step": 738 + }, + { + "epoch": 1.3149466192170818, + "grad_norm": 0.9444133237600162, + "learning_rate": 2.6260870270171645e-06, + "loss": 0.0357, + "step": 739 + }, + { + "epoch": 1.3167259786476868, + "grad_norm": 1.0580934161350533, + "learning_rate": 2.613796825103129e-06, + "loss": 0.0414, + "step": 740 + }, + { + "epoch": 1.3185053380782918, + "grad_norm": 0.9834182092745584, + "learning_rate": 2.60152526440598e-06, + "loss": 0.0387, + "step": 741 + }, + { + "epoch": 1.3202846975088969, + "grad_norm": 0.7616732359159014, + "learning_rate": 2.5892724407921667e-06, + "loss": 0.0289, + "step": 742 + }, + { + "epoch": 1.3220640569395017, + "grad_norm": 1.3915238757419404, + "learning_rate": 2.577038449981763e-06, + "loss": 0.0611, + "step": 743 + }, + { + "epoch": 1.3238434163701067, + "grad_norm": 1.2173657729432406, + "learning_rate": 2.564823387547716e-06, + "loss": 0.0425, + "step": 744 + }, + { + "epoch": 1.3256227758007118, + "grad_norm": 0.9135314344508192, + "learning_rate": 2.552627348915106e-06, + "loss": 0.0371, + "step": 745 + }, + { + "epoch": 1.3274021352313168, + "grad_norm": 1.0893426987998907, + "learning_rate": 2.5404504293603983e-06, + "loss": 0.0505, + "step": 746 + }, + { + "epoch": 1.3291814946619218, + "grad_norm": 1.2235742895097919, + "learning_rate": 2.528292724010697e-06, + "loss": 0.0546, + "step": 747 + }, + { + "epoch": 1.3309608540925266, + "grad_norm": 0.963631042620569, + "learning_rate": 2.5161543278430055e-06, + "loss": 0.0416, + "step": 748 + }, + { + "epoch": 1.3327402135231317, + "grad_norm": 1.2217909377948624, + "learning_rate": 2.5040353356834756e-06, + "loss": 0.0432, + "step": 749 + }, + { + "epoch": 1.3345195729537367, + "grad_norm": 1.049568511067747, + "learning_rate": 2.4919358422066816e-06, + "loss": 0.0416, + "step": 750 + }, + { + "epoch": 1.3362989323843417, + "grad_norm": 0.912111020602681, + "learning_rate": 2.4798559419348672e-06, + "loss": 0.0378, + "step": 751 + }, + { + "epoch": 1.3380782918149468, + "grad_norm": 0.8880929267694146, + "learning_rate": 2.4677957292372166e-06, + "loss": 0.0352, + "step": 752 + }, + { + "epoch": 1.3398576512455516, + "grad_norm": 0.9824476908080295, + "learning_rate": 2.455755298329107e-06, + "loss": 0.0367, + "step": 753 + }, + { + "epoch": 1.3416370106761566, + "grad_norm": 1.021419477068853, + "learning_rate": 2.4437347432713838e-06, + "loss": 0.0428, + "step": 754 + }, + { + "epoch": 1.3434163701067616, + "grad_norm": 1.1022589426865288, + "learning_rate": 2.431734157969619e-06, + "loss": 0.0395, + "step": 755 + }, + { + "epoch": 1.3451957295373664, + "grad_norm": 1.0934549678996628, + "learning_rate": 2.4197536361733792e-06, + "loss": 0.0477, + "step": 756 + }, + { + "epoch": 1.3469750889679715, + "grad_norm": 0.827138815781088, + "learning_rate": 2.407793271475495e-06, + "loss": 0.0304, + "step": 757 + }, + { + "epoch": 1.3487544483985765, + "grad_norm": 0.8432510777831591, + 
"learning_rate": 2.3958531573113223e-06, + "loss": 0.037, + "step": 758 + }, + { + "epoch": 1.3505338078291815, + "grad_norm": 1.0216803053267136, + "learning_rate": 2.3839333869580243e-06, + "loss": 0.0439, + "step": 759 + }, + { + "epoch": 1.3523131672597866, + "grad_norm": 0.9332421396895141, + "learning_rate": 2.372034053533835e-06, + "loss": 0.0368, + "step": 760 + }, + { + "epoch": 1.3540925266903914, + "grad_norm": 0.7104040014108932, + "learning_rate": 2.360155249997334e-06, + "loss": 0.0297, + "step": 761 + }, + { + "epoch": 1.3558718861209964, + "grad_norm": 0.6773597226062513, + "learning_rate": 2.348297069146715e-06, + "loss": 0.0245, + "step": 762 + }, + { + "epoch": 1.3576512455516014, + "grad_norm": 0.7243261986365364, + "learning_rate": 2.3364596036190706e-06, + "loss": 0.0285, + "step": 763 + }, + { + "epoch": 1.3594306049822065, + "grad_norm": 1.0093488845674181, + "learning_rate": 2.3246429458896637e-06, + "loss": 0.0378, + "step": 764 + }, + { + "epoch": 1.3612099644128115, + "grad_norm": 0.9584656799973454, + "learning_rate": 2.312847188271203e-06, + "loss": 0.0422, + "step": 765 + }, + { + "epoch": 1.3629893238434163, + "grad_norm": 0.7865628537475263, + "learning_rate": 2.301072422913123e-06, + "loss": 0.0225, + "step": 766 + }, + { + "epoch": 1.3647686832740213, + "grad_norm": 1.0060606728391541, + "learning_rate": 2.2893187418008666e-06, + "loss": 0.0384, + "step": 767 + }, + { + "epoch": 1.3665480427046264, + "grad_norm": 1.0806683796823793, + "learning_rate": 2.2775862367551642e-06, + "loss": 0.0451, + "step": 768 + }, + { + "epoch": 1.3683274021352312, + "grad_norm": 1.055751788962701, + "learning_rate": 2.265874999431318e-06, + "loss": 0.0472, + "step": 769 + }, + { + "epoch": 1.3701067615658362, + "grad_norm": 1.1232736822675, + "learning_rate": 2.254185121318484e-06, + "loss": 0.0355, + "step": 770 + }, + { + "epoch": 1.3718861209964412, + "grad_norm": 1.0562010911911415, + "learning_rate": 2.2425166937389596e-06, + "loss": 0.0405, + "step": 771 + }, + { + "epoch": 1.3736654804270463, + "grad_norm": 0.825143680854267, + "learning_rate": 2.2308698078474645e-06, + "loss": 0.0349, + "step": 772 + }, + { + "epoch": 1.3754448398576513, + "grad_norm": 1.001912955792229, + "learning_rate": 2.219244554630438e-06, + "loss": 0.0477, + "step": 773 + }, + { + "epoch": 1.3772241992882561, + "grad_norm": 0.7736758012090639, + "learning_rate": 2.207641024905322e-06, + "loss": 0.0288, + "step": 774 + }, + { + "epoch": 1.3790035587188612, + "grad_norm": 1.1162837602545816, + "learning_rate": 2.1960593093198508e-06, + "loss": 0.0316, + "step": 775 + }, + { + "epoch": 1.3807829181494662, + "grad_norm": 1.0237380141940946, + "learning_rate": 2.184499498351347e-06, + "loss": 0.0421, + "step": 776 + }, + { + "epoch": 1.3825622775800712, + "grad_norm": 0.9455947117365314, + "learning_rate": 2.172961682306011e-06, + "loss": 0.0399, + "step": 777 + }, + { + "epoch": 1.3843416370106763, + "grad_norm": 0.8891975255037335, + "learning_rate": 2.1614459513182173e-06, + "loss": 0.0341, + "step": 778 + }, + { + "epoch": 1.386120996441281, + "grad_norm": 0.8854067913254945, + "learning_rate": 2.149952395349813e-06, + "loss": 0.0296, + "step": 779 + }, + { + "epoch": 1.387900355871886, + "grad_norm": 1.1944275799619644, + "learning_rate": 2.1384811041894055e-06, + "loss": 0.045, + "step": 780 + }, + { + "epoch": 1.3896797153024911, + "grad_norm": 1.0432671686625763, + "learning_rate": 2.1270321674516736e-06, + "loss": 0.0467, + "step": 781 + }, + { + "epoch": 1.3914590747330962, + 
"grad_norm": 1.051590997554103, + "learning_rate": 2.1156056745766593e-06, + "loss": 0.0393, + "step": 782 + }, + { + "epoch": 1.3932384341637012, + "grad_norm": 0.7769767413327008, + "learning_rate": 2.104201714829074e-06, + "loss": 0.0351, + "step": 783 + }, + { + "epoch": 1.395017793594306, + "grad_norm": 0.9830455602830936, + "learning_rate": 2.0928203772975917e-06, + "loss": 0.0457, + "step": 784 + }, + { + "epoch": 1.396797153024911, + "grad_norm": 1.3957545926088177, + "learning_rate": 2.081461750894166e-06, + "loss": 0.0477, + "step": 785 + }, + { + "epoch": 1.398576512455516, + "grad_norm": 1.1761039690624113, + "learning_rate": 2.070125924353328e-06, + "loss": 0.0521, + "step": 786 + }, + { + "epoch": 1.4003558718861209, + "grad_norm": 1.110685537230009, + "learning_rate": 2.058812986231493e-06, + "loss": 0.052, + "step": 787 + }, + { + "epoch": 1.402135231316726, + "grad_norm": 1.2173352503367165, + "learning_rate": 2.0475230249062727e-06, + "loss": 0.0595, + "step": 788 + }, + { + "epoch": 1.403914590747331, + "grad_norm": 1.0171099640987131, + "learning_rate": 2.0362561285757766e-06, + "loss": 0.0409, + "step": 789 + }, + { + "epoch": 1.405693950177936, + "grad_norm": 1.0659961619337683, + "learning_rate": 2.0250123852579347e-06, + "loss": 0.0463, + "step": 790 + }, + { + "epoch": 1.407473309608541, + "grad_norm": 0.8217925051573368, + "learning_rate": 2.013791882789801e-06, + "loss": 0.0331, + "step": 791 + }, + { + "epoch": 1.4092526690391458, + "grad_norm": 1.0192070289515296, + "learning_rate": 2.0025947088268714e-06, + "loss": 0.0329, + "step": 792 + }, + { + "epoch": 1.4110320284697508, + "grad_norm": 0.8674257228315672, + "learning_rate": 1.9914209508423943e-06, + "loss": 0.0382, + "step": 793 + }, + { + "epoch": 1.4128113879003559, + "grad_norm": 1.039394594603558, + "learning_rate": 1.9802706961266936e-06, + "loss": 0.0514, + "step": 794 + }, + { + "epoch": 1.414590747330961, + "grad_norm": 1.2422673645671691, + "learning_rate": 1.969144031786483e-06, + "loss": 0.0509, + "step": 795 + }, + { + "epoch": 1.416370106761566, + "grad_norm": 0.9834861700370274, + "learning_rate": 1.958041044744186e-06, + "loss": 0.0478, + "step": 796 + }, + { + "epoch": 1.4181494661921707, + "grad_norm": 0.8072332504815498, + "learning_rate": 1.94696182173726e-06, + "loss": 0.0272, + "step": 797 + }, + { + "epoch": 1.4199288256227758, + "grad_norm": 1.041888748607498, + "learning_rate": 1.9359064493175077e-06, + "loss": 0.0461, + "step": 798 + }, + { + "epoch": 1.4217081850533808, + "grad_norm": 0.8547473536647248, + "learning_rate": 1.9248750138504176e-06, + "loss": 0.0463, + "step": 799 + }, + { + "epoch": 1.4234875444839858, + "grad_norm": 0.9160844863052742, + "learning_rate": 1.9138676015144765e-06, + "loss": 0.033, + "step": 800 + }, + { + "epoch": 1.4234875444839858, + "eval_loss": 0.13344429433345795, + "eval_runtime": 7.1527, + "eval_samples_per_second": 6.431, + "eval_steps_per_second": 1.678, + "step": 800 + }, + { + "epoch": 1.4252669039145909, + "grad_norm": 1.1728004609011466, + "learning_rate": 1.9028842983005036e-06, + "loss": 0.0444, + "step": 801 + }, + { + "epoch": 1.4270462633451957, + "grad_norm": 0.9182230252406717, + "learning_rate": 1.8919251900109697e-06, + "loss": 0.0406, + "step": 802 + }, + { + "epoch": 1.4288256227758007, + "grad_norm": 0.7740835233990916, + "learning_rate": 1.8809903622593395e-06, + "loss": 0.0288, + "step": 803 + }, + { + "epoch": 1.4306049822064058, + "grad_norm": 0.8681372416415234, + "learning_rate": 1.870079900469392e-06, + "loss": 0.036, 
+ "step": 804 + }, + { + "epoch": 1.4323843416370106, + "grad_norm": 0.9510296033757211, + "learning_rate": 1.8591938898745593e-06, + "loss": 0.0378, + "step": 805 + }, + { + "epoch": 1.4341637010676156, + "grad_norm": 0.8671006576919247, + "learning_rate": 1.8483324155172594e-06, + "loss": 0.0359, + "step": 806 + }, + { + "epoch": 1.4359430604982206, + "grad_norm": 0.7456102665101375, + "learning_rate": 1.837495562248226e-06, + "loss": 0.0308, + "step": 807 + }, + { + "epoch": 1.4377224199288257, + "grad_norm": 0.9769160563014989, + "learning_rate": 1.8266834147258577e-06, + "loss": 0.0417, + "step": 808 + }, + { + "epoch": 1.4395017793594307, + "grad_norm": 0.9173281560783461, + "learning_rate": 1.8158960574155455e-06, + "loss": 0.038, + "step": 809 + }, + { + "epoch": 1.4412811387900355, + "grad_norm": 0.9096880827995866, + "learning_rate": 1.8051335745890196e-06, + "loss": 0.0355, + "step": 810 + }, + { + "epoch": 1.4430604982206405, + "grad_norm": 0.9740778727636218, + "learning_rate": 1.7943960503236856e-06, + "loss": 0.0502, + "step": 811 + }, + { + "epoch": 1.4448398576512456, + "grad_norm": 0.7435132732373217, + "learning_rate": 1.7836835685019732e-06, + "loss": 0.0307, + "step": 812 + }, + { + "epoch": 1.4466192170818506, + "grad_norm": 0.9787109046764307, + "learning_rate": 1.7729962128106787e-06, + "loss": 0.0319, + "step": 813 + }, + { + "epoch": 1.4483985765124556, + "grad_norm": 0.8743242569858206, + "learning_rate": 1.7623340667403089e-06, + "loss": 0.035, + "step": 814 + }, + { + "epoch": 1.4501779359430604, + "grad_norm": 1.218953043581514, + "learning_rate": 1.7516972135844352e-06, + "loss": 0.0562, + "step": 815 + }, + { + "epoch": 1.4519572953736655, + "grad_norm": 0.9847396476889714, + "learning_rate": 1.741085736439031e-06, + "loss": 0.0348, + "step": 816 + }, + { + "epoch": 1.4537366548042705, + "grad_norm": 1.0415611109464662, + "learning_rate": 1.730499718201838e-06, + "loss": 0.0366, + "step": 817 + }, + { + "epoch": 1.4555160142348753, + "grad_norm": 1.0787375133207857, + "learning_rate": 1.7199392415717064e-06, + "loss": 0.0404, + "step": 818 + }, + { + "epoch": 1.4572953736654806, + "grad_norm": 0.899906940151615, + "learning_rate": 1.7094043890479557e-06, + "loss": 0.0425, + "step": 819 + }, + { + "epoch": 1.4590747330960854, + "grad_norm": 0.7722333626507534, + "learning_rate": 1.698895242929725e-06, + "loss": 0.0288, + "step": 820 + }, + { + "epoch": 1.4608540925266904, + "grad_norm": 0.8863193084024836, + "learning_rate": 1.6884118853153358e-06, + "loss": 0.0314, + "step": 821 + }, + { + "epoch": 1.4626334519572954, + "grad_norm": 1.1082581073729272, + "learning_rate": 1.6779543981016478e-06, + "loss": 0.0404, + "step": 822 + }, + { + "epoch": 1.4644128113879002, + "grad_norm": 0.9955465189826128, + "learning_rate": 1.6675228629834133e-06, + "loss": 0.0437, + "step": 823 + }, + { + "epoch": 1.4661921708185053, + "grad_norm": 0.8741761714128741, + "learning_rate": 1.657117361452651e-06, + "loss": 0.0309, + "step": 824 + }, + { + "epoch": 1.4679715302491103, + "grad_norm": 0.7382086548860517, + "learning_rate": 1.6467379747980011e-06, + "loss": 0.0261, + "step": 825 + }, + { + "epoch": 1.4697508896797153, + "grad_norm": 0.8049471680135438, + "learning_rate": 1.6363847841040914e-06, + "loss": 0.0378, + "step": 826 + }, + { + "epoch": 1.4715302491103204, + "grad_norm": 0.8255741130622384, + "learning_rate": 1.626057870250906e-06, + "loss": 0.0272, + "step": 827 + }, + { + "epoch": 1.4733096085409252, + "grad_norm": 1.1347923689675379, + "learning_rate": 
1.6157573139131527e-06, + "loss": 0.041, + "step": 828 + }, + { + "epoch": 1.4750889679715302, + "grad_norm": 0.7302911526168262, + "learning_rate": 1.605483195559628e-06, + "loss": 0.0274, + "step": 829 + }, + { + "epoch": 1.4768683274021353, + "grad_norm": 0.8999710524588727, + "learning_rate": 1.5952355954525966e-06, + "loss": 0.0331, + "step": 830 + }, + { + "epoch": 1.4786476868327403, + "grad_norm": 1.0409610978339072, + "learning_rate": 1.5850145936471607e-06, + "loss": 0.0423, + "step": 831 + }, + { + "epoch": 1.4804270462633453, + "grad_norm": 1.0603934000618, + "learning_rate": 1.5748202699906335e-06, + "loss": 0.0394, + "step": 832 + }, + { + "epoch": 1.4822064056939501, + "grad_norm": 0.7245019085777473, + "learning_rate": 1.5646527041219128e-06, + "loss": 0.0235, + "step": 833 + }, + { + "epoch": 1.4839857651245552, + "grad_norm": 1.0003933208850713, + "learning_rate": 1.5545119754708682e-06, + "loss": 0.0372, + "step": 834 + }, + { + "epoch": 1.4857651245551602, + "grad_norm": 1.117608737881438, + "learning_rate": 1.544398163257711e-06, + "loss": 0.0396, + "step": 835 + }, + { + "epoch": 1.487544483985765, + "grad_norm": 0.8069740981162163, + "learning_rate": 1.5343113464923808e-06, + "loss": 0.0307, + "step": 836 + }, + { + "epoch": 1.48932384341637, + "grad_norm": 0.8792976432346608, + "learning_rate": 1.524251603973927e-06, + "loss": 0.0244, + "step": 837 + }, + { + "epoch": 1.491103202846975, + "grad_norm": 0.7831127456375095, + "learning_rate": 1.5142190142898883e-06, + "loss": 0.0293, + "step": 838 + }, + { + "epoch": 1.49288256227758, + "grad_norm": 1.0227232562116537, + "learning_rate": 1.5042136558156883e-06, + "loss": 0.0417, + "step": 839 + }, + { + "epoch": 1.4946619217081851, + "grad_norm": 1.4504092045196866, + "learning_rate": 1.4942356067140162e-06, + "loss": 0.0529, + "step": 840 + }, + { + "epoch": 1.49644128113879, + "grad_norm": 0.7647593177747857, + "learning_rate": 1.4842849449342195e-06, + "loss": 0.0297, + "step": 841 + }, + { + "epoch": 1.498220640569395, + "grad_norm": 0.8803775447231733, + "learning_rate": 1.4743617482116896e-06, + "loss": 0.0328, + "step": 842 + }, + { + "epoch": 1.5, + "grad_norm": 1.2494408641089838, + "learning_rate": 1.4644660940672628e-06, + "loss": 0.0362, + "step": 843 + }, + { + "epoch": 1.501779359430605, + "grad_norm": 0.9133080570375326, + "learning_rate": 1.454598059806609e-06, + "loss": 0.0402, + "step": 844 + }, + { + "epoch": 1.50355871886121, + "grad_norm": 0.9824913432232948, + "learning_rate": 1.4447577225196296e-06, + "loss": 0.0357, + "step": 845 + }, + { + "epoch": 1.5053380782918149, + "grad_norm": 1.1078102826347958, + "learning_rate": 1.4349451590798564e-06, + "loss": 0.0446, + "step": 846 + }, + { + "epoch": 1.50711743772242, + "grad_norm": 0.9235569215634525, + "learning_rate": 1.4251604461438444e-06, + "loss": 0.0411, + "step": 847 + }, + { + "epoch": 1.508896797153025, + "grad_norm": 0.8344609241644628, + "learning_rate": 1.4154036601505834e-06, + "loss": 0.0264, + "step": 848 + }, + { + "epoch": 1.5106761565836297, + "grad_norm": 1.0208130325307214, + "learning_rate": 1.4056748773208933e-06, + "loss": 0.0345, + "step": 849 + }, + { + "epoch": 1.512455516014235, + "grad_norm": 1.18407278498687, + "learning_rate": 1.3959741736568339e-06, + "loss": 0.0441, + "step": 850 + }, + { + "epoch": 1.5142348754448398, + "grad_norm": 0.8407420899636274, + "learning_rate": 1.3863016249411027e-06, + "loss": 0.0282, + "step": 851 + }, + { + "epoch": 1.5160142348754448, + "grad_norm": 0.9358932612708689, + 
"learning_rate": 1.376657306736453e-06, + "loss": 0.0338, + "step": 852 + }, + { + "epoch": 1.5177935943060499, + "grad_norm": 1.0992150247198784, + "learning_rate": 1.3670412943850975e-06, + "loss": 0.0435, + "step": 853 + }, + { + "epoch": 1.5195729537366547, + "grad_norm": 0.7824450481280979, + "learning_rate": 1.3574536630081208e-06, + "loss": 0.0265, + "step": 854 + }, + { + "epoch": 1.52135231316726, + "grad_norm": 1.1561450581795805, + "learning_rate": 1.347894487504896e-06, + "loss": 0.046, + "step": 855 + }, + { + "epoch": 1.5231316725978647, + "grad_norm": 0.8541116368943703, + "learning_rate": 1.3383638425524909e-06, + "loss": 0.0287, + "step": 856 + }, + { + "epoch": 1.5249110320284698, + "grad_norm": 0.9794589585996938, + "learning_rate": 1.3288618026050943e-06, + "loss": 0.0387, + "step": 857 + }, + { + "epoch": 1.5266903914590748, + "grad_norm": 1.3255527026520937, + "learning_rate": 1.31938844189343e-06, + "loss": 0.0495, + "step": 858 + }, + { + "epoch": 1.5284697508896796, + "grad_norm": 1.0062401733112627, + "learning_rate": 1.3099438344241777e-06, + "loss": 0.0324, + "step": 859 + }, + { + "epoch": 1.5302491103202847, + "grad_norm": 0.9745469076864323, + "learning_rate": 1.3005280539793908e-06, + "loss": 0.0359, + "step": 860 + }, + { + "epoch": 1.5320284697508897, + "grad_norm": 0.8906837672794841, + "learning_rate": 1.2911411741159273e-06, + "loss": 0.0289, + "step": 861 + }, + { + "epoch": 1.5338078291814945, + "grad_norm": 1.0308169458364242, + "learning_rate": 1.2817832681648712e-06, + "loss": 0.0457, + "step": 862 + }, + { + "epoch": 1.5355871886120998, + "grad_norm": 1.066135031513164, + "learning_rate": 1.2724544092309581e-06, + "loss": 0.0408, + "step": 863 + }, + { + "epoch": 1.5373665480427046, + "grad_norm": 0.9990751775334056, + "learning_rate": 1.2631546701920073e-06, + "loss": 0.0376, + "step": 864 + }, + { + "epoch": 1.5391459074733096, + "grad_norm": 0.9279454422229131, + "learning_rate": 1.2538841236983519e-06, + "loss": 0.0366, + "step": 865 + }, + { + "epoch": 1.5409252669039146, + "grad_norm": 0.9293129114653861, + "learning_rate": 1.244642842172266e-06, + "loss": 0.0278, + "step": 866 + }, + { + "epoch": 1.5427046263345194, + "grad_norm": 0.9460528920250509, + "learning_rate": 1.2354308978074088e-06, + "loss": 0.0342, + "step": 867 + }, + { + "epoch": 1.5444839857651247, + "grad_norm": 0.9392590603382746, + "learning_rate": 1.2262483625682514e-06, + "loss": 0.0335, + "step": 868 + }, + { + "epoch": 1.5462633451957295, + "grad_norm": 0.8549256234285926, + "learning_rate": 1.2170953081895214e-06, + "loss": 0.0373, + "step": 869 + }, + { + "epoch": 1.5480427046263345, + "grad_norm": 1.1139473893560274, + "learning_rate": 1.2079718061756369e-06, + "loss": 0.0325, + "step": 870 + }, + { + "epoch": 1.5498220640569396, + "grad_norm": 0.9192491000883517, + "learning_rate": 1.1988779278001517e-06, + "loss": 0.0369, + "step": 871 + }, + { + "epoch": 1.5516014234875444, + "grad_norm": 1.0285308271904086, + "learning_rate": 1.1898137441051982e-06, + "loss": 0.0377, + "step": 872 + }, + { + "epoch": 1.5533807829181496, + "grad_norm": 1.0632381868799874, + "learning_rate": 1.1807793259009282e-06, + "loss": 0.048, + "step": 873 + }, + { + "epoch": 1.5551601423487544, + "grad_norm": 0.8941713488837103, + "learning_rate": 1.1717747437649657e-06, + "loss": 0.0344, + "step": 874 + }, + { + "epoch": 1.5569395017793595, + "grad_norm": 1.0320040270693944, + "learning_rate": 1.1628000680418533e-06, + "loss": 0.0353, + "step": 875 + }, + { + "epoch": 1.5587188612099645, + 
"grad_norm": 0.9879256137330606, + "learning_rate": 1.1538553688425002e-06, + "loss": 0.0377, + "step": 876 + }, + { + "epoch": 1.5604982206405693, + "grad_norm": 0.9870724594556394, + "learning_rate": 1.14494071604364e-06, + "loss": 0.0385, + "step": 877 + }, + { + "epoch": 1.5622775800711743, + "grad_norm": 0.8920953626415423, + "learning_rate": 1.1360561792872754e-06, + "loss": 0.0309, + "step": 878 + }, + { + "epoch": 1.5640569395017794, + "grad_norm": 1.1965648875425332, + "learning_rate": 1.127201827980145e-06, + "loss": 0.0499, + "step": 879 + }, + { + "epoch": 1.5658362989323842, + "grad_norm": 0.7294023315487677, + "learning_rate": 1.1183777312931748e-06, + "loss": 0.0216, + "step": 880 + }, + { + "epoch": 1.5676156583629894, + "grad_norm": 0.9443435501509589, + "learning_rate": 1.1095839581609407e-06, + "loss": 0.0356, + "step": 881 + }, + { + "epoch": 1.5693950177935942, + "grad_norm": 1.1836820331014046, + "learning_rate": 1.1008205772811248e-06, + "loss": 0.0469, + "step": 882 + }, + { + "epoch": 1.5711743772241993, + "grad_norm": 1.245498489523876, + "learning_rate": 1.0920876571139843e-06, + "loss": 0.0436, + "step": 883 + }, + { + "epoch": 1.5729537366548043, + "grad_norm": 0.8677975712025474, + "learning_rate": 1.0833852658818167e-06, + "loss": 0.0408, + "step": 884 + }, + { + "epoch": 1.5747330960854091, + "grad_norm": 1.0288353933027032, + "learning_rate": 1.0747134715684221e-06, + "loss": 0.0391, + "step": 885 + }, + { + "epoch": 1.5765124555160144, + "grad_norm": 0.9226405070071885, + "learning_rate": 1.0660723419185776e-06, + "loss": 0.0407, + "step": 886 + }, + { + "epoch": 1.5782918149466192, + "grad_norm": 1.0664753740055022, + "learning_rate": 1.0574619444375017e-06, + "loss": 0.0397, + "step": 887 + }, + { + "epoch": 1.5800711743772242, + "grad_norm": 0.7220467139616126, + "learning_rate": 1.0488823463903341e-06, + "loss": 0.0316, + "step": 888 + }, + { + "epoch": 1.5818505338078293, + "grad_norm": 0.9666684504136956, + "learning_rate": 1.0403336148016053e-06, + "loss": 0.0362, + "step": 889 + }, + { + "epoch": 1.583629893238434, + "grad_norm": 1.1138519460532204, + "learning_rate": 1.0318158164547159e-06, + "loss": 0.0418, + "step": 890 + }, + { + "epoch": 1.585409252669039, + "grad_norm": 1.0184250030450555, + "learning_rate": 1.0233290178914096e-06, + "loss": 0.0352, + "step": 891 + }, + { + "epoch": 1.5871886120996441, + "grad_norm": 0.9833961249391704, + "learning_rate": 1.014873285411262e-06, + "loss": 0.037, + "step": 892 + }, + { + "epoch": 1.5889679715302492, + "grad_norm": 1.104247240879798, + "learning_rate": 1.006448685071154e-06, + "loss": 0.0413, + "step": 893 + }, + { + "epoch": 1.5907473309608542, + "grad_norm": 1.1882452731519395, + "learning_rate": 9.980552826847635e-07, + "loss": 0.0495, + "step": 894 + }, + { + "epoch": 1.592526690391459, + "grad_norm": 0.922464997040305, + "learning_rate": 9.896931438220453e-07, + "loss": 0.0324, + "step": 895 + }, + { + "epoch": 1.594306049822064, + "grad_norm": 1.1687021193524112, + "learning_rate": 9.813623338087181e-07, + "loss": 0.0464, + "step": 896 + }, + { + "epoch": 1.596085409252669, + "grad_norm": 1.0576380798604628, + "learning_rate": 9.730629177257623e-07, + "loss": 0.0429, + "step": 897 + }, + { + "epoch": 1.5978647686832739, + "grad_norm": 0.7266477393669994, + "learning_rate": 9.64794960408903e-07, + "loss": 0.0252, + "step": 898 + }, + { + "epoch": 1.5996441281138791, + "grad_norm": 0.9373401670105959, + "learning_rate": 9.565585264481092e-07, + "loss": 0.0372, + "step": 899 + }, + { + 
"epoch": 1.601423487544484, + "grad_norm": 0.8094908621190549, + "learning_rate": 9.483536801870835e-07, + "loss": 0.0263, + "step": 900 + }, + { + "epoch": 1.603202846975089, + "grad_norm": 0.9316959396242535, + "learning_rate": 9.401804857227648e-07, + "loss": 0.0304, + "step": 901 + }, + { + "epoch": 1.604982206405694, + "grad_norm": 1.2345023187653172, + "learning_rate": 9.320390069048258e-07, + "loss": 0.0361, + "step": 902 + }, + { + "epoch": 1.6067615658362988, + "grad_norm": 0.8864431203430947, + "learning_rate": 9.239293073351735e-07, + "loss": 0.0393, + "step": 903 + }, + { + "epoch": 1.608540925266904, + "grad_norm": 1.0799385908443084, + "learning_rate": 9.158514503674543e-07, + "loss": 0.0312, + "step": 904 + }, + { + "epoch": 1.6103202846975089, + "grad_norm": 0.9286264177644702, + "learning_rate": 9.078054991065532e-07, + "loss": 0.0377, + "step": 905 + }, + { + "epoch": 1.612099644128114, + "grad_norm": 1.0576510841578954, + "learning_rate": 8.997915164081095e-07, + "loss": 0.0455, + "step": 906 + }, + { + "epoch": 1.613879003558719, + "grad_norm": 1.0854015603523357, + "learning_rate": 8.918095648780195e-07, + "loss": 0.0457, + "step": 907 + }, + { + "epoch": 1.6156583629893237, + "grad_norm": 1.0357731908203174, + "learning_rate": 8.838597068719518e-07, + "loss": 0.0326, + "step": 908 + }, + { + "epoch": 1.6174377224199288, + "grad_norm": 1.0776572347728248, + "learning_rate": 8.75942004494853e-07, + "loss": 0.0357, + "step": 909 + }, + { + "epoch": 1.6192170818505338, + "grad_norm": 1.1016984155454363, + "learning_rate": 8.680565196004704e-07, + "loss": 0.0372, + "step": 910 + }, + { + "epoch": 1.6209964412811388, + "grad_norm": 0.9593481902843042, + "learning_rate": 8.602033137908666e-07, + "loss": 0.0366, + "step": 911 + }, + { + "epoch": 1.6227758007117439, + "grad_norm": 0.9007774118716178, + "learning_rate": 8.523824484159348e-07, + "loss": 0.026, + "step": 912 + }, + { + "epoch": 1.6245551601423487, + "grad_norm": 0.835459063038804, + "learning_rate": 8.445939845729245e-07, + "loss": 0.031, + "step": 913 + }, + { + "epoch": 1.6263345195729537, + "grad_norm": 0.9006973001186873, + "learning_rate": 8.368379831059592e-07, + "loss": 0.0372, + "step": 914 + }, + { + "epoch": 1.6281138790035588, + "grad_norm": 0.8889233791215693, + "learning_rate": 8.29114504605566e-07, + "loss": 0.032, + "step": 915 + }, + { + "epoch": 1.6298932384341636, + "grad_norm": 0.7822876187058223, + "learning_rate": 8.21423609408199e-07, + "loss": 0.0343, + "step": 916 + }, + { + "epoch": 1.6316725978647688, + "grad_norm": 1.111594006680975, + "learning_rate": 8.137653575957666e-07, + "loss": 0.0333, + "step": 917 + }, + { + "epoch": 1.6334519572953736, + "grad_norm": 1.020072554845852, + "learning_rate": 8.061398089951678e-07, + "loss": 0.0396, + "step": 918 + }, + { + "epoch": 1.6352313167259787, + "grad_norm": 0.8190464315963275, + "learning_rate": 7.985470231778203e-07, + "loss": 0.0317, + "step": 919 + }, + { + "epoch": 1.6370106761565837, + "grad_norm": 1.1006473942958985, + "learning_rate": 7.909870594591951e-07, + "loss": 0.0402, + "step": 920 + }, + { + "epoch": 1.6387900355871885, + "grad_norm": 1.2854097468189731, + "learning_rate": 7.834599768983553e-07, + "loss": 0.059, + "step": 921 + }, + { + "epoch": 1.6405693950177938, + "grad_norm": 0.7474969053636427, + "learning_rate": 7.759658342974951e-07, + "loss": 0.0247, + "step": 922 + }, + { + "epoch": 1.6423487544483986, + "grad_norm": 1.0083988508192103, + "learning_rate": 7.685046902014747e-07, + "loss": 0.0419, + "step": 923 + }, 
+ { + "epoch": 1.6441281138790036, + "grad_norm": 1.0169598550090309, + "learning_rate": 7.61076602897371e-07, + "loss": 0.0371, + "step": 924 + }, + { + "epoch": 1.6459074733096086, + "grad_norm": 0.6812514330927129, + "learning_rate": 7.536816304140177e-07, + "loss": 0.0214, + "step": 925 + }, + { + "epoch": 1.6476868327402134, + "grad_norm": 0.729300793084757, + "learning_rate": 7.46319830521553e-07, + "loss": 0.0264, + "step": 926 + }, + { + "epoch": 1.6494661921708185, + "grad_norm": 0.7769643900453452, + "learning_rate": 7.389912607309662e-07, + "loss": 0.0294, + "step": 927 + }, + { + "epoch": 1.6512455516014235, + "grad_norm": 1.4135897655988794, + "learning_rate": 7.316959782936516e-07, + "loss": 0.0534, + "step": 928 + }, + { + "epoch": 1.6530249110320283, + "grad_norm": 0.6549831689302487, + "learning_rate": 7.244340402009608e-07, + "loss": 0.0217, + "step": 929 + }, + { + "epoch": 1.6548042704626336, + "grad_norm": 1.3763992770680902, + "learning_rate": 7.172055031837572e-07, + "loss": 0.0488, + "step": 930 + }, + { + "epoch": 1.6565836298932384, + "grad_norm": 0.8616529850738661, + "learning_rate": 7.100104237119676e-07, + "loss": 0.0355, + "step": 931 + }, + { + "epoch": 1.6583629893238434, + "grad_norm": 0.7598794140897599, + "learning_rate": 7.028488579941506e-07, + "loss": 0.0315, + "step": 932 + }, + { + "epoch": 1.6601423487544484, + "grad_norm": 0.9493777396147747, + "learning_rate": 6.957208619770505e-07, + "loss": 0.0335, + "step": 933 + }, + { + "epoch": 1.6619217081850532, + "grad_norm": 1.1056410082318162, + "learning_rate": 6.886264913451635e-07, + "loss": 0.0535, + "step": 934 + }, + { + "epoch": 1.6637010676156585, + "grad_norm": 0.8602500906103365, + "learning_rate": 6.815658015203014e-07, + "loss": 0.0299, + "step": 935 + }, + { + "epoch": 1.6654804270462633, + "grad_norm": 1.1303626496472468, + "learning_rate": 6.745388476611553e-07, + "loss": 0.0423, + "step": 936 + }, + { + "epoch": 1.6672597864768683, + "grad_norm": 0.8689628192761325, + "learning_rate": 6.67545684662873e-07, + "loss": 0.0289, + "step": 937 + }, + { + "epoch": 1.6690391459074734, + "grad_norm": 0.6183132653126804, + "learning_rate": 6.605863671566221e-07, + "loss": 0.0211, + "step": 938 + }, + { + "epoch": 1.6708185053380782, + "grad_norm": 1.1071927526516232, + "learning_rate": 6.536609495091695e-07, + "loss": 0.0351, + "step": 939 + }, + { + "epoch": 1.6725978647686834, + "grad_norm": 1.24253757661684, + "learning_rate": 6.467694858224488e-07, + "loss": 0.0433, + "step": 940 + }, + { + "epoch": 1.6743772241992882, + "grad_norm": 1.0274713686960626, + "learning_rate": 6.399120299331468e-07, + "loss": 0.0334, + "step": 941 + }, + { + "epoch": 1.6761565836298933, + "grad_norm": 0.7811773870244468, + "learning_rate": 6.330886354122768e-07, + "loss": 0.0345, + "step": 942 + }, + { + "epoch": 1.6779359430604983, + "grad_norm": 0.8014714628419496, + "learning_rate": 6.262993555647617e-07, + "loss": 0.03, + "step": 943 + }, + { + "epoch": 1.6797153024911031, + "grad_norm": 0.712819687318543, + "learning_rate": 6.1954424342902e-07, + "loss": 0.0247, + "step": 944 + }, + { + "epoch": 1.6814946619217082, + "grad_norm": 1.6895299850842702, + "learning_rate": 6.128233517765448e-07, + "loss": 0.0636, + "step": 945 + }, + { + "epoch": 1.6832740213523132, + "grad_norm": 1.1318686557102933, + "learning_rate": 6.061367331114992e-07, + "loss": 0.0373, + "step": 946 + }, + { + "epoch": 1.685053380782918, + "grad_norm": 0.9494600382208942, + "learning_rate": 5.994844396703025e-07, + "loss": 0.0357, + "step": 
947 + }, + { + "epoch": 1.6868327402135233, + "grad_norm": 0.9363995816428463, + "learning_rate": 5.928665234212233e-07, + "loss": 0.0314, + "step": 948 + }, + { + "epoch": 1.688612099644128, + "grad_norm": 0.923925511702288, + "learning_rate": 5.862830360639698e-07, + "loss": 0.0354, + "step": 949 + }, + { + "epoch": 1.690391459074733, + "grad_norm": 1.2897962873576785, + "learning_rate": 5.797340290292907e-07, + "loss": 0.0389, + "step": 950 + }, + { + "epoch": 1.6921708185053381, + "grad_norm": 1.0994665862302504, + "learning_rate": 5.732195534785723e-07, + "loss": 0.0426, + "step": 951 + }, + { + "epoch": 1.693950177935943, + "grad_norm": 0.8862456655327394, + "learning_rate": 5.667396603034369e-07, + "loss": 0.0305, + "step": 952 + }, + { + "epoch": 1.6957295373665482, + "grad_norm": 1.0399436152359842, + "learning_rate": 5.602944001253486e-07, + "loss": 0.0357, + "step": 953 + }, + { + "epoch": 1.697508896797153, + "grad_norm": 1.127261042231125, + "learning_rate": 5.538838232952104e-07, + "loss": 0.0429, + "step": 954 + }, + { + "epoch": 1.699288256227758, + "grad_norm": 1.3220910149982537, + "learning_rate": 5.475079798929816e-07, + "loss": 0.0525, + "step": 955 + }, + { + "epoch": 1.701067615658363, + "grad_norm": 0.8427521039388601, + "learning_rate": 5.411669197272795e-07, + "loss": 0.028, + "step": 956 + }, + { + "epoch": 1.7028469750889679, + "grad_norm": 0.8110328630321189, + "learning_rate": 5.348606923349903e-07, + "loss": 0.0272, + "step": 957 + }, + { + "epoch": 1.704626334519573, + "grad_norm": 0.9703971089524207, + "learning_rate": 5.285893469808855e-07, + "loss": 0.0306, + "step": 958 + }, + { + "epoch": 1.706405693950178, + "grad_norm": 0.9783456783947087, + "learning_rate": 5.223529326572352e-07, + "loss": 0.0324, + "step": 959 + }, + { + "epoch": 1.708185053380783, + "grad_norm": 1.1357183520545884, + "learning_rate": 5.161514980834232e-07, + "loss": 0.0414, + "step": 960 + }, + { + "epoch": 1.709964412811388, + "grad_norm": 0.8988070534208314, + "learning_rate": 5.099850917055709e-07, + "loss": 0.0364, + "step": 961 + }, + { + "epoch": 1.7117437722419928, + "grad_norm": 0.9583520955663613, + "learning_rate": 5.038537616961559e-07, + "loss": 0.0338, + "step": 962 + }, + { + "epoch": 1.7135231316725978, + "grad_norm": 0.8595342153911766, + "learning_rate": 4.977575559536358e-07, + "loss": 0.0312, + "step": 963 + }, + { + "epoch": 1.7153024911032029, + "grad_norm": 1.0564765306855832, + "learning_rate": 4.916965221020753e-07, + "loss": 0.0325, + "step": 964 + }, + { + "epoch": 1.7170818505338077, + "grad_norm": 0.9743798971111711, + "learning_rate": 4.856707074907729e-07, + "loss": 0.0298, + "step": 965 + }, + { + "epoch": 1.718861209964413, + "grad_norm": 0.9652182678782931, + "learning_rate": 4.796801591938922e-07, + "loss": 0.0299, + "step": 966 + }, + { + "epoch": 1.7206405693950177, + "grad_norm": 1.0309106085190303, + "learning_rate": 4.737249240100911e-07, + "loss": 0.0406, + "step": 967 + }, + { + "epoch": 1.7224199288256228, + "grad_norm": 0.8048364147759897, + "learning_rate": 4.6780504846216155e-07, + "loss": 0.0238, + "step": 968 + }, + { + "epoch": 1.7241992882562278, + "grad_norm": 0.829980596096878, + "learning_rate": 4.619205787966613e-07, + "loss": 0.0258, + "step": 969 + }, + { + "epoch": 1.7259786476868326, + "grad_norm": 0.7850276613356122, + "learning_rate": 4.560715609835548e-07, + "loss": 0.0279, + "step": 970 + }, + { + "epoch": 1.7277580071174379, + "grad_norm": 0.72358981782396, + "learning_rate": 4.5025804071585464e-07, + "loss": 0.0253, + 
"step": 971 + }, + { + "epoch": 1.7295373665480427, + "grad_norm": 1.2537660504584207, + "learning_rate": 4.4448006340926163e-07, + "loss": 0.0494, + "step": 972 + }, + { + "epoch": 1.7313167259786477, + "grad_norm": 1.1514738594475027, + "learning_rate": 4.3873767420181344e-07, + "loss": 0.0397, + "step": 973 + }, + { + "epoch": 1.7330960854092528, + "grad_norm": 1.2047378027191022, + "learning_rate": 4.3303091795353024e-07, + "loss": 0.0533, + "step": 974 + }, + { + "epoch": 1.7348754448398576, + "grad_norm": 1.0256152942776495, + "learning_rate": 4.2735983924606596e-07, + "loss": 0.038, + "step": 975 + }, + { + "epoch": 1.7366548042704626, + "grad_norm": 0.9338454433520231, + "learning_rate": 4.2172448238235464e-07, + "loss": 0.0256, + "step": 976 + }, + { + "epoch": 1.7384341637010676, + "grad_norm": 0.9707190528049798, + "learning_rate": 4.161248913862731e-07, + "loss": 0.0338, + "step": 977 + }, + { + "epoch": 1.7402135231316724, + "grad_norm": 1.1499217073787062, + "learning_rate": 4.1056111000228937e-07, + "loss": 0.0405, + "step": 978 + }, + { + "epoch": 1.7419928825622777, + "grad_norm": 0.8726242073853734, + "learning_rate": 4.0503318169512417e-07, + "loss": 0.0271, + "step": 979 + }, + { + "epoch": 1.7437722419928825, + "grad_norm": 0.7148465586895162, + "learning_rate": 3.9954114964941336e-07, + "loss": 0.0208, + "step": 980 + }, + { + "epoch": 1.7455516014234875, + "grad_norm": 1.0713331041877134, + "learning_rate": 3.9408505676936327e-07, + "loss": 0.0371, + "step": 981 + }, + { + "epoch": 1.7473309608540926, + "grad_norm": 0.9341399388188112, + "learning_rate": 3.886649456784253e-07, + "loss": 0.0365, + "step": 982 + }, + { + "epoch": 1.7491103202846974, + "grad_norm": 0.8781461911591342, + "learning_rate": 3.8328085871895624e-07, + "loss": 0.0356, + "step": 983 + }, + { + "epoch": 1.7508896797153026, + "grad_norm": 0.8235928265369706, + "learning_rate": 3.779328379518898e-07, + "loss": 0.0309, + "step": 984 + }, + { + "epoch": 1.7526690391459074, + "grad_norm": 1.0697899745201302, + "learning_rate": 3.7262092515640556e-07, + "loss": 0.0422, + "step": 985 + }, + { + "epoch": 1.7544483985765125, + "grad_norm": 0.6666305916620812, + "learning_rate": 3.673451618296081e-07, + "loss": 0.0212, + "step": 986 + }, + { + "epoch": 1.7562277580071175, + "grad_norm": 0.9773015872344643, + "learning_rate": 3.621055891861963e-07, + "loss": 0.0399, + "step": 987 + }, + { + "epoch": 1.7580071174377223, + "grad_norm": 0.8726900125139255, + "learning_rate": 3.56902248158148e-07, + "loss": 0.0284, + "step": 988 + }, + { + "epoch": 1.7597864768683276, + "grad_norm": 1.056018529557307, + "learning_rate": 3.517351793943913e-07, + "loss": 0.0332, + "step": 989 + }, + { + "epoch": 1.7615658362989324, + "grad_norm": 1.0742533280362974, + "learning_rate": 3.4660442326049704e-07, + "loss": 0.0287, + "step": 990 + }, + { + "epoch": 1.7633451957295374, + "grad_norm": 0.8740732505273735, + "learning_rate": 3.4151001983835696e-07, + "loss": 0.0312, + "step": 991 + }, + { + "epoch": 1.7651245551601424, + "grad_norm": 1.042293617449342, + "learning_rate": 3.364520089258727e-07, + "loss": 0.027, + "step": 992 + }, + { + "epoch": 1.7669039145907472, + "grad_norm": 0.9546977626208112, + "learning_rate": 3.314304300366461e-07, + "loss": 0.0322, + "step": 993 + }, + { + "epoch": 1.7686832740213523, + "grad_norm": 0.9596235610874881, + "learning_rate": 3.2644532239966444e-07, + "loss": 0.035, + "step": 994 + }, + { + "epoch": 1.7704626334519573, + "grad_norm": 0.6210034355033043, + "learning_rate": 
3.2149672495900286e-07, + "loss": 0.0205, + "step": 995 + }, + { + "epoch": 1.7722419928825621, + "grad_norm": 0.8943573596906906, + "learning_rate": 3.165846763735153e-07, + "loss": 0.0303, + "step": 996 + }, + { + "epoch": 1.7740213523131674, + "grad_norm": 0.8460767272289282, + "learning_rate": 3.117092150165324e-07, + "loss": 0.0319, + "step": 997 + }, + { + "epoch": 1.7758007117437722, + "grad_norm": 1.04522303905914, + "learning_rate": 3.068703789755606e-07, + "loss": 0.0438, + "step": 998 + }, + { + "epoch": 1.7775800711743772, + "grad_norm": 0.9040058383703806, + "learning_rate": 3.020682060519886e-07, + "loss": 0.0286, + "step": 999 + }, + { + "epoch": 1.7793594306049823, + "grad_norm": 0.9881837067031481, + "learning_rate": 2.9730273376078923e-07, + "loss": 0.034, + "step": 1000 + }, + { + "epoch": 1.7793594306049823, + "eval_loss": 0.13149592280387878, + "eval_runtime": 7.1479, + "eval_samples_per_second": 6.435, + "eval_steps_per_second": 1.679, + "step": 1000 + }, + { + "epoch": 1.781138790035587, + "grad_norm": 1.4581976873648321, + "learning_rate": 2.9257399933022737e-07, + "loss": 0.0567, + "step": 1001 + }, + { + "epoch": 1.7829181494661923, + "grad_norm": 1.0237315149119115, + "learning_rate": 2.8788203970156805e-07, + "loss": 0.0244, + "step": 1002 + }, + { + "epoch": 1.7846975088967971, + "grad_norm": 0.8701743450681395, + "learning_rate": 2.832268915287878e-07, + "loss": 0.0331, + "step": 1003 + }, + { + "epoch": 1.7864768683274022, + "grad_norm": 1.0739171114521038, + "learning_rate": 2.7860859117828985e-07, + "loss": 0.0381, + "step": 1004 + }, + { + "epoch": 1.7882562277580072, + "grad_norm": 1.6838830467351256, + "learning_rate": 2.740271747286194e-07, + "loss": 0.0811, + "step": 1005 + }, + { + "epoch": 1.790035587188612, + "grad_norm": 0.9281432422574337, + "learning_rate": 2.6948267797018145e-07, + "loss": 0.0306, + "step": 1006 + }, + { + "epoch": 1.791814946619217, + "grad_norm": 0.9911134831801146, + "learning_rate": 2.649751364049613e-07, + "loss": 0.0247, + "step": 1007 + }, + { + "epoch": 1.793594306049822, + "grad_norm": 0.9377006808945917, + "learning_rate": 2.6050458524624735e-07, + "loss": 0.0274, + "step": 1008 + }, + { + "epoch": 1.795373665480427, + "grad_norm": 0.9878113112061964, + "learning_rate": 2.560710594183552e-07, + "loss": 0.0318, + "step": 1009 + }, + { + "epoch": 1.7971530249110321, + "grad_norm": 0.9601089668885241, + "learning_rate": 2.5167459355635524e-07, + "loss": 0.0407, + "step": 1010 + }, + { + "epoch": 1.798932384341637, + "grad_norm": 0.9761958090765598, + "learning_rate": 2.473152220058039e-07, + "loss": 0.0327, + "step": 1011 + }, + { + "epoch": 1.800711743772242, + "grad_norm": 1.0670590996228635, + "learning_rate": 2.429929788224722e-07, + "loss": 0.0424, + "step": 1012 + }, + { + "epoch": 1.802491103202847, + "grad_norm": 0.953963322767867, + "learning_rate": 2.38707897772083e-07, + "loss": 0.0393, + "step": 1013 + }, + { + "epoch": 1.8042704626334518, + "grad_norm": 0.8250896078926884, + "learning_rate": 2.3446001233004333e-07, + "loss": 0.0334, + "step": 1014 + }, + { + "epoch": 1.806049822064057, + "grad_norm": 1.076888919506632, + "learning_rate": 2.3024935568118745e-07, + "loss": 0.0398, + "step": 1015 + }, + { + "epoch": 1.8078291814946619, + "grad_norm": 0.9327333319282085, + "learning_rate": 2.2607596071951288e-07, + "loss": 0.031, + "step": 1016 + }, + { + "epoch": 1.809608540925267, + "grad_norm": 0.7903529688554496, + "learning_rate": 2.2193986004792667e-07, + "loss": 0.0296, + "step": 1017 + }, + { + "epoch": 
1.811387900355872, + "grad_norm": 0.698661874665587, + "learning_rate": 2.1784108597799058e-07, + "loss": 0.0187, + "step": 1018 + }, + { + "epoch": 1.8131672597864767, + "grad_norm": 0.9513103743831585, + "learning_rate": 2.1377967052966685e-07, + "loss": 0.036, + "step": 1019 + }, + { + "epoch": 1.814946619217082, + "grad_norm": 0.7484374633432326, + "learning_rate": 2.0975564543107007e-07, + "loss": 0.0293, + "step": 1020 + }, + { + "epoch": 1.8167259786476868, + "grad_norm": 1.0107176687876676, + "learning_rate": 2.057690421182168e-07, + "loss": 0.0386, + "step": 1021 + }, + { + "epoch": 1.8185053380782918, + "grad_norm": 0.887083775932418, + "learning_rate": 2.01819891734783e-07, + "loss": 0.0351, + "step": 1022 + }, + { + "epoch": 1.8202846975088969, + "grad_norm": 0.9178747885855896, + "learning_rate": 1.979082251318576e-07, + "loss": 0.0359, + "step": 1023 + }, + { + "epoch": 1.8220640569395017, + "grad_norm": 0.7913010983811369, + "learning_rate": 1.9403407286770592e-07, + "loss": 0.0242, + "step": 1024 + }, + { + "epoch": 1.8238434163701067, + "grad_norm": 0.7955445400300014, + "learning_rate": 1.9019746520752502e-07, + "loss": 0.0239, + "step": 1025 + }, + { + "epoch": 1.8256227758007118, + "grad_norm": 0.8910759846092995, + "learning_rate": 1.8639843212321206e-07, + "loss": 0.0324, + "step": 1026 + }, + { + "epoch": 1.8274021352313166, + "grad_norm": 0.8221026284011368, + "learning_rate": 1.826370032931285e-07, + "loss": 0.0269, + "step": 1027 + }, + { + "epoch": 1.8291814946619218, + "grad_norm": 0.8929037997791472, + "learning_rate": 1.789132081018674e-07, + "loss": 0.0329, + "step": 1028 + }, + { + "epoch": 1.8309608540925266, + "grad_norm": 1.2085245846695714, + "learning_rate": 1.7522707564002706e-07, + "loss": 0.0385, + "step": 1029 + }, + { + "epoch": 1.8327402135231317, + "grad_norm": 0.796349936855367, + "learning_rate": 1.7157863470397718e-07, + "loss": 0.0278, + "step": 1030 + }, + { + "epoch": 1.8345195729537367, + "grad_norm": 0.9209938761836545, + "learning_rate": 1.6796791379564138e-07, + "loss": 0.0335, + "step": 1031 + }, + { + "epoch": 1.8362989323843415, + "grad_norm": 0.9306412458754235, + "learning_rate": 1.6439494112227173e-07, + "loss": 0.0296, + "step": 1032 + }, + { + "epoch": 1.8380782918149468, + "grad_norm": 0.8784943369954986, + "learning_rate": 1.6085974459622567e-07, + "loss": 0.036, + "step": 1033 + }, + { + "epoch": 1.8398576512455516, + "grad_norm": 0.7439946710703741, + "learning_rate": 1.573623518347517e-07, + "loss": 0.028, + "step": 1034 + }, + { + "epoch": 1.8416370106761566, + "grad_norm": 0.9743690068723833, + "learning_rate": 1.5390279015977117e-07, + "loss": 0.0384, + "step": 1035 + }, + { + "epoch": 1.8434163701067616, + "grad_norm": 0.8443085515127314, + "learning_rate": 1.5048108659766693e-07, + "loss": 0.0315, + "step": 1036 + }, + { + "epoch": 1.8451957295373664, + "grad_norm": 0.7285124904919683, + "learning_rate": 1.470972678790711e-07, + "loss": 0.0308, + "step": 1037 + }, + { + "epoch": 1.8469750889679717, + "grad_norm": 0.9991244860368302, + "learning_rate": 1.437513604386559e-07, + "loss": 0.0438, + "step": 1038 + }, + { + "epoch": 1.8487544483985765, + "grad_norm": 0.8028992434917028, + "learning_rate": 1.404433904149266e-07, + "loss": 0.0241, + "step": 1039 + }, + { + "epoch": 1.8505338078291815, + "grad_norm": 0.9832311451220926, + "learning_rate": 1.3717338365001943e-07, + "loss": 0.0351, + "step": 1040 + }, + { + "epoch": 1.8523131672597866, + "grad_norm": 1.0585927337878276, + "learning_rate": 1.3394136568949834e-07, + 
"loss": 0.0374, + "step": 1041 + }, + { + "epoch": 1.8540925266903914, + "grad_norm": 1.2862955458665684, + "learning_rate": 1.307473617821553e-07, + "loss": 0.0497, + "step": 1042 + }, + { + "epoch": 1.8558718861209964, + "grad_norm": 1.0823593920906582, + "learning_rate": 1.275913968798137e-07, + "loss": 0.0355, + "step": 1043 + }, + { + "epoch": 1.8576512455516014, + "grad_norm": 0.9149759606916206, + "learning_rate": 1.2447349563713186e-07, + "loss": 0.0356, + "step": 1044 + }, + { + "epoch": 1.8594306049822062, + "grad_norm": 0.9824134546293025, + "learning_rate": 1.213936824114137e-07, + "loss": 0.0348, + "step": 1045 + }, + { + "epoch": 1.8612099644128115, + "grad_norm": 0.935432985228546, + "learning_rate": 1.1835198126241509e-07, + "loss": 0.0283, + "step": 1046 + }, + { + "epoch": 1.8629893238434163, + "grad_norm": 1.3192106309529583, + "learning_rate": 1.1534841595215617e-07, + "loss": 0.0389, + "step": 1047 + }, + { + "epoch": 1.8647686832740213, + "grad_norm": 0.9870732568235638, + "learning_rate": 1.1238300994473983e-07, + "loss": 0.0302, + "step": 1048 + }, + { + "epoch": 1.8665480427046264, + "grad_norm": 0.9836332689828364, + "learning_rate": 1.0945578640616183e-07, + "loss": 0.038, + "step": 1049 + }, + { + "epoch": 1.8683274021352312, + "grad_norm": 0.8500081148991353, + "learning_rate": 1.0656676820413603e-07, + "loss": 0.0222, + "step": 1050 + }, + { + "epoch": 1.8701067615658364, + "grad_norm": 0.7301350794482695, + "learning_rate": 1.0371597790791166e-07, + "loss": 0.0227, + "step": 1051 + }, + { + "epoch": 1.8718861209964412, + "grad_norm": 1.005129217408169, + "learning_rate": 1.0090343778809908e-07, + "loss": 0.0386, + "step": 1052 + }, + { + "epoch": 1.8736654804270463, + "grad_norm": 1.1804794331103639, + "learning_rate": 9.812916981649433e-08, + "loss": 0.0461, + "step": 1053 + }, + { + "epoch": 1.8754448398576513, + "grad_norm": 0.9156692878233684, + "learning_rate": 9.539319566590766e-08, + "loss": 0.0376, + "step": 1054 + }, + { + "epoch": 1.8772241992882561, + "grad_norm": 1.0017513758167835, + "learning_rate": 9.269553670999743e-08, + "loss": 0.0362, + "step": 1055 + }, + { + "epoch": 1.8790035587188612, + "grad_norm": 1.302168151682292, + "learning_rate": 9.003621402309815e-08, + "loss": 0.0391, + "step": 1056 + }, + { + "epoch": 1.8807829181494662, + "grad_norm": 1.0771753414898826, + "learning_rate": 8.741524838005888e-08, + "loss": 0.0428, + "step": 1057 + }, + { + "epoch": 1.8825622775800712, + "grad_norm": 0.8229028021100799, + "learning_rate": 8.483266025608061e-08, + "loss": 0.0274, + "step": 1058 + }, + { + "epoch": 1.8843416370106763, + "grad_norm": 0.8754992831336158, + "learning_rate": 8.228846982655525e-08, + "loss": 0.0275, + "step": 1059 + }, + { + "epoch": 1.886120996441281, + "grad_norm": 0.9536393757277662, + "learning_rate": 7.978269696691021e-08, + "loss": 0.0323, + "step": 1060 + }, + { + "epoch": 1.887900355871886, + "grad_norm": 0.9170964479057592, + "learning_rate": 7.731536125244965e-08, + "loss": 0.0326, + "step": 1061 + }, + { + "epoch": 1.8896797153024911, + "grad_norm": 1.0022067938158241, + "learning_rate": 7.488648195820513e-08, + "loss": 0.0376, + "step": 1062 + }, + { + "epoch": 1.891459074733096, + "grad_norm": 1.2358298188436405, + "learning_rate": 7.249607805878245e-08, + "loss": 0.0371, + "step": 1063 + }, + { + "epoch": 1.8932384341637012, + "grad_norm": 1.0813593177347594, + "learning_rate": 7.014416822821557e-08, + "loss": 0.038, + "step": 1064 + }, + { + "epoch": 1.895017793594306, + "grad_norm": 1.0606118808208824, 
+ "learning_rate": 6.783077083981793e-08, + "loss": 0.0297, + "step": 1065 + }, + { + "epoch": 1.896797153024911, + "grad_norm": 0.915941522935534, + "learning_rate": 6.55559039660425e-08, + "loss": 0.0325, + "step": 1066 + }, + { + "epoch": 1.898576512455516, + "grad_norm": 0.9030681191606825, + "learning_rate": 6.331958537833693e-08, + "loss": 0.0264, + "step": 1067 + }, + { + "epoch": 1.9003558718861209, + "grad_norm": 0.871513537590659, + "learning_rate": 6.112183254700866e-08, + "loss": 0.0338, + "step": 1068 + }, + { + "epoch": 1.9021352313167261, + "grad_norm": 1.0096976334247885, + "learning_rate": 5.8962662641083856e-08, + "loss": 0.0293, + "step": 1069 + }, + { + "epoch": 1.903914590747331, + "grad_norm": 0.9058276514087495, + "learning_rate": 5.6842092528176516e-08, + "loss": 0.0304, + "step": 1070 + }, + { + "epoch": 1.905693950177936, + "grad_norm": 0.8190544432997178, + "learning_rate": 5.476013877435626e-08, + "loss": 0.0298, + "step": 1071 + }, + { + "epoch": 1.907473309608541, + "grad_norm": 1.1099499267624018, + "learning_rate": 5.271681764401848e-08, + "loss": 0.0379, + "step": 1072 + }, + { + "epoch": 1.9092526690391458, + "grad_norm": 0.7380683514472781, + "learning_rate": 5.071214509975775e-08, + "loss": 0.0248, + "step": 1073 + }, + { + "epoch": 1.9110320284697508, + "grad_norm": 0.9929257149315531, + "learning_rate": 4.8746136802240716e-08, + "loss": 0.0352, + "step": 1074 + }, + { + "epoch": 1.9128113879003559, + "grad_norm": 0.8954709513119363, + "learning_rate": 4.6818808110087875e-08, + "loss": 0.0315, + "step": 1075 + }, + { + "epoch": 1.914590747330961, + "grad_norm": 1.0373977774959033, + "learning_rate": 4.493017407975087e-08, + "loss": 0.0378, + "step": 1076 + }, + { + "epoch": 1.916370106761566, + "grad_norm": 0.9324638936790517, + "learning_rate": 4.308024946539424e-08, + "loss": 0.0252, + "step": 1077 + }, + { + "epoch": 1.9181494661921707, + "grad_norm": 0.7670826159562243, + "learning_rate": 4.1269048718783344e-08, + "loss": 0.0228, + "step": 1078 + }, + { + "epoch": 1.9199288256227758, + "grad_norm": 0.7971799973017737, + "learning_rate": 3.9496585989167726e-08, + "loss": 0.0286, + "step": 1079 + }, + { + "epoch": 1.9217081850533808, + "grad_norm": 0.932883668858871, + "learning_rate": 3.776287512317345e-08, + "loss": 0.036, + "step": 1080 + }, + { + "epoch": 1.9234875444839856, + "grad_norm": 1.115101637449023, + "learning_rate": 3.606792966469375e-08, + "loss": 0.0359, + "step": 1081 + }, + { + "epoch": 1.9252669039145909, + "grad_norm": 0.8683734269285328, + "learning_rate": 3.4411762854782426e-08, + "loss": 0.0298, + "step": 1082 + }, + { + "epoch": 1.9270462633451957, + "grad_norm": 1.1836052474553045, + "learning_rate": 3.279438763155174e-08, + "loss": 0.0314, + "step": 1083 + }, + { + "epoch": 1.9288256227758007, + "grad_norm": 0.8822285835082575, + "learning_rate": 3.121581663007134e-08, + "loss": 0.0355, + "step": 1084 + }, + { + "epoch": 1.9306049822064058, + "grad_norm": 0.9851401744795133, + "learning_rate": 2.967606218226837e-08, + "loss": 0.0408, + "step": 1085 + }, + { + "epoch": 1.9323843416370106, + "grad_norm": 0.8277152849214361, + "learning_rate": 2.8175136316832e-08, + "loss": 0.0261, + "step": 1086 + }, + { + "epoch": 1.9341637010676158, + "grad_norm": 1.0130518925764556, + "learning_rate": 2.6713050759120117e-08, + "loss": 0.0387, + "step": 1087 + }, + { + "epoch": 1.9359430604982206, + "grad_norm": 0.9689860116457749, + "learning_rate": 2.528981693106558e-08, + "loss": 0.0341, + "step": 1088 + }, + { + "epoch": 
1.9377224199288257, + "grad_norm": 0.9796668852631681, + "learning_rate": 2.3905445951089013e-08, + "loss": 0.0319, + "step": 1089 + }, + { + "epoch": 1.9395017793594307, + "grad_norm": 1.0813212494176583, + "learning_rate": 2.2559948634011673e-08, + "loss": 0.0326, + "step": 1090 + }, + { + "epoch": 1.9412811387900355, + "grad_norm": 1.0461265760023197, + "learning_rate": 2.125333549096942e-08, + "loss": 0.0387, + "step": 1091 + }, + { + "epoch": 1.9430604982206405, + "grad_norm": 0.811009205236011, + "learning_rate": 1.9985616729332747e-08, + "loss": 0.0307, + "step": 1092 + }, + { + "epoch": 1.9448398576512456, + "grad_norm": 1.021062435635048, + "learning_rate": 1.8756802252625773e-08, + "loss": 0.0331, + "step": 1093 + }, + { + "epoch": 1.9466192170818504, + "grad_norm": 0.8523317241217103, + "learning_rate": 1.75669016604485e-08, + "loss": 0.0325, + "step": 1094 + }, + { + "epoch": 1.9483985765124556, + "grad_norm": 1.04176434171501, + "learning_rate": 1.6415924248403547e-08, + "loss": 0.0348, + "step": 1095 + }, + { + "epoch": 1.9501779359430604, + "grad_norm": 0.7659336169960644, + "learning_rate": 1.5303879008021773e-08, + "loss": 0.0264, + "step": 1096 + }, + { + "epoch": 1.9519572953736655, + "grad_norm": 0.9108460203627384, + "learning_rate": 1.4230774626691756e-08, + "loss": 0.0271, + "step": 1097 + }, + { + "epoch": 1.9537366548042705, + "grad_norm": 1.371246031672071, + "learning_rate": 1.3196619487594875e-08, + "loss": 0.047, + "step": 1098 + }, + { + "epoch": 1.9555160142348753, + "grad_norm": 1.0394863712655835, + "learning_rate": 1.2201421669636448e-08, + "loss": 0.036, + "step": 1099 + }, + { + "epoch": 1.9572953736654806, + "grad_norm": 1.044634703938829, + "learning_rate": 1.1245188947384133e-08, + "loss": 0.0283, + "step": 1100 + }, + { + "epoch": 1.9590747330960854, + "grad_norm": 1.069614237257927, + "learning_rate": 1.0327928791006858e-08, + "loss": 0.0362, + "step": 1101 + }, + { + "epoch": 1.9608540925266904, + "grad_norm": 0.940847502161726, + "learning_rate": 9.449648366217645e-09, + "loss": 0.0334, + "step": 1102 + }, + { + "epoch": 1.9626334519572954, + "grad_norm": 0.8748418808894304, + "learning_rate": 8.61035453421588e-09, + "loss": 0.0347, + "step": 1103 + }, + { + "epoch": 1.9644128113879002, + "grad_norm": 0.8786054359850882, + "learning_rate": 7.81005385163458e-09, + "loss": 0.0409, + "step": 1104 + }, + { + "epoch": 1.9661921708185055, + "grad_norm": 0.8133475665653579, + "learning_rate": 7.048752570488205e-09, + "loss": 0.0313, + "step": 1105 + }, + { + "epoch": 1.9679715302491103, + "grad_norm": 1.0276320557124607, + "learning_rate": 6.326456638125478e-09, + "loss": 0.0323, + "step": 1106 + }, + { + "epoch": 1.9697508896797153, + "grad_norm": 0.8587607763955706, + "learning_rate": 5.643171697183314e-09, + "loss": 0.032, + "step": 1107 + }, + { + "epoch": 1.9715302491103204, + "grad_norm": 0.9712870242537757, + "learning_rate": 4.998903085539075e-09, + "loss": 0.0328, + "step": 1108 + }, + { + "epoch": 1.9733096085409252, + "grad_norm": 0.8166100766953054, + "learning_rate": 4.393655836272825e-09, + "loss": 0.0278, + "step": 1109 + }, + { + "epoch": 1.9750889679715302, + "grad_norm": 1.0364742911765257, + "learning_rate": 3.8274346776262514e-09, + "loss": 0.0354, + "step": 1110 + }, + { + "epoch": 1.9768683274021353, + "grad_norm": 1.0782721578446093, + "learning_rate": 3.300244032966582e-09, + "loss": 0.0384, + "step": 1111 + }, + { + "epoch": 1.97864768683274, + "grad_norm": 1.0709966092834682, + "learning_rate": 2.8120880207493928e-09, + "loss": 
0.0334, + "step": 1112 + }, + { + "epoch": 1.9804270462633453, + "grad_norm": 0.7010588442056803, + "learning_rate": 2.362970454491409e-09, + "loss": 0.0187, + "step": 1113 + }, + { + "epoch": 1.9822064056939501, + "grad_norm": 1.2627454425207878, + "learning_rate": 1.952894842735531e-09, + "loss": 0.0402, + "step": 1114 + }, + { + "epoch": 1.9839857651245552, + "grad_norm": 0.6439934853719501, + "learning_rate": 1.5818643890258555e-09, + "loss": 0.0225, + "step": 1115 + }, + { + "epoch": 1.9857651245551602, + "grad_norm": 1.067338669364316, + "learning_rate": 1.2498819918843609e-09, + "loss": 0.0326, + "step": 1116 + }, + { + "epoch": 1.987544483985765, + "grad_norm": 1.1467918734632372, + "learning_rate": 9.569502447837053e-10, + "loss": 0.0289, + "step": 1117 + }, + { + "epoch": 1.9893238434163703, + "grad_norm": 1.0558293840041255, + "learning_rate": 7.03071436131686e-10, + "loss": 0.0322, + "step": 1118 + }, + { + "epoch": 1.991103202846975, + "grad_norm": 0.9210537233088608, + "learning_rate": 4.882475492506977e-10, + "loss": 0.0335, + "step": 1119 + }, + { + "epoch": 1.99288256227758, + "grad_norm": 1.1268509628147607, + "learning_rate": 3.124802623627465e-10, + "loss": 0.0446, + "step": 1120 + }, + { + "epoch": 1.9946619217081851, + "grad_norm": 1.1339107008229083, + "learning_rate": 1.7577094857557097e-10, + "loss": 0.0366, + "step": 1121 + }, + { + "epoch": 1.99644128113879, + "grad_norm": 1.0673822700368407, + "learning_rate": 7.812067587487093e-11, + "loss": 0.0326, + "step": 1122 + }, + { + "epoch": 1.998220640569395, + "grad_norm": 0.813539782832218, + "learning_rate": 1.9530207111539967e-11, + "loss": 0.0233, + "step": 1123 + }, + { + "epoch": 2.0, + "grad_norm": 0.5845581052680466, + "learning_rate": 0.0, + "loss": 0.0194, + "step": 1124 + }, + { + "epoch": 2.0, + "step": 1124, + "total_flos": 11497408118784.0, + "train_loss": 0.08073598971703086, + "train_runtime": 2558.5077, + "train_samples_per_second": 3.512, + "train_steps_per_second": 0.439 + } + ], + "logging_steps": 1, + "max_steps": 1124, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 2000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 11497408118784.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}