{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 200,
"global_step": 1124,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0017793594306049821,
"grad_norm": 2.3150479490250784,
"learning_rate": 9.99998046979289e-06,
"loss": 0.122,
"step": 1
},
{
"epoch": 0.0035587188612099642,
"grad_norm": 1.8939108981994268,
"learning_rate": 9.999921879324127e-06,
"loss": 0.0978,
"step": 2
},
{
"epoch": 0.005338078291814947,
"grad_norm": 2.2313302724289206,
"learning_rate": 9.999824229051425e-06,
"loss": 0.1139,
"step": 3
},
{
"epoch": 0.0071174377224199285,
"grad_norm": 2.147995036567701,
"learning_rate": 9.999687519737639e-06,
"loss": 0.124,
"step": 4
},
{
"epoch": 0.008896797153024912,
"grad_norm": 2.07600425424386,
"learning_rate": 9.99951175245075e-06,
"loss": 0.0982,
"step": 5
},
{
"epoch": 0.010676156583629894,
"grad_norm": 2.236609634956885,
"learning_rate": 9.999296928563868e-06,
"loss": 0.128,
"step": 6
},
{
"epoch": 0.012455516014234875,
"grad_norm": 2.46718595601003,
"learning_rate": 9.999043049755216e-06,
"loss": 0.1348,
"step": 7
},
{
"epoch": 0.014234875444839857,
"grad_norm": 1.5643536190598661,
"learning_rate": 9.998750118008117e-06,
"loss": 0.0829,
"step": 8
},
{
"epoch": 0.01601423487544484,
"grad_norm": 1.6749555322656917,
"learning_rate": 9.998418135610974e-06,
"loss": 0.0875,
"step": 9
},
{
"epoch": 0.017793594306049824,
"grad_norm": 2.102986051425079,
"learning_rate": 9.998047105157265e-06,
"loss": 0.1433,
"step": 10
},
{
"epoch": 0.019572953736654804,
"grad_norm": 1.6806961724301148,
"learning_rate": 9.997637029545509e-06,
"loss": 0.0864,
"step": 11
},
{
"epoch": 0.021352313167259787,
"grad_norm": 2.3501140885281218,
"learning_rate": 9.997187911979252e-06,
"loss": 0.1306,
"step": 12
},
{
"epoch": 0.023131672597864767,
"grad_norm": 2.2011534009760183,
"learning_rate": 9.996699755967035e-06,
"loss": 0.1418,
"step": 13
},
{
"epoch": 0.02491103202846975,
"grad_norm": 1.834656075860282,
"learning_rate": 9.996172565322375e-06,
"loss": 0.1054,
"step": 14
},
{
"epoch": 0.026690391459074734,
"grad_norm": 1.8702617116201332,
"learning_rate": 9.995606344163728e-06,
"loss": 0.1164,
"step": 15
},
{
"epoch": 0.028469750889679714,
"grad_norm": 1.4133556400197504,
"learning_rate": 9.995001096914462e-06,
"loss": 0.0937,
"step": 16
},
{
"epoch": 0.030249110320284697,
"grad_norm": 1.5517785688670929,
"learning_rate": 9.994356828302818e-06,
"loss": 0.1093,
"step": 17
},
{
"epoch": 0.03202846975088968,
"grad_norm": 1.3391448776499155,
"learning_rate": 9.993673543361874e-06,
"loss": 0.0953,
"step": 18
},
{
"epoch": 0.033807829181494664,
"grad_norm": 2.0203435862014003,
"learning_rate": 9.992951247429512e-06,
"loss": 0.1585,
"step": 19
},
{
"epoch": 0.03558718861209965,
"grad_norm": 1.415383314135429,
"learning_rate": 9.992189946148366e-06,
"loss": 0.0919,
"step": 20
},
{
"epoch": 0.037366548042704624,
"grad_norm": 1.1271360275786408,
"learning_rate": 9.991389645465786e-06,
"loss": 0.0756,
"step": 21
},
{
"epoch": 0.03914590747330961,
"grad_norm": 1.7287049496025613,
"learning_rate": 9.990550351633784e-06,
"loss": 0.1279,
"step": 22
},
{
"epoch": 0.04092526690391459,
"grad_norm": 1.454809662757122,
"learning_rate": 9.989672071208993e-06,
"loss": 0.1177,
"step": 23
},
{
"epoch": 0.042704626334519574,
"grad_norm": 1.8333181806129868,
"learning_rate": 9.988754811052616e-06,
"loss": 0.1488,
"step": 24
},
{
"epoch": 0.04448398576512456,
"grad_norm": 1.821106896278666,
"learning_rate": 9.987798578330365e-06,
"loss": 0.1426,
"step": 25
},
{
"epoch": 0.046263345195729534,
"grad_norm": 1.586635078482126,
"learning_rate": 9.986803380512406e-06,
"loss": 0.1186,
"step": 26
},
{
"epoch": 0.04804270462633452,
"grad_norm": 1.5546514668832576,
"learning_rate": 9.98576922537331e-06,
"loss": 0.1179,
"step": 27
},
{
"epoch": 0.0498220640569395,
"grad_norm": 1.613267015140376,
"learning_rate": 9.984696120991979e-06,
"loss": 0.1139,
"step": 28
},
{
"epoch": 0.051601423487544484,
"grad_norm": 2.4908063051717044,
"learning_rate": 9.983584075751598e-06,
"loss": 0.1268,
"step": 29
},
{
"epoch": 0.05338078291814947,
"grad_norm": 1.5857689757989275,
"learning_rate": 9.982433098339553e-06,
"loss": 0.1195,
"step": 30
},
{
"epoch": 0.05516014234875445,
"grad_norm": 2.0410218934139643,
"learning_rate": 9.981243197747375e-06,
"loss": 0.1461,
"step": 31
},
{
"epoch": 0.05693950177935943,
"grad_norm": 2.6117564459399083,
"learning_rate": 9.980014383270668e-06,
"loss": 0.1701,
"step": 32
},
{
"epoch": 0.05871886120996441,
"grad_norm": 1.66690519128048,
"learning_rate": 9.978746664509032e-06,
"loss": 0.1373,
"step": 33
},
{
"epoch": 0.060498220640569395,
"grad_norm": 1.4077261838750643,
"learning_rate": 9.97744005136599e-06,
"loss": 0.0957,
"step": 34
},
{
"epoch": 0.06227758007117438,
"grad_norm": 1.4663691854524623,
"learning_rate": 9.976094554048912e-06,
"loss": 0.1129,
"step": 35
},
{
"epoch": 0.06405693950177936,
"grad_norm": 1.7723259431724423,
"learning_rate": 9.974710183068935e-06,
"loss": 0.1218,
"step": 36
},
{
"epoch": 0.06583629893238434,
"grad_norm": 1.4100491042718053,
"learning_rate": 9.97328694924088e-06,
"loss": 0.1026,
"step": 37
},
{
"epoch": 0.06761565836298933,
"grad_norm": 1.605871591007416,
"learning_rate": 9.971824863683168e-06,
"loss": 0.126,
"step": 38
},
{
"epoch": 0.0693950177935943,
"grad_norm": 1.6815314399777885,
"learning_rate": 9.970323937817732e-06,
"loss": 0.1195,
"step": 39
},
{
"epoch": 0.0711743772241993,
"grad_norm": 1.6559595335176767,
"learning_rate": 9.968784183369929e-06,
"loss": 0.1109,
"step": 40
},
{
"epoch": 0.07295373665480427,
"grad_norm": 1.665846243493975,
"learning_rate": 9.96720561236845e-06,
"loss": 0.127,
"step": 41
},
{
"epoch": 0.07473309608540925,
"grad_norm": 1.7460396014674455,
"learning_rate": 9.965588237145219e-06,
"loss": 0.1236,
"step": 42
},
{
"epoch": 0.07651245551601424,
"grad_norm": 1.5742329771492725,
"learning_rate": 9.963932070335307e-06,
"loss": 0.1195,
"step": 43
},
{
"epoch": 0.07829181494661921,
"grad_norm": 1.6055445888172124,
"learning_rate": 9.962237124876828e-06,
"loss": 0.1119,
"step": 44
},
{
"epoch": 0.0800711743772242,
"grad_norm": 1.5925472414645612,
"learning_rate": 9.960503414010833e-06,
"loss": 0.1229,
"step": 45
},
{
"epoch": 0.08185053380782918,
"grad_norm": 1.726125819903223,
"learning_rate": 9.958730951281218e-06,
"loss": 0.1299,
"step": 46
},
{
"epoch": 0.08362989323843416,
"grad_norm": 1.7872198307678588,
"learning_rate": 9.956919750534607e-06,
"loss": 0.1268,
"step": 47
},
{
"epoch": 0.08540925266903915,
"grad_norm": 1.7461267629761665,
"learning_rate": 9.955069825920249e-06,
"loss": 0.1354,
"step": 48
},
{
"epoch": 0.08718861209964412,
"grad_norm": 1.5052492583066552,
"learning_rate": 9.953181191889913e-06,
"loss": 0.1229,
"step": 49
},
{
"epoch": 0.08896797153024912,
"grad_norm": 1.3366981185871216,
"learning_rate": 9.95125386319776e-06,
"loss": 0.1006,
"step": 50
},
{
"epoch": 0.09074733096085409,
"grad_norm": 2.0856819842733745,
"learning_rate": 9.949287854900243e-06,
"loss": 0.1538,
"step": 51
},
{
"epoch": 0.09252669039145907,
"grad_norm": 1.77197422683052,
"learning_rate": 9.947283182355982e-06,
"loss": 0.1285,
"step": 52
},
{
"epoch": 0.09430604982206406,
"grad_norm": 1.535942883793826,
"learning_rate": 9.945239861225644e-06,
"loss": 0.1205,
"step": 53
},
{
"epoch": 0.09608540925266904,
"grad_norm": 1.4327747010188872,
"learning_rate": 9.943157907471825e-06,
"loss": 0.1143,
"step": 54
},
{
"epoch": 0.09786476868327403,
"grad_norm": 1.2169464338940064,
"learning_rate": 9.941037337358918e-06,
"loss": 0.0863,
"step": 55
},
{
"epoch": 0.099644128113879,
"grad_norm": 1.6994774690848435,
"learning_rate": 9.938878167452991e-06,
"loss": 0.1319,
"step": 56
},
{
"epoch": 0.10142348754448399,
"grad_norm": 1.851572606439269,
"learning_rate": 9.936680414621663e-06,
"loss": 0.1201,
"step": 57
},
{
"epoch": 0.10320284697508897,
"grad_norm": 1.3210638554640264,
"learning_rate": 9.934444096033958e-06,
"loss": 0.0966,
"step": 58
},
{
"epoch": 0.10498220640569395,
"grad_norm": 1.3982161782311981,
"learning_rate": 9.932169229160183e-06,
"loss": 0.1185,
"step": 59
},
{
"epoch": 0.10676156583629894,
"grad_norm": 1.6551177125917558,
"learning_rate": 9.929855831771787e-06,
"loss": 0.1222,
"step": 60
},
{
"epoch": 0.10854092526690391,
"grad_norm": 1.5820583104836863,
"learning_rate": 9.927503921941218e-06,
"loss": 0.1175,
"step": 61
},
{
"epoch": 0.1103202846975089,
"grad_norm": 1.6447037739593402,
"learning_rate": 9.925113518041796e-06,
"loss": 0.1457,
"step": 62
},
{
"epoch": 0.11209964412811388,
"grad_norm": 1.8341557719751869,
"learning_rate": 9.922684638747551e-06,
"loss": 0.1761,
"step": 63
},
{
"epoch": 0.11387900355871886,
"grad_norm": 1.4609850441682664,
"learning_rate": 9.920217303033091e-06,
"loss": 0.1239,
"step": 64
},
{
"epoch": 0.11565836298932385,
"grad_norm": 1.7373126826984115,
"learning_rate": 9.917711530173444e-06,
"loss": 0.1248,
"step": 65
},
{
"epoch": 0.11743772241992882,
"grad_norm": 1.8252103368149044,
"learning_rate": 9.91516733974392e-06,
"loss": 0.129,
"step": 66
},
{
"epoch": 0.11921708185053381,
"grad_norm": 1.466618243317653,
"learning_rate": 9.912584751619943e-06,
"loss": 0.134,
"step": 67
},
{
"epoch": 0.12099644128113879,
"grad_norm": 1.3574022691057386,
"learning_rate": 9.909963785976902e-06,
"loss": 0.114,
"step": 68
},
{
"epoch": 0.12277580071174377,
"grad_norm": 1.3820161583631567,
"learning_rate": 9.907304463290004e-06,
"loss": 0.1136,
"step": 69
},
{
"epoch": 0.12455516014234876,
"grad_norm": 1.2947853990115923,
"learning_rate": 9.904606804334094e-06,
"loss": 0.1003,
"step": 70
},
{
"epoch": 0.12633451957295375,
"grad_norm": 1.4777665246391947,
"learning_rate": 9.901870830183506e-06,
"loss": 0.1301,
"step": 71
},
{
"epoch": 0.12811387900355872,
"grad_norm": 1.4865395343985397,
"learning_rate": 9.899096562211902e-06,
"loss": 0.128,
"step": 72
},
{
"epoch": 0.1298932384341637,
"grad_norm": 1.70322923018481,
"learning_rate": 9.896284022092088e-06,
"loss": 0.1537,
"step": 73
},
{
"epoch": 0.13167259786476868,
"grad_norm": 1.986776256607827,
"learning_rate": 9.893433231795864e-06,
"loss": 0.1749,
"step": 74
},
{
"epoch": 0.13345195729537365,
"grad_norm": 1.9833725532011965,
"learning_rate": 9.890544213593838e-06,
"loss": 0.1536,
"step": 75
},
{
"epoch": 0.13523131672597866,
"grad_norm": 1.6162836428207408,
"learning_rate": 9.887616990055262e-06,
"loss": 0.1361,
"step": 76
},
{
"epoch": 0.13701067615658363,
"grad_norm": 1.8798472208522492,
"learning_rate": 9.884651584047845e-06,
"loss": 0.1427,
"step": 77
},
{
"epoch": 0.1387900355871886,
"grad_norm": 1.6316750517814012,
"learning_rate": 9.881648018737587e-06,
"loss": 0.148,
"step": 78
},
{
"epoch": 0.14056939501779359,
"grad_norm": 1.4973899136660551,
"learning_rate": 9.878606317588588e-06,
"loss": 0.113,
"step": 79
},
{
"epoch": 0.1423487544483986,
"grad_norm": 2.085780240364436,
"learning_rate": 9.875526504362868e-06,
"loss": 0.1764,
"step": 80
},
{
"epoch": 0.14412811387900357,
"grad_norm": 1.7904131094985867,
"learning_rate": 9.872408603120187e-06,
"loss": 0.1559,
"step": 81
},
{
"epoch": 0.14590747330960854,
"grad_norm": 1.4759874941535067,
"learning_rate": 9.869252638217846e-06,
"loss": 0.115,
"step": 82
},
{
"epoch": 0.14768683274021352,
"grad_norm": 1.4156029940928385,
"learning_rate": 9.866058634310503e-06,
"loss": 0.1303,
"step": 83
},
{
"epoch": 0.1494661921708185,
"grad_norm": 3.022669330694427,
"learning_rate": 9.862826616349981e-06,
"loss": 0.1232,
"step": 84
},
{
"epoch": 0.1512455516014235,
"grad_norm": 1.6187399937545661,
"learning_rate": 9.859556609585075e-06,
"loss": 0.1416,
"step": 85
},
{
"epoch": 0.15302491103202848,
"grad_norm": 1.3579268088366394,
"learning_rate": 9.856248639561346e-06,
"loss": 0.1058,
"step": 86
},
{
"epoch": 0.15480427046263345,
"grad_norm": 1.656725551724757,
"learning_rate": 9.85290273212093e-06,
"loss": 0.1262,
"step": 87
},
{
"epoch": 0.15658362989323843,
"grad_norm": 1.649686322768587,
"learning_rate": 9.849518913402334e-06,
"loss": 0.1248,
"step": 88
},
{
"epoch": 0.1583629893238434,
"grad_norm": 1.4682326240875552,
"learning_rate": 9.84609720984023e-06,
"loss": 0.1228,
"step": 89
},
{
"epoch": 0.1601423487544484,
"grad_norm": 1.5211842011834333,
"learning_rate": 9.84263764816525e-06,
"loss": 0.1165,
"step": 90
},
{
"epoch": 0.1619217081850534,
"grad_norm": 1.374933263593323,
"learning_rate": 9.839140255403776e-06,
"loss": 0.1069,
"step": 91
},
{
"epoch": 0.16370106761565836,
"grad_norm": 1.4112889073693102,
"learning_rate": 9.83560505887773e-06,
"loss": 0.1174,
"step": 92
},
{
"epoch": 0.16548042704626334,
"grad_norm": 1.5338525971406016,
"learning_rate": 9.83203208620436e-06,
"loss": 0.1246,
"step": 93
},
{
"epoch": 0.16725978647686832,
"grad_norm": 1.7319956110286392,
"learning_rate": 9.828421365296023e-06,
"loss": 0.1309,
"step": 94
},
{
"epoch": 0.16903914590747332,
"grad_norm": 1.6459346421565015,
"learning_rate": 9.824772924359974e-06,
"loss": 0.1303,
"step": 95
},
{
"epoch": 0.1708185053380783,
"grad_norm": 1.461599854566481,
"learning_rate": 9.821086791898133e-06,
"loss": 0.1146,
"step": 96
},
{
"epoch": 0.17259786476868327,
"grad_norm": 1.2898506418812963,
"learning_rate": 9.817362996706872e-06,
"loss": 0.1268,
"step": 97
},
{
"epoch": 0.17437722419928825,
"grad_norm": 1.2574991907771362,
"learning_rate": 9.81360156787679e-06,
"loss": 0.0982,
"step": 98
},
{
"epoch": 0.17615658362989323,
"grad_norm": 1.2818741796501145,
"learning_rate": 9.809802534792477e-06,
"loss": 0.1101,
"step": 99
},
{
"epoch": 0.17793594306049823,
"grad_norm": 1.5340131341968406,
"learning_rate": 9.805965927132294e-06,
"loss": 0.1417,
"step": 100
},
{
"epoch": 0.1797153024911032,
"grad_norm": 1.3301864585315522,
"learning_rate": 9.802091774868143e-06,
"loss": 0.1186,
"step": 101
},
{
"epoch": 0.18149466192170818,
"grad_norm": 1.2920359977860623,
"learning_rate": 9.798180108265218e-06,
"loss": 0.1269,
"step": 102
},
{
"epoch": 0.18327402135231316,
"grad_norm": 1.1755565875809384,
"learning_rate": 9.794230957881785e-06,
"loss": 0.0911,
"step": 103
},
{
"epoch": 0.18505338078291814,
"grad_norm": 1.6520857630595822,
"learning_rate": 9.79024435456893e-06,
"loss": 0.1298,
"step": 104
},
{
"epoch": 0.18683274021352314,
"grad_norm": 1.5760599162246887,
"learning_rate": 9.786220329470334e-06,
"loss": 0.1267,
"step": 105
},
{
"epoch": 0.18861209964412812,
"grad_norm": 1.3518661794431308,
"learning_rate": 9.782158914022011e-06,
"loss": 0.1171,
"step": 106
},
{
"epoch": 0.1903914590747331,
"grad_norm": 1.5440563375880543,
"learning_rate": 9.778060139952075e-06,
"loss": 0.1394,
"step": 107
},
{
"epoch": 0.19217081850533807,
"grad_norm": 1.4630462324701143,
"learning_rate": 9.773924039280488e-06,
"loss": 0.1268,
"step": 108
},
{
"epoch": 0.19395017793594305,
"grad_norm": 1.0316882466769741,
"learning_rate": 9.769750644318814e-06,
"loss": 0.0804,
"step": 109
},
{
"epoch": 0.19572953736654805,
"grad_norm": 1.6630328401543009,
"learning_rate": 9.765539987669956e-06,
"loss": 0.1238,
"step": 110
},
{
"epoch": 0.19750889679715303,
"grad_norm": 1.9423875980674232,
"learning_rate": 9.761292102227917e-06,
"loss": 0.1492,
"step": 111
},
{
"epoch": 0.199288256227758,
"grad_norm": 1.8260444289563744,
"learning_rate": 9.757007021177529e-06,
"loss": 0.162,
"step": 112
},
{
"epoch": 0.20106761565836298,
"grad_norm": 1.2266156380036612,
"learning_rate": 9.752684777994197e-06,
"loss": 0.1074,
"step": 113
},
{
"epoch": 0.20284697508896798,
"grad_norm": 1.7197909934835105,
"learning_rate": 9.748325406443647e-06,
"loss": 0.1435,
"step": 114
},
{
"epoch": 0.20462633451957296,
"grad_norm": 1.5438074853803372,
"learning_rate": 9.743928940581646e-06,
"loss": 0.1354,
"step": 115
},
{
"epoch": 0.20640569395017794,
"grad_norm": 1.787303992084634,
"learning_rate": 9.739495414753754e-06,
"loss": 0.1702,
"step": 116
},
{
"epoch": 0.20818505338078291,
"grad_norm": 1.3361322549878205,
"learning_rate": 9.73502486359504e-06,
"loss": 0.1204,
"step": 117
},
{
"epoch": 0.2099644128113879,
"grad_norm": 1.8634116877054288,
"learning_rate": 9.73051732202982e-06,
"loss": 0.1503,
"step": 118
},
{
"epoch": 0.2117437722419929,
"grad_norm": 1.3209685505485607,
"learning_rate": 9.725972825271381e-06,
"loss": 0.1243,
"step": 119
},
{
"epoch": 0.21352313167259787,
"grad_norm": 1.3706471704388294,
"learning_rate": 9.721391408821713e-06,
"loss": 0.1188,
"step": 120
},
{
"epoch": 0.21530249110320285,
"grad_norm": 1.4324014907533043,
"learning_rate": 9.716773108471213e-06,
"loss": 0.1407,
"step": 121
},
{
"epoch": 0.21708185053380782,
"grad_norm": 1.2654918357754823,
"learning_rate": 9.712117960298433e-06,
"loss": 0.1244,
"step": 122
},
{
"epoch": 0.2188612099644128,
"grad_norm": 1.442113627072885,
"learning_rate": 9.707426000669773e-06,
"loss": 0.1237,
"step": 123
},
{
"epoch": 0.2206405693950178,
"grad_norm": 1.4969482774761218,
"learning_rate": 9.702697266239211e-06,
"loss": 0.1321,
"step": 124
},
{
"epoch": 0.22241992882562278,
"grad_norm": 1.700958655143985,
"learning_rate": 9.697931793948012e-06,
"loss": 0.1601,
"step": 125
},
{
"epoch": 0.22419928825622776,
"grad_norm": 1.2124183722997646,
"learning_rate": 9.693129621024441e-06,
"loss": 0.1201,
"step": 126
},
{
"epoch": 0.22597864768683273,
"grad_norm": 1.373929114776768,
"learning_rate": 9.68829078498347e-06,
"loss": 0.126,
"step": 127
},
{
"epoch": 0.2277580071174377,
"grad_norm": 1.6101541653048022,
"learning_rate": 9.683415323626487e-06,
"loss": 0.1356,
"step": 128
},
{
"epoch": 0.22953736654804271,
"grad_norm": 1.6548250356852086,
"learning_rate": 9.678503275040997e-06,
"loss": 0.1363,
"step": 129
},
{
"epoch": 0.2313167259786477,
"grad_norm": 1.3341015541239039,
"learning_rate": 9.673554677600336e-06,
"loss": 0.1264,
"step": 130
},
{
"epoch": 0.23309608540925267,
"grad_norm": 1.4498445900696748,
"learning_rate": 9.668569569963355e-06,
"loss": 0.129,
"step": 131
},
{
"epoch": 0.23487544483985764,
"grad_norm": 1.0216967537453667,
"learning_rate": 9.663547991074129e-06,
"loss": 0.0887,
"step": 132
},
{
"epoch": 0.23665480427046262,
"grad_norm": 1.5556609913797053,
"learning_rate": 9.658489980161643e-06,
"loss": 0.1288,
"step": 133
},
{
"epoch": 0.23843416370106763,
"grad_norm": 1.4639954349834507,
"learning_rate": 9.653395576739504e-06,
"loss": 0.1348,
"step": 134
},
{
"epoch": 0.2402135231316726,
"grad_norm": 1.560363889533299,
"learning_rate": 9.648264820605611e-06,
"loss": 0.126,
"step": 135
},
{
"epoch": 0.24199288256227758,
"grad_norm": 1.4265445896347981,
"learning_rate": 9.643097751841854e-06,
"loss": 0.1614,
"step": 136
},
{
"epoch": 0.24377224199288255,
"grad_norm": 1.3866143965138966,
"learning_rate": 9.637894410813803e-06,
"loss": 0.1271,
"step": 137
},
{
"epoch": 0.24555160142348753,
"grad_norm": 1.7617394024609352,
"learning_rate": 9.632654838170393e-06,
"loss": 0.1389,
"step": 138
},
{
"epoch": 0.24733096085409254,
"grad_norm": 1.800709153860104,
"learning_rate": 9.627379074843595e-06,
"loss": 0.1463,
"step": 139
},
{
"epoch": 0.2491103202846975,
"grad_norm": 1.5087034875671088,
"learning_rate": 9.622067162048111e-06,
"loss": 0.1355,
"step": 140
},
{
"epoch": 0.2508896797153025,
"grad_norm": 1.3010872178165098,
"learning_rate": 9.616719141281044e-06,
"loss": 0.1247,
"step": 141
},
{
"epoch": 0.2526690391459075,
"grad_norm": 1.693733824105587,
"learning_rate": 9.611335054321576e-06,
"loss": 0.1512,
"step": 142
},
{
"epoch": 0.25444839857651247,
"grad_norm": 1.7009155877264104,
"learning_rate": 9.605914943230637e-06,
"loss": 0.139,
"step": 143
},
{
"epoch": 0.25622775800711745,
"grad_norm": 1.2598989725965244,
"learning_rate": 9.600458850350588e-06,
"loss": 0.1015,
"step": 144
},
{
"epoch": 0.2580071174377224,
"grad_norm": 1.702225655277758,
"learning_rate": 9.594966818304875e-06,
"loss": 0.148,
"step": 145
},
{
"epoch": 0.2597864768683274,
"grad_norm": 1.5513490978514006,
"learning_rate": 9.589438889997712e-06,
"loss": 0.1128,
"step": 146
},
{
"epoch": 0.2615658362989324,
"grad_norm": 1.2744707776066644,
"learning_rate": 9.583875108613727e-06,
"loss": 0.1215,
"step": 147
},
{
"epoch": 0.26334519572953735,
"grad_norm": 1.3135762575973189,
"learning_rate": 9.578275517617646e-06,
"loss": 0.1236,
"step": 148
},
{
"epoch": 0.26512455516014233,
"grad_norm": 1.4288843773619255,
"learning_rate": 9.572640160753936e-06,
"loss": 0.125,
"step": 149
},
{
"epoch": 0.2669039145907473,
"grad_norm": 1.3282205635766462,
"learning_rate": 9.566969082046471e-06,
"loss": 0.1341,
"step": 150
},
{
"epoch": 0.26868327402135234,
"grad_norm": 1.1244630347090068,
"learning_rate": 9.561262325798188e-06,
"loss": 0.0983,
"step": 151
},
{
"epoch": 0.2704626334519573,
"grad_norm": 1.2707269972012794,
"learning_rate": 9.555519936590739e-06,
"loss": 0.1042,
"step": 152
},
{
"epoch": 0.2722419928825623,
"grad_norm": 1.186406870071293,
"learning_rate": 9.549741959284147e-06,
"loss": 0.107,
"step": 153
},
{
"epoch": 0.27402135231316727,
"grad_norm": 1.355372382394004,
"learning_rate": 9.543928439016445e-06,
"loss": 0.1293,
"step": 154
},
{
"epoch": 0.27580071174377224,
"grad_norm": 1.2854825846015958,
"learning_rate": 9.538079421203339e-06,
"loss": 0.1169,
"step": 155
},
{
"epoch": 0.2775800711743772,
"grad_norm": 1.342421664104859,
"learning_rate": 9.532194951537838e-06,
"loss": 0.119,
"step": 156
},
{
"epoch": 0.2793594306049822,
"grad_norm": 1.4275853921403812,
"learning_rate": 9.52627507598991e-06,
"loss": 0.1369,
"step": 157
},
{
"epoch": 0.28113879003558717,
"grad_norm": 1.1603688300775958,
"learning_rate": 9.52031984080611e-06,
"loss": 0.105,
"step": 158
},
{
"epoch": 0.28291814946619215,
"grad_norm": 1.2073570045612327,
"learning_rate": 9.514329292509227e-06,
"loss": 0.1002,
"step": 159
},
{
"epoch": 0.2846975088967972,
"grad_norm": 1.3272114836931819,
"learning_rate": 9.508303477897925e-06,
"loss": 0.1128,
"step": 160
},
{
"epoch": 0.28647686832740216,
"grad_norm": 1.5666821304561709,
"learning_rate": 9.502242444046365e-06,
"loss": 0.1309,
"step": 161
},
{
"epoch": 0.28825622775800713,
"grad_norm": 1.4794896082398679,
"learning_rate": 9.496146238303846e-06,
"loss": 0.1416,
"step": 162
},
{
"epoch": 0.2900355871886121,
"grad_norm": 1.5725933662263583,
"learning_rate": 9.49001490829443e-06,
"loss": 0.134,
"step": 163
},
{
"epoch": 0.2918149466192171,
"grad_norm": 1.7978657636477746,
"learning_rate": 9.483848501916578e-06,
"loss": 0.1656,
"step": 164
},
{
"epoch": 0.29359430604982206,
"grad_norm": 1.5385861975715274,
"learning_rate": 9.477647067342766e-06,
"loss": 0.1445,
"step": 165
},
{
"epoch": 0.29537366548042704,
"grad_norm": 1.522698741848138,
"learning_rate": 9.471410653019115e-06,
"loss": 0.1312,
"step": 166
},
{
"epoch": 0.297153024911032,
"grad_norm": 1.410495428928871,
"learning_rate": 9.46513930766501e-06,
"loss": 0.134,
"step": 167
},
{
"epoch": 0.298932384341637,
"grad_norm": 1.4040568466318382,
"learning_rate": 9.458833080272723e-06,
"loss": 0.1155,
"step": 168
},
{
"epoch": 0.30071174377224197,
"grad_norm": 1.4830902237059056,
"learning_rate": 9.45249202010702e-06,
"loss": 0.1206,
"step": 169
},
{
"epoch": 0.302491103202847,
"grad_norm": 1.4974950883468803,
"learning_rate": 9.446116176704791e-06,
"loss": 0.1315,
"step": 170
},
{
"epoch": 0.304270462633452,
"grad_norm": 1.4862268443324185,
"learning_rate": 9.439705599874653e-06,
"loss": 0.1257,
"step": 171
},
{
"epoch": 0.30604982206405695,
"grad_norm": 1.4722719069791255,
"learning_rate": 9.433260339696564e-06,
"loss": 0.1502,
"step": 172
},
{
"epoch": 0.30782918149466193,
"grad_norm": 1.5787824797877084,
"learning_rate": 9.426780446521429e-06,
"loss": 0.1414,
"step": 173
},
{
"epoch": 0.3096085409252669,
"grad_norm": 1.49817799896776,
"learning_rate": 9.42026597097071e-06,
"loss": 0.1369,
"step": 174
},
{
"epoch": 0.3113879003558719,
"grad_norm": 1.270997812099595,
"learning_rate": 9.413716963936033e-06,
"loss": 0.1123,
"step": 175
},
{
"epoch": 0.31316725978647686,
"grad_norm": 1.3577743246112806,
"learning_rate": 9.407133476578778e-06,
"loss": 0.1268,
"step": 176
},
{
"epoch": 0.31494661921708184,
"grad_norm": 1.6811814689000206,
"learning_rate": 9.400515560329698e-06,
"loss": 0.1591,
"step": 177
},
{
"epoch": 0.3167259786476868,
"grad_norm": 1.62265306436807,
"learning_rate": 9.393863266888501e-06,
"loss": 0.1249,
"step": 178
},
{
"epoch": 0.3185053380782918,
"grad_norm": 1.6559844780644963,
"learning_rate": 9.387176648223457e-06,
"loss": 0.1351,
"step": 179
},
{
"epoch": 0.3202846975088968,
"grad_norm": 1.5052377798410623,
"learning_rate": 9.38045575657098e-06,
"loss": 0.1286,
"step": 180
},
{
"epoch": 0.3220640569395018,
"grad_norm": 1.6097855117146187,
"learning_rate": 9.37370064443524e-06,
"loss": 0.1309,
"step": 181
},
{
"epoch": 0.3238434163701068,
"grad_norm": 1.1440848783853197,
"learning_rate": 9.366911364587726e-06,
"loss": 0.1013,
"step": 182
},
{
"epoch": 0.32562277580071175,
"grad_norm": 1.3468165746555294,
"learning_rate": 9.360087970066854e-06,
"loss": 0.1135,
"step": 183
},
{
"epoch": 0.3274021352313167,
"grad_norm": 1.2737294113762112,
"learning_rate": 9.353230514177553e-06,
"loss": 0.1071,
"step": 184
},
{
"epoch": 0.3291814946619217,
"grad_norm": 1.2633648785188174,
"learning_rate": 9.346339050490832e-06,
"loss": 0.1124,
"step": 185
},
{
"epoch": 0.3309608540925267,
"grad_norm": 1.6848943476461669,
"learning_rate": 9.33941363284338e-06,
"loss": 0.1412,
"step": 186
},
{
"epoch": 0.33274021352313166,
"grad_norm": 1.4478447540009096,
"learning_rate": 9.332454315337129e-06,
"loss": 0.1237,
"step": 187
},
{
"epoch": 0.33451957295373663,
"grad_norm": 1.5163911183865801,
"learning_rate": 9.325461152338846e-06,
"loss": 0.119,
"step": 188
},
{
"epoch": 0.33629893238434166,
"grad_norm": 1.4243691207294058,
"learning_rate": 9.3184341984797e-06,
"loss": 0.1313,
"step": 189
},
{
"epoch": 0.33807829181494664,
"grad_norm": 1.5754316661446959,
"learning_rate": 9.311373508654838e-06,
"loss": 0.143,
"step": 190
},
{
"epoch": 0.3398576512455516,
"grad_norm": 1.7595561412101806,
"learning_rate": 9.30427913802295e-06,
"loss": 0.1664,
"step": 191
},
{
"epoch": 0.3416370106761566,
"grad_norm": 1.5305055081563157,
"learning_rate": 9.297151142005852e-06,
"loss": 0.1441,
"step": 192
},
{
"epoch": 0.34341637010676157,
"grad_norm": 1.3894474006237214,
"learning_rate": 9.289989576288035e-06,
"loss": 0.1277,
"step": 193
},
{
"epoch": 0.34519572953736655,
"grad_norm": 1.6318584188727645,
"learning_rate": 9.282794496816244e-06,
"loss": 0.1396,
"step": 194
},
{
"epoch": 0.3469750889679715,
"grad_norm": 1.5744858909539523,
"learning_rate": 9.27556595979904e-06,
"loss": 0.1304,
"step": 195
},
{
"epoch": 0.3487544483985765,
"grad_norm": 1.445329684282647,
"learning_rate": 9.26830402170635e-06,
"loss": 0.1362,
"step": 196
},
{
"epoch": 0.3505338078291815,
"grad_norm": 1.270020121730852,
"learning_rate": 9.261008739269035e-06,
"loss": 0.1065,
"step": 197
},
{
"epoch": 0.35231316725978645,
"grad_norm": 1.4766701681456913,
"learning_rate": 9.253680169478448e-06,
"loss": 0.1328,
"step": 198
},
{
"epoch": 0.3540925266903915,
"grad_norm": 1.3450843767451717,
"learning_rate": 9.246318369585983e-06,
"loss": 0.12,
"step": 199
},
{
"epoch": 0.35587188612099646,
"grad_norm": 1.7259434960653937,
"learning_rate": 9.238923397102629e-06,
"loss": 0.1574,
"step": 200
},
{
"epoch": 0.35587188612099646,
"eval_loss": 0.14350858330726624,
"eval_runtime": 7.1569,
"eval_samples_per_second": 6.427,
"eval_steps_per_second": 1.677,
"step": 200
},
{
"epoch": 0.35765124555160144,
"grad_norm": 1.320682336442158,
"learning_rate": 9.231495309798525e-06,
"loss": 0.1144,
"step": 201
},
{
"epoch": 0.3594306049822064,
"grad_norm": 1.6706844104451077,
"learning_rate": 9.224034165702506e-06,
"loss": 0.1476,
"step": 202
},
{
"epoch": 0.3612099644128114,
"grad_norm": 1.2143790241578905,
"learning_rate": 9.216540023101646e-06,
"loss": 0.1125,
"step": 203
},
{
"epoch": 0.36298932384341637,
"grad_norm": 1.2796879356044104,
"learning_rate": 9.209012940540806e-06,
"loss": 0.1147,
"step": 204
},
{
"epoch": 0.36476868327402134,
"grad_norm": 1.1876329454738948,
"learning_rate": 9.20145297682218e-06,
"loss": 0.1237,
"step": 205
},
{
"epoch": 0.3665480427046263,
"grad_norm": 1.8113179178303425,
"learning_rate": 9.193860191004833e-06,
"loss": 0.1627,
"step": 206
},
{
"epoch": 0.3683274021352313,
"grad_norm": 1.3757046323563018,
"learning_rate": 9.186234642404234e-06,
"loss": 0.1425,
"step": 207
},
{
"epoch": 0.3701067615658363,
"grad_norm": 1.5977600885821075,
"learning_rate": 9.178576390591803e-06,
"loss": 0.143,
"step": 208
},
{
"epoch": 0.3718861209964413,
"grad_norm": 1.238765918591716,
"learning_rate": 9.170885495394435e-06,
"loss": 0.1114,
"step": 209
},
{
"epoch": 0.3736654804270463,
"grad_norm": 1.609455822309686,
"learning_rate": 9.16316201689404e-06,
"loss": 0.1301,
"step": 210
},
{
"epoch": 0.37544483985765126,
"grad_norm": 1.6349720466810955,
"learning_rate": 9.155406015427076e-06,
"loss": 0.1472,
"step": 211
},
{
"epoch": 0.37722419928825623,
"grad_norm": 1.5474843867777042,
"learning_rate": 9.147617551584066e-06,
"loss": 0.1233,
"step": 212
},
{
"epoch": 0.3790035587188612,
"grad_norm": 1.3427581892710871,
"learning_rate": 9.139796686209135e-06,
"loss": 0.1452,
"step": 213
},
{
"epoch": 0.3807829181494662,
"grad_norm": 1.2341773880550775,
"learning_rate": 9.131943480399531e-06,
"loss": 0.1161,
"step": 214
},
{
"epoch": 0.38256227758007116,
"grad_norm": 1.2218651328103987,
"learning_rate": 9.124057995505148e-06,
"loss": 0.1171,
"step": 215
},
{
"epoch": 0.38434163701067614,
"grad_norm": 2.0195390399472375,
"learning_rate": 9.11614029312805e-06,
"loss": 0.1916,
"step": 216
},
{
"epoch": 0.3861209964412811,
"grad_norm": 1.6276262849392316,
"learning_rate": 9.108190435121982e-06,
"loss": 0.1412,
"step": 217
},
{
"epoch": 0.3879003558718861,
"grad_norm": 1.5993031713110006,
"learning_rate": 9.100208483591892e-06,
"loss": 0.1371,
"step": 218
},
{
"epoch": 0.3896797153024911,
"grad_norm": 1.9861450557949942,
"learning_rate": 9.092194500893448e-06,
"loss": 0.2089,
"step": 219
},
{
"epoch": 0.3914590747330961,
"grad_norm": 1.1535216112521212,
"learning_rate": 9.084148549632547e-06,
"loss": 0.1138,
"step": 220
},
{
"epoch": 0.3932384341637011,
"grad_norm": 1.3959913981489842,
"learning_rate": 9.076070692664827e-06,
"loss": 0.1338,
"step": 221
},
{
"epoch": 0.39501779359430605,
"grad_norm": 1.4354143564652753,
"learning_rate": 9.067960993095176e-06,
"loss": 0.1239,
"step": 222
},
{
"epoch": 0.39679715302491103,
"grad_norm": 1.5853609477241963,
"learning_rate": 9.059819514277238e-06,
"loss": 0.1387,
"step": 223
},
{
"epoch": 0.398576512455516,
"grad_norm": 1.3681011687168392,
"learning_rate": 9.05164631981292e-06,
"loss": 0.1216,
"step": 224
},
{
"epoch": 0.400355871886121,
"grad_norm": 1.5374602485433884,
"learning_rate": 9.043441473551893e-06,
"loss": 0.1411,
"step": 225
},
{
"epoch": 0.40213523131672596,
"grad_norm": 1.4186030471717734,
"learning_rate": 9.035205039591099e-06,
"loss": 0.122,
"step": 226
},
{
"epoch": 0.40391459074733094,
"grad_norm": 1.6009811486025363,
"learning_rate": 9.02693708227424e-06,
"loss": 0.1353,
"step": 227
},
{
"epoch": 0.40569395017793597,
"grad_norm": 1.2788491645037854,
"learning_rate": 9.018637666191284e-06,
"loss": 0.1385,
"step": 228
},
{
"epoch": 0.40747330960854095,
"grad_norm": 1.5078579713611018,
"learning_rate": 9.010306856177958e-06,
"loss": 0.1513,
"step": 229
},
{
"epoch": 0.4092526690391459,
"grad_norm": 1.4697726498475028,
"learning_rate": 9.001944717315236e-06,
"loss": 0.1608,
"step": 230
},
{
"epoch": 0.4110320284697509,
"grad_norm": 1.5137805508042916,
"learning_rate": 8.993551314928846e-06,
"loss": 0.1453,
"step": 231
},
{
"epoch": 0.4128113879003559,
"grad_norm": 1.2388919480644867,
"learning_rate": 8.985126714588739e-06,
"loss": 0.1065,
"step": 232
},
{
"epoch": 0.41459074733096085,
"grad_norm": 1.306249760881592,
"learning_rate": 8.976670982108591e-06,
"loss": 0.1346,
"step": 233
},
{
"epoch": 0.41637010676156583,
"grad_norm": 1.5891863090400924,
"learning_rate": 8.968184183545285e-06,
"loss": 0.1597,
"step": 234
},
{
"epoch": 0.4181494661921708,
"grad_norm": 1.3973739341602045,
"learning_rate": 8.959666385198396e-06,
"loss": 0.1419,
"step": 235
},
{
"epoch": 0.4199288256227758,
"grad_norm": 1.2536983328151956,
"learning_rate": 8.951117653609666e-06,
"loss": 0.1144,
"step": 236
},
{
"epoch": 0.42170818505338076,
"grad_norm": 1.4199186973139797,
"learning_rate": 8.9425380555625e-06,
"loss": 0.1259,
"step": 237
},
{
"epoch": 0.4234875444839858,
"grad_norm": 0.9220821629683453,
"learning_rate": 8.933927658081423e-06,
"loss": 0.0888,
"step": 238
},
{
"epoch": 0.42526690391459077,
"grad_norm": 1.2827691741083203,
"learning_rate": 8.925286528431578e-06,
"loss": 0.1269,
"step": 239
},
{
"epoch": 0.42704626334519574,
"grad_norm": 1.3177385866979556,
"learning_rate": 8.916614734118184e-06,
"loss": 0.1095,
"step": 240
},
{
"epoch": 0.4288256227758007,
"grad_norm": 1.2056954035414371,
"learning_rate": 8.907912342886016e-06,
"loss": 0.1084,
"step": 241
},
{
"epoch": 0.4306049822064057,
"grad_norm": 1.3815430073888333,
"learning_rate": 8.899179422718877e-06,
"loss": 0.1219,
"step": 242
},
{
"epoch": 0.43238434163701067,
"grad_norm": 1.5443193700355489,
"learning_rate": 8.890416041839061e-06,
"loss": 0.1426,
"step": 243
},
{
"epoch": 0.43416370106761565,
"grad_norm": 1.1746397644812498,
"learning_rate": 8.881622268706825e-06,
"loss": 0.1065,
"step": 244
},
{
"epoch": 0.4359430604982206,
"grad_norm": 1.273586888709326,
"learning_rate": 8.872798172019856e-06,
"loss": 0.1096,
"step": 245
},
{
"epoch": 0.4377224199288256,
"grad_norm": 1.5330627946798503,
"learning_rate": 8.863943820712726e-06,
"loss": 0.148,
"step": 246
},
{
"epoch": 0.4395017793594306,
"grad_norm": 1.4473674740109983,
"learning_rate": 8.855059283956363e-06,
"loss": 0.1614,
"step": 247
},
{
"epoch": 0.4412811387900356,
"grad_norm": 1.3966876553974212,
"learning_rate": 8.8461446311575e-06,
"loss": 0.1188,
"step": 248
},
{
"epoch": 0.4430604982206406,
"grad_norm": 1.2451938008491792,
"learning_rate": 8.837199931958147e-06,
"loss": 0.1107,
"step": 249
},
{
"epoch": 0.44483985765124556,
"grad_norm": 1.4988031830504223,
"learning_rate": 8.828225256235035e-06,
"loss": 0.1432,
"step": 250
},
{
"epoch": 0.44661921708185054,
"grad_norm": 1.5912629518024164,
"learning_rate": 8.819220674099074e-06,
"loss": 0.1464,
"step": 251
},
{
"epoch": 0.4483985765124555,
"grad_norm": 1.2719995730790428,
"learning_rate": 8.810186255894804e-06,
"loss": 0.1157,
"step": 252
},
{
"epoch": 0.4501779359430605,
"grad_norm": 1.4898719921228123,
"learning_rate": 8.801122072199848e-06,
"loss": 0.1274,
"step": 253
},
{
"epoch": 0.45195729537366547,
"grad_norm": 1.142669526813139,
"learning_rate": 8.792028193824364e-06,
"loss": 0.0973,
"step": 254
},
{
"epoch": 0.45373665480427045,
"grad_norm": 1.1597871710395087,
"learning_rate": 8.782904691810478e-06,
"loss": 0.1037,
"step": 255
},
{
"epoch": 0.4555160142348754,
"grad_norm": 1.3046169397046161,
"learning_rate": 8.77375163743175e-06,
"loss": 0.12,
"step": 256
},
{
"epoch": 0.45729537366548045,
"grad_norm": 1.2590302458061433,
"learning_rate": 8.764569102192593e-06,
"loss": 0.1259,
"step": 257
},
{
"epoch": 0.45907473309608543,
"grad_norm": 1.0868758219614008,
"learning_rate": 8.755357157827735e-06,
"loss": 0.0808,
"step": 258
},
{
"epoch": 0.4608540925266904,
"grad_norm": 1.5140149116778991,
"learning_rate": 8.746115876301651e-06,
"loss": 0.1428,
"step": 259
},
{
"epoch": 0.4626334519572954,
"grad_norm": 1.2551774116868955,
"learning_rate": 8.736845329807994e-06,
"loss": 0.1126,
"step": 260
},
{
"epoch": 0.46441281138790036,
"grad_norm": 1.3740429683605675,
"learning_rate": 8.727545590769044e-06,
"loss": 0.1298,
"step": 261
},
{
"epoch": 0.46619217081850534,
"grad_norm": 1.2698932128049305,
"learning_rate": 8.718216731835131e-06,
"loss": 0.124,
"step": 262
},
{
"epoch": 0.4679715302491103,
"grad_norm": 1.4467109733868337,
"learning_rate": 8.708858825884075e-06,
"loss": 0.1551,
"step": 263
},
{
"epoch": 0.4697508896797153,
"grad_norm": 1.0706620357753582,
"learning_rate": 8.699471946020612e-06,
"loss": 0.1037,
"step": 264
},
{
"epoch": 0.47153024911032027,
"grad_norm": 1.3240848753489833,
"learning_rate": 8.690056165575825e-06,
"loss": 0.1201,
"step": 265
},
{
"epoch": 0.47330960854092524,
"grad_norm": 1.248506179235332,
"learning_rate": 8.680611558106571e-06,
"loss": 0.1187,
"step": 266
},
{
"epoch": 0.4750889679715303,
"grad_norm": 1.1915657853085637,
"learning_rate": 8.671138197394907e-06,
"loss": 0.1087,
"step": 267
},
{
"epoch": 0.47686832740213525,
"grad_norm": 1.0739255287858955,
"learning_rate": 8.661636157447511e-06,
"loss": 0.1076,
"step": 268
},
{
"epoch": 0.4786476868327402,
"grad_norm": 2.128310705291851,
"learning_rate": 8.652105512495106e-06,
"loss": 0.1559,
"step": 269
},
{
"epoch": 0.4804270462633452,
"grad_norm": 1.3275428163972811,
"learning_rate": 8.64254633699188e-06,
"loss": 0.1087,
"step": 270
},
{
"epoch": 0.4822064056939502,
"grad_norm": 1.4535231875783747,
"learning_rate": 8.632958705614905e-06,
"loss": 0.1384,
"step": 271
},
{
"epoch": 0.48398576512455516,
"grad_norm": 1.3903926810560971,
"learning_rate": 8.623342693263549e-06,
"loss": 0.124,
"step": 272
},
{
"epoch": 0.48576512455516013,
"grad_norm": 1.5886251019429265,
"learning_rate": 8.6136983750589e-06,
"loss": 0.1284,
"step": 273
},
{
"epoch": 0.4875444839857651,
"grad_norm": 1.3457498653312756,
"learning_rate": 8.604025826343167e-06,
"loss": 0.1277,
"step": 274
},
{
"epoch": 0.4893238434163701,
"grad_norm": 1.6347007230276474,
"learning_rate": 8.594325122679107e-06,
"loss": 0.1473,
"step": 275
},
{
"epoch": 0.49110320284697506,
"grad_norm": 1.312202897258491,
"learning_rate": 8.584596339849419e-06,
"loss": 0.1259,
"step": 276
},
{
"epoch": 0.4928825622775801,
"grad_norm": 1.019172687491143,
"learning_rate": 8.574839553856157e-06,
"loss": 0.106,
"step": 277
},
{
"epoch": 0.49466192170818507,
"grad_norm": 1.641994091224545,
"learning_rate": 8.565054840920145e-06,
"loss": 0.1579,
"step": 278
},
{
"epoch": 0.49644128113879005,
"grad_norm": 1.091593308412249,
"learning_rate": 8.55524227748037e-06,
"loss": 0.0974,
"step": 279
},
{
"epoch": 0.498220640569395,
"grad_norm": 1.35864549898435,
"learning_rate": 8.545401940193392e-06,
"loss": 0.1169,
"step": 280
},
{
"epoch": 0.5,
"grad_norm": 1.2614152792231237,
"learning_rate": 8.535533905932739e-06,
"loss": 0.1181,
"step": 281
},
{
"epoch": 0.501779359430605,
"grad_norm": 1.8405880597862696,
"learning_rate": 8.525638251788312e-06,
"loss": 0.171,
"step": 282
},
{
"epoch": 0.50355871886121,
"grad_norm": 1.4629610482911204,
"learning_rate": 8.515715055065783e-06,
"loss": 0.1373,
"step": 283
},
{
"epoch": 0.505338078291815,
"grad_norm": 1.4437157015001951,
"learning_rate": 8.505764393285985e-06,
"loss": 0.1523,
"step": 284
},
{
"epoch": 0.5071174377224199,
"grad_norm": 1.3335215309491937,
"learning_rate": 8.495786344184314e-06,
"loss": 0.1165,
"step": 285
},
{
"epoch": 0.5088967971530249,
"grad_norm": 1.5284154106030527,
"learning_rate": 8.485780985710113e-06,
"loss": 0.1409,
"step": 286
},
{
"epoch": 0.5106761565836299,
"grad_norm": 1.3810740242991313,
"learning_rate": 8.475748396026074e-06,
"loss": 0.1236,
"step": 287
},
{
"epoch": 0.5124555160142349,
"grad_norm": 1.3954563500504358,
"learning_rate": 8.46568865350762e-06,
"loss": 0.1352,
"step": 288
},
{
"epoch": 0.5142348754448398,
"grad_norm": 1.6775089275988135,
"learning_rate": 8.45560183674229e-06,
"loss": 0.1489,
"step": 289
},
{
"epoch": 0.5160142348754448,
"grad_norm": 1.5352255277145486,
"learning_rate": 8.445488024529133e-06,
"loss": 0.1369,
"step": 290
},
{
"epoch": 0.5177935943060499,
"grad_norm": 1.159621715420468,
"learning_rate": 8.435347295878087e-06,
"loss": 0.0929,
"step": 291
},
{
"epoch": 0.5195729537366548,
"grad_norm": 1.0512067858943523,
"learning_rate": 8.425179730009368e-06,
"loss": 0.0904,
"step": 292
},
{
"epoch": 0.5213523131672598,
"grad_norm": 1.344689361873844,
"learning_rate": 8.41498540635284e-06,
"loss": 0.1209,
"step": 293
},
{
"epoch": 0.5231316725978647,
"grad_norm": 1.523410468566144,
"learning_rate": 8.404764404547404e-06,
"loss": 0.1316,
"step": 294
},
{
"epoch": 0.5249110320284698,
"grad_norm": 1.5411238747103475,
"learning_rate": 8.394516804440374e-06,
"loss": 0.1216,
"step": 295
},
{
"epoch": 0.5266903914590747,
"grad_norm": 1.4807364157036627,
"learning_rate": 8.384242686086848e-06,
"loss": 0.1338,
"step": 296
},
{
"epoch": 0.5284697508896797,
"grad_norm": 1.5544335300649341,
"learning_rate": 8.373942129749094e-06,
"loss": 0.1408,
"step": 297
},
{
"epoch": 0.5302491103202847,
"grad_norm": 1.0951700655618808,
"learning_rate": 8.363615215895908e-06,
"loss": 0.1137,
"step": 298
},
{
"epoch": 0.5320284697508897,
"grad_norm": 1.6691744052416226,
"learning_rate": 8.353262025202e-06,
"loss": 0.1407,
"step": 299
},
{
"epoch": 0.5338078291814946,
"grad_norm": 1.2038366640700635,
"learning_rate": 8.342882638547351e-06,
"loss": 0.0999,
"step": 300
},
{
"epoch": 0.5355871886120996,
"grad_norm": 1.3228328594800742,
"learning_rate": 8.332477137016587e-06,
"loss": 0.1294,
"step": 301
},
{
"epoch": 0.5373665480427047,
"grad_norm": 1.2019400342779778,
"learning_rate": 8.322045601898354e-06,
"loss": 0.1132,
"step": 302
},
{
"epoch": 0.5391459074733096,
"grad_norm": 1.1050987223643727,
"learning_rate": 8.311588114684665e-06,
"loss": 0.0984,
"step": 303
},
{
"epoch": 0.5409252669039146,
"grad_norm": 1.5723866295190063,
"learning_rate": 8.301104757070276e-06,
"loss": 0.1798,
"step": 304
},
{
"epoch": 0.5427046263345195,
"grad_norm": 1.4608790946680539,
"learning_rate": 8.290595610952045e-06,
"loss": 0.1225,
"step": 305
},
{
"epoch": 0.5444839857651246,
"grad_norm": 1.6577147147163798,
"learning_rate": 8.280060758428294e-06,
"loss": 0.156,
"step": 306
},
{
"epoch": 0.5462633451957295,
"grad_norm": 1.050267994871322,
"learning_rate": 8.269500281798164e-06,
"loss": 0.1021,
"step": 307
},
{
"epoch": 0.5480427046263345,
"grad_norm": 1.4247760825283253,
"learning_rate": 8.258914263560971e-06,
"loss": 0.1308,
"step": 308
},
{
"epoch": 0.5498220640569395,
"grad_norm": 1.3828113803361126,
"learning_rate": 8.248302786415567e-06,
"loss": 0.1351,
"step": 309
},
{
"epoch": 0.5516014234875445,
"grad_norm": 1.5443946941737774,
"learning_rate": 8.237665933259693e-06,
"loss": 0.1364,
"step": 310
},
{
"epoch": 0.5533807829181495,
"grad_norm": 1.4421977265130892,
"learning_rate": 8.227003787189323e-06,
"loss": 0.137,
"step": 311
},
{
"epoch": 0.5551601423487544,
"grad_norm": 1.1201475328948052,
"learning_rate": 8.216316431498028e-06,
"loss": 0.0993,
"step": 312
},
{
"epoch": 0.5569395017793595,
"grad_norm": 1.3552493640066288,
"learning_rate": 8.205603949676317e-06,
"loss": 0.1407,
"step": 313
},
{
"epoch": 0.5587188612099644,
"grad_norm": 1.4907506738970107,
"learning_rate": 8.194866425410984e-06,
"loss": 0.1354,
"step": 314
},
{
"epoch": 0.5604982206405694,
"grad_norm": 1.5785749000090659,
"learning_rate": 8.184103942584456e-06,
"loss": 0.1315,
"step": 315
},
{
"epoch": 0.5622775800711743,
"grad_norm": 1.3054199528576296,
"learning_rate": 8.173316585274144e-06,
"loss": 0.1153,
"step": 316
},
{
"epoch": 0.5640569395017794,
"grad_norm": 1.116417188323314,
"learning_rate": 8.162504437751775e-06,
"loss": 0.1181,
"step": 317
},
{
"epoch": 0.5658362989323843,
"grad_norm": 1.178882361160108,
"learning_rate": 8.151667584482742e-06,
"loss": 0.1086,
"step": 318
},
{
"epoch": 0.5676156583629893,
"grad_norm": 1.6896395727044602,
"learning_rate": 8.140806110125442e-06,
"loss": 0.1513,
"step": 319
},
{
"epoch": 0.5693950177935944,
"grad_norm": 1.1945397628884609,
"learning_rate": 8.129920099530608e-06,
"loss": 0.1231,
"step": 320
},
{
"epoch": 0.5711743772241993,
"grad_norm": 0.9443054650652738,
"learning_rate": 8.119009637740663e-06,
"loss": 0.0796,
"step": 321
},
{
"epoch": 0.5729537366548043,
"grad_norm": 2.433644384258339,
"learning_rate": 8.108074809989032e-06,
"loss": 0.1251,
"step": 322
},
{
"epoch": 0.5747330960854092,
"grad_norm": 1.3155930644886165,
"learning_rate": 8.097115701699498e-06,
"loss": 0.1054,
"step": 323
},
{
"epoch": 0.5765124555160143,
"grad_norm": 1.5522925480478513,
"learning_rate": 8.086132398485525e-06,
"loss": 0.151,
"step": 324
},
{
"epoch": 0.5782918149466192,
"grad_norm": 1.0158520585816664,
"learning_rate": 8.075124986149583e-06,
"loss": 0.0975,
"step": 325
},
{
"epoch": 0.5800711743772242,
"grad_norm": 1.2548408212995026,
"learning_rate": 8.064093550682494e-06,
"loss": 0.1116,
"step": 326
},
{
"epoch": 0.5818505338078291,
"grad_norm": 1.2689643872591336,
"learning_rate": 8.053038178262742e-06,
"loss": 0.0964,
"step": 327
},
{
"epoch": 0.5836298932384342,
"grad_norm": 1.4848493760001857,
"learning_rate": 8.041958955255815e-06,
"loss": 0.1242,
"step": 328
},
{
"epoch": 0.5854092526690391,
"grad_norm": 1.3572943156840809,
"learning_rate": 8.030855968213518e-06,
"loss": 0.1227,
"step": 329
},
{
"epoch": 0.5871886120996441,
"grad_norm": 1.4470743444243865,
"learning_rate": 8.019729303873307e-06,
"loss": 0.1278,
"step": 330
},
{
"epoch": 0.5889679715302492,
"grad_norm": 1.0928820401030448,
"learning_rate": 8.008579049157607e-06,
"loss": 0.0948,
"step": 331
},
{
"epoch": 0.5907473309608541,
"grad_norm": 1.0906885389154088,
"learning_rate": 7.99740529117313e-06,
"loss": 0.1069,
"step": 332
},
{
"epoch": 0.5925266903914591,
"grad_norm": 1.357566203031381,
"learning_rate": 7.986208117210198e-06,
"loss": 0.1207,
"step": 333
},
{
"epoch": 0.594306049822064,
"grad_norm": 1.4650886000197623,
"learning_rate": 7.974987614742066e-06,
"loss": 0.1259,
"step": 334
},
{
"epoch": 0.5960854092526691,
"grad_norm": 1.6868619854111764,
"learning_rate": 7.963743871424224e-06,
"loss": 0.1536,
"step": 335
},
{
"epoch": 0.597864768683274,
"grad_norm": 1.3284237693189664,
"learning_rate": 7.952476975093729e-06,
"loss": 0.115,
"step": 336
},
{
"epoch": 0.599644128113879,
"grad_norm": 1.631884894939644,
"learning_rate": 7.941187013768508e-06,
"loss": 0.1497,
"step": 337
},
{
"epoch": 0.6014234875444839,
"grad_norm": 1.312006614151387,
"learning_rate": 7.929874075646673e-06,
"loss": 0.1091,
"step": 338
},
{
"epoch": 0.603202846975089,
"grad_norm": 1.7002380223023112,
"learning_rate": 7.918538249105835e-06,
"loss": 0.1372,
"step": 339
},
{
"epoch": 0.604982206405694,
"grad_norm": 1.6089822890973315,
"learning_rate": 7.907179622702409e-06,
"loss": 0.1412,
"step": 340
},
{
"epoch": 0.6067615658362989,
"grad_norm": 1.2525073857339628,
"learning_rate": 7.895798285170927e-06,
"loss": 0.115,
"step": 341
},
{
"epoch": 0.608540925266904,
"grad_norm": 1.2059351463607746,
"learning_rate": 7.88439432542334e-06,
"loss": 0.0976,
"step": 342
},
{
"epoch": 0.6103202846975089,
"grad_norm": 1.426350585134919,
"learning_rate": 7.872967832548327e-06,
"loss": 0.1332,
"step": 343
},
{
"epoch": 0.6120996441281139,
"grad_norm": 1.4459007236750945,
"learning_rate": 7.861518895810597e-06,
"loss": 0.1335,
"step": 344
},
{
"epoch": 0.6138790035587188,
"grad_norm": 1.3438963369991237,
"learning_rate": 7.850047604650188e-06,
"loss": 0.1357,
"step": 345
},
{
"epoch": 0.6156583629893239,
"grad_norm": 1.529991571860058,
"learning_rate": 7.838554048681783e-06,
"loss": 0.1397,
"step": 346
},
{
"epoch": 0.6174377224199288,
"grad_norm": 1.5019186177345394,
"learning_rate": 7.827038317693988e-06,
"loss": 0.1598,
"step": 347
},
{
"epoch": 0.6192170818505338,
"grad_norm": 4.408268242820045,
"learning_rate": 7.815500501648654e-06,
"loss": 0.137,
"step": 348
},
{
"epoch": 0.6209964412811388,
"grad_norm": 1.3802358392887788,
"learning_rate": 7.80394069068015e-06,
"loss": 0.1216,
"step": 349
},
{
"epoch": 0.6227758007117438,
"grad_norm": 1.4649494442602975,
"learning_rate": 7.79235897509468e-06,
"loss": 0.1352,
"step": 350
},
{
"epoch": 0.6245551601423488,
"grad_norm": 1.5282423268361176,
"learning_rate": 7.780755445369563e-06,
"loss": 0.1342,
"step": 351
},
{
"epoch": 0.6263345195729537,
"grad_norm": 1.7436997548881208,
"learning_rate": 7.769130192152538e-06,
"loss": 0.1587,
"step": 352
},
{
"epoch": 0.6281138790035588,
"grad_norm": 1.4492973660104254,
"learning_rate": 7.757483306261042e-06,
"loss": 0.1399,
"step": 353
},
{
"epoch": 0.6298932384341637,
"grad_norm": 1.8920898279442842,
"learning_rate": 7.745814878681516e-06,
"loss": 0.1533,
"step": 354
},
{
"epoch": 0.6316725978647687,
"grad_norm": 1.3034923915124113,
"learning_rate": 7.734125000568684e-06,
"loss": 0.1276,
"step": 355
},
{
"epoch": 0.6334519572953736,
"grad_norm": 1.2614478867062202,
"learning_rate": 7.722413763244837e-06,
"loss": 0.1185,
"step": 356
},
{
"epoch": 0.6352313167259787,
"grad_norm": 1.012501012775422,
"learning_rate": 7.710681258199136e-06,
"loss": 0.0942,
"step": 357
},
{
"epoch": 0.6370106761565836,
"grad_norm": 1.5046475142828677,
"learning_rate": 7.69892757708688e-06,
"loss": 0.1216,
"step": 358
},
{
"epoch": 0.6387900355871886,
"grad_norm": 1.8296956207768407,
"learning_rate": 7.687152811728799e-06,
"loss": 0.1275,
"step": 359
},
{
"epoch": 0.6405693950177936,
"grad_norm": 1.355044693816592,
"learning_rate": 7.675357054110337e-06,
"loss": 0.1284,
"step": 360
},
{
"epoch": 0.6423487544483986,
"grad_norm": 1.2658221782024195,
"learning_rate": 7.663540396380931e-06,
"loss": 0.109,
"step": 361
},
{
"epoch": 0.6441281138790036,
"grad_norm": 1.200866490873271,
"learning_rate": 7.651702930853287e-06,
"loss": 0.1073,
"step": 362
},
{
"epoch": 0.6459074733096085,
"grad_norm": 1.3708154597557751,
"learning_rate": 7.639844750002668e-06,
"loss": 0.1172,
"step": 363
},
{
"epoch": 0.6476868327402135,
"grad_norm": 1.2611137615038404,
"learning_rate": 7.627965946466167e-06,
"loss": 0.1298,
"step": 364
},
{
"epoch": 0.6494661921708185,
"grad_norm": 1.1850326627852785,
"learning_rate": 7.616066613041977e-06,
"loss": 0.1122,
"step": 365
},
{
"epoch": 0.6512455516014235,
"grad_norm": 1.3551706805297659,
"learning_rate": 7.6041468426886785e-06,
"loss": 0.1236,
"step": 366
},
{
"epoch": 0.6530249110320284,
"grad_norm": 1.0051973136101258,
"learning_rate": 7.592206728524507e-06,
"loss": 0.0851,
"step": 367
},
{
"epoch": 0.6548042704626335,
"grad_norm": 1.320125483804524,
"learning_rate": 7.580246363826621e-06,
"loss": 0.1091,
"step": 368
},
{
"epoch": 0.6565836298932385,
"grad_norm": 1.4979758702352242,
"learning_rate": 7.568265842030381e-06,
"loss": 0.1356,
"step": 369
},
{
"epoch": 0.6583629893238434,
"grad_norm": 1.250878469807032,
"learning_rate": 7.556265256728618e-06,
"loss": 0.1226,
"step": 370
},
{
"epoch": 0.6601423487544484,
"grad_norm": 1.2610023856998045,
"learning_rate": 7.544244701670894e-06,
"loss": 0.118,
"step": 371
},
{
"epoch": 0.6619217081850534,
"grad_norm": 1.4123352756784129,
"learning_rate": 7.532204270762786e-06,
"loss": 0.1309,
"step": 372
},
{
"epoch": 0.6637010676156584,
"grad_norm": 1.198475856415187,
"learning_rate": 7.520144058065133e-06,
"loss": 0.0976,
"step": 373
},
{
"epoch": 0.6654804270462633,
"grad_norm": 1.2455669749293448,
"learning_rate": 7.50806415779332e-06,
"loss": 0.103,
"step": 374
},
{
"epoch": 0.6672597864768683,
"grad_norm": 1.40954667227436,
"learning_rate": 7.495964664316525e-06,
"loss": 0.1207,
"step": 375
},
{
"epoch": 0.6690391459074733,
"grad_norm": 1.3646583479133876,
"learning_rate": 7.4838456721569975e-06,
"loss": 0.1183,
"step": 376
},
{
"epoch": 0.6708185053380783,
"grad_norm": 1.1672216677261495,
"learning_rate": 7.471707275989304e-06,
"loss": 0.1117,
"step": 377
},
{
"epoch": 0.6725978647686833,
"grad_norm": 1.2458080943094203,
"learning_rate": 7.459549570639602e-06,
"loss": 0.1077,
"step": 378
},
{
"epoch": 0.6743772241992882,
"grad_norm": 1.34992094298806,
"learning_rate": 7.447372651084896e-06,
"loss": 0.1187,
"step": 379
},
{
"epoch": 0.6761565836298933,
"grad_norm": 1.1392793075673076,
"learning_rate": 7.435176612452286e-06,
"loss": 0.1002,
"step": 380
},
{
"epoch": 0.6779359430604982,
"grad_norm": 1.509242478825323,
"learning_rate": 7.4229615500182396e-06,
"loss": 0.1332,
"step": 381
},
{
"epoch": 0.6797153024911032,
"grad_norm": 1.1082442571399327,
"learning_rate": 7.4107275592078345e-06,
"loss": 0.0994,
"step": 382
},
{
"epoch": 0.6814946619217082,
"grad_norm": 1.2190206115453381,
"learning_rate": 7.398474735594022e-06,
"loss": 0.1003,
"step": 383
},
{
"epoch": 0.6832740213523132,
"grad_norm": 1.4613581029411111,
"learning_rate": 7.386203174896872e-06,
"loss": 0.1334,
"step": 384
},
{
"epoch": 0.6850533807829181,
"grad_norm": 1.3880626311701247,
"learning_rate": 7.373912972982838e-06,
"loss": 0.1224,
"step": 385
},
{
"epoch": 0.6868327402135231,
"grad_norm": 1.20257242571399,
"learning_rate": 7.361604225863992e-06,
"loss": 0.1088,
"step": 386
},
{
"epoch": 0.6886120996441281,
"grad_norm": 1.5957397780322953,
"learning_rate": 7.349277029697287e-06,
"loss": 0.1374,
"step": 387
},
{
"epoch": 0.6903914590747331,
"grad_norm": 1.3620482394438482,
"learning_rate": 7.336931480783801e-06,
"loss": 0.1162,
"step": 388
},
{
"epoch": 0.6921708185053381,
"grad_norm": 1.5870093402282994,
"learning_rate": 7.3245676755679854e-06,
"loss": 0.1251,
"step": 389
},
{
"epoch": 0.693950177935943,
"grad_norm": 1.3841478989559841,
"learning_rate": 7.312185710636911e-06,
"loss": 0.1228,
"step": 390
},
{
"epoch": 0.6957295373665481,
"grad_norm": 1.2266321271403495,
"learning_rate": 7.299785682719512e-06,
"loss": 0.0946,
"step": 391
},
{
"epoch": 0.697508896797153,
"grad_norm": 1.3206399673108276,
"learning_rate": 7.287367688685835e-06,
"loss": 0.1178,
"step": 392
},
{
"epoch": 0.699288256227758,
"grad_norm": 1.7284010500339315,
"learning_rate": 7.274931825546279e-06,
"loss": 0.1377,
"step": 393
},
{
"epoch": 0.701067615658363,
"grad_norm": 1.237151195783981,
"learning_rate": 7.262478190450834e-06,
"loss": 0.1058,
"step": 394
},
{
"epoch": 0.702846975088968,
"grad_norm": 1.4359366891420557,
"learning_rate": 7.250006880688332e-06,
"loss": 0.143,
"step": 395
},
{
"epoch": 0.7046263345195729,
"grad_norm": 1.303179595056172,
"learning_rate": 7.2375179936856775e-06,
"loss": 0.1136,
"step": 396
},
{
"epoch": 0.7064056939501779,
"grad_norm": 1.1508347712381617,
"learning_rate": 7.22501162700709e-06,
"loss": 0.1176,
"step": 397
},
{
"epoch": 0.708185053380783,
"grad_norm": 1.2194552151231233,
"learning_rate": 7.21248787835334e-06,
"loss": 0.1085,
"step": 398
},
{
"epoch": 0.7099644128113879,
"grad_norm": 1.2550760299236767,
"learning_rate": 7.199946845560994e-06,
"loss": 0.108,
"step": 399
},
{
"epoch": 0.7117437722419929,
"grad_norm": 1.6680598620705607,
"learning_rate": 7.1873886266016365e-06,
"loss": 0.1605,
"step": 400
},
{
"epoch": 0.7117437722419929,
"eval_loss": 0.13593703508377075,
"eval_runtime": 7.1523,
"eval_samples_per_second": 6.431,
"eval_steps_per_second": 1.678,
"step": 400
},
{
"epoch": 0.7135231316725978,
"grad_norm": 1.3857617042325157,
"learning_rate": 7.174813319581115e-06,
"loss": 0.1307,
"step": 401
},
{
"epoch": 0.7153024911032029,
"grad_norm": 1.922593312155491,
"learning_rate": 7.162221022738768e-06,
"loss": 0.149,
"step": 402
},
{
"epoch": 0.7170818505338078,
"grad_norm": 1.4902600933078627,
"learning_rate": 7.149611834446664e-06,
"loss": 0.1505,
"step": 403
},
{
"epoch": 0.7188612099644128,
"grad_norm": 1.4168107428036665,
"learning_rate": 7.136985853208824e-06,
"loss": 0.1335,
"step": 404
},
{
"epoch": 0.7206405693950177,
"grad_norm": 1.2813571246783395,
"learning_rate": 7.124343177660462e-06,
"loss": 0.1123,
"step": 405
},
{
"epoch": 0.7224199288256228,
"grad_norm": 1.6025463929831045,
"learning_rate": 7.111683906567206e-06,
"loss": 0.1441,
"step": 406
},
{
"epoch": 0.7241992882562278,
"grad_norm": 1.3440377342055425,
"learning_rate": 7.099008138824329e-06,
"loss": 0.1126,
"step": 407
},
{
"epoch": 0.7259786476868327,
"grad_norm": 1.2724255186050077,
"learning_rate": 7.086315973455982e-06,
"loss": 0.1213,
"step": 408
},
{
"epoch": 0.7277580071174378,
"grad_norm": 1.4778145133673117,
"learning_rate": 7.0736075096144084e-06,
"loss": 0.1408,
"step": 409
},
{
"epoch": 0.7295373665480427,
"grad_norm": 1.0833048159494032,
"learning_rate": 7.060882846579182e-06,
"loss": 0.0987,
"step": 410
},
{
"epoch": 0.7313167259786477,
"grad_norm": 1.5143152577606858,
"learning_rate": 7.048142083756427e-06,
"loss": 0.1382,
"step": 411
},
{
"epoch": 0.7330960854092526,
"grad_norm": 1.3197221214599737,
"learning_rate": 7.035385320678035e-06,
"loss": 0.1258,
"step": 412
},
{
"epoch": 0.7348754448398577,
"grad_norm": 1.3755616798221522,
"learning_rate": 7.022612657000898e-06,
"loss": 0.1154,
"step": 413
},
{
"epoch": 0.7366548042704626,
"grad_norm": 1.3591609015544808,
"learning_rate": 7.0098241925061215e-06,
"loss": 0.1246,
"step": 414
},
{
"epoch": 0.7384341637010676,
"grad_norm": 1.7750632683995469,
"learning_rate": 6.997020027098249e-06,
"loss": 0.1326,
"step": 415
},
{
"epoch": 0.7402135231316725,
"grad_norm": 1.4083511956168764,
"learning_rate": 6.9842002608044844e-06,
"loss": 0.1382,
"step": 416
},
{
"epoch": 0.7419928825622776,
"grad_norm": 1.1617276411064767,
"learning_rate": 6.971364993773901e-06,
"loss": 0.1055,
"step": 417
},
{
"epoch": 0.7437722419928826,
"grad_norm": 1.4035719811413283,
"learning_rate": 6.958514326276669e-06,
"loss": 0.1101,
"step": 418
},
{
"epoch": 0.7455516014234875,
"grad_norm": 1.3293580992170293,
"learning_rate": 6.945648358703269e-06,
"loss": 0.1032,
"step": 419
},
{
"epoch": 0.7473309608540926,
"grad_norm": 1.3235544206067675,
"learning_rate": 6.932767191563703e-06,
"loss": 0.1407,
"step": 420
},
{
"epoch": 0.7491103202846975,
"grad_norm": 1.0915905612236356,
"learning_rate": 6.919870925486718e-06,
"loss": 0.0892,
"step": 421
},
{
"epoch": 0.7508896797153025,
"grad_norm": 1.0646199439982955,
"learning_rate": 6.906959661219011e-06,
"loss": 0.1003,
"step": 422
},
{
"epoch": 0.7526690391459074,
"grad_norm": 1.3132527820198336,
"learning_rate": 6.8940334996244505e-06,
"loss": 0.112,
"step": 423
},
{
"epoch": 0.7544483985765125,
"grad_norm": 1.3131228207975008,
"learning_rate": 6.881092541683279e-06,
"loss": 0.1218,
"step": 424
},
{
"epoch": 0.7562277580071174,
"grad_norm": 1.3253678432316216,
"learning_rate": 6.8681368884913345e-06,
"loss": 0.1053,
"step": 425
},
{
"epoch": 0.7580071174377224,
"grad_norm": 1.1045518764985973,
"learning_rate": 6.855166641259252e-06,
"loss": 0.1004,
"step": 426
},
{
"epoch": 0.7597864768683275,
"grad_norm": 1.1916871119247132,
"learning_rate": 6.8421819013116766e-06,
"loss": 0.1165,
"step": 427
},
{
"epoch": 0.7615658362989324,
"grad_norm": 1.2355688910032794,
"learning_rate": 6.829182770086474e-06,
"loss": 0.1293,
"step": 428
},
{
"epoch": 0.7633451957295374,
"grad_norm": 1.361448938887964,
"learning_rate": 6.816169349133934e-06,
"loss": 0.116,
"step": 429
},
{
"epoch": 0.7651245551601423,
"grad_norm": 1.1992796748417411,
"learning_rate": 6.803141740115979e-06,
"loss": 0.0894,
"step": 430
},
{
"epoch": 0.7669039145907474,
"grad_norm": 2.1019875524018112,
"learning_rate": 6.7901000448053676e-06,
"loss": 0.177,
"step": 431
},
{
"epoch": 0.7686832740213523,
"grad_norm": 1.113172252528109,
"learning_rate": 6.777044365084907e-06,
"loss": 0.0992,
"step": 432
},
{
"epoch": 0.7704626334519573,
"grad_norm": 1.1910343388250089,
"learning_rate": 6.763974802946649e-06,
"loss": 0.1074,
"step": 433
},
{
"epoch": 0.7722419928825622,
"grad_norm": 1.4591817103826428,
"learning_rate": 6.750891460491093e-06,
"loss": 0.1499,
"step": 434
},
{
"epoch": 0.7740213523131673,
"grad_norm": 1.5086158043764135,
"learning_rate": 6.737794439926395e-06,
"loss": 0.1474,
"step": 435
},
{
"epoch": 0.7758007117437722,
"grad_norm": 1.3160884455859283,
"learning_rate": 6.724683843567567e-06,
"loss": 0.1284,
"step": 436
},
{
"epoch": 0.7775800711743772,
"grad_norm": 1.329340529761385,
"learning_rate": 6.711559773835672e-06,
"loss": 0.1131,
"step": 437
},
{
"epoch": 0.7793594306049823,
"grad_norm": 1.4861279018952749,
"learning_rate": 6.69842233325703e-06,
"loss": 0.1161,
"step": 438
},
{
"epoch": 0.7811387900355872,
"grad_norm": 1.7627552929537986,
"learning_rate": 6.685271624462416e-06,
"loss": 0.154,
"step": 439
},
{
"epoch": 0.7829181494661922,
"grad_norm": 1.2573572355395808,
"learning_rate": 6.672107750186255e-06,
"loss": 0.1107,
"step": 440
},
{
"epoch": 0.7846975088967971,
"grad_norm": 0.8993430850299913,
"learning_rate": 6.658930813265825e-06,
"loss": 0.0867,
"step": 441
},
{
"epoch": 0.7864768683274022,
"grad_norm": 1.1796545941704497,
"learning_rate": 6.645740916640449e-06,
"loss": 0.1047,
"step": 442
},
{
"epoch": 0.7882562277580071,
"grad_norm": 1.1062583377101722,
"learning_rate": 6.63253816335069e-06,
"loss": 0.0928,
"step": 443
},
{
"epoch": 0.7900355871886121,
"grad_norm": 1.217718954992437,
"learning_rate": 6.619322656537552e-06,
"loss": 0.1095,
"step": 444
},
{
"epoch": 0.791814946619217,
"grad_norm": 1.2126501930341467,
"learning_rate": 6.606094499441671e-06,
"loss": 0.104,
"step": 445
},
{
"epoch": 0.7935943060498221,
"grad_norm": 1.3605714831184592,
"learning_rate": 6.592853795402502e-06,
"loss": 0.1124,
"step": 446
},
{
"epoch": 0.7953736654804271,
"grad_norm": 1.3391599831782586,
"learning_rate": 6.579600647857525e-06,
"loss": 0.1213,
"step": 447
},
{
"epoch": 0.797153024911032,
"grad_norm": 1.5955458337165742,
"learning_rate": 6.566335160341425e-06,
"loss": 0.1457,
"step": 448
},
{
"epoch": 0.798932384341637,
"grad_norm": 1.2065156195943607,
"learning_rate": 6.553057436485289e-06,
"loss": 0.1062,
"step": 449
},
{
"epoch": 0.800711743772242,
"grad_norm": 1.2116479228176436,
"learning_rate": 6.539767580015799e-06,
"loss": 0.1233,
"step": 450
},
{
"epoch": 0.802491103202847,
"grad_norm": 1.6108974803910467,
"learning_rate": 6.52646569475441e-06,
"loss": 0.1277,
"step": 451
},
{
"epoch": 0.8042704626334519,
"grad_norm": 1.409122193803232,
"learning_rate": 6.513151884616556e-06,
"loss": 0.1302,
"step": 452
},
{
"epoch": 0.806049822064057,
"grad_norm": 2.761863084728338,
"learning_rate": 6.499826253610823e-06,
"loss": 0.1292,
"step": 453
},
{
"epoch": 0.8078291814946619,
"grad_norm": 1.1298348277268495,
"learning_rate": 6.486488905838143e-06,
"loss": 0.1077,
"step": 454
},
{
"epoch": 0.8096085409252669,
"grad_norm": 1.304266904791572,
"learning_rate": 6.473139945490984e-06,
"loss": 0.0973,
"step": 455
},
{
"epoch": 0.8113879003558719,
"grad_norm": 1.470403225267748,
"learning_rate": 6.459779476852528e-06,
"loss": 0.129,
"step": 456
},
{
"epoch": 0.8131672597864769,
"grad_norm": 1.2994618526756987,
"learning_rate": 6.446407604295863e-06,
"loss": 0.1212,
"step": 457
},
{
"epoch": 0.8149466192170819,
"grad_norm": 1.376936560412594,
"learning_rate": 6.433024432283169e-06,
"loss": 0.1163,
"step": 458
},
{
"epoch": 0.8167259786476868,
"grad_norm": 1.4692918244800937,
"learning_rate": 6.41963006536489e-06,
"loss": 0.1243,
"step": 459
},
{
"epoch": 0.8185053380782918,
"grad_norm": 1.3813366689041184,
"learning_rate": 6.4062246081789316e-06,
"loss": 0.1189,
"step": 460
},
{
"epoch": 0.8202846975088968,
"grad_norm": 1.0647073329181291,
"learning_rate": 6.392808165449836e-06,
"loss": 0.0845,
"step": 461
},
{
"epoch": 0.8220640569395018,
"grad_norm": 1.3277693379270958,
"learning_rate": 6.379380841987965e-06,
"loss": 0.1114,
"step": 462
},
{
"epoch": 0.8238434163701067,
"grad_norm": 1.1626084033223911,
"learning_rate": 6.365942742688684e-06,
"loss": 0.0964,
"step": 463
},
{
"epoch": 0.8256227758007118,
"grad_norm": 1.5328885016841056,
"learning_rate": 6.352493972531535e-06,
"loss": 0.1348,
"step": 464
},
{
"epoch": 0.8274021352313167,
"grad_norm": 1.6975023292719447,
"learning_rate": 6.339034636579425e-06,
"loss": 0.1473,
"step": 465
},
{
"epoch": 0.8291814946619217,
"grad_norm": 1.2556416031368975,
"learning_rate": 6.325564839977802e-06,
"loss": 0.0927,
"step": 466
},
{
"epoch": 0.8309608540925267,
"grad_norm": 1.074064032510885,
"learning_rate": 6.312084687953835e-06,
"loss": 0.0969,
"step": 467
},
{
"epoch": 0.8327402135231317,
"grad_norm": 1.6835238670487385,
"learning_rate": 6.298594285815585e-06,
"loss": 0.1511,
"step": 468
},
{
"epoch": 0.8345195729537367,
"grad_norm": 1.4839581608826677,
"learning_rate": 6.2850937389511936e-06,
"loss": 0.1471,
"step": 469
},
{
"epoch": 0.8362989323843416,
"grad_norm": 1.1445849946515074,
"learning_rate": 6.271583152828049e-06,
"loss": 0.0941,
"step": 470
},
{
"epoch": 0.8380782918149466,
"grad_norm": 1.2301203032248997,
"learning_rate": 6.258062632991972e-06,
"loss": 0.0938,
"step": 471
},
{
"epoch": 0.8398576512455516,
"grad_norm": 1.229861866409161,
"learning_rate": 6.244532285066382e-06,
"loss": 0.118,
"step": 472
},
{
"epoch": 0.8416370106761566,
"grad_norm": 1.1644908033621832,
"learning_rate": 6.2309922147514775e-06,
"loss": 0.1081,
"step": 473
},
{
"epoch": 0.8434163701067615,
"grad_norm": 1.1517835145097681,
"learning_rate": 6.2174425278234115e-06,
"loss": 0.125,
"step": 474
},
{
"epoch": 0.8451957295373665,
"grad_norm": 1.277475692490733,
"learning_rate": 6.20388333013346e-06,
"loss": 0.0903,
"step": 475
},
{
"epoch": 0.8469750889679716,
"grad_norm": 1.4269406154560587,
"learning_rate": 6.190314727607196e-06,
"loss": 0.1368,
"step": 476
},
{
"epoch": 0.8487544483985765,
"grad_norm": 1.367125057436054,
"learning_rate": 6.176736826243671e-06,
"loss": 0.1166,
"step": 477
},
{
"epoch": 0.8505338078291815,
"grad_norm": 1.2159537541519017,
"learning_rate": 6.163149732114571e-06,
"loss": 0.1144,
"step": 478
},
{
"epoch": 0.8523131672597865,
"grad_norm": 1.3055103197822622,
"learning_rate": 6.149553551363404e-06,
"loss": 0.0998,
"step": 479
},
{
"epoch": 0.8540925266903915,
"grad_norm": 1.4481309925729284,
"learning_rate": 6.1359483902046605e-06,
"loss": 0.1303,
"step": 480
},
{
"epoch": 0.8558718861209964,
"grad_norm": 1.21028745853387,
"learning_rate": 6.122334354922984e-06,
"loss": 0.1058,
"step": 481
},
{
"epoch": 0.8576512455516014,
"grad_norm": 1.1426605648496042,
"learning_rate": 6.108711551872347e-06,
"loss": 0.1052,
"step": 482
},
{
"epoch": 0.8594306049822064,
"grad_norm": 1.4959349397230497,
"learning_rate": 6.095080087475218e-06,
"loss": 0.1219,
"step": 483
},
{
"epoch": 0.8612099644128114,
"grad_norm": 1.2524171813001315,
"learning_rate": 6.0814400682217236e-06,
"loss": 0.1206,
"step": 484
},
{
"epoch": 0.8629893238434164,
"grad_norm": 1.2423264837863424,
"learning_rate": 6.067791600668823e-06,
"loss": 0.0984,
"step": 485
},
{
"epoch": 0.8647686832740213,
"grad_norm": 1.0254265008644459,
"learning_rate": 6.054134791439479e-06,
"loss": 0.0761,
"step": 486
},
{
"epoch": 0.8665480427046264,
"grad_norm": 1.2467048105306586,
"learning_rate": 6.040469747221815e-06,
"loss": 0.0973,
"step": 487
},
{
"epoch": 0.8683274021352313,
"grad_norm": 1.3103708516140928,
"learning_rate": 6.026796574768288e-06,
"loss": 0.0972,
"step": 488
},
{
"epoch": 0.8701067615658363,
"grad_norm": 1.3063825459585219,
"learning_rate": 6.013115380894854e-06,
"loss": 0.1145,
"step": 489
},
{
"epoch": 0.8718861209964412,
"grad_norm": 1.1721813308111106,
"learning_rate": 5.999426272480133e-06,
"loss": 0.093,
"step": 490
},
{
"epoch": 0.8736654804270463,
"grad_norm": 1.2947034866641753,
"learning_rate": 5.985729356464575e-06,
"loss": 0.1156,
"step": 491
},
{
"epoch": 0.8754448398576512,
"grad_norm": 1.2238543229470935,
"learning_rate": 5.972024739849622e-06,
"loss": 0.1048,
"step": 492
},
{
"epoch": 0.8772241992882562,
"grad_norm": 0.899238492025567,
"learning_rate": 5.958312529696874e-06,
"loss": 0.0783,
"step": 493
},
{
"epoch": 0.8790035587188612,
"grad_norm": 1.3050605129441113,
"learning_rate": 5.944592833127253e-06,
"loss": 0.1346,
"step": 494
},
{
"epoch": 0.8807829181494662,
"grad_norm": 1.2659620119650414,
"learning_rate": 5.9308657573201645e-06,
"loss": 0.126,
"step": 495
},
{
"epoch": 0.8825622775800712,
"grad_norm": 1.053722658305837,
"learning_rate": 5.917131409512663e-06,
"loss": 0.0966,
"step": 496
},
{
"epoch": 0.8843416370106761,
"grad_norm": 1.075322132871375,
"learning_rate": 5.903389896998611e-06,
"loss": 0.1028,
"step": 497
},
{
"epoch": 0.8861209964412812,
"grad_norm": 1.4027881729705716,
"learning_rate": 5.889641327127843e-06,
"loss": 0.1107,
"step": 498
},
{
"epoch": 0.8879003558718861,
"grad_norm": 1.2532074853235078,
"learning_rate": 5.875885807305326e-06,
"loss": 0.1137,
"step": 499
},
{
"epoch": 0.8896797153024911,
"grad_norm": 1.1427736844803538,
"learning_rate": 5.862123444990319e-06,
"loss": 0.1033,
"step": 500
},
{
"epoch": 0.891459074733096,
"grad_norm": 1.480776373418844,
"learning_rate": 5.848354347695537e-06,
"loss": 0.1605,
"step": 501
},
{
"epoch": 0.8932384341637011,
"grad_norm": 1.439908135605425,
"learning_rate": 5.83457862298631e-06,
"loss": 0.1332,
"step": 502
},
{
"epoch": 0.895017793594306,
"grad_norm": 1.5682242600786214,
"learning_rate": 5.8207963784797396e-06,
"loss": 0.1268,
"step": 503
},
{
"epoch": 0.896797153024911,
"grad_norm": 1.2659971998362078,
"learning_rate": 5.807007721843862e-06,
"loss": 0.1192,
"step": 504
},
{
"epoch": 0.8985765124555161,
"grad_norm": 1.6373734655317385,
"learning_rate": 5.793212760796804e-06,
"loss": 0.1549,
"step": 505
},
{
"epoch": 0.900355871886121,
"grad_norm": 1.3284478949173315,
"learning_rate": 5.779411603105947e-06,
"loss": 0.1261,
"step": 506
},
{
"epoch": 0.902135231316726,
"grad_norm": 1.2988901561655044,
"learning_rate": 5.765604356587076e-06,
"loss": 0.1169,
"step": 507
},
{
"epoch": 0.9039145907473309,
"grad_norm": 1.3015631068654006,
"learning_rate": 5.751791129103545e-06,
"loss": 0.1225,
"step": 508
},
{
"epoch": 0.905693950177936,
"grad_norm": 1.462346480774781,
"learning_rate": 5.737972028565431e-06,
"loss": 0.1408,
"step": 509
},
{
"epoch": 0.9074733096085409,
"grad_norm": 1.300985474327381,
"learning_rate": 5.7241471629286934e-06,
"loss": 0.1221,
"step": 510
},
{
"epoch": 0.9092526690391459,
"grad_norm": 1.532776752619801,
"learning_rate": 5.7103166401943276e-06,
"loss": 0.1277,
"step": 511
},
{
"epoch": 0.9110320284697508,
"grad_norm": 1.3764231355293768,
"learning_rate": 5.696480568407523e-06,
"loss": 0.1175,
"step": 512
},
{
"epoch": 0.9128113879003559,
"grad_norm": 1.62882350692066,
"learning_rate": 5.682639055656817e-06,
"loss": 0.1329,
"step": 513
},
{
"epoch": 0.9145907473309609,
"grad_norm": 1.390699574289703,
"learning_rate": 5.668792210073255e-06,
"loss": 0.1379,
"step": 514
},
{
"epoch": 0.9163701067615658,
"grad_norm": 1.505728133737634,
"learning_rate": 5.654940139829544e-06,
"loss": 0.1289,
"step": 515
},
{
"epoch": 0.9181494661921709,
"grad_norm": 1.2352069871978812,
"learning_rate": 5.641082953139201e-06,
"loss": 0.1086,
"step": 516
},
{
"epoch": 0.9199288256227758,
"grad_norm": 0.9535889029324193,
"learning_rate": 5.6272207582557195e-06,
"loss": 0.0757,
"step": 517
},
{
"epoch": 0.9217081850533808,
"grad_norm": 1.239827493884586,
"learning_rate": 5.61335366347171e-06,
"loss": 0.0923,
"step": 518
},
{
"epoch": 0.9234875444839857,
"grad_norm": 1.2687618628655917,
"learning_rate": 5.599481777118071e-06,
"loss": 0.1205,
"step": 519
},
{
"epoch": 0.9252669039145908,
"grad_norm": 1.3827338167233967,
"learning_rate": 5.585605207563124e-06,
"loss": 0.1032,
"step": 520
},
{
"epoch": 0.9270462633451957,
"grad_norm": 1.2943750345053857,
"learning_rate": 5.571724063211782e-06,
"loss": 0.1056,
"step": 521
},
{
"epoch": 0.9288256227758007,
"grad_norm": 1.154204506725661,
"learning_rate": 5.557838452504692e-06,
"loss": 0.0865,
"step": 522
},
{
"epoch": 0.9306049822064056,
"grad_norm": 1.5555347050342136,
"learning_rate": 5.5439484839173996e-06,
"loss": 0.1236,
"step": 523
},
{
"epoch": 0.9323843416370107,
"grad_norm": 1.2504207552639395,
"learning_rate": 5.530054265959486e-06,
"loss": 0.1081,
"step": 524
},
{
"epoch": 0.9341637010676157,
"grad_norm": 1.2083286597070473,
"learning_rate": 5.516155907173735e-06,
"loss": 0.1185,
"step": 525
},
{
"epoch": 0.9359430604982206,
"grad_norm": 1.0566655375547844,
"learning_rate": 5.5022535161352764e-06,
"loss": 0.0912,
"step": 526
},
{
"epoch": 0.9377224199288257,
"grad_norm": 1.4227842566270228,
"learning_rate": 5.488347201450741e-06,
"loss": 0.1137,
"step": 527
},
{
"epoch": 0.9395017793594306,
"grad_norm": 1.1589922153147618,
"learning_rate": 5.47443707175741e-06,
"loss": 0.11,
"step": 528
},
{
"epoch": 0.9412811387900356,
"grad_norm": 1.2884702592213582,
"learning_rate": 5.46052323572237e-06,
"loss": 0.111,
"step": 529
},
{
"epoch": 0.9430604982206405,
"grad_norm": 1.058974783820137,
"learning_rate": 5.446605802041662e-06,
"loss": 0.0882,
"step": 530
},
{
"epoch": 0.9448398576512456,
"grad_norm": 1.0573388209912145,
"learning_rate": 5.432684879439428e-06,
"loss": 0.0883,
"step": 531
},
{
"epoch": 0.9466192170818505,
"grad_norm": 1.0408230290222384,
"learning_rate": 5.418760576667071e-06,
"loss": 0.0807,
"step": 532
},
{
"epoch": 0.9483985765124555,
"grad_norm": 1.3449177340112355,
"learning_rate": 5.404833002502398e-06,
"loss": 0.133,
"step": 533
},
{
"epoch": 0.9501779359430605,
"grad_norm": 1.5343246666409203,
"learning_rate": 5.39090226574877e-06,
"loss": 0.1214,
"step": 534
},
{
"epoch": 0.9519572953736655,
"grad_norm": 1.7169322765401176,
"learning_rate": 5.376968475234258e-06,
"loss": 0.1528,
"step": 535
},
{
"epoch": 0.9537366548042705,
"grad_norm": 0.9441690695498636,
"learning_rate": 5.363031739810787e-06,
"loss": 0.0776,
"step": 536
},
{
"epoch": 0.9555160142348754,
"grad_norm": 1.2393233775737496,
"learning_rate": 5.349092168353291e-06,
"loss": 0.1199,
"step": 537
},
{
"epoch": 0.9572953736654805,
"grad_norm": 1.1501632012710579,
"learning_rate": 5.335149869758855e-06,
"loss": 0.0998,
"step": 538
},
{
"epoch": 0.9590747330960854,
"grad_norm": 1.345261486444856,
"learning_rate": 5.32120495294587e-06,
"loss": 0.109,
"step": 539
},
{
"epoch": 0.9608540925266904,
"grad_norm": 1.1582897990190524,
"learning_rate": 5.3072575268531835e-06,
"loss": 0.1234,
"step": 540
},
{
"epoch": 0.9626334519572953,
"grad_norm": 1.3216175000984673,
"learning_rate": 5.293307700439242e-06,
"loss": 0.1168,
"step": 541
},
{
"epoch": 0.9644128113879004,
"grad_norm": 1.7012563356884112,
"learning_rate": 5.2793555826812456e-06,
"loss": 0.1258,
"step": 542
},
{
"epoch": 0.9661921708185054,
"grad_norm": 1.1504863796193265,
"learning_rate": 5.265401282574294e-06,
"loss": 0.0894,
"step": 543
},
{
"epoch": 0.9679715302491103,
"grad_norm": 1.4774605956962692,
"learning_rate": 5.2514449091305375e-06,
"loss": 0.1171,
"step": 544
},
{
"epoch": 0.9697508896797153,
"grad_norm": 1.5137108185784622,
"learning_rate": 5.237486571378317e-06,
"loss": 0.1267,
"step": 545
},
{
"epoch": 0.9715302491103203,
"grad_norm": 1.325817745776221,
"learning_rate": 5.22352637836133e-06,
"loss": 0.1328,
"step": 546
},
{
"epoch": 0.9733096085409253,
"grad_norm": 1.2365677758500844,
"learning_rate": 5.209564439137755e-06,
"loss": 0.1038,
"step": 547
},
{
"epoch": 0.9750889679715302,
"grad_norm": 1.4571885975836905,
"learning_rate": 5.195600862779421e-06,
"loss": 0.1502,
"step": 548
},
{
"epoch": 0.9768683274021353,
"grad_norm": 1.0593248552255834,
"learning_rate": 5.181635758370942e-06,
"loss": 0.0747,
"step": 549
},
{
"epoch": 0.9786476868327402,
"grad_norm": 1.4488899932240877,
"learning_rate": 5.167669235008871e-06,
"loss": 0.1304,
"step": 550
},
{
"epoch": 0.9804270462633452,
"grad_norm": 1.7408954349025696,
"learning_rate": 5.153701401800845e-06,
"loss": 0.1534,
"step": 551
},
{
"epoch": 0.9822064056939501,
"grad_norm": 1.2490011421157257,
"learning_rate": 5.139732367864736e-06,
"loss": 0.0977,
"step": 552
},
{
"epoch": 0.9839857651245552,
"grad_norm": 1.1591259360485227,
"learning_rate": 5.1257622423277934e-06,
"loss": 0.1078,
"step": 553
},
{
"epoch": 0.9857651245551602,
"grad_norm": 1.1133537193913574,
"learning_rate": 5.111791134325793e-06,
"loss": 0.1007,
"step": 554
},
{
"epoch": 0.9875444839857651,
"grad_norm": 1.071652029710098,
"learning_rate": 5.097819153002192e-06,
"loss": 0.0965,
"step": 555
},
{
"epoch": 0.9893238434163701,
"grad_norm": 1.128937600862167,
"learning_rate": 5.083846407507263e-06,
"loss": 0.0964,
"step": 556
},
{
"epoch": 0.9911032028469751,
"grad_norm": 1.5379767941730706,
"learning_rate": 5.0698730069972535e-06,
"loss": 0.1259,
"step": 557
},
{
"epoch": 0.9928825622775801,
"grad_norm": 1.0822578639544602,
"learning_rate": 5.055899060633524e-06,
"loss": 0.0888,
"step": 558
},
{
"epoch": 0.994661921708185,
"grad_norm": 1.1674467130438533,
"learning_rate": 5.041924677581702e-06,
"loss": 0.1125,
"step": 559
},
{
"epoch": 0.99644128113879,
"grad_norm": 1.301186922789244,
"learning_rate": 5.0279499670108245e-06,
"loss": 0.1121,
"step": 560
},
{
"epoch": 0.998220640569395,
"grad_norm": 1.2146433286202794,
"learning_rate": 5.013975038092491e-06,
"loss": 0.1064,
"step": 561
},
{
"epoch": 1.0,
"grad_norm": 1.233157977712758,
"learning_rate": 5e-06,
"loss": 0.0962,
"step": 562
},
{
"epoch": 1.001779359430605,
"grad_norm": 0.8547294512504904,
"learning_rate": 4.98602496190751e-06,
"loss": 0.0628,
"step": 563
},
{
"epoch": 1.00355871886121,
"grad_norm": 0.7027768071296912,
"learning_rate": 4.9720500329891755e-06,
"loss": 0.039,
"step": 564
},
{
"epoch": 1.0053380782918149,
"grad_norm": 0.9472521846024025,
"learning_rate": 4.9580753224183005e-06,
"loss": 0.0559,
"step": 565
},
{
"epoch": 1.00711743772242,
"grad_norm": 0.6979463772086726,
"learning_rate": 4.944100939366478e-06,
"loss": 0.0417,
"step": 566
},
{
"epoch": 1.008896797153025,
"grad_norm": 0.6600744135103447,
"learning_rate": 4.930126993002748e-06,
"loss": 0.0378,
"step": 567
},
{
"epoch": 1.01067615658363,
"grad_norm": 0.7943192303593823,
"learning_rate": 4.9161535924927375e-06,
"loss": 0.0453,
"step": 568
},
{
"epoch": 1.0124555160142348,
"grad_norm": 0.8250567876182562,
"learning_rate": 4.90218084699781e-06,
"loss": 0.0424,
"step": 569
},
{
"epoch": 1.0142348754448398,
"grad_norm": 1.0521929319778944,
"learning_rate": 4.888208865674208e-06,
"loss": 0.0514,
"step": 570
},
{
"epoch": 1.0160142348754448,
"grad_norm": 0.8860257632064427,
"learning_rate": 4.874237757672209e-06,
"loss": 0.0591,
"step": 571
},
{
"epoch": 1.0177935943060499,
"grad_norm": 1.264726936124226,
"learning_rate": 4.8602676321352646e-06,
"loss": 0.0725,
"step": 572
},
{
"epoch": 1.019572953736655,
"grad_norm": 0.9515788781923418,
"learning_rate": 4.846298598199155e-06,
"loss": 0.048,
"step": 573
},
{
"epoch": 1.0213523131672597,
"grad_norm": 0.9037679408864575,
"learning_rate": 4.832330764991131e-06,
"loss": 0.0397,
"step": 574
},
{
"epoch": 1.0231316725978647,
"grad_norm": 1.0679481665835724,
"learning_rate": 4.81836424162906e-06,
"loss": 0.0549,
"step": 575
},
{
"epoch": 1.0249110320284698,
"grad_norm": 0.9017964978842363,
"learning_rate": 4.80439913722058e-06,
"loss": 0.039,
"step": 576
},
{
"epoch": 1.0266903914590748,
"grad_norm": 1.0862464474920541,
"learning_rate": 4.790435560862247e-06,
"loss": 0.042,
"step": 577
},
{
"epoch": 1.0284697508896796,
"grad_norm": 1.2170259844730513,
"learning_rate": 4.776473621638673e-06,
"loss": 0.0527,
"step": 578
},
{
"epoch": 1.0302491103202847,
"grad_norm": 1.143487399181918,
"learning_rate": 4.762513428621684e-06,
"loss": 0.0543,
"step": 579
},
{
"epoch": 1.0320284697508897,
"grad_norm": 1.0670331451405415,
"learning_rate": 4.748555090869464e-06,
"loss": 0.0483,
"step": 580
},
{
"epoch": 1.0338078291814947,
"grad_norm": 1.0920316345540422,
"learning_rate": 4.734598717425706e-06,
"loss": 0.0522,
"step": 581
},
{
"epoch": 1.0355871886120998,
"grad_norm": 1.198081617990545,
"learning_rate": 4.720644417318755e-06,
"loss": 0.0576,
"step": 582
},
{
"epoch": 1.0373665480427046,
"grad_norm": 1.5096660129964996,
"learning_rate": 4.70669229956076e-06,
"loss": 0.0563,
"step": 583
},
{
"epoch": 1.0391459074733096,
"grad_norm": 1.0003149054802263,
"learning_rate": 4.692742473146818e-06,
"loss": 0.0389,
"step": 584
},
{
"epoch": 1.0409252669039146,
"grad_norm": 1.1652024399424679,
"learning_rate": 4.678795047054131e-06,
"loss": 0.0418,
"step": 585
},
{
"epoch": 1.0427046263345197,
"grad_norm": 1.21728635385485,
"learning_rate": 4.664850130241146e-06,
"loss": 0.0362,
"step": 586
},
{
"epoch": 1.0444839857651245,
"grad_norm": 0.9163900719555346,
"learning_rate": 4.650907831646711e-06,
"loss": 0.0418,
"step": 587
},
{
"epoch": 1.0462633451957295,
"grad_norm": 1.0262119807351613,
"learning_rate": 4.636968260189214e-06,
"loss": 0.0455,
"step": 588
},
{
"epoch": 1.0480427046263345,
"grad_norm": 0.9971199994097065,
"learning_rate": 4.623031524765744e-06,
"loss": 0.0458,
"step": 589
},
{
"epoch": 1.0498220640569396,
"grad_norm": 0.9971575590605182,
"learning_rate": 4.609097734251231e-06,
"loss": 0.0482,
"step": 590
},
{
"epoch": 1.0516014234875444,
"grad_norm": 1.4272943496871313,
"learning_rate": 4.595166997497605e-06,
"loss": 0.0552,
"step": 591
},
{
"epoch": 1.0533807829181494,
"grad_norm": 1.0928724453101657,
"learning_rate": 4.58123942333293e-06,
"loss": 0.048,
"step": 592
},
{
"epoch": 1.0551601423487544,
"grad_norm": 0.8185600592969815,
"learning_rate": 4.567315120560573e-06,
"loss": 0.0328,
"step": 593
},
{
"epoch": 1.0569395017793595,
"grad_norm": 0.9298681360924248,
"learning_rate": 4.553394197958339e-06,
"loss": 0.036,
"step": 594
},
{
"epoch": 1.0587188612099645,
"grad_norm": 0.8798971670232553,
"learning_rate": 4.539476764277631e-06,
"loss": 0.0317,
"step": 595
},
{
"epoch": 1.0604982206405693,
"grad_norm": 0.9060028669002683,
"learning_rate": 4.525562928242592e-06,
"loss": 0.0399,
"step": 596
},
{
"epoch": 1.0622775800711743,
"grad_norm": 1.4072724773876861,
"learning_rate": 4.511652798549261e-06,
"loss": 0.0585,
"step": 597
},
{
"epoch": 1.0640569395017794,
"grad_norm": 1.0768155616192439,
"learning_rate": 4.497746483864725e-06,
"loss": 0.0517,
"step": 598
},
{
"epoch": 1.0658362989323844,
"grad_norm": 0.8546328966763028,
"learning_rate": 4.483844092826267e-06,
"loss": 0.0369,
"step": 599
},
{
"epoch": 1.0676156583629894,
"grad_norm": 0.8162459071101226,
"learning_rate": 4.469945734040516e-06,
"loss": 0.0396,
"step": 600
},
{
"epoch": 1.0676156583629894,
"eval_loss": 0.13828444480895996,
"eval_runtime": 7.1428,
"eval_samples_per_second": 6.44,
"eval_steps_per_second": 1.68,
"step": 600
},
{
"epoch": 1.0693950177935942,
"grad_norm": 0.9806946940979351,
"learning_rate": 4.456051516082603e-06,
"loss": 0.0423,
"step": 601
},
{
"epoch": 1.0711743772241993,
"grad_norm": 1.3793546329301072,
"learning_rate": 4.442161547495309e-06,
"loss": 0.0644,
"step": 602
},
{
"epoch": 1.0729537366548043,
"grad_norm": 1.2611862441529713,
"learning_rate": 4.42827593678822e-06,
"loss": 0.0542,
"step": 603
},
{
"epoch": 1.0747330960854093,
"grad_norm": 0.9902983507703184,
"learning_rate": 4.414394792436877e-06,
"loss": 0.0365,
"step": 604
},
{
"epoch": 1.0765124555160142,
"grad_norm": 0.9289547591588659,
"learning_rate": 4.400518222881931e-06,
"loss": 0.0421,
"step": 605
},
{
"epoch": 1.0782918149466192,
"grad_norm": 0.9463300231205996,
"learning_rate": 4.386646336528291e-06,
"loss": 0.043,
"step": 606
},
{
"epoch": 1.0800711743772242,
"grad_norm": 0.9259371009942426,
"learning_rate": 4.372779241744282e-06,
"loss": 0.039,
"step": 607
},
{
"epoch": 1.0818505338078293,
"grad_norm": 0.8516637033570938,
"learning_rate": 4.358917046860799e-06,
"loss": 0.0341,
"step": 608
},
{
"epoch": 1.083629893238434,
"grad_norm": 0.9071862571901869,
"learning_rate": 4.345059860170458e-06,
"loss": 0.0356,
"step": 609
},
{
"epoch": 1.085409252669039,
"grad_norm": 1.0635580735336114,
"learning_rate": 4.331207789926746e-06,
"loss": 0.0441,
"step": 610
},
{
"epoch": 1.0871886120996441,
"grad_norm": 0.8830506087183804,
"learning_rate": 4.317360944343184e-06,
"loss": 0.0391,
"step": 611
},
{
"epoch": 1.0889679715302492,
"grad_norm": 0.8557175692266639,
"learning_rate": 4.303519431592479e-06,
"loss": 0.0294,
"step": 612
},
{
"epoch": 1.0907473309608542,
"grad_norm": 1.2510738747884549,
"learning_rate": 4.289683359805673e-06,
"loss": 0.0626,
"step": 613
},
{
"epoch": 1.092526690391459,
"grad_norm": 1.1086025882199797,
"learning_rate": 4.275852837071309e-06,
"loss": 0.0386,
"step": 614
},
{
"epoch": 1.094306049822064,
"grad_norm": 0.8466625547422796,
"learning_rate": 4.26202797143457e-06,
"loss": 0.0359,
"step": 615
},
{
"epoch": 1.096085409252669,
"grad_norm": 1.0320189897254877,
"learning_rate": 4.248208870896456e-06,
"loss": 0.0452,
"step": 616
},
{
"epoch": 1.097864768683274,
"grad_norm": 1.1139859554711828,
"learning_rate": 4.234395643412925e-06,
"loss": 0.0588,
"step": 617
},
{
"epoch": 1.099644128113879,
"grad_norm": 1.0833592021160374,
"learning_rate": 4.220588396894055e-06,
"loss": 0.0435,
"step": 618
},
{
"epoch": 1.101423487544484,
"grad_norm": 1.0747608563273376,
"learning_rate": 4.2067872392031965e-06,
"loss": 0.0439,
"step": 619
},
{
"epoch": 1.103202846975089,
"grad_norm": 1.21276524318406,
"learning_rate": 4.192992278156141e-06,
"loss": 0.0674,
"step": 620
},
{
"epoch": 1.104982206405694,
"grad_norm": 1.179304229566795,
"learning_rate": 4.179203621520262e-06,
"loss": 0.0538,
"step": 621
},
{
"epoch": 1.106761565836299,
"grad_norm": 0.99867215096944,
"learning_rate": 4.165421377013691e-06,
"loss": 0.0303,
"step": 622
},
{
"epoch": 1.1085409252669038,
"grad_norm": 0.92035257564448,
"learning_rate": 4.151645652304465e-06,
"loss": 0.0402,
"step": 623
},
{
"epoch": 1.1103202846975089,
"grad_norm": 1.0410983710015873,
"learning_rate": 4.137876555009684e-06,
"loss": 0.0493,
"step": 624
},
{
"epoch": 1.112099644128114,
"grad_norm": 1.1209427859065353,
"learning_rate": 4.124114192694676e-06,
"loss": 0.0457,
"step": 625
},
{
"epoch": 1.113879003558719,
"grad_norm": 0.8743076296872143,
"learning_rate": 4.110358672872158e-06,
"loss": 0.0422,
"step": 626
},
{
"epoch": 1.1156583629893237,
"grad_norm": 1.0728261185889085,
"learning_rate": 4.0966101030013915e-06,
"loss": 0.0476,
"step": 627
},
{
"epoch": 1.1174377224199288,
"grad_norm": 0.84062355751606,
"learning_rate": 4.082868590487339e-06,
"loss": 0.0373,
"step": 628
},
{
"epoch": 1.1192170818505338,
"grad_norm": 0.6922337708933334,
"learning_rate": 4.069134242679837e-06,
"loss": 0.0303,
"step": 629
},
{
"epoch": 1.1209964412811388,
"grad_norm": 1.094838178393759,
"learning_rate": 4.055407166872748e-06,
"loss": 0.0463,
"step": 630
},
{
"epoch": 1.1227758007117439,
"grad_norm": 0.9729388062399839,
"learning_rate": 4.041687470303127e-06,
"loss": 0.0427,
"step": 631
},
{
"epoch": 1.1245551601423487,
"grad_norm": 0.965017927999121,
"learning_rate": 4.02797526015038e-06,
"loss": 0.0502,
"step": 632
},
{
"epoch": 1.1263345195729537,
"grad_norm": 0.986790501677302,
"learning_rate": 4.014270643535427e-06,
"loss": 0.0407,
"step": 633
},
{
"epoch": 1.1281138790035588,
"grad_norm": 1.2417735052456353,
"learning_rate": 4.000573727519868e-06,
"loss": 0.0669,
"step": 634
},
{
"epoch": 1.1298932384341638,
"grad_norm": 0.9006479956694917,
"learning_rate": 3.9868846191051465e-06,
"loss": 0.0372,
"step": 635
},
{
"epoch": 1.1316725978647686,
"grad_norm": 1.1357157682949497,
"learning_rate": 3.973203425231715e-06,
"loss": 0.0542,
"step": 636
},
{
"epoch": 1.1334519572953736,
"grad_norm": 0.9210806832468151,
"learning_rate": 3.959530252778187e-06,
"loss": 0.0504,
"step": 637
},
{
"epoch": 1.1352313167259787,
"grad_norm": 1.2020387615526031,
"learning_rate": 3.945865208560522e-06,
"loss": 0.0668,
"step": 638
},
{
"epoch": 1.1370106761565837,
"grad_norm": 1.150306180182193,
"learning_rate": 3.932208399331177e-06,
"loss": 0.0527,
"step": 639
},
{
"epoch": 1.1387900355871885,
"grad_norm": 1.2018719772753494,
"learning_rate": 3.918559931778277e-06,
"loss": 0.0645,
"step": 640
},
{
"epoch": 1.1405693950177935,
"grad_norm": 0.8606898186114491,
"learning_rate": 3.904919912524784e-06,
"loss": 0.0334,
"step": 641
},
{
"epoch": 1.1423487544483986,
"grad_norm": 0.9525507385343362,
"learning_rate": 3.891288448127654e-06,
"loss": 0.0453,
"step": 642
},
{
"epoch": 1.1441281138790036,
"grad_norm": 1.001256472466121,
"learning_rate": 3.877665645077017e-06,
"loss": 0.0472,
"step": 643
},
{
"epoch": 1.1459074733096086,
"grad_norm": 1.183003789633624,
"learning_rate": 3.86405160979534e-06,
"loss": 0.0519,
"step": 644
},
{
"epoch": 1.1476868327402134,
"grad_norm": 0.8695848757571557,
"learning_rate": 3.850446448636597e-06,
"loss": 0.0422,
"step": 645
},
{
"epoch": 1.1494661921708185,
"grad_norm": 0.7483879064998394,
"learning_rate": 3.8368502678854296e-06,
"loss": 0.0339,
"step": 646
},
{
"epoch": 1.1512455516014235,
"grad_norm": 0.9406953847372124,
"learning_rate": 3.8232631737563306e-06,
"loss": 0.0451,
"step": 647
},
{
"epoch": 1.1530249110320285,
"grad_norm": 1.103408418773824,
"learning_rate": 3.809685272392804e-06,
"loss": 0.0431,
"step": 648
},
{
"epoch": 1.1548042704626336,
"grad_norm": 1.1014544973242129,
"learning_rate": 3.796116669866543e-06,
"loss": 0.0497,
"step": 649
},
{
"epoch": 1.1565836298932384,
"grad_norm": 0.926236874246472,
"learning_rate": 3.78255747217659e-06,
"loss": 0.0457,
"step": 650
},
{
"epoch": 1.1583629893238434,
"grad_norm": 0.9874886298329539,
"learning_rate": 3.769007785248523e-06,
"loss": 0.042,
"step": 651
},
{
"epoch": 1.1601423487544484,
"grad_norm": 0.9433772642686659,
"learning_rate": 3.7554677149336186e-06,
"loss": 0.0414,
"step": 652
},
{
"epoch": 1.1619217081850535,
"grad_norm": 1.0777671762969125,
"learning_rate": 3.7419373670080284e-06,
"loss": 0.0476,
"step": 653
},
{
"epoch": 1.1637010676156583,
"grad_norm": 1.1304402592387923,
"learning_rate": 3.7284168471719527e-06,
"loss": 0.0529,
"step": 654
},
{
"epoch": 1.1654804270462633,
"grad_norm": 1.0441551907434734,
"learning_rate": 3.7149062610488085e-06,
"loss": 0.0522,
"step": 655
},
{
"epoch": 1.1672597864768683,
"grad_norm": 0.9601081313393668,
"learning_rate": 3.701405714184416e-06,
"loss": 0.0461,
"step": 656
},
{
"epoch": 1.1690391459074734,
"grad_norm": 0.9071924144253114,
"learning_rate": 3.687915312046166e-06,
"loss": 0.0369,
"step": 657
},
{
"epoch": 1.1708185053380782,
"grad_norm": 1.0181711239434297,
"learning_rate": 3.6744351600221994e-06,
"loss": 0.0338,
"step": 658
},
{
"epoch": 1.1725978647686832,
"grad_norm": 1.1482871630180345,
"learning_rate": 3.6609653634205773e-06,
"loss": 0.0595,
"step": 659
},
{
"epoch": 1.1743772241992882,
"grad_norm": 1.1824425297449868,
"learning_rate": 3.647506027468467e-06,
"loss": 0.0436,
"step": 660
},
{
"epoch": 1.1761565836298933,
"grad_norm": 0.8846905635199142,
"learning_rate": 3.6340572573113176e-06,
"loss": 0.0365,
"step": 661
},
{
"epoch": 1.1779359430604983,
"grad_norm": 1.0178369029720964,
"learning_rate": 3.6206191580120346e-06,
"loss": 0.0489,
"step": 662
},
{
"epoch": 1.1797153024911031,
"grad_norm": 1.190092748578404,
"learning_rate": 3.6071918345501655e-06,
"loss": 0.0502,
"step": 663
},
{
"epoch": 1.1814946619217082,
"grad_norm": 0.8837883568681317,
"learning_rate": 3.5937753918210705e-06,
"loss": 0.0359,
"step": 664
},
{
"epoch": 1.1832740213523132,
"grad_norm": 0.925819103061139,
"learning_rate": 3.5803699346351117e-06,
"loss": 0.0365,
"step": 665
},
{
"epoch": 1.1850533807829182,
"grad_norm": 0.7996611763224647,
"learning_rate": 3.566975567716833e-06,
"loss": 0.0303,
"step": 666
},
{
"epoch": 1.1868327402135233,
"grad_norm": 1.0921199636245027,
"learning_rate": 3.5535923957041374e-06,
"loss": 0.0571,
"step": 667
},
{
"epoch": 1.188612099644128,
"grad_norm": 0.8005202972903217,
"learning_rate": 3.540220523147474e-06,
"loss": 0.0395,
"step": 668
},
{
"epoch": 1.190391459074733,
"grad_norm": 1.0764132328807485,
"learning_rate": 3.5268600545090183e-06,
"loss": 0.0396,
"step": 669
},
{
"epoch": 1.1921708185053381,
"grad_norm": 0.976616916449195,
"learning_rate": 3.513511094161858e-06,
"loss": 0.044,
"step": 670
},
{
"epoch": 1.193950177935943,
"grad_norm": 1.2329641087482588,
"learning_rate": 3.5001737463891793e-06,
"loss": 0.0458,
"step": 671
},
{
"epoch": 1.195729537366548,
"grad_norm": 0.9695655624227043,
"learning_rate": 3.4868481153834454e-06,
"loss": 0.0416,
"step": 672
},
{
"epoch": 1.197508896797153,
"grad_norm": 1.0587017444949576,
"learning_rate": 3.4735343052455905e-06,
"loss": 0.0432,
"step": 673
},
{
"epoch": 1.199288256227758,
"grad_norm": 1.1377978824885697,
"learning_rate": 3.4602324199842026e-06,
"loss": 0.0411,
"step": 674
},
{
"epoch": 1.201067615658363,
"grad_norm": 1.0649133398233424,
"learning_rate": 3.446942563514711e-06,
"loss": 0.043,
"step": 675
},
{
"epoch": 1.2028469750889679,
"grad_norm": 1.2154917888765664,
"learning_rate": 3.4336648396585777e-06,
"loss": 0.0422,
"step": 676
},
{
"epoch": 1.204626334519573,
"grad_norm": 1.0427405390699247,
"learning_rate": 3.4203993521424774e-06,
"loss": 0.0536,
"step": 677
},
{
"epoch": 1.206405693950178,
"grad_norm": 0.8919462337795426,
"learning_rate": 3.407146204597499e-06,
"loss": 0.0363,
"step": 678
},
{
"epoch": 1.208185053380783,
"grad_norm": 0.92588445633896,
"learning_rate": 3.3939055005583305e-06,
"loss": 0.0393,
"step": 679
},
{
"epoch": 1.209964412811388,
"grad_norm": 0.9646545633516691,
"learning_rate": 3.3806773434624475e-06,
"loss": 0.0438,
"step": 680
},
{
"epoch": 1.2117437722419928,
"grad_norm": 1.1776655307293824,
"learning_rate": 3.3674618366493117e-06,
"loss": 0.0534,
"step": 681
},
{
"epoch": 1.2135231316725978,
"grad_norm": 0.9296310818357466,
"learning_rate": 3.3542590833595533e-06,
"loss": 0.0414,
"step": 682
},
{
"epoch": 1.2153024911032029,
"grad_norm": 0.9130754996133634,
"learning_rate": 3.341069186734176e-06,
"loss": 0.0366,
"step": 683
},
{
"epoch": 1.217081850533808,
"grad_norm": 0.8398374744131705,
"learning_rate": 3.3278922498137455e-06,
"loss": 0.0408,
"step": 684
},
{
"epoch": 1.2188612099644127,
"grad_norm": 0.8194889930167317,
"learning_rate": 3.314728375537587e-06,
"loss": 0.0343,
"step": 685
},
{
"epoch": 1.2206405693950177,
"grad_norm": 1.0568307723228527,
"learning_rate": 3.3015776667429724e-06,
"loss": 0.0533,
"step": 686
},
{
"epoch": 1.2224199288256228,
"grad_norm": 0.846464404831591,
"learning_rate": 3.2884402261643296e-06,
"loss": 0.0317,
"step": 687
},
{
"epoch": 1.2241992882562278,
"grad_norm": 1.0612818751427298,
"learning_rate": 3.2753161564324344e-06,
"loss": 0.051,
"step": 688
},
{
"epoch": 1.2259786476868326,
"grad_norm": 1.0921889784490464,
"learning_rate": 3.262205560073605e-06,
"loss": 0.0315,
"step": 689
},
{
"epoch": 1.2277580071174377,
"grad_norm": 0.7877871308400063,
"learning_rate": 3.249108539508909e-06,
"loss": 0.0281,
"step": 690
},
{
"epoch": 1.2295373665480427,
"grad_norm": 1.2396290130990737,
"learning_rate": 3.2360251970533527e-06,
"loss": 0.055,
"step": 691
},
{
"epoch": 1.2313167259786477,
"grad_norm": 1.0214709327506637,
"learning_rate": 3.2229556349150947e-06,
"loss": 0.0492,
"step": 692
},
{
"epoch": 1.2330960854092528,
"grad_norm": 0.8568501040637228,
"learning_rate": 3.2098999551946337e-06,
"loss": 0.0337,
"step": 693
},
{
"epoch": 1.2348754448398576,
"grad_norm": 1.2308619987987102,
"learning_rate": 3.1968582598840234e-06,
"loss": 0.0532,
"step": 694
},
{
"epoch": 1.2366548042704626,
"grad_norm": 1.087501995164755,
"learning_rate": 3.183830650866068e-06,
"loss": 0.0381,
"step": 695
},
{
"epoch": 1.2384341637010676,
"grad_norm": 0.8919790084939676,
"learning_rate": 3.1708172299135266e-06,
"loss": 0.0376,
"step": 696
},
{
"epoch": 1.2402135231316727,
"grad_norm": 1.4264095905198482,
"learning_rate": 3.1578180986883234e-06,
"loss": 0.062,
"step": 697
},
{
"epoch": 1.2419928825622777,
"grad_norm": 1.1571895955783398,
"learning_rate": 3.1448333587407486e-06,
"loss": 0.0563,
"step": 698
},
{
"epoch": 1.2437722419928825,
"grad_norm": 0.9980157300161984,
"learning_rate": 3.131863111508667e-06,
"loss": 0.0454,
"step": 699
},
{
"epoch": 1.2455516014234875,
"grad_norm": 0.9157461897439566,
"learning_rate": 3.118907458316722e-06,
"loss": 0.0369,
"step": 700
},
{
"epoch": 1.2473309608540926,
"grad_norm": 1.0326915297964894,
"learning_rate": 3.105966500375551e-06,
"loss": 0.0347,
"step": 701
},
{
"epoch": 1.2491103202846976,
"grad_norm": 1.0895553545223995,
"learning_rate": 3.0930403387809892e-06,
"loss": 0.0483,
"step": 702
},
{
"epoch": 1.2508896797153026,
"grad_norm": 1.2156705365027338,
"learning_rate": 3.080129074513285e-06,
"loss": 0.0558,
"step": 703
},
{
"epoch": 1.2526690391459074,
"grad_norm": 1.4083175940812185,
"learning_rate": 3.067232808436299e-06,
"loss": 0.058,
"step": 704
},
{
"epoch": 1.2544483985765125,
"grad_norm": 0.8911382799239523,
"learning_rate": 3.0543516412967327e-06,
"loss": 0.037,
"step": 705
},
{
"epoch": 1.2562277580071175,
"grad_norm": 0.7617173335791619,
"learning_rate": 3.041485673723331e-06,
"loss": 0.0286,
"step": 706
},
{
"epoch": 1.2580071174377223,
"grad_norm": 0.9580614242960381,
"learning_rate": 3.0286350062261017e-06,
"loss": 0.0381,
"step": 707
},
{
"epoch": 1.2597864768683273,
"grad_norm": 0.9048540935872091,
"learning_rate": 3.0157997391955172e-06,
"loss": 0.0378,
"step": 708
},
{
"epoch": 1.2615658362989324,
"grad_norm": 1.2822234133916406,
"learning_rate": 3.0029799729017518e-06,
"loss": 0.0572,
"step": 709
},
{
"epoch": 1.2633451957295374,
"grad_norm": 0.8668230724849544,
"learning_rate": 2.9901758074938797e-06,
"loss": 0.0404,
"step": 710
},
{
"epoch": 1.2651245551601424,
"grad_norm": 0.902622064775602,
"learning_rate": 2.977387342999103e-06,
"loss": 0.0343,
"step": 711
},
{
"epoch": 1.2669039145907472,
"grad_norm": 1.3487596797513532,
"learning_rate": 2.964614679321966e-06,
"loss": 0.0511,
"step": 712
},
{
"epoch": 1.2686832740213523,
"grad_norm": 0.9802953443926045,
"learning_rate": 2.951857916243574e-06,
"loss": 0.0431,
"step": 713
},
{
"epoch": 1.2704626334519573,
"grad_norm": 1.4332880642967385,
"learning_rate": 2.9391171534208185e-06,
"loss": 0.0544,
"step": 714
},
{
"epoch": 1.2722419928825623,
"grad_norm": 1.1081952360257095,
"learning_rate": 2.9263924903855932e-06,
"loss": 0.0416,
"step": 715
},
{
"epoch": 1.2740213523131674,
"grad_norm": 1.0333605663846648,
"learning_rate": 2.9136840265440213e-06,
"loss": 0.0469,
"step": 716
},
{
"epoch": 1.2758007117437722,
"grad_norm": 1.0641882073169253,
"learning_rate": 2.9009918611756732e-06,
"loss": 0.0448,
"step": 717
},
{
"epoch": 1.2775800711743772,
"grad_norm": 0.8806910046198404,
"learning_rate": 2.8883160934327968e-06,
"loss": 0.0392,
"step": 718
},
{
"epoch": 1.2793594306049823,
"grad_norm": 1.3217512808066976,
"learning_rate": 2.8756568223395396e-06,
"loss": 0.0525,
"step": 719
},
{
"epoch": 1.281138790035587,
"grad_norm": 1.1066401990396189,
"learning_rate": 2.8630141467911777e-06,
"loss": 0.0469,
"step": 720
},
{
"epoch": 1.282918149466192,
"grad_norm": 0.8964680090502098,
"learning_rate": 2.8503881655533395e-06,
"loss": 0.0301,
"step": 721
},
{
"epoch": 1.2846975088967971,
"grad_norm": 0.9790179316595002,
"learning_rate": 2.837778977261235e-06,
"loss": 0.0373,
"step": 722
},
{
"epoch": 1.2864768683274022,
"grad_norm": 1.0244740989723553,
"learning_rate": 2.8251866804188875e-06,
"loss": 0.0413,
"step": 723
},
{
"epoch": 1.2882562277580072,
"grad_norm": 0.907577382814479,
"learning_rate": 2.812611373398365e-06,
"loss": 0.0379,
"step": 724
},
{
"epoch": 1.290035587188612,
"grad_norm": 0.9877905229412578,
"learning_rate": 2.8000531544390064e-06,
"loss": 0.0365,
"step": 725
},
{
"epoch": 1.291814946619217,
"grad_norm": 0.9382307500323435,
"learning_rate": 2.7875121216466595e-06,
"loss": 0.038,
"step": 726
},
{
"epoch": 1.293594306049822,
"grad_norm": 1.2668204222978174,
"learning_rate": 2.7749883729929105e-06,
"loss": 0.053,
"step": 727
},
{
"epoch": 1.295373665480427,
"grad_norm": 0.8999055571713811,
"learning_rate": 2.762482006314324e-06,
"loss": 0.0384,
"step": 728
},
{
"epoch": 1.2971530249110321,
"grad_norm": 1.0154187759025894,
"learning_rate": 2.7499931193116692e-06,
"loss": 0.0327,
"step": 729
},
{
"epoch": 1.298932384341637,
"grad_norm": 1.0802920648004704,
"learning_rate": 2.737521809549167e-06,
"loss": 0.0417,
"step": 730
},
{
"epoch": 1.300711743772242,
"grad_norm": 1.0066519304730417,
"learning_rate": 2.725068174453722e-06,
"loss": 0.0482,
"step": 731
},
{
"epoch": 1.302491103202847,
"grad_norm": 1.0410669256725513,
"learning_rate": 2.712632311314165e-06,
"loss": 0.0386,
"step": 732
},
{
"epoch": 1.304270462633452,
"grad_norm": 1.1234658089663005,
"learning_rate": 2.7002143172804875e-06,
"loss": 0.043,
"step": 733
},
{
"epoch": 1.306049822064057,
"grad_norm": 1.192568202231757,
"learning_rate": 2.6878142893630904e-06,
"loss": 0.0457,
"step": 734
},
{
"epoch": 1.3078291814946619,
"grad_norm": 0.9984922757453483,
"learning_rate": 2.6754323244320154e-06,
"loss": 0.0434,
"step": 735
},
{
"epoch": 1.309608540925267,
"grad_norm": 1.0029752401431848,
"learning_rate": 2.6630685192161995e-06,
"loss": 0.0351,
"step": 736
},
{
"epoch": 1.311387900355872,
"grad_norm": 1.039717365436853,
"learning_rate": 2.650722970302714e-06,
"loss": 0.0424,
"step": 737
},
{
"epoch": 1.3131672597864767,
"grad_norm": 1.3580396164284314,
"learning_rate": 2.638395774136009e-06,
"loss": 0.0567,
"step": 738
},
{
"epoch": 1.3149466192170818,
"grad_norm": 0.9444133237600162,
"learning_rate": 2.6260870270171645e-06,
"loss": 0.0357,
"step": 739
},
{
"epoch": 1.3167259786476868,
"grad_norm": 1.0580934161350533,
"learning_rate": 2.613796825103129e-06,
"loss": 0.0414,
"step": 740
},
{
"epoch": 1.3185053380782918,
"grad_norm": 0.9834182092745584,
"learning_rate": 2.60152526440598e-06,
"loss": 0.0387,
"step": 741
},
{
"epoch": 1.3202846975088969,
"grad_norm": 0.7616732359159014,
"learning_rate": 2.5892724407921667e-06,
"loss": 0.0289,
"step": 742
},
{
"epoch": 1.3220640569395017,
"grad_norm": 1.3915238757419404,
"learning_rate": 2.577038449981763e-06,
"loss": 0.0611,
"step": 743
},
{
"epoch": 1.3238434163701067,
"grad_norm": 1.2173657729432406,
"learning_rate": 2.564823387547716e-06,
"loss": 0.0425,
"step": 744
},
{
"epoch": 1.3256227758007118,
"grad_norm": 0.9135314344508192,
"learning_rate": 2.552627348915106e-06,
"loss": 0.0371,
"step": 745
},
{
"epoch": 1.3274021352313168,
"grad_norm": 1.0893426987998907,
"learning_rate": 2.5404504293603983e-06,
"loss": 0.0505,
"step": 746
},
{
"epoch": 1.3291814946619218,
"grad_norm": 1.2235742895097919,
"learning_rate": 2.528292724010697e-06,
"loss": 0.0546,
"step": 747
},
{
"epoch": 1.3309608540925266,
"grad_norm": 0.963631042620569,
"learning_rate": 2.5161543278430055e-06,
"loss": 0.0416,
"step": 748
},
{
"epoch": 1.3327402135231317,
"grad_norm": 1.2217909377948624,
"learning_rate": 2.5040353356834756e-06,
"loss": 0.0432,
"step": 749
},
{
"epoch": 1.3345195729537367,
"grad_norm": 1.049568511067747,
"learning_rate": 2.4919358422066816e-06,
"loss": 0.0416,
"step": 750
},
{
"epoch": 1.3362989323843417,
"grad_norm": 0.912111020602681,
"learning_rate": 2.4798559419348672e-06,
"loss": 0.0378,
"step": 751
},
{
"epoch": 1.3380782918149468,
"grad_norm": 0.8880929267694146,
"learning_rate": 2.4677957292372166e-06,
"loss": 0.0352,
"step": 752
},
{
"epoch": 1.3398576512455516,
"grad_norm": 0.9824476908080295,
"learning_rate": 2.455755298329107e-06,
"loss": 0.0367,
"step": 753
},
{
"epoch": 1.3416370106761566,
"grad_norm": 1.021419477068853,
"learning_rate": 2.4437347432713838e-06,
"loss": 0.0428,
"step": 754
},
{
"epoch": 1.3434163701067616,
"grad_norm": 1.1022589426865288,
"learning_rate": 2.431734157969619e-06,
"loss": 0.0395,
"step": 755
},
{
"epoch": 1.3451957295373664,
"grad_norm": 1.0934549678996628,
"learning_rate": 2.4197536361733792e-06,
"loss": 0.0477,
"step": 756
},
{
"epoch": 1.3469750889679715,
"grad_norm": 0.827138815781088,
"learning_rate": 2.407793271475495e-06,
"loss": 0.0304,
"step": 757
},
{
"epoch": 1.3487544483985765,
"grad_norm": 0.8432510777831591,
"learning_rate": 2.3958531573113223e-06,
"loss": 0.037,
"step": 758
},
{
"epoch": 1.3505338078291815,
"grad_norm": 1.0216803053267136,
"learning_rate": 2.3839333869580243e-06,
"loss": 0.0439,
"step": 759
},
{
"epoch": 1.3523131672597866,
"grad_norm": 0.9332421396895141,
"learning_rate": 2.372034053533835e-06,
"loss": 0.0368,
"step": 760
},
{
"epoch": 1.3540925266903914,
"grad_norm": 0.7104040014108932,
"learning_rate": 2.360155249997334e-06,
"loss": 0.0297,
"step": 761
},
{
"epoch": 1.3558718861209964,
"grad_norm": 0.6773597226062513,
"learning_rate": 2.348297069146715e-06,
"loss": 0.0245,
"step": 762
},
{
"epoch": 1.3576512455516014,
"grad_norm": 0.7243261986365364,
"learning_rate": 2.3364596036190706e-06,
"loss": 0.0285,
"step": 763
},
{
"epoch": 1.3594306049822065,
"grad_norm": 1.0093488845674181,
"learning_rate": 2.3246429458896637e-06,
"loss": 0.0378,
"step": 764
},
{
"epoch": 1.3612099644128115,
"grad_norm": 0.9584656799973454,
"learning_rate": 2.312847188271203e-06,
"loss": 0.0422,
"step": 765
},
{
"epoch": 1.3629893238434163,
"grad_norm": 0.7865628537475263,
"learning_rate": 2.301072422913123e-06,
"loss": 0.0225,
"step": 766
},
{
"epoch": 1.3647686832740213,
"grad_norm": 1.0060606728391541,
"learning_rate": 2.2893187418008666e-06,
"loss": 0.0384,
"step": 767
},
{
"epoch": 1.3665480427046264,
"grad_norm": 1.0806683796823793,
"learning_rate": 2.2775862367551642e-06,
"loss": 0.0451,
"step": 768
},
{
"epoch": 1.3683274021352312,
"grad_norm": 1.055751788962701,
"learning_rate": 2.265874999431318e-06,
"loss": 0.0472,
"step": 769
},
{
"epoch": 1.3701067615658362,
"grad_norm": 1.1232736822675,
"learning_rate": 2.254185121318484e-06,
"loss": 0.0355,
"step": 770
},
{
"epoch": 1.3718861209964412,
"grad_norm": 1.0562010911911415,
"learning_rate": 2.2425166937389596e-06,
"loss": 0.0405,
"step": 771
},
{
"epoch": 1.3736654804270463,
"grad_norm": 0.825143680854267,
"learning_rate": 2.2308698078474645e-06,
"loss": 0.0349,
"step": 772
},
{
"epoch": 1.3754448398576513,
"grad_norm": 1.001912955792229,
"learning_rate": 2.219244554630438e-06,
"loss": 0.0477,
"step": 773
},
{
"epoch": 1.3772241992882561,
"grad_norm": 0.7736758012090639,
"learning_rate": 2.207641024905322e-06,
"loss": 0.0288,
"step": 774
},
{
"epoch": 1.3790035587188612,
"grad_norm": 1.1162837602545816,
"learning_rate": 2.1960593093198508e-06,
"loss": 0.0316,
"step": 775
},
{
"epoch": 1.3807829181494662,
"grad_norm": 1.0237380141940946,
"learning_rate": 2.184499498351347e-06,
"loss": 0.0421,
"step": 776
},
{
"epoch": 1.3825622775800712,
"grad_norm": 0.9455947117365314,
"learning_rate": 2.172961682306011e-06,
"loss": 0.0399,
"step": 777
},
{
"epoch": 1.3843416370106763,
"grad_norm": 0.8891975255037335,
"learning_rate": 2.1614459513182173e-06,
"loss": 0.0341,
"step": 778
},
{
"epoch": 1.386120996441281,
"grad_norm": 0.8854067913254945,
"learning_rate": 2.149952395349813e-06,
"loss": 0.0296,
"step": 779
},
{
"epoch": 1.387900355871886,
"grad_norm": 1.1944275799619644,
"learning_rate": 2.1384811041894055e-06,
"loss": 0.045,
"step": 780
},
{
"epoch": 1.3896797153024911,
"grad_norm": 1.0432671686625763,
"learning_rate": 2.1270321674516736e-06,
"loss": 0.0467,
"step": 781
},
{
"epoch": 1.3914590747330962,
"grad_norm": 1.051590997554103,
"learning_rate": 2.1156056745766593e-06,
"loss": 0.0393,
"step": 782
},
{
"epoch": 1.3932384341637012,
"grad_norm": 0.7769767413327008,
"learning_rate": 2.104201714829074e-06,
"loss": 0.0351,
"step": 783
},
{
"epoch": 1.395017793594306,
"grad_norm": 0.9830455602830936,
"learning_rate": 2.0928203772975917e-06,
"loss": 0.0457,
"step": 784
},
{
"epoch": 1.396797153024911,
"grad_norm": 1.3957545926088177,
"learning_rate": 2.081461750894166e-06,
"loss": 0.0477,
"step": 785
},
{
"epoch": 1.398576512455516,
"grad_norm": 1.1761039690624113,
"learning_rate": 2.070125924353328e-06,
"loss": 0.0521,
"step": 786
},
{
"epoch": 1.4003558718861209,
"grad_norm": 1.110685537230009,
"learning_rate": 2.058812986231493e-06,
"loss": 0.052,
"step": 787
},
{
"epoch": 1.402135231316726,
"grad_norm": 1.2173352503367165,
"learning_rate": 2.0475230249062727e-06,
"loss": 0.0595,
"step": 788
},
{
"epoch": 1.403914590747331,
"grad_norm": 1.0171099640987131,
"learning_rate": 2.0362561285757766e-06,
"loss": 0.0409,
"step": 789
},
{
"epoch": 1.405693950177936,
"grad_norm": 1.0659961619337683,
"learning_rate": 2.0250123852579347e-06,
"loss": 0.0463,
"step": 790
},
{
"epoch": 1.407473309608541,
"grad_norm": 0.8217925051573368,
"learning_rate": 2.013791882789801e-06,
"loss": 0.0331,
"step": 791
},
{
"epoch": 1.4092526690391458,
"grad_norm": 1.0192070289515296,
"learning_rate": 2.0025947088268714e-06,
"loss": 0.0329,
"step": 792
},
{
"epoch": 1.4110320284697508,
"grad_norm": 0.8674257228315672,
"learning_rate": 1.9914209508423943e-06,
"loss": 0.0382,
"step": 793
},
{
"epoch": 1.4128113879003559,
"grad_norm": 1.039394594603558,
"learning_rate": 1.9802706961266936e-06,
"loss": 0.0514,
"step": 794
},
{
"epoch": 1.414590747330961,
"grad_norm": 1.2422673645671691,
"learning_rate": 1.969144031786483e-06,
"loss": 0.0509,
"step": 795
},
{
"epoch": 1.416370106761566,
"grad_norm": 0.9834861700370274,
"learning_rate": 1.958041044744186e-06,
"loss": 0.0478,
"step": 796
},
{
"epoch": 1.4181494661921707,
"grad_norm": 0.8072332504815498,
"learning_rate": 1.94696182173726e-06,
"loss": 0.0272,
"step": 797
},
{
"epoch": 1.4199288256227758,
"grad_norm": 1.041888748607498,
"learning_rate": 1.9359064493175077e-06,
"loss": 0.0461,
"step": 798
},
{
"epoch": 1.4217081850533808,
"grad_norm": 0.8547473536647248,
"learning_rate": 1.9248750138504176e-06,
"loss": 0.0463,
"step": 799
},
{
"epoch": 1.4234875444839858,
"grad_norm": 0.9160844863052742,
"learning_rate": 1.9138676015144765e-06,
"loss": 0.033,
"step": 800
},
{
"epoch": 1.4234875444839858,
"eval_loss": 0.13344429433345795,
"eval_runtime": 7.1527,
"eval_samples_per_second": 6.431,
"eval_steps_per_second": 1.678,
"step": 800
},
{
"epoch": 1.4252669039145909,
"grad_norm": 1.1728004609011466,
"learning_rate": 1.9028842983005036e-06,
"loss": 0.0444,
"step": 801
},
{
"epoch": 1.4270462633451957,
"grad_norm": 0.9182230252406717,
"learning_rate": 1.8919251900109697e-06,
"loss": 0.0406,
"step": 802
},
{
"epoch": 1.4288256227758007,
"grad_norm": 0.7740835233990916,
"learning_rate": 1.8809903622593395e-06,
"loss": 0.0288,
"step": 803
},
{
"epoch": 1.4306049822064058,
"grad_norm": 0.8681372416415234,
"learning_rate": 1.870079900469392e-06,
"loss": 0.036,
"step": 804
},
{
"epoch": 1.4323843416370106,
"grad_norm": 0.9510296033757211,
"learning_rate": 1.8591938898745593e-06,
"loss": 0.0378,
"step": 805
},
{
"epoch": 1.4341637010676156,
"grad_norm": 0.8671006576919247,
"learning_rate": 1.8483324155172594e-06,
"loss": 0.0359,
"step": 806
},
{
"epoch": 1.4359430604982206,
"grad_norm": 0.7456102665101375,
"learning_rate": 1.837495562248226e-06,
"loss": 0.0308,
"step": 807
},
{
"epoch": 1.4377224199288257,
"grad_norm": 0.9769160563014989,
"learning_rate": 1.8266834147258577e-06,
"loss": 0.0417,
"step": 808
},
{
"epoch": 1.4395017793594307,
"grad_norm": 0.9173281560783461,
"learning_rate": 1.8158960574155455e-06,
"loss": 0.038,
"step": 809
},
{
"epoch": 1.4412811387900355,
"grad_norm": 0.9096880827995866,
"learning_rate": 1.8051335745890196e-06,
"loss": 0.0355,
"step": 810
},
{
"epoch": 1.4430604982206405,
"grad_norm": 0.9740778727636218,
"learning_rate": 1.7943960503236856e-06,
"loss": 0.0502,
"step": 811
},
{
"epoch": 1.4448398576512456,
"grad_norm": 0.7435132732373217,
"learning_rate": 1.7836835685019732e-06,
"loss": 0.0307,
"step": 812
},
{
"epoch": 1.4466192170818506,
"grad_norm": 0.9787109046764307,
"learning_rate": 1.7729962128106787e-06,
"loss": 0.0319,
"step": 813
},
{
"epoch": 1.4483985765124556,
"grad_norm": 0.8743242569858206,
"learning_rate": 1.7623340667403089e-06,
"loss": 0.035,
"step": 814
},
{
"epoch": 1.4501779359430604,
"grad_norm": 1.218953043581514,
"learning_rate": 1.7516972135844352e-06,
"loss": 0.0562,
"step": 815
},
{
"epoch": 1.4519572953736655,
"grad_norm": 0.9847396476889714,
"learning_rate": 1.741085736439031e-06,
"loss": 0.0348,
"step": 816
},
{
"epoch": 1.4537366548042705,
"grad_norm": 1.0415611109464662,
"learning_rate": 1.730499718201838e-06,
"loss": 0.0366,
"step": 817
},
{
"epoch": 1.4555160142348753,
"grad_norm": 1.0787375133207857,
"learning_rate": 1.7199392415717064e-06,
"loss": 0.0404,
"step": 818
},
{
"epoch": 1.4572953736654806,
"grad_norm": 0.899906940151615,
"learning_rate": 1.7094043890479557e-06,
"loss": 0.0425,
"step": 819
},
{
"epoch": 1.4590747330960854,
"grad_norm": 0.7722333626507534,
"learning_rate": 1.698895242929725e-06,
"loss": 0.0288,
"step": 820
},
{
"epoch": 1.4608540925266904,
"grad_norm": 0.8863193084024836,
"learning_rate": 1.6884118853153358e-06,
"loss": 0.0314,
"step": 821
},
{
"epoch": 1.4626334519572954,
"grad_norm": 1.1082581073729272,
"learning_rate": 1.6779543981016478e-06,
"loss": 0.0404,
"step": 822
},
{
"epoch": 1.4644128113879002,
"grad_norm": 0.9955465189826128,
"learning_rate": 1.6675228629834133e-06,
"loss": 0.0437,
"step": 823
},
{
"epoch": 1.4661921708185053,
"grad_norm": 0.8741761714128741,
"learning_rate": 1.657117361452651e-06,
"loss": 0.0309,
"step": 824
},
{
"epoch": 1.4679715302491103,
"grad_norm": 0.7382086548860517,
"learning_rate": 1.6467379747980011e-06,
"loss": 0.0261,
"step": 825
},
{
"epoch": 1.4697508896797153,
"grad_norm": 0.8049471680135438,
"learning_rate": 1.6363847841040914e-06,
"loss": 0.0378,
"step": 826
},
{
"epoch": 1.4715302491103204,
"grad_norm": 0.8255741130622384,
"learning_rate": 1.626057870250906e-06,
"loss": 0.0272,
"step": 827
},
{
"epoch": 1.4733096085409252,
"grad_norm": 1.1347923689675379,
"learning_rate": 1.6157573139131527e-06,
"loss": 0.041,
"step": 828
},
{
"epoch": 1.4750889679715302,
"grad_norm": 0.7302911526168262,
"learning_rate": 1.605483195559628e-06,
"loss": 0.0274,
"step": 829
},
{
"epoch": 1.4768683274021353,
"grad_norm": 0.8999710524588727,
"learning_rate": 1.5952355954525966e-06,
"loss": 0.0331,
"step": 830
},
{
"epoch": 1.4786476868327403,
"grad_norm": 1.0409610978339072,
"learning_rate": 1.5850145936471607e-06,
"loss": 0.0423,
"step": 831
},
{
"epoch": 1.4804270462633453,
"grad_norm": 1.0603934000618,
"learning_rate": 1.5748202699906335e-06,
"loss": 0.0394,
"step": 832
},
{
"epoch": 1.4822064056939501,
"grad_norm": 0.7245019085777473,
"learning_rate": 1.5646527041219128e-06,
"loss": 0.0235,
"step": 833
},
{
"epoch": 1.4839857651245552,
"grad_norm": 1.0003933208850713,
"learning_rate": 1.5545119754708682e-06,
"loss": 0.0372,
"step": 834
},
{
"epoch": 1.4857651245551602,
"grad_norm": 1.117608737881438,
"learning_rate": 1.544398163257711e-06,
"loss": 0.0396,
"step": 835
},
{
"epoch": 1.487544483985765,
"grad_norm": 0.8069740981162163,
"learning_rate": 1.5343113464923808e-06,
"loss": 0.0307,
"step": 836
},
{
"epoch": 1.48932384341637,
"grad_norm": 0.8792976432346608,
"learning_rate": 1.524251603973927e-06,
"loss": 0.0244,
"step": 837
},
{
"epoch": 1.491103202846975,
"grad_norm": 0.7831127456375095,
"learning_rate": 1.5142190142898883e-06,
"loss": 0.0293,
"step": 838
},
{
"epoch": 1.49288256227758,
"grad_norm": 1.0227232562116537,
"learning_rate": 1.5042136558156883e-06,
"loss": 0.0417,
"step": 839
},
{
"epoch": 1.4946619217081851,
"grad_norm": 1.4504092045196866,
"learning_rate": 1.4942356067140162e-06,
"loss": 0.0529,
"step": 840
},
{
"epoch": 1.49644128113879,
"grad_norm": 0.7647593177747857,
"learning_rate": 1.4842849449342195e-06,
"loss": 0.0297,
"step": 841
},
{
"epoch": 1.498220640569395,
"grad_norm": 0.8803775447231733,
"learning_rate": 1.4743617482116896e-06,
"loss": 0.0328,
"step": 842
},
{
"epoch": 1.5,
"grad_norm": 1.2494408641089838,
"learning_rate": 1.4644660940672628e-06,
"loss": 0.0362,
"step": 843
},
{
"epoch": 1.501779359430605,
"grad_norm": 0.9133080570375326,
"learning_rate": 1.454598059806609e-06,
"loss": 0.0402,
"step": 844
},
{
"epoch": 1.50355871886121,
"grad_norm": 0.9824913432232948,
"learning_rate": 1.4447577225196296e-06,
"loss": 0.0357,
"step": 845
},
{
"epoch": 1.5053380782918149,
"grad_norm": 1.1078102826347958,
"learning_rate": 1.4349451590798564e-06,
"loss": 0.0446,
"step": 846
},
{
"epoch": 1.50711743772242,
"grad_norm": 0.9235569215634525,
"learning_rate": 1.4251604461438444e-06,
"loss": 0.0411,
"step": 847
},
{
"epoch": 1.508896797153025,
"grad_norm": 0.8344609241644628,
"learning_rate": 1.4154036601505834e-06,
"loss": 0.0264,
"step": 848
},
{
"epoch": 1.5106761565836297,
"grad_norm": 1.0208130325307214,
"learning_rate": 1.4056748773208933e-06,
"loss": 0.0345,
"step": 849
},
{
"epoch": 1.512455516014235,
"grad_norm": 1.18407278498687,
"learning_rate": 1.3959741736568339e-06,
"loss": 0.0441,
"step": 850
},
{
"epoch": 1.5142348754448398,
"grad_norm": 0.8407420899636274,
"learning_rate": 1.3863016249411027e-06,
"loss": 0.0282,
"step": 851
},
{
"epoch": 1.5160142348754448,
"grad_norm": 0.9358932612708689,
"learning_rate": 1.376657306736453e-06,
"loss": 0.0338,
"step": 852
},
{
"epoch": 1.5177935943060499,
"grad_norm": 1.0992150247198784,
"learning_rate": 1.3670412943850975e-06,
"loss": 0.0435,
"step": 853
},
{
"epoch": 1.5195729537366547,
"grad_norm": 0.7824450481280979,
"learning_rate": 1.3574536630081208e-06,
"loss": 0.0265,
"step": 854
},
{
"epoch": 1.52135231316726,
"grad_norm": 1.1561450581795805,
"learning_rate": 1.347894487504896e-06,
"loss": 0.046,
"step": 855
},
{
"epoch": 1.5231316725978647,
"grad_norm": 0.8541116368943703,
"learning_rate": 1.3383638425524909e-06,
"loss": 0.0287,
"step": 856
},
{
"epoch": 1.5249110320284698,
"grad_norm": 0.9794589585996938,
"learning_rate": 1.3288618026050943e-06,
"loss": 0.0387,
"step": 857
},
{
"epoch": 1.5266903914590748,
"grad_norm": 1.3255527026520937,
"learning_rate": 1.31938844189343e-06,
"loss": 0.0495,
"step": 858
},
{
"epoch": 1.5284697508896796,
"grad_norm": 1.0062401733112627,
"learning_rate": 1.3099438344241777e-06,
"loss": 0.0324,
"step": 859
},
{
"epoch": 1.5302491103202847,
"grad_norm": 0.9745469076864323,
"learning_rate": 1.3005280539793908e-06,
"loss": 0.0359,
"step": 860
},
{
"epoch": 1.5320284697508897,
"grad_norm": 0.8906837672794841,
"learning_rate": 1.2911411741159273e-06,
"loss": 0.0289,
"step": 861
},
{
"epoch": 1.5338078291814945,
"grad_norm": 1.0308169458364242,
"learning_rate": 1.2817832681648712e-06,
"loss": 0.0457,
"step": 862
},
{
"epoch": 1.5355871886120998,
"grad_norm": 1.066135031513164,
"learning_rate": 1.2724544092309581e-06,
"loss": 0.0408,
"step": 863
},
{
"epoch": 1.5373665480427046,
"grad_norm": 0.9990751775334056,
"learning_rate": 1.2631546701920073e-06,
"loss": 0.0376,
"step": 864
},
{
"epoch": 1.5391459074733096,
"grad_norm": 0.9279454422229131,
"learning_rate": 1.2538841236983519e-06,
"loss": 0.0366,
"step": 865
},
{
"epoch": 1.5409252669039146,
"grad_norm": 0.9293129114653861,
"learning_rate": 1.244642842172266e-06,
"loss": 0.0278,
"step": 866
},
{
"epoch": 1.5427046263345194,
"grad_norm": 0.9460528920250509,
"learning_rate": 1.2354308978074088e-06,
"loss": 0.0342,
"step": 867
},
{
"epoch": 1.5444839857651247,
"grad_norm": 0.9392590603382746,
"learning_rate": 1.2262483625682514e-06,
"loss": 0.0335,
"step": 868
},
{
"epoch": 1.5462633451957295,
"grad_norm": 0.8549256234285926,
"learning_rate": 1.2170953081895214e-06,
"loss": 0.0373,
"step": 869
},
{
"epoch": 1.5480427046263345,
"grad_norm": 1.1139473893560274,
"learning_rate": 1.2079718061756369e-06,
"loss": 0.0325,
"step": 870
},
{
"epoch": 1.5498220640569396,
"grad_norm": 0.9192491000883517,
"learning_rate": 1.1988779278001517e-06,
"loss": 0.0369,
"step": 871
},
{
"epoch": 1.5516014234875444,
"grad_norm": 1.0285308271904086,
"learning_rate": 1.1898137441051982e-06,
"loss": 0.0377,
"step": 872
},
{
"epoch": 1.5533807829181496,
"grad_norm": 1.0632381868799874,
"learning_rate": 1.1807793259009282e-06,
"loss": 0.048,
"step": 873
},
{
"epoch": 1.5551601423487544,
"grad_norm": 0.8941713488837103,
"learning_rate": 1.1717747437649657e-06,
"loss": 0.0344,
"step": 874
},
{
"epoch": 1.5569395017793595,
"grad_norm": 1.0320040270693944,
"learning_rate": 1.1628000680418533e-06,
"loss": 0.0353,
"step": 875
},
{
"epoch": 1.5587188612099645,
"grad_norm": 0.9879256137330606,
"learning_rate": 1.1538553688425002e-06,
"loss": 0.0377,
"step": 876
},
{
"epoch": 1.5604982206405693,
"grad_norm": 0.9870724594556394,
"learning_rate": 1.14494071604364e-06,
"loss": 0.0385,
"step": 877
},
{
"epoch": 1.5622775800711743,
"grad_norm": 0.8920953626415423,
"learning_rate": 1.1360561792872754e-06,
"loss": 0.0309,
"step": 878
},
{
"epoch": 1.5640569395017794,
"grad_norm": 1.1965648875425332,
"learning_rate": 1.127201827980145e-06,
"loss": 0.0499,
"step": 879
},
{
"epoch": 1.5658362989323842,
"grad_norm": 0.7294023315487677,
"learning_rate": 1.1183777312931748e-06,
"loss": 0.0216,
"step": 880
},
{
"epoch": 1.5676156583629894,
"grad_norm": 0.9443435501509589,
"learning_rate": 1.1095839581609407e-06,
"loss": 0.0356,
"step": 881
},
{
"epoch": 1.5693950177935942,
"grad_norm": 1.1836820331014046,
"learning_rate": 1.1008205772811248e-06,
"loss": 0.0469,
"step": 882
},
{
"epoch": 1.5711743772241993,
"grad_norm": 1.245498489523876,
"learning_rate": 1.0920876571139843e-06,
"loss": 0.0436,
"step": 883
},
{
"epoch": 1.5729537366548043,
"grad_norm": 0.8677975712025474,
"learning_rate": 1.0833852658818167e-06,
"loss": 0.0408,
"step": 884
},
{
"epoch": 1.5747330960854091,
"grad_norm": 1.0288353933027032,
"learning_rate": 1.0747134715684221e-06,
"loss": 0.0391,
"step": 885
},
{
"epoch": 1.5765124555160144,
"grad_norm": 0.9226405070071885,
"learning_rate": 1.0660723419185776e-06,
"loss": 0.0407,
"step": 886
},
{
"epoch": 1.5782918149466192,
"grad_norm": 1.0664753740055022,
"learning_rate": 1.0574619444375017e-06,
"loss": 0.0397,
"step": 887
},
{
"epoch": 1.5800711743772242,
"grad_norm": 0.7220467139616126,
"learning_rate": 1.0488823463903341e-06,
"loss": 0.0316,
"step": 888
},
{
"epoch": 1.5818505338078293,
"grad_norm": 0.9666684504136956,
"learning_rate": 1.0403336148016053e-06,
"loss": 0.0362,
"step": 889
},
{
"epoch": 1.583629893238434,
"grad_norm": 1.1138519460532204,
"learning_rate": 1.0318158164547159e-06,
"loss": 0.0418,
"step": 890
},
{
"epoch": 1.585409252669039,
"grad_norm": 1.0184250030450555,
"learning_rate": 1.0233290178914096e-06,
"loss": 0.0352,
"step": 891
},
{
"epoch": 1.5871886120996441,
"grad_norm": 0.9833961249391704,
"learning_rate": 1.014873285411262e-06,
"loss": 0.037,
"step": 892
},
{
"epoch": 1.5889679715302492,
"grad_norm": 1.104247240879798,
"learning_rate": 1.006448685071154e-06,
"loss": 0.0413,
"step": 893
},
{
"epoch": 1.5907473309608542,
"grad_norm": 1.1882452731519395,
"learning_rate": 9.980552826847635e-07,
"loss": 0.0495,
"step": 894
},
{
"epoch": 1.592526690391459,
"grad_norm": 0.922464997040305,
"learning_rate": 9.896931438220453e-07,
"loss": 0.0324,
"step": 895
},
{
"epoch": 1.594306049822064,
"grad_norm": 1.1687021193524112,
"learning_rate": 9.813623338087181e-07,
"loss": 0.0464,
"step": 896
},
{
"epoch": 1.596085409252669,
"grad_norm": 1.0576380798604628,
"learning_rate": 9.730629177257623e-07,
"loss": 0.0429,
"step": 897
},
{
"epoch": 1.5978647686832739,
"grad_norm": 0.7266477393669994,
"learning_rate": 9.64794960408903e-07,
"loss": 0.0252,
"step": 898
},
{
"epoch": 1.5996441281138791,
"grad_norm": 0.9373401670105959,
"learning_rate": 9.565585264481092e-07,
"loss": 0.0372,
"step": 899
},
{
"epoch": 1.601423487544484,
"grad_norm": 0.8094908621190549,
"learning_rate": 9.483536801870835e-07,
"loss": 0.0263,
"step": 900
},
{
"epoch": 1.603202846975089,
"grad_norm": 0.9316959396242535,
"learning_rate": 9.401804857227648e-07,
"loss": 0.0304,
"step": 901
},
{
"epoch": 1.604982206405694,
"grad_norm": 1.2345023187653172,
"learning_rate": 9.320390069048258e-07,
"loss": 0.0361,
"step": 902
},
{
"epoch": 1.6067615658362988,
"grad_norm": 0.8864431203430947,
"learning_rate": 9.239293073351735e-07,
"loss": 0.0393,
"step": 903
},
{
"epoch": 1.608540925266904,
"grad_norm": 1.0799385908443084,
"learning_rate": 9.158514503674543e-07,
"loss": 0.0312,
"step": 904
},
{
"epoch": 1.6103202846975089,
"grad_norm": 0.9286264177644702,
"learning_rate": 9.078054991065532e-07,
"loss": 0.0377,
"step": 905
},
{
"epoch": 1.612099644128114,
"grad_norm": 1.0576510841578954,
"learning_rate": 8.997915164081095e-07,
"loss": 0.0455,
"step": 906
},
{
"epoch": 1.613879003558719,
"grad_norm": 1.0854015603523357,
"learning_rate": 8.918095648780195e-07,
"loss": 0.0457,
"step": 907
},
{
"epoch": 1.6156583629893237,
"grad_norm": 1.0357731908203174,
"learning_rate": 8.838597068719518e-07,
"loss": 0.0326,
"step": 908
},
{
"epoch": 1.6174377224199288,
"grad_norm": 1.0776572347728248,
"learning_rate": 8.75942004494853e-07,
"loss": 0.0357,
"step": 909
},
{
"epoch": 1.6192170818505338,
"grad_norm": 1.1016984155454363,
"learning_rate": 8.680565196004704e-07,
"loss": 0.0372,
"step": 910
},
{
"epoch": 1.6209964412811388,
"grad_norm": 0.9593481902843042,
"learning_rate": 8.602033137908666e-07,
"loss": 0.0366,
"step": 911
},
{
"epoch": 1.6227758007117439,
"grad_norm": 0.9007774118716178,
"learning_rate": 8.523824484159348e-07,
"loss": 0.026,
"step": 912
},
{
"epoch": 1.6245551601423487,
"grad_norm": 0.835459063038804,
"learning_rate": 8.445939845729245e-07,
"loss": 0.031,
"step": 913
},
{
"epoch": 1.6263345195729537,
"grad_norm": 0.9006973001186873,
"learning_rate": 8.368379831059592e-07,
"loss": 0.0372,
"step": 914
},
{
"epoch": 1.6281138790035588,
"grad_norm": 0.8889233791215693,
"learning_rate": 8.29114504605566e-07,
"loss": 0.032,
"step": 915
},
{
"epoch": 1.6298932384341636,
"grad_norm": 0.7822876187058223,
"learning_rate": 8.21423609408199e-07,
"loss": 0.0343,
"step": 916
},
{
"epoch": 1.6316725978647688,
"grad_norm": 1.111594006680975,
"learning_rate": 8.137653575957666e-07,
"loss": 0.0333,
"step": 917
},
{
"epoch": 1.6334519572953736,
"grad_norm": 1.020072554845852,
"learning_rate": 8.061398089951678e-07,
"loss": 0.0396,
"step": 918
},
{
"epoch": 1.6352313167259787,
"grad_norm": 0.8190464315963275,
"learning_rate": 7.985470231778203e-07,
"loss": 0.0317,
"step": 919
},
{
"epoch": 1.6370106761565837,
"grad_norm": 1.1006473942958985,
"learning_rate": 7.909870594591951e-07,
"loss": 0.0402,
"step": 920
},
{
"epoch": 1.6387900355871885,
"grad_norm": 1.2854097468189731,
"learning_rate": 7.834599768983553e-07,
"loss": 0.059,
"step": 921
},
{
"epoch": 1.6405693950177938,
"grad_norm": 0.7474969053636427,
"learning_rate": 7.759658342974951e-07,
"loss": 0.0247,
"step": 922
},
{
"epoch": 1.6423487544483986,
"grad_norm": 1.0083988508192103,
"learning_rate": 7.685046902014747e-07,
"loss": 0.0419,
"step": 923
},
{
"epoch": 1.6441281138790036,
"grad_norm": 1.0169598550090309,
"learning_rate": 7.61076602897371e-07,
"loss": 0.0371,
"step": 924
},
{
"epoch": 1.6459074733096086,
"grad_norm": 0.6812514330927129,
"learning_rate": 7.536816304140177e-07,
"loss": 0.0214,
"step": 925
},
{
"epoch": 1.6476868327402134,
"grad_norm": 0.729300793084757,
"learning_rate": 7.46319830521553e-07,
"loss": 0.0264,
"step": 926
},
{
"epoch": 1.6494661921708185,
"grad_norm": 0.7769643900453452,
"learning_rate": 7.389912607309662e-07,
"loss": 0.0294,
"step": 927
},
{
"epoch": 1.6512455516014235,
"grad_norm": 1.4135897655988794,
"learning_rate": 7.316959782936516e-07,
"loss": 0.0534,
"step": 928
},
{
"epoch": 1.6530249110320283,
"grad_norm": 0.6549831689302487,
"learning_rate": 7.244340402009608e-07,
"loss": 0.0217,
"step": 929
},
{
"epoch": 1.6548042704626336,
"grad_norm": 1.3763992770680902,
"learning_rate": 7.172055031837572e-07,
"loss": 0.0488,
"step": 930
},
{
"epoch": 1.6565836298932384,
"grad_norm": 0.8616529850738661,
"learning_rate": 7.100104237119676e-07,
"loss": 0.0355,
"step": 931
},
{
"epoch": 1.6583629893238434,
"grad_norm": 0.7598794140897599,
"learning_rate": 7.028488579941506e-07,
"loss": 0.0315,
"step": 932
},
{
"epoch": 1.6601423487544484,
"grad_norm": 0.9493777396147747,
"learning_rate": 6.957208619770505e-07,
"loss": 0.0335,
"step": 933
},
{
"epoch": 1.6619217081850532,
"grad_norm": 1.1056410082318162,
"learning_rate": 6.886264913451635e-07,
"loss": 0.0535,
"step": 934
},
{
"epoch": 1.6637010676156585,
"grad_norm": 0.8602500906103365,
"learning_rate": 6.815658015203014e-07,
"loss": 0.0299,
"step": 935
},
{
"epoch": 1.6654804270462633,
"grad_norm": 1.1303626496472468,
"learning_rate": 6.745388476611553e-07,
"loss": 0.0423,
"step": 936
},
{
"epoch": 1.6672597864768683,
"grad_norm": 0.8689628192761325,
"learning_rate": 6.67545684662873e-07,
"loss": 0.0289,
"step": 937
},
{
"epoch": 1.6690391459074734,
"grad_norm": 0.6183132653126804,
"learning_rate": 6.605863671566221e-07,
"loss": 0.0211,
"step": 938
},
{
"epoch": 1.6708185053380782,
"grad_norm": 1.1071927526516232,
"learning_rate": 6.536609495091695e-07,
"loss": 0.0351,
"step": 939
},
{
"epoch": 1.6725978647686834,
"grad_norm": 1.24253757661684,
"learning_rate": 6.467694858224488e-07,
"loss": 0.0433,
"step": 940
},
{
"epoch": 1.6743772241992882,
"grad_norm": 1.0274713686960626,
"learning_rate": 6.399120299331468e-07,
"loss": 0.0334,
"step": 941
},
{
"epoch": 1.6761565836298933,
"grad_norm": 0.7811773870244468,
"learning_rate": 6.330886354122768e-07,
"loss": 0.0345,
"step": 942
},
{
"epoch": 1.6779359430604983,
"grad_norm": 0.8014714628419496,
"learning_rate": 6.262993555647617e-07,
"loss": 0.03,
"step": 943
},
{
"epoch": 1.6797153024911031,
"grad_norm": 0.712819687318543,
"learning_rate": 6.1954424342902e-07,
"loss": 0.0247,
"step": 944
},
{
"epoch": 1.6814946619217082,
"grad_norm": 1.6895299850842702,
"learning_rate": 6.128233517765448e-07,
"loss": 0.0636,
"step": 945
},
{
"epoch": 1.6832740213523132,
"grad_norm": 1.1318686557102933,
"learning_rate": 6.061367331114992e-07,
"loss": 0.0373,
"step": 946
},
{
"epoch": 1.685053380782918,
"grad_norm": 0.9494600382208942,
"learning_rate": 5.994844396703025e-07,
"loss": 0.0357,
"step": 947
},
{
"epoch": 1.6868327402135233,
"grad_norm": 0.9363995816428463,
"learning_rate": 5.928665234212233e-07,
"loss": 0.0314,
"step": 948
},
{
"epoch": 1.688612099644128,
"grad_norm": 0.923925511702288,
"learning_rate": 5.862830360639698e-07,
"loss": 0.0354,
"step": 949
},
{
"epoch": 1.690391459074733,
"grad_norm": 1.2897962873576785,
"learning_rate": 5.797340290292907e-07,
"loss": 0.0389,
"step": 950
},
{
"epoch": 1.6921708185053381,
"grad_norm": 1.0994665862302504,
"learning_rate": 5.732195534785723e-07,
"loss": 0.0426,
"step": 951
},
{
"epoch": 1.693950177935943,
"grad_norm": 0.8862456655327394,
"learning_rate": 5.667396603034369e-07,
"loss": 0.0305,
"step": 952
},
{
"epoch": 1.6957295373665482,
"grad_norm": 1.0399436152359842,
"learning_rate": 5.602944001253486e-07,
"loss": 0.0357,
"step": 953
},
{
"epoch": 1.697508896797153,
"grad_norm": 1.127261042231125,
"learning_rate": 5.538838232952104e-07,
"loss": 0.0429,
"step": 954
},
{
"epoch": 1.699288256227758,
"grad_norm": 1.3220910149982537,
"learning_rate": 5.475079798929816e-07,
"loss": 0.0525,
"step": 955
},
{
"epoch": 1.701067615658363,
"grad_norm": 0.8427521039388601,
"learning_rate": 5.411669197272795e-07,
"loss": 0.028,
"step": 956
},
{
"epoch": 1.7028469750889679,
"grad_norm": 0.8110328630321189,
"learning_rate": 5.348606923349903e-07,
"loss": 0.0272,
"step": 957
},
{
"epoch": 1.704626334519573,
"grad_norm": 0.9703971089524207,
"learning_rate": 5.285893469808855e-07,
"loss": 0.0306,
"step": 958
},
{
"epoch": 1.706405693950178,
"grad_norm": 0.9783456783947087,
"learning_rate": 5.223529326572352e-07,
"loss": 0.0324,
"step": 959
},
{
"epoch": 1.708185053380783,
"grad_norm": 1.1357183520545884,
"learning_rate": 5.161514980834232e-07,
"loss": 0.0414,
"step": 960
},
{
"epoch": 1.709964412811388,
"grad_norm": 0.8988070534208314,
"learning_rate": 5.099850917055709e-07,
"loss": 0.0364,
"step": 961
},
{
"epoch": 1.7117437722419928,
"grad_norm": 0.9583520955663613,
"learning_rate": 5.038537616961559e-07,
"loss": 0.0338,
"step": 962
},
{
"epoch": 1.7135231316725978,
"grad_norm": 0.8595342153911766,
"learning_rate": 4.977575559536358e-07,
"loss": 0.0312,
"step": 963
},
{
"epoch": 1.7153024911032029,
"grad_norm": 1.0564765306855832,
"learning_rate": 4.916965221020753e-07,
"loss": 0.0325,
"step": 964
},
{
"epoch": 1.7170818505338077,
"grad_norm": 0.9743798971111711,
"learning_rate": 4.856707074907729e-07,
"loss": 0.0298,
"step": 965
},
{
"epoch": 1.718861209964413,
"grad_norm": 0.9652182678782931,
"learning_rate": 4.796801591938922e-07,
"loss": 0.0299,
"step": 966
},
{
"epoch": 1.7206405693950177,
"grad_norm": 1.0309106085190303,
"learning_rate": 4.737249240100911e-07,
"loss": 0.0406,
"step": 967
},
{
"epoch": 1.7224199288256228,
"grad_norm": 0.8048364147759897,
"learning_rate": 4.6780504846216155e-07,
"loss": 0.0238,
"step": 968
},
{
"epoch": 1.7241992882562278,
"grad_norm": 0.829980596096878,
"learning_rate": 4.619205787966613e-07,
"loss": 0.0258,
"step": 969
},
{
"epoch": 1.7259786476868326,
"grad_norm": 0.7850276613356122,
"learning_rate": 4.560715609835548e-07,
"loss": 0.0279,
"step": 970
},
{
"epoch": 1.7277580071174379,
"grad_norm": 0.72358981782396,
"learning_rate": 4.5025804071585464e-07,
"loss": 0.0253,
"step": 971
},
{
"epoch": 1.7295373665480427,
"grad_norm": 1.2537660504584207,
"learning_rate": 4.4448006340926163e-07,
"loss": 0.0494,
"step": 972
},
{
"epoch": 1.7313167259786477,
"grad_norm": 1.1514738594475027,
"learning_rate": 4.3873767420181344e-07,
"loss": 0.0397,
"step": 973
},
{
"epoch": 1.7330960854092528,
"grad_norm": 1.2047378027191022,
"learning_rate": 4.3303091795353024e-07,
"loss": 0.0533,
"step": 974
},
{
"epoch": 1.7348754448398576,
"grad_norm": 1.0256152942776495,
"learning_rate": 4.2735983924606596e-07,
"loss": 0.038,
"step": 975
},
{
"epoch": 1.7366548042704626,
"grad_norm": 0.9338454433520231,
"learning_rate": 4.2172448238235464e-07,
"loss": 0.0256,
"step": 976
},
{
"epoch": 1.7384341637010676,
"grad_norm": 0.9707190528049798,
"learning_rate": 4.161248913862731e-07,
"loss": 0.0338,
"step": 977
},
{
"epoch": 1.7402135231316724,
"grad_norm": 1.1499217073787062,
"learning_rate": 4.1056111000228937e-07,
"loss": 0.0405,
"step": 978
},
{
"epoch": 1.7419928825622777,
"grad_norm": 0.8726242073853734,
"learning_rate": 4.0503318169512417e-07,
"loss": 0.0271,
"step": 979
},
{
"epoch": 1.7437722419928825,
"grad_norm": 0.7148465586895162,
"learning_rate": 3.9954114964941336e-07,
"loss": 0.0208,
"step": 980
},
{
"epoch": 1.7455516014234875,
"grad_norm": 1.0713331041877134,
"learning_rate": 3.9408505676936327e-07,
"loss": 0.0371,
"step": 981
},
{
"epoch": 1.7473309608540926,
"grad_norm": 0.9341399388188112,
"learning_rate": 3.886649456784253e-07,
"loss": 0.0365,
"step": 982
},
{
"epoch": 1.7491103202846974,
"grad_norm": 0.8781461911591342,
"learning_rate": 3.8328085871895624e-07,
"loss": 0.0356,
"step": 983
},
{
"epoch": 1.7508896797153026,
"grad_norm": 0.8235928265369706,
"learning_rate": 3.779328379518898e-07,
"loss": 0.0309,
"step": 984
},
{
"epoch": 1.7526690391459074,
"grad_norm": 1.0697899745201302,
"learning_rate": 3.7262092515640556e-07,
"loss": 0.0422,
"step": 985
},
{
"epoch": 1.7544483985765125,
"grad_norm": 0.6666305916620812,
"learning_rate": 3.673451618296081e-07,
"loss": 0.0212,
"step": 986
},
{
"epoch": 1.7562277580071175,
"grad_norm": 0.9773015872344643,
"learning_rate": 3.621055891861963e-07,
"loss": 0.0399,
"step": 987
},
{
"epoch": 1.7580071174377223,
"grad_norm": 0.8726900125139255,
"learning_rate": 3.56902248158148e-07,
"loss": 0.0284,
"step": 988
},
{
"epoch": 1.7597864768683276,
"grad_norm": 1.056018529557307,
"learning_rate": 3.517351793943913e-07,
"loss": 0.0332,
"step": 989
},
{
"epoch": 1.7615658362989324,
"grad_norm": 1.0742533280362974,
"learning_rate": 3.4660442326049704e-07,
"loss": 0.0287,
"step": 990
},
{
"epoch": 1.7633451957295374,
"grad_norm": 0.8740732505273735,
"learning_rate": 3.4151001983835696e-07,
"loss": 0.0312,
"step": 991
},
{
"epoch": 1.7651245551601424,
"grad_norm": 1.042293617449342,
"learning_rate": 3.364520089258727e-07,
"loss": 0.027,
"step": 992
},
{
"epoch": 1.7669039145907472,
"grad_norm": 0.9546977626208112,
"learning_rate": 3.314304300366461e-07,
"loss": 0.0322,
"step": 993
},
{
"epoch": 1.7686832740213523,
"grad_norm": 0.9596235610874881,
"learning_rate": 3.2644532239966444e-07,
"loss": 0.035,
"step": 994
},
{
"epoch": 1.7704626334519573,
"grad_norm": 0.6210034355033043,
"learning_rate": 3.2149672495900286e-07,
"loss": 0.0205,
"step": 995
},
{
"epoch": 1.7722419928825621,
"grad_norm": 0.8943573596906906,
"learning_rate": 3.165846763735153e-07,
"loss": 0.0303,
"step": 996
},
{
"epoch": 1.7740213523131674,
"grad_norm": 0.8460767272289282,
"learning_rate": 3.117092150165324e-07,
"loss": 0.0319,
"step": 997
},
{
"epoch": 1.7758007117437722,
"grad_norm": 1.04522303905914,
"learning_rate": 3.068703789755606e-07,
"loss": 0.0438,
"step": 998
},
{
"epoch": 1.7775800711743772,
"grad_norm": 0.9040058383703806,
"learning_rate": 3.020682060519886e-07,
"loss": 0.0286,
"step": 999
},
{
"epoch": 1.7793594306049823,
"grad_norm": 0.9881837067031481,
"learning_rate": 2.9730273376078923e-07,
"loss": 0.034,
"step": 1000
},
{
"epoch": 1.7793594306049823,
"eval_loss": 0.13149592280387878,
"eval_runtime": 7.1479,
"eval_samples_per_second": 6.435,
"eval_steps_per_second": 1.679,
"step": 1000
},
{
"epoch": 1.781138790035587,
"grad_norm": 1.4581976873648321,
"learning_rate": 2.9257399933022737e-07,
"loss": 0.0567,
"step": 1001
},
{
"epoch": 1.7829181494661923,
"grad_norm": 1.0237315149119115,
"learning_rate": 2.8788203970156805e-07,
"loss": 0.0244,
"step": 1002
},
{
"epoch": 1.7846975088967971,
"grad_norm": 0.8701743450681395,
"learning_rate": 2.832268915287878e-07,
"loss": 0.0331,
"step": 1003
},
{
"epoch": 1.7864768683274022,
"grad_norm": 1.0739171114521038,
"learning_rate": 2.7860859117828985e-07,
"loss": 0.0381,
"step": 1004
},
{
"epoch": 1.7882562277580072,
"grad_norm": 1.6838830467351256,
"learning_rate": 2.740271747286194e-07,
"loss": 0.0811,
"step": 1005
},
{
"epoch": 1.790035587188612,
"grad_norm": 0.9281432422574337,
"learning_rate": 2.6948267797018145e-07,
"loss": 0.0306,
"step": 1006
},
{
"epoch": 1.791814946619217,
"grad_norm": 0.9911134831801146,
"learning_rate": 2.649751364049613e-07,
"loss": 0.0247,
"step": 1007
},
{
"epoch": 1.793594306049822,
"grad_norm": 0.9377006808945917,
"learning_rate": 2.6050458524624735e-07,
"loss": 0.0274,
"step": 1008
},
{
"epoch": 1.795373665480427,
"grad_norm": 0.9878113112061964,
"learning_rate": 2.560710594183552e-07,
"loss": 0.0318,
"step": 1009
},
{
"epoch": 1.7971530249110321,
"grad_norm": 0.9601089668885241,
"learning_rate": 2.5167459355635524e-07,
"loss": 0.0407,
"step": 1010
},
{
"epoch": 1.798932384341637,
"grad_norm": 0.9761958090765598,
"learning_rate": 2.473152220058039e-07,
"loss": 0.0327,
"step": 1011
},
{
"epoch": 1.800711743772242,
"grad_norm": 1.0670590996228635,
"learning_rate": 2.429929788224722e-07,
"loss": 0.0424,
"step": 1012
},
{
"epoch": 1.802491103202847,
"grad_norm": 0.953963322767867,
"learning_rate": 2.38707897772083e-07,
"loss": 0.0393,
"step": 1013
},
{
"epoch": 1.8042704626334518,
"grad_norm": 0.8250896078926884,
"learning_rate": 2.3446001233004333e-07,
"loss": 0.0334,
"step": 1014
},
{
"epoch": 1.806049822064057,
"grad_norm": 1.076888919506632,
"learning_rate": 2.3024935568118745e-07,
"loss": 0.0398,
"step": 1015
},
{
"epoch": 1.8078291814946619,
"grad_norm": 0.9327333319282085,
"learning_rate": 2.2607596071951288e-07,
"loss": 0.031,
"step": 1016
},
{
"epoch": 1.809608540925267,
"grad_norm": 0.7903529688554496,
"learning_rate": 2.2193986004792667e-07,
"loss": 0.0296,
"step": 1017
},
{
"epoch": 1.811387900355872,
"grad_norm": 0.698661874665587,
"learning_rate": 2.1784108597799058e-07,
"loss": 0.0187,
"step": 1018
},
{
"epoch": 1.8131672597864767,
"grad_norm": 0.9513103743831585,
"learning_rate": 2.1377967052966685e-07,
"loss": 0.036,
"step": 1019
},
{
"epoch": 1.814946619217082,
"grad_norm": 0.7484374633432326,
"learning_rate": 2.0975564543107007e-07,
"loss": 0.0293,
"step": 1020
},
{
"epoch": 1.8167259786476868,
"grad_norm": 1.0107176687876676,
"learning_rate": 2.057690421182168e-07,
"loss": 0.0386,
"step": 1021
},
{
"epoch": 1.8185053380782918,
"grad_norm": 0.887083775932418,
"learning_rate": 2.01819891734783e-07,
"loss": 0.0351,
"step": 1022
},
{
"epoch": 1.8202846975088969,
"grad_norm": 0.9178747885855896,
"learning_rate": 1.979082251318576e-07,
"loss": 0.0359,
"step": 1023
},
{
"epoch": 1.8220640569395017,
"grad_norm": 0.7913010983811369,
"learning_rate": 1.9403407286770592e-07,
"loss": 0.0242,
"step": 1024
},
{
"epoch": 1.8238434163701067,
"grad_norm": 0.7955445400300014,
"learning_rate": 1.9019746520752502e-07,
"loss": 0.0239,
"step": 1025
},
{
"epoch": 1.8256227758007118,
"grad_norm": 0.8910759846092995,
"learning_rate": 1.8639843212321206e-07,
"loss": 0.0324,
"step": 1026
},
{
"epoch": 1.8274021352313166,
"grad_norm": 0.8221026284011368,
"learning_rate": 1.826370032931285e-07,
"loss": 0.0269,
"step": 1027
},
{
"epoch": 1.8291814946619218,
"grad_norm": 0.8929037997791472,
"learning_rate": 1.789132081018674e-07,
"loss": 0.0329,
"step": 1028
},
{
"epoch": 1.8309608540925266,
"grad_norm": 1.2085245846695714,
"learning_rate": 1.7522707564002706e-07,
"loss": 0.0385,
"step": 1029
},
{
"epoch": 1.8327402135231317,
"grad_norm": 0.796349936855367,
"learning_rate": 1.7157863470397718e-07,
"loss": 0.0278,
"step": 1030
},
{
"epoch": 1.8345195729537367,
"grad_norm": 0.9209938761836545,
"learning_rate": 1.6796791379564138e-07,
"loss": 0.0335,
"step": 1031
},
{
"epoch": 1.8362989323843415,
"grad_norm": 0.9306412458754235,
"learning_rate": 1.6439494112227173e-07,
"loss": 0.0296,
"step": 1032
},
{
"epoch": 1.8380782918149468,
"grad_norm": 0.8784943369954986,
"learning_rate": 1.6085974459622567e-07,
"loss": 0.036,
"step": 1033
},
{
"epoch": 1.8398576512455516,
"grad_norm": 0.7439946710703741,
"learning_rate": 1.573623518347517e-07,
"loss": 0.028,
"step": 1034
},
{
"epoch": 1.8416370106761566,
"grad_norm": 0.9743690068723833,
"learning_rate": 1.5390279015977117e-07,
"loss": 0.0384,
"step": 1035
},
{
"epoch": 1.8434163701067616,
"grad_norm": 0.8443085515127314,
"learning_rate": 1.5048108659766693e-07,
"loss": 0.0315,
"step": 1036
},
{
"epoch": 1.8451957295373664,
"grad_norm": 0.7285124904919683,
"learning_rate": 1.470972678790711e-07,
"loss": 0.0308,
"step": 1037
},
{
"epoch": 1.8469750889679717,
"grad_norm": 0.9991244860368302,
"learning_rate": 1.437513604386559e-07,
"loss": 0.0438,
"step": 1038
},
{
"epoch": 1.8487544483985765,
"grad_norm": 0.8028992434917028,
"learning_rate": 1.404433904149266e-07,
"loss": 0.0241,
"step": 1039
},
{
"epoch": 1.8505338078291815,
"grad_norm": 0.9832311451220926,
"learning_rate": 1.3717338365001943e-07,
"loss": 0.0351,
"step": 1040
},
{
"epoch": 1.8523131672597866,
"grad_norm": 1.0585927337878276,
"learning_rate": 1.3394136568949834e-07,
"loss": 0.0374,
"step": 1041
},
{
"epoch": 1.8540925266903914,
"grad_norm": 1.2862955458665684,
"learning_rate": 1.307473617821553e-07,
"loss": 0.0497,
"step": 1042
},
{
"epoch": 1.8558718861209964,
"grad_norm": 1.0823593920906582,
"learning_rate": 1.275913968798137e-07,
"loss": 0.0355,
"step": 1043
},
{
"epoch": 1.8576512455516014,
"grad_norm": 0.9149759606916206,
"learning_rate": 1.2447349563713186e-07,
"loss": 0.0356,
"step": 1044
},
{
"epoch": 1.8594306049822062,
"grad_norm": 0.9824134546293025,
"learning_rate": 1.213936824114137e-07,
"loss": 0.0348,
"step": 1045
},
{
"epoch": 1.8612099644128115,
"grad_norm": 0.935432985228546,
"learning_rate": 1.1835198126241509e-07,
"loss": 0.0283,
"step": 1046
},
{
"epoch": 1.8629893238434163,
"grad_norm": 1.3192106309529583,
"learning_rate": 1.1534841595215617e-07,
"loss": 0.0389,
"step": 1047
},
{
"epoch": 1.8647686832740213,
"grad_norm": 0.9870732568235638,
"learning_rate": 1.1238300994473983e-07,
"loss": 0.0302,
"step": 1048
},
{
"epoch": 1.8665480427046264,
"grad_norm": 0.9836332689828364,
"learning_rate": 1.0945578640616183e-07,
"loss": 0.038,
"step": 1049
},
{
"epoch": 1.8683274021352312,
"grad_norm": 0.8500081148991353,
"learning_rate": 1.0656676820413603e-07,
"loss": 0.0222,
"step": 1050
},
{
"epoch": 1.8701067615658364,
"grad_norm": 0.7301350794482695,
"learning_rate": 1.0371597790791166e-07,
"loss": 0.0227,
"step": 1051
},
{
"epoch": 1.8718861209964412,
"grad_norm": 1.005129217408169,
"learning_rate": 1.0090343778809908e-07,
"loss": 0.0386,
"step": 1052
},
{
"epoch": 1.8736654804270463,
"grad_norm": 1.1804794331103639,
"learning_rate": 9.812916981649433e-08,
"loss": 0.0461,
"step": 1053
},
{
"epoch": 1.8754448398576513,
"grad_norm": 0.9156692878233684,
"learning_rate": 9.539319566590766e-08,
"loss": 0.0376,
"step": 1054
},
{
"epoch": 1.8772241992882561,
"grad_norm": 1.0017513758167835,
"learning_rate": 9.269553670999743e-08,
"loss": 0.0362,
"step": 1055
},
{
"epoch": 1.8790035587188612,
"grad_norm": 1.302168151682292,
"learning_rate": 9.003621402309815e-08,
"loss": 0.0391,
"step": 1056
},
{
"epoch": 1.8807829181494662,
"grad_norm": 1.0771753414898826,
"learning_rate": 8.741524838005888e-08,
"loss": 0.0428,
"step": 1057
},
{
"epoch": 1.8825622775800712,
"grad_norm": 0.8229028021100799,
"learning_rate": 8.483266025608061e-08,
"loss": 0.0274,
"step": 1058
},
{
"epoch": 1.8843416370106763,
"grad_norm": 0.8754992831336158,
"learning_rate": 8.228846982655525e-08,
"loss": 0.0275,
"step": 1059
},
{
"epoch": 1.886120996441281,
"grad_norm": 0.9536393757277662,
"learning_rate": 7.978269696691021e-08,
"loss": 0.0323,
"step": 1060
},
{
"epoch": 1.887900355871886,
"grad_norm": 0.9170964479057592,
"learning_rate": 7.731536125244965e-08,
"loss": 0.0326,
"step": 1061
},
{
"epoch": 1.8896797153024911,
"grad_norm": 1.0022067938158241,
"learning_rate": 7.488648195820513e-08,
"loss": 0.0376,
"step": 1062
},
{
"epoch": 1.891459074733096,
"grad_norm": 1.2358298188436405,
"learning_rate": 7.249607805878245e-08,
"loss": 0.0371,
"step": 1063
},
{
"epoch": 1.8932384341637012,
"grad_norm": 1.0813593177347594,
"learning_rate": 7.014416822821557e-08,
"loss": 0.038,
"step": 1064
},
{
"epoch": 1.895017793594306,
"grad_norm": 1.0606118808208824,
"learning_rate": 6.783077083981793e-08,
"loss": 0.0297,
"step": 1065
},
{
"epoch": 1.896797153024911,
"grad_norm": 0.915941522935534,
"learning_rate": 6.55559039660425e-08,
"loss": 0.0325,
"step": 1066
},
{
"epoch": 1.898576512455516,
"grad_norm": 0.9030681191606825,
"learning_rate": 6.331958537833693e-08,
"loss": 0.0264,
"step": 1067
},
{
"epoch": 1.9003558718861209,
"grad_norm": 0.871513537590659,
"learning_rate": 6.112183254700866e-08,
"loss": 0.0338,
"step": 1068
},
{
"epoch": 1.9021352313167261,
"grad_norm": 1.0096976334247885,
"learning_rate": 5.8962662641083856e-08,
"loss": 0.0293,
"step": 1069
},
{
"epoch": 1.903914590747331,
"grad_norm": 0.9058276514087495,
"learning_rate": 5.6842092528176516e-08,
"loss": 0.0304,
"step": 1070
},
{
"epoch": 1.905693950177936,
"grad_norm": 0.8190544432997178,
"learning_rate": 5.476013877435626e-08,
"loss": 0.0298,
"step": 1071
},
{
"epoch": 1.907473309608541,
"grad_norm": 1.1099499267624018,
"learning_rate": 5.271681764401848e-08,
"loss": 0.0379,
"step": 1072
},
{
"epoch": 1.9092526690391458,
"grad_norm": 0.7380683514472781,
"learning_rate": 5.071214509975775e-08,
"loss": 0.0248,
"step": 1073
},
{
"epoch": 1.9110320284697508,
"grad_norm": 0.9929257149315531,
"learning_rate": 4.8746136802240716e-08,
"loss": 0.0352,
"step": 1074
},
{
"epoch": 1.9128113879003559,
"grad_norm": 0.8954709513119363,
"learning_rate": 4.6818808110087875e-08,
"loss": 0.0315,
"step": 1075
},
{
"epoch": 1.914590747330961,
"grad_norm": 1.0373977774959033,
"learning_rate": 4.493017407975087e-08,
"loss": 0.0378,
"step": 1076
},
{
"epoch": 1.916370106761566,
"grad_norm": 0.9324638936790517,
"learning_rate": 4.308024946539424e-08,
"loss": 0.0252,
"step": 1077
},
{
"epoch": 1.9181494661921707,
"grad_norm": 0.7670826159562243,
"learning_rate": 4.1269048718783344e-08,
"loss": 0.0228,
"step": 1078
},
{
"epoch": 1.9199288256227758,
"grad_norm": 0.7971799973017737,
"learning_rate": 3.9496585989167726e-08,
"loss": 0.0286,
"step": 1079
},
{
"epoch": 1.9217081850533808,
"grad_norm": 0.932883668858871,
"learning_rate": 3.776287512317345e-08,
"loss": 0.036,
"step": 1080
},
{
"epoch": 1.9234875444839856,
"grad_norm": 1.115101637449023,
"learning_rate": 3.606792966469375e-08,
"loss": 0.0359,
"step": 1081
},
{
"epoch": 1.9252669039145909,
"grad_norm": 0.8683734269285328,
"learning_rate": 3.4411762854782426e-08,
"loss": 0.0298,
"step": 1082
},
{
"epoch": 1.9270462633451957,
"grad_norm": 1.1836052474553045,
"learning_rate": 3.279438763155174e-08,
"loss": 0.0314,
"step": 1083
},
{
"epoch": 1.9288256227758007,
"grad_norm": 0.8822285835082575,
"learning_rate": 3.121581663007134e-08,
"loss": 0.0355,
"step": 1084
},
{
"epoch": 1.9306049822064058,
"grad_norm": 0.9851401744795133,
"learning_rate": 2.967606218226837e-08,
"loss": 0.0408,
"step": 1085
},
{
"epoch": 1.9323843416370106,
"grad_norm": 0.8277152849214361,
"learning_rate": 2.8175136316832e-08,
"loss": 0.0261,
"step": 1086
},
{
"epoch": 1.9341637010676158,
"grad_norm": 1.0130518925764556,
"learning_rate": 2.6713050759120117e-08,
"loss": 0.0387,
"step": 1087
},
{
"epoch": 1.9359430604982206,
"grad_norm": 0.9689860116457749,
"learning_rate": 2.528981693106558e-08,
"loss": 0.0341,
"step": 1088
},
{
"epoch": 1.9377224199288257,
"grad_norm": 0.9796668852631681,
"learning_rate": 2.3905445951089013e-08,
"loss": 0.0319,
"step": 1089
},
{
"epoch": 1.9395017793594307,
"grad_norm": 1.0813212494176583,
"learning_rate": 2.2559948634011673e-08,
"loss": 0.0326,
"step": 1090
},
{
"epoch": 1.9412811387900355,
"grad_norm": 1.0461265760023197,
"learning_rate": 2.125333549096942e-08,
"loss": 0.0387,
"step": 1091
},
{
"epoch": 1.9430604982206405,
"grad_norm": 0.811009205236011,
"learning_rate": 1.9985616729332747e-08,
"loss": 0.0307,
"step": 1092
},
{
"epoch": 1.9448398576512456,
"grad_norm": 1.021062435635048,
"learning_rate": 1.8756802252625773e-08,
"loss": 0.0331,
"step": 1093
},
{
"epoch": 1.9466192170818504,
"grad_norm": 0.8523317241217103,
"learning_rate": 1.75669016604485e-08,
"loss": 0.0325,
"step": 1094
},
{
"epoch": 1.9483985765124556,
"grad_norm": 1.04176434171501,
"learning_rate": 1.6415924248403547e-08,
"loss": 0.0348,
"step": 1095
},
{
"epoch": 1.9501779359430604,
"grad_norm": 0.7659336169960644,
"learning_rate": 1.5303879008021773e-08,
"loss": 0.0264,
"step": 1096
},
{
"epoch": 1.9519572953736655,
"grad_norm": 0.9108460203627384,
"learning_rate": 1.4230774626691756e-08,
"loss": 0.0271,
"step": 1097
},
{
"epoch": 1.9537366548042705,
"grad_norm": 1.371246031672071,
"learning_rate": 1.3196619487594875e-08,
"loss": 0.047,
"step": 1098
},
{
"epoch": 1.9555160142348753,
"grad_norm": 1.0394863712655835,
"learning_rate": 1.2201421669636448e-08,
"loss": 0.036,
"step": 1099
},
{
"epoch": 1.9572953736654806,
"grad_norm": 1.044634703938829,
"learning_rate": 1.1245188947384133e-08,
"loss": 0.0283,
"step": 1100
},
{
"epoch": 1.9590747330960854,
"grad_norm": 1.069614237257927,
"learning_rate": 1.0327928791006858e-08,
"loss": 0.0362,
"step": 1101
},
{
"epoch": 1.9608540925266904,
"grad_norm": 0.940847502161726,
"learning_rate": 9.449648366217645e-09,
"loss": 0.0334,
"step": 1102
},
{
"epoch": 1.9626334519572954,
"grad_norm": 0.8748418808894304,
"learning_rate": 8.61035453421588e-09,
"loss": 0.0347,
"step": 1103
},
{
"epoch": 1.9644128113879002,
"grad_norm": 0.8786054359850882,
"learning_rate": 7.81005385163458e-09,
"loss": 0.0409,
"step": 1104
},
{
"epoch": 1.9661921708185055,
"grad_norm": 0.8133475665653579,
"learning_rate": 7.048752570488205e-09,
"loss": 0.0313,
"step": 1105
},
{
"epoch": 1.9679715302491103,
"grad_norm": 1.0276320557124607,
"learning_rate": 6.326456638125478e-09,
"loss": 0.0323,
"step": 1106
},
{
"epoch": 1.9697508896797153,
"grad_norm": 0.8587607763955706,
"learning_rate": 5.643171697183314e-09,
"loss": 0.032,
"step": 1107
},
{
"epoch": 1.9715302491103204,
"grad_norm": 0.9712870242537757,
"learning_rate": 4.998903085539075e-09,
"loss": 0.0328,
"step": 1108
},
{
"epoch": 1.9733096085409252,
"grad_norm": 0.8166100766953054,
"learning_rate": 4.393655836272825e-09,
"loss": 0.0278,
"step": 1109
},
{
"epoch": 1.9750889679715302,
"grad_norm": 1.0364742911765257,
"learning_rate": 3.8274346776262514e-09,
"loss": 0.0354,
"step": 1110
},
{
"epoch": 1.9768683274021353,
"grad_norm": 1.0782721578446093,
"learning_rate": 3.300244032966582e-09,
"loss": 0.0384,
"step": 1111
},
{
"epoch": 1.97864768683274,
"grad_norm": 1.0709966092834682,
"learning_rate": 2.8120880207493928e-09,
"loss": 0.0334,
"step": 1112
},
{
"epoch": 1.9804270462633453,
"grad_norm": 0.7010588442056803,
"learning_rate": 2.362970454491409e-09,
"loss": 0.0187,
"step": 1113
},
{
"epoch": 1.9822064056939501,
"grad_norm": 1.2627454425207878,
"learning_rate": 1.952894842735531e-09,
"loss": 0.0402,
"step": 1114
},
{
"epoch": 1.9839857651245552,
"grad_norm": 0.6439934853719501,
"learning_rate": 1.5818643890258555e-09,
"loss": 0.0225,
"step": 1115
},
{
"epoch": 1.9857651245551602,
"grad_norm": 1.067338669364316,
"learning_rate": 1.2498819918843609e-09,
"loss": 0.0326,
"step": 1116
},
{
"epoch": 1.987544483985765,
"grad_norm": 1.1467918734632372,
"learning_rate": 9.569502447837053e-10,
"loss": 0.0289,
"step": 1117
},
{
"epoch": 1.9893238434163703,
"grad_norm": 1.0558293840041255,
"learning_rate": 7.03071436131686e-10,
"loss": 0.0322,
"step": 1118
},
{
"epoch": 1.991103202846975,
"grad_norm": 0.9210537233088608,
"learning_rate": 4.882475492506977e-10,
"loss": 0.0335,
"step": 1119
},
{
"epoch": 1.99288256227758,
"grad_norm": 1.1268509628147607,
"learning_rate": 3.124802623627465e-10,
"loss": 0.0446,
"step": 1120
},
{
"epoch": 1.9946619217081851,
"grad_norm": 1.1339107008229083,
"learning_rate": 1.7577094857557097e-10,
"loss": 0.0366,
"step": 1121
},
{
"epoch": 1.99644128113879,
"grad_norm": 1.0673822700368407,
"learning_rate": 7.812067587487093e-11,
"loss": 0.0326,
"step": 1122
},
{
"epoch": 1.998220640569395,
"grad_norm": 0.813539782832218,
"learning_rate": 1.9530207111539967e-11,
"loss": 0.0233,
"step": 1123
},
{
"epoch": 2.0,
"grad_norm": 0.5845581052680466,
"learning_rate": 0.0,
"loss": 0.0194,
"step": 1124
},
{
"epoch": 2.0,
"step": 1124,
"total_flos": 11497408118784.0,
"train_loss": 0.08073598971703086,
"train_runtime": 2558.5077,
"train_samples_per_second": 3.512,
"train_steps_per_second": 0.439
}
],
"logging_steps": 1,
"max_steps": 1124,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 11497408118784.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}