diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,19982 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.9994373329582222, + "eval_steps": 500, + "global_step": 14216, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002813335208890139, + "grad_norm": 8.1875, + "learning_rate": 5.625879043600562e-09, + "loss": 1.9967, + "step": 1 + }, + { + "epoch": 0.0014066676044450696, + "grad_norm": 8.25, + "learning_rate": 2.8129395218002812e-08, + "loss": 1.834, + "step": 5 + }, + { + "epoch": 0.0028133352088901393, + "grad_norm": 7.8125, + "learning_rate": 5.6258790436005624e-08, + "loss": 1.9701, + "step": 10 + }, + { + "epoch": 0.004220002813335209, + "grad_norm": 9.75, + "learning_rate": 8.438818565400843e-08, + "loss": 2.0358, + "step": 15 + }, + { + "epoch": 0.0056266704177802785, + "grad_norm": 9.25, + "learning_rate": 1.1251758087201125e-07, + "loss": 2.1257, + "step": 20 + }, + { + "epoch": 0.007033338022225348, + "grad_norm": 8.9375, + "learning_rate": 1.4064697609001405e-07, + "loss": 2.1333, + "step": 25 + }, + { + "epoch": 0.008440005626670417, + "grad_norm": 7.9375, + "learning_rate": 1.6877637130801686e-07, + "loss": 2.0451, + "step": 30 + }, + { + "epoch": 0.009846673231115488, + "grad_norm": 8.0625, + "learning_rate": 1.9690576652601966e-07, + "loss": 2.114, + "step": 35 + }, + { + "epoch": 0.011253340835560557, + "grad_norm": 10.6875, + "learning_rate": 2.250351617440225e-07, + "loss": 2.0872, + "step": 40 + }, + { + "epoch": 0.012660008440005626, + "grad_norm": 8.9375, + "learning_rate": 2.5316455696202533e-07, + "loss": 2.1654, + "step": 45 + }, + { + "epoch": 0.014066676044450697, + "grad_norm": 15.0625, + "learning_rate": 2.812939521800281e-07, + "loss": 1.8529, + "step": 50 + }, + { + "epoch": 0.015473343648895766, + "grad_norm": 15.1875, + "learning_rate": 3.0942334739803094e-07, + "loss": 2.0161, + "step": 55 + }, + { + "epoch": 0.016880011253340835, + "grad_norm": 8.875, + "learning_rate": 3.375527426160337e-07, + "loss": 1.9371, + "step": 60 + }, + { + "epoch": 0.018286678857785905, + "grad_norm": 10.0625, + "learning_rate": 3.6568213783403655e-07, + "loss": 1.7475, + "step": 65 + }, + { + "epoch": 0.019693346462230976, + "grad_norm": 8.625, + "learning_rate": 3.9381153305203933e-07, + "loss": 2.1096, + "step": 70 + }, + { + "epoch": 0.021100014066676043, + "grad_norm": 7.09375, + "learning_rate": 4.2194092827004216e-07, + "loss": 2.1636, + "step": 75 + }, + { + "epoch": 0.022506681671121114, + "grad_norm": 18.375, + "learning_rate": 4.50070323488045e-07, + "loss": 1.888, + "step": 80 + }, + { + "epoch": 0.023913349275566185, + "grad_norm": 9.6875, + "learning_rate": 4.781997187060478e-07, + "loss": 2.1679, + "step": 85 + }, + { + "epoch": 0.025320016880011252, + "grad_norm": 8.5, + "learning_rate": 5.063291139240507e-07, + "loss": 1.8771, + "step": 90 + }, + { + "epoch": 0.026726684484456323, + "grad_norm": 8.0625, + "learning_rate": 5.344585091420533e-07, + "loss": 1.9294, + "step": 95 + }, + { + "epoch": 0.028133352088901394, + "grad_norm": 8.1875, + "learning_rate": 5.625879043600562e-07, + "loss": 1.8677, + "step": 100 + }, + { + "epoch": 0.02954001969334646, + "grad_norm": 7.8125, + "learning_rate": 5.907172995780591e-07, + "loss": 1.9361, + "step": 105 + }, + { + "epoch": 0.03094668729779153, + "grad_norm": 10.3125, + "learning_rate": 6.188466947960619e-07, + "loss": 2.1789, + "step": 110 + }, + { + "epoch": 0.0323533549022366, + "grad_norm": 7.84375, + "learning_rate": 6.469760900140648e-07, + "loss": 2.0211, + "step": 115 + }, + { + "epoch": 0.03376002250668167, + "grad_norm": 7.9375, + "learning_rate": 6.751054852320674e-07, + "loss": 2.1231, + "step": 120 + }, + { + "epoch": 0.03516669011112674, + "grad_norm": 12.375, + "learning_rate": 7.032348804500703e-07, + "loss": 2.1527, + "step": 125 + }, + { + "epoch": 0.03657335771557181, + "grad_norm": 14.6875, + "learning_rate": 7.313642756680731e-07, + "loss": 1.7367, + "step": 130 + }, + { + "epoch": 0.03798002532001688, + "grad_norm": 8.3125, + "learning_rate": 7.59493670886076e-07, + "loss": 2.2069, + "step": 135 + }, + { + "epoch": 0.03938669292446195, + "grad_norm": 7.40625, + "learning_rate": 7.876230661040787e-07, + "loss": 2.1692, + "step": 140 + }, + { + "epoch": 0.040793360528907016, + "grad_norm": 10.1875, + "learning_rate": 8.157524613220815e-07, + "loss": 2.0095, + "step": 145 + }, + { + "epoch": 0.04220002813335209, + "grad_norm": 8.3125, + "learning_rate": 8.438818565400843e-07, + "loss": 1.6281, + "step": 150 + }, + { + "epoch": 0.04360669573779716, + "grad_norm": 7.625, + "learning_rate": 8.720112517580872e-07, + "loss": 1.759, + "step": 155 + }, + { + "epoch": 0.04501336334224223, + "grad_norm": 8.75, + "learning_rate": 9.0014064697609e-07, + "loss": 1.9759, + "step": 160 + }, + { + "epoch": 0.0464200309466873, + "grad_norm": 7.78125, + "learning_rate": 9.282700421940928e-07, + "loss": 2.0089, + "step": 165 + }, + { + "epoch": 0.04782669855113237, + "grad_norm": 15.375, + "learning_rate": 9.563994374120955e-07, + "loss": 1.8533, + "step": 170 + }, + { + "epoch": 0.04923336615557744, + "grad_norm": 11.625, + "learning_rate": 9.845288326300985e-07, + "loss": 2.0724, + "step": 175 + }, + { + "epoch": 0.050640033760022504, + "grad_norm": 7.53125, + "learning_rate": 1.0126582278481013e-06, + "loss": 2.2394, + "step": 180 + }, + { + "epoch": 0.052046701364467575, + "grad_norm": 7.21875, + "learning_rate": 1.040787623066104e-06, + "loss": 1.9738, + "step": 185 + }, + { + "epoch": 0.053453368968912646, + "grad_norm": 7.34375, + "learning_rate": 1.0689170182841067e-06, + "loss": 2.0486, + "step": 190 + }, + { + "epoch": 0.054860036573357716, + "grad_norm": 8.375, + "learning_rate": 1.0970464135021096e-06, + "loss": 1.88, + "step": 195 + }, + { + "epoch": 0.05626670417780279, + "grad_norm": 7.90625, + "learning_rate": 1.1251758087201124e-06, + "loss": 1.9277, + "step": 200 + }, + { + "epoch": 0.05767337178224786, + "grad_norm": 7.53125, + "learning_rate": 1.1533052039381152e-06, + "loss": 1.8934, + "step": 205 + }, + { + "epoch": 0.05908003938669292, + "grad_norm": 7.0, + "learning_rate": 1.1814345991561182e-06, + "loss": 2.2194, + "step": 210 + }, + { + "epoch": 0.06048670699113799, + "grad_norm": 6.375, + "learning_rate": 1.209563994374121e-06, + "loss": 1.7994, + "step": 215 + }, + { + "epoch": 0.06189337459558306, + "grad_norm": 4.65625, + "learning_rate": 1.2376933895921238e-06, + "loss": 1.7983, + "step": 220 + }, + { + "epoch": 0.06330004220002813, + "grad_norm": 5.46875, + "learning_rate": 1.2658227848101265e-06, + "loss": 1.9125, + "step": 225 + }, + { + "epoch": 0.0647067098044732, + "grad_norm": 6.1875, + "learning_rate": 1.2939521800281295e-06, + "loss": 1.9858, + "step": 230 + }, + { + "epoch": 0.06611337740891828, + "grad_norm": 5.21875, + "learning_rate": 1.322081575246132e-06, + "loss": 2.0835, + "step": 235 + }, + { + "epoch": 0.06752004501336334, + "grad_norm": 5.625, + "learning_rate": 1.3502109704641349e-06, + "loss": 2.0088, + "step": 240 + }, + { + "epoch": 0.06892671261780842, + "grad_norm": 4.6875, + "learning_rate": 1.3783403656821376e-06, + "loss": 2.0304, + "step": 245 + }, + { + "epoch": 0.07033338022225348, + "grad_norm": 4.21875, + "learning_rate": 1.4064697609001406e-06, + "loss": 2.1096, + "step": 250 + }, + { + "epoch": 0.07174004782669856, + "grad_norm": 9.25, + "learning_rate": 1.4345991561181434e-06, + "loss": 1.982, + "step": 255 + }, + { + "epoch": 0.07314671543114362, + "grad_norm": 4.5625, + "learning_rate": 1.4627285513361462e-06, + "loss": 1.9651, + "step": 260 + }, + { + "epoch": 0.07455338303558869, + "grad_norm": 4.4375, + "learning_rate": 1.4908579465541492e-06, + "loss": 1.8912, + "step": 265 + }, + { + "epoch": 0.07596005064003376, + "grad_norm": 5.25, + "learning_rate": 1.518987341772152e-06, + "loss": 1.8922, + "step": 270 + }, + { + "epoch": 0.07736671824447883, + "grad_norm": 4.65625, + "learning_rate": 1.5471167369901545e-06, + "loss": 2.033, + "step": 275 + }, + { + "epoch": 0.0787733858489239, + "grad_norm": 8.6875, + "learning_rate": 1.5752461322081573e-06, + "loss": 1.8743, + "step": 280 + }, + { + "epoch": 0.08018005345336897, + "grad_norm": 3.671875, + "learning_rate": 1.6033755274261603e-06, + "loss": 1.9284, + "step": 285 + }, + { + "epoch": 0.08158672105781403, + "grad_norm": 4.5, + "learning_rate": 1.631504922644163e-06, + "loss": 1.8721, + "step": 290 + }, + { + "epoch": 0.08299338866225911, + "grad_norm": 4.1875, + "learning_rate": 1.6596343178621659e-06, + "loss": 1.8106, + "step": 295 + }, + { + "epoch": 0.08440005626670417, + "grad_norm": 5.03125, + "learning_rate": 1.6877637130801686e-06, + "loss": 1.8406, + "step": 300 + }, + { + "epoch": 0.08580672387114925, + "grad_norm": 3.265625, + "learning_rate": 1.7158931082981716e-06, + "loss": 2.0746, + "step": 305 + }, + { + "epoch": 0.08721339147559432, + "grad_norm": 5.40625, + "learning_rate": 1.7440225035161744e-06, + "loss": 1.7973, + "step": 310 + }, + { + "epoch": 0.08862005908003939, + "grad_norm": 5.25, + "learning_rate": 1.772151898734177e-06, + "loss": 1.9815, + "step": 315 + }, + { + "epoch": 0.09002672668448446, + "grad_norm": 3.328125, + "learning_rate": 1.80028129395218e-06, + "loss": 1.9967, + "step": 320 + }, + { + "epoch": 0.09143339428892952, + "grad_norm": 2.6875, + "learning_rate": 1.8284106891701827e-06, + "loss": 1.8916, + "step": 325 + }, + { + "epoch": 0.0928400618933746, + "grad_norm": 5.25, + "learning_rate": 1.8565400843881855e-06, + "loss": 1.686, + "step": 330 + }, + { + "epoch": 0.09424672949781966, + "grad_norm": 4.28125, + "learning_rate": 1.8846694796061883e-06, + "loss": 2.0456, + "step": 335 + }, + { + "epoch": 0.09565339710226474, + "grad_norm": 3.421875, + "learning_rate": 1.912798874824191e-06, + "loss": 2.115, + "step": 340 + }, + { + "epoch": 0.0970600647067098, + "grad_norm": 3.828125, + "learning_rate": 1.940928270042194e-06, + "loss": 1.8874, + "step": 345 + }, + { + "epoch": 0.09846673231115488, + "grad_norm": 3.015625, + "learning_rate": 1.969057665260197e-06, + "loss": 2.0844, + "step": 350 + }, + { + "epoch": 0.09987339991559994, + "grad_norm": 3.21875, + "learning_rate": 1.9971870604782e-06, + "loss": 1.8196, + "step": 355 + }, + { + "epoch": 0.10128006752004501, + "grad_norm": 2.90625, + "learning_rate": 2.0253164556962026e-06, + "loss": 1.9226, + "step": 360 + }, + { + "epoch": 0.10268673512449009, + "grad_norm": 4.75, + "learning_rate": 2.0534458509142054e-06, + "loss": 1.8721, + "step": 365 + }, + { + "epoch": 0.10409340272893515, + "grad_norm": 3.984375, + "learning_rate": 2.081575246132208e-06, + "loss": 1.8233, + "step": 370 + }, + { + "epoch": 0.10550007033338023, + "grad_norm": 4.25, + "learning_rate": 2.109704641350211e-06, + "loss": 1.967, + "step": 375 + }, + { + "epoch": 0.10690673793782529, + "grad_norm": 3.625, + "learning_rate": 2.1378340365682133e-06, + "loss": 1.9782, + "step": 380 + }, + { + "epoch": 0.10831340554227036, + "grad_norm": 3.5625, + "learning_rate": 2.1659634317862165e-06, + "loss": 2.1084, + "step": 385 + }, + { + "epoch": 0.10972007314671543, + "grad_norm": 3.640625, + "learning_rate": 2.1940928270042193e-06, + "loss": 2.2131, + "step": 390 + }, + { + "epoch": 0.1111267407511605, + "grad_norm": 4.0625, + "learning_rate": 2.222222222222222e-06, + "loss": 1.8195, + "step": 395 + }, + { + "epoch": 0.11253340835560557, + "grad_norm": 4.15625, + "learning_rate": 2.250351617440225e-06, + "loss": 1.9049, + "step": 400 + }, + { + "epoch": 0.11394007596005064, + "grad_norm": 4.28125, + "learning_rate": 2.278481012658228e-06, + "loss": 1.9102, + "step": 405 + }, + { + "epoch": 0.11534674356449572, + "grad_norm": 5.875, + "learning_rate": 2.3066104078762304e-06, + "loss": 1.8624, + "step": 410 + }, + { + "epoch": 0.11675341116894078, + "grad_norm": 2.828125, + "learning_rate": 2.3347398030942336e-06, + "loss": 1.7784, + "step": 415 + }, + { + "epoch": 0.11816007877338584, + "grad_norm": 7.59375, + "learning_rate": 2.3628691983122364e-06, + "loss": 1.9591, + "step": 420 + }, + { + "epoch": 0.11956674637783092, + "grad_norm": 4.3125, + "learning_rate": 2.3909985935302387e-06, + "loss": 1.7737, + "step": 425 + }, + { + "epoch": 0.12097341398227598, + "grad_norm": 2.4375, + "learning_rate": 2.419127988748242e-06, + "loss": 1.7564, + "step": 430 + }, + { + "epoch": 0.12238008158672106, + "grad_norm": 2.9375, + "learning_rate": 2.4472573839662443e-06, + "loss": 1.8436, + "step": 435 + }, + { + "epoch": 0.12378674919116613, + "grad_norm": 2.21875, + "learning_rate": 2.4753867791842475e-06, + "loss": 1.9585, + "step": 440 + }, + { + "epoch": 0.1251934167956112, + "grad_norm": 5.0625, + "learning_rate": 2.5035161744022503e-06, + "loss": 1.7232, + "step": 445 + }, + { + "epoch": 0.12660008440005627, + "grad_norm": 3.03125, + "learning_rate": 2.531645569620253e-06, + "loss": 1.9946, + "step": 450 + }, + { + "epoch": 0.12800675200450135, + "grad_norm": 2.765625, + "learning_rate": 2.559774964838256e-06, + "loss": 1.9282, + "step": 455 + }, + { + "epoch": 0.1294134196089464, + "grad_norm": 4.1875, + "learning_rate": 2.587904360056259e-06, + "loss": 1.7587, + "step": 460 + }, + { + "epoch": 0.13082008721339147, + "grad_norm": 4.75, + "learning_rate": 2.6160337552742614e-06, + "loss": 1.785, + "step": 465 + }, + { + "epoch": 0.13222675481783655, + "grad_norm": 3.375, + "learning_rate": 2.644163150492264e-06, + "loss": 1.8986, + "step": 470 + }, + { + "epoch": 0.13363342242228163, + "grad_norm": 4.8125, + "learning_rate": 2.6722925457102674e-06, + "loss": 1.7295, + "step": 475 + }, + { + "epoch": 0.13504009002672668, + "grad_norm": 2.546875, + "learning_rate": 2.7004219409282697e-06, + "loss": 1.9288, + "step": 480 + }, + { + "epoch": 0.13644675763117176, + "grad_norm": 3.4375, + "learning_rate": 2.728551336146273e-06, + "loss": 1.7269, + "step": 485 + }, + { + "epoch": 0.13785342523561683, + "grad_norm": 3.296875, + "learning_rate": 2.7566807313642753e-06, + "loss": 1.4669, + "step": 490 + }, + { + "epoch": 0.13926009284006188, + "grad_norm": 3.84375, + "learning_rate": 2.7848101265822785e-06, + "loss": 1.9001, + "step": 495 + }, + { + "epoch": 0.14066676044450696, + "grad_norm": 3.34375, + "learning_rate": 2.8129395218002813e-06, + "loss": 1.7307, + "step": 500 + }, + { + "epoch": 0.14207342804895204, + "grad_norm": 4.4375, + "learning_rate": 2.8410689170182836e-06, + "loss": 1.8674, + "step": 505 + }, + { + "epoch": 0.14348009565339712, + "grad_norm": 12.375, + "learning_rate": 2.869198312236287e-06, + "loss": 1.9095, + "step": 510 + }, + { + "epoch": 0.14488676325784217, + "grad_norm": 2.53125, + "learning_rate": 2.8973277074542896e-06, + "loss": 1.8201, + "step": 515 + }, + { + "epoch": 0.14629343086228724, + "grad_norm": 3.625, + "learning_rate": 2.9254571026722924e-06, + "loss": 1.9156, + "step": 520 + }, + { + "epoch": 0.14770009846673232, + "grad_norm": 3.96875, + "learning_rate": 2.953586497890295e-06, + "loss": 1.7277, + "step": 525 + }, + { + "epoch": 0.14910676607117737, + "grad_norm": 4.25, + "learning_rate": 2.9817158931082984e-06, + "loss": 1.7154, + "step": 530 + }, + { + "epoch": 0.15051343367562245, + "grad_norm": 3.734375, + "learning_rate": 3.0098452883263007e-06, + "loss": 1.7168, + "step": 535 + }, + { + "epoch": 0.15192010128006753, + "grad_norm": 4.1875, + "learning_rate": 3.037974683544304e-06, + "loss": 2.0013, + "step": 540 + }, + { + "epoch": 0.15332676888451258, + "grad_norm": 2.890625, + "learning_rate": 3.0661040787623063e-06, + "loss": 1.8941, + "step": 545 + }, + { + "epoch": 0.15473343648895765, + "grad_norm": 3.046875, + "learning_rate": 3.094233473980309e-06, + "loss": 1.682, + "step": 550 + }, + { + "epoch": 0.15614010409340273, + "grad_norm": 5.75, + "learning_rate": 3.1223628691983123e-06, + "loss": 1.7576, + "step": 555 + }, + { + "epoch": 0.1575467716978478, + "grad_norm": 4.71875, + "learning_rate": 3.1504922644163146e-06, + "loss": 1.8123, + "step": 560 + }, + { + "epoch": 0.15895343930229286, + "grad_norm": 3.3125, + "learning_rate": 3.178621659634318e-06, + "loss": 1.6807, + "step": 565 + }, + { + "epoch": 0.16036010690673794, + "grad_norm": 2.859375, + "learning_rate": 3.2067510548523206e-06, + "loss": 1.6907, + "step": 570 + }, + { + "epoch": 0.16176677451118301, + "grad_norm": 3.0625, + "learning_rate": 3.2348804500703234e-06, + "loss": 1.892, + "step": 575 + }, + { + "epoch": 0.16317344211562806, + "grad_norm": 2.625, + "learning_rate": 3.263009845288326e-06, + "loss": 2.0725, + "step": 580 + }, + { + "epoch": 0.16458010972007314, + "grad_norm": 4.5, + "learning_rate": 3.2911392405063294e-06, + "loss": 1.8148, + "step": 585 + }, + { + "epoch": 0.16598677732451822, + "grad_norm": 3.71875, + "learning_rate": 3.3192686357243317e-06, + "loss": 1.7494, + "step": 590 + }, + { + "epoch": 0.1673934449289633, + "grad_norm": 4.53125, + "learning_rate": 3.3473980309423345e-06, + "loss": 1.8697, + "step": 595 + }, + { + "epoch": 0.16880011253340835, + "grad_norm": 4.65625, + "learning_rate": 3.3755274261603373e-06, + "loss": 1.9731, + "step": 600 + }, + { + "epoch": 0.17020678013785343, + "grad_norm": 2.859375, + "learning_rate": 3.40365682137834e-06, + "loss": 1.7131, + "step": 605 + }, + { + "epoch": 0.1716134477422985, + "grad_norm": 3.46875, + "learning_rate": 3.4317862165963433e-06, + "loss": 1.8335, + "step": 610 + }, + { + "epoch": 0.17302011534674355, + "grad_norm": 2.625, + "learning_rate": 3.4599156118143456e-06, + "loss": 1.8878, + "step": 615 + }, + { + "epoch": 0.17442678295118863, + "grad_norm": 4.5, + "learning_rate": 3.488045007032349e-06, + "loss": 1.6365, + "step": 620 + }, + { + "epoch": 0.1758334505556337, + "grad_norm": 5.28125, + "learning_rate": 3.5161744022503516e-06, + "loss": 1.8077, + "step": 625 + }, + { + "epoch": 0.17724011816007879, + "grad_norm": 3.78125, + "learning_rate": 3.544303797468354e-06, + "loss": 1.9373, + "step": 630 + }, + { + "epoch": 0.17864678576452384, + "grad_norm": 3.03125, + "learning_rate": 3.572433192686357e-06, + "loss": 1.8536, + "step": 635 + }, + { + "epoch": 0.1800534533689689, + "grad_norm": 4.03125, + "learning_rate": 3.60056258790436e-06, + "loss": 1.7992, + "step": 640 + }, + { + "epoch": 0.181460120973414, + "grad_norm": 2.65625, + "learning_rate": 3.6286919831223627e-06, + "loss": 1.7746, + "step": 645 + }, + { + "epoch": 0.18286678857785904, + "grad_norm": 3.125, + "learning_rate": 3.6568213783403655e-06, + "loss": 2.0007, + "step": 650 + }, + { + "epoch": 0.18427345618230412, + "grad_norm": 3.15625, + "learning_rate": 3.6849507735583683e-06, + "loss": 1.8947, + "step": 655 + }, + { + "epoch": 0.1856801237867492, + "grad_norm": 3.484375, + "learning_rate": 3.713080168776371e-06, + "loss": 1.8504, + "step": 660 + }, + { + "epoch": 0.18708679139119427, + "grad_norm": 3.15625, + "learning_rate": 3.7412095639943743e-06, + "loss": 1.7796, + "step": 665 + }, + { + "epoch": 0.18849345899563932, + "grad_norm": 3.296875, + "learning_rate": 3.7693389592123766e-06, + "loss": 1.7598, + "step": 670 + }, + { + "epoch": 0.1899001266000844, + "grad_norm": 2.765625, + "learning_rate": 3.7974683544303794e-06, + "loss": 1.6422, + "step": 675 + }, + { + "epoch": 0.19130679420452948, + "grad_norm": 3.546875, + "learning_rate": 3.825597749648382e-06, + "loss": 1.8552, + "step": 680 + }, + { + "epoch": 0.19271346180897453, + "grad_norm": 3.921875, + "learning_rate": 3.853727144866385e-06, + "loss": 1.5753, + "step": 685 + }, + { + "epoch": 0.1941201294134196, + "grad_norm": 3.421875, + "learning_rate": 3.881856540084388e-06, + "loss": 1.7207, + "step": 690 + }, + { + "epoch": 0.19552679701786468, + "grad_norm": 5.21875, + "learning_rate": 3.909985935302391e-06, + "loss": 1.5458, + "step": 695 + }, + { + "epoch": 0.19693346462230976, + "grad_norm": 4.03125, + "learning_rate": 3.938115330520394e-06, + "loss": 1.6871, + "step": 700 + }, + { + "epoch": 0.1983401322267548, + "grad_norm": 3.328125, + "learning_rate": 3.9662447257383965e-06, + "loss": 1.9603, + "step": 705 + }, + { + "epoch": 0.1997467998311999, + "grad_norm": 4.0625, + "learning_rate": 3.9943741209564e-06, + "loss": 1.6939, + "step": 710 + }, + { + "epoch": 0.20115346743564497, + "grad_norm": 4.03125, + "learning_rate": 4.022503516174402e-06, + "loss": 1.7818, + "step": 715 + }, + { + "epoch": 0.20256013504009002, + "grad_norm": 3.515625, + "learning_rate": 4.050632911392405e-06, + "loss": 1.7131, + "step": 720 + }, + { + "epoch": 0.2039668026445351, + "grad_norm": 2.984375, + "learning_rate": 4.078762306610408e-06, + "loss": 1.8219, + "step": 725 + }, + { + "epoch": 0.20537347024898017, + "grad_norm": 3.65625, + "learning_rate": 4.106891701828411e-06, + "loss": 1.6883, + "step": 730 + }, + { + "epoch": 0.20678013785342522, + "grad_norm": 3.453125, + "learning_rate": 4.135021097046413e-06, + "loss": 1.8892, + "step": 735 + }, + { + "epoch": 0.2081868054578703, + "grad_norm": 4.0625, + "learning_rate": 4.163150492264416e-06, + "loss": 1.6866, + "step": 740 + }, + { + "epoch": 0.20959347306231538, + "grad_norm": 3.296875, + "learning_rate": 4.191279887482419e-06, + "loss": 1.4289, + "step": 745 + }, + { + "epoch": 0.21100014066676046, + "grad_norm": 3.734375, + "learning_rate": 4.219409282700422e-06, + "loss": 1.5471, + "step": 750 + }, + { + "epoch": 0.2124068082712055, + "grad_norm": 3.59375, + "learning_rate": 4.247538677918425e-06, + "loss": 1.9359, + "step": 755 + }, + { + "epoch": 0.21381347587565058, + "grad_norm": 3.390625, + "learning_rate": 4.275668073136427e-06, + "loss": 1.9177, + "step": 760 + }, + { + "epoch": 0.21522014348009566, + "grad_norm": 3.53125, + "learning_rate": 4.30379746835443e-06, + "loss": 1.9106, + "step": 765 + }, + { + "epoch": 0.2166268110845407, + "grad_norm": 3.375, + "learning_rate": 4.331926863572433e-06, + "loss": 1.9653, + "step": 770 + }, + { + "epoch": 0.2180334786889858, + "grad_norm": 3.09375, + "learning_rate": 4.360056258790436e-06, + "loss": 1.7532, + "step": 775 + }, + { + "epoch": 0.21944014629343087, + "grad_norm": 3.09375, + "learning_rate": 4.388185654008439e-06, + "loss": 1.8319, + "step": 780 + }, + { + "epoch": 0.22084681389787594, + "grad_norm": 3.140625, + "learning_rate": 4.416315049226442e-06, + "loss": 2.0233, + "step": 785 + }, + { + "epoch": 0.222253481502321, + "grad_norm": 2.71875, + "learning_rate": 4.444444444444444e-06, + "loss": 1.9903, + "step": 790 + }, + { + "epoch": 0.22366014910676607, + "grad_norm": 2.4375, + "learning_rate": 4.4725738396624465e-06, + "loss": 2.0019, + "step": 795 + }, + { + "epoch": 0.22506681671121115, + "grad_norm": 3.171875, + "learning_rate": 4.50070323488045e-06, + "loss": 1.7976, + "step": 800 + }, + { + "epoch": 0.2264734843156562, + "grad_norm": 3.453125, + "learning_rate": 4.528832630098453e-06, + "loss": 1.8314, + "step": 805 + }, + { + "epoch": 0.22788015192010128, + "grad_norm": 3.046875, + "learning_rate": 4.556962025316456e-06, + "loss": 1.7646, + "step": 810 + }, + { + "epoch": 0.22928681952454635, + "grad_norm": 9.9375, + "learning_rate": 4.585091420534458e-06, + "loss": 1.5575, + "step": 815 + }, + { + "epoch": 0.23069348712899143, + "grad_norm": 4.5625, + "learning_rate": 4.613220815752461e-06, + "loss": 1.4394, + "step": 820 + }, + { + "epoch": 0.23210015473343648, + "grad_norm": 3.5625, + "learning_rate": 4.641350210970464e-06, + "loss": 1.6944, + "step": 825 + }, + { + "epoch": 0.23350682233788156, + "grad_norm": 3.625, + "learning_rate": 4.669479606188467e-06, + "loss": 1.9698, + "step": 830 + }, + { + "epoch": 0.23491348994232664, + "grad_norm": 2.46875, + "learning_rate": 4.69760900140647e-06, + "loss": 1.8369, + "step": 835 + }, + { + "epoch": 0.2363201575467717, + "grad_norm": 2.84375, + "learning_rate": 4.725738396624473e-06, + "loss": 2.0049, + "step": 840 + }, + { + "epoch": 0.23772682515121676, + "grad_norm": 4.90625, + "learning_rate": 4.753867791842475e-06, + "loss": 1.6301, + "step": 845 + }, + { + "epoch": 0.23913349275566184, + "grad_norm": 3.28125, + "learning_rate": 4.7819971870604775e-06, + "loss": 2.0206, + "step": 850 + }, + { + "epoch": 0.24054016036010692, + "grad_norm": 3.671875, + "learning_rate": 4.810126582278481e-06, + "loss": 1.9102, + "step": 855 + }, + { + "epoch": 0.24194682796455197, + "grad_norm": 2.515625, + "learning_rate": 4.838255977496484e-06, + "loss": 1.5072, + "step": 860 + }, + { + "epoch": 0.24335349556899705, + "grad_norm": 3.265625, + "learning_rate": 4.866385372714487e-06, + "loss": 1.5767, + "step": 865 + }, + { + "epoch": 0.24476016317344212, + "grad_norm": 3.140625, + "learning_rate": 4.894514767932489e-06, + "loss": 1.979, + "step": 870 + }, + { + "epoch": 0.24616683077788717, + "grad_norm": 3.71875, + "learning_rate": 4.922644163150492e-06, + "loss": 1.7614, + "step": 875 + }, + { + "epoch": 0.24757349838233225, + "grad_norm": 2.703125, + "learning_rate": 4.950773558368495e-06, + "loss": 1.7441, + "step": 880 + }, + { + "epoch": 0.24898016598677733, + "grad_norm": 3.546875, + "learning_rate": 4.978902953586497e-06, + "loss": 1.7426, + "step": 885 + }, + { + "epoch": 0.2503868335912224, + "grad_norm": 4.6875, + "learning_rate": 5.0070323488045006e-06, + "loss": 1.7325, + "step": 890 + }, + { + "epoch": 0.25179350119566746, + "grad_norm": 3.609375, + "learning_rate": 5.035161744022504e-06, + "loss": 1.8025, + "step": 895 + }, + { + "epoch": 0.25320016880011254, + "grad_norm": 4.5, + "learning_rate": 5.063291139240506e-06, + "loss": 1.7743, + "step": 900 + }, + { + "epoch": 0.2546068364045576, + "grad_norm": 4.125, + "learning_rate": 5.0914205344585085e-06, + "loss": 1.5968, + "step": 905 + }, + { + "epoch": 0.2560135040090027, + "grad_norm": 11.3125, + "learning_rate": 5.119549929676512e-06, + "loss": 1.7051, + "step": 910 + }, + { + "epoch": 0.25742017161344777, + "grad_norm": 3.09375, + "learning_rate": 5.147679324894515e-06, + "loss": 1.7582, + "step": 915 + }, + { + "epoch": 0.2588268392178928, + "grad_norm": 3.171875, + "learning_rate": 5.175808720112518e-06, + "loss": 1.7175, + "step": 920 + }, + { + "epoch": 0.26023350682233787, + "grad_norm": 4.0625, + "learning_rate": 5.20393811533052e-06, + "loss": 1.7261, + "step": 925 + }, + { + "epoch": 0.26164017442678295, + "grad_norm": 2.640625, + "learning_rate": 5.232067510548523e-06, + "loss": 1.7149, + "step": 930 + }, + { + "epoch": 0.263046842031228, + "grad_norm": 3.28125, + "learning_rate": 5.260196905766526e-06, + "loss": 1.7658, + "step": 935 + }, + { + "epoch": 0.2644535096356731, + "grad_norm": 3.296875, + "learning_rate": 5.288326300984528e-06, + "loss": 1.691, + "step": 940 + }, + { + "epoch": 0.2658601772401182, + "grad_norm": 2.96875, + "learning_rate": 5.3164556962025316e-06, + "loss": 2.1113, + "step": 945 + }, + { + "epoch": 0.26726684484456326, + "grad_norm": 2.671875, + "learning_rate": 5.344585091420535e-06, + "loss": 1.9024, + "step": 950 + }, + { + "epoch": 0.2686735124490083, + "grad_norm": 4.875, + "learning_rate": 5.372714486638537e-06, + "loss": 1.8391, + "step": 955 + }, + { + "epoch": 0.27008018005345336, + "grad_norm": 3.53125, + "learning_rate": 5.4008438818565395e-06, + "loss": 1.607, + "step": 960 + }, + { + "epoch": 0.27148684765789843, + "grad_norm": 4.5625, + "learning_rate": 5.428973277074543e-06, + "loss": 1.8196, + "step": 965 + }, + { + "epoch": 0.2728935152623435, + "grad_norm": 3.203125, + "learning_rate": 5.457102672292546e-06, + "loss": 1.6765, + "step": 970 + }, + { + "epoch": 0.2743001828667886, + "grad_norm": 3.015625, + "learning_rate": 5.485232067510548e-06, + "loss": 1.8108, + "step": 975 + }, + { + "epoch": 0.27570685047123367, + "grad_norm": 2.84375, + "learning_rate": 5.513361462728551e-06, + "loss": 1.8147, + "step": 980 + }, + { + "epoch": 0.27711351807567874, + "grad_norm": 3.296875, + "learning_rate": 5.541490857946554e-06, + "loss": 1.936, + "step": 985 + }, + { + "epoch": 0.27852018568012377, + "grad_norm": 3.8125, + "learning_rate": 5.569620253164557e-06, + "loss": 1.781, + "step": 990 + }, + { + "epoch": 0.27992685328456884, + "grad_norm": 3.84375, + "learning_rate": 5.597749648382559e-06, + "loss": 1.4884, + "step": 995 + }, + { + "epoch": 0.2813335208890139, + "grad_norm": 2.359375, + "learning_rate": 5.6258790436005626e-06, + "loss": 1.472, + "step": 1000 + }, + { + "epoch": 0.282740188493459, + "grad_norm": 2.703125, + "learning_rate": 5.654008438818566e-06, + "loss": 1.4928, + "step": 1005 + }, + { + "epoch": 0.2841468560979041, + "grad_norm": 3.203125, + "learning_rate": 5.682137834036567e-06, + "loss": 1.6619, + "step": 1010 + }, + { + "epoch": 0.28555352370234915, + "grad_norm": 3.5625, + "learning_rate": 5.7102672292545705e-06, + "loss": 2.0167, + "step": 1015 + }, + { + "epoch": 0.28696019130679423, + "grad_norm": 3.328125, + "learning_rate": 5.738396624472574e-06, + "loss": 1.7161, + "step": 1020 + }, + { + "epoch": 0.28836685891123925, + "grad_norm": 3.765625, + "learning_rate": 5.766526019690577e-06, + "loss": 1.9748, + "step": 1025 + }, + { + "epoch": 0.28977352651568433, + "grad_norm": 4.59375, + "learning_rate": 5.794655414908579e-06, + "loss": 1.7424, + "step": 1030 + }, + { + "epoch": 0.2911801941201294, + "grad_norm": 2.625, + "learning_rate": 5.822784810126582e-06, + "loss": 1.6562, + "step": 1035 + }, + { + "epoch": 0.2925868617245745, + "grad_norm": 2.90625, + "learning_rate": 5.850914205344585e-06, + "loss": 1.7536, + "step": 1040 + }, + { + "epoch": 0.29399352932901957, + "grad_norm": 2.4375, + "learning_rate": 5.879043600562588e-06, + "loss": 1.8031, + "step": 1045 + }, + { + "epoch": 0.29540019693346464, + "grad_norm": 2.953125, + "learning_rate": 5.90717299578059e-06, + "loss": 1.6396, + "step": 1050 + }, + { + "epoch": 0.29680686453790966, + "grad_norm": 2.78125, + "learning_rate": 5.9353023909985935e-06, + "loss": 1.7845, + "step": 1055 + }, + { + "epoch": 0.29821353214235474, + "grad_norm": 2.96875, + "learning_rate": 5.963431786216597e-06, + "loss": 1.9526, + "step": 1060 + }, + { + "epoch": 0.2996201997467998, + "grad_norm": 2.734375, + "learning_rate": 5.991561181434598e-06, + "loss": 1.5986, + "step": 1065 + }, + { + "epoch": 0.3010268673512449, + "grad_norm": 3.765625, + "learning_rate": 6.0196905766526015e-06, + "loss": 1.7045, + "step": 1070 + }, + { + "epoch": 0.30243353495569, + "grad_norm": 4.28125, + "learning_rate": 6.047819971870605e-06, + "loss": 1.6006, + "step": 1075 + }, + { + "epoch": 0.30384020256013505, + "grad_norm": 2.328125, + "learning_rate": 6.075949367088608e-06, + "loss": 1.8305, + "step": 1080 + }, + { + "epoch": 0.30524687016458013, + "grad_norm": 4.9375, + "learning_rate": 6.10407876230661e-06, + "loss": 1.6308, + "step": 1085 + }, + { + "epoch": 0.30665353776902515, + "grad_norm": 2.375, + "learning_rate": 6.132208157524613e-06, + "loss": 1.7865, + "step": 1090 + }, + { + "epoch": 0.30806020537347023, + "grad_norm": 2.734375, + "learning_rate": 6.160337552742616e-06, + "loss": 1.8331, + "step": 1095 + }, + { + "epoch": 0.3094668729779153, + "grad_norm": 3.015625, + "learning_rate": 6.188466947960618e-06, + "loss": 1.7786, + "step": 1100 + }, + { + "epoch": 0.3108735405823604, + "grad_norm": 3.0, + "learning_rate": 6.216596343178621e-06, + "loss": 2.0003, + "step": 1105 + }, + { + "epoch": 0.31228020818680546, + "grad_norm": 2.84375, + "learning_rate": 6.2447257383966245e-06, + "loss": 1.824, + "step": 1110 + }, + { + "epoch": 0.31368687579125054, + "grad_norm": 3.296875, + "learning_rate": 6.272855133614628e-06, + "loss": 1.8362, + "step": 1115 + }, + { + "epoch": 0.3150935433956956, + "grad_norm": 3.921875, + "learning_rate": 6.300984528832629e-06, + "loss": 1.6952, + "step": 1120 + }, + { + "epoch": 0.31650021100014064, + "grad_norm": 5.84375, + "learning_rate": 6.3291139240506325e-06, + "loss": 1.5749, + "step": 1125 + }, + { + "epoch": 0.3179068786045857, + "grad_norm": 3.21875, + "learning_rate": 6.357243319268636e-06, + "loss": 1.5969, + "step": 1130 + }, + { + "epoch": 0.3193135462090308, + "grad_norm": 4.375, + "learning_rate": 6.385372714486638e-06, + "loss": 1.5812, + "step": 1135 + }, + { + "epoch": 0.3207202138134759, + "grad_norm": 2.328125, + "learning_rate": 6.413502109704641e-06, + "loss": 1.9075, + "step": 1140 + }, + { + "epoch": 0.32212688141792095, + "grad_norm": 2.59375, + "learning_rate": 6.4416315049226436e-06, + "loss": 1.9003, + "step": 1145 + }, + { + "epoch": 0.32353354902236603, + "grad_norm": 3.046875, + "learning_rate": 6.469760900140647e-06, + "loss": 1.7614, + "step": 1150 + }, + { + "epoch": 0.3249402166268111, + "grad_norm": 4.9375, + "learning_rate": 6.497890295358649e-06, + "loss": 1.812, + "step": 1155 + }, + { + "epoch": 0.32634688423125613, + "grad_norm": 3.234375, + "learning_rate": 6.526019690576652e-06, + "loss": 1.6565, + "step": 1160 + }, + { + "epoch": 0.3277535518357012, + "grad_norm": 3.140625, + "learning_rate": 6.5541490857946555e-06, + "loss": 1.9087, + "step": 1165 + }, + { + "epoch": 0.3291602194401463, + "grad_norm": 3.375, + "learning_rate": 6.582278481012659e-06, + "loss": 1.6842, + "step": 1170 + }, + { + "epoch": 0.33056688704459136, + "grad_norm": 4.09375, + "learning_rate": 6.61040787623066e-06, + "loss": 1.4252, + "step": 1175 + }, + { + "epoch": 0.33197355464903644, + "grad_norm": 3.203125, + "learning_rate": 6.6385372714486634e-06, + "loss": 1.6601, + "step": 1180 + }, + { + "epoch": 0.3333802222534815, + "grad_norm": 2.59375, + "learning_rate": 6.666666666666667e-06, + "loss": 1.7866, + "step": 1185 + }, + { + "epoch": 0.3347868898579266, + "grad_norm": 3.109375, + "learning_rate": 6.694796061884669e-06, + "loss": 1.6193, + "step": 1190 + }, + { + "epoch": 0.3361935574623716, + "grad_norm": 3.234375, + "learning_rate": 6.722925457102672e-06, + "loss": 1.6185, + "step": 1195 + }, + { + "epoch": 0.3376002250668167, + "grad_norm": 3.296875, + "learning_rate": 6.7510548523206746e-06, + "loss": 1.7356, + "step": 1200 + }, + { + "epoch": 0.3390068926712618, + "grad_norm": 2.78125, + "learning_rate": 6.779184247538678e-06, + "loss": 1.8204, + "step": 1205 + }, + { + "epoch": 0.34041356027570685, + "grad_norm": 3.203125, + "learning_rate": 6.80731364275668e-06, + "loss": 1.6249, + "step": 1210 + }, + { + "epoch": 0.34182022788015193, + "grad_norm": 2.890625, + "learning_rate": 6.835443037974683e-06, + "loss": 1.8842, + "step": 1215 + }, + { + "epoch": 0.343226895484597, + "grad_norm": 2.53125, + "learning_rate": 6.8635724331926865e-06, + "loss": 1.7801, + "step": 1220 + }, + { + "epoch": 0.3446335630890421, + "grad_norm": 3.921875, + "learning_rate": 6.891701828410689e-06, + "loss": 1.4851, + "step": 1225 + }, + { + "epoch": 0.3460402306934871, + "grad_norm": 3.125, + "learning_rate": 6.919831223628691e-06, + "loss": 1.7178, + "step": 1230 + }, + { + "epoch": 0.3474468982979322, + "grad_norm": 2.109375, + "learning_rate": 6.9479606188466944e-06, + "loss": 1.6033, + "step": 1235 + }, + { + "epoch": 0.34885356590237726, + "grad_norm": 3.203125, + "learning_rate": 6.976090014064698e-06, + "loss": 1.7969, + "step": 1240 + }, + { + "epoch": 0.35026023350682234, + "grad_norm": 2.515625, + "learning_rate": 7.0042194092827e-06, + "loss": 1.8187, + "step": 1245 + }, + { + "epoch": 0.3516669011112674, + "grad_norm": 2.859375, + "learning_rate": 7.032348804500703e-06, + "loss": 1.819, + "step": 1250 + }, + { + "epoch": 0.3530735687157125, + "grad_norm": 2.578125, + "learning_rate": 7.0604781997187056e-06, + "loss": 1.7712, + "step": 1255 + }, + { + "epoch": 0.35448023632015757, + "grad_norm": 5.375, + "learning_rate": 7.088607594936708e-06, + "loss": 1.7155, + "step": 1260 + }, + { + "epoch": 0.3558869039246026, + "grad_norm": 3.296875, + "learning_rate": 7.116736990154711e-06, + "loss": 1.6361, + "step": 1265 + }, + { + "epoch": 0.35729357152904767, + "grad_norm": 3.796875, + "learning_rate": 7.144866385372714e-06, + "loss": 1.7478, + "step": 1270 + }, + { + "epoch": 0.35870023913349275, + "grad_norm": 3.0, + "learning_rate": 7.1729957805907175e-06, + "loss": 1.844, + "step": 1275 + }, + { + "epoch": 0.3601069067379378, + "grad_norm": 3.015625, + "learning_rate": 7.20112517580872e-06, + "loss": 1.6414, + "step": 1280 + }, + { + "epoch": 0.3615135743423829, + "grad_norm": 2.578125, + "learning_rate": 7.229254571026722e-06, + "loss": 1.7427, + "step": 1285 + }, + { + "epoch": 0.362920241946828, + "grad_norm": 2.359375, + "learning_rate": 7.2573839662447254e-06, + "loss": 1.7395, + "step": 1290 + }, + { + "epoch": 0.36432690955127306, + "grad_norm": 2.59375, + "learning_rate": 7.285513361462729e-06, + "loss": 1.8215, + "step": 1295 + }, + { + "epoch": 0.3657335771557181, + "grad_norm": 3.515625, + "learning_rate": 7.313642756680731e-06, + "loss": 1.7573, + "step": 1300 + }, + { + "epoch": 0.36714024476016316, + "grad_norm": 3.203125, + "learning_rate": 7.341772151898734e-06, + "loss": 1.841, + "step": 1305 + }, + { + "epoch": 0.36854691236460824, + "grad_norm": 3.015625, + "learning_rate": 7.3699015471167365e-06, + "loss": 1.842, + "step": 1310 + }, + { + "epoch": 0.3699535799690533, + "grad_norm": 2.875, + "learning_rate": 7.398030942334739e-06, + "loss": 2.0199, + "step": 1315 + }, + { + "epoch": 0.3713602475734984, + "grad_norm": 3.28125, + "learning_rate": 7.426160337552742e-06, + "loss": 1.8881, + "step": 1320 + }, + { + "epoch": 0.37276691517794347, + "grad_norm": 2.640625, + "learning_rate": 7.454289732770745e-06, + "loss": 1.7692, + "step": 1325 + }, + { + "epoch": 0.37417358278238855, + "grad_norm": 4.84375, + "learning_rate": 7.4824191279887485e-06, + "loss": 1.8311, + "step": 1330 + }, + { + "epoch": 0.37558025038683357, + "grad_norm": 5.34375, + "learning_rate": 7.510548523206751e-06, + "loss": 1.7703, + "step": 1335 + }, + { + "epoch": 0.37698691799127865, + "grad_norm": 2.59375, + "learning_rate": 7.538677918424753e-06, + "loss": 1.7416, + "step": 1340 + }, + { + "epoch": 0.3783935855957237, + "grad_norm": 3.109375, + "learning_rate": 7.566807313642756e-06, + "loss": 1.7398, + "step": 1345 + }, + { + "epoch": 0.3798002532001688, + "grad_norm": 2.875, + "learning_rate": 7.594936708860759e-06, + "loss": 1.571, + "step": 1350 + }, + { + "epoch": 0.3812069208046139, + "grad_norm": 3.34375, + "learning_rate": 7.623066104078762e-06, + "loss": 1.4582, + "step": 1355 + }, + { + "epoch": 0.38261358840905896, + "grad_norm": 3.609375, + "learning_rate": 7.651195499296764e-06, + "loss": 1.981, + "step": 1360 + }, + { + "epoch": 0.38402025601350404, + "grad_norm": 4.6875, + "learning_rate": 7.679324894514768e-06, + "loss": 1.611, + "step": 1365 + }, + { + "epoch": 0.38542692361794906, + "grad_norm": 2.828125, + "learning_rate": 7.70745428973277e-06, + "loss": 1.7369, + "step": 1370 + }, + { + "epoch": 0.38683359122239414, + "grad_norm": 3.234375, + "learning_rate": 7.735583684950773e-06, + "loss": 2.081, + "step": 1375 + }, + { + "epoch": 0.3882402588268392, + "grad_norm": 3.34375, + "learning_rate": 7.763713080168775e-06, + "loss": 1.5745, + "step": 1380 + }, + { + "epoch": 0.3896469264312843, + "grad_norm": 2.1875, + "learning_rate": 7.791842475386778e-06, + "loss": 1.7473, + "step": 1385 + }, + { + "epoch": 0.39105359403572937, + "grad_norm": 2.71875, + "learning_rate": 7.819971870604782e-06, + "loss": 1.7226, + "step": 1390 + }, + { + "epoch": 0.39246026164017445, + "grad_norm": 3.40625, + "learning_rate": 7.848101265822784e-06, + "loss": 1.5063, + "step": 1395 + }, + { + "epoch": 0.3938669292446195, + "grad_norm": 2.78125, + "learning_rate": 7.876230661040788e-06, + "loss": 1.5973, + "step": 1400 + }, + { + "epoch": 0.39527359684906455, + "grad_norm": 3.625, + "learning_rate": 7.904360056258789e-06, + "loss": 1.6063, + "step": 1405 + }, + { + "epoch": 0.3966802644535096, + "grad_norm": 2.78125, + "learning_rate": 7.932489451476793e-06, + "loss": 1.889, + "step": 1410 + }, + { + "epoch": 0.3980869320579547, + "grad_norm": 4.15625, + "learning_rate": 7.960618846694795e-06, + "loss": 1.65, + "step": 1415 + }, + { + "epoch": 0.3994935996623998, + "grad_norm": 4.34375, + "learning_rate": 7.9887482419128e-06, + "loss": 1.6546, + "step": 1420 + }, + { + "epoch": 0.40090026726684486, + "grad_norm": 4.6875, + "learning_rate": 7.999998914675671e-06, + "loss": 1.6465, + "step": 1425 + }, + { + "epoch": 0.40230693487128993, + "grad_norm": 2.6875, + "learning_rate": 7.999992282140243e-06, + "loss": 1.7975, + "step": 1430 + }, + { + "epoch": 0.40371360247573496, + "grad_norm": 3.625, + "learning_rate": 7.999979620037334e-06, + "loss": 1.7769, + "step": 1435 + }, + { + "epoch": 0.40512027008018003, + "grad_norm": 3.25, + "learning_rate": 7.999960928386025e-06, + "loss": 1.6168, + "step": 1440 + }, + { + "epoch": 0.4065269376846251, + "grad_norm": 3.28125, + "learning_rate": 7.9999362072145e-06, + "loss": 1.8668, + "step": 1445 + }, + { + "epoch": 0.4079336052890702, + "grad_norm": 2.875, + "learning_rate": 7.999905456560018e-06, + "loss": 1.8308, + "step": 1450 + }, + { + "epoch": 0.40934027289351527, + "grad_norm": 3.984375, + "learning_rate": 7.999868676468933e-06, + "loss": 1.7166, + "step": 1455 + }, + { + "epoch": 0.41074694049796034, + "grad_norm": 3.125, + "learning_rate": 7.99982586699669e-06, + "loss": 1.9376, + "step": 1460 + }, + { + "epoch": 0.4121536081024054, + "grad_norm": 2.578125, + "learning_rate": 7.999777028207818e-06, + "loss": 1.9246, + "step": 1465 + }, + { + "epoch": 0.41356027570685044, + "grad_norm": 3.703125, + "learning_rate": 7.999722160175935e-06, + "loss": 1.8283, + "step": 1470 + }, + { + "epoch": 0.4149669433112955, + "grad_norm": 2.6875, + "learning_rate": 7.99966126298375e-06, + "loss": 1.5573, + "step": 1475 + }, + { + "epoch": 0.4163736109157406, + "grad_norm": 2.9375, + "learning_rate": 7.99959433672306e-06, + "loss": 1.7425, + "step": 1480 + }, + { + "epoch": 0.4177802785201857, + "grad_norm": 3.03125, + "learning_rate": 7.999521381494747e-06, + "loss": 1.468, + "step": 1485 + }, + { + "epoch": 0.41918694612463075, + "grad_norm": 3.578125, + "learning_rate": 7.999442397408785e-06, + "loss": 2.0143, + "step": 1490 + }, + { + "epoch": 0.42059361372907583, + "grad_norm": 3.796875, + "learning_rate": 7.999357384584235e-06, + "loss": 1.5066, + "step": 1495 + }, + { + "epoch": 0.4220002813335209, + "grad_norm": 3.046875, + "learning_rate": 7.999266343149242e-06, + "loss": 1.5112, + "step": 1500 + }, + { + "epoch": 0.42340694893796593, + "grad_norm": 3.484375, + "learning_rate": 7.999169273241046e-06, + "loss": 1.6816, + "step": 1505 + }, + { + "epoch": 0.424813616542411, + "grad_norm": 4.34375, + "learning_rate": 7.999066175005965e-06, + "loss": 1.7814, + "step": 1510 + }, + { + "epoch": 0.4262202841468561, + "grad_norm": 3.5625, + "learning_rate": 7.99895704859941e-06, + "loss": 1.5777, + "step": 1515 + }, + { + "epoch": 0.42762695175130117, + "grad_norm": 2.765625, + "learning_rate": 7.99884189418588e-06, + "loss": 1.614, + "step": 1520 + }, + { + "epoch": 0.42903361935574624, + "grad_norm": 2.46875, + "learning_rate": 7.998720711938954e-06, + "loss": 1.3149, + "step": 1525 + }, + { + "epoch": 0.4304402869601913, + "grad_norm": 3.125, + "learning_rate": 7.998593502041306e-06, + "loss": 1.7651, + "step": 1530 + }, + { + "epoch": 0.4318469545646364, + "grad_norm": 3.0625, + "learning_rate": 7.998460264684688e-06, + "loss": 1.5757, + "step": 1535 + }, + { + "epoch": 0.4332536221690814, + "grad_norm": 2.625, + "learning_rate": 7.998321000069943e-06, + "loss": 1.9167, + "step": 1540 + }, + { + "epoch": 0.4346602897735265, + "grad_norm": 3.84375, + "learning_rate": 7.998175708406999e-06, + "loss": 1.388, + "step": 1545 + }, + { + "epoch": 0.4360669573779716, + "grad_norm": 2.71875, + "learning_rate": 7.998024389914864e-06, + "loss": 1.7055, + "step": 1550 + }, + { + "epoch": 0.43747362498241665, + "grad_norm": 3.71875, + "learning_rate": 7.997867044821638e-06, + "loss": 1.5819, + "step": 1555 + }, + { + "epoch": 0.43888029258686173, + "grad_norm": 4.40625, + "learning_rate": 7.997703673364501e-06, + "loss": 1.7198, + "step": 1560 + }, + { + "epoch": 0.4402869601913068, + "grad_norm": 2.96875, + "learning_rate": 7.997534275789718e-06, + "loss": 1.894, + "step": 1565 + }, + { + "epoch": 0.4416936277957519, + "grad_norm": 3.0, + "learning_rate": 7.99735885235264e-06, + "loss": 1.5067, + "step": 1570 + }, + { + "epoch": 0.4431002954001969, + "grad_norm": 3.8125, + "learning_rate": 7.997177403317696e-06, + "loss": 1.6449, + "step": 1575 + }, + { + "epoch": 0.444506963004642, + "grad_norm": 3.109375, + "learning_rate": 7.996989928958404e-06, + "loss": 1.8517, + "step": 1580 + }, + { + "epoch": 0.44591363060908706, + "grad_norm": 3.96875, + "learning_rate": 7.996796429557362e-06, + "loss": 1.7391, + "step": 1585 + }, + { + "epoch": 0.44732029821353214, + "grad_norm": 2.421875, + "learning_rate": 7.996596905406248e-06, + "loss": 1.8785, + "step": 1590 + }, + { + "epoch": 0.4487269658179772, + "grad_norm": 2.625, + "learning_rate": 7.996391356805825e-06, + "loss": 1.6024, + "step": 1595 + }, + { + "epoch": 0.4501336334224223, + "grad_norm": 6.21875, + "learning_rate": 7.996179784065935e-06, + "loss": 1.6681, + "step": 1600 + }, + { + "epoch": 0.4515403010268674, + "grad_norm": 3.390625, + "learning_rate": 7.995962187505502e-06, + "loss": 1.742, + "step": 1605 + }, + { + "epoch": 0.4529469686313124, + "grad_norm": 2.9375, + "learning_rate": 7.995738567452531e-06, + "loss": 1.477, + "step": 1610 + }, + { + "epoch": 0.4543536362357575, + "grad_norm": 2.953125, + "learning_rate": 7.995508924244104e-06, + "loss": 1.7455, + "step": 1615 + }, + { + "epoch": 0.45576030384020255, + "grad_norm": 2.625, + "learning_rate": 7.995273258226387e-06, + "loss": 1.7959, + "step": 1620 + }, + { + "epoch": 0.45716697144464763, + "grad_norm": 4.5625, + "learning_rate": 7.995031569754617e-06, + "loss": 1.7619, + "step": 1625 + }, + { + "epoch": 0.4585736390490927, + "grad_norm": 3.25, + "learning_rate": 7.994783859193119e-06, + "loss": 1.6018, + "step": 1630 + }, + { + "epoch": 0.4599803066535378, + "grad_norm": 2.921875, + "learning_rate": 7.994530126915285e-06, + "loss": 1.7328, + "step": 1635 + }, + { + "epoch": 0.46138697425798286, + "grad_norm": 4.65625, + "learning_rate": 7.994270373303593e-06, + "loss": 1.6123, + "step": 1640 + }, + { + "epoch": 0.4627936418624279, + "grad_norm": 2.96875, + "learning_rate": 7.994004598749597e-06, + "loss": 1.6376, + "step": 1645 + }, + { + "epoch": 0.46420030946687296, + "grad_norm": 2.59375, + "learning_rate": 7.99373280365392e-06, + "loss": 1.5974, + "step": 1650 + }, + { + "epoch": 0.46560697707131804, + "grad_norm": 2.859375, + "learning_rate": 7.993454988426265e-06, + "loss": 1.3981, + "step": 1655 + }, + { + "epoch": 0.4670136446757631, + "grad_norm": 2.953125, + "learning_rate": 7.993171153485412e-06, + "loss": 1.7091, + "step": 1660 + }, + { + "epoch": 0.4684203122802082, + "grad_norm": 2.515625, + "learning_rate": 7.992881299259208e-06, + "loss": 1.4902, + "step": 1665 + }, + { + "epoch": 0.4698269798846533, + "grad_norm": 2.59375, + "learning_rate": 7.99258542618458e-06, + "loss": 1.8131, + "step": 1670 + }, + { + "epoch": 0.47123364748909835, + "grad_norm": 3.640625, + "learning_rate": 7.992283534707527e-06, + "loss": 1.5423, + "step": 1675 + }, + { + "epoch": 0.4726403150935434, + "grad_norm": 3.359375, + "learning_rate": 7.991975625283116e-06, + "loss": 1.7866, + "step": 1680 + }, + { + "epoch": 0.47404698269798845, + "grad_norm": 3.71875, + "learning_rate": 7.991661698375489e-06, + "loss": 1.4981, + "step": 1685 + }, + { + "epoch": 0.47545365030243353, + "grad_norm": 2.21875, + "learning_rate": 7.991341754457858e-06, + "loss": 1.6244, + "step": 1690 + }, + { + "epoch": 0.4768603179068786, + "grad_norm": 3.4375, + "learning_rate": 7.991015794012506e-06, + "loss": 1.6531, + "step": 1695 + }, + { + "epoch": 0.4782669855113237, + "grad_norm": 3.0625, + "learning_rate": 7.990683817530783e-06, + "loss": 1.6086, + "step": 1700 + }, + { + "epoch": 0.47967365311576876, + "grad_norm": 2.859375, + "learning_rate": 7.990345825513106e-06, + "loss": 1.3878, + "step": 1705 + }, + { + "epoch": 0.48108032072021384, + "grad_norm": 3.40625, + "learning_rate": 7.990001818468968e-06, + "loss": 1.658, + "step": 1710 + }, + { + "epoch": 0.48248698832465886, + "grad_norm": 2.84375, + "learning_rate": 7.989651796916918e-06, + "loss": 1.9873, + "step": 1715 + }, + { + "epoch": 0.48389365592910394, + "grad_norm": 2.484375, + "learning_rate": 7.98929576138458e-06, + "loss": 1.75, + "step": 1720 + }, + { + "epoch": 0.485300323533549, + "grad_norm": 5.1875, + "learning_rate": 7.98893371240864e-06, + "loss": 1.5848, + "step": 1725 + }, + { + "epoch": 0.4867069911379941, + "grad_norm": 2.59375, + "learning_rate": 7.988565650534847e-06, + "loss": 1.6315, + "step": 1730 + }, + { + "epoch": 0.48811365874243917, + "grad_norm": 2.28125, + "learning_rate": 7.988191576318015e-06, + "loss": 1.613, + "step": 1735 + }, + { + "epoch": 0.48952032634688425, + "grad_norm": 3.640625, + "learning_rate": 7.987811490322025e-06, + "loss": 1.4464, + "step": 1740 + }, + { + "epoch": 0.4909269939513293, + "grad_norm": 3.84375, + "learning_rate": 7.987425393119813e-06, + "loss": 1.7572, + "step": 1745 + }, + { + "epoch": 0.49233366155577435, + "grad_norm": 3.625, + "learning_rate": 7.987033285293382e-06, + "loss": 1.5372, + "step": 1750 + }, + { + "epoch": 0.4937403291602194, + "grad_norm": 9.3125, + "learning_rate": 7.986635167433794e-06, + "loss": 1.8296, + "step": 1755 + }, + { + "epoch": 0.4951469967646645, + "grad_norm": 3.515625, + "learning_rate": 7.986231040141167e-06, + "loss": 1.8108, + "step": 1760 + }, + { + "epoch": 0.4965536643691096, + "grad_norm": 2.40625, + "learning_rate": 7.985820904024682e-06, + "loss": 1.6946, + "step": 1765 + }, + { + "epoch": 0.49796033197355466, + "grad_norm": 3.25, + "learning_rate": 7.985404759702576e-06, + "loss": 1.5829, + "step": 1770 + }, + { + "epoch": 0.49936699957799974, + "grad_norm": 3.40625, + "learning_rate": 7.984982607802143e-06, + "loss": 1.7967, + "step": 1775 + }, + { + "epoch": 0.5007736671824448, + "grad_norm": 4.46875, + "learning_rate": 7.984554448959733e-06, + "loss": 1.7127, + "step": 1780 + }, + { + "epoch": 0.5021803347868898, + "grad_norm": 3.3125, + "learning_rate": 7.984120283820747e-06, + "loss": 1.7665, + "step": 1785 + }, + { + "epoch": 0.5035870023913349, + "grad_norm": 4.1875, + "learning_rate": 7.983680113039648e-06, + "loss": 1.6801, + "step": 1790 + }, + { + "epoch": 0.50499366999578, + "grad_norm": 5.15625, + "learning_rate": 7.983233937279946e-06, + "loss": 1.7679, + "step": 1795 + }, + { + "epoch": 0.5064003376002251, + "grad_norm": 3.1875, + "learning_rate": 7.982781757214201e-06, + "loss": 1.5918, + "step": 1800 + }, + { + "epoch": 0.5078070052046701, + "grad_norm": 2.890625, + "learning_rate": 7.982323573524031e-06, + "loss": 1.5204, + "step": 1805 + }, + { + "epoch": 0.5092136728091152, + "grad_norm": 4.09375, + "learning_rate": 7.981859386900095e-06, + "loss": 1.791, + "step": 1810 + }, + { + "epoch": 0.5106203404135603, + "grad_norm": 5.90625, + "learning_rate": 7.98138919804211e-06, + "loss": 1.6106, + "step": 1815 + }, + { + "epoch": 0.5120270080180054, + "grad_norm": 3.8125, + "learning_rate": 7.980913007658834e-06, + "loss": 1.6606, + "step": 1820 + }, + { + "epoch": 0.5134336756224505, + "grad_norm": 3.828125, + "learning_rate": 7.980430816468074e-06, + "loss": 1.5026, + "step": 1825 + }, + { + "epoch": 0.5148403432268955, + "grad_norm": 9.0, + "learning_rate": 7.979942625196683e-06, + "loss": 1.4711, + "step": 1830 + }, + { + "epoch": 0.5162470108313405, + "grad_norm": 3.703125, + "learning_rate": 7.979448434580558e-06, + "loss": 1.8645, + "step": 1835 + }, + { + "epoch": 0.5176536784357856, + "grad_norm": 3.5625, + "learning_rate": 7.978948245364639e-06, + "loss": 1.7167, + "step": 1840 + }, + { + "epoch": 0.5190603460402307, + "grad_norm": 4.125, + "learning_rate": 7.97844205830291e-06, + "loss": 1.6777, + "step": 1845 + }, + { + "epoch": 0.5204670136446757, + "grad_norm": 3.0625, + "learning_rate": 7.977929874158391e-06, + "loss": 1.7533, + "step": 1850 + }, + { + "epoch": 0.5218736812491208, + "grad_norm": 2.734375, + "learning_rate": 7.97741169370315e-06, + "loss": 1.7477, + "step": 1855 + }, + { + "epoch": 0.5232803488535659, + "grad_norm": 2.765625, + "learning_rate": 7.976887517718287e-06, + "loss": 1.8136, + "step": 1860 + }, + { + "epoch": 0.524687016458011, + "grad_norm": 3.25, + "learning_rate": 7.976357346993943e-06, + "loss": 1.5982, + "step": 1865 + }, + { + "epoch": 0.526093684062456, + "grad_norm": 3.734375, + "learning_rate": 7.975821182329293e-06, + "loss": 1.6659, + "step": 1870 + }, + { + "epoch": 0.5275003516669011, + "grad_norm": 7.0625, + "learning_rate": 7.975279024532551e-06, + "loss": 1.7181, + "step": 1875 + }, + { + "epoch": 0.5289070192713462, + "grad_norm": 3.109375, + "learning_rate": 7.974730874420964e-06, + "loss": 1.7149, + "step": 1880 + }, + { + "epoch": 0.5303136868757913, + "grad_norm": 4.71875, + "learning_rate": 7.974176732820807e-06, + "loss": 1.5102, + "step": 1885 + }, + { + "epoch": 0.5317203544802364, + "grad_norm": 2.890625, + "learning_rate": 7.973616600567391e-06, + "loss": 1.7282, + "step": 1890 + }, + { + "epoch": 0.5331270220846814, + "grad_norm": 2.5625, + "learning_rate": 7.973050478505058e-06, + "loss": 1.5252, + "step": 1895 + }, + { + "epoch": 0.5345336896891265, + "grad_norm": 3.296875, + "learning_rate": 7.972478367487176e-06, + "loss": 1.6819, + "step": 1900 + }, + { + "epoch": 0.5359403572935715, + "grad_norm": 3.3125, + "learning_rate": 7.971900268376144e-06, + "loss": 1.5836, + "step": 1905 + }, + { + "epoch": 0.5373470248980166, + "grad_norm": 4.1875, + "learning_rate": 7.971316182043384e-06, + "loss": 1.6865, + "step": 1910 + }, + { + "epoch": 0.5387536925024616, + "grad_norm": 2.421875, + "learning_rate": 7.970726109369344e-06, + "loss": 1.6588, + "step": 1915 + }, + { + "epoch": 0.5401603601069067, + "grad_norm": 4.6875, + "learning_rate": 7.970130051243498e-06, + "loss": 1.7915, + "step": 1920 + }, + { + "epoch": 0.5415670277113518, + "grad_norm": 2.515625, + "learning_rate": 7.969528008564342e-06, + "loss": 1.7502, + "step": 1925 + }, + { + "epoch": 0.5429736953157969, + "grad_norm": 4.15625, + "learning_rate": 7.96891998223939e-06, + "loss": 1.5522, + "step": 1930 + }, + { + "epoch": 0.544380362920242, + "grad_norm": 3.625, + "learning_rate": 7.968305973185177e-06, + "loss": 1.8124, + "step": 1935 + }, + { + "epoch": 0.545787030524687, + "grad_norm": 3.953125, + "learning_rate": 7.96768598232726e-06, + "loss": 1.7465, + "step": 1940 + }, + { + "epoch": 0.5471936981291321, + "grad_norm": 3.03125, + "learning_rate": 7.967060010600207e-06, + "loss": 1.7834, + "step": 1945 + }, + { + "epoch": 0.5486003657335772, + "grad_norm": 3.265625, + "learning_rate": 7.966428058947607e-06, + "loss": 1.5552, + "step": 1950 + }, + { + "epoch": 0.5500070333380223, + "grad_norm": 5.0625, + "learning_rate": 7.965790128322056e-06, + "loss": 1.6003, + "step": 1955 + }, + { + "epoch": 0.5514137009424673, + "grad_norm": 2.171875, + "learning_rate": 7.965146219685173e-06, + "loss": 1.6883, + "step": 1960 + }, + { + "epoch": 0.5528203685469124, + "grad_norm": 2.953125, + "learning_rate": 7.96449633400758e-06, + "loss": 1.5172, + "step": 1965 + }, + { + "epoch": 0.5542270361513575, + "grad_norm": 2.75, + "learning_rate": 7.963840472268913e-06, + "loss": 1.5894, + "step": 1970 + }, + { + "epoch": 0.5556337037558025, + "grad_norm": 2.765625, + "learning_rate": 7.963178635457812e-06, + "loss": 1.5496, + "step": 1975 + }, + { + "epoch": 0.5570403713602475, + "grad_norm": 2.984375, + "learning_rate": 7.962510824571927e-06, + "loss": 1.8202, + "step": 1980 + }, + { + "epoch": 0.5584470389646926, + "grad_norm": 3.0, + "learning_rate": 7.961837040617912e-06, + "loss": 1.6368, + "step": 1985 + }, + { + "epoch": 0.5598537065691377, + "grad_norm": 2.890625, + "learning_rate": 7.961157284611427e-06, + "loss": 1.7324, + "step": 1990 + }, + { + "epoch": 0.5612603741735828, + "grad_norm": 2.734375, + "learning_rate": 7.960471557577132e-06, + "loss": 1.5617, + "step": 1995 + }, + { + "epoch": 0.5626670417780278, + "grad_norm": 2.78125, + "learning_rate": 7.959779860548688e-06, + "loss": 1.7674, + "step": 2000 + }, + { + "epoch": 0.5640737093824729, + "grad_norm": 3.5625, + "learning_rate": 7.959082194568757e-06, + "loss": 1.8521, + "step": 2005 + }, + { + "epoch": 0.565480376986918, + "grad_norm": 2.203125, + "learning_rate": 7.958378560688997e-06, + "loss": 1.605, + "step": 2010 + }, + { + "epoch": 0.5668870445913631, + "grad_norm": 3.09375, + "learning_rate": 7.957668959970058e-06, + "loss": 1.6868, + "step": 2015 + }, + { + "epoch": 0.5682937121958082, + "grad_norm": 2.203125, + "learning_rate": 7.956953393481593e-06, + "loss": 1.9079, + "step": 2020 + }, + { + "epoch": 0.5697003798002532, + "grad_norm": 4.4375, + "learning_rate": 7.956231862302242e-06, + "loss": 1.6886, + "step": 2025 + }, + { + "epoch": 0.5711070474046983, + "grad_norm": 2.578125, + "learning_rate": 7.955504367519637e-06, + "loss": 1.6482, + "step": 2030 + }, + { + "epoch": 0.5725137150091434, + "grad_norm": 2.359375, + "learning_rate": 7.954770910230399e-06, + "loss": 1.9038, + "step": 2035 + }, + { + "epoch": 0.5739203826135885, + "grad_norm": 2.34375, + "learning_rate": 7.954031491540138e-06, + "loss": 1.7288, + "step": 2040 + }, + { + "epoch": 0.5753270502180334, + "grad_norm": 2.5625, + "learning_rate": 7.953286112563452e-06, + "loss": 1.6836, + "step": 2045 + }, + { + "epoch": 0.5767337178224785, + "grad_norm": 2.828125, + "learning_rate": 7.952534774423918e-06, + "loss": 1.6717, + "step": 2050 + }, + { + "epoch": 0.5781403854269236, + "grad_norm": 4.6875, + "learning_rate": 7.951777478254102e-06, + "loss": 1.6014, + "step": 2055 + }, + { + "epoch": 0.5795470530313687, + "grad_norm": 2.703125, + "learning_rate": 7.951014225195548e-06, + "loss": 1.4636, + "step": 2060 + }, + { + "epoch": 0.5809537206358137, + "grad_norm": 2.484375, + "learning_rate": 7.950245016398778e-06, + "loss": 1.4488, + "step": 2065 + }, + { + "epoch": 0.5823603882402588, + "grad_norm": 3.6875, + "learning_rate": 7.949469853023294e-06, + "loss": 1.5397, + "step": 2070 + }, + { + "epoch": 0.5837670558447039, + "grad_norm": 3.46875, + "learning_rate": 7.948688736237573e-06, + "loss": 1.751, + "step": 2075 + }, + { + "epoch": 0.585173723449149, + "grad_norm": 3.703125, + "learning_rate": 7.947901667219067e-06, + "loss": 1.7123, + "step": 2080 + }, + { + "epoch": 0.586580391053594, + "grad_norm": 4.53125, + "learning_rate": 7.9471086471542e-06, + "loss": 1.7476, + "step": 2085 + }, + { + "epoch": 0.5879870586580391, + "grad_norm": 2.640625, + "learning_rate": 7.946309677238364e-06, + "loss": 1.8185, + "step": 2090 + }, + { + "epoch": 0.5893937262624842, + "grad_norm": 3.625, + "learning_rate": 7.945504758675926e-06, + "loss": 1.5302, + "step": 2095 + }, + { + "epoch": 0.5908003938669293, + "grad_norm": 3.078125, + "learning_rate": 7.944693892680213e-06, + "loss": 1.4795, + "step": 2100 + }, + { + "epoch": 0.5922070614713744, + "grad_norm": 3.03125, + "learning_rate": 7.943877080473521e-06, + "loss": 1.4504, + "step": 2105 + }, + { + "epoch": 0.5936137290758193, + "grad_norm": 3.5625, + "learning_rate": 7.94305432328711e-06, + "loss": 1.636, + "step": 2110 + }, + { + "epoch": 0.5950203966802644, + "grad_norm": 3.546875, + "learning_rate": 7.942225622361197e-06, + "loss": 1.6305, + "step": 2115 + }, + { + "epoch": 0.5964270642847095, + "grad_norm": 3.328125, + "learning_rate": 7.941390978944963e-06, + "loss": 1.7123, + "step": 2120 + }, + { + "epoch": 0.5978337318891546, + "grad_norm": 2.96875, + "learning_rate": 7.940550394296545e-06, + "loss": 1.7594, + "step": 2125 + }, + { + "epoch": 0.5992403994935996, + "grad_norm": 3.078125, + "learning_rate": 7.939703869683038e-06, + "loss": 1.5839, + "step": 2130 + }, + { + "epoch": 0.6006470670980447, + "grad_norm": 2.875, + "learning_rate": 7.938851406380484e-06, + "loss": 1.5178, + "step": 2135 + }, + { + "epoch": 0.6020537347024898, + "grad_norm": 2.515625, + "learning_rate": 7.937993005673886e-06, + "loss": 1.7266, + "step": 2140 + }, + { + "epoch": 0.6034604023069349, + "grad_norm": 3.59375, + "learning_rate": 7.93712866885719e-06, + "loss": 1.5003, + "step": 2145 + }, + { + "epoch": 0.60486706991138, + "grad_norm": 3.078125, + "learning_rate": 7.936258397233296e-06, + "loss": 1.6785, + "step": 2150 + }, + { + "epoch": 0.606273737515825, + "grad_norm": 2.484375, + "learning_rate": 7.935382192114043e-06, + "loss": 1.7834, + "step": 2155 + }, + { + "epoch": 0.6076804051202701, + "grad_norm": 2.953125, + "learning_rate": 7.93450005482022e-06, + "loss": 1.7403, + "step": 2160 + }, + { + "epoch": 0.6090870727247152, + "grad_norm": 3.046875, + "learning_rate": 7.933611986681556e-06, + "loss": 1.7666, + "step": 2165 + }, + { + "epoch": 0.6104937403291603, + "grad_norm": 2.546875, + "learning_rate": 7.93271798903672e-06, + "loss": 1.5542, + "step": 2170 + }, + { + "epoch": 0.6119004079336053, + "grad_norm": 2.96875, + "learning_rate": 7.931818063233322e-06, + "loss": 1.8542, + "step": 2175 + }, + { + "epoch": 0.6133070755380503, + "grad_norm": 4.75, + "learning_rate": 7.930912210627902e-06, + "loss": 1.5718, + "step": 2180 + }, + { + "epoch": 0.6147137431424954, + "grad_norm": 2.8125, + "learning_rate": 7.930000432585939e-06, + "loss": 1.5713, + "step": 2185 + }, + { + "epoch": 0.6161204107469405, + "grad_norm": 2.625, + "learning_rate": 7.929082730481841e-06, + "loss": 1.8829, + "step": 2190 + }, + { + "epoch": 0.6175270783513855, + "grad_norm": 2.96875, + "learning_rate": 7.928159105698949e-06, + "loss": 1.9, + "step": 2195 + }, + { + "epoch": 0.6189337459558306, + "grad_norm": 3.703125, + "learning_rate": 7.927229559629529e-06, + "loss": 1.6255, + "step": 2200 + }, + { + "epoch": 0.6203404135602757, + "grad_norm": 2.984375, + "learning_rate": 7.926294093674777e-06, + "loss": 1.4732, + "step": 2205 + }, + { + "epoch": 0.6217470811647208, + "grad_norm": 2.40625, + "learning_rate": 7.925352709244804e-06, + "loss": 1.6324, + "step": 2210 + }, + { + "epoch": 0.6231537487691658, + "grad_norm": 3.703125, + "learning_rate": 7.924405407758654e-06, + "loss": 1.5333, + "step": 2215 + }, + { + "epoch": 0.6245604163736109, + "grad_norm": 3.1875, + "learning_rate": 7.923452190644279e-06, + "loss": 1.7322, + "step": 2220 + }, + { + "epoch": 0.625967083978056, + "grad_norm": 2.5625, + "learning_rate": 7.922493059338556e-06, + "loss": 1.7649, + "step": 2225 + }, + { + "epoch": 0.6273737515825011, + "grad_norm": 2.75, + "learning_rate": 7.921528015287276e-06, + "loss": 1.6691, + "step": 2230 + }, + { + "epoch": 0.6287804191869462, + "grad_norm": 2.859375, + "learning_rate": 7.920557059945137e-06, + "loss": 1.7656, + "step": 2235 + }, + { + "epoch": 0.6301870867913912, + "grad_norm": 2.6875, + "learning_rate": 7.919580194775758e-06, + "loss": 1.7602, + "step": 2240 + }, + { + "epoch": 0.6315937543958363, + "grad_norm": 2.515625, + "learning_rate": 7.918597421251656e-06, + "loss": 1.7364, + "step": 2245 + }, + { + "epoch": 0.6330004220002813, + "grad_norm": 3.09375, + "learning_rate": 7.917608740854259e-06, + "loss": 1.6754, + "step": 2250 + }, + { + "epoch": 0.6344070896047264, + "grad_norm": 2.921875, + "learning_rate": 7.9166141550739e-06, + "loss": 1.5354, + "step": 2255 + }, + { + "epoch": 0.6358137572091714, + "grad_norm": 3.890625, + "learning_rate": 7.915613665409813e-06, + "loss": 1.532, + "step": 2260 + }, + { + "epoch": 0.6372204248136165, + "grad_norm": 2.25, + "learning_rate": 7.914607273370129e-06, + "loss": 1.6626, + "step": 2265 + }, + { + "epoch": 0.6386270924180616, + "grad_norm": 3.671875, + "learning_rate": 7.913594980471877e-06, + "loss": 1.7334, + "step": 2270 + }, + { + "epoch": 0.6400337600225067, + "grad_norm": 3.265625, + "learning_rate": 7.912576788240987e-06, + "loss": 1.537, + "step": 2275 + }, + { + "epoch": 0.6414404276269517, + "grad_norm": 3.40625, + "learning_rate": 7.911552698212271e-06, + "loss": 1.7401, + "step": 2280 + }, + { + "epoch": 0.6428470952313968, + "grad_norm": 3.03125, + "learning_rate": 7.910522711929444e-06, + "loss": 1.7289, + "step": 2285 + }, + { + "epoch": 0.6442537628358419, + "grad_norm": 3.125, + "learning_rate": 7.909486830945092e-06, + "loss": 1.5732, + "step": 2290 + }, + { + "epoch": 0.645660430440287, + "grad_norm": 2.875, + "learning_rate": 7.908445056820707e-06, + "loss": 1.7419, + "step": 2295 + }, + { + "epoch": 0.6470670980447321, + "grad_norm": 3.59375, + "learning_rate": 7.907397391126647e-06, + "loss": 1.6438, + "step": 2300 + }, + { + "epoch": 0.6484737656491771, + "grad_norm": 3.03125, + "learning_rate": 7.906343835442159e-06, + "loss": 1.3731, + "step": 2305 + }, + { + "epoch": 0.6498804332536222, + "grad_norm": 2.640625, + "learning_rate": 7.90528439135537e-06, + "loss": 1.6436, + "step": 2310 + }, + { + "epoch": 0.6512871008580673, + "grad_norm": 4.125, + "learning_rate": 7.904219060463277e-06, + "loss": 1.8662, + "step": 2315 + }, + { + "epoch": 0.6526937684625123, + "grad_norm": 4.59375, + "learning_rate": 7.903147844371757e-06, + "loss": 1.7982, + "step": 2320 + }, + { + "epoch": 0.6541004360669573, + "grad_norm": 2.484375, + "learning_rate": 7.902070744695553e-06, + "loss": 1.6941, + "step": 2325 + }, + { + "epoch": 0.6555071036714024, + "grad_norm": 2.9375, + "learning_rate": 7.900987763058281e-06, + "loss": 1.8189, + "step": 2330 + }, + { + "epoch": 0.6569137712758475, + "grad_norm": 2.65625, + "learning_rate": 7.899898901092425e-06, + "loss": 1.6437, + "step": 2335 + }, + { + "epoch": 0.6583204388802926, + "grad_norm": 2.953125, + "learning_rate": 7.898804160439322e-06, + "loss": 1.5489, + "step": 2340 + }, + { + "epoch": 0.6597271064847376, + "grad_norm": 3.03125, + "learning_rate": 7.897703542749186e-06, + "loss": 1.5735, + "step": 2345 + }, + { + "epoch": 0.6611337740891827, + "grad_norm": 2.703125, + "learning_rate": 7.896597049681078e-06, + "loss": 1.561, + "step": 2350 + }, + { + "epoch": 0.6625404416936278, + "grad_norm": 2.359375, + "learning_rate": 7.895484682902921e-06, + "loss": 1.8226, + "step": 2355 + }, + { + "epoch": 0.6639471092980729, + "grad_norm": 3.71875, + "learning_rate": 7.89436644409149e-06, + "loss": 1.6574, + "step": 2360 + }, + { + "epoch": 0.665353776902518, + "grad_norm": 4.71875, + "learning_rate": 7.893242334932415e-06, + "loss": 1.4988, + "step": 2365 + }, + { + "epoch": 0.666760444506963, + "grad_norm": 2.96875, + "learning_rate": 7.892112357120171e-06, + "loss": 1.7978, + "step": 2370 + }, + { + "epoch": 0.6681671121114081, + "grad_norm": 2.859375, + "learning_rate": 7.890976512358079e-06, + "loss": 1.6548, + "step": 2375 + }, + { + "epoch": 0.6695737797158532, + "grad_norm": 2.953125, + "learning_rate": 7.889834802358309e-06, + "loss": 1.6971, + "step": 2380 + }, + { + "epoch": 0.6709804473202983, + "grad_norm": 2.578125, + "learning_rate": 7.888687228841864e-06, + "loss": 1.5706, + "step": 2385 + }, + { + "epoch": 0.6723871149247432, + "grad_norm": 3.640625, + "learning_rate": 7.887533793538594e-06, + "loss": 1.4289, + "step": 2390 + }, + { + "epoch": 0.6737937825291883, + "grad_norm": 2.28125, + "learning_rate": 7.886374498187178e-06, + "loss": 1.7071, + "step": 2395 + }, + { + "epoch": 0.6752004501336334, + "grad_norm": 3.40625, + "learning_rate": 7.885209344535135e-06, + "loss": 1.8025, + "step": 2400 + }, + { + "epoch": 0.6766071177380785, + "grad_norm": 3.953125, + "learning_rate": 7.884038334338812e-06, + "loss": 1.6936, + "step": 2405 + }, + { + "epoch": 0.6780137853425235, + "grad_norm": 2.171875, + "learning_rate": 7.88286146936338e-06, + "loss": 1.8952, + "step": 2410 + }, + { + "epoch": 0.6794204529469686, + "grad_norm": 2.515625, + "learning_rate": 7.881678751382842e-06, + "loss": 1.5186, + "step": 2415 + }, + { + "epoch": 0.6808271205514137, + "grad_norm": 3.09375, + "learning_rate": 7.880490182180022e-06, + "loss": 1.9398, + "step": 2420 + }, + { + "epoch": 0.6822337881558588, + "grad_norm": 3.265625, + "learning_rate": 7.879295763546558e-06, + "loss": 1.7953, + "step": 2425 + }, + { + "epoch": 0.6836404557603039, + "grad_norm": 2.84375, + "learning_rate": 7.878095497282916e-06, + "loss": 1.8955, + "step": 2430 + }, + { + "epoch": 0.6850471233647489, + "grad_norm": 3.25, + "learning_rate": 7.876889385198367e-06, + "loss": 1.5763, + "step": 2435 + }, + { + "epoch": 0.686453790969194, + "grad_norm": 2.734375, + "learning_rate": 7.875677429111e-06, + "loss": 1.5909, + "step": 2440 + }, + { + "epoch": 0.6878604585736391, + "grad_norm": 5.9375, + "learning_rate": 7.874459630847711e-06, + "loss": 1.6029, + "step": 2445 + }, + { + "epoch": 0.6892671261780842, + "grad_norm": 2.5625, + "learning_rate": 7.873235992244203e-06, + "loss": 1.4505, + "step": 2450 + }, + { + "epoch": 0.6906737937825291, + "grad_norm": 3.46875, + "learning_rate": 7.872006515144983e-06, + "loss": 1.8632, + "step": 2455 + }, + { + "epoch": 0.6920804613869742, + "grad_norm": 3.6875, + "learning_rate": 7.870771201403356e-06, + "loss": 1.6993, + "step": 2460 + }, + { + "epoch": 0.6934871289914193, + "grad_norm": 2.6875, + "learning_rate": 7.86953005288143e-06, + "loss": 1.803, + "step": 2465 + }, + { + "epoch": 0.6948937965958644, + "grad_norm": 3.09375, + "learning_rate": 7.868283071450105e-06, + "loss": 1.7066, + "step": 2470 + }, + { + "epoch": 0.6963004642003094, + "grad_norm": 3.875, + "learning_rate": 7.867030258989072e-06, + "loss": 1.6787, + "step": 2475 + }, + { + "epoch": 0.6977071318047545, + "grad_norm": 3.375, + "learning_rate": 7.865771617386817e-06, + "loss": 1.5385, + "step": 2480 + }, + { + "epoch": 0.6991137994091996, + "grad_norm": 3.171875, + "learning_rate": 7.86450714854061e-06, + "loss": 1.6667, + "step": 2485 + }, + { + "epoch": 0.7005204670136447, + "grad_norm": 1.9609375, + "learning_rate": 7.863236854356502e-06, + "loss": 1.6079, + "step": 2490 + }, + { + "epoch": 0.7019271346180898, + "grad_norm": 3.328125, + "learning_rate": 7.861960736749331e-06, + "loss": 1.7048, + "step": 2495 + }, + { + "epoch": 0.7033338022225348, + "grad_norm": 3.640625, + "learning_rate": 7.860678797642707e-06, + "loss": 1.855, + "step": 2500 + }, + { + "epoch": 0.7047404698269799, + "grad_norm": 2.359375, + "learning_rate": 7.859391038969021e-06, + "loss": 1.8016, + "step": 2505 + }, + { + "epoch": 0.706147137431425, + "grad_norm": 2.515625, + "learning_rate": 7.858097462669432e-06, + "loss": 1.8948, + "step": 2510 + }, + { + "epoch": 0.7075538050358701, + "grad_norm": 2.71875, + "learning_rate": 7.85679807069387e-06, + "loss": 1.5018, + "step": 2515 + }, + { + "epoch": 0.7089604726403151, + "grad_norm": 4.03125, + "learning_rate": 7.855492865001033e-06, + "loss": 1.7206, + "step": 2520 + }, + { + "epoch": 0.7103671402447601, + "grad_norm": 3.515625, + "learning_rate": 7.85418184755838e-06, + "loss": 1.5677, + "step": 2525 + }, + { + "epoch": 0.7117738078492052, + "grad_norm": 2.9375, + "learning_rate": 7.852865020342133e-06, + "loss": 1.7892, + "step": 2530 + }, + { + "epoch": 0.7131804754536503, + "grad_norm": 3.8125, + "learning_rate": 7.851542385337269e-06, + "loss": 1.3885, + "step": 2535 + }, + { + "epoch": 0.7145871430580953, + "grad_norm": 3.8125, + "learning_rate": 7.850213944537522e-06, + "loss": 1.6664, + "step": 2540 + }, + { + "epoch": 0.7159938106625404, + "grad_norm": 2.765625, + "learning_rate": 7.848879699945377e-06, + "loss": 1.5967, + "step": 2545 + }, + { + "epoch": 0.7174004782669855, + "grad_norm": 3.828125, + "learning_rate": 7.847539653572066e-06, + "loss": 1.5588, + "step": 2550 + }, + { + "epoch": 0.7188071458714306, + "grad_norm": 4.9375, + "learning_rate": 7.846193807437571e-06, + "loss": 1.7426, + "step": 2555 + }, + { + "epoch": 0.7202138134758757, + "grad_norm": 2.859375, + "learning_rate": 7.84484216357061e-06, + "loss": 1.7578, + "step": 2560 + }, + { + "epoch": 0.7216204810803207, + "grad_norm": 3.671875, + "learning_rate": 7.843484724008645e-06, + "loss": 1.5375, + "step": 2565 + }, + { + "epoch": 0.7230271486847658, + "grad_norm": 4.15625, + "learning_rate": 7.842121490797876e-06, + "loss": 1.4915, + "step": 2570 + }, + { + "epoch": 0.7244338162892109, + "grad_norm": 2.84375, + "learning_rate": 7.840752465993228e-06, + "loss": 1.385, + "step": 2575 + }, + { + "epoch": 0.725840483893656, + "grad_norm": 3.765625, + "learning_rate": 7.839377651658368e-06, + "loss": 1.3509, + "step": 2580 + }, + { + "epoch": 0.727247151498101, + "grad_norm": 2.40625, + "learning_rate": 7.837997049865677e-06, + "loss": 1.6331, + "step": 2585 + }, + { + "epoch": 0.7286538191025461, + "grad_norm": 3.265625, + "learning_rate": 7.836610662696273e-06, + "loss": 1.785, + "step": 2590 + }, + { + "epoch": 0.7300604867069911, + "grad_norm": 4.0, + "learning_rate": 7.835218492239987e-06, + "loss": 1.7578, + "step": 2595 + }, + { + "epoch": 0.7314671543114362, + "grad_norm": 3.875, + "learning_rate": 7.833820540595369e-06, + "loss": 1.7416, + "step": 2600 + }, + { + "epoch": 0.7328738219158812, + "grad_norm": 3.96875, + "learning_rate": 7.832416809869684e-06, + "loss": 1.7128, + "step": 2605 + }, + { + "epoch": 0.7342804895203263, + "grad_norm": 2.78125, + "learning_rate": 7.831007302178908e-06, + "loss": 1.7317, + "step": 2610 + }, + { + "epoch": 0.7356871571247714, + "grad_norm": 3.296875, + "learning_rate": 7.829592019647729e-06, + "loss": 1.365, + "step": 2615 + }, + { + "epoch": 0.7370938247292165, + "grad_norm": 3.390625, + "learning_rate": 7.82817096440953e-06, + "loss": 1.7279, + "step": 2620 + }, + { + "epoch": 0.7385004923336616, + "grad_norm": 4.53125, + "learning_rate": 7.826744138606408e-06, + "loss": 1.2845, + "step": 2625 + }, + { + "epoch": 0.7399071599381066, + "grad_norm": 3.265625, + "learning_rate": 7.825311544389149e-06, + "loss": 1.5838, + "step": 2630 + }, + { + "epoch": 0.7413138275425517, + "grad_norm": 2.75, + "learning_rate": 7.82387318391724e-06, + "loss": 1.6606, + "step": 2635 + }, + { + "epoch": 0.7427204951469968, + "grad_norm": 3.328125, + "learning_rate": 7.822429059358859e-06, + "loss": 1.767, + "step": 2640 + }, + { + "epoch": 0.7441271627514419, + "grad_norm": 3.203125, + "learning_rate": 7.820979172890869e-06, + "loss": 1.6674, + "step": 2645 + }, + { + "epoch": 0.7455338303558869, + "grad_norm": 2.78125, + "learning_rate": 7.819523526698824e-06, + "loss": 1.7634, + "step": 2650 + }, + { + "epoch": 0.746940497960332, + "grad_norm": 3.25, + "learning_rate": 7.818062122976954e-06, + "loss": 1.6022, + "step": 2655 + }, + { + "epoch": 0.7483471655647771, + "grad_norm": 2.84375, + "learning_rate": 7.816594963928176e-06, + "loss": 1.6332, + "step": 2660 + }, + { + "epoch": 0.7497538331692221, + "grad_norm": 5.9375, + "learning_rate": 7.815122051764075e-06, + "loss": 1.711, + "step": 2665 + }, + { + "epoch": 0.7511605007736671, + "grad_norm": 3.265625, + "learning_rate": 7.813643388704912e-06, + "loss": 1.4206, + "step": 2670 + }, + { + "epoch": 0.7525671683781122, + "grad_norm": 2.46875, + "learning_rate": 7.812158976979614e-06, + "loss": 1.5857, + "step": 2675 + }, + { + "epoch": 0.7539738359825573, + "grad_norm": 2.921875, + "learning_rate": 7.810668818825778e-06, + "loss": 1.6887, + "step": 2680 + }, + { + "epoch": 0.7553805035870024, + "grad_norm": 3.34375, + "learning_rate": 7.80917291648966e-06, + "loss": 1.4489, + "step": 2685 + }, + { + "epoch": 0.7567871711914474, + "grad_norm": 2.6875, + "learning_rate": 7.807671272226175e-06, + "loss": 1.7821, + "step": 2690 + }, + { + "epoch": 0.7581938387958925, + "grad_norm": 2.046875, + "learning_rate": 7.806163888298894e-06, + "loss": 1.449, + "step": 2695 + }, + { + "epoch": 0.7596005064003376, + "grad_norm": 2.6875, + "learning_rate": 7.80465076698004e-06, + "loss": 1.7329, + "step": 2700 + }, + { + "epoch": 0.7610071740047827, + "grad_norm": 3.203125, + "learning_rate": 7.80313191055048e-06, + "loss": 1.5321, + "step": 2705 + }, + { + "epoch": 0.7624138416092278, + "grad_norm": 3.203125, + "learning_rate": 7.801607321299738e-06, + "loss": 1.3949, + "step": 2710 + }, + { + "epoch": 0.7638205092136728, + "grad_norm": 3.25, + "learning_rate": 7.800077001525966e-06, + "loss": 1.6693, + "step": 2715 + }, + { + "epoch": 0.7652271768181179, + "grad_norm": 3.09375, + "learning_rate": 7.798540953535962e-06, + "loss": 1.3889, + "step": 2720 + }, + { + "epoch": 0.766633844422563, + "grad_norm": 3.015625, + "learning_rate": 7.796999179645157e-06, + "loss": 1.8232, + "step": 2725 + }, + { + "epoch": 0.7680405120270081, + "grad_norm": 3.0, + "learning_rate": 7.795451682177613e-06, + "loss": 1.5121, + "step": 2730 + }, + { + "epoch": 0.769447179631453, + "grad_norm": 4.28125, + "learning_rate": 7.793898463466018e-06, + "loss": 1.7762, + "step": 2735 + }, + { + "epoch": 0.7708538472358981, + "grad_norm": 3.375, + "learning_rate": 7.792339525851686e-06, + "loss": 1.6207, + "step": 2740 + }, + { + "epoch": 0.7722605148403432, + "grad_norm": 2.734375, + "learning_rate": 7.790774871684554e-06, + "loss": 1.6523, + "step": 2745 + }, + { + "epoch": 0.7736671824447883, + "grad_norm": 3.15625, + "learning_rate": 7.789204503323172e-06, + "loss": 1.8693, + "step": 2750 + }, + { + "epoch": 0.7750738500492333, + "grad_norm": 3.921875, + "learning_rate": 7.787628423134702e-06, + "loss": 1.5125, + "step": 2755 + }, + { + "epoch": 0.7764805176536784, + "grad_norm": 2.5625, + "learning_rate": 7.786046633494924e-06, + "loss": 1.4513, + "step": 2760 + }, + { + "epoch": 0.7778871852581235, + "grad_norm": 3.265625, + "learning_rate": 7.784459136788217e-06, + "loss": 1.7672, + "step": 2765 + }, + { + "epoch": 0.7792938528625686, + "grad_norm": 3.203125, + "learning_rate": 7.782865935407566e-06, + "loss": 1.6418, + "step": 2770 + }, + { + "epoch": 0.7807005204670137, + "grad_norm": 3.515625, + "learning_rate": 7.781267031754553e-06, + "loss": 1.6256, + "step": 2775 + }, + { + "epoch": 0.7821071880714587, + "grad_norm": 3.25, + "learning_rate": 7.779662428239359e-06, + "loss": 1.4991, + "step": 2780 + }, + { + "epoch": 0.7835138556759038, + "grad_norm": 2.984375, + "learning_rate": 7.778052127280754e-06, + "loss": 1.7964, + "step": 2785 + }, + { + "epoch": 0.7849205232803489, + "grad_norm": 3.21875, + "learning_rate": 7.776436131306096e-06, + "loss": 1.5225, + "step": 2790 + }, + { + "epoch": 0.786327190884794, + "grad_norm": 3.359375, + "learning_rate": 7.774814442751332e-06, + "loss": 1.5578, + "step": 2795 + }, + { + "epoch": 0.787733858489239, + "grad_norm": 3.984375, + "learning_rate": 7.773187064060981e-06, + "loss": 1.6137, + "step": 2800 + }, + { + "epoch": 0.789140526093684, + "grad_norm": 3.921875, + "learning_rate": 7.771553997688153e-06, + "loss": 1.604, + "step": 2805 + }, + { + "epoch": 0.7905471936981291, + "grad_norm": 3.0, + "learning_rate": 7.769915246094519e-06, + "loss": 1.583, + "step": 2810 + }, + { + "epoch": 0.7919538613025742, + "grad_norm": 5.0, + "learning_rate": 7.768270811750326e-06, + "loss": 1.613, + "step": 2815 + }, + { + "epoch": 0.7933605289070192, + "grad_norm": 3.6875, + "learning_rate": 7.766620697134385e-06, + "loss": 1.6581, + "step": 2820 + }, + { + "epoch": 0.7947671965114643, + "grad_norm": 3.03125, + "learning_rate": 7.76496490473407e-06, + "loss": 1.5417, + "step": 2825 + }, + { + "epoch": 0.7961738641159094, + "grad_norm": 2.953125, + "learning_rate": 7.763303437045313e-06, + "loss": 1.9133, + "step": 2830 + }, + { + "epoch": 0.7975805317203545, + "grad_norm": 3.53125, + "learning_rate": 7.761636296572605e-06, + "loss": 1.8703, + "step": 2835 + }, + { + "epoch": 0.7989871993247996, + "grad_norm": 4.9375, + "learning_rate": 7.759963485828982e-06, + "loss": 1.5487, + "step": 2840 + }, + { + "epoch": 0.8003938669292446, + "grad_norm": 3.0625, + "learning_rate": 7.75828500733603e-06, + "loss": 1.7283, + "step": 2845 + }, + { + "epoch": 0.8018005345336897, + "grad_norm": 3.875, + "learning_rate": 7.75660086362388e-06, + "loss": 1.4787, + "step": 2850 + }, + { + "epoch": 0.8032072021381348, + "grad_norm": 3.65625, + "learning_rate": 7.754911057231202e-06, + "loss": 1.6337, + "step": 2855 + }, + { + "epoch": 0.8046138697425799, + "grad_norm": 2.703125, + "learning_rate": 7.7532155907052e-06, + "loss": 1.6622, + "step": 2860 + }, + { + "epoch": 0.806020537347025, + "grad_norm": 2.671875, + "learning_rate": 7.751514466601611e-06, + "loss": 1.8183, + "step": 2865 + }, + { + "epoch": 0.8074272049514699, + "grad_norm": 3.015625, + "learning_rate": 7.749807687484702e-06, + "loss": 1.5107, + "step": 2870 + }, + { + "epoch": 0.808833872555915, + "grad_norm": 3.5, + "learning_rate": 7.748095255927262e-06, + "loss": 1.3176, + "step": 2875 + }, + { + "epoch": 0.8102405401603601, + "grad_norm": 3.265625, + "learning_rate": 7.746377174510603e-06, + "loss": 1.7498, + "step": 2880 + }, + { + "epoch": 0.8116472077648051, + "grad_norm": 2.75, + "learning_rate": 7.74465344582455e-06, + "loss": 1.7046, + "step": 2885 + }, + { + "epoch": 0.8130538753692502, + "grad_norm": 3.265625, + "learning_rate": 7.742924072467442e-06, + "loss": 1.6646, + "step": 2890 + }, + { + "epoch": 0.8144605429736953, + "grad_norm": 3.25, + "learning_rate": 7.74118905704613e-06, + "loss": 1.5339, + "step": 2895 + }, + { + "epoch": 0.8158672105781404, + "grad_norm": 3.375, + "learning_rate": 7.739448402175967e-06, + "loss": 1.7851, + "step": 2900 + }, + { + "epoch": 0.8172738781825855, + "grad_norm": 2.390625, + "learning_rate": 7.737702110480804e-06, + "loss": 1.7214, + "step": 2905 + }, + { + "epoch": 0.8186805457870305, + "grad_norm": 8.6875, + "learning_rate": 7.735950184592994e-06, + "loss": 1.7116, + "step": 2910 + }, + { + "epoch": 0.8200872133914756, + "grad_norm": 3.21875, + "learning_rate": 7.734192627153382e-06, + "loss": 1.761, + "step": 2915 + }, + { + "epoch": 0.8214938809959207, + "grad_norm": 2.84375, + "learning_rate": 7.732429440811297e-06, + "loss": 1.6123, + "step": 2920 + }, + { + "epoch": 0.8229005486003658, + "grad_norm": 3.40625, + "learning_rate": 7.730660628224563e-06, + "loss": 1.6235, + "step": 2925 + }, + { + "epoch": 0.8243072162048108, + "grad_norm": 3.265625, + "learning_rate": 7.728886192059474e-06, + "loss": 1.8165, + "step": 2930 + }, + { + "epoch": 0.8257138838092559, + "grad_norm": 3.171875, + "learning_rate": 7.727106134990808e-06, + "loss": 1.5903, + "step": 2935 + }, + { + "epoch": 0.8271205514137009, + "grad_norm": 3.21875, + "learning_rate": 7.725320459701813e-06, + "loss": 1.3788, + "step": 2940 + }, + { + "epoch": 0.828527219018146, + "grad_norm": 3.453125, + "learning_rate": 7.723529168884205e-06, + "loss": 1.6623, + "step": 2945 + }, + { + "epoch": 0.829933886622591, + "grad_norm": 3.796875, + "learning_rate": 7.72173226523817e-06, + "loss": 1.6399, + "step": 2950 + }, + { + "epoch": 0.8313405542270361, + "grad_norm": 3.375, + "learning_rate": 7.719929751472348e-06, + "loss": 1.677, + "step": 2955 + }, + { + "epoch": 0.8327472218314812, + "grad_norm": 2.984375, + "learning_rate": 7.71812163030384e-06, + "loss": 1.3748, + "step": 2960 + }, + { + "epoch": 0.8341538894359263, + "grad_norm": 5.59375, + "learning_rate": 7.7163079044582e-06, + "loss": 1.7135, + "step": 2965 + }, + { + "epoch": 0.8355605570403714, + "grad_norm": 2.71875, + "learning_rate": 7.714488576669427e-06, + "loss": 1.7007, + "step": 2970 + }, + { + "epoch": 0.8369672246448164, + "grad_norm": 3.734375, + "learning_rate": 7.712663649679966e-06, + "loss": 1.5469, + "step": 2975 + }, + { + "epoch": 0.8383738922492615, + "grad_norm": 4.71875, + "learning_rate": 7.710833126240702e-06, + "loss": 1.5403, + "step": 2980 + }, + { + "epoch": 0.8397805598537066, + "grad_norm": 3.0625, + "learning_rate": 7.70899700911096e-06, + "loss": 1.5021, + "step": 2985 + }, + { + "epoch": 0.8411872274581517, + "grad_norm": 2.71875, + "learning_rate": 7.707155301058488e-06, + "loss": 1.7806, + "step": 2990 + }, + { + "epoch": 0.8425938950625967, + "grad_norm": 3.328125, + "learning_rate": 7.705308004859471e-06, + "loss": 1.5206, + "step": 2995 + }, + { + "epoch": 0.8440005626670418, + "grad_norm": 3.40625, + "learning_rate": 7.703455123298512e-06, + "loss": 1.5729, + "step": 3000 + }, + { + "epoch": 0.8454072302714869, + "grad_norm": 3.75, + "learning_rate": 7.701596659168637e-06, + "loss": 1.6561, + "step": 3005 + }, + { + "epoch": 0.8468138978759319, + "grad_norm": 4.125, + "learning_rate": 7.699732615271283e-06, + "loss": 1.4492, + "step": 3010 + }, + { + "epoch": 0.8482205654803769, + "grad_norm": 2.671875, + "learning_rate": 7.697862994416301e-06, + "loss": 1.7029, + "step": 3015 + }, + { + "epoch": 0.849627233084822, + "grad_norm": 3.765625, + "learning_rate": 7.695987799421947e-06, + "loss": 1.4796, + "step": 3020 + }, + { + "epoch": 0.8510339006892671, + "grad_norm": 3.015625, + "learning_rate": 7.694107033114882e-06, + "loss": 1.6716, + "step": 3025 + }, + { + "epoch": 0.8524405682937122, + "grad_norm": 2.953125, + "learning_rate": 7.692220698330161e-06, + "loss": 1.6725, + "step": 3030 + }, + { + "epoch": 0.8538472358981573, + "grad_norm": 4.21875, + "learning_rate": 7.690328797911235e-06, + "loss": 1.7419, + "step": 3035 + }, + { + "epoch": 0.8552539035026023, + "grad_norm": 3.359375, + "learning_rate": 7.688431334709947e-06, + "loss": 1.4105, + "step": 3040 + }, + { + "epoch": 0.8566605711070474, + "grad_norm": 3.078125, + "learning_rate": 7.686528311586523e-06, + "loss": 1.2964, + "step": 3045 + }, + { + "epoch": 0.8580672387114925, + "grad_norm": 2.9375, + "learning_rate": 7.684619731409566e-06, + "loss": 1.4006, + "step": 3050 + }, + { + "epoch": 0.8594739063159376, + "grad_norm": 4.03125, + "learning_rate": 7.682705597056066e-06, + "loss": 1.6018, + "step": 3055 + }, + { + "epoch": 0.8608805739203826, + "grad_norm": 9.1875, + "learning_rate": 7.680785911411375e-06, + "loss": 1.8649, + "step": 3060 + }, + { + "epoch": 0.8622872415248277, + "grad_norm": 3.125, + "learning_rate": 7.678860677369218e-06, + "loss": 1.5029, + "step": 3065 + }, + { + "epoch": 0.8636939091292728, + "grad_norm": 2.765625, + "learning_rate": 7.676929897831684e-06, + "loss": 1.4425, + "step": 3070 + }, + { + "epoch": 0.8651005767337179, + "grad_norm": 3.265625, + "learning_rate": 7.674993575709218e-06, + "loss": 1.7859, + "step": 3075 + }, + { + "epoch": 0.8665072443381628, + "grad_norm": 2.4375, + "learning_rate": 7.673051713920624e-06, + "loss": 1.7595, + "step": 3080 + }, + { + "epoch": 0.8679139119426079, + "grad_norm": 4.3125, + "learning_rate": 7.671104315393053e-06, + "loss": 1.3791, + "step": 3085 + }, + { + "epoch": 0.869320579547053, + "grad_norm": 2.546875, + "learning_rate": 7.669151383062003e-06, + "loss": 1.8302, + "step": 3090 + }, + { + "epoch": 0.8707272471514981, + "grad_norm": 2.6875, + "learning_rate": 7.667192919871313e-06, + "loss": 1.6029, + "step": 3095 + }, + { + "epoch": 0.8721339147559432, + "grad_norm": 3.09375, + "learning_rate": 7.665228928773164e-06, + "loss": 1.7232, + "step": 3100 + }, + { + "epoch": 0.8735405823603882, + "grad_norm": 2.734375, + "learning_rate": 7.663259412728062e-06, + "loss": 1.6841, + "step": 3105 + }, + { + "epoch": 0.8749472499648333, + "grad_norm": 3.171875, + "learning_rate": 7.661284374704848e-06, + "loss": 1.5463, + "step": 3110 + }, + { + "epoch": 0.8763539175692784, + "grad_norm": 3.0, + "learning_rate": 7.659303817680682e-06, + "loss": 1.6332, + "step": 3115 + }, + { + "epoch": 0.8777605851737235, + "grad_norm": 2.53125, + "learning_rate": 7.657317744641047e-06, + "loss": 1.9633, + "step": 3120 + }, + { + "epoch": 0.8791672527781685, + "grad_norm": 2.21875, + "learning_rate": 7.655326158579739e-06, + "loss": 1.6777, + "step": 3125 + }, + { + "epoch": 0.8805739203826136, + "grad_norm": 2.4375, + "learning_rate": 7.65332906249886e-06, + "loss": 1.5168, + "step": 3130 + }, + { + "epoch": 0.8819805879870587, + "grad_norm": 3.0, + "learning_rate": 7.65132645940883e-06, + "loss": 1.8697, + "step": 3135 + }, + { + "epoch": 0.8833872555915038, + "grad_norm": 3.265625, + "learning_rate": 7.649318352328356e-06, + "loss": 1.3569, + "step": 3140 + }, + { + "epoch": 0.8847939231959489, + "grad_norm": 2.671875, + "learning_rate": 7.647304744284452e-06, + "loss": 1.6584, + "step": 3145 + }, + { + "epoch": 0.8862005908003938, + "grad_norm": 3.109375, + "learning_rate": 7.645285638312418e-06, + "loss": 1.6259, + "step": 3150 + }, + { + "epoch": 0.8876072584048389, + "grad_norm": 2.5625, + "learning_rate": 7.643261037455844e-06, + "loss": 1.7632, + "step": 3155 + }, + { + "epoch": 0.889013926009284, + "grad_norm": 3.0625, + "learning_rate": 7.641230944766605e-06, + "loss": 1.5457, + "step": 3160 + }, + { + "epoch": 0.890420593613729, + "grad_norm": 2.734375, + "learning_rate": 7.63919536330485e-06, + "loss": 1.599, + "step": 3165 + }, + { + "epoch": 0.8918272612181741, + "grad_norm": 3.421875, + "learning_rate": 7.637154296139003e-06, + "loss": 1.733, + "step": 3170 + }, + { + "epoch": 0.8932339288226192, + "grad_norm": 3.484375, + "learning_rate": 7.63510774634576e-06, + "loss": 1.6175, + "step": 3175 + }, + { + "epoch": 0.8946405964270643, + "grad_norm": 3.09375, + "learning_rate": 7.633055717010078e-06, + "loss": 1.7642, + "step": 3180 + }, + { + "epoch": 0.8960472640315094, + "grad_norm": 3.109375, + "learning_rate": 7.630998211225177e-06, + "loss": 1.5858, + "step": 3185 + }, + { + "epoch": 0.8974539316359544, + "grad_norm": 2.421875, + "learning_rate": 7.62893523209253e-06, + "loss": 1.4382, + "step": 3190 + }, + { + "epoch": 0.8988605992403995, + "grad_norm": 2.75, + "learning_rate": 7.62686678272186e-06, + "loss": 1.7159, + "step": 3195 + }, + { + "epoch": 0.9002672668448446, + "grad_norm": 4.5, + "learning_rate": 7.624792866231137e-06, + "loss": 1.4671, + "step": 3200 + }, + { + "epoch": 0.9016739344492897, + "grad_norm": 2.21875, + "learning_rate": 7.622713485746573e-06, + "loss": 1.6495, + "step": 3205 + }, + { + "epoch": 0.9030806020537347, + "grad_norm": 2.984375, + "learning_rate": 7.620628644402613e-06, + "loss": 1.4105, + "step": 3210 + }, + { + "epoch": 0.9044872696581798, + "grad_norm": 3.5, + "learning_rate": 7.618538345341938e-06, + "loss": 1.55, + "step": 3215 + }, + { + "epoch": 0.9058939372626248, + "grad_norm": 3.28125, + "learning_rate": 7.6164425917154545e-06, + "loss": 1.4803, + "step": 3220 + }, + { + "epoch": 0.9073006048670699, + "grad_norm": 3.359375, + "learning_rate": 7.614341386682289e-06, + "loss": 1.4704, + "step": 3225 + }, + { + "epoch": 0.908707272471515, + "grad_norm": 6.6875, + "learning_rate": 7.612234733409786e-06, + "loss": 1.7306, + "step": 3230 + }, + { + "epoch": 0.91011394007596, + "grad_norm": 4.25, + "learning_rate": 7.610122635073507e-06, + "loss": 1.7463, + "step": 3235 + }, + { + "epoch": 0.9115206076804051, + "grad_norm": 3.5, + "learning_rate": 7.608005094857213e-06, + "loss": 1.7264, + "step": 3240 + }, + { + "epoch": 0.9129272752848502, + "grad_norm": 3.4375, + "learning_rate": 7.60588211595288e-06, + "loss": 1.5772, + "step": 3245 + }, + { + "epoch": 0.9143339428892953, + "grad_norm": 3.359375, + "learning_rate": 7.603753701560669e-06, + "loss": 1.7315, + "step": 3250 + }, + { + "epoch": 0.9157406104937403, + "grad_norm": 2.328125, + "learning_rate": 7.6016198548889446e-06, + "loss": 1.5967, + "step": 3255 + }, + { + "epoch": 0.9171472780981854, + "grad_norm": 3.328125, + "learning_rate": 7.599480579154253e-06, + "loss": 1.5748, + "step": 3260 + }, + { + "epoch": 0.9185539457026305, + "grad_norm": 3.46875, + "learning_rate": 7.59733587758133e-06, + "loss": 1.6969, + "step": 3265 + }, + { + "epoch": 0.9199606133070756, + "grad_norm": 2.6875, + "learning_rate": 7.595185753403086e-06, + "loss": 1.3966, + "step": 3270 + }, + { + "epoch": 0.9213672809115206, + "grad_norm": 3.65625, + "learning_rate": 7.593030209860608e-06, + "loss": 1.8503, + "step": 3275 + }, + { + "epoch": 0.9227739485159657, + "grad_norm": 3.46875, + "learning_rate": 7.590869250203151e-06, + "loss": 1.4599, + "step": 3280 + }, + { + "epoch": 0.9241806161204107, + "grad_norm": 3.828125, + "learning_rate": 7.588702877688133e-06, + "loss": 1.7539, + "step": 3285 + }, + { + "epoch": 0.9255872837248558, + "grad_norm": 4.75, + "learning_rate": 7.586531095581135e-06, + "loss": 1.6305, + "step": 3290 + }, + { + "epoch": 0.9269939513293008, + "grad_norm": 3.4375, + "learning_rate": 7.584353907155886e-06, + "loss": 1.9388, + "step": 3295 + }, + { + "epoch": 0.9284006189337459, + "grad_norm": 2.671875, + "learning_rate": 7.5821713156942725e-06, + "loss": 1.6272, + "step": 3300 + }, + { + "epoch": 0.929807286538191, + "grad_norm": 3.75, + "learning_rate": 7.57998332448632e-06, + "loss": 1.6722, + "step": 3305 + }, + { + "epoch": 0.9312139541426361, + "grad_norm": 2.96875, + "learning_rate": 7.577789936830194e-06, + "loss": 1.533, + "step": 3310 + }, + { + "epoch": 0.9326206217470812, + "grad_norm": 3.6875, + "learning_rate": 7.575591156032198e-06, + "loss": 1.6827, + "step": 3315 + }, + { + "epoch": 0.9340272893515262, + "grad_norm": 2.53125, + "learning_rate": 7.573386985406761e-06, + "loss": 1.5372, + "step": 3320 + }, + { + "epoch": 0.9354339569559713, + "grad_norm": 2.75, + "learning_rate": 7.571177428276439e-06, + "loss": 1.749, + "step": 3325 + }, + { + "epoch": 0.9368406245604164, + "grad_norm": 3.71875, + "learning_rate": 7.568962487971905e-06, + "loss": 1.5339, + "step": 3330 + }, + { + "epoch": 0.9382472921648615, + "grad_norm": 3.390625, + "learning_rate": 7.56674216783195e-06, + "loss": 1.416, + "step": 3335 + }, + { + "epoch": 0.9396539597693065, + "grad_norm": 3.03125, + "learning_rate": 7.564516471203474e-06, + "loss": 1.6917, + "step": 3340 + }, + { + "epoch": 0.9410606273737516, + "grad_norm": 3.6875, + "learning_rate": 7.562285401441478e-06, + "loss": 1.6447, + "step": 3345 + }, + { + "epoch": 0.9424672949781967, + "grad_norm": 3.6875, + "learning_rate": 7.560048961909068e-06, + "loss": 1.7682, + "step": 3350 + }, + { + "epoch": 0.9438739625826417, + "grad_norm": 2.953125, + "learning_rate": 7.5578071559774384e-06, + "loss": 1.5933, + "step": 3355 + }, + { + "epoch": 0.9452806301870867, + "grad_norm": 3.5, + "learning_rate": 7.555559987025878e-06, + "loss": 1.7298, + "step": 3360 + }, + { + "epoch": 0.9466872977915318, + "grad_norm": 3.3125, + "learning_rate": 7.553307458441755e-06, + "loss": 1.7833, + "step": 3365 + }, + { + "epoch": 0.9480939653959769, + "grad_norm": 3.078125, + "learning_rate": 7.551049573620521e-06, + "loss": 1.7445, + "step": 3370 + }, + { + "epoch": 0.949500633000422, + "grad_norm": 3.734375, + "learning_rate": 7.5487863359656994e-06, + "loss": 1.3276, + "step": 3375 + }, + { + "epoch": 0.9509073006048671, + "grad_norm": 3.140625, + "learning_rate": 7.546517748888882e-06, + "loss": 1.7944, + "step": 3380 + }, + { + "epoch": 0.9523139682093121, + "grad_norm": 2.578125, + "learning_rate": 7.544243815809729e-06, + "loss": 1.6007, + "step": 3385 + }, + { + "epoch": 0.9537206358137572, + "grad_norm": 2.953125, + "learning_rate": 7.54196454015595e-06, + "loss": 1.6207, + "step": 3390 + }, + { + "epoch": 0.9551273034182023, + "grad_norm": 2.78125, + "learning_rate": 7.539679925363316e-06, + "loss": 1.534, + "step": 3395 + }, + { + "epoch": 0.9565339710226474, + "grad_norm": 4.40625, + "learning_rate": 7.5373899748756435e-06, + "loss": 1.5586, + "step": 3400 + }, + { + "epoch": 0.9579406386270924, + "grad_norm": 4.59375, + "learning_rate": 7.53509469214479e-06, + "loss": 1.3578, + "step": 3405 + }, + { + "epoch": 0.9593473062315375, + "grad_norm": 4.09375, + "learning_rate": 7.532794080630655e-06, + "loss": 1.7003, + "step": 3410 + }, + { + "epoch": 0.9607539738359826, + "grad_norm": 3.15625, + "learning_rate": 7.530488143801166e-06, + "loss": 1.6732, + "step": 3415 + }, + { + "epoch": 0.9621606414404277, + "grad_norm": 3.609375, + "learning_rate": 7.528176885132283e-06, + "loss": 1.675, + "step": 3420 + }, + { + "epoch": 0.9635673090448726, + "grad_norm": 2.90625, + "learning_rate": 7.525860308107983e-06, + "loss": 1.6125, + "step": 3425 + }, + { + "epoch": 0.9649739766493177, + "grad_norm": 2.875, + "learning_rate": 7.523538416220264e-06, + "loss": 1.621, + "step": 3430 + }, + { + "epoch": 0.9663806442537628, + "grad_norm": 3.640625, + "learning_rate": 7.52121121296913e-06, + "loss": 1.7444, + "step": 3435 + }, + { + "epoch": 0.9677873118582079, + "grad_norm": 3.140625, + "learning_rate": 7.518878701862599e-06, + "loss": 1.8158, + "step": 3440 + }, + { + "epoch": 0.969193979462653, + "grad_norm": 3.0, + "learning_rate": 7.5165408864166845e-06, + "loss": 1.6573, + "step": 3445 + }, + { + "epoch": 0.970600647067098, + "grad_norm": 2.984375, + "learning_rate": 7.514197770155398e-06, + "loss": 1.5889, + "step": 3450 + }, + { + "epoch": 0.9720073146715431, + "grad_norm": 3.0, + "learning_rate": 7.511849356610738e-06, + "loss": 1.3515, + "step": 3455 + }, + { + "epoch": 0.9734139822759882, + "grad_norm": 2.96875, + "learning_rate": 7.5094956493226955e-06, + "loss": 1.6675, + "step": 3460 + }, + { + "epoch": 0.9748206498804333, + "grad_norm": 3.09375, + "learning_rate": 7.507136651839233e-06, + "loss": 1.5557, + "step": 3465 + }, + { + "epoch": 0.9762273174848783, + "grad_norm": 3.0, + "learning_rate": 7.504772367716292e-06, + "loss": 1.4733, + "step": 3470 + }, + { + "epoch": 0.9776339850893234, + "grad_norm": 3.09375, + "learning_rate": 7.5024028005177814e-06, + "loss": 1.6102, + "step": 3475 + }, + { + "epoch": 0.9790406526937685, + "grad_norm": 2.890625, + "learning_rate": 7.500027953815577e-06, + "loss": 1.5866, + "step": 3480 + }, + { + "epoch": 0.9804473202982136, + "grad_norm": 3.3125, + "learning_rate": 7.497647831189506e-06, + "loss": 1.5003, + "step": 3485 + }, + { + "epoch": 0.9818539879026587, + "grad_norm": 2.78125, + "learning_rate": 7.495262436227356e-06, + "loss": 1.6196, + "step": 3490 + }, + { + "epoch": 0.9832606555071036, + "grad_norm": 3.828125, + "learning_rate": 7.492871772524859e-06, + "loss": 1.6242, + "step": 3495 + }, + { + "epoch": 0.9846673231115487, + "grad_norm": 4.6875, + "learning_rate": 7.490475843685686e-06, + "loss": 1.635, + "step": 3500 + }, + { + "epoch": 0.9860739907159938, + "grad_norm": 3.25, + "learning_rate": 7.488074653321452e-06, + "loss": 1.5958, + "step": 3505 + }, + { + "epoch": 0.9874806583204389, + "grad_norm": 2.8125, + "learning_rate": 7.485668205051696e-06, + "loss": 1.5429, + "step": 3510 + }, + { + "epoch": 0.9888873259248839, + "grad_norm": 2.59375, + "learning_rate": 7.4832565025038855e-06, + "loss": 1.8323, + "step": 3515 + }, + { + "epoch": 0.990293993529329, + "grad_norm": 2.921875, + "learning_rate": 7.480839549313409e-06, + "loss": 1.6328, + "step": 3520 + }, + { + "epoch": 0.9917006611337741, + "grad_norm": 2.453125, + "learning_rate": 7.478417349123569e-06, + "loss": 1.6957, + "step": 3525 + }, + { + "epoch": 0.9931073287382192, + "grad_norm": 4.28125, + "learning_rate": 7.475989905585578e-06, + "loss": 1.4479, + "step": 3530 + }, + { + "epoch": 0.9945139963426642, + "grad_norm": 2.640625, + "learning_rate": 7.473557222358551e-06, + "loss": 1.5085, + "step": 3535 + }, + { + "epoch": 0.9959206639471093, + "grad_norm": 4.375, + "learning_rate": 7.471119303109502e-06, + "loss": 1.5398, + "step": 3540 + }, + { + "epoch": 0.9973273315515544, + "grad_norm": 2.234375, + "learning_rate": 7.468676151513339e-06, + "loss": 1.503, + "step": 3545 + }, + { + "epoch": 0.9987339991559995, + "grad_norm": 3.796875, + "learning_rate": 7.4662277712528536e-06, + "loss": 1.6194, + "step": 3550 + }, + { + "epoch": 0.9998593332395554, + "eval_loss": 1.611290454864502, + "eval_runtime": 329.7515, + "eval_samples_per_second": 9.577, + "eval_steps_per_second": 4.788, + "step": 3554 + }, + { + "epoch": 1.0001406667604444, + "grad_norm": 4.03125, + "learning_rate": 7.463774166018723e-06, + "loss": 1.6976, + "step": 3555 + }, + { + "epoch": 1.0015473343648895, + "grad_norm": 3.25, + "learning_rate": 7.461315339509499e-06, + "loss": 1.6923, + "step": 3560 + }, + { + "epoch": 1.0029540019693346, + "grad_norm": 2.90625, + "learning_rate": 7.458851295431601e-06, + "loss": 1.5505, + "step": 3565 + }, + { + "epoch": 1.0043606695737797, + "grad_norm": 3.140625, + "learning_rate": 7.456382037499322e-06, + "loss": 1.5004, + "step": 3570 + }, + { + "epoch": 1.0057673371782248, + "grad_norm": 3.234375, + "learning_rate": 7.453907569434804e-06, + "loss": 1.5441, + "step": 3575 + }, + { + "epoch": 1.0071740047826698, + "grad_norm": 3.1875, + "learning_rate": 7.451427894968049e-06, + "loss": 1.4524, + "step": 3580 + }, + { + "epoch": 1.008580672387115, + "grad_norm": 3.15625, + "learning_rate": 7.448943017836903e-06, + "loss": 1.5271, + "step": 3585 + }, + { + "epoch": 1.00998733999156, + "grad_norm": 3.03125, + "learning_rate": 7.44645294178706e-06, + "loss": 1.5947, + "step": 3590 + }, + { + "epoch": 1.011394007596005, + "grad_norm": 2.171875, + "learning_rate": 7.443957670572046e-06, + "loss": 1.7221, + "step": 3595 + }, + { + "epoch": 1.0128006752004501, + "grad_norm": 4.1875, + "learning_rate": 7.4414572079532205e-06, + "loss": 1.538, + "step": 3600 + }, + { + "epoch": 1.0142073428048952, + "grad_norm": 3.296875, + "learning_rate": 7.438951557699767e-06, + "loss": 1.4715, + "step": 3605 + }, + { + "epoch": 1.0156140104093403, + "grad_norm": 3.171875, + "learning_rate": 7.436440723588688e-06, + "loss": 1.5015, + "step": 3610 + }, + { + "epoch": 1.0170206780137854, + "grad_norm": 3.640625, + "learning_rate": 7.433924709404806e-06, + "loss": 1.5526, + "step": 3615 + }, + { + "epoch": 1.0184273456182305, + "grad_norm": 3.9375, + "learning_rate": 7.4314035189407436e-06, + "loss": 1.2446, + "step": 3620 + }, + { + "epoch": 1.0198340132226755, + "grad_norm": 3.734375, + "learning_rate": 7.428877155996934e-06, + "loss": 1.6687, + "step": 3625 + }, + { + "epoch": 1.0212406808271206, + "grad_norm": 2.09375, + "learning_rate": 7.4263456243816e-06, + "loss": 1.3963, + "step": 3630 + }, + { + "epoch": 1.0226473484315657, + "grad_norm": 2.6875, + "learning_rate": 7.42380892791076e-06, + "loss": 1.606, + "step": 3635 + }, + { + "epoch": 1.0240540160360108, + "grad_norm": 2.734375, + "learning_rate": 7.421267070408218e-06, + "loss": 1.6143, + "step": 3640 + }, + { + "epoch": 1.0254606836404558, + "grad_norm": 2.96875, + "learning_rate": 7.418720055705556e-06, + "loss": 1.6649, + "step": 3645 + }, + { + "epoch": 1.026867351244901, + "grad_norm": 2.296875, + "learning_rate": 7.416167887642132e-06, + "loss": 1.6411, + "step": 3650 + }, + { + "epoch": 1.028274018849346, + "grad_norm": 4.28125, + "learning_rate": 7.413610570065069e-06, + "loss": 1.7296, + "step": 3655 + }, + { + "epoch": 1.029680686453791, + "grad_norm": 2.96875, + "learning_rate": 7.411048106829253e-06, + "loss": 1.6742, + "step": 3660 + }, + { + "epoch": 1.031087354058236, + "grad_norm": 3.40625, + "learning_rate": 7.408480501797333e-06, + "loss": 1.4126, + "step": 3665 + }, + { + "epoch": 1.032494021662681, + "grad_norm": 5.0625, + "learning_rate": 7.405907758839698e-06, + "loss": 1.6467, + "step": 3670 + }, + { + "epoch": 1.033900689267126, + "grad_norm": 4.46875, + "learning_rate": 7.403329881834489e-06, + "loss": 1.4774, + "step": 3675 + }, + { + "epoch": 1.0353073568715712, + "grad_norm": 3.140625, + "learning_rate": 7.400746874667586e-06, + "loss": 1.6924, + "step": 3680 + }, + { + "epoch": 1.0367140244760162, + "grad_norm": 3.375, + "learning_rate": 7.398158741232598e-06, + "loss": 1.635, + "step": 3685 + }, + { + "epoch": 1.0381206920804613, + "grad_norm": 5.84375, + "learning_rate": 7.395565485430866e-06, + "loss": 1.4753, + "step": 3690 + }, + { + "epoch": 1.0395273596849064, + "grad_norm": 3.484375, + "learning_rate": 7.392967111171448e-06, + "loss": 1.4399, + "step": 3695 + }, + { + "epoch": 1.0409340272893515, + "grad_norm": 3.234375, + "learning_rate": 7.390363622371122e-06, + "loss": 1.6359, + "step": 3700 + }, + { + "epoch": 1.0423406948937965, + "grad_norm": 3.03125, + "learning_rate": 7.387755022954373e-06, + "loss": 1.6028, + "step": 3705 + }, + { + "epoch": 1.0437473624982416, + "grad_norm": 2.359375, + "learning_rate": 7.385141316853388e-06, + "loss": 1.4053, + "step": 3710 + }, + { + "epoch": 1.0451540301026867, + "grad_norm": 2.875, + "learning_rate": 7.382522508008056e-06, + "loss": 1.5238, + "step": 3715 + }, + { + "epoch": 1.0465606977071318, + "grad_norm": 2.15625, + "learning_rate": 7.379898600365956e-06, + "loss": 1.7097, + "step": 3720 + }, + { + "epoch": 1.0479673653115769, + "grad_norm": 2.8125, + "learning_rate": 7.377269597882351e-06, + "loss": 1.5788, + "step": 3725 + }, + { + "epoch": 1.049374032916022, + "grad_norm": 2.3125, + "learning_rate": 7.374635504520186e-06, + "loss": 1.6983, + "step": 3730 + }, + { + "epoch": 1.050780700520467, + "grad_norm": 4.15625, + "learning_rate": 7.371996324250083e-06, + "loss": 1.5622, + "step": 3735 + }, + { + "epoch": 1.052187368124912, + "grad_norm": 3.5625, + "learning_rate": 7.369352061050324e-06, + "loss": 1.4207, + "step": 3740 + }, + { + "epoch": 1.0535940357293572, + "grad_norm": 2.3125, + "learning_rate": 7.366702718906859e-06, + "loss": 1.5624, + "step": 3745 + }, + { + "epoch": 1.0550007033338022, + "grad_norm": 3.75, + "learning_rate": 7.364048301813293e-06, + "loss": 1.3799, + "step": 3750 + }, + { + "epoch": 1.0564073709382473, + "grad_norm": 3.65625, + "learning_rate": 7.361388813770881e-06, + "loss": 1.781, + "step": 3755 + }, + { + "epoch": 1.0578140385426924, + "grad_norm": 3.96875, + "learning_rate": 7.35872425878852e-06, + "loss": 1.5601, + "step": 3760 + }, + { + "epoch": 1.0592207061471375, + "grad_norm": 2.71875, + "learning_rate": 7.356054640882747e-06, + "loss": 1.5782, + "step": 3765 + }, + { + "epoch": 1.0606273737515826, + "grad_norm": 3.40625, + "learning_rate": 7.35337996407773e-06, + "loss": 1.7065, + "step": 3770 + }, + { + "epoch": 1.0620340413560276, + "grad_norm": 3.5, + "learning_rate": 7.350700232405263e-06, + "loss": 1.7547, + "step": 3775 + }, + { + "epoch": 1.0634407089604727, + "grad_norm": 3.03125, + "learning_rate": 7.3480154499047585e-06, + "loss": 1.797, + "step": 3780 + }, + { + "epoch": 1.0648473765649178, + "grad_norm": 4.0625, + "learning_rate": 7.345325620623246e-06, + "loss": 1.6371, + "step": 3785 + }, + { + "epoch": 1.0662540441693629, + "grad_norm": 3.328125, + "learning_rate": 7.3426307486153575e-06, + "loss": 1.6329, + "step": 3790 + }, + { + "epoch": 1.067660711773808, + "grad_norm": 2.625, + "learning_rate": 7.339930837943331e-06, + "loss": 1.5983, + "step": 3795 + }, + { + "epoch": 1.0690673793782528, + "grad_norm": 3.03125, + "learning_rate": 7.337225892676997e-06, + "loss": 1.6337, + "step": 3800 + }, + { + "epoch": 1.070474046982698, + "grad_norm": 2.640625, + "learning_rate": 7.334515916893774e-06, + "loss": 1.6965, + "step": 3805 + }, + { + "epoch": 1.071880714587143, + "grad_norm": 3.421875, + "learning_rate": 7.3318009146786695e-06, + "loss": 1.5862, + "step": 3810 + }, + { + "epoch": 1.073287382191588, + "grad_norm": 3.25, + "learning_rate": 7.3290808901242595e-06, + "loss": 1.5898, + "step": 3815 + }, + { + "epoch": 1.0746940497960331, + "grad_norm": 3.359375, + "learning_rate": 7.326355847330698e-06, + "loss": 1.5154, + "step": 3820 + }, + { + "epoch": 1.0761007174004782, + "grad_norm": 2.9375, + "learning_rate": 7.323625790405698e-06, + "loss": 1.6193, + "step": 3825 + }, + { + "epoch": 1.0775073850049233, + "grad_norm": 2.65625, + "learning_rate": 7.320890723464535e-06, + "loss": 1.7274, + "step": 3830 + }, + { + "epoch": 1.0789140526093683, + "grad_norm": 3.765625, + "learning_rate": 7.3181506506300324e-06, + "loss": 1.418, + "step": 3835 + }, + { + "epoch": 1.0803207202138134, + "grad_norm": 3.140625, + "learning_rate": 7.315405576032563e-06, + "loss": 1.6552, + "step": 3840 + }, + { + "epoch": 1.0817273878182585, + "grad_norm": 4.65625, + "learning_rate": 7.3126555038100374e-06, + "loss": 1.5523, + "step": 3845 + }, + { + "epoch": 1.0831340554227036, + "grad_norm": 3.359375, + "learning_rate": 7.3099004381079e-06, + "loss": 1.4922, + "step": 3850 + }, + { + "epoch": 1.0845407230271487, + "grad_norm": 2.9375, + "learning_rate": 7.307140383079125e-06, + "loss": 1.5572, + "step": 3855 + }, + { + "epoch": 1.0859473906315937, + "grad_norm": 2.953125, + "learning_rate": 7.304375342884201e-06, + "loss": 1.4289, + "step": 3860 + }, + { + "epoch": 1.0873540582360388, + "grad_norm": 2.8125, + "learning_rate": 7.301605321691138e-06, + "loss": 1.7568, + "step": 3865 + }, + { + "epoch": 1.088760725840484, + "grad_norm": 3.125, + "learning_rate": 7.2988303236754515e-06, + "loss": 1.4492, + "step": 3870 + }, + { + "epoch": 1.090167393444929, + "grad_norm": 3.125, + "learning_rate": 7.296050353020156e-06, + "loss": 1.8061, + "step": 3875 + }, + { + "epoch": 1.091574061049374, + "grad_norm": 3.03125, + "learning_rate": 7.293265413915767e-06, + "loss": 1.775, + "step": 3880 + }, + { + "epoch": 1.0929807286538191, + "grad_norm": 2.828125, + "learning_rate": 7.290475510560288e-06, + "loss": 1.6663, + "step": 3885 + }, + { + "epoch": 1.0943873962582642, + "grad_norm": 2.75, + "learning_rate": 7.287680647159202e-06, + "loss": 1.6116, + "step": 3890 + }, + { + "epoch": 1.0957940638627093, + "grad_norm": 3.015625, + "learning_rate": 7.2848808279254745e-06, + "loss": 1.4655, + "step": 3895 + }, + { + "epoch": 1.0972007314671544, + "grad_norm": 2.71875, + "learning_rate": 7.282076057079537e-06, + "loss": 1.7702, + "step": 3900 + }, + { + "epoch": 1.0986073990715994, + "grad_norm": 3.171875, + "learning_rate": 7.2792663388492865e-06, + "loss": 1.4311, + "step": 3905 + }, + { + "epoch": 1.1000140666760445, + "grad_norm": 3.28125, + "learning_rate": 7.2764516774700775e-06, + "loss": 1.6944, + "step": 3910 + }, + { + "epoch": 1.1014207342804896, + "grad_norm": 3.015625, + "learning_rate": 7.273632077184716e-06, + "loss": 1.7071, + "step": 3915 + }, + { + "epoch": 1.1028274018849347, + "grad_norm": 2.609375, + "learning_rate": 7.270807542243453e-06, + "loss": 1.5593, + "step": 3920 + }, + { + "epoch": 1.1042340694893797, + "grad_norm": 2.765625, + "learning_rate": 7.2679780769039775e-06, + "loss": 1.5341, + "step": 3925 + }, + { + "epoch": 1.1056407370938248, + "grad_norm": 2.34375, + "learning_rate": 7.26514368543141e-06, + "loss": 1.8932, + "step": 3930 + }, + { + "epoch": 1.10704740469827, + "grad_norm": 3.203125, + "learning_rate": 7.262304372098299e-06, + "loss": 1.7494, + "step": 3935 + }, + { + "epoch": 1.108454072302715, + "grad_norm": 4.0625, + "learning_rate": 7.259460141184609e-06, + "loss": 1.6125, + "step": 3940 + }, + { + "epoch": 1.1098607399071598, + "grad_norm": 2.515625, + "learning_rate": 7.25661099697772e-06, + "loss": 1.7168, + "step": 3945 + }, + { + "epoch": 1.111267407511605, + "grad_norm": 2.734375, + "learning_rate": 7.253756943772416e-06, + "loss": 1.7773, + "step": 3950 + }, + { + "epoch": 1.11267407511605, + "grad_norm": 2.859375, + "learning_rate": 7.250897985870884e-06, + "loss": 1.6824, + "step": 3955 + }, + { + "epoch": 1.114080742720495, + "grad_norm": 2.625, + "learning_rate": 7.248034127582698e-06, + "loss": 1.6164, + "step": 3960 + }, + { + "epoch": 1.1154874103249401, + "grad_norm": 2.984375, + "learning_rate": 7.245165373224829e-06, + "loss": 1.6013, + "step": 3965 + }, + { + "epoch": 1.1168940779293852, + "grad_norm": 3.953125, + "learning_rate": 7.242291727121617e-06, + "loss": 1.6286, + "step": 3970 + }, + { + "epoch": 1.1183007455338303, + "grad_norm": 2.859375, + "learning_rate": 7.2394131936047845e-06, + "loss": 1.5001, + "step": 3975 + }, + { + "epoch": 1.1197074131382754, + "grad_norm": 3.109375, + "learning_rate": 7.236529777013416e-06, + "loss": 1.5944, + "step": 3980 + }, + { + "epoch": 1.1211140807427205, + "grad_norm": 3.140625, + "learning_rate": 7.233641481693959e-06, + "loss": 1.5331, + "step": 3985 + }, + { + "epoch": 1.1225207483471655, + "grad_norm": 3.140625, + "learning_rate": 7.230748312000216e-06, + "loss": 1.772, + "step": 3990 + }, + { + "epoch": 1.1239274159516106, + "grad_norm": 4.5625, + "learning_rate": 7.227850272293334e-06, + "loss": 1.5381, + "step": 3995 + }, + { + "epoch": 1.1253340835560557, + "grad_norm": 3.140625, + "learning_rate": 7.224947366941805e-06, + "loss": 1.7734, + "step": 4000 + }, + { + "epoch": 1.1267407511605008, + "grad_norm": 2.390625, + "learning_rate": 7.2220396003214525e-06, + "loss": 1.703, + "step": 4005 + }, + { + "epoch": 1.1281474187649458, + "grad_norm": 2.734375, + "learning_rate": 7.219126976815427e-06, + "loss": 1.3594, + "step": 4010 + }, + { + "epoch": 1.129554086369391, + "grad_norm": 2.90625, + "learning_rate": 7.216209500814205e-06, + "loss": 1.3872, + "step": 4015 + }, + { + "epoch": 1.130960753973836, + "grad_norm": 3.296875, + "learning_rate": 7.213287176715571e-06, + "loss": 1.5567, + "step": 4020 + }, + { + "epoch": 1.132367421578281, + "grad_norm": 2.703125, + "learning_rate": 7.210360008924625e-06, + "loss": 1.5977, + "step": 4025 + }, + { + "epoch": 1.1337740891827262, + "grad_norm": 3.375, + "learning_rate": 7.207428001853762e-06, + "loss": 1.7796, + "step": 4030 + }, + { + "epoch": 1.1351807567871712, + "grad_norm": 2.703125, + "learning_rate": 7.204491159922675e-06, + "loss": 1.7479, + "step": 4035 + }, + { + "epoch": 1.1365874243916163, + "grad_norm": 3.59375, + "learning_rate": 7.201549487558344e-06, + "loss": 1.5466, + "step": 4040 + }, + { + "epoch": 1.1379940919960614, + "grad_norm": 2.375, + "learning_rate": 7.198602989195029e-06, + "loss": 1.5066, + "step": 4045 + }, + { + "epoch": 1.1394007596005065, + "grad_norm": 4.78125, + "learning_rate": 7.19565166927427e-06, + "loss": 1.3287, + "step": 4050 + }, + { + "epoch": 1.1408074272049515, + "grad_norm": 3.34375, + "learning_rate": 7.192695532244867e-06, + "loss": 1.4833, + "step": 4055 + }, + { + "epoch": 1.1422140948093966, + "grad_norm": 2.59375, + "learning_rate": 7.1897345825628875e-06, + "loss": 1.5049, + "step": 4060 + }, + { + "epoch": 1.1436207624138417, + "grad_norm": 3.359375, + "learning_rate": 7.186768824691652e-06, + "loss": 1.7101, + "step": 4065 + }, + { + "epoch": 1.1450274300182868, + "grad_norm": 2.9375, + "learning_rate": 7.183798263101729e-06, + "loss": 1.6311, + "step": 4070 + }, + { + "epoch": 1.1464340976227319, + "grad_norm": 4.3125, + "learning_rate": 7.180822902270926e-06, + "loss": 1.2967, + "step": 4075 + }, + { + "epoch": 1.1478407652271767, + "grad_norm": 2.921875, + "learning_rate": 7.177842746684287e-06, + "loss": 1.3902, + "step": 4080 + }, + { + "epoch": 1.149247432831622, + "grad_norm": 2.796875, + "learning_rate": 7.174857800834083e-06, + "loss": 1.5712, + "step": 4085 + }, + { + "epoch": 1.1506541004360669, + "grad_norm": 3.546875, + "learning_rate": 7.171868069219804e-06, + "loss": 1.5564, + "step": 4090 + }, + { + "epoch": 1.152060768040512, + "grad_norm": 2.953125, + "learning_rate": 7.16887355634816e-06, + "loss": 1.435, + "step": 4095 + }, + { + "epoch": 1.153467435644957, + "grad_norm": 3.875, + "learning_rate": 7.16587426673306e-06, + "loss": 1.6016, + "step": 4100 + }, + { + "epoch": 1.154874103249402, + "grad_norm": 3.5625, + "learning_rate": 7.16287020489562e-06, + "loss": 1.6038, + "step": 4105 + }, + { + "epoch": 1.1562807708538472, + "grad_norm": 3.6875, + "learning_rate": 7.159861375364146e-06, + "loss": 1.5753, + "step": 4110 + }, + { + "epoch": 1.1576874384582923, + "grad_norm": 4.5, + "learning_rate": 7.156847782674132e-06, + "loss": 1.6043, + "step": 4115 + }, + { + "epoch": 1.1590941060627373, + "grad_norm": 2.9375, + "learning_rate": 7.153829431368252e-06, + "loss": 1.5679, + "step": 4120 + }, + { + "epoch": 1.1605007736671824, + "grad_norm": 3.3125, + "learning_rate": 7.150806325996354e-06, + "loss": 1.5646, + "step": 4125 + }, + { + "epoch": 1.1619074412716275, + "grad_norm": 3.078125, + "learning_rate": 7.147778471115449e-06, + "loss": 1.693, + "step": 4130 + }, + { + "epoch": 1.1633141088760726, + "grad_norm": 3.609375, + "learning_rate": 7.144745871289711e-06, + "loss": 1.6215, + "step": 4135 + }, + { + "epoch": 1.1647207764805176, + "grad_norm": 4.15625, + "learning_rate": 7.141708531090467e-06, + "loss": 1.5527, + "step": 4140 + }, + { + "epoch": 1.1661274440849627, + "grad_norm": 2.921875, + "learning_rate": 7.138666455096183e-06, + "loss": 1.8448, + "step": 4145 + }, + { + "epoch": 1.1675341116894078, + "grad_norm": 4.34375, + "learning_rate": 7.1356196478924734e-06, + "loss": 1.7495, + "step": 4150 + }, + { + "epoch": 1.1689407792938529, + "grad_norm": 3.625, + "learning_rate": 7.132568114072077e-06, + "loss": 1.4359, + "step": 4155 + }, + { + "epoch": 1.170347446898298, + "grad_norm": 4.0625, + "learning_rate": 7.12951185823486e-06, + "loss": 1.6285, + "step": 4160 + }, + { + "epoch": 1.171754114502743, + "grad_norm": 2.671875, + "learning_rate": 7.126450884987807e-06, + "loss": 1.7377, + "step": 4165 + }, + { + "epoch": 1.173160782107188, + "grad_norm": 2.609375, + "learning_rate": 7.123385198945012e-06, + "loss": 1.7262, + "step": 4170 + }, + { + "epoch": 1.1745674497116332, + "grad_norm": 3.375, + "learning_rate": 7.120314804727676e-06, + "loss": 1.5923, + "step": 4175 + }, + { + "epoch": 1.1759741173160783, + "grad_norm": 3.09375, + "learning_rate": 7.117239706964094e-06, + "loss": 1.7389, + "step": 4180 + }, + { + "epoch": 1.1773807849205233, + "grad_norm": 3.125, + "learning_rate": 7.114159910289652e-06, + "loss": 1.7112, + "step": 4185 + }, + { + "epoch": 1.1787874525249684, + "grad_norm": 3.21875, + "learning_rate": 7.111075419346821e-06, + "loss": 1.7166, + "step": 4190 + }, + { + "epoch": 1.1801941201294135, + "grad_norm": 2.859375, + "learning_rate": 7.107986238785145e-06, + "loss": 1.4257, + "step": 4195 + }, + { + "epoch": 1.1816007877338586, + "grad_norm": 3.421875, + "learning_rate": 7.10489237326124e-06, + "loss": 1.6122, + "step": 4200 + }, + { + "epoch": 1.1830074553383036, + "grad_norm": 9.8125, + "learning_rate": 7.101793827438781e-06, + "loss": 1.415, + "step": 4205 + }, + { + "epoch": 1.1844141229427487, + "grad_norm": 3.046875, + "learning_rate": 7.098690605988501e-06, + "loss": 1.5566, + "step": 4210 + }, + { + "epoch": 1.1858207905471936, + "grad_norm": 2.3125, + "learning_rate": 7.095582713588179e-06, + "loss": 1.3807, + "step": 4215 + }, + { + "epoch": 1.1872274581516389, + "grad_norm": 3.265625, + "learning_rate": 7.092470154922638e-06, + "loss": 1.7086, + "step": 4220 + }, + { + "epoch": 1.1886341257560837, + "grad_norm": 2.296875, + "learning_rate": 7.089352934683729e-06, + "loss": 1.5553, + "step": 4225 + }, + { + "epoch": 1.1900407933605288, + "grad_norm": 3.40625, + "learning_rate": 7.086231057570337e-06, + "loss": 1.3568, + "step": 4230 + }, + { + "epoch": 1.191447460964974, + "grad_norm": 3.765625, + "learning_rate": 7.083104528288361e-06, + "loss": 1.5711, + "step": 4235 + }, + { + "epoch": 1.192854128569419, + "grad_norm": 2.609375, + "learning_rate": 7.079973351550716e-06, + "loss": 1.5155, + "step": 4240 + }, + { + "epoch": 1.194260796173864, + "grad_norm": 3.578125, + "learning_rate": 7.076837532077321e-06, + "loss": 1.5077, + "step": 4245 + }, + { + "epoch": 1.1956674637783091, + "grad_norm": 3.15625, + "learning_rate": 7.073697074595095e-06, + "loss": 1.5776, + "step": 4250 + }, + { + "epoch": 1.1970741313827542, + "grad_norm": 4.25, + "learning_rate": 7.070551983837945e-06, + "loss": 1.5348, + "step": 4255 + }, + { + "epoch": 1.1984807989871993, + "grad_norm": 2.90625, + "learning_rate": 7.067402264546766e-06, + "loss": 1.7168, + "step": 4260 + }, + { + "epoch": 1.1998874665916444, + "grad_norm": 3.875, + "learning_rate": 7.064247921469429e-06, + "loss": 1.6378, + "step": 4265 + }, + { + "epoch": 1.2012941341960894, + "grad_norm": 2.765625, + "learning_rate": 7.061088959360772e-06, + "loss": 1.4526, + "step": 4270 + }, + { + "epoch": 1.2027008018005345, + "grad_norm": 3.609375, + "learning_rate": 7.0579253829826e-06, + "loss": 1.6888, + "step": 4275 + }, + { + "epoch": 1.2041074694049796, + "grad_norm": 4.0625, + "learning_rate": 7.05475719710367e-06, + "loss": 1.6771, + "step": 4280 + }, + { + "epoch": 1.2055141370094247, + "grad_norm": 3.125, + "learning_rate": 7.051584406499691e-06, + "loss": 1.7519, + "step": 4285 + }, + { + "epoch": 1.2069208046138697, + "grad_norm": 2.703125, + "learning_rate": 7.048407015953309e-06, + "loss": 1.6397, + "step": 4290 + }, + { + "epoch": 1.2083274722183148, + "grad_norm": 4.75, + "learning_rate": 7.045225030254107e-06, + "loss": 1.6319, + "step": 4295 + }, + { + "epoch": 1.20973413982276, + "grad_norm": 3.296875, + "learning_rate": 7.042038454198593e-06, + "loss": 1.5327, + "step": 4300 + }, + { + "epoch": 1.211140807427205, + "grad_norm": 4.8125, + "learning_rate": 7.038847292590196e-06, + "loss": 1.4904, + "step": 4305 + }, + { + "epoch": 1.21254747503165, + "grad_norm": 3.78125, + "learning_rate": 7.0356515502392555e-06, + "loss": 1.4568, + "step": 4310 + }, + { + "epoch": 1.2139541426360951, + "grad_norm": 3.625, + "learning_rate": 7.032451231963016e-06, + "loss": 1.5869, + "step": 4315 + }, + { + "epoch": 1.2153608102405402, + "grad_norm": 3.78125, + "learning_rate": 7.0292463425856235e-06, + "loss": 1.6826, + "step": 4320 + }, + { + "epoch": 1.2167674778449853, + "grad_norm": 3.328125, + "learning_rate": 7.026036886938108e-06, + "loss": 1.5787, + "step": 4325 + }, + { + "epoch": 1.2181741454494304, + "grad_norm": 4.34375, + "learning_rate": 7.022822869858389e-06, + "loss": 1.575, + "step": 4330 + }, + { + "epoch": 1.2195808130538754, + "grad_norm": 2.703125, + "learning_rate": 7.0196042961912575e-06, + "loss": 1.5395, + "step": 4335 + }, + { + "epoch": 1.2209874806583205, + "grad_norm": 2.421875, + "learning_rate": 7.016381170788375e-06, + "loss": 1.6192, + "step": 4340 + }, + { + "epoch": 1.2223941482627656, + "grad_norm": 2.84375, + "learning_rate": 7.013153498508263e-06, + "loss": 1.6263, + "step": 4345 + }, + { + "epoch": 1.2238008158672105, + "grad_norm": 4.34375, + "learning_rate": 7.009921284216299e-06, + "loss": 1.4436, + "step": 4350 + }, + { + "epoch": 1.2252074834716558, + "grad_norm": 3.4375, + "learning_rate": 7.006684532784707e-06, + "loss": 1.425, + "step": 4355 + }, + { + "epoch": 1.2266141510761006, + "grad_norm": 4.375, + "learning_rate": 7.003443249092547e-06, + "loss": 1.3797, + "step": 4360 + }, + { + "epoch": 1.228020818680546, + "grad_norm": 6.65625, + "learning_rate": 7.000197438025715e-06, + "loss": 1.7619, + "step": 4365 + }, + { + "epoch": 1.2294274862849908, + "grad_norm": 2.75, + "learning_rate": 6.9969471044769275e-06, + "loss": 1.6214, + "step": 4370 + }, + { + "epoch": 1.2308341538894358, + "grad_norm": 2.578125, + "learning_rate": 6.993692253345722e-06, + "loss": 1.7026, + "step": 4375 + }, + { + "epoch": 1.232240821493881, + "grad_norm": 3.140625, + "learning_rate": 6.990432889538444e-06, + "loss": 1.5261, + "step": 4380 + }, + { + "epoch": 1.233647489098326, + "grad_norm": 3.78125, + "learning_rate": 6.98716901796824e-06, + "loss": 1.6586, + "step": 4385 + }, + { + "epoch": 1.235054156702771, + "grad_norm": 3.65625, + "learning_rate": 6.983900643555056e-06, + "loss": 1.5024, + "step": 4390 + }, + { + "epoch": 1.2364608243072162, + "grad_norm": 3.875, + "learning_rate": 6.980627771225618e-06, + "loss": 1.5133, + "step": 4395 + }, + { + "epoch": 1.2378674919116612, + "grad_norm": 2.6875, + "learning_rate": 6.977350405913442e-06, + "loss": 1.4485, + "step": 4400 + }, + { + "epoch": 1.2392741595161063, + "grad_norm": 3.46875, + "learning_rate": 6.974068552558806e-06, + "loss": 1.6037, + "step": 4405 + }, + { + "epoch": 1.2406808271205514, + "grad_norm": 2.640625, + "learning_rate": 6.970782216108764e-06, + "loss": 1.833, + "step": 4410 + }, + { + "epoch": 1.2420874947249965, + "grad_norm": 2.890625, + "learning_rate": 6.967491401517118e-06, + "loss": 1.6397, + "step": 4415 + }, + { + "epoch": 1.2434941623294415, + "grad_norm": 2.984375, + "learning_rate": 6.964196113744427e-06, + "loss": 1.6655, + "step": 4420 + }, + { + "epoch": 1.2449008299338866, + "grad_norm": 2.46875, + "learning_rate": 6.960896357757989e-06, + "loss": 1.588, + "step": 4425 + }, + { + "epoch": 1.2463074975383317, + "grad_norm": 3.03125, + "learning_rate": 6.957592138531841e-06, + "loss": 1.7372, + "step": 4430 + }, + { + "epoch": 1.2477141651427768, + "grad_norm": 3.375, + "learning_rate": 6.954283461046744e-06, + "loss": 1.459, + "step": 4435 + }, + { + "epoch": 1.2491208327472219, + "grad_norm": 2.828125, + "learning_rate": 6.950970330290182e-06, + "loss": 1.5329, + "step": 4440 + }, + { + "epoch": 1.250527500351667, + "grad_norm": 4.0, + "learning_rate": 6.947652751256351e-06, + "loss": 1.6937, + "step": 4445 + }, + { + "epoch": 1.251934167956112, + "grad_norm": 3.640625, + "learning_rate": 6.944330728946153e-06, + "loss": 1.5424, + "step": 4450 + }, + { + "epoch": 1.253340835560557, + "grad_norm": 2.765625, + "learning_rate": 6.941004268367185e-06, + "loss": 1.4694, + "step": 4455 + }, + { + "epoch": 1.2547475031650022, + "grad_norm": 3.140625, + "learning_rate": 6.937673374533738e-06, + "loss": 1.5039, + "step": 4460 + }, + { + "epoch": 1.2561541707694472, + "grad_norm": 2.71875, + "learning_rate": 6.934338052466785e-06, + "loss": 1.5262, + "step": 4465 + }, + { + "epoch": 1.2575608383738923, + "grad_norm": 4.3125, + "learning_rate": 6.93099830719397e-06, + "loss": 1.5302, + "step": 4470 + }, + { + "epoch": 1.2589675059783374, + "grad_norm": 3.15625, + "learning_rate": 6.92765414374961e-06, + "loss": 1.6213, + "step": 4475 + }, + { + "epoch": 1.2603741735827825, + "grad_norm": 4.25, + "learning_rate": 6.924305567174678e-06, + "loss": 1.4612, + "step": 4480 + }, + { + "epoch": 1.2617808411872273, + "grad_norm": 3.75, + "learning_rate": 6.920952582516802e-06, + "loss": 1.5592, + "step": 4485 + }, + { + "epoch": 1.2631875087916726, + "grad_norm": 5.59375, + "learning_rate": 6.917595194830253e-06, + "loss": 1.709, + "step": 4490 + }, + { + "epoch": 1.2645941763961175, + "grad_norm": 3.3125, + "learning_rate": 6.91423340917594e-06, + "loss": 1.4438, + "step": 4495 + }, + { + "epoch": 1.2660008440005628, + "grad_norm": 2.859375, + "learning_rate": 6.9108672306214e-06, + "loss": 1.9044, + "step": 4500 + }, + { + "epoch": 1.2674075116050076, + "grad_norm": 2.671875, + "learning_rate": 6.907496664240796e-06, + "loss": 1.6878, + "step": 4505 + }, + { + "epoch": 1.268814179209453, + "grad_norm": 3.84375, + "learning_rate": 6.9041217151149e-06, + "loss": 1.529, + "step": 4510 + }, + { + "epoch": 1.2702208468138978, + "grad_norm": 2.984375, + "learning_rate": 6.900742388331091e-06, + "loss": 1.5458, + "step": 4515 + }, + { + "epoch": 1.2716275144183429, + "grad_norm": 3.5, + "learning_rate": 6.897358688983351e-06, + "loss": 1.5387, + "step": 4520 + }, + { + "epoch": 1.273034182022788, + "grad_norm": 3.140625, + "learning_rate": 6.893970622172251e-06, + "loss": 1.6777, + "step": 4525 + }, + { + "epoch": 1.274440849627233, + "grad_norm": 3.984375, + "learning_rate": 6.890578193004944e-06, + "loss": 1.654, + "step": 4530 + }, + { + "epoch": 1.275847517231678, + "grad_norm": 2.890625, + "learning_rate": 6.88718140659516e-06, + "loss": 1.7976, + "step": 4535 + }, + { + "epoch": 1.2772541848361232, + "grad_norm": 2.625, + "learning_rate": 6.883780268063198e-06, + "loss": 1.4941, + "step": 4540 + }, + { + "epoch": 1.2786608524405683, + "grad_norm": 3.453125, + "learning_rate": 6.880374782535915e-06, + "loss": 1.6671, + "step": 4545 + }, + { + "epoch": 1.2800675200450133, + "grad_norm": 3.953125, + "learning_rate": 6.8769649551467235e-06, + "loss": 1.3581, + "step": 4550 + }, + { + "epoch": 1.2814741876494584, + "grad_norm": 2.3125, + "learning_rate": 6.87355079103558e-06, + "loss": 1.4248, + "step": 4555 + }, + { + "epoch": 1.2828808552539035, + "grad_norm": 2.25, + "learning_rate": 6.8701322953489755e-06, + "loss": 1.7387, + "step": 4560 + }, + { + "epoch": 1.2842875228583486, + "grad_norm": 2.703125, + "learning_rate": 6.866709473239932e-06, + "loss": 1.6799, + "step": 4565 + }, + { + "epoch": 1.2856941904627937, + "grad_norm": 3.5625, + "learning_rate": 6.8632823298679985e-06, + "loss": 1.492, + "step": 4570 + }, + { + "epoch": 1.2871008580672387, + "grad_norm": 3.5, + "learning_rate": 6.859850870399229e-06, + "loss": 1.7618, + "step": 4575 + }, + { + "epoch": 1.2885075256716838, + "grad_norm": 3.40625, + "learning_rate": 6.856415100006188e-06, + "loss": 1.6899, + "step": 4580 + }, + { + "epoch": 1.2899141932761289, + "grad_norm": 4.46875, + "learning_rate": 6.852975023867939e-06, + "loss": 1.3663, + "step": 4585 + }, + { + "epoch": 1.291320860880574, + "grad_norm": 3.109375, + "learning_rate": 6.849530647170033e-06, + "loss": 1.5616, + "step": 4590 + }, + { + "epoch": 1.292727528485019, + "grad_norm": 3.1875, + "learning_rate": 6.846081975104507e-06, + "loss": 1.8242, + "step": 4595 + }, + { + "epoch": 1.2941341960894641, + "grad_norm": 3.109375, + "learning_rate": 6.842629012869872e-06, + "loss": 1.7529, + "step": 4600 + }, + { + "epoch": 1.2955408636939092, + "grad_norm": 2.734375, + "learning_rate": 6.839171765671104e-06, + "loss": 1.6587, + "step": 4605 + }, + { + "epoch": 1.2969475312983543, + "grad_norm": 3.34375, + "learning_rate": 6.835710238719638e-06, + "loss": 1.7018, + "step": 4610 + }, + { + "epoch": 1.2983541989027994, + "grad_norm": 2.734375, + "learning_rate": 6.832244437233364e-06, + "loss": 1.6016, + "step": 4615 + }, + { + "epoch": 1.2997608665072442, + "grad_norm": 3.578125, + "learning_rate": 6.828774366436613e-06, + "loss": 1.8004, + "step": 4620 + }, + { + "epoch": 1.3011675341116895, + "grad_norm": 2.890625, + "learning_rate": 6.82530003156015e-06, + "loss": 1.5459, + "step": 4625 + }, + { + "epoch": 1.3025742017161344, + "grad_norm": 3.125, + "learning_rate": 6.82182143784117e-06, + "loss": 1.3069, + "step": 4630 + }, + { + "epoch": 1.3039808693205797, + "grad_norm": 3.40625, + "learning_rate": 6.818338590523288e-06, + "loss": 1.7134, + "step": 4635 + }, + { + "epoch": 1.3053875369250245, + "grad_norm": 2.421875, + "learning_rate": 6.8148514948565275e-06, + "loss": 1.3821, + "step": 4640 + }, + { + "epoch": 1.3067942045294698, + "grad_norm": 4.40625, + "learning_rate": 6.81136015609732e-06, + "loss": 1.5428, + "step": 4645 + }, + { + "epoch": 1.3082008721339147, + "grad_norm": 2.1875, + "learning_rate": 6.8078645795084925e-06, + "loss": 1.8607, + "step": 4650 + }, + { + "epoch": 1.3096075397383597, + "grad_norm": 6.96875, + "learning_rate": 6.804364770359257e-06, + "loss": 1.6046, + "step": 4655 + }, + { + "epoch": 1.3110142073428048, + "grad_norm": 2.921875, + "learning_rate": 6.8008607339252075e-06, + "loss": 1.6103, + "step": 4660 + }, + { + "epoch": 1.31242087494725, + "grad_norm": 3.546875, + "learning_rate": 6.797352475488311e-06, + "loss": 1.3723, + "step": 4665 + }, + { + "epoch": 1.313827542551695, + "grad_norm": 3.21875, + "learning_rate": 6.7938400003368975e-06, + "loss": 1.6436, + "step": 4670 + }, + { + "epoch": 1.31523421015614, + "grad_norm": 2.65625, + "learning_rate": 6.790323313765654e-06, + "loss": 1.4495, + "step": 4675 + }, + { + "epoch": 1.3166408777605851, + "grad_norm": 3.296875, + "learning_rate": 6.786802421075615e-06, + "loss": 1.7251, + "step": 4680 + }, + { + "epoch": 1.3180475453650302, + "grad_norm": 3.265625, + "learning_rate": 6.783277327574156e-06, + "loss": 1.6689, + "step": 4685 + }, + { + "epoch": 1.3194542129694753, + "grad_norm": 4.0, + "learning_rate": 6.779748038574986e-06, + "loss": 1.821, + "step": 4690 + }, + { + "epoch": 1.3208608805739204, + "grad_norm": 3.59375, + "learning_rate": 6.776214559398134e-06, + "loss": 1.7114, + "step": 4695 + }, + { + "epoch": 1.3222675481783654, + "grad_norm": 2.953125, + "learning_rate": 6.772676895369951e-06, + "loss": 1.6903, + "step": 4700 + }, + { + "epoch": 1.3236742157828105, + "grad_norm": 4.15625, + "learning_rate": 6.769135051823092e-06, + "loss": 1.5268, + "step": 4705 + }, + { + "epoch": 1.3250808833872556, + "grad_norm": 2.78125, + "learning_rate": 6.7655890340965125e-06, + "loss": 1.8307, + "step": 4710 + }, + { + "epoch": 1.3264875509917007, + "grad_norm": 2.84375, + "learning_rate": 6.762038847535461e-06, + "loss": 1.6079, + "step": 4715 + }, + { + "epoch": 1.3278942185961458, + "grad_norm": 3.078125, + "learning_rate": 6.758484497491473e-06, + "loss": 1.5382, + "step": 4720 + }, + { + "epoch": 1.3293008862005908, + "grad_norm": 4.46875, + "learning_rate": 6.754925989322353e-06, + "loss": 1.5711, + "step": 4725 + }, + { + "epoch": 1.330707553805036, + "grad_norm": 3.640625, + "learning_rate": 6.751363328392182e-06, + "loss": 1.7392, + "step": 4730 + }, + { + "epoch": 1.332114221409481, + "grad_norm": 3.265625, + "learning_rate": 6.747796520071293e-06, + "loss": 1.4317, + "step": 4735 + }, + { + "epoch": 1.333520889013926, + "grad_norm": 3.046875, + "learning_rate": 6.744225569736276e-06, + "loss": 1.7177, + "step": 4740 + }, + { + "epoch": 1.3349275566183711, + "grad_norm": 1.96875, + "learning_rate": 6.740650482769963e-06, + "loss": 1.7749, + "step": 4745 + }, + { + "epoch": 1.3363342242228162, + "grad_norm": 2.6875, + "learning_rate": 6.737071264561421e-06, + "loss": 1.6097, + "step": 4750 + }, + { + "epoch": 1.337740891827261, + "grad_norm": 2.9375, + "learning_rate": 6.733487920505945e-06, + "loss": 1.8134, + "step": 4755 + }, + { + "epoch": 1.3391475594317064, + "grad_norm": 3.484375, + "learning_rate": 6.729900456005049e-06, + "loss": 1.7359, + "step": 4760 + }, + { + "epoch": 1.3405542270361512, + "grad_norm": 3.140625, + "learning_rate": 6.7263088764664575e-06, + "loss": 1.5704, + "step": 4765 + }, + { + "epoch": 1.3419608946405965, + "grad_norm": 4.09375, + "learning_rate": 6.7227131873041e-06, + "loss": 1.4039, + "step": 4770 + }, + { + "epoch": 1.3433675622450414, + "grad_norm": 3.4375, + "learning_rate": 6.719113393938099e-06, + "loss": 1.5993, + "step": 4775 + }, + { + "epoch": 1.3447742298494867, + "grad_norm": 3.171875, + "learning_rate": 6.715509501794763e-06, + "loss": 1.7964, + "step": 4780 + }, + { + "epoch": 1.3461808974539315, + "grad_norm": 2.96875, + "learning_rate": 6.711901516306583e-06, + "loss": 1.4551, + "step": 4785 + }, + { + "epoch": 1.3475875650583766, + "grad_norm": 3.1875, + "learning_rate": 6.708289442912216e-06, + "loss": 1.4375, + "step": 4790 + }, + { + "epoch": 1.3489942326628217, + "grad_norm": 3.265625, + "learning_rate": 6.7046732870564816e-06, + "loss": 1.4929, + "step": 4795 + }, + { + "epoch": 1.3504009002672668, + "grad_norm": 4.5, + "learning_rate": 6.7010530541903565e-06, + "loss": 1.5064, + "step": 4800 + }, + { + "epoch": 1.3518075678717119, + "grad_norm": 2.671875, + "learning_rate": 6.697428749770958e-06, + "loss": 1.6775, + "step": 4805 + }, + { + "epoch": 1.353214235476157, + "grad_norm": 2.84375, + "learning_rate": 6.693800379261546e-06, + "loss": 1.6935, + "step": 4810 + }, + { + "epoch": 1.354620903080602, + "grad_norm": 3.390625, + "learning_rate": 6.690167948131506e-06, + "loss": 1.5423, + "step": 4815 + }, + { + "epoch": 1.356027570685047, + "grad_norm": 3.28125, + "learning_rate": 6.686531461856345e-06, + "loss": 1.644, + "step": 4820 + }, + { + "epoch": 1.3574342382894922, + "grad_norm": 3.625, + "learning_rate": 6.6828909259176865e-06, + "loss": 1.7919, + "step": 4825 + }, + { + "epoch": 1.3588409058939372, + "grad_norm": 3.5, + "learning_rate": 6.6792463458032534e-06, + "loss": 1.5362, + "step": 4830 + }, + { + "epoch": 1.3602475734983823, + "grad_norm": 4.46875, + "learning_rate": 6.675597727006866e-06, + "loss": 1.7212, + "step": 4835 + }, + { + "epoch": 1.3616542411028274, + "grad_norm": 2.421875, + "learning_rate": 6.671945075028434e-06, + "loss": 1.6932, + "step": 4840 + }, + { + "epoch": 1.3630609087072725, + "grad_norm": 2.671875, + "learning_rate": 6.668288395373946e-06, + "loss": 1.594, + "step": 4845 + }, + { + "epoch": 1.3644675763117176, + "grad_norm": 2.578125, + "learning_rate": 6.664627693555462e-06, + "loss": 1.8579, + "step": 4850 + }, + { + "epoch": 1.3658742439161626, + "grad_norm": 4.09375, + "learning_rate": 6.660962975091104e-06, + "loss": 1.6345, + "step": 4855 + }, + { + "epoch": 1.3672809115206077, + "grad_norm": 2.703125, + "learning_rate": 6.657294245505051e-06, + "loss": 1.6163, + "step": 4860 + }, + { + "epoch": 1.3686875791250528, + "grad_norm": 5.0625, + "learning_rate": 6.653621510327525e-06, + "loss": 1.6408, + "step": 4865 + }, + { + "epoch": 1.3700942467294979, + "grad_norm": 3.578125, + "learning_rate": 6.64994477509479e-06, + "loss": 1.7848, + "step": 4870 + }, + { + "epoch": 1.371500914333943, + "grad_norm": 6.28125, + "learning_rate": 6.646264045349134e-06, + "loss": 1.6636, + "step": 4875 + }, + { + "epoch": 1.372907581938388, + "grad_norm": 3.515625, + "learning_rate": 6.642579326638872e-06, + "loss": 1.4139, + "step": 4880 + }, + { + "epoch": 1.374314249542833, + "grad_norm": 4.125, + "learning_rate": 6.638890624518332e-06, + "loss": 1.5858, + "step": 4885 + }, + { + "epoch": 1.3757209171472782, + "grad_norm": 3.4375, + "learning_rate": 6.63519794454784e-06, + "loss": 1.5255, + "step": 4890 + }, + { + "epoch": 1.3771275847517233, + "grad_norm": 3.03125, + "learning_rate": 6.631501292293725e-06, + "loss": 1.7696, + "step": 4895 + }, + { + "epoch": 1.378534252356168, + "grad_norm": 3.28125, + "learning_rate": 6.627800673328302e-06, + "loss": 1.6701, + "step": 4900 + }, + { + "epoch": 1.3799409199606134, + "grad_norm": 3.015625, + "learning_rate": 6.624096093229863e-06, + "loss": 1.6558, + "step": 4905 + }, + { + "epoch": 1.3813475875650583, + "grad_norm": 5.09375, + "learning_rate": 6.620387557582672e-06, + "loss": 1.531, + "step": 4910 + }, + { + "epoch": 1.3827542551695036, + "grad_norm": 3.4375, + "learning_rate": 6.616675071976958e-06, + "loss": 1.4897, + "step": 4915 + }, + { + "epoch": 1.3841609227739484, + "grad_norm": 3.265625, + "learning_rate": 6.612958642008904e-06, + "loss": 1.6086, + "step": 4920 + }, + { + "epoch": 1.3855675903783937, + "grad_norm": 3.65625, + "learning_rate": 6.609238273280633e-06, + "loss": 1.496, + "step": 4925 + }, + { + "epoch": 1.3869742579828386, + "grad_norm": 3.203125, + "learning_rate": 6.605513971400212e-06, + "loss": 1.7276, + "step": 4930 + }, + { + "epoch": 1.3883809255872837, + "grad_norm": 4.46875, + "learning_rate": 6.601785741981634e-06, + "loss": 1.634, + "step": 4935 + }, + { + "epoch": 1.3897875931917287, + "grad_norm": 3.796875, + "learning_rate": 6.5980535906448114e-06, + "loss": 1.5213, + "step": 4940 + }, + { + "epoch": 1.3911942607961738, + "grad_norm": 4.96875, + "learning_rate": 6.594317523015571e-06, + "loss": 1.6396, + "step": 4945 + }, + { + "epoch": 1.3926009284006189, + "grad_norm": 3.40625, + "learning_rate": 6.590577544725642e-06, + "loss": 1.4338, + "step": 4950 + }, + { + "epoch": 1.394007596005064, + "grad_norm": 2.59375, + "learning_rate": 6.586833661412646e-06, + "loss": 1.8134, + "step": 4955 + }, + { + "epoch": 1.395414263609509, + "grad_norm": 2.859375, + "learning_rate": 6.583085878720095e-06, + "loss": 1.5643, + "step": 4960 + }, + { + "epoch": 1.3968209312139541, + "grad_norm": 2.59375, + "learning_rate": 6.579334202297376e-06, + "loss": 1.5369, + "step": 4965 + }, + { + "epoch": 1.3982275988183992, + "grad_norm": 3.171875, + "learning_rate": 6.575578637799747e-06, + "loss": 1.6658, + "step": 4970 + }, + { + "epoch": 1.3996342664228443, + "grad_norm": 2.109375, + "learning_rate": 6.5718191908883265e-06, + "loss": 1.7028, + "step": 4975 + }, + { + "epoch": 1.4010409340272894, + "grad_norm": 3.28125, + "learning_rate": 6.568055867230086e-06, + "loss": 1.4661, + "step": 4980 + }, + { + "epoch": 1.4024476016317344, + "grad_norm": 3.890625, + "learning_rate": 6.564288672497838e-06, + "loss": 1.4148, + "step": 4985 + }, + { + "epoch": 1.4038542692361795, + "grad_norm": 3.15625, + "learning_rate": 6.560517612370232e-06, + "loss": 1.4908, + "step": 4990 + }, + { + "epoch": 1.4052609368406246, + "grad_norm": 3.5, + "learning_rate": 6.556742692531747e-06, + "loss": 1.7999, + "step": 4995 + }, + { + "epoch": 1.4066676044450697, + "grad_norm": 3.25, + "learning_rate": 6.552963918672675e-06, + "loss": 1.5283, + "step": 5000 + }, + { + "epoch": 1.4080742720495147, + "grad_norm": 3.53125, + "learning_rate": 6.549181296489121e-06, + "loss": 1.6524, + "step": 5005 + }, + { + "epoch": 1.4094809396539598, + "grad_norm": 2.59375, + "learning_rate": 6.545394831682989e-06, + "loss": 1.5902, + "step": 5010 + }, + { + "epoch": 1.410887607258405, + "grad_norm": 3.015625, + "learning_rate": 6.541604529961978e-06, + "loss": 1.6948, + "step": 5015 + }, + { + "epoch": 1.41229427486285, + "grad_norm": 2.84375, + "learning_rate": 6.537810397039568e-06, + "loss": 1.4649, + "step": 5020 + }, + { + "epoch": 1.413700942467295, + "grad_norm": 4.0, + "learning_rate": 6.534012438635015e-06, + "loss": 1.5802, + "step": 5025 + }, + { + "epoch": 1.4151076100717401, + "grad_norm": 3.296875, + "learning_rate": 6.530210660473341e-06, + "loss": 1.6675, + "step": 5030 + }, + { + "epoch": 1.416514277676185, + "grad_norm": 2.921875, + "learning_rate": 6.526405068285329e-06, + "loss": 1.7445, + "step": 5035 + }, + { + "epoch": 1.4179209452806303, + "grad_norm": 2.546875, + "learning_rate": 6.522595667807506e-06, + "loss": 1.6208, + "step": 5040 + }, + { + "epoch": 1.4193276128850751, + "grad_norm": 4.15625, + "learning_rate": 6.518782464782144e-06, + "loss": 1.6644, + "step": 5045 + }, + { + "epoch": 1.4207342804895204, + "grad_norm": 3.890625, + "learning_rate": 6.514965464957246e-06, + "loss": 1.4335, + "step": 5050 + }, + { + "epoch": 1.4221409480939653, + "grad_norm": 2.9375, + "learning_rate": 6.511144674086536e-06, + "loss": 1.5942, + "step": 5055 + }, + { + "epoch": 1.4235476156984106, + "grad_norm": 3.859375, + "learning_rate": 6.507320097929453e-06, + "loss": 1.5058, + "step": 5060 + }, + { + "epoch": 1.4249542833028555, + "grad_norm": 2.765625, + "learning_rate": 6.5034917422511465e-06, + "loss": 1.6495, + "step": 5065 + }, + { + "epoch": 1.4263609509073005, + "grad_norm": 3.703125, + "learning_rate": 6.499659612822458e-06, + "loss": 1.6415, + "step": 5070 + }, + { + "epoch": 1.4277676185117456, + "grad_norm": 2.890625, + "learning_rate": 6.49582371541992e-06, + "loss": 1.5523, + "step": 5075 + }, + { + "epoch": 1.4291742861161907, + "grad_norm": 2.875, + "learning_rate": 6.491984055825744e-06, + "loss": 1.5582, + "step": 5080 + }, + { + "epoch": 1.4305809537206358, + "grad_norm": 3.359375, + "learning_rate": 6.488140639827812e-06, + "loss": 1.4445, + "step": 5085 + }, + { + "epoch": 1.4319876213250808, + "grad_norm": 3.609375, + "learning_rate": 6.484293473219671e-06, + "loss": 1.8517, + "step": 5090 + }, + { + "epoch": 1.433394288929526, + "grad_norm": 3.015625, + "learning_rate": 6.480442561800517e-06, + "loss": 1.753, + "step": 5095 + }, + { + "epoch": 1.434800956533971, + "grad_norm": 3.8125, + "learning_rate": 6.4765879113751965e-06, + "loss": 1.6244, + "step": 5100 + }, + { + "epoch": 1.436207624138416, + "grad_norm": 3.375, + "learning_rate": 6.472729527754188e-06, + "loss": 1.6974, + "step": 5105 + }, + { + "epoch": 1.4376142917428612, + "grad_norm": 3.59375, + "learning_rate": 6.4688674167536e-06, + "loss": 1.5025, + "step": 5110 + }, + { + "epoch": 1.4390209593473062, + "grad_norm": 3.046875, + "learning_rate": 6.465001584195157e-06, + "loss": 1.3674, + "step": 5115 + }, + { + "epoch": 1.4404276269517513, + "grad_norm": 3.65625, + "learning_rate": 6.461132035906196e-06, + "loss": 1.627, + "step": 5120 + }, + { + "epoch": 1.4418342945561964, + "grad_norm": 3.890625, + "learning_rate": 6.4572587777196534e-06, + "loss": 1.6016, + "step": 5125 + }, + { + "epoch": 1.4432409621606415, + "grad_norm": 3.1875, + "learning_rate": 6.453381815474059e-06, + "loss": 1.5537, + "step": 5130 + }, + { + "epoch": 1.4446476297650865, + "grad_norm": 5.5, + "learning_rate": 6.4495011550135245e-06, + "loss": 1.7066, + "step": 5135 + }, + { + "epoch": 1.4460542973695316, + "grad_norm": 5.09375, + "learning_rate": 6.4456168021877376e-06, + "loss": 1.4199, + "step": 5140 + }, + { + "epoch": 1.4474609649739767, + "grad_norm": 3.515625, + "learning_rate": 6.4417287628519504e-06, + "loss": 1.6078, + "step": 5145 + }, + { + "epoch": 1.4488676325784218, + "grad_norm": 5.9375, + "learning_rate": 6.437837042866975e-06, + "loss": 1.6382, + "step": 5150 + }, + { + "epoch": 1.4502743001828668, + "grad_norm": 4.53125, + "learning_rate": 6.43394164809917e-06, + "loss": 1.7017, + "step": 5155 + }, + { + "epoch": 1.451680967787312, + "grad_norm": 2.953125, + "learning_rate": 6.4300425844204305e-06, + "loss": 1.4553, + "step": 5160 + }, + { + "epoch": 1.453087635391757, + "grad_norm": 4.03125, + "learning_rate": 6.426139857708187e-06, + "loss": 1.4535, + "step": 5165 + }, + { + "epoch": 1.4544943029962019, + "grad_norm": 2.8125, + "learning_rate": 6.422233473845388e-06, + "loss": 1.578, + "step": 5170 + }, + { + "epoch": 1.4559009706006472, + "grad_norm": 2.875, + "learning_rate": 6.418323438720497e-06, + "loss": 1.5562, + "step": 5175 + }, + { + "epoch": 1.457307638205092, + "grad_norm": 3.34375, + "learning_rate": 6.414409758227482e-06, + "loss": 1.4599, + "step": 5180 + }, + { + "epoch": 1.4587143058095373, + "grad_norm": 3.875, + "learning_rate": 6.4104924382657995e-06, + "loss": 1.5497, + "step": 5185 + }, + { + "epoch": 1.4601209734139822, + "grad_norm": 3.34375, + "learning_rate": 6.4065714847404035e-06, + "loss": 1.4751, + "step": 5190 + }, + { + "epoch": 1.4615276410184275, + "grad_norm": 2.765625, + "learning_rate": 6.402646903561715e-06, + "loss": 1.7455, + "step": 5195 + }, + { + "epoch": 1.4629343086228723, + "grad_norm": 4.5625, + "learning_rate": 6.398718700645628e-06, + "loss": 1.4352, + "step": 5200 + }, + { + "epoch": 1.4643409762273174, + "grad_norm": 3.65625, + "learning_rate": 6.394786881913496e-06, + "loss": 1.6347, + "step": 5205 + }, + { + "epoch": 1.4657476438317625, + "grad_norm": 8.8125, + "learning_rate": 6.39085145329212e-06, + "loss": 1.4356, + "step": 5210 + }, + { + "epoch": 1.4671543114362076, + "grad_norm": 3.78125, + "learning_rate": 6.386912420713746e-06, + "loss": 1.6925, + "step": 5215 + }, + { + "epoch": 1.4685609790406526, + "grad_norm": 4.96875, + "learning_rate": 6.382969790116052e-06, + "loss": 1.6203, + "step": 5220 + }, + { + "epoch": 1.4699676466450977, + "grad_norm": 4.0, + "learning_rate": 6.379023567442136e-06, + "loss": 1.5191, + "step": 5225 + }, + { + "epoch": 1.4713743142495428, + "grad_norm": 4.5625, + "learning_rate": 6.375073758640516e-06, + "loss": 1.1834, + "step": 5230 + }, + { + "epoch": 1.4727809818539879, + "grad_norm": 3.53125, + "learning_rate": 6.371120369665112e-06, + "loss": 1.4152, + "step": 5235 + }, + { + "epoch": 1.474187649458433, + "grad_norm": 3.21875, + "learning_rate": 6.3671634064752425e-06, + "loss": 1.3893, + "step": 5240 + }, + { + "epoch": 1.475594317062878, + "grad_norm": 3.671875, + "learning_rate": 6.3632028750356125e-06, + "loss": 1.7045, + "step": 5245 + }, + { + "epoch": 1.477000984667323, + "grad_norm": 3.640625, + "learning_rate": 6.359238781316307e-06, + "loss": 1.653, + "step": 5250 + }, + { + "epoch": 1.4784076522717682, + "grad_norm": 3.25, + "learning_rate": 6.35527113129278e-06, + "loss": 1.6699, + "step": 5255 + }, + { + "epoch": 1.4798143198762133, + "grad_norm": 2.65625, + "learning_rate": 6.351299930945846e-06, + "loss": 1.5831, + "step": 5260 + }, + { + "epoch": 1.4812209874806583, + "grad_norm": 3.015625, + "learning_rate": 6.347325186261672e-06, + "loss": 1.5442, + "step": 5265 + }, + { + "epoch": 1.4826276550851034, + "grad_norm": 4.0, + "learning_rate": 6.343346903231769e-06, + "loss": 1.5064, + "step": 5270 + }, + { + "epoch": 1.4840343226895485, + "grad_norm": 3.0, + "learning_rate": 6.339365087852977e-06, + "loss": 1.7953, + "step": 5275 + }, + { + "epoch": 1.4854409902939936, + "grad_norm": 3.203125, + "learning_rate": 6.335379746127465e-06, + "loss": 1.7672, + "step": 5280 + }, + { + "epoch": 1.4868476578984386, + "grad_norm": 2.3125, + "learning_rate": 6.3313908840627165e-06, + "loss": 1.4996, + "step": 5285 + }, + { + "epoch": 1.4882543255028837, + "grad_norm": 2.59375, + "learning_rate": 6.327398507671523e-06, + "loss": 1.7181, + "step": 5290 + }, + { + "epoch": 1.4896609931073288, + "grad_norm": 3.171875, + "learning_rate": 6.3234026229719685e-06, + "loss": 1.7623, + "step": 5295 + }, + { + "epoch": 1.4910676607117739, + "grad_norm": 3.46875, + "learning_rate": 6.319403235987431e-06, + "loss": 1.9328, + "step": 5300 + }, + { + "epoch": 1.492474328316219, + "grad_norm": 3.625, + "learning_rate": 6.315400352746566e-06, + "loss": 1.6441, + "step": 5305 + }, + { + "epoch": 1.493880995920664, + "grad_norm": 3.921875, + "learning_rate": 6.311393979283296e-06, + "loss": 1.5421, + "step": 5310 + }, + { + "epoch": 1.495287663525109, + "grad_norm": 4.46875, + "learning_rate": 6.307384121636811e-06, + "loss": 1.6609, + "step": 5315 + }, + { + "epoch": 1.4966943311295542, + "grad_norm": 4.15625, + "learning_rate": 6.303370785851545e-06, + "loss": 1.7574, + "step": 5320 + }, + { + "epoch": 1.498100998733999, + "grad_norm": 2.9375, + "learning_rate": 6.299353977977184e-06, + "loss": 1.7455, + "step": 5325 + }, + { + "epoch": 1.4995076663384443, + "grad_norm": 3.015625, + "learning_rate": 6.295333704068641e-06, + "loss": 1.6189, + "step": 5330 + }, + { + "epoch": 1.5009143339428892, + "grad_norm": 4.25, + "learning_rate": 6.2913099701860565e-06, + "loss": 1.5672, + "step": 5335 + }, + { + "epoch": 1.5023210015473345, + "grad_norm": 3.6875, + "learning_rate": 6.287282782394786e-06, + "loss": 1.6904, + "step": 5340 + }, + { + "epoch": 1.5037276691517794, + "grad_norm": 2.890625, + "learning_rate": 6.283252146765391e-06, + "loss": 1.5382, + "step": 5345 + }, + { + "epoch": 1.5051343367562247, + "grad_norm": 2.53125, + "learning_rate": 6.279218069373631e-06, + "loss": 1.7194, + "step": 5350 + }, + { + "epoch": 1.5065410043606695, + "grad_norm": 3.140625, + "learning_rate": 6.275180556300452e-06, + "loss": 1.5181, + "step": 5355 + }, + { + "epoch": 1.5079476719651146, + "grad_norm": 3.765625, + "learning_rate": 6.27113961363198e-06, + "loss": 1.5198, + "step": 5360 + }, + { + "epoch": 1.5093543395695597, + "grad_norm": 2.421875, + "learning_rate": 6.267095247459514e-06, + "loss": 1.7598, + "step": 5365 + }, + { + "epoch": 1.5107610071740047, + "grad_norm": 3.828125, + "learning_rate": 6.263047463879506e-06, + "loss": 1.7883, + "step": 5370 + }, + { + "epoch": 1.5121676747784498, + "grad_norm": 4.4375, + "learning_rate": 6.258996268993568e-06, + "loss": 1.4889, + "step": 5375 + }, + { + "epoch": 1.513574342382895, + "grad_norm": 3.28125, + "learning_rate": 6.254941668908447e-06, + "loss": 1.5787, + "step": 5380 + }, + { + "epoch": 1.51498100998734, + "grad_norm": 4.9375, + "learning_rate": 6.250883669736028e-06, + "loss": 1.4366, + "step": 5385 + }, + { + "epoch": 1.516387677591785, + "grad_norm": 3.578125, + "learning_rate": 6.246822277593317e-06, + "loss": 1.5613, + "step": 5390 + }, + { + "epoch": 1.5177943451962301, + "grad_norm": 3.40625, + "learning_rate": 6.242757498602435e-06, + "loss": 1.7432, + "step": 5395 + }, + { + "epoch": 1.5192010128006752, + "grad_norm": 3.546875, + "learning_rate": 6.238689338890608e-06, + "loss": 1.4916, + "step": 5400 + }, + { + "epoch": 1.5206076804051203, + "grad_norm": 3.828125, + "learning_rate": 6.23461780459016e-06, + "loss": 1.4807, + "step": 5405 + }, + { + "epoch": 1.5220143480095654, + "grad_norm": 4.09375, + "learning_rate": 6.2305429018385e-06, + "loss": 1.7498, + "step": 5410 + }, + { + "epoch": 1.5234210156140104, + "grad_norm": 3.0, + "learning_rate": 6.226464636778116e-06, + "loss": 1.6306, + "step": 5415 + }, + { + "epoch": 1.5248276832184555, + "grad_norm": 2.890625, + "learning_rate": 6.222383015556562e-06, + "loss": 1.5031, + "step": 5420 + }, + { + "epoch": 1.5262343508229006, + "grad_norm": 2.578125, + "learning_rate": 6.2182980443264545e-06, + "loss": 1.607, + "step": 5425 + }, + { + "epoch": 1.5276410184273455, + "grad_norm": 3.59375, + "learning_rate": 6.2142097292454555e-06, + "loss": 1.5335, + "step": 5430 + }, + { + "epoch": 1.5290476860317908, + "grad_norm": 4.09375, + "learning_rate": 6.210118076476271e-06, + "loss": 1.6874, + "step": 5435 + }, + { + "epoch": 1.5304543536362356, + "grad_norm": 3.125, + "learning_rate": 6.206023092186637e-06, + "loss": 1.707, + "step": 5440 + }, + { + "epoch": 1.531861021240681, + "grad_norm": 2.859375, + "learning_rate": 6.20192478254931e-06, + "loss": 1.5466, + "step": 5445 + }, + { + "epoch": 1.5332676888451258, + "grad_norm": 2.9375, + "learning_rate": 6.197823153742064e-06, + "loss": 1.4202, + "step": 5450 + }, + { + "epoch": 1.534674356449571, + "grad_norm": 2.953125, + "learning_rate": 6.19371821194767e-06, + "loss": 1.7848, + "step": 5455 + }, + { + "epoch": 1.536081024054016, + "grad_norm": 3.625, + "learning_rate": 6.189609963353897e-06, + "loss": 1.6941, + "step": 5460 + }, + { + "epoch": 1.5374876916584612, + "grad_norm": 2.5, + "learning_rate": 6.185498414153494e-06, + "loss": 1.6015, + "step": 5465 + }, + { + "epoch": 1.538894359262906, + "grad_norm": 3.75, + "learning_rate": 6.181383570544195e-06, + "loss": 1.6375, + "step": 5470 + }, + { + "epoch": 1.5403010268673514, + "grad_norm": 3.390625, + "learning_rate": 6.17726543872869e-06, + "loss": 1.3859, + "step": 5475 + }, + { + "epoch": 1.5417076944717962, + "grad_norm": 3.734375, + "learning_rate": 6.1731440249146286e-06, + "loss": 1.6703, + "step": 5480 + }, + { + "epoch": 1.5431143620762415, + "grad_norm": 2.796875, + "learning_rate": 6.169019335314612e-06, + "loss": 1.6369, + "step": 5485 + }, + { + "epoch": 1.5445210296806864, + "grad_norm": 2.921875, + "learning_rate": 6.164891376146173e-06, + "loss": 1.488, + "step": 5490 + }, + { + "epoch": 1.5459276972851317, + "grad_norm": 2.625, + "learning_rate": 6.160760153631775e-06, + "loss": 1.4559, + "step": 5495 + }, + { + "epoch": 1.5473343648895765, + "grad_norm": 3.25, + "learning_rate": 6.156625673998804e-06, + "loss": 1.5765, + "step": 5500 + }, + { + "epoch": 1.5487410324940216, + "grad_norm": 5.75, + "learning_rate": 6.152487943479551e-06, + "loss": 1.6279, + "step": 5505 + }, + { + "epoch": 1.5501477000984667, + "grad_norm": 3.484375, + "learning_rate": 6.14834696831121e-06, + "loss": 1.4578, + "step": 5510 + }, + { + "epoch": 1.5515543677029118, + "grad_norm": 4.59375, + "learning_rate": 6.144202754735866e-06, + "loss": 1.4965, + "step": 5515 + }, + { + "epoch": 1.5529610353073569, + "grad_norm": 4.1875, + "learning_rate": 6.140055309000482e-06, + "loss": 1.4934, + "step": 5520 + }, + { + "epoch": 1.554367702911802, + "grad_norm": 3.265625, + "learning_rate": 6.135904637356901e-06, + "loss": 1.2053, + "step": 5525 + }, + { + "epoch": 1.555774370516247, + "grad_norm": 3.203125, + "learning_rate": 6.13175074606182e-06, + "loss": 1.9499, + "step": 5530 + }, + { + "epoch": 1.557181038120692, + "grad_norm": 3.03125, + "learning_rate": 6.127593641376793e-06, + "loss": 1.681, + "step": 5535 + }, + { + "epoch": 1.5585877057251372, + "grad_norm": 3.15625, + "learning_rate": 6.12343332956822e-06, + "loss": 1.5296, + "step": 5540 + }, + { + "epoch": 1.5599943733295822, + "grad_norm": 3.5, + "learning_rate": 6.119269816907332e-06, + "loss": 1.4921, + "step": 5545 + }, + { + "epoch": 1.5614010409340273, + "grad_norm": 4.65625, + "learning_rate": 6.115103109670187e-06, + "loss": 1.4377, + "step": 5550 + }, + { + "epoch": 1.5628077085384724, + "grad_norm": 3.703125, + "learning_rate": 6.110933214137657e-06, + "loss": 1.3758, + "step": 5555 + }, + { + "epoch": 1.5642143761429175, + "grad_norm": 3.640625, + "learning_rate": 6.10676013659542e-06, + "loss": 1.3795, + "step": 5560 + }, + { + "epoch": 1.5656210437473626, + "grad_norm": 3.125, + "learning_rate": 6.1025838833339545e-06, + "loss": 1.4445, + "step": 5565 + }, + { + "epoch": 1.5670277113518076, + "grad_norm": 3.328125, + "learning_rate": 6.0984044606485185e-06, + "loss": 1.6025, + "step": 5570 + }, + { + "epoch": 1.5684343789562525, + "grad_norm": 4.21875, + "learning_rate": 6.094221874839157e-06, + "loss": 1.6672, + "step": 5575 + }, + { + "epoch": 1.5698410465606978, + "grad_norm": 3.015625, + "learning_rate": 6.090036132210673e-06, + "loss": 1.7502, + "step": 5580 + }, + { + "epoch": 1.5712477141651426, + "grad_norm": 3.25, + "learning_rate": 6.085847239072634e-06, + "loss": 1.5658, + "step": 5585 + }, + { + "epoch": 1.572654381769588, + "grad_norm": 3.28125, + "learning_rate": 6.081655201739359e-06, + "loss": 1.5319, + "step": 5590 + }, + { + "epoch": 1.5740610493740328, + "grad_norm": 3.953125, + "learning_rate": 6.077460026529901e-06, + "loss": 1.4186, + "step": 5595 + }, + { + "epoch": 1.575467716978478, + "grad_norm": 2.59375, + "learning_rate": 6.073261719768044e-06, + "loss": 1.4423, + "step": 5600 + }, + { + "epoch": 1.576874384582923, + "grad_norm": 2.4375, + "learning_rate": 6.069060287782296e-06, + "loss": 1.6494, + "step": 5605 + }, + { + "epoch": 1.5782810521873682, + "grad_norm": 4.90625, + "learning_rate": 6.064855736905872e-06, + "loss": 1.5472, + "step": 5610 + }, + { + "epoch": 1.579687719791813, + "grad_norm": 3.203125, + "learning_rate": 6.060648073476691e-06, + "loss": 1.7602, + "step": 5615 + }, + { + "epoch": 1.5810943873962584, + "grad_norm": 3.609375, + "learning_rate": 6.056437303837362e-06, + "loss": 1.5036, + "step": 5620 + }, + { + "epoch": 1.5825010550007033, + "grad_norm": 2.53125, + "learning_rate": 6.052223434335179e-06, + "loss": 1.7547, + "step": 5625 + }, + { + "epoch": 1.5839077226051486, + "grad_norm": 3.15625, + "learning_rate": 6.0480064713221036e-06, + "loss": 1.3458, + "step": 5630 + }, + { + "epoch": 1.5853143902095934, + "grad_norm": 3.046875, + "learning_rate": 6.043786421154767e-06, + "loss": 1.6531, + "step": 5635 + }, + { + "epoch": 1.5867210578140385, + "grad_norm": 3.015625, + "learning_rate": 6.0395632901944485e-06, + "loss": 1.5451, + "step": 5640 + }, + { + "epoch": 1.5881277254184836, + "grad_norm": 2.96875, + "learning_rate": 6.035337084807077e-06, + "loss": 1.5324, + "step": 5645 + }, + { + "epoch": 1.5895343930229286, + "grad_norm": 4.84375, + "learning_rate": 6.031107811363208e-06, + "loss": 1.6149, + "step": 5650 + }, + { + "epoch": 1.5909410606273737, + "grad_norm": 2.984375, + "learning_rate": 6.026875476238031e-06, + "loss": 1.7126, + "step": 5655 + }, + { + "epoch": 1.5923477282318188, + "grad_norm": 3.09375, + "learning_rate": 6.022640085811341e-06, + "loss": 1.4252, + "step": 5660 + }, + { + "epoch": 1.5937543958362639, + "grad_norm": 2.796875, + "learning_rate": 6.018401646467546e-06, + "loss": 1.5583, + "step": 5665 + }, + { + "epoch": 1.595161063440709, + "grad_norm": 3.125, + "learning_rate": 6.014160164595648e-06, + "loss": 1.6051, + "step": 5670 + }, + { + "epoch": 1.596567731045154, + "grad_norm": 4.40625, + "learning_rate": 6.009915646589231e-06, + "loss": 1.5316, + "step": 5675 + }, + { + "epoch": 1.5979743986495991, + "grad_norm": 3.5625, + "learning_rate": 6.005668098846465e-06, + "loss": 1.711, + "step": 5680 + }, + { + "epoch": 1.5993810662540442, + "grad_norm": 2.28125, + "learning_rate": 6.001417527770076e-06, + "loss": 1.5814, + "step": 5685 + }, + { + "epoch": 1.6007877338584893, + "grad_norm": 3.015625, + "learning_rate": 5.9971639397673565e-06, + "loss": 1.768, + "step": 5690 + }, + { + "epoch": 1.6021944014629343, + "grad_norm": 3.421875, + "learning_rate": 5.992907341250142e-06, + "loss": 1.6234, + "step": 5695 + }, + { + "epoch": 1.6036010690673794, + "grad_norm": 2.75, + "learning_rate": 5.988647738634803e-06, + "loss": 1.5887, + "step": 5700 + }, + { + "epoch": 1.6050077366718245, + "grad_norm": 3.28125, + "learning_rate": 5.984385138342248e-06, + "loss": 1.7158, + "step": 5705 + }, + { + "epoch": 1.6064144042762694, + "grad_norm": 4.5625, + "learning_rate": 5.980119546797895e-06, + "loss": 1.4228, + "step": 5710 + }, + { + "epoch": 1.6078210718807147, + "grad_norm": 2.5625, + "learning_rate": 5.975850970431675e-06, + "loss": 1.6542, + "step": 5715 + }, + { + "epoch": 1.6092277394851595, + "grad_norm": 2.34375, + "learning_rate": 5.971579415678018e-06, + "loss": 1.616, + "step": 5720 + }, + { + "epoch": 1.6106344070896048, + "grad_norm": 2.625, + "learning_rate": 5.967304888975844e-06, + "loss": 1.6365, + "step": 5725 + }, + { + "epoch": 1.6120410746940497, + "grad_norm": 3.421875, + "learning_rate": 5.9630273967685505e-06, + "loss": 1.5371, + "step": 5730 + }, + { + "epoch": 1.613447742298495, + "grad_norm": 2.875, + "learning_rate": 5.958746945504009e-06, + "loss": 1.8092, + "step": 5735 + }, + { + "epoch": 1.6148544099029398, + "grad_norm": 2.734375, + "learning_rate": 5.954463541634547e-06, + "loss": 1.4343, + "step": 5740 + }, + { + "epoch": 1.6162610775073851, + "grad_norm": 2.4375, + "learning_rate": 5.950177191616946e-06, + "loss": 1.5462, + "step": 5745 + }, + { + "epoch": 1.61766774511183, + "grad_norm": 2.9375, + "learning_rate": 5.94588790191243e-06, + "loss": 1.3474, + "step": 5750 + }, + { + "epoch": 1.6190744127162753, + "grad_norm": 3.296875, + "learning_rate": 5.941595678986648e-06, + "loss": 1.4929, + "step": 5755 + }, + { + "epoch": 1.6204810803207201, + "grad_norm": 3.0, + "learning_rate": 5.937300529309677e-06, + "loss": 1.6187, + "step": 5760 + }, + { + "epoch": 1.6218877479251654, + "grad_norm": 3.28125, + "learning_rate": 5.933002459356004e-06, + "loss": 1.5226, + "step": 5765 + }, + { + "epoch": 1.6232944155296103, + "grad_norm": 2.90625, + "learning_rate": 5.928701475604515e-06, + "loss": 1.7075, + "step": 5770 + }, + { + "epoch": 1.6247010831340554, + "grad_norm": 3.609375, + "learning_rate": 5.924397584538491e-06, + "loss": 1.6602, + "step": 5775 + }, + { + "epoch": 1.6261077507385004, + "grad_norm": 9.3125, + "learning_rate": 5.920090792645595e-06, + "loss": 1.6669, + "step": 5780 + }, + { + "epoch": 1.6275144183429455, + "grad_norm": 3.484375, + "learning_rate": 5.915781106417863e-06, + "loss": 1.4352, + "step": 5785 + }, + { + "epoch": 1.6289210859473906, + "grad_norm": 2.296875, + "learning_rate": 5.911468532351694e-06, + "loss": 1.5932, + "step": 5790 + }, + { + "epoch": 1.6303277535518357, + "grad_norm": 2.546875, + "learning_rate": 5.907153076947839e-06, + "loss": 1.422, + "step": 5795 + }, + { + "epoch": 1.6317344211562808, + "grad_norm": 2.953125, + "learning_rate": 5.9028347467113926e-06, + "loss": 1.4173, + "step": 5800 + }, + { + "epoch": 1.6331410887607258, + "grad_norm": 3.984375, + "learning_rate": 5.898513548151782e-06, + "loss": 1.6911, + "step": 5805 + }, + { + "epoch": 1.634547756365171, + "grad_norm": 3.421875, + "learning_rate": 5.894189487782763e-06, + "loss": 1.7045, + "step": 5810 + }, + { + "epoch": 1.635954423969616, + "grad_norm": 2.96875, + "learning_rate": 5.889862572122399e-06, + "loss": 1.4862, + "step": 5815 + }, + { + "epoch": 1.637361091574061, + "grad_norm": 3.53125, + "learning_rate": 5.88553280769306e-06, + "loss": 1.3872, + "step": 5820 + }, + { + "epoch": 1.6387677591785061, + "grad_norm": 2.59375, + "learning_rate": 5.88120020102141e-06, + "loss": 1.5883, + "step": 5825 + }, + { + "epoch": 1.6401744267829512, + "grad_norm": 2.765625, + "learning_rate": 5.876864758638401e-06, + "loss": 1.4336, + "step": 5830 + }, + { + "epoch": 1.6415810943873963, + "grad_norm": 3.46875, + "learning_rate": 5.872526487079253e-06, + "loss": 1.528, + "step": 5835 + }, + { + "epoch": 1.6429877619918414, + "grad_norm": 4.9375, + "learning_rate": 5.868185392883454e-06, + "loss": 1.6604, + "step": 5840 + }, + { + "epoch": 1.6443944295962862, + "grad_norm": 3.21875, + "learning_rate": 5.8638414825947476e-06, + "loss": 1.6125, + "step": 5845 + }, + { + "epoch": 1.6458010972007315, + "grad_norm": 2.859375, + "learning_rate": 5.859494762761122e-06, + "loss": 1.3778, + "step": 5850 + }, + { + "epoch": 1.6472077648051764, + "grad_norm": 3.015625, + "learning_rate": 5.855145239934797e-06, + "loss": 1.7263, + "step": 5855 + }, + { + "epoch": 1.6486144324096217, + "grad_norm": 3.953125, + "learning_rate": 5.850792920672225e-06, + "loss": 1.3598, + "step": 5860 + }, + { + "epoch": 1.6500211000140665, + "grad_norm": 3.3125, + "learning_rate": 5.846437811534068e-06, + "loss": 1.5234, + "step": 5865 + }, + { + "epoch": 1.6514277676185118, + "grad_norm": 3.03125, + "learning_rate": 5.842079919085192e-06, + "loss": 1.6599, + "step": 5870 + }, + { + "epoch": 1.6528344352229567, + "grad_norm": 5.46875, + "learning_rate": 5.837719249894665e-06, + "loss": 1.7368, + "step": 5875 + }, + { + "epoch": 1.654241102827402, + "grad_norm": 3.734375, + "learning_rate": 5.833355810535734e-06, + "loss": 1.6333, + "step": 5880 + }, + { + "epoch": 1.6556477704318469, + "grad_norm": 3.796875, + "learning_rate": 5.8289896075858255e-06, + "loss": 1.7083, + "step": 5885 + }, + { + "epoch": 1.6570544380362922, + "grad_norm": 3.421875, + "learning_rate": 5.824620647626533e-06, + "loss": 1.7947, + "step": 5890 + }, + { + "epoch": 1.658461105640737, + "grad_norm": 3.328125, + "learning_rate": 5.820248937243602e-06, + "loss": 1.5477, + "step": 5895 + }, + { + "epoch": 1.6598677732451823, + "grad_norm": 2.953125, + "learning_rate": 5.815874483026926e-06, + "loss": 1.7948, + "step": 5900 + }, + { + "epoch": 1.6612744408496272, + "grad_norm": 3.75, + "learning_rate": 5.811497291570535e-06, + "loss": 1.2987, + "step": 5905 + }, + { + "epoch": 1.6626811084540725, + "grad_norm": 3.046875, + "learning_rate": 5.807117369472585e-06, + "loss": 1.6014, + "step": 5910 + }, + { + "epoch": 1.6640877760585173, + "grad_norm": 4.28125, + "learning_rate": 5.8027347233353465e-06, + "loss": 1.5098, + "step": 5915 + }, + { + "epoch": 1.6654944436629624, + "grad_norm": 4.34375, + "learning_rate": 5.798349359765198e-06, + "loss": 1.5863, + "step": 5920 + }, + { + "epoch": 1.6669011112674075, + "grad_norm": 3.875, + "learning_rate": 5.793961285372614e-06, + "loss": 1.5017, + "step": 5925 + }, + { + "epoch": 1.6683077788718526, + "grad_norm": 5.0, + "learning_rate": 5.789570506772154e-06, + "loss": 1.4812, + "step": 5930 + }, + { + "epoch": 1.6697144464762976, + "grad_norm": 2.1875, + "learning_rate": 5.785177030582455e-06, + "loss": 1.613, + "step": 5935 + }, + { + "epoch": 1.6711211140807427, + "grad_norm": 2.46875, + "learning_rate": 5.7807808634262205e-06, + "loss": 1.4484, + "step": 5940 + }, + { + "epoch": 1.6725277816851878, + "grad_norm": 2.78125, + "learning_rate": 5.776382011930211e-06, + "loss": 1.5913, + "step": 5945 + }, + { + "epoch": 1.6739344492896329, + "grad_norm": 3.578125, + "learning_rate": 5.77198048272523e-06, + "loss": 1.5753, + "step": 5950 + }, + { + "epoch": 1.675341116894078, + "grad_norm": 3.375, + "learning_rate": 5.767576282446121e-06, + "loss": 1.6776, + "step": 5955 + }, + { + "epoch": 1.676747784498523, + "grad_norm": 4.21875, + "learning_rate": 5.763169417731751e-06, + "loss": 1.5151, + "step": 5960 + }, + { + "epoch": 1.678154452102968, + "grad_norm": 4.84375, + "learning_rate": 5.758759895225008e-06, + "loss": 1.2589, + "step": 5965 + }, + { + "epoch": 1.6795611197074132, + "grad_norm": 3.375, + "learning_rate": 5.75434772157278e-06, + "loss": 1.5056, + "step": 5970 + }, + { + "epoch": 1.6809677873118583, + "grad_norm": 2.828125, + "learning_rate": 5.749932903425957e-06, + "loss": 1.7067, + "step": 5975 + }, + { + "epoch": 1.6823744549163033, + "grad_norm": 3.4375, + "learning_rate": 5.745515447439411e-06, + "loss": 1.4572, + "step": 5980 + }, + { + "epoch": 1.6837811225207484, + "grad_norm": 2.90625, + "learning_rate": 5.741095360271992e-06, + "loss": 1.6003, + "step": 5985 + }, + { + "epoch": 1.6851877901251933, + "grad_norm": 3.859375, + "learning_rate": 5.736672648586518e-06, + "loss": 1.5607, + "step": 5990 + }, + { + "epoch": 1.6865944577296386, + "grad_norm": 3.09375, + "learning_rate": 5.732247319049761e-06, + "loss": 1.6931, + "step": 5995 + }, + { + "epoch": 1.6880011253340834, + "grad_norm": 3.375, + "learning_rate": 5.727819378332437e-06, + "loss": 1.4869, + "step": 6000 + }, + { + "epoch": 1.6894077929385287, + "grad_norm": 3.640625, + "learning_rate": 5.723388833109205e-06, + "loss": 1.6263, + "step": 6005 + }, + { + "epoch": 1.6908144605429736, + "grad_norm": 4.59375, + "learning_rate": 5.718955690058644e-06, + "loss": 1.4953, + "step": 6010 + }, + { + "epoch": 1.6922211281474189, + "grad_norm": 4.46875, + "learning_rate": 5.714519955863249e-06, + "loss": 1.378, + "step": 6015 + }, + { + "epoch": 1.6936277957518637, + "grad_norm": 3.546875, + "learning_rate": 5.710081637209425e-06, + "loss": 1.6132, + "step": 6020 + }, + { + "epoch": 1.695034463356309, + "grad_norm": 2.578125, + "learning_rate": 5.705640740787467e-06, + "loss": 1.6166, + "step": 6025 + }, + { + "epoch": 1.6964411309607539, + "grad_norm": 3.296875, + "learning_rate": 5.701197273291563e-06, + "loss": 1.4201, + "step": 6030 + }, + { + "epoch": 1.6978477985651992, + "grad_norm": 4.03125, + "learning_rate": 5.696751241419771e-06, + "loss": 1.6117, + "step": 6035 + }, + { + "epoch": 1.699254466169644, + "grad_norm": 3.6875, + "learning_rate": 5.692302651874016e-06, + "loss": 1.6267, + "step": 6040 + }, + { + "epoch": 1.7006611337740893, + "grad_norm": 3.71875, + "learning_rate": 5.68785151136008e-06, + "loss": 1.6265, + "step": 6045 + }, + { + "epoch": 1.7020678013785342, + "grad_norm": 3.09375, + "learning_rate": 5.683397826587586e-06, + "loss": 1.502, + "step": 6050 + }, + { + "epoch": 1.7034744689829793, + "grad_norm": 3.4375, + "learning_rate": 5.678941604269999e-06, + "loss": 1.66, + "step": 6055 + }, + { + "epoch": 1.7048811365874244, + "grad_norm": 3.46875, + "learning_rate": 5.674482851124603e-06, + "loss": 1.6555, + "step": 6060 + }, + { + "epoch": 1.7062878041918694, + "grad_norm": 2.84375, + "learning_rate": 5.670021573872498e-06, + "loss": 1.6171, + "step": 6065 + }, + { + "epoch": 1.7076944717963145, + "grad_norm": 3.03125, + "learning_rate": 5.665557779238593e-06, + "loss": 1.723, + "step": 6070 + }, + { + "epoch": 1.7091011394007596, + "grad_norm": 3.28125, + "learning_rate": 5.661091473951587e-06, + "loss": 1.5578, + "step": 6075 + }, + { + "epoch": 1.7105078070052047, + "grad_norm": 2.765625, + "learning_rate": 5.656622664743965e-06, + "loss": 1.6143, + "step": 6080 + }, + { + "epoch": 1.7119144746096497, + "grad_norm": 4.71875, + "learning_rate": 5.652151358351988e-06, + "loss": 1.5817, + "step": 6085 + }, + { + "epoch": 1.7133211422140948, + "grad_norm": 3.375, + "learning_rate": 5.64767756151568e-06, + "loss": 1.5214, + "step": 6090 + }, + { + "epoch": 1.71472780981854, + "grad_norm": 3.171875, + "learning_rate": 5.643201280978816e-06, + "loss": 1.6993, + "step": 6095 + }, + { + "epoch": 1.716134477422985, + "grad_norm": 3.375, + "learning_rate": 5.638722523488921e-06, + "loss": 1.6905, + "step": 6100 + }, + { + "epoch": 1.71754114502743, + "grad_norm": 3.125, + "learning_rate": 5.63424129579725e-06, + "loss": 1.7052, + "step": 6105 + }, + { + "epoch": 1.7189478126318751, + "grad_norm": 2.875, + "learning_rate": 5.629757604658781e-06, + "loss": 1.6538, + "step": 6110 + }, + { + "epoch": 1.7203544802363202, + "grad_norm": 3.15625, + "learning_rate": 5.625271456832209e-06, + "loss": 1.4619, + "step": 6115 + }, + { + "epoch": 1.7217611478407653, + "grad_norm": 3.25, + "learning_rate": 5.620782859079929e-06, + "loss": 1.7455, + "step": 6120 + }, + { + "epoch": 1.7231678154452101, + "grad_norm": 2.453125, + "learning_rate": 5.6162918181680264e-06, + "loss": 1.5303, + "step": 6125 + }, + { + "epoch": 1.7245744830496554, + "grad_norm": 3.0625, + "learning_rate": 5.611798340866278e-06, + "loss": 1.6425, + "step": 6130 + }, + { + "epoch": 1.7259811506541003, + "grad_norm": 4.3125, + "learning_rate": 5.607302433948126e-06, + "loss": 1.7051, + "step": 6135 + }, + { + "epoch": 1.7273878182585456, + "grad_norm": 3.53125, + "learning_rate": 5.602804104190674e-06, + "loss": 1.5346, + "step": 6140 + }, + { + "epoch": 1.7287944858629904, + "grad_norm": 3.3125, + "learning_rate": 5.598303358374686e-06, + "loss": 1.5561, + "step": 6145 + }, + { + "epoch": 1.7302011534674357, + "grad_norm": 3.75, + "learning_rate": 5.5938002032845596e-06, + "loss": 1.6081, + "step": 6150 + }, + { + "epoch": 1.7316078210718806, + "grad_norm": 2.84375, + "learning_rate": 5.589294645708326e-06, + "loss": 1.7825, + "step": 6155 + }, + { + "epoch": 1.733014488676326, + "grad_norm": 3.46875, + "learning_rate": 5.584786692437644e-06, + "loss": 1.4638, + "step": 6160 + }, + { + "epoch": 1.7344211562807708, + "grad_norm": 3.3125, + "learning_rate": 5.580276350267774e-06, + "loss": 1.8662, + "step": 6165 + }, + { + "epoch": 1.735827823885216, + "grad_norm": 2.828125, + "learning_rate": 5.575763625997584e-06, + "loss": 1.8925, + "step": 6170 + }, + { + "epoch": 1.737234491489661, + "grad_norm": 3.203125, + "learning_rate": 5.5712485264295314e-06, + "loss": 1.4838, + "step": 6175 + }, + { + "epoch": 1.7386411590941062, + "grad_norm": 3.65625, + "learning_rate": 5.566731058369655e-06, + "loss": 1.6893, + "step": 6180 + }, + { + "epoch": 1.740047826698551, + "grad_norm": 2.90625, + "learning_rate": 5.562211228627559e-06, + "loss": 1.426, + "step": 6185 + }, + { + "epoch": 1.7414544943029961, + "grad_norm": 3.953125, + "learning_rate": 5.557689044016414e-06, + "loss": 1.4707, + "step": 6190 + }, + { + "epoch": 1.7428611619074412, + "grad_norm": 3.15625, + "learning_rate": 5.553164511352936e-06, + "loss": 1.5854, + "step": 6195 + }, + { + "epoch": 1.7442678295118863, + "grad_norm": 3.15625, + "learning_rate": 5.548637637457383e-06, + "loss": 1.7302, + "step": 6200 + }, + { + "epoch": 1.7456744971163314, + "grad_norm": 3.234375, + "learning_rate": 5.544108429153541e-06, + "loss": 1.5283, + "step": 6205 + }, + { + "epoch": 1.7470811647207765, + "grad_norm": 2.609375, + "learning_rate": 5.539576893268714e-06, + "loss": 1.8094, + "step": 6210 + }, + { + "epoch": 1.7484878323252215, + "grad_norm": 2.875, + "learning_rate": 5.535043036633716e-06, + "loss": 1.4656, + "step": 6215 + }, + { + "epoch": 1.7498944999296666, + "grad_norm": 3.75, + "learning_rate": 5.530506866082858e-06, + "loss": 1.5639, + "step": 6220 + }, + { + "epoch": 1.7513011675341117, + "grad_norm": 3.53125, + "learning_rate": 5.525968388453943e-06, + "loss": 1.6606, + "step": 6225 + }, + { + "epoch": 1.7527078351385568, + "grad_norm": 2.625, + "learning_rate": 5.521427610588246e-06, + "loss": 1.5511, + "step": 6230 + }, + { + "epoch": 1.7541145027430018, + "grad_norm": 3.546875, + "learning_rate": 5.51688453933051e-06, + "loss": 1.3522, + "step": 6235 + }, + { + "epoch": 1.755521170347447, + "grad_norm": 2.78125, + "learning_rate": 5.51233918152894e-06, + "loss": 1.5404, + "step": 6240 + }, + { + "epoch": 1.756927837951892, + "grad_norm": 3.125, + "learning_rate": 5.507791544035183e-06, + "loss": 1.4346, + "step": 6245 + }, + { + "epoch": 1.758334505556337, + "grad_norm": 3.828125, + "learning_rate": 5.5032416337043255e-06, + "loss": 1.5443, + "step": 6250 + }, + { + "epoch": 1.7597411731607822, + "grad_norm": 3.734375, + "learning_rate": 5.498689457394877e-06, + "loss": 1.5366, + "step": 6255 + }, + { + "epoch": 1.761147840765227, + "grad_norm": 3.453125, + "learning_rate": 5.494135021968766e-06, + "loss": 1.5194, + "step": 6260 + }, + { + "epoch": 1.7625545083696723, + "grad_norm": 4.15625, + "learning_rate": 5.489578334291323e-06, + "loss": 1.6533, + "step": 6265 + }, + { + "epoch": 1.7639611759741172, + "grad_norm": 3.125, + "learning_rate": 5.485019401231275e-06, + "loss": 1.5755, + "step": 6270 + }, + { + "epoch": 1.7653678435785625, + "grad_norm": 3.71875, + "learning_rate": 5.480458229660736e-06, + "loss": 1.5212, + "step": 6275 + }, + { + "epoch": 1.7667745111830073, + "grad_norm": 3.453125, + "learning_rate": 5.4758948264551905e-06, + "loss": 1.3376, + "step": 6280 + }, + { + "epoch": 1.7681811787874526, + "grad_norm": 3.03125, + "learning_rate": 5.471329198493489e-06, + "loss": 1.4777, + "step": 6285 + }, + { + "epoch": 1.7695878463918975, + "grad_norm": 2.859375, + "learning_rate": 5.466761352657836e-06, + "loss": 1.5956, + "step": 6290 + }, + { + "epoch": 1.7709945139963428, + "grad_norm": 2.71875, + "learning_rate": 5.462191295833777e-06, + "loss": 1.6315, + "step": 6295 + }, + { + "epoch": 1.7724011816007876, + "grad_norm": 3.53125, + "learning_rate": 5.457619034910193e-06, + "loss": 1.4154, + "step": 6300 + }, + { + "epoch": 1.773807849205233, + "grad_norm": 2.921875, + "learning_rate": 5.453044576779286e-06, + "loss": 1.5447, + "step": 6305 + }, + { + "epoch": 1.7752145168096778, + "grad_norm": 3.5625, + "learning_rate": 5.448467928336571e-06, + "loss": 1.4752, + "step": 6310 + }, + { + "epoch": 1.776621184414123, + "grad_norm": 3.453125, + "learning_rate": 5.4438890964808605e-06, + "loss": 1.5837, + "step": 6315 + }, + { + "epoch": 1.778027852018568, + "grad_norm": 3.09375, + "learning_rate": 5.439308088114267e-06, + "loss": 1.3614, + "step": 6320 + }, + { + "epoch": 1.7794345196230132, + "grad_norm": 3.546875, + "learning_rate": 5.434724910142175e-06, + "loss": 1.4906, + "step": 6325 + }, + { + "epoch": 1.780841187227458, + "grad_norm": 5.59375, + "learning_rate": 5.430139569473244e-06, + "loss": 1.4475, + "step": 6330 + }, + { + "epoch": 1.7822478548319032, + "grad_norm": 2.75, + "learning_rate": 5.425552073019392e-06, + "loss": 1.6079, + "step": 6335 + }, + { + "epoch": 1.7836545224363483, + "grad_norm": 3.953125, + "learning_rate": 5.420962427695789e-06, + "loss": 1.6304, + "step": 6340 + }, + { + "epoch": 1.7850611900407933, + "grad_norm": 3.109375, + "learning_rate": 5.416370640420842e-06, + "loss": 1.8129, + "step": 6345 + }, + { + "epoch": 1.7864678576452384, + "grad_norm": 3.796875, + "learning_rate": 5.411776718116185e-06, + "loss": 1.6699, + "step": 6350 + }, + { + "epoch": 1.7878745252496835, + "grad_norm": 3.21875, + "learning_rate": 5.4071806677066744e-06, + "loss": 1.5165, + "step": 6355 + }, + { + "epoch": 1.7892811928541286, + "grad_norm": 2.640625, + "learning_rate": 5.402582496120372e-06, + "loss": 1.6656, + "step": 6360 + }, + { + "epoch": 1.7906878604585736, + "grad_norm": 2.703125, + "learning_rate": 5.397982210288536e-06, + "loss": 1.6198, + "step": 6365 + }, + { + "epoch": 1.7920945280630187, + "grad_norm": 2.234375, + "learning_rate": 5.393379817145617e-06, + "loss": 1.8459, + "step": 6370 + }, + { + "epoch": 1.7935011956674638, + "grad_norm": 3.4375, + "learning_rate": 5.388775323629236e-06, + "loss": 1.4617, + "step": 6375 + }, + { + "epoch": 1.7949078632719089, + "grad_norm": 2.78125, + "learning_rate": 5.384168736680182e-06, + "loss": 1.6081, + "step": 6380 + }, + { + "epoch": 1.796314530876354, + "grad_norm": 3.421875, + "learning_rate": 5.379560063242403e-06, + "loss": 1.5674, + "step": 6385 + }, + { + "epoch": 1.797721198480799, + "grad_norm": 2.890625, + "learning_rate": 5.374949310262985e-06, + "loss": 1.3943, + "step": 6390 + }, + { + "epoch": 1.799127866085244, + "grad_norm": 4.25, + "learning_rate": 5.370336484692156e-06, + "loss": 1.5251, + "step": 6395 + }, + { + "epoch": 1.8005345336896892, + "grad_norm": 3.328125, + "learning_rate": 5.3657215934832645e-06, + "loss": 1.5846, + "step": 6400 + }, + { + "epoch": 1.801941201294134, + "grad_norm": 3.953125, + "learning_rate": 5.361104643592773e-06, + "loss": 1.7833, + "step": 6405 + }, + { + "epoch": 1.8033478688985793, + "grad_norm": 3.453125, + "learning_rate": 5.356485641980249e-06, + "loss": 1.2723, + "step": 6410 + }, + { + "epoch": 1.8047545365030242, + "grad_norm": 3.546875, + "learning_rate": 5.351864595608349e-06, + "loss": 1.4335, + "step": 6415 + }, + { + "epoch": 1.8061612041074695, + "grad_norm": 3.71875, + "learning_rate": 5.347241511442816e-06, + "loss": 1.4807, + "step": 6420 + }, + { + "epoch": 1.8075678717119144, + "grad_norm": 3.28125, + "learning_rate": 5.342616396452463e-06, + "loss": 1.4345, + "step": 6425 + }, + { + "epoch": 1.8089745393163597, + "grad_norm": 5.28125, + "learning_rate": 5.337989257609163e-06, + "loss": 1.2621, + "step": 6430 + }, + { + "epoch": 1.8103812069208045, + "grad_norm": 3.0, + "learning_rate": 5.333360101887843e-06, + "loss": 1.6888, + "step": 6435 + }, + { + "epoch": 1.8117878745252498, + "grad_norm": 2.65625, + "learning_rate": 5.328728936266466e-06, + "loss": 1.5794, + "step": 6440 + }, + { + "epoch": 1.8131945421296947, + "grad_norm": 2.984375, + "learning_rate": 5.324095767726027e-06, + "loss": 1.5931, + "step": 6445 + }, + { + "epoch": 1.81460120973414, + "grad_norm": 3.078125, + "learning_rate": 5.319460603250541e-06, + "loss": 1.6198, + "step": 6450 + }, + { + "epoch": 1.8160078773385848, + "grad_norm": 4.96875, + "learning_rate": 5.314823449827031e-06, + "loss": 1.5342, + "step": 6455 + }, + { + "epoch": 1.8174145449430301, + "grad_norm": 4.0625, + "learning_rate": 5.310184314445515e-06, + "loss": 1.7853, + "step": 6460 + }, + { + "epoch": 1.818821212547475, + "grad_norm": 2.4375, + "learning_rate": 5.305543204099006e-06, + "loss": 1.7108, + "step": 6465 + }, + { + "epoch": 1.82022788015192, + "grad_norm": 4.4375, + "learning_rate": 5.3009001257834875e-06, + "loss": 1.6419, + "step": 6470 + }, + { + "epoch": 1.8216345477563651, + "grad_norm": 3.21875, + "learning_rate": 5.29625508649791e-06, + "loss": 1.6769, + "step": 6475 + }, + { + "epoch": 1.8230412153608102, + "grad_norm": 3.640625, + "learning_rate": 5.291608093244183e-06, + "loss": 1.659, + "step": 6480 + }, + { + "epoch": 1.8244478829652553, + "grad_norm": 3.125, + "learning_rate": 5.286959153027162e-06, + "loss": 1.8752, + "step": 6485 + }, + { + "epoch": 1.8258545505697004, + "grad_norm": 3.40625, + "learning_rate": 5.28230827285463e-06, + "loss": 1.6693, + "step": 6490 + }, + { + "epoch": 1.8272612181741454, + "grad_norm": 4.28125, + "learning_rate": 5.277655459737303e-06, + "loss": 1.4653, + "step": 6495 + }, + { + "epoch": 1.8286678857785905, + "grad_norm": 2.96875, + "learning_rate": 5.2730007206888074e-06, + "loss": 1.5908, + "step": 6500 + }, + { + "epoch": 1.8300745533830356, + "grad_norm": 3.109375, + "learning_rate": 5.268344062725671e-06, + "loss": 1.6227, + "step": 6505 + }, + { + "epoch": 1.8314812209874807, + "grad_norm": 3.796875, + "learning_rate": 5.263685492867317e-06, + "loss": 1.4596, + "step": 6510 + }, + { + "epoch": 1.8328878885919258, + "grad_norm": 3.0, + "learning_rate": 5.259025018136049e-06, + "loss": 1.5202, + "step": 6515 + }, + { + "epoch": 1.8342945561963708, + "grad_norm": 3.46875, + "learning_rate": 5.25436264555704e-06, + "loss": 1.6152, + "step": 6520 + }, + { + "epoch": 1.835701223800816, + "grad_norm": 2.390625, + "learning_rate": 5.249698382158329e-06, + "loss": 1.7461, + "step": 6525 + }, + { + "epoch": 1.837107891405261, + "grad_norm": 3.28125, + "learning_rate": 5.245032234970801e-06, + "loss": 1.6038, + "step": 6530 + }, + { + "epoch": 1.838514559009706, + "grad_norm": 3.625, + "learning_rate": 5.240364211028183e-06, + "loss": 1.7761, + "step": 6535 + }, + { + "epoch": 1.839921226614151, + "grad_norm": 4.5, + "learning_rate": 5.235694317367028e-06, + "loss": 1.682, + "step": 6540 + }, + { + "epoch": 1.8413278942185962, + "grad_norm": 2.046875, + "learning_rate": 5.231022561026712e-06, + "loss": 1.7722, + "step": 6545 + }, + { + "epoch": 1.842734561823041, + "grad_norm": 3.484375, + "learning_rate": 5.226348949049414e-06, + "loss": 1.4845, + "step": 6550 + }, + { + "epoch": 1.8441412294274864, + "grad_norm": 3.1875, + "learning_rate": 5.2216734884801126e-06, + "loss": 1.6831, + "step": 6555 + }, + { + "epoch": 1.8455478970319312, + "grad_norm": 2.828125, + "learning_rate": 5.216996186366573e-06, + "loss": 1.6796, + "step": 6560 + }, + { + "epoch": 1.8469545646363765, + "grad_norm": 2.328125, + "learning_rate": 5.212317049759336e-06, + "loss": 1.6936, + "step": 6565 + }, + { + "epoch": 1.8483612322408214, + "grad_norm": 2.90625, + "learning_rate": 5.207636085711707e-06, + "loss": 1.8625, + "step": 6570 + }, + { + "epoch": 1.8497678998452667, + "grad_norm": 3.15625, + "learning_rate": 5.202953301279748e-06, + "loss": 1.7222, + "step": 6575 + }, + { + "epoch": 1.8511745674497115, + "grad_norm": 2.984375, + "learning_rate": 5.198268703522263e-06, + "loss": 1.437, + "step": 6580 + }, + { + "epoch": 1.8525812350541568, + "grad_norm": 2.96875, + "learning_rate": 5.1935822995007896e-06, + "loss": 1.6653, + "step": 6585 + }, + { + "epoch": 1.8539879026586017, + "grad_norm": 3.65625, + "learning_rate": 5.188894096279591e-06, + "loss": 1.5089, + "step": 6590 + }, + { + "epoch": 1.855394570263047, + "grad_norm": 4.40625, + "learning_rate": 5.184204100925639e-06, + "loss": 1.4483, + "step": 6595 + }, + { + "epoch": 1.8568012378674918, + "grad_norm": 6.53125, + "learning_rate": 5.179512320508606e-06, + "loss": 1.4933, + "step": 6600 + }, + { + "epoch": 1.858207905471937, + "grad_norm": 2.765625, + "learning_rate": 5.17481876210086e-06, + "loss": 1.4914, + "step": 6605 + }, + { + "epoch": 1.859614573076382, + "grad_norm": 3.46875, + "learning_rate": 5.170123432777446e-06, + "loss": 1.6377, + "step": 6610 + }, + { + "epoch": 1.861021240680827, + "grad_norm": 2.390625, + "learning_rate": 5.165426339616078e-06, + "loss": 1.6628, + "step": 6615 + }, + { + "epoch": 1.8624279082852722, + "grad_norm": 3.140625, + "learning_rate": 5.160727489697131e-06, + "loss": 1.6961, + "step": 6620 + }, + { + "epoch": 1.8638345758897172, + "grad_norm": 2.890625, + "learning_rate": 5.156026890103626e-06, + "loss": 1.5194, + "step": 6625 + }, + { + "epoch": 1.8652412434941623, + "grad_norm": 4.25, + "learning_rate": 5.1513245479212215e-06, + "loss": 1.5143, + "step": 6630 + }, + { + "epoch": 1.8666479110986074, + "grad_norm": 2.796875, + "learning_rate": 5.146620470238205e-06, + "loss": 1.6946, + "step": 6635 + }, + { + "epoch": 1.8680545787030525, + "grad_norm": 2.953125, + "learning_rate": 5.1419146641454784e-06, + "loss": 1.413, + "step": 6640 + }, + { + "epoch": 1.8694612463074975, + "grad_norm": 3.359375, + "learning_rate": 5.137207136736549e-06, + "loss": 1.8252, + "step": 6645 + }, + { + "epoch": 1.8708679139119426, + "grad_norm": 3.03125, + "learning_rate": 5.132497895107518e-06, + "loss": 1.5078, + "step": 6650 + }, + { + "epoch": 1.8722745815163877, + "grad_norm": 2.828125, + "learning_rate": 5.127786946357074e-06, + "loss": 1.6362, + "step": 6655 + }, + { + "epoch": 1.8736812491208328, + "grad_norm": 3.390625, + "learning_rate": 5.123074297586475e-06, + "loss": 1.6801, + "step": 6660 + }, + { + "epoch": 1.8750879167252779, + "grad_norm": 3.953125, + "learning_rate": 5.118359955899542e-06, + "loss": 1.4851, + "step": 6665 + }, + { + "epoch": 1.876494584329723, + "grad_norm": 3.375, + "learning_rate": 5.113643928402651e-06, + "loss": 1.5629, + "step": 6670 + }, + { + "epoch": 1.8779012519341678, + "grad_norm": 2.15625, + "learning_rate": 5.108926222204716e-06, + "loss": 1.7819, + "step": 6675 + }, + { + "epoch": 1.879307919538613, + "grad_norm": 3.8125, + "learning_rate": 5.104206844417184e-06, + "loss": 1.6994, + "step": 6680 + }, + { + "epoch": 1.880714587143058, + "grad_norm": 2.859375, + "learning_rate": 5.099485802154019e-06, + "loss": 1.5289, + "step": 6685 + }, + { + "epoch": 1.8821212547475032, + "grad_norm": 2.875, + "learning_rate": 5.094763102531697e-06, + "loss": 1.4957, + "step": 6690 + }, + { + "epoch": 1.883527922351948, + "grad_norm": 3.296875, + "learning_rate": 5.09003875266919e-06, + "loss": 1.6463, + "step": 6695 + }, + { + "epoch": 1.8849345899563934, + "grad_norm": 2.5625, + "learning_rate": 5.085312759687958e-06, + "loss": 1.5889, + "step": 6700 + }, + { + "epoch": 1.8863412575608383, + "grad_norm": 2.90625, + "learning_rate": 5.080585130711938e-06, + "loss": 1.79, + "step": 6705 + }, + { + "epoch": 1.8877479251652836, + "grad_norm": 3.75, + "learning_rate": 5.0758558728675345e-06, + "loss": 1.8305, + "step": 6710 + }, + { + "epoch": 1.8891545927697284, + "grad_norm": 4.28125, + "learning_rate": 5.0711249932836035e-06, + "loss": 1.4388, + "step": 6715 + }, + { + "epoch": 1.8905612603741737, + "grad_norm": 2.390625, + "learning_rate": 5.066392499091451e-06, + "loss": 1.7699, + "step": 6720 + }, + { + "epoch": 1.8919679279786186, + "grad_norm": 2.765625, + "learning_rate": 5.061658397424814e-06, + "loss": 1.7132, + "step": 6725 + }, + { + "epoch": 1.8933745955830639, + "grad_norm": 4.59375, + "learning_rate": 5.056922695419849e-06, + "loss": 1.3894, + "step": 6730 + }, + { + "epoch": 1.8947812631875087, + "grad_norm": 2.921875, + "learning_rate": 5.052185400215134e-06, + "loss": 1.476, + "step": 6735 + }, + { + "epoch": 1.896187930791954, + "grad_norm": 3.703125, + "learning_rate": 5.047446518951638e-06, + "loss": 1.61, + "step": 6740 + }, + { + "epoch": 1.8975945983963989, + "grad_norm": 3.734375, + "learning_rate": 5.042706058772728e-06, + "loss": 1.6619, + "step": 6745 + }, + { + "epoch": 1.899001266000844, + "grad_norm": 5.875, + "learning_rate": 5.037964026824148e-06, + "loss": 1.4887, + "step": 6750 + }, + { + "epoch": 1.900407933605289, + "grad_norm": 3.453125, + "learning_rate": 5.033220430254015e-06, + "loss": 1.5818, + "step": 6755 + }, + { + "epoch": 1.9018146012097341, + "grad_norm": 3.390625, + "learning_rate": 5.0284752762128e-06, + "loss": 1.4241, + "step": 6760 + }, + { + "epoch": 1.9032212688141792, + "grad_norm": 3.34375, + "learning_rate": 5.023728571853322e-06, + "loss": 1.6242, + "step": 6765 + }, + { + "epoch": 1.9046279364186243, + "grad_norm": 3.578125, + "learning_rate": 5.018980324330741e-06, + "loss": 1.7302, + "step": 6770 + }, + { + "epoch": 1.9060346040230693, + "grad_norm": 2.8125, + "learning_rate": 5.014230540802538e-06, + "loss": 1.7645, + "step": 6775 + }, + { + "epoch": 1.9074412716275144, + "grad_norm": 4.78125, + "learning_rate": 5.009479228428513e-06, + "loss": 1.4511, + "step": 6780 + }, + { + "epoch": 1.9088479392319595, + "grad_norm": 3.609375, + "learning_rate": 5.00472639437077e-06, + "loss": 1.5802, + "step": 6785 + }, + { + "epoch": 1.9102546068364046, + "grad_norm": 3.703125, + "learning_rate": 4.999972045793705e-06, + "loss": 1.5002, + "step": 6790 + }, + { + "epoch": 1.9116612744408497, + "grad_norm": 3.328125, + "learning_rate": 4.995216189863999e-06, + "loss": 1.3874, + "step": 6795 + }, + { + "epoch": 1.9130679420452947, + "grad_norm": 3.25, + "learning_rate": 4.990458833750606e-06, + "loss": 1.5605, + "step": 6800 + }, + { + "epoch": 1.9144746096497398, + "grad_norm": 3.390625, + "learning_rate": 4.985699984624736e-06, + "loss": 1.6657, + "step": 6805 + }, + { + "epoch": 1.915881277254185, + "grad_norm": 3.46875, + "learning_rate": 4.980939649659856e-06, + "loss": 1.574, + "step": 6810 + }, + { + "epoch": 1.91728794485863, + "grad_norm": 3.21875, + "learning_rate": 4.976177836031669e-06, + "loss": 1.5493, + "step": 6815 + }, + { + "epoch": 1.9186946124630748, + "grad_norm": 3.859375, + "learning_rate": 4.97141455091811e-06, + "loss": 1.7608, + "step": 6820 + }, + { + "epoch": 1.9201012800675201, + "grad_norm": 2.34375, + "learning_rate": 4.966649801499327e-06, + "loss": 1.4364, + "step": 6825 + }, + { + "epoch": 1.921507947671965, + "grad_norm": 4.09375, + "learning_rate": 4.961883594957681e-06, + "loss": 1.2724, + "step": 6830 + }, + { + "epoch": 1.9229146152764103, + "grad_norm": 3.328125, + "learning_rate": 4.957115938477726e-06, + "loss": 1.4279, + "step": 6835 + }, + { + "epoch": 1.9243212828808551, + "grad_norm": 5.6875, + "learning_rate": 4.952346839246202e-06, + "loss": 1.414, + "step": 6840 + }, + { + "epoch": 1.9257279504853004, + "grad_norm": 2.9375, + "learning_rate": 4.947576304452025e-06, + "loss": 1.6354, + "step": 6845 + }, + { + "epoch": 1.9271346180897453, + "grad_norm": 3.875, + "learning_rate": 4.942804341286274e-06, + "loss": 1.4963, + "step": 6850 + }, + { + "epoch": 1.9285412856941906, + "grad_norm": 3.1875, + "learning_rate": 4.938030956942181e-06, + "loss": 1.6742, + "step": 6855 + }, + { + "epoch": 1.9299479532986354, + "grad_norm": 3.84375, + "learning_rate": 4.933256158615121e-06, + "loss": 1.5579, + "step": 6860 + }, + { + "epoch": 1.9313546209030807, + "grad_norm": 3.03125, + "learning_rate": 4.9284799535026e-06, + "loss": 1.3593, + "step": 6865 + }, + { + "epoch": 1.9327612885075256, + "grad_norm": 4.03125, + "learning_rate": 4.923702348804244e-06, + "loss": 1.397, + "step": 6870 + }, + { + "epoch": 1.934167956111971, + "grad_norm": 3.203125, + "learning_rate": 4.918923351721791e-06, + "loss": 1.652, + "step": 6875 + }, + { + "epoch": 1.9355746237164158, + "grad_norm": 3.078125, + "learning_rate": 4.9141429694590745e-06, + "loss": 1.6749, + "step": 6880 + }, + { + "epoch": 1.9369812913208608, + "grad_norm": 3.359375, + "learning_rate": 4.909361209222018e-06, + "loss": 1.7391, + "step": 6885 + }, + { + "epoch": 1.938387958925306, + "grad_norm": 5.0625, + "learning_rate": 4.9045780782186225e-06, + "loss": 1.8264, + "step": 6890 + }, + { + "epoch": 1.939794626529751, + "grad_norm": 2.890625, + "learning_rate": 4.899793583658955e-06, + "loss": 1.4543, + "step": 6895 + }, + { + "epoch": 1.941201294134196, + "grad_norm": 3.515625, + "learning_rate": 4.895007732755138e-06, + "loss": 1.6126, + "step": 6900 + }, + { + "epoch": 1.9426079617386411, + "grad_norm": 4.875, + "learning_rate": 4.890220532721336e-06, + "loss": 1.8178, + "step": 6905 + }, + { + "epoch": 1.9440146293430862, + "grad_norm": 3.6875, + "learning_rate": 4.885431990773752e-06, + "loss": 1.574, + "step": 6910 + }, + { + "epoch": 1.9454212969475313, + "grad_norm": 3.3125, + "learning_rate": 4.880642114130609e-06, + "loss": 1.3841, + "step": 6915 + }, + { + "epoch": 1.9468279645519764, + "grad_norm": 5.8125, + "learning_rate": 4.875850910012138e-06, + "loss": 1.3911, + "step": 6920 + }, + { + "epoch": 1.9482346321564215, + "grad_norm": 3.203125, + "learning_rate": 4.87105838564058e-06, + "loss": 1.791, + "step": 6925 + }, + { + "epoch": 1.9496412997608665, + "grad_norm": 2.96875, + "learning_rate": 4.8662645482401584e-06, + "loss": 1.6071, + "step": 6930 + }, + { + "epoch": 1.9510479673653116, + "grad_norm": 2.9375, + "learning_rate": 4.861469405037079e-06, + "loss": 1.3506, + "step": 6935 + }, + { + "epoch": 1.9524546349697567, + "grad_norm": 2.96875, + "learning_rate": 4.856672963259518e-06, + "loss": 1.2333, + "step": 6940 + }, + { + "epoch": 1.9538613025742018, + "grad_norm": 3.015625, + "learning_rate": 4.851875230137603e-06, + "loss": 1.6049, + "step": 6945 + }, + { + "epoch": 1.9552679701786468, + "grad_norm": 3.734375, + "learning_rate": 4.847076212903414e-06, + "loss": 1.8223, + "step": 6950 + }, + { + "epoch": 1.9566746377830917, + "grad_norm": 4.25, + "learning_rate": 4.842275918790965e-06, + "loss": 1.5588, + "step": 6955 + }, + { + "epoch": 1.958081305387537, + "grad_norm": 2.625, + "learning_rate": 4.837474355036191e-06, + "loss": 1.6483, + "step": 6960 + }, + { + "epoch": 1.9594879729919819, + "grad_norm": 2.546875, + "learning_rate": 4.83267152887695e-06, + "loss": 1.6539, + "step": 6965 + }, + { + "epoch": 1.9608946405964272, + "grad_norm": 3.203125, + "learning_rate": 4.8278674475529915e-06, + "loss": 1.5487, + "step": 6970 + }, + { + "epoch": 1.962301308200872, + "grad_norm": 2.984375, + "learning_rate": 4.823062118305966e-06, + "loss": 1.3615, + "step": 6975 + }, + { + "epoch": 1.9637079758053173, + "grad_norm": 2.84375, + "learning_rate": 4.8182555483794e-06, + "loss": 1.6147, + "step": 6980 + }, + { + "epoch": 1.9651146434097622, + "grad_norm": 2.953125, + "learning_rate": 4.813447745018692e-06, + "loss": 1.7156, + "step": 6985 + }, + { + "epoch": 1.9665213110142075, + "grad_norm": 3.125, + "learning_rate": 4.808638715471101e-06, + "loss": 1.5149, + "step": 6990 + }, + { + "epoch": 1.9679279786186523, + "grad_norm": 4.96875, + "learning_rate": 4.803828466985732e-06, + "loss": 1.2237, + "step": 6995 + }, + { + "epoch": 1.9693346462230976, + "grad_norm": 3.0, + "learning_rate": 4.799017006813527e-06, + "loss": 1.5996, + "step": 7000 + }, + { + "epoch": 1.9707413138275425, + "grad_norm": 3.359375, + "learning_rate": 4.794204342207259e-06, + "loss": 1.5614, + "step": 7005 + }, + { + "epoch": 1.9721479814319878, + "grad_norm": 2.90625, + "learning_rate": 4.789390480421512e-06, + "loss": 1.65, + "step": 7010 + }, + { + "epoch": 1.9735546490364326, + "grad_norm": 2.640625, + "learning_rate": 4.784575428712676e-06, + "loss": 1.5145, + "step": 7015 + }, + { + "epoch": 1.9749613166408777, + "grad_norm": 3.640625, + "learning_rate": 4.7797591943389355e-06, + "loss": 1.6309, + "step": 7020 + }, + { + "epoch": 1.9763679842453228, + "grad_norm": 3.359375, + "learning_rate": 4.774941784560256e-06, + "loss": 1.7972, + "step": 7025 + }, + { + "epoch": 1.9777746518497679, + "grad_norm": 3.546875, + "learning_rate": 4.770123206638376e-06, + "loss": 1.6059, + "step": 7030 + }, + { + "epoch": 1.979181319454213, + "grad_norm": 2.28125, + "learning_rate": 4.765303467836794e-06, + "loss": 1.6813, + "step": 7035 + }, + { + "epoch": 1.980587987058658, + "grad_norm": 2.890625, + "learning_rate": 4.760482575420762e-06, + "loss": 1.6176, + "step": 7040 + }, + { + "epoch": 1.981994654663103, + "grad_norm": 2.8125, + "learning_rate": 4.755660536657266e-06, + "loss": 1.689, + "step": 7045 + }, + { + "epoch": 1.9834013222675482, + "grad_norm": 3.21875, + "learning_rate": 4.7508373588150216e-06, + "loss": 1.3935, + "step": 7050 + }, + { + "epoch": 1.9848079898719933, + "grad_norm": 3.984375, + "learning_rate": 4.746013049164463e-06, + "loss": 1.4812, + "step": 7055 + }, + { + "epoch": 1.9862146574764383, + "grad_norm": 3.796875, + "learning_rate": 4.74118761497773e-06, + "loss": 1.7011, + "step": 7060 + }, + { + "epoch": 1.9876213250808834, + "grad_norm": 2.8125, + "learning_rate": 4.7363610635286536e-06, + "loss": 1.6525, + "step": 7065 + }, + { + "epoch": 1.9890279926853285, + "grad_norm": 2.96875, + "learning_rate": 4.731533402092756e-06, + "loss": 1.7934, + "step": 7070 + }, + { + "epoch": 1.9904346602897736, + "grad_norm": 4.6875, + "learning_rate": 4.726704637947228e-06, + "loss": 1.2473, + "step": 7075 + }, + { + "epoch": 1.9918413278942186, + "grad_norm": 3.609375, + "learning_rate": 4.721874778370921e-06, + "loss": 1.2767, + "step": 7080 + }, + { + "epoch": 1.9932479954986637, + "grad_norm": 4.625, + "learning_rate": 4.717043830644344e-06, + "loss": 1.5313, + "step": 7085 + }, + { + "epoch": 1.9946546631031086, + "grad_norm": 3.4375, + "learning_rate": 4.7122118020496385e-06, + "loss": 1.6033, + "step": 7090 + }, + { + "epoch": 1.9960613307075539, + "grad_norm": 3.40625, + "learning_rate": 4.707378699870582e-06, + "loss": 1.7435, + "step": 7095 + }, + { + "epoch": 1.9974679983119987, + "grad_norm": 3.59375, + "learning_rate": 4.702544531392565e-06, + "loss": 1.6058, + "step": 7100 + }, + { + "epoch": 1.998874665916444, + "grad_norm": 3.875, + "learning_rate": 4.697709303902592e-06, + "loss": 1.7475, + "step": 7105 + }, + { + "epoch": 2.0, + "eval_loss": 1.5795072317123413, + "eval_runtime": 330.9822, + "eval_samples_per_second": 9.541, + "eval_steps_per_second": 4.771, + "step": 7109 + }, + { + "epoch": 2.000281333520889, + "grad_norm": 2.78125, + "learning_rate": 4.6928730246892536e-06, + "loss": 1.5257, + "step": 7110 + }, + { + "epoch": 2.001688001125334, + "grad_norm": 4.125, + "learning_rate": 4.6880357010427375e-06, + "loss": 1.5725, + "step": 7115 + }, + { + "epoch": 2.003094668729779, + "grad_norm": 3.875, + "learning_rate": 4.683197340254798e-06, + "loss": 1.3852, + "step": 7120 + }, + { + "epoch": 2.0045013363342243, + "grad_norm": 3.71875, + "learning_rate": 4.678357949618754e-06, + "loss": 1.6272, + "step": 7125 + }, + { + "epoch": 2.005908003938669, + "grad_norm": 3.546875, + "learning_rate": 4.673517536429479e-06, + "loss": 1.4325, + "step": 7130 + }, + { + "epoch": 2.0073146715431145, + "grad_norm": 4.03125, + "learning_rate": 4.6686761079833855e-06, + "loss": 1.5526, + "step": 7135 + }, + { + "epoch": 2.0087213391475593, + "grad_norm": 3.15625, + "learning_rate": 4.663833671578418e-06, + "loss": 1.7071, + "step": 7140 + }, + { + "epoch": 2.0101280067520046, + "grad_norm": 4.0625, + "learning_rate": 4.6589902345140394e-06, + "loss": 1.2759, + "step": 7145 + }, + { + "epoch": 2.0115346743564495, + "grad_norm": 4.71875, + "learning_rate": 4.654145804091223e-06, + "loss": 1.6467, + "step": 7150 + }, + { + "epoch": 2.012941341960895, + "grad_norm": 3.09375, + "learning_rate": 4.649300387612436e-06, + "loss": 1.4278, + "step": 7155 + }, + { + "epoch": 2.0143480095653397, + "grad_norm": 3.34375, + "learning_rate": 4.644453992381633e-06, + "loss": 1.4842, + "step": 7160 + }, + { + "epoch": 2.015754677169785, + "grad_norm": 3.3125, + "learning_rate": 4.639606625704249e-06, + "loss": 1.9023, + "step": 7165 + }, + { + "epoch": 2.01716134477423, + "grad_norm": 2.84375, + "learning_rate": 4.634758294887175e-06, + "loss": 1.6057, + "step": 7170 + }, + { + "epoch": 2.018568012378675, + "grad_norm": 3.90625, + "learning_rate": 4.629909007238762e-06, + "loss": 1.7244, + "step": 7175 + }, + { + "epoch": 2.01997467998312, + "grad_norm": 4.0, + "learning_rate": 4.6250587700688e-06, + "loss": 1.8344, + "step": 7180 + }, + { + "epoch": 2.0213813475875653, + "grad_norm": 3.515625, + "learning_rate": 4.620207590688512e-06, + "loss": 1.5223, + "step": 7185 + }, + { + "epoch": 2.02278801519201, + "grad_norm": 3.203125, + "learning_rate": 4.61535547641054e-06, + "loss": 1.7127, + "step": 7190 + }, + { + "epoch": 2.024194682796455, + "grad_norm": 3.25, + "learning_rate": 4.610502434548934e-06, + "loss": 1.8786, + "step": 7195 + }, + { + "epoch": 2.0256013504009003, + "grad_norm": 3.765625, + "learning_rate": 4.6056484724191476e-06, + "loss": 1.5157, + "step": 7200 + }, + { + "epoch": 2.027008018005345, + "grad_norm": 3.5625, + "learning_rate": 4.600793597338015e-06, + "loss": 1.624, + "step": 7205 + }, + { + "epoch": 2.0284146856097904, + "grad_norm": 3.765625, + "learning_rate": 4.59593781662375e-06, + "loss": 1.6772, + "step": 7210 + }, + { + "epoch": 2.0298213532142353, + "grad_norm": 3.375, + "learning_rate": 4.591081137595933e-06, + "loss": 1.3959, + "step": 7215 + }, + { + "epoch": 2.0312280208186806, + "grad_norm": 5.09375, + "learning_rate": 4.5862235675754935e-06, + "loss": 1.5757, + "step": 7220 + }, + { + "epoch": 2.0326346884231254, + "grad_norm": 2.671875, + "learning_rate": 4.58136511388471e-06, + "loss": 1.3308, + "step": 7225 + }, + { + "epoch": 2.0340413560275707, + "grad_norm": 3.09375, + "learning_rate": 4.5765057838471884e-06, + "loss": 1.685, + "step": 7230 + }, + { + "epoch": 2.0354480236320156, + "grad_norm": 2.34375, + "learning_rate": 4.571645584787858e-06, + "loss": 1.6677, + "step": 7235 + }, + { + "epoch": 2.036854691236461, + "grad_norm": 3.34375, + "learning_rate": 4.566784524032958e-06, + "loss": 1.5998, + "step": 7240 + }, + { + "epoch": 2.0382613588409058, + "grad_norm": 2.609375, + "learning_rate": 4.561922608910025e-06, + "loss": 1.4078, + "step": 7245 + }, + { + "epoch": 2.039668026445351, + "grad_norm": 3.34375, + "learning_rate": 4.557059846747886e-06, + "loss": 1.5471, + "step": 7250 + }, + { + "epoch": 2.041074694049796, + "grad_norm": 3.84375, + "learning_rate": 4.5521962448766416e-06, + "loss": 1.4645, + "step": 7255 + }, + { + "epoch": 2.042481361654241, + "grad_norm": 3.703125, + "learning_rate": 4.547331810627661e-06, + "loss": 1.4794, + "step": 7260 + }, + { + "epoch": 2.043888029258686, + "grad_norm": 3.734375, + "learning_rate": 4.542466551333568e-06, + "loss": 1.672, + "step": 7265 + }, + { + "epoch": 2.0452946968631314, + "grad_norm": 2.3125, + "learning_rate": 4.5376004743282255e-06, + "loss": 1.573, + "step": 7270 + }, + { + "epoch": 2.0467013644675762, + "grad_norm": 3.0625, + "learning_rate": 4.532733586946736e-06, + "loss": 1.6004, + "step": 7275 + }, + { + "epoch": 2.0481080320720215, + "grad_norm": 4.1875, + "learning_rate": 4.527865896525419e-06, + "loss": 1.7544, + "step": 7280 + }, + { + "epoch": 2.0495146996764664, + "grad_norm": 3.1875, + "learning_rate": 4.522997410401805e-06, + "loss": 1.4427, + "step": 7285 + }, + { + "epoch": 2.0509213672809117, + "grad_norm": 2.828125, + "learning_rate": 4.518128135914625e-06, + "loss": 1.5777, + "step": 7290 + }, + { + "epoch": 2.0523280348853565, + "grad_norm": 3.359375, + "learning_rate": 4.5132580804037984e-06, + "loss": 1.3477, + "step": 7295 + }, + { + "epoch": 2.053734702489802, + "grad_norm": 3.03125, + "learning_rate": 4.50838725121042e-06, + "loss": 1.6699, + "step": 7300 + }, + { + "epoch": 2.0551413700942467, + "grad_norm": 3.890625, + "learning_rate": 4.5035156556767555e-06, + "loss": 1.5229, + "step": 7305 + }, + { + "epoch": 2.056548037698692, + "grad_norm": 2.5625, + "learning_rate": 4.498643301146219e-06, + "loss": 1.7442, + "step": 7310 + }, + { + "epoch": 2.057954705303137, + "grad_norm": 2.859375, + "learning_rate": 4.493770194963374e-06, + "loss": 1.4243, + "step": 7315 + }, + { + "epoch": 2.059361372907582, + "grad_norm": 3.921875, + "learning_rate": 4.488896344473914e-06, + "loss": 1.653, + "step": 7320 + }, + { + "epoch": 2.060768040512027, + "grad_norm": 3.53125, + "learning_rate": 4.484021757024658e-06, + "loss": 1.6292, + "step": 7325 + }, + { + "epoch": 2.062174708116472, + "grad_norm": 3.0, + "learning_rate": 4.479146439963533e-06, + "loss": 1.4996, + "step": 7330 + }, + { + "epoch": 2.063581375720917, + "grad_norm": 4.15625, + "learning_rate": 4.474270400639565e-06, + "loss": 1.7773, + "step": 7335 + }, + { + "epoch": 2.064988043325362, + "grad_norm": 2.25, + "learning_rate": 4.469393646402872e-06, + "loss": 1.7348, + "step": 7340 + }, + { + "epoch": 2.0663947109298073, + "grad_norm": 3.359375, + "learning_rate": 4.4645161846046465e-06, + "loss": 1.351, + "step": 7345 + }, + { + "epoch": 2.067801378534252, + "grad_norm": 3.875, + "learning_rate": 4.459638022597149e-06, + "loss": 1.1414, + "step": 7350 + }, + { + "epoch": 2.0692080461386975, + "grad_norm": 3.078125, + "learning_rate": 4.454759167733697e-06, + "loss": 1.4292, + "step": 7355 + }, + { + "epoch": 2.0706147137431423, + "grad_norm": 3.375, + "learning_rate": 4.449879627368649e-06, + "loss": 1.608, + "step": 7360 + }, + { + "epoch": 2.0720213813475876, + "grad_norm": 3.3125, + "learning_rate": 4.4449994088574e-06, + "loss": 1.5892, + "step": 7365 + }, + { + "epoch": 2.0734280489520325, + "grad_norm": 4.875, + "learning_rate": 4.440118519556366e-06, + "loss": 1.5495, + "step": 7370 + }, + { + "epoch": 2.0748347165564778, + "grad_norm": 4.4375, + "learning_rate": 4.435236966822972e-06, + "loss": 1.7177, + "step": 7375 + }, + { + "epoch": 2.0762413841609226, + "grad_norm": 3.8125, + "learning_rate": 4.430354758015648e-06, + "loss": 1.4, + "step": 7380 + }, + { + "epoch": 2.077648051765368, + "grad_norm": 3.484375, + "learning_rate": 4.425471900493806e-06, + "loss": 1.6965, + "step": 7385 + }, + { + "epoch": 2.079054719369813, + "grad_norm": 3.53125, + "learning_rate": 4.420588401617845e-06, + "loss": 1.4151, + "step": 7390 + }, + { + "epoch": 2.080461386974258, + "grad_norm": 3.546875, + "learning_rate": 4.415704268749123e-06, + "loss": 1.5518, + "step": 7395 + }, + { + "epoch": 2.081868054578703, + "grad_norm": 2.484375, + "learning_rate": 4.410819509249956e-06, + "loss": 1.4657, + "step": 7400 + }, + { + "epoch": 2.0832747221831482, + "grad_norm": 3.25, + "learning_rate": 4.405934130483606e-06, + "loss": 1.5352, + "step": 7405 + }, + { + "epoch": 2.084681389787593, + "grad_norm": 3.640625, + "learning_rate": 4.401048139814268e-06, + "loss": 1.6914, + "step": 7410 + }, + { + "epoch": 2.0860880573920384, + "grad_norm": 2.78125, + "learning_rate": 4.3961615446070564e-06, + "loss": 1.5646, + "step": 7415 + }, + { + "epoch": 2.0874947249964833, + "grad_norm": 3.421875, + "learning_rate": 4.391274352228002e-06, + "loss": 1.5388, + "step": 7420 + }, + { + "epoch": 2.0889013926009286, + "grad_norm": 2.84375, + "learning_rate": 4.3863865700440316e-06, + "loss": 1.5648, + "step": 7425 + }, + { + "epoch": 2.0903080602053734, + "grad_norm": 3.546875, + "learning_rate": 4.3814982054229604e-06, + "loss": 1.455, + "step": 7430 + }, + { + "epoch": 2.0917147278098187, + "grad_norm": 4.125, + "learning_rate": 4.37660926573349e-06, + "loss": 1.5373, + "step": 7435 + }, + { + "epoch": 2.0931213954142636, + "grad_norm": 2.765625, + "learning_rate": 4.371719758345176e-06, + "loss": 1.5629, + "step": 7440 + }, + { + "epoch": 2.094528063018709, + "grad_norm": 2.984375, + "learning_rate": 4.366829690628439e-06, + "loss": 1.7871, + "step": 7445 + }, + { + "epoch": 2.0959347306231537, + "grad_norm": 3.28125, + "learning_rate": 4.3619390699545425e-06, + "loss": 1.4415, + "step": 7450 + }, + { + "epoch": 2.097341398227599, + "grad_norm": 3.0, + "learning_rate": 4.357047903695582e-06, + "loss": 1.4951, + "step": 7455 + }, + { + "epoch": 2.098748065832044, + "grad_norm": 3.125, + "learning_rate": 4.352156199224474e-06, + "loss": 1.7415, + "step": 7460 + }, + { + "epoch": 2.100154733436489, + "grad_norm": 3.28125, + "learning_rate": 4.347263963914951e-06, + "loss": 1.5393, + "step": 7465 + }, + { + "epoch": 2.101561401040934, + "grad_norm": 2.859375, + "learning_rate": 4.3423712051415415e-06, + "loss": 1.5584, + "step": 7470 + }, + { + "epoch": 2.102968068645379, + "grad_norm": 2.859375, + "learning_rate": 4.337477930279565e-06, + "loss": 1.4683, + "step": 7475 + }, + { + "epoch": 2.104374736249824, + "grad_norm": 3.328125, + "learning_rate": 4.332584146705119e-06, + "loss": 1.7479, + "step": 7480 + }, + { + "epoch": 2.105781403854269, + "grad_norm": 4.65625, + "learning_rate": 4.327689861795066e-06, + "loss": 1.6146, + "step": 7485 + }, + { + "epoch": 2.1071880714587143, + "grad_norm": 3.65625, + "learning_rate": 4.322795082927027e-06, + "loss": 1.5614, + "step": 7490 + }, + { + "epoch": 2.108594739063159, + "grad_norm": 3.078125, + "learning_rate": 4.317899817479363e-06, + "loss": 1.4357, + "step": 7495 + }, + { + "epoch": 2.1100014066676045, + "grad_norm": 3.546875, + "learning_rate": 4.313004072831177e-06, + "loss": 1.6396, + "step": 7500 + }, + { + "epoch": 2.1114080742720494, + "grad_norm": 4.0625, + "learning_rate": 4.308107856362284e-06, + "loss": 1.4383, + "step": 7505 + }, + { + "epoch": 2.1128147418764947, + "grad_norm": 3.34375, + "learning_rate": 4.303211175453216e-06, + "loss": 1.3966, + "step": 7510 + }, + { + "epoch": 2.1142214094809395, + "grad_norm": 3.421875, + "learning_rate": 4.2983140374852076e-06, + "loss": 1.3908, + "step": 7515 + }, + { + "epoch": 2.115628077085385, + "grad_norm": 2.59375, + "learning_rate": 4.293416449840175e-06, + "loss": 1.6366, + "step": 7520 + }, + { + "epoch": 2.1170347446898297, + "grad_norm": 2.890625, + "learning_rate": 4.288518419900718e-06, + "loss": 1.596, + "step": 7525 + }, + { + "epoch": 2.118441412294275, + "grad_norm": 2.796875, + "learning_rate": 4.2836199550501e-06, + "loss": 1.7149, + "step": 7530 + }, + { + "epoch": 2.11984807989872, + "grad_norm": 4.0625, + "learning_rate": 4.278721062672244e-06, + "loss": 1.4818, + "step": 7535 + }, + { + "epoch": 2.121254747503165, + "grad_norm": 4.21875, + "learning_rate": 4.273821750151712e-06, + "loss": 1.4746, + "step": 7540 + }, + { + "epoch": 2.12266141510761, + "grad_norm": 2.59375, + "learning_rate": 4.268922024873705e-06, + "loss": 1.7616, + "step": 7545 + }, + { + "epoch": 2.1240680827120553, + "grad_norm": 2.78125, + "learning_rate": 4.264021894224042e-06, + "loss": 1.6073, + "step": 7550 + }, + { + "epoch": 2.1254747503165, + "grad_norm": 3.40625, + "learning_rate": 4.259121365589152e-06, + "loss": 1.4738, + "step": 7555 + }, + { + "epoch": 2.1268814179209454, + "grad_norm": 3.359375, + "learning_rate": 4.25422044635607e-06, + "loss": 1.7426, + "step": 7560 + }, + { + "epoch": 2.1282880855253903, + "grad_norm": 4.25, + "learning_rate": 4.249319143912415e-06, + "loss": 1.595, + "step": 7565 + }, + { + "epoch": 2.1296947531298356, + "grad_norm": 2.671875, + "learning_rate": 4.244417465646382e-06, + "loss": 1.7493, + "step": 7570 + }, + { + "epoch": 2.1311014207342804, + "grad_norm": 3.9375, + "learning_rate": 4.239515418946739e-06, + "loss": 1.5547, + "step": 7575 + }, + { + "epoch": 2.1325080883387257, + "grad_norm": 3.5625, + "learning_rate": 4.234613011202804e-06, + "loss": 1.4594, + "step": 7580 + }, + { + "epoch": 2.1339147559431706, + "grad_norm": 2.859375, + "learning_rate": 4.22971024980444e-06, + "loss": 1.7132, + "step": 7585 + }, + { + "epoch": 2.135321423547616, + "grad_norm": 3.15625, + "learning_rate": 4.2248071421420445e-06, + "loss": 1.5946, + "step": 7590 + }, + { + "epoch": 2.1367280911520607, + "grad_norm": 3.5, + "learning_rate": 4.219903695606538e-06, + "loss": 1.3981, + "step": 7595 + }, + { + "epoch": 2.1381347587565056, + "grad_norm": 3.15625, + "learning_rate": 4.214999917589347e-06, + "loss": 1.463, + "step": 7600 + }, + { + "epoch": 2.139541426360951, + "grad_norm": 3.59375, + "learning_rate": 4.210095815482404e-06, + "loss": 1.3421, + "step": 7605 + }, + { + "epoch": 2.140948093965396, + "grad_norm": 2.484375, + "learning_rate": 4.205191396678126e-06, + "loss": 1.2721, + "step": 7610 + }, + { + "epoch": 2.142354761569841, + "grad_norm": 3.75, + "learning_rate": 4.200286668569407e-06, + "loss": 1.6508, + "step": 7615 + }, + { + "epoch": 2.143761429174286, + "grad_norm": 3.484375, + "learning_rate": 4.195381638549609e-06, + "loss": 1.569, + "step": 7620 + }, + { + "epoch": 2.145168096778731, + "grad_norm": 2.890625, + "learning_rate": 4.190476314012551e-06, + "loss": 1.5305, + "step": 7625 + }, + { + "epoch": 2.146574764383176, + "grad_norm": 3.03125, + "learning_rate": 4.185570702352491e-06, + "loss": 1.4491, + "step": 7630 + }, + { + "epoch": 2.1479814319876214, + "grad_norm": 3.203125, + "learning_rate": 4.180664810964121e-06, + "loss": 1.656, + "step": 7635 + }, + { + "epoch": 2.1493880995920662, + "grad_norm": 2.578125, + "learning_rate": 4.175758647242561e-06, + "loss": 1.4181, + "step": 7640 + }, + { + "epoch": 2.1507947671965115, + "grad_norm": 3.078125, + "learning_rate": 4.170852218583333e-06, + "loss": 1.3543, + "step": 7645 + }, + { + "epoch": 2.1522014348009564, + "grad_norm": 3.46875, + "learning_rate": 4.1659455323823615e-06, + "loss": 1.6262, + "step": 7650 + }, + { + "epoch": 2.1536081024054017, + "grad_norm": 2.953125, + "learning_rate": 4.161038596035963e-06, + "loss": 1.6097, + "step": 7655 + }, + { + "epoch": 2.1550147700098465, + "grad_norm": 3.15625, + "learning_rate": 4.156131416940824e-06, + "loss": 1.6947, + "step": 7660 + }, + { + "epoch": 2.156421437614292, + "grad_norm": 4.03125, + "learning_rate": 4.151224002494002e-06, + "loss": 1.5401, + "step": 7665 + }, + { + "epoch": 2.1578281052187367, + "grad_norm": 3.78125, + "learning_rate": 4.146316360092909e-06, + "loss": 1.6713, + "step": 7670 + }, + { + "epoch": 2.159234772823182, + "grad_norm": 4.0, + "learning_rate": 4.141408497135299e-06, + "loss": 1.5029, + "step": 7675 + }, + { + "epoch": 2.160641440427627, + "grad_norm": 3.484375, + "learning_rate": 4.136500421019258e-06, + "loss": 1.5932, + "step": 7680 + }, + { + "epoch": 2.162048108032072, + "grad_norm": 2.59375, + "learning_rate": 4.131592139143195e-06, + "loss": 1.6317, + "step": 7685 + }, + { + "epoch": 2.163454775636517, + "grad_norm": 3.46875, + "learning_rate": 4.126683658905829e-06, + "loss": 1.603, + "step": 7690 + }, + { + "epoch": 2.1648614432409623, + "grad_norm": 4.3125, + "learning_rate": 4.121774987706177e-06, + "loss": 1.5748, + "step": 7695 + }, + { + "epoch": 2.166268110845407, + "grad_norm": 3.65625, + "learning_rate": 4.116866132943544e-06, + "loss": 1.4496, + "step": 7700 + }, + { + "epoch": 2.1676747784498525, + "grad_norm": 2.859375, + "learning_rate": 4.111957102017513e-06, + "loss": 1.3999, + "step": 7705 + }, + { + "epoch": 2.1690814460542973, + "grad_norm": 3.140625, + "learning_rate": 4.10704790232793e-06, + "loss": 1.6404, + "step": 7710 + }, + { + "epoch": 2.1704881136587426, + "grad_norm": 2.765625, + "learning_rate": 4.102138541274898e-06, + "loss": 1.6583, + "step": 7715 + }, + { + "epoch": 2.1718947812631875, + "grad_norm": 3.234375, + "learning_rate": 4.097229026258762e-06, + "loss": 1.6281, + "step": 7720 + }, + { + "epoch": 2.1733014488676328, + "grad_norm": 2.6875, + "learning_rate": 4.092319364680101e-06, + "loss": 1.6493, + "step": 7725 + }, + { + "epoch": 2.1747081164720776, + "grad_norm": 2.5625, + "learning_rate": 4.08740956393971e-06, + "loss": 1.7934, + "step": 7730 + }, + { + "epoch": 2.176114784076523, + "grad_norm": 3.78125, + "learning_rate": 4.082499631438599e-06, + "loss": 1.6603, + "step": 7735 + }, + { + "epoch": 2.177521451680968, + "grad_norm": 4.25, + "learning_rate": 4.077589574577975e-06, + "loss": 1.5522, + "step": 7740 + }, + { + "epoch": 2.1789281192854126, + "grad_norm": 4.84375, + "learning_rate": 4.07267940075923e-06, + "loss": 1.6806, + "step": 7745 + }, + { + "epoch": 2.180334786889858, + "grad_norm": 4.375, + "learning_rate": 4.067769117383936e-06, + "loss": 1.5831, + "step": 7750 + }, + { + "epoch": 2.181741454494303, + "grad_norm": 3.71875, + "learning_rate": 4.0628587318538295e-06, + "loss": 1.5568, + "step": 7755 + }, + { + "epoch": 2.183148122098748, + "grad_norm": 2.875, + "learning_rate": 4.057948251570798e-06, + "loss": 1.6057, + "step": 7760 + }, + { + "epoch": 2.184554789703193, + "grad_norm": 8.8125, + "learning_rate": 4.053037683936875e-06, + "loss": 1.3439, + "step": 7765 + }, + { + "epoch": 2.1859614573076382, + "grad_norm": 3.53125, + "learning_rate": 4.048127036354224e-06, + "loss": 1.4215, + "step": 7770 + }, + { + "epoch": 2.187368124912083, + "grad_norm": 3.078125, + "learning_rate": 4.0432163162251295e-06, + "loss": 1.4706, + "step": 7775 + }, + { + "epoch": 2.1887747925165284, + "grad_norm": 3.5625, + "learning_rate": 4.038305530951986e-06, + "loss": 1.5168, + "step": 7780 + }, + { + "epoch": 2.1901814601209733, + "grad_norm": 3.890625, + "learning_rate": 4.033394687937284e-06, + "loss": 1.7687, + "step": 7785 + }, + { + "epoch": 2.1915881277254186, + "grad_norm": 3.40625, + "learning_rate": 4.028483794583606e-06, + "loss": 1.6538, + "step": 7790 + }, + { + "epoch": 2.1929947953298634, + "grad_norm": 4.21875, + "learning_rate": 4.023572858293602e-06, + "loss": 1.6807, + "step": 7795 + }, + { + "epoch": 2.1944014629343087, + "grad_norm": 2.78125, + "learning_rate": 4.018661886469996e-06, + "loss": 1.5059, + "step": 7800 + }, + { + "epoch": 2.1958081305387536, + "grad_norm": 3.953125, + "learning_rate": 4.01375088651556e-06, + "loss": 1.5728, + "step": 7805 + }, + { + "epoch": 2.197214798143199, + "grad_norm": 3.484375, + "learning_rate": 4.008839865833108e-06, + "loss": 1.5053, + "step": 7810 + }, + { + "epoch": 2.1986214657476437, + "grad_norm": 3.0, + "learning_rate": 4.0039288318254895e-06, + "loss": 1.544, + "step": 7815 + }, + { + "epoch": 2.200028133352089, + "grad_norm": 3.28125, + "learning_rate": 3.999017791895571e-06, + "loss": 1.4862, + "step": 7820 + }, + { + "epoch": 2.201434800956534, + "grad_norm": 3.171875, + "learning_rate": 3.994106753446225e-06, + "loss": 1.6569, + "step": 7825 + }, + { + "epoch": 2.202841468560979, + "grad_norm": 2.859375, + "learning_rate": 3.989195723880332e-06, + "loss": 1.484, + "step": 7830 + }, + { + "epoch": 2.204248136165424, + "grad_norm": 2.375, + "learning_rate": 3.984284710600746e-06, + "loss": 1.4287, + "step": 7835 + }, + { + "epoch": 2.2056548037698693, + "grad_norm": 2.890625, + "learning_rate": 3.979373721010306e-06, + "loss": 1.7917, + "step": 7840 + }, + { + "epoch": 2.207061471374314, + "grad_norm": 3.015625, + "learning_rate": 3.97446276251181e-06, + "loss": 1.5958, + "step": 7845 + }, + { + "epoch": 2.2084681389787595, + "grad_norm": 2.703125, + "learning_rate": 3.969551842508014e-06, + "loss": 1.7798, + "step": 7850 + }, + { + "epoch": 2.2098748065832043, + "grad_norm": 3.40625, + "learning_rate": 3.964640968401612e-06, + "loss": 1.6493, + "step": 7855 + }, + { + "epoch": 2.2112814741876496, + "grad_norm": 3.734375, + "learning_rate": 3.959730147595228e-06, + "loss": 1.3387, + "step": 7860 + }, + { + "epoch": 2.2126881417920945, + "grad_norm": 5.8125, + "learning_rate": 3.954819387491411e-06, + "loss": 1.6065, + "step": 7865 + }, + { + "epoch": 2.21409480939654, + "grad_norm": 3.828125, + "learning_rate": 3.949908695492612e-06, + "loss": 1.4857, + "step": 7870 + }, + { + "epoch": 2.2155014770009847, + "grad_norm": 3.0625, + "learning_rate": 3.944998079001185e-06, + "loss": 1.6135, + "step": 7875 + }, + { + "epoch": 2.21690814460543, + "grad_norm": 3.625, + "learning_rate": 3.940087545419365e-06, + "loss": 1.4992, + "step": 7880 + }, + { + "epoch": 2.218314812209875, + "grad_norm": 5.03125, + "learning_rate": 3.9351771021492686e-06, + "loss": 1.4218, + "step": 7885 + }, + { + "epoch": 2.2197214798143197, + "grad_norm": 3.28125, + "learning_rate": 3.9302667565928676e-06, + "loss": 1.6094, + "step": 7890 + }, + { + "epoch": 2.221128147418765, + "grad_norm": 2.375, + "learning_rate": 3.925356516151996e-06, + "loss": 1.4345, + "step": 7895 + }, + { + "epoch": 2.22253481502321, + "grad_norm": 5.0625, + "learning_rate": 3.920446388228319e-06, + "loss": 1.3853, + "step": 7900 + }, + { + "epoch": 2.223941482627655, + "grad_norm": 3.15625, + "learning_rate": 3.915536380223344e-06, + "loss": 1.6986, + "step": 7905 + }, + { + "epoch": 2.2253481502321, + "grad_norm": 2.96875, + "learning_rate": 3.910626499538387e-06, + "loss": 1.8287, + "step": 7910 + }, + { + "epoch": 2.2267548178365453, + "grad_norm": 3.15625, + "learning_rate": 3.9057167535745795e-06, + "loss": 1.453, + "step": 7915 + }, + { + "epoch": 2.22816148544099, + "grad_norm": 2.515625, + "learning_rate": 3.900807149732843e-06, + "loss": 1.5434, + "step": 7920 + }, + { + "epoch": 2.2295681530454354, + "grad_norm": 3.234375, + "learning_rate": 3.895897695413892e-06, + "loss": 1.6757, + "step": 7925 + }, + { + "epoch": 2.2309748206498803, + "grad_norm": 3.25, + "learning_rate": 3.890988398018212e-06, + "loss": 1.4654, + "step": 7930 + }, + { + "epoch": 2.2323814882543256, + "grad_norm": 3.9375, + "learning_rate": 3.886079264946052e-06, + "loss": 1.4485, + "step": 7935 + }, + { + "epoch": 2.2337881558587704, + "grad_norm": 2.765625, + "learning_rate": 3.881170303597412e-06, + "loss": 1.5495, + "step": 7940 + }, + { + "epoch": 2.2351948234632157, + "grad_norm": 2.703125, + "learning_rate": 3.8762615213720365e-06, + "loss": 1.4623, + "step": 7945 + }, + { + "epoch": 2.2366014910676606, + "grad_norm": 2.9375, + "learning_rate": 3.871352925669398e-06, + "loss": 1.7347, + "step": 7950 + }, + { + "epoch": 2.238008158672106, + "grad_norm": 4.96875, + "learning_rate": 3.866444523888687e-06, + "loss": 1.5133, + "step": 7955 + }, + { + "epoch": 2.2394148262765508, + "grad_norm": 3.234375, + "learning_rate": 3.861536323428805e-06, + "loss": 1.6838, + "step": 7960 + }, + { + "epoch": 2.240821493880996, + "grad_norm": 3.15625, + "learning_rate": 3.856628331688346e-06, + "loss": 1.3434, + "step": 7965 + }, + { + "epoch": 2.242228161485441, + "grad_norm": 2.34375, + "learning_rate": 3.8517205560655895e-06, + "loss": 1.6505, + "step": 7970 + }, + { + "epoch": 2.243634829089886, + "grad_norm": 3.359375, + "learning_rate": 3.846813003958493e-06, + "loss": 1.5625, + "step": 7975 + }, + { + "epoch": 2.245041496694331, + "grad_norm": 2.375, + "learning_rate": 3.841905682764676e-06, + "loss": 1.6773, + "step": 7980 + }, + { + "epoch": 2.2464481642987764, + "grad_norm": 3.53125, + "learning_rate": 3.836998599881406e-06, + "loss": 1.6122, + "step": 7985 + }, + { + "epoch": 2.247854831903221, + "grad_norm": 3.359375, + "learning_rate": 3.832091762705595e-06, + "loss": 1.705, + "step": 7990 + }, + { + "epoch": 2.2492614995076665, + "grad_norm": 3.0, + "learning_rate": 3.827185178633787e-06, + "loss": 1.5123, + "step": 7995 + }, + { + "epoch": 2.2506681671121114, + "grad_norm": 4.03125, + "learning_rate": 3.822278855062136e-06, + "loss": 1.4593, + "step": 8000 + }, + { + "epoch": 2.2520748347165567, + "grad_norm": 3.421875, + "learning_rate": 3.8173727993864115e-06, + "loss": 1.4751, + "step": 8005 + }, + { + "epoch": 2.2534815023210015, + "grad_norm": 3.34375, + "learning_rate": 3.8124670190019755e-06, + "loss": 1.3753, + "step": 8010 + }, + { + "epoch": 2.2548881699254464, + "grad_norm": 3.0625, + "learning_rate": 3.807561521303777e-06, + "loss": 1.6663, + "step": 8015 + }, + { + "epoch": 2.2562948375298917, + "grad_norm": 3.0, + "learning_rate": 3.802656313686336e-06, + "loss": 1.5228, + "step": 8020 + }, + { + "epoch": 2.257701505134337, + "grad_norm": 2.828125, + "learning_rate": 3.7977514035437383e-06, + "loss": 1.6107, + "step": 8025 + }, + { + "epoch": 2.259108172738782, + "grad_norm": 3.25, + "learning_rate": 3.7928467982696174e-06, + "loss": 1.6885, + "step": 8030 + }, + { + "epoch": 2.2605148403432267, + "grad_norm": 3.359375, + "learning_rate": 3.7879425052571525e-06, + "loss": 1.5311, + "step": 8035 + }, + { + "epoch": 2.261921507947672, + "grad_norm": 3.09375, + "learning_rate": 3.783038531899047e-06, + "loss": 1.681, + "step": 8040 + }, + { + "epoch": 2.263328175552117, + "grad_norm": 3.03125, + "learning_rate": 3.7781348855875263e-06, + "loss": 1.1735, + "step": 8045 + }, + { + "epoch": 2.264734843156562, + "grad_norm": 3.34375, + "learning_rate": 3.7732315737143205e-06, + "loss": 1.615, + "step": 8050 + }, + { + "epoch": 2.266141510761007, + "grad_norm": 3.640625, + "learning_rate": 3.768328603670658e-06, + "loss": 1.4219, + "step": 8055 + }, + { + "epoch": 2.2675481783654523, + "grad_norm": 3.9375, + "learning_rate": 3.7634259828472467e-06, + "loss": 1.5921, + "step": 8060 + }, + { + "epoch": 2.268954845969897, + "grad_norm": 4.125, + "learning_rate": 3.7585237186342743e-06, + "loss": 1.6932, + "step": 8065 + }, + { + "epoch": 2.2703615135743425, + "grad_norm": 3.65625, + "learning_rate": 3.753621818421388e-06, + "loss": 1.7495, + "step": 8070 + }, + { + "epoch": 2.2717681811787873, + "grad_norm": 3.78125, + "learning_rate": 3.7487202895976864e-06, + "loss": 1.6282, + "step": 8075 + }, + { + "epoch": 2.2731748487832326, + "grad_norm": 3.140625, + "learning_rate": 3.743819139551708e-06, + "loss": 1.6892, + "step": 8080 + }, + { + "epoch": 2.2745815163876775, + "grad_norm": 2.984375, + "learning_rate": 3.7389183756714207e-06, + "loss": 1.5165, + "step": 8085 + }, + { + "epoch": 2.2759881839921228, + "grad_norm": 3.453125, + "learning_rate": 3.7340180053442127e-06, + "loss": 1.6025, + "step": 8090 + }, + { + "epoch": 2.2773948515965676, + "grad_norm": 4.3125, + "learning_rate": 3.7291180359568735e-06, + "loss": 1.7114, + "step": 8095 + }, + { + "epoch": 2.278801519201013, + "grad_norm": 4.03125, + "learning_rate": 3.724218474895593e-06, + "loss": 1.6153, + "step": 8100 + }, + { + "epoch": 2.280208186805458, + "grad_norm": 2.875, + "learning_rate": 3.719319329545943e-06, + "loss": 1.5714, + "step": 8105 + }, + { + "epoch": 2.281614854409903, + "grad_norm": 3.234375, + "learning_rate": 3.7144206072928704e-06, + "loss": 1.7677, + "step": 8110 + }, + { + "epoch": 2.283021522014348, + "grad_norm": 3.3125, + "learning_rate": 3.709522315520683e-06, + "loss": 1.4741, + "step": 8115 + }, + { + "epoch": 2.2844281896187932, + "grad_norm": 3.734375, + "learning_rate": 3.704624461613043e-06, + "loss": 1.7835, + "step": 8120 + }, + { + "epoch": 2.285834857223238, + "grad_norm": 3.765625, + "learning_rate": 3.6997270529529445e-06, + "loss": 1.8235, + "step": 8125 + }, + { + "epoch": 2.2872415248276834, + "grad_norm": 4.25, + "learning_rate": 3.69483009692272e-06, + "loss": 1.5382, + "step": 8130 + }, + { + "epoch": 2.2886481924321282, + "grad_norm": 5.4375, + "learning_rate": 3.6899336009040132e-06, + "loss": 1.5006, + "step": 8135 + }, + { + "epoch": 2.2900548600365735, + "grad_norm": 2.96875, + "learning_rate": 3.685037572277778e-06, + "loss": 1.8186, + "step": 8140 + }, + { + "epoch": 2.2914615276410184, + "grad_norm": 2.515625, + "learning_rate": 3.6801420184242626e-06, + "loss": 1.7269, + "step": 8145 + }, + { + "epoch": 2.2928681952454637, + "grad_norm": 3.0625, + "learning_rate": 3.6752469467229975e-06, + "loss": 1.5189, + "step": 8150 + }, + { + "epoch": 2.2942748628499086, + "grad_norm": 3.75, + "learning_rate": 3.6703523645527915e-06, + "loss": 1.6616, + "step": 8155 + }, + { + "epoch": 2.2956815304543534, + "grad_norm": 4.09375, + "learning_rate": 3.6654582792917074e-06, + "loss": 1.6282, + "step": 8160 + }, + { + "epoch": 2.2970881980587987, + "grad_norm": 3.640625, + "learning_rate": 3.660564698317069e-06, + "loss": 1.5604, + "step": 8165 + }, + { + "epoch": 2.298494865663244, + "grad_norm": 4.5625, + "learning_rate": 3.6556716290054306e-06, + "loss": 1.4532, + "step": 8170 + }, + { + "epoch": 2.299901533267689, + "grad_norm": 3.59375, + "learning_rate": 3.650779078732582e-06, + "loss": 1.7224, + "step": 8175 + }, + { + "epoch": 2.3013082008721337, + "grad_norm": 4.03125, + "learning_rate": 3.6458870548735255e-06, + "loss": 1.6006, + "step": 8180 + }, + { + "epoch": 2.302714868476579, + "grad_norm": 4.03125, + "learning_rate": 3.6409955648024756e-06, + "loss": 1.3733, + "step": 8185 + }, + { + "epoch": 2.304121536081024, + "grad_norm": 3.109375, + "learning_rate": 3.6361046158928343e-06, + "loss": 1.6563, + "step": 8190 + }, + { + "epoch": 2.305528203685469, + "grad_norm": 2.796875, + "learning_rate": 3.631214215517198e-06, + "loss": 1.7411, + "step": 8195 + }, + { + "epoch": 2.306934871289914, + "grad_norm": 3.140625, + "learning_rate": 3.6263243710473258e-06, + "loss": 1.5237, + "step": 8200 + }, + { + "epoch": 2.3083415388943593, + "grad_norm": 2.921875, + "learning_rate": 3.621435089854146e-06, + "loss": 1.685, + "step": 8205 + }, + { + "epoch": 2.309748206498804, + "grad_norm": 3.84375, + "learning_rate": 3.616546379307736e-06, + "loss": 1.3735, + "step": 8210 + }, + { + "epoch": 2.3111548741032495, + "grad_norm": 3.265625, + "learning_rate": 3.611658246777311e-06, + "loss": 1.5922, + "step": 8215 + }, + { + "epoch": 2.3125615417076943, + "grad_norm": 3.5625, + "learning_rate": 3.6067706996312196e-06, + "loss": 1.6493, + "step": 8220 + }, + { + "epoch": 2.3139682093121396, + "grad_norm": 3.21875, + "learning_rate": 3.601883745236919e-06, + "loss": 1.3882, + "step": 8225 + }, + { + "epoch": 2.3153748769165845, + "grad_norm": 4.46875, + "learning_rate": 3.5969973909609857e-06, + "loss": 1.483, + "step": 8230 + }, + { + "epoch": 2.31678154452103, + "grad_norm": 3.984375, + "learning_rate": 3.592111644169079e-06, + "loss": 1.3885, + "step": 8235 + }, + { + "epoch": 2.3181882121254747, + "grad_norm": 3.53125, + "learning_rate": 3.5872265122259517e-06, + "loss": 1.4347, + "step": 8240 + }, + { + "epoch": 2.31959487972992, + "grad_norm": 2.96875, + "learning_rate": 3.5823420024954233e-06, + "loss": 1.7898, + "step": 8245 + }, + { + "epoch": 2.321001547334365, + "grad_norm": 3.125, + "learning_rate": 3.577458122340382e-06, + "loss": 1.4103, + "step": 8250 + }, + { + "epoch": 2.32240821493881, + "grad_norm": 3.8125, + "learning_rate": 3.572574879122758e-06, + "loss": 1.644, + "step": 8255 + }, + { + "epoch": 2.323814882543255, + "grad_norm": 3.046875, + "learning_rate": 3.5676922802035324e-06, + "loss": 1.5583, + "step": 8260 + }, + { + "epoch": 2.3252215501477003, + "grad_norm": 2.609375, + "learning_rate": 3.562810332942705e-06, + "loss": 1.4209, + "step": 8265 + }, + { + "epoch": 2.326628217752145, + "grad_norm": 4.75, + "learning_rate": 3.5579290446992996e-06, + "loss": 1.476, + "step": 8270 + }, + { + "epoch": 2.3280348853565904, + "grad_norm": 3.875, + "learning_rate": 3.553048422831344e-06, + "loss": 1.5637, + "step": 8275 + }, + { + "epoch": 2.3294415529610353, + "grad_norm": 3.796875, + "learning_rate": 3.548168474695862e-06, + "loss": 1.4261, + "step": 8280 + }, + { + "epoch": 2.33084822056548, + "grad_norm": 2.765625, + "learning_rate": 3.5432892076488636e-06, + "loss": 1.4481, + "step": 8285 + }, + { + "epoch": 2.3322548881699254, + "grad_norm": 3.25, + "learning_rate": 3.5384106290453275e-06, + "loss": 1.6372, + "step": 8290 + }, + { + "epoch": 2.3336615557743707, + "grad_norm": 3.0625, + "learning_rate": 3.5335327462392014e-06, + "loss": 1.6052, + "step": 8295 + }, + { + "epoch": 2.3350682233788156, + "grad_norm": 3.8125, + "learning_rate": 3.5286555665833763e-06, + "loss": 1.5632, + "step": 8300 + }, + { + "epoch": 2.3364748909832604, + "grad_norm": 2.078125, + "learning_rate": 3.52377909742969e-06, + "loss": 1.6063, + "step": 8305 + }, + { + "epoch": 2.3378815585877057, + "grad_norm": 3.40625, + "learning_rate": 3.5189033461289057e-06, + "loss": 1.1783, + "step": 8310 + }, + { + "epoch": 2.339288226192151, + "grad_norm": 3.5625, + "learning_rate": 3.514028320030706e-06, + "loss": 1.6411, + "step": 8315 + }, + { + "epoch": 2.340694893796596, + "grad_norm": 2.515625, + "learning_rate": 3.5091540264836788e-06, + "loss": 1.5097, + "step": 8320 + }, + { + "epoch": 2.3421015614010408, + "grad_norm": 5.40625, + "learning_rate": 3.5042804728353112e-06, + "loss": 1.6188, + "step": 8325 + }, + { + "epoch": 2.343508229005486, + "grad_norm": 3.875, + "learning_rate": 3.499407666431969e-06, + "loss": 1.5711, + "step": 8330 + }, + { + "epoch": 2.344914896609931, + "grad_norm": 4.03125, + "learning_rate": 3.4945356146188977e-06, + "loss": 1.2004, + "step": 8335 + }, + { + "epoch": 2.346321564214376, + "grad_norm": 3.953125, + "learning_rate": 3.489664324740201e-06, + "loss": 1.7899, + "step": 8340 + }, + { + "epoch": 2.347728231818821, + "grad_norm": 4.8125, + "learning_rate": 3.4847938041388376e-06, + "loss": 1.6106, + "step": 8345 + }, + { + "epoch": 2.3491348994232664, + "grad_norm": 2.40625, + "learning_rate": 3.4799240601566036e-06, + "loss": 1.4558, + "step": 8350 + }, + { + "epoch": 2.350541567027711, + "grad_norm": 3.25, + "learning_rate": 3.4750551001341257e-06, + "loss": 1.4972, + "step": 8355 + }, + { + "epoch": 2.3519482346321565, + "grad_norm": 3.96875, + "learning_rate": 3.4701869314108503e-06, + "loss": 1.4276, + "step": 8360 + }, + { + "epoch": 2.3533549022366014, + "grad_norm": 2.90625, + "learning_rate": 3.465319561325027e-06, + "loss": 1.6449, + "step": 8365 + }, + { + "epoch": 2.3547615698410467, + "grad_norm": 3.53125, + "learning_rate": 3.460452997213707e-06, + "loss": 1.6125, + "step": 8370 + }, + { + "epoch": 2.3561682374454915, + "grad_norm": 3.8125, + "learning_rate": 3.4555872464127207e-06, + "loss": 1.443, + "step": 8375 + }, + { + "epoch": 2.357574905049937, + "grad_norm": 3.390625, + "learning_rate": 3.4507223162566776e-06, + "loss": 1.5425, + "step": 8380 + }, + { + "epoch": 2.3589815726543817, + "grad_norm": 3.890625, + "learning_rate": 3.445858214078946e-06, + "loss": 1.5239, + "step": 8385 + }, + { + "epoch": 2.360388240258827, + "grad_norm": 4.28125, + "learning_rate": 3.440994947211652e-06, + "loss": 1.3468, + "step": 8390 + }, + { + "epoch": 2.361794907863272, + "grad_norm": 3.140625, + "learning_rate": 3.4361325229856537e-06, + "loss": 1.7437, + "step": 8395 + }, + { + "epoch": 2.363201575467717, + "grad_norm": 2.96875, + "learning_rate": 3.4312709487305474e-06, + "loss": 1.6956, + "step": 8400 + }, + { + "epoch": 2.364608243072162, + "grad_norm": 5.5, + "learning_rate": 3.4264102317746424e-06, + "loss": 1.631, + "step": 8405 + }, + { + "epoch": 2.3660149106766073, + "grad_norm": 2.96875, + "learning_rate": 3.4215503794449613e-06, + "loss": 1.6996, + "step": 8410 + }, + { + "epoch": 2.367421578281052, + "grad_norm": 2.734375, + "learning_rate": 3.416691399067217e-06, + "loss": 1.6483, + "step": 8415 + }, + { + "epoch": 2.3688282458854975, + "grad_norm": 2.546875, + "learning_rate": 3.4118332979658116e-06, + "loss": 1.5546, + "step": 8420 + }, + { + "epoch": 2.3702349134899423, + "grad_norm": 2.953125, + "learning_rate": 3.406976083463824e-06, + "loss": 1.7836, + "step": 8425 + }, + { + "epoch": 2.371641581094387, + "grad_norm": 4.21875, + "learning_rate": 3.4021197628829902e-06, + "loss": 1.5147, + "step": 8430 + }, + { + "epoch": 2.3730482486988325, + "grad_norm": 2.71875, + "learning_rate": 3.3972643435437062e-06, + "loss": 1.6279, + "step": 8435 + }, + { + "epoch": 2.3744549163032778, + "grad_norm": 2.359375, + "learning_rate": 3.392409832765002e-06, + "loss": 1.8905, + "step": 8440 + }, + { + "epoch": 2.3758615839077226, + "grad_norm": 3.296875, + "learning_rate": 3.387556237864545e-06, + "loss": 1.6089, + "step": 8445 + }, + { + "epoch": 2.3772682515121675, + "grad_norm": 2.609375, + "learning_rate": 3.3827035661586165e-06, + "loss": 1.3313, + "step": 8450 + }, + { + "epoch": 2.3786749191166128, + "grad_norm": 3.15625, + "learning_rate": 3.3778518249621117e-06, + "loss": 1.5232, + "step": 8455 + }, + { + "epoch": 2.3800815867210576, + "grad_norm": 2.90625, + "learning_rate": 3.3730010215885155e-06, + "loss": 1.6252, + "step": 8460 + }, + { + "epoch": 2.381488254325503, + "grad_norm": 3.234375, + "learning_rate": 3.368151163349907e-06, + "loss": 1.397, + "step": 8465 + }, + { + "epoch": 2.382894921929948, + "grad_norm": 3.828125, + "learning_rate": 3.363302257556935e-06, + "loss": 1.6473, + "step": 8470 + }, + { + "epoch": 2.384301589534393, + "grad_norm": 3.765625, + "learning_rate": 3.3584543115188167e-06, + "loss": 1.6578, + "step": 8475 + }, + { + "epoch": 2.385708257138838, + "grad_norm": 2.421875, + "learning_rate": 3.353607332543319e-06, + "loss": 1.4938, + "step": 8480 + }, + { + "epoch": 2.3871149247432832, + "grad_norm": 2.375, + "learning_rate": 3.348761327936755e-06, + "loss": 1.5833, + "step": 8485 + }, + { + "epoch": 2.388521592347728, + "grad_norm": 3.125, + "learning_rate": 3.3439163050039637e-06, + "loss": 1.5118, + "step": 8490 + }, + { + "epoch": 2.3899282599521734, + "grad_norm": 5.875, + "learning_rate": 3.339072271048308e-06, + "loss": 1.635, + "step": 8495 + }, + { + "epoch": 2.3913349275566183, + "grad_norm": 3.0, + "learning_rate": 3.3342292333716626e-06, + "loss": 1.5718, + "step": 8500 + }, + { + "epoch": 2.3927415951610636, + "grad_norm": 3.171875, + "learning_rate": 3.3293871992743935e-06, + "loss": 1.6902, + "step": 8505 + }, + { + "epoch": 2.3941482627655084, + "grad_norm": 3.609375, + "learning_rate": 3.32454617605536e-06, + "loss": 1.6936, + "step": 8510 + }, + { + "epoch": 2.3955549303699537, + "grad_norm": 3.859375, + "learning_rate": 3.3197061710118926e-06, + "loss": 1.6683, + "step": 8515 + }, + { + "epoch": 2.3969615979743986, + "grad_norm": 2.515625, + "learning_rate": 3.314867191439794e-06, + "loss": 1.5576, + "step": 8520 + }, + { + "epoch": 2.398368265578844, + "grad_norm": 2.59375, + "learning_rate": 3.3100292446333103e-06, + "loss": 1.737, + "step": 8525 + }, + { + "epoch": 2.3997749331832887, + "grad_norm": 2.859375, + "learning_rate": 3.305192337885144e-06, + "loss": 1.6273, + "step": 8530 + }, + { + "epoch": 2.401181600787734, + "grad_norm": 3.0625, + "learning_rate": 3.3003564784864185e-06, + "loss": 1.8476, + "step": 8535 + }, + { + "epoch": 2.402588268392179, + "grad_norm": 2.75, + "learning_rate": 3.2955216737266854e-06, + "loss": 1.6457, + "step": 8540 + }, + { + "epoch": 2.403994935996624, + "grad_norm": 3.828125, + "learning_rate": 3.2906879308939024e-06, + "loss": 1.6365, + "step": 8545 + }, + { + "epoch": 2.405401603601069, + "grad_norm": 2.75, + "learning_rate": 3.2858552572744306e-06, + "loss": 1.7155, + "step": 8550 + }, + { + "epoch": 2.4068082712055143, + "grad_norm": 4.40625, + "learning_rate": 3.2810236601530134e-06, + "loss": 1.7192, + "step": 8555 + }, + { + "epoch": 2.408214938809959, + "grad_norm": 3.171875, + "learning_rate": 3.2761931468127777e-06, + "loss": 1.5524, + "step": 8560 + }, + { + "epoch": 2.4096216064144045, + "grad_norm": 5.9375, + "learning_rate": 3.2713637245352154e-06, + "loss": 1.4309, + "step": 8565 + }, + { + "epoch": 2.4110282740188493, + "grad_norm": 9.5625, + "learning_rate": 3.2665354006001687e-06, + "loss": 1.5507, + "step": 8570 + }, + { + "epoch": 2.412434941623294, + "grad_norm": 4.375, + "learning_rate": 3.2617081822858303e-06, + "loss": 1.7103, + "step": 8575 + }, + { + "epoch": 2.4138416092277395, + "grad_norm": 3.921875, + "learning_rate": 3.256882076868723e-06, + "loss": 1.2869, + "step": 8580 + }, + { + "epoch": 2.415248276832185, + "grad_norm": 4.5, + "learning_rate": 3.252057091623695e-06, + "loss": 1.554, + "step": 8585 + }, + { + "epoch": 2.4166549444366296, + "grad_norm": 4.21875, + "learning_rate": 3.2472332338238994e-06, + "loss": 1.5645, + "step": 8590 + }, + { + "epoch": 2.4180616120410745, + "grad_norm": 2.9375, + "learning_rate": 3.2424105107407996e-06, + "loss": 1.6879, + "step": 8595 + }, + { + "epoch": 2.41946827964552, + "grad_norm": 4.03125, + "learning_rate": 3.237588929644139e-06, + "loss": 1.507, + "step": 8600 + }, + { + "epoch": 2.4208749472499647, + "grad_norm": 2.984375, + "learning_rate": 3.2327684978019464e-06, + "loss": 1.4155, + "step": 8605 + }, + { + "epoch": 2.42228161485441, + "grad_norm": 3.34375, + "learning_rate": 3.227949222480513e-06, + "loss": 1.5608, + "step": 8610 + }, + { + "epoch": 2.423688282458855, + "grad_norm": 3.375, + "learning_rate": 3.223131110944393e-06, + "loss": 1.6058, + "step": 8615 + }, + { + "epoch": 2.4250949500633, + "grad_norm": 4.125, + "learning_rate": 3.218314170456378e-06, + "loss": 1.5107, + "step": 8620 + }, + { + "epoch": 2.426501617667745, + "grad_norm": 2.140625, + "learning_rate": 3.2134984082775036e-06, + "loss": 1.86, + "step": 8625 + }, + { + "epoch": 2.4279082852721903, + "grad_norm": 2.75, + "learning_rate": 3.2086838316670204e-06, + "loss": 1.5441, + "step": 8630 + }, + { + "epoch": 2.429314952876635, + "grad_norm": 3.3125, + "learning_rate": 3.2038704478823983e-06, + "loss": 1.6657, + "step": 8635 + }, + { + "epoch": 2.4307216204810804, + "grad_norm": 4.03125, + "learning_rate": 3.1990582641793078e-06, + "loss": 1.6059, + "step": 8640 + }, + { + "epoch": 2.4321282880855253, + "grad_norm": 3.1875, + "learning_rate": 3.1942472878116066e-06, + "loss": 1.5391, + "step": 8645 + }, + { + "epoch": 2.4335349556899706, + "grad_norm": 3.703125, + "learning_rate": 3.1894375260313384e-06, + "loss": 1.4821, + "step": 8650 + }, + { + "epoch": 2.4349416232944154, + "grad_norm": 3.734375, + "learning_rate": 3.18462898608871e-06, + "loss": 1.581, + "step": 8655 + }, + { + "epoch": 2.4363482908988607, + "grad_norm": 2.59375, + "learning_rate": 3.1798216752320934e-06, + "loss": 1.6751, + "step": 8660 + }, + { + "epoch": 2.4377549585033056, + "grad_norm": 3.78125, + "learning_rate": 3.175015600707999e-06, + "loss": 1.4178, + "step": 8665 + }, + { + "epoch": 2.439161626107751, + "grad_norm": 3.078125, + "learning_rate": 3.1702107697610825e-06, + "loss": 1.4115, + "step": 8670 + }, + { + "epoch": 2.4405682937121957, + "grad_norm": 4.34375, + "learning_rate": 3.1654071896341184e-06, + "loss": 1.354, + "step": 8675 + }, + { + "epoch": 2.441974961316641, + "grad_norm": 3.0, + "learning_rate": 3.1606048675680002e-06, + "loss": 1.7495, + "step": 8680 + }, + { + "epoch": 2.443381628921086, + "grad_norm": 3.15625, + "learning_rate": 3.1558038108017213e-06, + "loss": 1.6442, + "step": 8685 + }, + { + "epoch": 2.444788296525531, + "grad_norm": 3.78125, + "learning_rate": 3.151004026572372e-06, + "loss": 1.5592, + "step": 8690 + }, + { + "epoch": 2.446194964129976, + "grad_norm": 2.75, + "learning_rate": 3.146205522115119e-06, + "loss": 1.7754, + "step": 8695 + }, + { + "epoch": 2.447601631734421, + "grad_norm": 2.828125, + "learning_rate": 3.141408304663205e-06, + "loss": 1.461, + "step": 8700 + }, + { + "epoch": 2.449008299338866, + "grad_norm": 3.21875, + "learning_rate": 3.1366123814479293e-06, + "loss": 1.3845, + "step": 8705 + }, + { + "epoch": 2.4504149669433115, + "grad_norm": 3.5625, + "learning_rate": 3.1318177596986425e-06, + "loss": 1.4798, + "step": 8710 + }, + { + "epoch": 2.4518216345477564, + "grad_norm": 2.859375, + "learning_rate": 3.127024446642732e-06, + "loss": 1.702, + "step": 8715 + }, + { + "epoch": 2.4532283021522012, + "grad_norm": 4.96875, + "learning_rate": 3.1222324495056124e-06, + "loss": 1.8042, + "step": 8720 + }, + { + "epoch": 2.4546349697566465, + "grad_norm": 2.640625, + "learning_rate": 3.1174417755107177e-06, + "loss": 1.5827, + "step": 8725 + }, + { + "epoch": 2.456041637361092, + "grad_norm": 3.515625, + "learning_rate": 3.112652431879481e-06, + "loss": 1.7237, + "step": 8730 + }, + { + "epoch": 2.4574483049655367, + "grad_norm": 3.234375, + "learning_rate": 3.1078644258313365e-06, + "loss": 1.5673, + "step": 8735 + }, + { + "epoch": 2.4588549725699815, + "grad_norm": 2.859375, + "learning_rate": 3.1030777645836974e-06, + "loss": 1.5594, + "step": 8740 + }, + { + "epoch": 2.460261640174427, + "grad_norm": 3.09375, + "learning_rate": 3.0982924553519548e-06, + "loss": 1.4208, + "step": 8745 + }, + { + "epoch": 2.4616683077788717, + "grad_norm": 3.046875, + "learning_rate": 3.0935085053494557e-06, + "loss": 1.5434, + "step": 8750 + }, + { + "epoch": 2.463074975383317, + "grad_norm": 3.390625, + "learning_rate": 3.088725921787505e-06, + "loss": 1.6963, + "step": 8755 + }, + { + "epoch": 2.464481642987762, + "grad_norm": 3.34375, + "learning_rate": 3.0839447118753407e-06, + "loss": 1.5322, + "step": 8760 + }, + { + "epoch": 2.465888310592207, + "grad_norm": 4.75, + "learning_rate": 3.0791648828201354e-06, + "loss": 1.6891, + "step": 8765 + }, + { + "epoch": 2.467294978196652, + "grad_norm": 3.203125, + "learning_rate": 3.0743864418269777e-06, + "loss": 1.652, + "step": 8770 + }, + { + "epoch": 2.4687016458010973, + "grad_norm": 3.0625, + "learning_rate": 3.069609396098865e-06, + "loss": 1.5267, + "step": 8775 + }, + { + "epoch": 2.470108313405542, + "grad_norm": 3.6875, + "learning_rate": 3.064833752836692e-06, + "loss": 1.4356, + "step": 8780 + }, + { + "epoch": 2.4715149810099875, + "grad_norm": 3.546875, + "learning_rate": 3.0600595192392364e-06, + "loss": 1.6106, + "step": 8785 + }, + { + "epoch": 2.4729216486144323, + "grad_norm": 2.90625, + "learning_rate": 3.055286702503156e-06, + "loss": 1.4059, + "step": 8790 + }, + { + "epoch": 2.4743283162188776, + "grad_norm": 3.875, + "learning_rate": 3.050515309822966e-06, + "loss": 1.5044, + "step": 8795 + }, + { + "epoch": 2.4757349838233225, + "grad_norm": 2.703125, + "learning_rate": 3.0457453483910417e-06, + "loss": 1.5502, + "step": 8800 + }, + { + "epoch": 2.4771416514277678, + "grad_norm": 3.421875, + "learning_rate": 3.0409768253975967e-06, + "loss": 1.6319, + "step": 8805 + }, + { + "epoch": 2.4785483190322126, + "grad_norm": 3.15625, + "learning_rate": 3.0362097480306787e-06, + "loss": 1.4717, + "step": 8810 + }, + { + "epoch": 2.479954986636658, + "grad_norm": 3.0, + "learning_rate": 3.031444123476154e-06, + "loss": 1.702, + "step": 8815 + }, + { + "epoch": 2.4813616542411028, + "grad_norm": 4.3125, + "learning_rate": 3.0266799589177023e-06, + "loss": 1.7863, + "step": 8820 + }, + { + "epoch": 2.482768321845548, + "grad_norm": 3.9375, + "learning_rate": 3.021917261536797e-06, + "loss": 1.1797, + "step": 8825 + }, + { + "epoch": 2.484174989449993, + "grad_norm": 3.5625, + "learning_rate": 3.0171560385127066e-06, + "loss": 1.608, + "step": 8830 + }, + { + "epoch": 2.4855816570544382, + "grad_norm": 3.609375, + "learning_rate": 3.012396297022471e-06, + "loss": 1.2161, + "step": 8835 + }, + { + "epoch": 2.486988324658883, + "grad_norm": 3.1875, + "learning_rate": 3.0076380442409023e-06, + "loss": 1.6565, + "step": 8840 + }, + { + "epoch": 2.488394992263328, + "grad_norm": 2.1875, + "learning_rate": 3.0028812873405636e-06, + "loss": 1.745, + "step": 8845 + }, + { + "epoch": 2.4898016598677732, + "grad_norm": 3.03125, + "learning_rate": 2.9981260334917666e-06, + "loss": 1.6035, + "step": 8850 + }, + { + "epoch": 2.4912083274722185, + "grad_norm": 3.3125, + "learning_rate": 2.9933722898625575e-06, + "loss": 1.558, + "step": 8855 + }, + { + "epoch": 2.4926149950766634, + "grad_norm": 3.40625, + "learning_rate": 2.988620063618701e-06, + "loss": 1.6606, + "step": 8860 + }, + { + "epoch": 2.4940216626811083, + "grad_norm": 4.25, + "learning_rate": 2.9838693619236823e-06, + "loss": 1.5781, + "step": 8865 + }, + { + "epoch": 2.4954283302855536, + "grad_norm": 3.203125, + "learning_rate": 2.9791201919386807e-06, + "loss": 1.6853, + "step": 8870 + }, + { + "epoch": 2.4968349978899984, + "grad_norm": 2.8125, + "learning_rate": 2.974372560822573e-06, + "loss": 1.5243, + "step": 8875 + }, + { + "epoch": 2.4982416654944437, + "grad_norm": 4.8125, + "learning_rate": 2.9696264757319113e-06, + "loss": 1.5861, + "step": 8880 + }, + { + "epoch": 2.4996483330988886, + "grad_norm": 4.21875, + "learning_rate": 2.9648819438209228e-06, + "loss": 1.6032, + "step": 8885 + }, + { + "epoch": 2.501055000703334, + "grad_norm": 2.765625, + "learning_rate": 2.960138972241485e-06, + "loss": 1.6981, + "step": 8890 + }, + { + "epoch": 2.5024616683077787, + "grad_norm": 2.859375, + "learning_rate": 2.955397568143134e-06, + "loss": 1.5882, + "step": 8895 + }, + { + "epoch": 2.503868335912224, + "grad_norm": 3.171875, + "learning_rate": 2.950657738673033e-06, + "loss": 1.5093, + "step": 8900 + }, + { + "epoch": 2.505275003516669, + "grad_norm": 3.0625, + "learning_rate": 2.945919490975979e-06, + "loss": 1.7241, + "step": 8905 + }, + { + "epoch": 2.506681671121114, + "grad_norm": 3.703125, + "learning_rate": 2.9411828321943804e-06, + "loss": 1.546, + "step": 8910 + }, + { + "epoch": 2.508088338725559, + "grad_norm": 3.109375, + "learning_rate": 2.9364477694682546e-06, + "loss": 1.6981, + "step": 8915 + }, + { + "epoch": 2.5094950063300043, + "grad_norm": 3.015625, + "learning_rate": 2.9317143099352056e-06, + "loss": 1.5693, + "step": 8920 + }, + { + "epoch": 2.510901673934449, + "grad_norm": 3.078125, + "learning_rate": 2.926982460730429e-06, + "loss": 1.436, + "step": 8925 + }, + { + "epoch": 2.5123083415388945, + "grad_norm": 3.625, + "learning_rate": 2.922252228986691e-06, + "loss": 1.7639, + "step": 8930 + }, + { + "epoch": 2.5137150091433393, + "grad_norm": 3.453125, + "learning_rate": 2.917523621834314e-06, + "loss": 1.568, + "step": 8935 + }, + { + "epoch": 2.5151216767477846, + "grad_norm": 2.765625, + "learning_rate": 2.9127966464011787e-06, + "loss": 1.7766, + "step": 8940 + }, + { + "epoch": 2.5165283443522295, + "grad_norm": 3.578125, + "learning_rate": 2.908071309812702e-06, + "loss": 1.4208, + "step": 8945 + }, + { + "epoch": 2.517935011956675, + "grad_norm": 3.265625, + "learning_rate": 2.9033476191918338e-06, + "loss": 1.7117, + "step": 8950 + }, + { + "epoch": 2.5193416795611197, + "grad_norm": 3.203125, + "learning_rate": 2.8986255816590365e-06, + "loss": 1.8485, + "step": 8955 + }, + { + "epoch": 2.520748347165565, + "grad_norm": 3.34375, + "learning_rate": 2.8939052043322895e-06, + "loss": 1.6935, + "step": 8960 + }, + { + "epoch": 2.52215501477001, + "grad_norm": 3.4375, + "learning_rate": 2.8891864943270603e-06, + "loss": 1.6999, + "step": 8965 + }, + { + "epoch": 2.5235616823744547, + "grad_norm": 3.671875, + "learning_rate": 2.884469458756312e-06, + "loss": 1.4383, + "step": 8970 + }, + { + "epoch": 2.5249683499789, + "grad_norm": 3.390625, + "learning_rate": 2.8797541047304764e-06, + "loss": 1.5988, + "step": 8975 + }, + { + "epoch": 2.5263750175833453, + "grad_norm": 3.59375, + "learning_rate": 2.875040439357456e-06, + "loss": 1.6122, + "step": 8980 + }, + { + "epoch": 2.52778168518779, + "grad_norm": 4.5625, + "learning_rate": 2.8703284697426015e-06, + "loss": 1.5492, + "step": 8985 + }, + { + "epoch": 2.529188352792235, + "grad_norm": 3.71875, + "learning_rate": 2.8656182029887148e-06, + "loss": 1.2839, + "step": 8990 + }, + { + "epoch": 2.5305950203966803, + "grad_norm": 2.40625, + "learning_rate": 2.8609096461960276e-06, + "loss": 1.8007, + "step": 8995 + }, + { + "epoch": 2.5320016880011256, + "grad_norm": 3.3125, + "learning_rate": 2.8562028064621917e-06, + "loss": 1.4047, + "step": 9000 + }, + { + "epoch": 2.5334083556055704, + "grad_norm": 3.21875, + "learning_rate": 2.851497690882274e-06, + "loss": 1.5034, + "step": 9005 + }, + { + "epoch": 2.5348150232100153, + "grad_norm": 2.953125, + "learning_rate": 2.84679430654874e-06, + "loss": 1.6142, + "step": 9010 + }, + { + "epoch": 2.5362216908144606, + "grad_norm": 3.9375, + "learning_rate": 2.842092660551448e-06, + "loss": 1.3421, + "step": 9015 + }, + { + "epoch": 2.537628358418906, + "grad_norm": 2.984375, + "learning_rate": 2.837392759977634e-06, + "loss": 1.6882, + "step": 9020 + }, + { + "epoch": 2.5390350260233507, + "grad_norm": 4.34375, + "learning_rate": 2.832694611911905e-06, + "loss": 1.4796, + "step": 9025 + }, + { + "epoch": 2.5404416936277956, + "grad_norm": 4.09375, + "learning_rate": 2.8279982234362223e-06, + "loss": 1.4512, + "step": 9030 + }, + { + "epoch": 2.541848361232241, + "grad_norm": 5.0, + "learning_rate": 2.8233036016299e-06, + "loss": 1.6511, + "step": 9035 + }, + { + "epoch": 2.5432550288366857, + "grad_norm": 2.796875, + "learning_rate": 2.818610753569583e-06, + "loss": 1.5149, + "step": 9040 + }, + { + "epoch": 2.544661696441131, + "grad_norm": 3.40625, + "learning_rate": 2.8139196863292497e-06, + "loss": 1.4205, + "step": 9045 + }, + { + "epoch": 2.546068364045576, + "grad_norm": 3.921875, + "learning_rate": 2.8092304069801875e-06, + "loss": 1.4803, + "step": 9050 + }, + { + "epoch": 2.547475031650021, + "grad_norm": 3.84375, + "learning_rate": 2.8045429225909953e-06, + "loss": 1.5202, + "step": 9055 + }, + { + "epoch": 2.548881699254466, + "grad_norm": 2.90625, + "learning_rate": 2.799857240227558e-06, + "loss": 1.5887, + "step": 9060 + }, + { + "epoch": 2.5502883668589114, + "grad_norm": 2.734375, + "learning_rate": 2.795173366953051e-06, + "loss": 1.6058, + "step": 9065 + }, + { + "epoch": 2.551695034463356, + "grad_norm": 3.796875, + "learning_rate": 2.7904913098279213e-06, + "loss": 1.6265, + "step": 9070 + }, + { + "epoch": 2.5531017020678015, + "grad_norm": 3.703125, + "learning_rate": 2.7858110759098753e-06, + "loss": 1.4793, + "step": 9075 + }, + { + "epoch": 2.5545083696722464, + "grad_norm": 3.4375, + "learning_rate": 2.7811326722538755e-06, + "loss": 1.5202, + "step": 9080 + }, + { + "epoch": 2.5559150372766917, + "grad_norm": 2.6875, + "learning_rate": 2.776456105912121e-06, + "loss": 1.6495, + "step": 9085 + }, + { + "epoch": 2.5573217048811365, + "grad_norm": 3.390625, + "learning_rate": 2.771781383934046e-06, + "loss": 1.6192, + "step": 9090 + }, + { + "epoch": 2.5587283724855814, + "grad_norm": 3.53125, + "learning_rate": 2.767108513366299e-06, + "loss": 1.4899, + "step": 9095 + }, + { + "epoch": 2.5601350400900267, + "grad_norm": 3.390625, + "learning_rate": 2.7624375012527423e-06, + "loss": 1.6238, + "step": 9100 + }, + { + "epoch": 2.561541707694472, + "grad_norm": 3.453125, + "learning_rate": 2.757768354634435e-06, + "loss": 1.142, + "step": 9105 + }, + { + "epoch": 2.562948375298917, + "grad_norm": 3.359375, + "learning_rate": 2.7531010805496245e-06, + "loss": 1.5075, + "step": 9110 + }, + { + "epoch": 2.5643550429033617, + "grad_norm": 2.703125, + "learning_rate": 2.748435686033735e-06, + "loss": 1.5947, + "step": 9115 + }, + { + "epoch": 2.565761710507807, + "grad_norm": 4.71875, + "learning_rate": 2.7437721781193596e-06, + "loss": 1.5573, + "step": 9120 + }, + { + "epoch": 2.5671683781122523, + "grad_norm": 2.828125, + "learning_rate": 2.7391105638362422e-06, + "loss": 1.4712, + "step": 9125 + }, + { + "epoch": 2.568575045716697, + "grad_norm": 4.53125, + "learning_rate": 2.734450850211278e-06, + "loss": 1.5379, + "step": 9130 + }, + { + "epoch": 2.569981713321142, + "grad_norm": 3.0, + "learning_rate": 2.7297930442684958e-06, + "loss": 1.2337, + "step": 9135 + }, + { + "epoch": 2.5713883809255873, + "grad_norm": 3.296875, + "learning_rate": 2.7251371530290464e-06, + "loss": 1.5338, + "step": 9140 + }, + { + "epoch": 2.5727950485300326, + "grad_norm": 3.609375, + "learning_rate": 2.720483183511197e-06, + "loss": 1.3647, + "step": 9145 + }, + { + "epoch": 2.5742017161344775, + "grad_norm": 3.625, + "learning_rate": 2.715831142730316e-06, + "loss": 1.4631, + "step": 9150 + }, + { + "epoch": 2.5756083837389223, + "grad_norm": 4.34375, + "learning_rate": 2.711181037698867e-06, + "loss": 1.4525, + "step": 9155 + }, + { + "epoch": 2.5770150513433676, + "grad_norm": 4.53125, + "learning_rate": 2.706532875426392e-06, + "loss": 1.6094, + "step": 9160 + }, + { + "epoch": 2.5784217189478125, + "grad_norm": 3.984375, + "learning_rate": 2.7018866629195077e-06, + "loss": 1.4726, + "step": 9165 + }, + { + "epoch": 2.5798283865522578, + "grad_norm": 2.953125, + "learning_rate": 2.69724240718189e-06, + "loss": 1.507, + "step": 9170 + }, + { + "epoch": 2.5812350541567026, + "grad_norm": 2.5625, + "learning_rate": 2.692600115214267e-06, + "loss": 1.586, + "step": 9175 + }, + { + "epoch": 2.582641721761148, + "grad_norm": 3.71875, + "learning_rate": 2.6879597940144038e-06, + "loss": 1.5724, + "step": 9180 + }, + { + "epoch": 2.584048389365593, + "grad_norm": 4.5625, + "learning_rate": 2.683321450577098e-06, + "loss": 1.6215, + "step": 9185 + }, + { + "epoch": 2.585455056970038, + "grad_norm": 3.1875, + "learning_rate": 2.678685091894162e-06, + "loss": 1.6285, + "step": 9190 + }, + { + "epoch": 2.586861724574483, + "grad_norm": 3.671875, + "learning_rate": 2.674050724954421e-06, + "loss": 1.4316, + "step": 9195 + }, + { + "epoch": 2.5882683921789282, + "grad_norm": 3.265625, + "learning_rate": 2.6694183567436936e-06, + "loss": 1.4817, + "step": 9200 + }, + { + "epoch": 2.589675059783373, + "grad_norm": 2.65625, + "learning_rate": 2.664787994244788e-06, + "loss": 1.5046, + "step": 9205 + }, + { + "epoch": 2.5910817273878184, + "grad_norm": 3.234375, + "learning_rate": 2.66015964443749e-06, + "loss": 1.4744, + "step": 9210 + }, + { + "epoch": 2.5924883949922632, + "grad_norm": 3.234375, + "learning_rate": 2.655533314298548e-06, + "loss": 1.6461, + "step": 9215 + }, + { + "epoch": 2.5938950625967085, + "grad_norm": 2.953125, + "learning_rate": 2.6509090108016707e-06, + "loss": 1.6825, + "step": 9220 + }, + { + "epoch": 2.5953017302011534, + "grad_norm": 4.3125, + "learning_rate": 2.646286740917504e-06, + "loss": 1.5424, + "step": 9225 + }, + { + "epoch": 2.5967083978055987, + "grad_norm": 10.125, + "learning_rate": 2.641666511613639e-06, + "loss": 1.5151, + "step": 9230 + }, + { + "epoch": 2.5981150654100436, + "grad_norm": 3.1875, + "learning_rate": 2.637048329854581e-06, + "loss": 1.6234, + "step": 9235 + }, + { + "epoch": 2.5995217330144884, + "grad_norm": 3.0, + "learning_rate": 2.632432202601755e-06, + "loss": 1.7191, + "step": 9240 + }, + { + "epoch": 2.6009284006189337, + "grad_norm": 4.09375, + "learning_rate": 2.6278181368134873e-06, + "loss": 1.7231, + "step": 9245 + }, + { + "epoch": 2.602335068223379, + "grad_norm": 3.046875, + "learning_rate": 2.623206139444997e-06, + "loss": 1.6375, + "step": 9250 + }, + { + "epoch": 2.603741735827824, + "grad_norm": 3.46875, + "learning_rate": 2.6185962174483815e-06, + "loss": 1.4945, + "step": 9255 + }, + { + "epoch": 2.6051484034322687, + "grad_norm": 2.828125, + "learning_rate": 2.6139883777726185e-06, + "loss": 1.7805, + "step": 9260 + }, + { + "epoch": 2.606555071036714, + "grad_norm": 2.640625, + "learning_rate": 2.609382627363536e-06, + "loss": 1.6192, + "step": 9265 + }, + { + "epoch": 2.6079617386411593, + "grad_norm": 4.09375, + "learning_rate": 2.6047789731638224e-06, + "loss": 1.4136, + "step": 9270 + }, + { + "epoch": 2.609368406245604, + "grad_norm": 3.9375, + "learning_rate": 2.600177422112999e-06, + "loss": 1.5048, + "step": 9275 + }, + { + "epoch": 2.610775073850049, + "grad_norm": 3.078125, + "learning_rate": 2.5955779811474213e-06, + "loss": 1.6703, + "step": 9280 + }, + { + "epoch": 2.6121817414544943, + "grad_norm": 3.34375, + "learning_rate": 2.5909806572002634e-06, + "loss": 1.6786, + "step": 9285 + }, + { + "epoch": 2.6135884090589396, + "grad_norm": 4.875, + "learning_rate": 2.5863854572015057e-06, + "loss": 1.6156, + "step": 9290 + }, + { + "epoch": 2.6149950766633845, + "grad_norm": 3.0, + "learning_rate": 2.5817923880779308e-06, + "loss": 1.6928, + "step": 9295 + }, + { + "epoch": 2.6164017442678293, + "grad_norm": 2.875, + "learning_rate": 2.577201456753104e-06, + "loss": 1.6488, + "step": 9300 + }, + { + "epoch": 2.6178084118722746, + "grad_norm": 2.65625, + "learning_rate": 2.572612670147374e-06, + "loss": 1.6353, + "step": 9305 + }, + { + "epoch": 2.6192150794767195, + "grad_norm": 3.125, + "learning_rate": 2.5680260351778523e-06, + "loss": 1.4599, + "step": 9310 + }, + { + "epoch": 2.620621747081165, + "grad_norm": 2.265625, + "learning_rate": 2.56344155875841e-06, + "loss": 1.4838, + "step": 9315 + }, + { + "epoch": 2.6220284146856097, + "grad_norm": 3.34375, + "learning_rate": 2.55885924779966e-06, + "loss": 1.5812, + "step": 9320 + }, + { + "epoch": 2.623435082290055, + "grad_norm": 3.234375, + "learning_rate": 2.5542791092089586e-06, + "loss": 1.4303, + "step": 9325 + }, + { + "epoch": 2.6248417498945, + "grad_norm": 2.828125, + "learning_rate": 2.549701149890377e-06, + "loss": 1.6144, + "step": 9330 + }, + { + "epoch": 2.626248417498945, + "grad_norm": 3.4375, + "learning_rate": 2.545125376744712e-06, + "loss": 1.588, + "step": 9335 + }, + { + "epoch": 2.62765508510339, + "grad_norm": 3.046875, + "learning_rate": 2.540551796669457e-06, + "loss": 1.6687, + "step": 9340 + }, + { + "epoch": 2.6290617527078353, + "grad_norm": 4.21875, + "learning_rate": 2.535980416558804e-06, + "loss": 1.5527, + "step": 9345 + }, + { + "epoch": 2.63046842031228, + "grad_norm": 2.453125, + "learning_rate": 2.531411243303629e-06, + "loss": 1.429, + "step": 9350 + }, + { + "epoch": 2.6318750879167254, + "grad_norm": 2.859375, + "learning_rate": 2.526844283791477e-06, + "loss": 1.4638, + "step": 9355 + }, + { + "epoch": 2.6332817555211703, + "grad_norm": 4.0, + "learning_rate": 2.5222795449065623e-06, + "loss": 1.8405, + "step": 9360 + }, + { + "epoch": 2.6346884231256156, + "grad_norm": 3.15625, + "learning_rate": 2.5177170335297445e-06, + "loss": 1.6266, + "step": 9365 + }, + { + "epoch": 2.6360950907300604, + "grad_norm": 4.09375, + "learning_rate": 2.5131567565385327e-06, + "loss": 1.5076, + "step": 9370 + }, + { + "epoch": 2.6375017583345057, + "grad_norm": 2.515625, + "learning_rate": 2.5085987208070628e-06, + "loss": 1.6445, + "step": 9375 + }, + { + "epoch": 2.6389084259389506, + "grad_norm": 2.703125, + "learning_rate": 2.5040429332060953e-06, + "loss": 1.6817, + "step": 9380 + }, + { + "epoch": 2.6403150935433954, + "grad_norm": 3.375, + "learning_rate": 2.499489400602999e-06, + "loss": 1.7791, + "step": 9385 + }, + { + "epoch": 2.6417217611478407, + "grad_norm": 3.171875, + "learning_rate": 2.4949381298617478e-06, + "loss": 1.5416, + "step": 9390 + }, + { + "epoch": 2.643128428752286, + "grad_norm": 2.96875, + "learning_rate": 2.4903891278429002e-06, + "loss": 1.7715, + "step": 9395 + }, + { + "epoch": 2.644535096356731, + "grad_norm": 2.609375, + "learning_rate": 2.485842401403601e-06, + "loss": 1.6767, + "step": 9400 + }, + { + "epoch": 2.6459417639611758, + "grad_norm": 3.109375, + "learning_rate": 2.4812979573975595e-06, + "loss": 1.7507, + "step": 9405 + }, + { + "epoch": 2.647348431565621, + "grad_norm": 2.953125, + "learning_rate": 2.476755802675049e-06, + "loss": 1.608, + "step": 9410 + }, + { + "epoch": 2.6487550991700664, + "grad_norm": 3.25, + "learning_rate": 2.4722159440828877e-06, + "loss": 1.5923, + "step": 9415 + }, + { + "epoch": 2.650161766774511, + "grad_norm": 3.4375, + "learning_rate": 2.467678388464436e-06, + "loss": 1.4541, + "step": 9420 + }, + { + "epoch": 2.651568434378956, + "grad_norm": 3.796875, + "learning_rate": 2.4631431426595826e-06, + "loss": 1.6113, + "step": 9425 + }, + { + "epoch": 2.6529751019834014, + "grad_norm": 2.65625, + "learning_rate": 2.4586102135047314e-06, + "loss": 1.6558, + "step": 9430 + }, + { + "epoch": 2.6543817695878467, + "grad_norm": 3.0625, + "learning_rate": 2.4540796078327966e-06, + "loss": 1.7034, + "step": 9435 + }, + { + "epoch": 2.6557884371922915, + "grad_norm": 3.71875, + "learning_rate": 2.4495513324731897e-06, + "loss": 1.68, + "step": 9440 + }, + { + "epoch": 2.6571951047967364, + "grad_norm": 3.140625, + "learning_rate": 2.4450253942518105e-06, + "loss": 1.675, + "step": 9445 + }, + { + "epoch": 2.6586017724011817, + "grad_norm": 2.359375, + "learning_rate": 2.4405017999910324e-06, + "loss": 1.6803, + "step": 9450 + }, + { + "epoch": 2.6600084400056265, + "grad_norm": 3.796875, + "learning_rate": 2.4359805565097006e-06, + "loss": 1.5876, + "step": 9455 + }, + { + "epoch": 2.661415107610072, + "grad_norm": 3.34375, + "learning_rate": 2.431461670623111e-06, + "loss": 1.7244, + "step": 9460 + }, + { + "epoch": 2.6628217752145167, + "grad_norm": 3.0, + "learning_rate": 2.4269451491430103e-06, + "loss": 1.7627, + "step": 9465 + }, + { + "epoch": 2.664228442818962, + "grad_norm": 4.84375, + "learning_rate": 2.422430998877578e-06, + "loss": 1.4034, + "step": 9470 + }, + { + "epoch": 2.665635110423407, + "grad_norm": 3.734375, + "learning_rate": 2.417919226631423e-06, + "loss": 1.5969, + "step": 9475 + }, + { + "epoch": 2.667041778027852, + "grad_norm": 4.4375, + "learning_rate": 2.413409839205565e-06, + "loss": 1.4532, + "step": 9480 + }, + { + "epoch": 2.668448445632297, + "grad_norm": 2.5, + "learning_rate": 2.4089028433974335e-06, + "loss": 1.346, + "step": 9485 + }, + { + "epoch": 2.6698551132367423, + "grad_norm": 2.765625, + "learning_rate": 2.4043982460008466e-06, + "loss": 1.3301, + "step": 9490 + }, + { + "epoch": 2.671261780841187, + "grad_norm": 3.875, + "learning_rate": 2.3998960538060138e-06, + "loss": 1.5903, + "step": 9495 + }, + { + "epoch": 2.6726684484456324, + "grad_norm": 4.65625, + "learning_rate": 2.3953962735995167e-06, + "loss": 1.4014, + "step": 9500 + }, + { + "epoch": 2.6740751160500773, + "grad_norm": 2.53125, + "learning_rate": 2.390898912164298e-06, + "loss": 1.8069, + "step": 9505 + }, + { + "epoch": 2.675481783654522, + "grad_norm": 3.359375, + "learning_rate": 2.3864039762796583e-06, + "loss": 1.5562, + "step": 9510 + }, + { + "epoch": 2.6768884512589675, + "grad_norm": 3.328125, + "learning_rate": 2.38191147272124e-06, + "loss": 1.7332, + "step": 9515 + }, + { + "epoch": 2.6782951188634128, + "grad_norm": 2.890625, + "learning_rate": 2.3774214082610217e-06, + "loss": 1.5451, + "step": 9520 + }, + { + "epoch": 2.6797017864678576, + "grad_norm": 3.78125, + "learning_rate": 2.3729337896672996e-06, + "loss": 1.3584, + "step": 9525 + }, + { + "epoch": 2.6811084540723025, + "grad_norm": 4.8125, + "learning_rate": 2.3684486237046886e-06, + "loss": 1.5757, + "step": 9530 + }, + { + "epoch": 2.6825151216767478, + "grad_norm": 3.125, + "learning_rate": 2.3639659171341036e-06, + "loss": 1.617, + "step": 9535 + }, + { + "epoch": 2.683921789281193, + "grad_norm": 4.09375, + "learning_rate": 2.3594856767127542e-06, + "loss": 1.6402, + "step": 9540 + }, + { + "epoch": 2.685328456885638, + "grad_norm": 3.375, + "learning_rate": 2.35500790919413e-06, + "loss": 1.7177, + "step": 9545 + }, + { + "epoch": 2.686735124490083, + "grad_norm": 3.671875, + "learning_rate": 2.3505326213279964e-06, + "loss": 1.5701, + "step": 9550 + }, + { + "epoch": 2.688141792094528, + "grad_norm": 3.515625, + "learning_rate": 2.346059819860376e-06, + "loss": 1.5764, + "step": 9555 + }, + { + "epoch": 2.6895484596989734, + "grad_norm": 2.5, + "learning_rate": 2.3415895115335477e-06, + "loss": 1.7206, + "step": 9560 + }, + { + "epoch": 2.6909551273034182, + "grad_norm": 2.859375, + "learning_rate": 2.3371217030860337e-06, + "loss": 1.7077, + "step": 9565 + }, + { + "epoch": 2.692361794907863, + "grad_norm": 2.984375, + "learning_rate": 2.3326564012525804e-06, + "loss": 1.6132, + "step": 9570 + }, + { + "epoch": 2.6937684625123084, + "grad_norm": 2.8125, + "learning_rate": 2.3281936127641644e-06, + "loss": 1.5924, + "step": 9575 + }, + { + "epoch": 2.6951751301167532, + "grad_norm": 4.09375, + "learning_rate": 2.3237333443479676e-06, + "loss": 1.3612, + "step": 9580 + }, + { + "epoch": 2.6965817977211985, + "grad_norm": 3.125, + "learning_rate": 2.3192756027273766e-06, + "loss": 1.6446, + "step": 9585 + }, + { + "epoch": 2.6979884653256434, + "grad_norm": 3.21875, + "learning_rate": 2.3148203946219644e-06, + "loss": 1.9269, + "step": 9590 + }, + { + "epoch": 2.6993951329300887, + "grad_norm": 3.5625, + "learning_rate": 2.3103677267474934e-06, + "loss": 1.5727, + "step": 9595 + }, + { + "epoch": 2.7008018005345336, + "grad_norm": 3.0625, + "learning_rate": 2.3059176058158897e-06, + "loss": 1.6316, + "step": 9600 + }, + { + "epoch": 2.702208468138979, + "grad_norm": 4.4375, + "learning_rate": 2.3014700385352425e-06, + "loss": 1.2899, + "step": 9605 + }, + { + "epoch": 2.7036151357434237, + "grad_norm": 3.90625, + "learning_rate": 2.2970250316097914e-06, + "loss": 1.5956, + "step": 9610 + }, + { + "epoch": 2.705021803347869, + "grad_norm": 2.609375, + "learning_rate": 2.292582591739916e-06, + "loss": 1.4859, + "step": 9615 + }, + { + "epoch": 2.706428470952314, + "grad_norm": 3.984375, + "learning_rate": 2.2881427256221263e-06, + "loss": 1.5503, + "step": 9620 + }, + { + "epoch": 2.707835138556759, + "grad_norm": 2.5, + "learning_rate": 2.283705439949056e-06, + "loss": 1.4448, + "step": 9625 + }, + { + "epoch": 2.709241806161204, + "grad_norm": 3.78125, + "learning_rate": 2.2792707414094447e-06, + "loss": 1.5078, + "step": 9630 + }, + { + "epoch": 2.7106484737656493, + "grad_norm": 4.09375, + "learning_rate": 2.2748386366881327e-06, + "loss": 1.5553, + "step": 9635 + }, + { + "epoch": 2.712055141370094, + "grad_norm": 3.921875, + "learning_rate": 2.2704091324660557e-06, + "loss": 1.6245, + "step": 9640 + }, + { + "epoch": 2.7134618089745395, + "grad_norm": 2.484375, + "learning_rate": 2.26598223542022e-06, + "loss": 1.4752, + "step": 9645 + }, + { + "epoch": 2.7148684765789843, + "grad_norm": 4.09375, + "learning_rate": 2.2615579522237103e-06, + "loss": 1.6861, + "step": 9650 + }, + { + "epoch": 2.716275144183429, + "grad_norm": 4.28125, + "learning_rate": 2.2571362895456673e-06, + "loss": 1.3737, + "step": 9655 + }, + { + "epoch": 2.7176818117878745, + "grad_norm": 4.03125, + "learning_rate": 2.2527172540512817e-06, + "loss": 1.7256, + "step": 9660 + }, + { + "epoch": 2.71908847939232, + "grad_norm": 3.421875, + "learning_rate": 2.248300852401784e-06, + "loss": 1.4467, + "step": 9665 + }, + { + "epoch": 2.7204951469967646, + "grad_norm": 2.5625, + "learning_rate": 2.2438870912544386e-06, + "loss": 1.7083, + "step": 9670 + }, + { + "epoch": 2.7219018146012095, + "grad_norm": 2.703125, + "learning_rate": 2.239475977262521e-06, + "loss": 1.6565, + "step": 9675 + }, + { + "epoch": 2.723308482205655, + "grad_norm": 3.765625, + "learning_rate": 2.2350675170753247e-06, + "loss": 1.4021, + "step": 9680 + }, + { + "epoch": 2.7247151498101, + "grad_norm": 4.03125, + "learning_rate": 2.230661717338138e-06, + "loss": 1.5423, + "step": 9685 + }, + { + "epoch": 2.726121817414545, + "grad_norm": 2.9375, + "learning_rate": 2.2262585846922418e-06, + "loss": 1.4289, + "step": 9690 + }, + { + "epoch": 2.72752848501899, + "grad_norm": 3.84375, + "learning_rate": 2.2218581257748927e-06, + "loss": 1.4817, + "step": 9695 + }, + { + "epoch": 2.728935152623435, + "grad_norm": 4.09375, + "learning_rate": 2.2174603472193224e-06, + "loss": 1.3842, + "step": 9700 + }, + { + "epoch": 2.7303418202278804, + "grad_norm": 3.84375, + "learning_rate": 2.213065255654719e-06, + "loss": 1.5652, + "step": 9705 + }, + { + "epoch": 2.7317484878323253, + "grad_norm": 2.828125, + "learning_rate": 2.2086728577062178e-06, + "loss": 1.3789, + "step": 9710 + }, + { + "epoch": 2.73315515543677, + "grad_norm": 4.1875, + "learning_rate": 2.204283159994902e-06, + "loss": 1.3959, + "step": 9715 + }, + { + "epoch": 2.7345618230412154, + "grad_norm": 3.78125, + "learning_rate": 2.199896169137772e-06, + "loss": 1.7312, + "step": 9720 + }, + { + "epoch": 2.7359684906456603, + "grad_norm": 4.5, + "learning_rate": 2.19551189174776e-06, + "loss": 1.4778, + "step": 9725 + }, + { + "epoch": 2.7373751582501056, + "grad_norm": 3.6875, + "learning_rate": 2.1911303344337014e-06, + "loss": 1.4665, + "step": 9730 + }, + { + "epoch": 2.7387818258545504, + "grad_norm": 2.9375, + "learning_rate": 2.186751503800332e-06, + "loss": 1.5714, + "step": 9735 + }, + { + "epoch": 2.7401884934589957, + "grad_norm": 3.703125, + "learning_rate": 2.1823754064482786e-06, + "loss": 1.5682, + "step": 9740 + }, + { + "epoch": 2.7415951610634406, + "grad_norm": 3.890625, + "learning_rate": 2.1780020489740506e-06, + "loss": 1.4708, + "step": 9745 + }, + { + "epoch": 2.743001828667886, + "grad_norm": 3.015625, + "learning_rate": 2.1736314379700177e-06, + "loss": 1.5423, + "step": 9750 + }, + { + "epoch": 2.7444084962723307, + "grad_norm": 3.203125, + "learning_rate": 2.1692635800244222e-06, + "loss": 1.7049, + "step": 9755 + }, + { + "epoch": 2.745815163876776, + "grad_norm": 4.625, + "learning_rate": 2.164898481721348e-06, + "loss": 1.4801, + "step": 9760 + }, + { + "epoch": 2.747221831481221, + "grad_norm": 3.546875, + "learning_rate": 2.160536149640721e-06, + "loss": 1.6165, + "step": 9765 + }, + { + "epoch": 2.748628499085666, + "grad_norm": 4.90625, + "learning_rate": 2.1561765903582985e-06, + "loss": 1.4022, + "step": 9770 + }, + { + "epoch": 2.750035166690111, + "grad_norm": 6.375, + "learning_rate": 2.151819810445656e-06, + "loss": 1.3853, + "step": 9775 + }, + { + "epoch": 2.7514418342945564, + "grad_norm": 3.4375, + "learning_rate": 2.147465816470183e-06, + "loss": 1.5349, + "step": 9780 + }, + { + "epoch": 2.752848501899001, + "grad_norm": 2.765625, + "learning_rate": 2.1431146149950673e-06, + "loss": 1.575, + "step": 9785 + }, + { + "epoch": 2.7542551695034465, + "grad_norm": 2.953125, + "learning_rate": 2.138766212579286e-06, + "loss": 1.4022, + "step": 9790 + }, + { + "epoch": 2.7556618371078914, + "grad_norm": 3.484375, + "learning_rate": 2.1344206157775963e-06, + "loss": 1.5342, + "step": 9795 + }, + { + "epoch": 2.757068504712336, + "grad_norm": 3.984375, + "learning_rate": 2.130077831140534e-06, + "loss": 1.6823, + "step": 9800 + }, + { + "epoch": 2.7584751723167815, + "grad_norm": 3.6875, + "learning_rate": 2.125737865214383e-06, + "loss": 1.5458, + "step": 9805 + }, + { + "epoch": 2.759881839921227, + "grad_norm": 3.46875, + "learning_rate": 2.12140072454119e-06, + "loss": 1.5436, + "step": 9810 + }, + { + "epoch": 2.7612885075256717, + "grad_norm": 3.453125, + "learning_rate": 2.1170664156587374e-06, + "loss": 1.4426, + "step": 9815 + }, + { + "epoch": 2.7626951751301165, + "grad_norm": 2.3125, + "learning_rate": 2.1127349451005387e-06, + "loss": 1.6613, + "step": 9820 + }, + { + "epoch": 2.764101842734562, + "grad_norm": 3.515625, + "learning_rate": 2.1084063193958292e-06, + "loss": 1.522, + "step": 9825 + }, + { + "epoch": 2.765508510339007, + "grad_norm": 3.4375, + "learning_rate": 2.104080545069561e-06, + "loss": 1.6962, + "step": 9830 + }, + { + "epoch": 2.766915177943452, + "grad_norm": 4.4375, + "learning_rate": 2.0997576286423773e-06, + "loss": 1.5804, + "step": 9835 + }, + { + "epoch": 2.768321845547897, + "grad_norm": 3.25, + "learning_rate": 2.0954375766306256e-06, + "loss": 1.5818, + "step": 9840 + }, + { + "epoch": 2.769728513152342, + "grad_norm": 3.28125, + "learning_rate": 2.0911203955463262e-06, + "loss": 1.7594, + "step": 9845 + }, + { + "epoch": 2.7711351807567874, + "grad_norm": 3.5, + "learning_rate": 2.0868060918971754e-06, + "loss": 1.541, + "step": 9850 + }, + { + "epoch": 2.7725418483612323, + "grad_norm": 3.71875, + "learning_rate": 2.082494672186535e-06, + "loss": 1.6876, + "step": 9855 + }, + { + "epoch": 2.773948515965677, + "grad_norm": 2.828125, + "learning_rate": 2.078186142913414e-06, + "loss": 1.4451, + "step": 9860 + }, + { + "epoch": 2.7753551835701225, + "grad_norm": 3.71875, + "learning_rate": 2.0738805105724676e-06, + "loss": 1.8509, + "step": 9865 + }, + { + "epoch": 2.7767618511745673, + "grad_norm": 5.75, + "learning_rate": 2.069577781653982e-06, + "loss": 1.648, + "step": 9870 + }, + { + "epoch": 2.7781685187790126, + "grad_norm": 5.15625, + "learning_rate": 2.065277962643873e-06, + "loss": 1.5538, + "step": 9875 + }, + { + "epoch": 2.7795751863834575, + "grad_norm": 3.5, + "learning_rate": 2.0609810600236586e-06, + "loss": 1.4278, + "step": 9880 + }, + { + "epoch": 2.7809818539879028, + "grad_norm": 3.75, + "learning_rate": 2.056687080270473e-06, + "loss": 1.5183, + "step": 9885 + }, + { + "epoch": 2.7823885215923476, + "grad_norm": 3.859375, + "learning_rate": 2.0523960298570368e-06, + "loss": 1.4949, + "step": 9890 + }, + { + "epoch": 2.783795189196793, + "grad_norm": 2.734375, + "learning_rate": 2.0481079152516564e-06, + "loss": 1.5487, + "step": 9895 + }, + { + "epoch": 2.7852018568012378, + "grad_norm": 3.59375, + "learning_rate": 2.043822742918212e-06, + "loss": 1.6569, + "step": 9900 + }, + { + "epoch": 2.786608524405683, + "grad_norm": 4.53125, + "learning_rate": 2.0395405193161557e-06, + "loss": 1.5651, + "step": 9905 + }, + { + "epoch": 2.788015192010128, + "grad_norm": 3.625, + "learning_rate": 2.0352612509004816e-06, + "loss": 1.7325, + "step": 9910 + }, + { + "epoch": 2.7894218596145732, + "grad_norm": 3.421875, + "learning_rate": 2.030984944121742e-06, + "loss": 1.6342, + "step": 9915 + }, + { + "epoch": 2.790828527219018, + "grad_norm": 3.015625, + "learning_rate": 2.0267116054260174e-06, + "loss": 1.6308, + "step": 9920 + }, + { + "epoch": 2.792235194823463, + "grad_norm": 2.5625, + "learning_rate": 2.0224412412549153e-06, + "loss": 1.5591, + "step": 9925 + }, + { + "epoch": 2.7936418624279082, + "grad_norm": 3.546875, + "learning_rate": 2.0181738580455626e-06, + "loss": 1.6227, + "step": 9930 + }, + { + "epoch": 2.7950485300323535, + "grad_norm": 3.1875, + "learning_rate": 2.013909462230589e-06, + "loss": 1.6837, + "step": 9935 + }, + { + "epoch": 2.7964551976367984, + "grad_norm": 5.125, + "learning_rate": 2.009648060238123e-06, + "loss": 1.5786, + "step": 9940 + }, + { + "epoch": 2.7978618652412433, + "grad_norm": 3.515625, + "learning_rate": 2.0053896584917804e-06, + "loss": 1.5091, + "step": 9945 + }, + { + "epoch": 2.7992685328456886, + "grad_norm": 3.359375, + "learning_rate": 2.001134263410652e-06, + "loss": 1.6791, + "step": 9950 + }, + { + "epoch": 2.800675200450134, + "grad_norm": 4.09375, + "learning_rate": 1.9968818814092975e-06, + "loss": 1.6616, + "step": 9955 + }, + { + "epoch": 2.8020818680545787, + "grad_norm": 2.3125, + "learning_rate": 1.9926325188977382e-06, + "loss": 1.5081, + "step": 9960 + }, + { + "epoch": 2.8034885356590236, + "grad_norm": 3.625, + "learning_rate": 1.98838618228144e-06, + "loss": 1.5836, + "step": 9965 + }, + { + "epoch": 2.804895203263469, + "grad_norm": 2.9375, + "learning_rate": 1.9841428779613085e-06, + "loss": 1.8027, + "step": 9970 + }, + { + "epoch": 2.806301870867914, + "grad_norm": 3.5625, + "learning_rate": 1.979902612333678e-06, + "loss": 1.6843, + "step": 9975 + }, + { + "epoch": 2.807708538472359, + "grad_norm": 3.640625, + "learning_rate": 1.9756653917903024e-06, + "loss": 1.6057, + "step": 9980 + }, + { + "epoch": 2.809115206076804, + "grad_norm": 3.625, + "learning_rate": 1.9714312227183448e-06, + "loss": 1.4281, + "step": 9985 + }, + { + "epoch": 2.810521873681249, + "grad_norm": 3.4375, + "learning_rate": 1.9672001115003734e-06, + "loss": 1.5506, + "step": 9990 + }, + { + "epoch": 2.811928541285694, + "grad_norm": 3.140625, + "learning_rate": 1.96297206451434e-06, + "loss": 1.4472, + "step": 9995 + }, + { + "epoch": 2.8133352088901393, + "grad_norm": 2.390625, + "learning_rate": 1.95874708813358e-06, + "loss": 1.7408, + "step": 10000 + }, + { + "epoch": 2.814741876494584, + "grad_norm": 2.796875, + "learning_rate": 1.9545251887268055e-06, + "loss": 1.8876, + "step": 10005 + }, + { + "epoch": 2.8161485440990295, + "grad_norm": 3.125, + "learning_rate": 1.9503063726580794e-06, + "loss": 1.6134, + "step": 10010 + }, + { + "epoch": 2.8175552117034743, + "grad_norm": 4.125, + "learning_rate": 1.9460906462868266e-06, + "loss": 1.3413, + "step": 10015 + }, + { + "epoch": 2.8189618793079196, + "grad_norm": 3.875, + "learning_rate": 1.941878015967811e-06, + "loss": 1.5506, + "step": 10020 + }, + { + "epoch": 2.8203685469123645, + "grad_norm": 3.75, + "learning_rate": 1.9376684880511283e-06, + "loss": 1.7976, + "step": 10025 + }, + { + "epoch": 2.82177521451681, + "grad_norm": 2.28125, + "learning_rate": 1.9334620688821986e-06, + "loss": 1.5492, + "step": 10030 + }, + { + "epoch": 2.8231818821212546, + "grad_norm": 2.84375, + "learning_rate": 1.9292587648017597e-06, + "loss": 1.6118, + "step": 10035 + }, + { + "epoch": 2.8245885497257, + "grad_norm": 5.40625, + "learning_rate": 1.925058582145844e-06, + "loss": 1.4627, + "step": 10040 + }, + { + "epoch": 2.825995217330145, + "grad_norm": 3.171875, + "learning_rate": 1.9208615272457907e-06, + "loss": 1.9225, + "step": 10045 + }, + { + "epoch": 2.82740188493459, + "grad_norm": 3.375, + "learning_rate": 1.916667606428216e-06, + "loss": 1.4313, + "step": 10050 + }, + { + "epoch": 2.828808552539035, + "grad_norm": 3.28125, + "learning_rate": 1.9124768260150144e-06, + "loss": 1.7128, + "step": 10055 + }, + { + "epoch": 2.8302152201434803, + "grad_norm": 3.984375, + "learning_rate": 1.9082891923233453e-06, + "loss": 1.4724, + "step": 10060 + }, + { + "epoch": 2.831621887747925, + "grad_norm": 4.15625, + "learning_rate": 1.9041047116656279e-06, + "loss": 1.5864, + "step": 10065 + }, + { + "epoch": 2.83302855535237, + "grad_norm": 3.859375, + "learning_rate": 1.8999233903495262e-06, + "loss": 1.5653, + "step": 10070 + }, + { + "epoch": 2.8344352229568153, + "grad_norm": 3.765625, + "learning_rate": 1.8957452346779399e-06, + "loss": 1.681, + "step": 10075 + }, + { + "epoch": 2.8358418905612606, + "grad_norm": 2.265625, + "learning_rate": 1.8915702509490035e-06, + "loss": 1.5478, + "step": 10080 + }, + { + "epoch": 2.8372485581657054, + "grad_norm": 2.953125, + "learning_rate": 1.88739844545606e-06, + "loss": 1.6975, + "step": 10085 + }, + { + "epoch": 2.8386552257701503, + "grad_norm": 3.140625, + "learning_rate": 1.8832298244876718e-06, + "loss": 1.618, + "step": 10090 + }, + { + "epoch": 2.8400618933745956, + "grad_norm": 3.546875, + "learning_rate": 1.8790643943275946e-06, + "loss": 1.4996, + "step": 10095 + }, + { + "epoch": 2.841468560979041, + "grad_norm": 2.359375, + "learning_rate": 1.8749021612547762e-06, + "loss": 1.6171, + "step": 10100 + }, + { + "epoch": 2.8428752285834857, + "grad_norm": 3.390625, + "learning_rate": 1.8707431315433433e-06, + "loss": 1.4352, + "step": 10105 + }, + { + "epoch": 2.8442818961879306, + "grad_norm": 3.609375, + "learning_rate": 1.8665873114626001e-06, + "loss": 1.4832, + "step": 10110 + }, + { + "epoch": 2.845688563792376, + "grad_norm": 2.5, + "learning_rate": 1.8624347072770026e-06, + "loss": 1.6562, + "step": 10115 + }, + { + "epoch": 2.847095231396821, + "grad_norm": 4.0, + "learning_rate": 1.8582853252461686e-06, + "loss": 1.5724, + "step": 10120 + }, + { + "epoch": 2.848501899001266, + "grad_norm": 3.453125, + "learning_rate": 1.8541391716248533e-06, + "loss": 1.6138, + "step": 10125 + }, + { + "epoch": 2.849908566605711, + "grad_norm": 2.953125, + "learning_rate": 1.849996252662946e-06, + "loss": 1.6924, + "step": 10130 + }, + { + "epoch": 2.851315234210156, + "grad_norm": 3.015625, + "learning_rate": 1.8458565746054657e-06, + "loss": 1.5711, + "step": 10135 + }, + { + "epoch": 2.852721901814601, + "grad_norm": 3.796875, + "learning_rate": 1.8417201436925352e-06, + "loss": 1.399, + "step": 10140 + }, + { + "epoch": 2.8541285694190464, + "grad_norm": 5.0, + "learning_rate": 1.8375869661593933e-06, + "loss": 1.5267, + "step": 10145 + }, + { + "epoch": 2.855535237023491, + "grad_norm": 2.890625, + "learning_rate": 1.8334570482363687e-06, + "loss": 1.6057, + "step": 10150 + }, + { + "epoch": 2.8569419046279365, + "grad_norm": 3.21875, + "learning_rate": 1.8293303961488783e-06, + "loss": 1.5148, + "step": 10155 + }, + { + "epoch": 2.8583485722323814, + "grad_norm": 3.125, + "learning_rate": 1.8252070161174142e-06, + "loss": 1.6119, + "step": 10160 + }, + { + "epoch": 2.8597552398368267, + "grad_norm": 4.25, + "learning_rate": 1.8210869143575432e-06, + "loss": 1.5562, + "step": 10165 + }, + { + "epoch": 2.8611619074412715, + "grad_norm": 3.3125, + "learning_rate": 1.8169700970798777e-06, + "loss": 1.6375, + "step": 10170 + }, + { + "epoch": 2.862568575045717, + "grad_norm": 4.0, + "learning_rate": 1.8128565704900925e-06, + "loss": 1.536, + "step": 10175 + }, + { + "epoch": 2.8639752426501617, + "grad_norm": 3.140625, + "learning_rate": 1.8087463407888942e-06, + "loss": 1.7519, + "step": 10180 + }, + { + "epoch": 2.865381910254607, + "grad_norm": 5.0625, + "learning_rate": 1.8046394141720208e-06, + "loss": 1.4386, + "step": 10185 + }, + { + "epoch": 2.866788577859052, + "grad_norm": 3.703125, + "learning_rate": 1.8005357968302318e-06, + "loss": 1.7448, + "step": 10190 + }, + { + "epoch": 2.868195245463497, + "grad_norm": 3.453125, + "learning_rate": 1.796435494949302e-06, + "loss": 1.4544, + "step": 10195 + }, + { + "epoch": 2.869601913067942, + "grad_norm": 6.90625, + "learning_rate": 1.7923385147099999e-06, + "loss": 1.4657, + "step": 10200 + }, + { + "epoch": 2.8710085806723873, + "grad_norm": 2.890625, + "learning_rate": 1.7882448622880943e-06, + "loss": 1.6822, + "step": 10205 + }, + { + "epoch": 2.872415248276832, + "grad_norm": 2.640625, + "learning_rate": 1.7841545438543392e-06, + "loss": 1.5488, + "step": 10210 + }, + { + "epoch": 2.873821915881277, + "grad_norm": 4.65625, + "learning_rate": 1.7800675655744528e-06, + "loss": 1.5145, + "step": 10215 + }, + { + "epoch": 2.8752285834857223, + "grad_norm": 2.953125, + "learning_rate": 1.7759839336091296e-06, + "loss": 1.6704, + "step": 10220 + }, + { + "epoch": 2.8766352510901676, + "grad_norm": 2.875, + "learning_rate": 1.771903654114013e-06, + "loss": 1.5645, + "step": 10225 + }, + { + "epoch": 2.8780419186946125, + "grad_norm": 3.234375, + "learning_rate": 1.7678267332396958e-06, + "loss": 1.727, + "step": 10230 + }, + { + "epoch": 2.8794485862990573, + "grad_norm": 3.046875, + "learning_rate": 1.7637531771317056e-06, + "loss": 1.647, + "step": 10235 + }, + { + "epoch": 2.8808552539035026, + "grad_norm": 3.328125, + "learning_rate": 1.7596829919305037e-06, + "loss": 1.4213, + "step": 10240 + }, + { + "epoch": 2.882261921507948, + "grad_norm": 5.28125, + "learning_rate": 1.7556161837714606e-06, + "loss": 1.4697, + "step": 10245 + }, + { + "epoch": 2.8836685891123928, + "grad_norm": 3.703125, + "learning_rate": 1.7515527587848652e-06, + "loss": 1.6989, + "step": 10250 + }, + { + "epoch": 2.8850752567168376, + "grad_norm": 2.6875, + "learning_rate": 1.7474927230959025e-06, + "loss": 1.7053, + "step": 10255 + }, + { + "epoch": 2.886481924321283, + "grad_norm": 3.46875, + "learning_rate": 1.7434360828246488e-06, + "loss": 1.4548, + "step": 10260 + }, + { + "epoch": 2.887888591925728, + "grad_norm": 3.890625, + "learning_rate": 1.7393828440860613e-06, + "loss": 1.575, + "step": 10265 + }, + { + "epoch": 2.889295259530173, + "grad_norm": 5.0, + "learning_rate": 1.7353330129899736e-06, + "loss": 1.7149, + "step": 10270 + }, + { + "epoch": 2.890701927134618, + "grad_norm": 3.5, + "learning_rate": 1.731286595641078e-06, + "loss": 1.7091, + "step": 10275 + }, + { + "epoch": 2.8921085947390632, + "grad_norm": 3.25, + "learning_rate": 1.7272435981389237e-06, + "loss": 1.5249, + "step": 10280 + }, + { + "epoch": 2.893515262343508, + "grad_norm": 3.765625, + "learning_rate": 1.7232040265779038e-06, + "loss": 1.5553, + "step": 10285 + }, + { + "epoch": 2.8949219299479534, + "grad_norm": 4.875, + "learning_rate": 1.7191678870472459e-06, + "loss": 1.3392, + "step": 10290 + }, + { + "epoch": 2.8963285975523982, + "grad_norm": 4.0625, + "learning_rate": 1.715135185631008e-06, + "loss": 1.6271, + "step": 10295 + }, + { + "epoch": 2.8977352651568435, + "grad_norm": 4.1875, + "learning_rate": 1.7111059284080627e-06, + "loss": 1.7061, + "step": 10300 + }, + { + "epoch": 2.8991419327612884, + "grad_norm": 2.671875, + "learning_rate": 1.7070801214520904e-06, + "loss": 1.4828, + "step": 10305 + }, + { + "epoch": 2.9005486003657337, + "grad_norm": 3.1875, + "learning_rate": 1.703057770831572e-06, + "loss": 1.6617, + "step": 10310 + }, + { + "epoch": 2.9019552679701786, + "grad_norm": 3.28125, + "learning_rate": 1.699038882609778e-06, + "loss": 1.6372, + "step": 10315 + }, + { + "epoch": 2.903361935574624, + "grad_norm": 2.8125, + "learning_rate": 1.695023462844757e-06, + "loss": 1.6865, + "step": 10320 + }, + { + "epoch": 2.9047686031790687, + "grad_norm": 3.359375, + "learning_rate": 1.6910115175893362e-06, + "loss": 1.5584, + "step": 10325 + }, + { + "epoch": 2.906175270783514, + "grad_norm": 3.125, + "learning_rate": 1.6870030528910983e-06, + "loss": 1.6264, + "step": 10330 + }, + { + "epoch": 2.907581938387959, + "grad_norm": 3.28125, + "learning_rate": 1.6829980747923828e-06, + "loss": 1.6237, + "step": 10335 + }, + { + "epoch": 2.9089886059924037, + "grad_norm": 2.796875, + "learning_rate": 1.6789965893302723e-06, + "loss": 1.7661, + "step": 10340 + }, + { + "epoch": 2.910395273596849, + "grad_norm": 3.609375, + "learning_rate": 1.6749986025365836e-06, + "loss": 1.5507, + "step": 10345 + }, + { + "epoch": 2.9118019412012943, + "grad_norm": 4.09375, + "learning_rate": 1.6710041204378649e-06, + "loss": 1.525, + "step": 10350 + }, + { + "epoch": 2.913208608805739, + "grad_norm": 3.546875, + "learning_rate": 1.667013149055375e-06, + "loss": 1.4026, + "step": 10355 + }, + { + "epoch": 2.914615276410184, + "grad_norm": 3.25, + "learning_rate": 1.6630256944050842e-06, + "loss": 1.7627, + "step": 10360 + }, + { + "epoch": 2.9160219440146293, + "grad_norm": 3.453125, + "learning_rate": 1.659041762497659e-06, + "loss": 1.6459, + "step": 10365 + }, + { + "epoch": 2.9174286116190746, + "grad_norm": 3.25, + "learning_rate": 1.6550613593384614e-06, + "loss": 1.6321, + "step": 10370 + }, + { + "epoch": 2.9188352792235195, + "grad_norm": 3.765625, + "learning_rate": 1.6510844909275257e-06, + "loss": 1.5042, + "step": 10375 + }, + { + "epoch": 2.9202419468279643, + "grad_norm": 3.53125, + "learning_rate": 1.6471111632595665e-06, + "loss": 1.4446, + "step": 10380 + }, + { + "epoch": 2.9216486144324096, + "grad_norm": 3.65625, + "learning_rate": 1.6431413823239551e-06, + "loss": 1.6827, + "step": 10385 + }, + { + "epoch": 2.923055282036855, + "grad_norm": 3.765625, + "learning_rate": 1.6391751541047189e-06, + "loss": 1.4172, + "step": 10390 + }, + { + "epoch": 2.9244619496413, + "grad_norm": 3.671875, + "learning_rate": 1.6352124845805286e-06, + "loss": 1.4812, + "step": 10395 + }, + { + "epoch": 2.9258686172457447, + "grad_norm": 3.90625, + "learning_rate": 1.6312533797246957e-06, + "loss": 1.4395, + "step": 10400 + }, + { + "epoch": 2.92727528485019, + "grad_norm": 2.78125, + "learning_rate": 1.627297845505148e-06, + "loss": 1.4606, + "step": 10405 + }, + { + "epoch": 2.928681952454635, + "grad_norm": 2.71875, + "learning_rate": 1.6233458878844418e-06, + "loss": 1.7314, + "step": 10410 + }, + { + "epoch": 2.93008862005908, + "grad_norm": 2.5625, + "learning_rate": 1.6193975128197356e-06, + "loss": 1.7334, + "step": 10415 + }, + { + "epoch": 2.931495287663525, + "grad_norm": 4.75, + "learning_rate": 1.6154527262627889e-06, + "loss": 1.5945, + "step": 10420 + }, + { + "epoch": 2.9329019552679703, + "grad_norm": 4.78125, + "learning_rate": 1.6115115341599542e-06, + "loss": 1.5073, + "step": 10425 + }, + { + "epoch": 2.934308622872415, + "grad_norm": 3.984375, + "learning_rate": 1.6075739424521623e-06, + "loss": 1.8863, + "step": 10430 + }, + { + "epoch": 2.9357152904768604, + "grad_norm": 3.359375, + "learning_rate": 1.6036399570749194e-06, + "loss": 1.7013, + "step": 10435 + }, + { + "epoch": 2.9371219580813053, + "grad_norm": 2.625, + "learning_rate": 1.5997095839582927e-06, + "loss": 1.7134, + "step": 10440 + }, + { + "epoch": 2.9385286256857506, + "grad_norm": 3.703125, + "learning_rate": 1.59578282902691e-06, + "loss": 1.5742, + "step": 10445 + }, + { + "epoch": 2.9399352932901954, + "grad_norm": 2.734375, + "learning_rate": 1.5918596981999359e-06, + "loss": 1.5151, + "step": 10450 + }, + { + "epoch": 2.9413419608946407, + "grad_norm": 7.09375, + "learning_rate": 1.5879401973910813e-06, + "loss": 1.6329, + "step": 10455 + }, + { + "epoch": 2.9427486284990856, + "grad_norm": 3.078125, + "learning_rate": 1.58402433250858e-06, + "loss": 1.4513, + "step": 10460 + }, + { + "epoch": 2.944155296103531, + "grad_norm": 2.875, + "learning_rate": 1.5801121094551863e-06, + "loss": 1.5627, + "step": 10465 + }, + { + "epoch": 2.9455619637079757, + "grad_norm": 2.5, + "learning_rate": 1.5762035341281634e-06, + "loss": 1.7602, + "step": 10470 + }, + { + "epoch": 2.946968631312421, + "grad_norm": 3.34375, + "learning_rate": 1.5722986124192813e-06, + "loss": 1.486, + "step": 10475 + }, + { + "epoch": 2.948375298916866, + "grad_norm": 6.3125, + "learning_rate": 1.5683973502147936e-06, + "loss": 1.6301, + "step": 10480 + }, + { + "epoch": 2.9497819665213107, + "grad_norm": 4.5625, + "learning_rate": 1.564499753395446e-06, + "loss": 1.4134, + "step": 10485 + }, + { + "epoch": 2.951188634125756, + "grad_norm": 3.28125, + "learning_rate": 1.5606058278364546e-06, + "loss": 1.5875, + "step": 10490 + }, + { + "epoch": 2.9525953017302013, + "grad_norm": 2.625, + "learning_rate": 1.5567155794075016e-06, + "loss": 1.6243, + "step": 10495 + }, + { + "epoch": 2.954001969334646, + "grad_norm": 3.25, + "learning_rate": 1.55282901397273e-06, + "loss": 1.5925, + "step": 10500 + }, + { + "epoch": 2.955408636939091, + "grad_norm": 2.96875, + "learning_rate": 1.548946137390724e-06, + "loss": 1.4384, + "step": 10505 + }, + { + "epoch": 2.9568153045435364, + "grad_norm": 3.515625, + "learning_rate": 1.5450669555145153e-06, + "loss": 1.4555, + "step": 10510 + }, + { + "epoch": 2.9582219721479817, + "grad_norm": 2.84375, + "learning_rate": 1.54119147419156e-06, + "loss": 1.7412, + "step": 10515 + }, + { + "epoch": 2.9596286397524265, + "grad_norm": 3.234375, + "learning_rate": 1.5373196992637403e-06, + "loss": 1.3866, + "step": 10520 + }, + { + "epoch": 2.9610353073568714, + "grad_norm": 3.34375, + "learning_rate": 1.5334516365673462e-06, + "loss": 1.6634, + "step": 10525 + }, + { + "epoch": 2.9624419749613167, + "grad_norm": 3.65625, + "learning_rate": 1.529587291933081e-06, + "loss": 1.5429, + "step": 10530 + }, + { + "epoch": 2.963848642565762, + "grad_norm": 4.4375, + "learning_rate": 1.5257266711860308e-06, + "loss": 1.6514, + "step": 10535 + }, + { + "epoch": 2.965255310170207, + "grad_norm": 2.46875, + "learning_rate": 1.5218697801456802e-06, + "loss": 1.5855, + "step": 10540 + }, + { + "epoch": 2.9666619777746517, + "grad_norm": 3.796875, + "learning_rate": 1.5180166246258846e-06, + "loss": 1.5531, + "step": 10545 + }, + { + "epoch": 2.968068645379097, + "grad_norm": 2.84375, + "learning_rate": 1.5141672104348708e-06, + "loss": 1.6195, + "step": 10550 + }, + { + "epoch": 2.969475312983542, + "grad_norm": 3.171875, + "learning_rate": 1.5103215433752245e-06, + "loss": 1.616, + "step": 10555 + }, + { + "epoch": 2.970881980587987, + "grad_norm": 3.6875, + "learning_rate": 1.5064796292438868e-06, + "loss": 1.5293, + "step": 10560 + }, + { + "epoch": 2.972288648192432, + "grad_norm": 3.640625, + "learning_rate": 1.502641473832137e-06, + "loss": 1.4734, + "step": 10565 + }, + { + "epoch": 2.9736953157968773, + "grad_norm": 2.984375, + "learning_rate": 1.4988070829255902e-06, + "loss": 1.772, + "step": 10570 + }, + { + "epoch": 2.975101983401322, + "grad_norm": 3.046875, + "learning_rate": 1.4949764623041907e-06, + "loss": 1.6811, + "step": 10575 + }, + { + "epoch": 2.9765086510057674, + "grad_norm": 3.4375, + "learning_rate": 1.4911496177421903e-06, + "loss": 1.6491, + "step": 10580 + }, + { + "epoch": 2.9779153186102123, + "grad_norm": 3.515625, + "learning_rate": 1.4873265550081593e-06, + "loss": 1.5423, + "step": 10585 + }, + { + "epoch": 2.9793219862146576, + "grad_norm": 2.984375, + "learning_rate": 1.4835072798649607e-06, + "loss": 1.8184, + "step": 10590 + }, + { + "epoch": 2.9807286538191025, + "grad_norm": 2.765625, + "learning_rate": 1.47969179806975e-06, + "loss": 1.658, + "step": 10595 + }, + { + "epoch": 2.9821353214235478, + "grad_norm": 2.640625, + "learning_rate": 1.4758801153739632e-06, + "loss": 1.7618, + "step": 10600 + }, + { + "epoch": 2.9835419890279926, + "grad_norm": 3.890625, + "learning_rate": 1.4720722375233154e-06, + "loss": 1.693, + "step": 10605 + }, + { + "epoch": 2.984948656632438, + "grad_norm": 3.25, + "learning_rate": 1.4682681702577756e-06, + "loss": 1.5824, + "step": 10610 + }, + { + "epoch": 2.9863553242368828, + "grad_norm": 4.90625, + "learning_rate": 1.4644679193115793e-06, + "loss": 1.3904, + "step": 10615 + }, + { + "epoch": 2.987761991841328, + "grad_norm": 4.625, + "learning_rate": 1.4606714904132034e-06, + "loss": 1.4736, + "step": 10620 + }, + { + "epoch": 2.989168659445773, + "grad_norm": 3.859375, + "learning_rate": 1.4568788892853653e-06, + "loss": 1.4967, + "step": 10625 + }, + { + "epoch": 2.990575327050218, + "grad_norm": 3.3125, + "learning_rate": 1.4530901216450113e-06, + "loss": 1.5161, + "step": 10630 + }, + { + "epoch": 2.991981994654663, + "grad_norm": 3.90625, + "learning_rate": 1.4493051932033113e-06, + "loss": 1.6695, + "step": 10635 + }, + { + "epoch": 2.9933886622591084, + "grad_norm": 4.5625, + "learning_rate": 1.4455241096656466e-06, + "loss": 1.4315, + "step": 10640 + }, + { + "epoch": 2.9947953298635532, + "grad_norm": 2.96875, + "learning_rate": 1.4417468767316022e-06, + "loss": 1.7368, + "step": 10645 + }, + { + "epoch": 2.996201997467998, + "grad_norm": 4.53125, + "learning_rate": 1.437973500094959e-06, + "loss": 1.5735, + "step": 10650 + }, + { + "epoch": 2.9976086650724434, + "grad_norm": 2.828125, + "learning_rate": 1.4342039854436849e-06, + "loss": 1.4599, + "step": 10655 + }, + { + "epoch": 2.9990153326768887, + "grad_norm": 2.8125, + "learning_rate": 1.4304383384599281e-06, + "loss": 1.7635, + "step": 10660 + }, + { + "epoch": 2.9998593332395553, + "eval_loss": 1.5767865180969238, + "eval_runtime": 330.5515, + "eval_samples_per_second": 9.554, + "eval_steps_per_second": 4.777, + "step": 10663 + }, + { + "epoch": 3.0004220002813335, + "grad_norm": 3.015625, + "learning_rate": 1.4266765648200045e-06, + "loss": 1.6251, + "step": 10665 + }, + { + "epoch": 3.0018286678857784, + "grad_norm": 2.96875, + "learning_rate": 1.4229186701943925e-06, + "loss": 1.4799, + "step": 10670 + }, + { + "epoch": 3.0032353354902237, + "grad_norm": 3.015625, + "learning_rate": 1.4191646602477216e-06, + "loss": 1.5905, + "step": 10675 + }, + { + "epoch": 3.0046420030946686, + "grad_norm": 7.09375, + "learning_rate": 1.4154145406387681e-06, + "loss": 1.5103, + "step": 10680 + }, + { + "epoch": 3.006048670699114, + "grad_norm": 7.84375, + "learning_rate": 1.4116683170204407e-06, + "loss": 1.5813, + "step": 10685 + }, + { + "epoch": 3.0074553383035587, + "grad_norm": 2.640625, + "learning_rate": 1.40792599503978e-06, + "loss": 1.4847, + "step": 10690 + }, + { + "epoch": 3.008862005908004, + "grad_norm": 3.65625, + "learning_rate": 1.404187580337941e-06, + "loss": 1.6993, + "step": 10695 + }, + { + "epoch": 3.010268673512449, + "grad_norm": 2.875, + "learning_rate": 1.40045307855019e-06, + "loss": 1.7316, + "step": 10700 + }, + { + "epoch": 3.011675341116894, + "grad_norm": 3.71875, + "learning_rate": 1.3967224953058988e-06, + "loss": 1.6364, + "step": 10705 + }, + { + "epoch": 3.013082008721339, + "grad_norm": 3.59375, + "learning_rate": 1.3929958362285242e-06, + "loss": 1.6369, + "step": 10710 + }, + { + "epoch": 3.0144886763257843, + "grad_norm": 5.75, + "learning_rate": 1.3892731069356161e-06, + "loss": 1.6918, + "step": 10715 + }, + { + "epoch": 3.015895343930229, + "grad_norm": 3.671875, + "learning_rate": 1.3855543130387965e-06, + "loss": 1.6818, + "step": 10720 + }, + { + "epoch": 3.0173020115346745, + "grad_norm": 2.984375, + "learning_rate": 1.3818394601437557e-06, + "loss": 1.6582, + "step": 10725 + }, + { + "epoch": 3.0187086791391193, + "grad_norm": 4.21875, + "learning_rate": 1.3781285538502418e-06, + "loss": 1.522, + "step": 10730 + }, + { + "epoch": 3.0201153467435646, + "grad_norm": 3.578125, + "learning_rate": 1.3744215997520602e-06, + "loss": 1.4227, + "step": 10735 + }, + { + "epoch": 3.0215220143480095, + "grad_norm": 3.046875, + "learning_rate": 1.3707186034370484e-06, + "loss": 1.6587, + "step": 10740 + }, + { + "epoch": 3.022928681952455, + "grad_norm": 3.21875, + "learning_rate": 1.3670195704870883e-06, + "loss": 1.5284, + "step": 10745 + }, + { + "epoch": 3.0243353495568996, + "grad_norm": 3.390625, + "learning_rate": 1.3633245064780803e-06, + "loss": 1.6379, + "step": 10750 + }, + { + "epoch": 3.025742017161345, + "grad_norm": 2.90625, + "learning_rate": 1.3596334169799457e-06, + "loss": 1.4993, + "step": 10755 + }, + { + "epoch": 3.02714868476579, + "grad_norm": 3.015625, + "learning_rate": 1.355946307556612e-06, + "loss": 1.6514, + "step": 10760 + }, + { + "epoch": 3.028555352370235, + "grad_norm": 4.46875, + "learning_rate": 1.3522631837660123e-06, + "loss": 1.5571, + "step": 10765 + }, + { + "epoch": 3.02996201997468, + "grad_norm": 2.71875, + "learning_rate": 1.3485840511600636e-06, + "loss": 1.47, + "step": 10770 + }, + { + "epoch": 3.0313686875791253, + "grad_norm": 3.359375, + "learning_rate": 1.3449089152846726e-06, + "loss": 1.6482, + "step": 10775 + }, + { + "epoch": 3.03277535518357, + "grad_norm": 3.953125, + "learning_rate": 1.341237781679724e-06, + "loss": 1.5908, + "step": 10780 + }, + { + "epoch": 3.0341820227880154, + "grad_norm": 3.015625, + "learning_rate": 1.337570655879059e-06, + "loss": 1.6287, + "step": 10785 + }, + { + "epoch": 3.0355886903924603, + "grad_norm": 4.21875, + "learning_rate": 1.3339075434104885e-06, + "loss": 1.6187, + "step": 10790 + }, + { + "epoch": 3.036995357996905, + "grad_norm": 3.6875, + "learning_rate": 1.3302484497957678e-06, + "loss": 1.6408, + "step": 10795 + }, + { + "epoch": 3.0384020256013504, + "grad_norm": 4.09375, + "learning_rate": 1.3265933805505954e-06, + "loss": 1.7244, + "step": 10800 + }, + { + "epoch": 3.0398086932057953, + "grad_norm": 3.453125, + "learning_rate": 1.3229423411846018e-06, + "loss": 1.8399, + "step": 10805 + }, + { + "epoch": 3.0412153608102406, + "grad_norm": 3.34375, + "learning_rate": 1.319295337201349e-06, + "loss": 1.4242, + "step": 10810 + }, + { + "epoch": 3.0426220284146854, + "grad_norm": 3.609375, + "learning_rate": 1.315652374098307e-06, + "loss": 1.5995, + "step": 10815 + }, + { + "epoch": 3.0440286960191307, + "grad_norm": 2.96875, + "learning_rate": 1.3120134573668624e-06, + "loss": 1.6642, + "step": 10820 + }, + { + "epoch": 3.0454353636235756, + "grad_norm": 3.25, + "learning_rate": 1.3083785924922986e-06, + "loss": 1.3545, + "step": 10825 + }, + { + "epoch": 3.046842031228021, + "grad_norm": 3.953125, + "learning_rate": 1.3047477849537916e-06, + "loss": 1.3587, + "step": 10830 + }, + { + "epoch": 3.0482486988324657, + "grad_norm": 3.328125, + "learning_rate": 1.3011210402244008e-06, + "loss": 1.6701, + "step": 10835 + }, + { + "epoch": 3.049655366436911, + "grad_norm": 3.0, + "learning_rate": 1.2974983637710644e-06, + "loss": 1.5038, + "step": 10840 + }, + { + "epoch": 3.051062034041356, + "grad_norm": 3.078125, + "learning_rate": 1.293879761054585e-06, + "loss": 1.3233, + "step": 10845 + }, + { + "epoch": 3.052468701645801, + "grad_norm": 3.421875, + "learning_rate": 1.2902652375296255e-06, + "loss": 1.3758, + "step": 10850 + }, + { + "epoch": 3.053875369250246, + "grad_norm": 3.171875, + "learning_rate": 1.2866547986446993e-06, + "loss": 1.495, + "step": 10855 + }, + { + "epoch": 3.0552820368546914, + "grad_norm": 3.09375, + "learning_rate": 1.283048449842162e-06, + "loss": 1.4547, + "step": 10860 + }, + { + "epoch": 3.056688704459136, + "grad_norm": 3.421875, + "learning_rate": 1.2794461965582098e-06, + "loss": 1.5248, + "step": 10865 + }, + { + "epoch": 3.0580953720635815, + "grad_norm": 3.25, + "learning_rate": 1.275848044222854e-06, + "loss": 1.5124, + "step": 10870 + }, + { + "epoch": 3.0595020396680264, + "grad_norm": 3.1875, + "learning_rate": 1.2722539982599352e-06, + "loss": 1.4788, + "step": 10875 + }, + { + "epoch": 3.0609087072724717, + "grad_norm": 3.265625, + "learning_rate": 1.268664064087098e-06, + "loss": 1.5596, + "step": 10880 + }, + { + "epoch": 3.0623153748769165, + "grad_norm": 2.90625, + "learning_rate": 1.2650782471157904e-06, + "loss": 1.6641, + "step": 10885 + }, + { + "epoch": 3.063722042481362, + "grad_norm": 3.375, + "learning_rate": 1.2614965527512533e-06, + "loss": 1.4911, + "step": 10890 + }, + { + "epoch": 3.0651287100858067, + "grad_norm": 2.46875, + "learning_rate": 1.2579189863925175e-06, + "loss": 1.5126, + "step": 10895 + }, + { + "epoch": 3.066535377690252, + "grad_norm": 2.453125, + "learning_rate": 1.2543455534323828e-06, + "loss": 1.3572, + "step": 10900 + }, + { + "epoch": 3.067942045294697, + "grad_norm": 4.0625, + "learning_rate": 1.2507762592574272e-06, + "loss": 1.7074, + "step": 10905 + }, + { + "epoch": 3.069348712899142, + "grad_norm": 2.9375, + "learning_rate": 1.2472111092479853e-06, + "loss": 1.5994, + "step": 10910 + }, + { + "epoch": 3.070755380503587, + "grad_norm": 3.3125, + "learning_rate": 1.2436501087781435e-06, + "loss": 1.6489, + "step": 10915 + }, + { + "epoch": 3.0721620481080323, + "grad_norm": 4.90625, + "learning_rate": 1.2400932632157389e-06, + "loss": 1.4295, + "step": 10920 + }, + { + "epoch": 3.073568715712477, + "grad_norm": 3.765625, + "learning_rate": 1.2365405779223395e-06, + "loss": 1.6276, + "step": 10925 + }, + { + "epoch": 3.0749753833169224, + "grad_norm": 2.390625, + "learning_rate": 1.2329920582532451e-06, + "loss": 1.5665, + "step": 10930 + }, + { + "epoch": 3.0763820509213673, + "grad_norm": 3.328125, + "learning_rate": 1.229447709557475e-06, + "loss": 1.7107, + "step": 10935 + }, + { + "epoch": 3.077788718525812, + "grad_norm": 2.9375, + "learning_rate": 1.2259075371777648e-06, + "loss": 1.6144, + "step": 10940 + }, + { + "epoch": 3.0791953861302575, + "grad_norm": 3.890625, + "learning_rate": 1.2223715464505473e-06, + "loss": 1.7557, + "step": 10945 + }, + { + "epoch": 3.0806020537347023, + "grad_norm": 4.21875, + "learning_rate": 1.2188397427059607e-06, + "loss": 1.5287, + "step": 10950 + }, + { + "epoch": 3.0820087213391476, + "grad_norm": 3.125, + "learning_rate": 1.215312131267825e-06, + "loss": 1.8295, + "step": 10955 + }, + { + "epoch": 3.0834153889435925, + "grad_norm": 2.65625, + "learning_rate": 1.2117887174536444e-06, + "loss": 1.646, + "step": 10960 + }, + { + "epoch": 3.0848220565480378, + "grad_norm": 4.375, + "learning_rate": 1.2082695065745925e-06, + "loss": 1.6581, + "step": 10965 + }, + { + "epoch": 3.0862287241524826, + "grad_norm": 3.15625, + "learning_rate": 1.2047545039355141e-06, + "loss": 1.7364, + "step": 10970 + }, + { + "epoch": 3.087635391756928, + "grad_norm": 4.53125, + "learning_rate": 1.2012437148348994e-06, + "loss": 1.4243, + "step": 10975 + }, + { + "epoch": 3.0890420593613728, + "grad_norm": 3.546875, + "learning_rate": 1.1977371445648988e-06, + "loss": 1.5345, + "step": 10980 + }, + { + "epoch": 3.090448726965818, + "grad_norm": 4.3125, + "learning_rate": 1.1942347984112959e-06, + "loss": 1.6182, + "step": 10985 + }, + { + "epoch": 3.091855394570263, + "grad_norm": 2.34375, + "learning_rate": 1.1907366816535076e-06, + "loss": 1.686, + "step": 10990 + }, + { + "epoch": 3.0932620621747082, + "grad_norm": 3.34375, + "learning_rate": 1.1872427995645803e-06, + "loss": 1.6519, + "step": 10995 + }, + { + "epoch": 3.094668729779153, + "grad_norm": 2.515625, + "learning_rate": 1.183753157411171e-06, + "loss": 1.5121, + "step": 11000 + }, + { + "epoch": 3.0960753973835984, + "grad_norm": 2.90625, + "learning_rate": 1.1802677604535496e-06, + "loss": 1.5297, + "step": 11005 + }, + { + "epoch": 3.0974820649880432, + "grad_norm": 3.328125, + "learning_rate": 1.176786613945584e-06, + "loss": 1.3636, + "step": 11010 + }, + { + "epoch": 3.0988887325924885, + "grad_norm": 3.484375, + "learning_rate": 1.1733097231347372e-06, + "loss": 1.6959, + "step": 11015 + }, + { + "epoch": 3.1002954001969334, + "grad_norm": 2.875, + "learning_rate": 1.1698370932620538e-06, + "loss": 1.5115, + "step": 11020 + }, + { + "epoch": 3.1017020678013787, + "grad_norm": 3.90625, + "learning_rate": 1.1663687295621621e-06, + "loss": 1.5369, + "step": 11025 + }, + { + "epoch": 3.1031087354058235, + "grad_norm": 3.59375, + "learning_rate": 1.1629046372632524e-06, + "loss": 1.5513, + "step": 11030 + }, + { + "epoch": 3.104515403010269, + "grad_norm": 3.484375, + "learning_rate": 1.1594448215870812e-06, + "loss": 1.4403, + "step": 11035 + }, + { + "epoch": 3.1059220706147137, + "grad_norm": 3.859375, + "learning_rate": 1.1559892877489548e-06, + "loss": 1.6092, + "step": 11040 + }, + { + "epoch": 3.107328738219159, + "grad_norm": 3.109375, + "learning_rate": 1.1525380409577282e-06, + "loss": 1.6913, + "step": 11045 + }, + { + "epoch": 3.108735405823604, + "grad_norm": 3.015625, + "learning_rate": 1.1490910864157907e-06, + "loss": 1.4355, + "step": 11050 + }, + { + "epoch": 3.110142073428049, + "grad_norm": 3.203125, + "learning_rate": 1.1456484293190668e-06, + "loss": 1.624, + "step": 11055 + }, + { + "epoch": 3.111548741032494, + "grad_norm": 3.40625, + "learning_rate": 1.1422100748569982e-06, + "loss": 1.2994, + "step": 11060 + }, + { + "epoch": 3.112955408636939, + "grad_norm": 3.328125, + "learning_rate": 1.1387760282125412e-06, + "loss": 1.52, + "step": 11065 + }, + { + "epoch": 3.114362076241384, + "grad_norm": 2.859375, + "learning_rate": 1.1353462945621632e-06, + "loss": 1.5788, + "step": 11070 + }, + { + "epoch": 3.115768743845829, + "grad_norm": 3.046875, + "learning_rate": 1.1319208790758223e-06, + "loss": 1.542, + "step": 11075 + }, + { + "epoch": 3.1171754114502743, + "grad_norm": 3.65625, + "learning_rate": 1.1284997869169756e-06, + "loss": 1.5894, + "step": 11080 + }, + { + "epoch": 3.118582079054719, + "grad_norm": 3.328125, + "learning_rate": 1.125083023242558e-06, + "loss": 1.6021, + "step": 11085 + }, + { + "epoch": 3.1199887466591645, + "grad_norm": 2.9375, + "learning_rate": 1.1216705932029816e-06, + "loss": 1.7714, + "step": 11090 + }, + { + "epoch": 3.1213954142636093, + "grad_norm": 3.09375, + "learning_rate": 1.1182625019421244e-06, + "loss": 1.5162, + "step": 11095 + }, + { + "epoch": 3.1228020818680546, + "grad_norm": 4.25, + "learning_rate": 1.114858754597329e-06, + "loss": 1.7289, + "step": 11100 + }, + { + "epoch": 3.1242087494724995, + "grad_norm": 4.0625, + "learning_rate": 1.111459356299381e-06, + "loss": 1.5885, + "step": 11105 + }, + { + "epoch": 3.125615417076945, + "grad_norm": 4.1875, + "learning_rate": 1.10806431217252e-06, + "loss": 1.5521, + "step": 11110 + }, + { + "epoch": 3.1270220846813896, + "grad_norm": 3.953125, + "learning_rate": 1.104673627334416e-06, + "loss": 1.578, + "step": 11115 + }, + { + "epoch": 3.128428752285835, + "grad_norm": 3.59375, + "learning_rate": 1.1012873068961702e-06, + "loss": 1.6254, + "step": 11120 + }, + { + "epoch": 3.12983541989028, + "grad_norm": 4.0, + "learning_rate": 1.0979053559623026e-06, + "loss": 1.4843, + "step": 11125 + }, + { + "epoch": 3.131242087494725, + "grad_norm": 3.25, + "learning_rate": 1.0945277796307513e-06, + "loss": 1.5013, + "step": 11130 + }, + { + "epoch": 3.13264875509917, + "grad_norm": 4.6875, + "learning_rate": 1.0911545829928552e-06, + "loss": 1.5731, + "step": 11135 + }, + { + "epoch": 3.1340554227036153, + "grad_norm": 2.828125, + "learning_rate": 1.0877857711333534e-06, + "loss": 1.6624, + "step": 11140 + }, + { + "epoch": 3.13546209030806, + "grad_norm": 2.859375, + "learning_rate": 1.0844213491303772e-06, + "loss": 1.766, + "step": 11145 + }, + { + "epoch": 3.1368687579125054, + "grad_norm": 4.34375, + "learning_rate": 1.0810613220554356e-06, + "loss": 1.6699, + "step": 11150 + }, + { + "epoch": 3.1382754255169503, + "grad_norm": 3.625, + "learning_rate": 1.0777056949734187e-06, + "loss": 1.7605, + "step": 11155 + }, + { + "epoch": 3.1396820931213956, + "grad_norm": 3.5625, + "learning_rate": 1.0743544729425802e-06, + "loss": 1.6822, + "step": 11160 + }, + { + "epoch": 3.1410887607258404, + "grad_norm": 4.25, + "learning_rate": 1.0710076610145344e-06, + "loss": 1.6808, + "step": 11165 + }, + { + "epoch": 3.1424954283302857, + "grad_norm": 2.890625, + "learning_rate": 1.0676652642342471e-06, + "loss": 1.5603, + "step": 11170 + }, + { + "epoch": 3.1439020959347306, + "grad_norm": 3.359375, + "learning_rate": 1.064327287640034e-06, + "loss": 1.6544, + "step": 11175 + }, + { + "epoch": 3.145308763539176, + "grad_norm": 3.859375, + "learning_rate": 1.0609937362635376e-06, + "loss": 1.6709, + "step": 11180 + }, + { + "epoch": 3.1467154311436207, + "grad_norm": 3.5, + "learning_rate": 1.0576646151297404e-06, + "loss": 1.6512, + "step": 11185 + }, + { + "epoch": 3.148122098748066, + "grad_norm": 4.78125, + "learning_rate": 1.0543399292569404e-06, + "loss": 1.7634, + "step": 11190 + }, + { + "epoch": 3.149528766352511, + "grad_norm": 4.28125, + "learning_rate": 1.0510196836567522e-06, + "loss": 1.4378, + "step": 11195 + }, + { + "epoch": 3.150935433956956, + "grad_norm": 3.046875, + "learning_rate": 1.0477038833340964e-06, + "loss": 1.5599, + "step": 11200 + }, + { + "epoch": 3.152342101561401, + "grad_norm": 3.265625, + "learning_rate": 1.0443925332871914e-06, + "loss": 1.6039, + "step": 11205 + }, + { + "epoch": 3.153748769165846, + "grad_norm": 3.109375, + "learning_rate": 1.0410856385075528e-06, + "loss": 1.5945, + "step": 11210 + }, + { + "epoch": 3.155155436770291, + "grad_norm": 3.796875, + "learning_rate": 1.037783203979974e-06, + "loss": 1.514, + "step": 11215 + }, + { + "epoch": 3.156562104374736, + "grad_norm": 3.40625, + "learning_rate": 1.0344852346825282e-06, + "loss": 1.3017, + "step": 11220 + }, + { + "epoch": 3.1579687719791814, + "grad_norm": 2.78125, + "learning_rate": 1.0311917355865554e-06, + "loss": 1.4708, + "step": 11225 + }, + { + "epoch": 3.159375439583626, + "grad_norm": 3.8125, + "learning_rate": 1.027902711656663e-06, + "loss": 1.676, + "step": 11230 + }, + { + "epoch": 3.1607821071880715, + "grad_norm": 4.59375, + "learning_rate": 1.0246181678507038e-06, + "loss": 1.5518, + "step": 11235 + }, + { + "epoch": 3.1621887747925164, + "grad_norm": 3.1875, + "learning_rate": 1.0213381091197852e-06, + "loss": 1.561, + "step": 11240 + }, + { + "epoch": 3.1635954423969617, + "grad_norm": 2.515625, + "learning_rate": 1.0180625404082497e-06, + "loss": 1.7335, + "step": 11245 + }, + { + "epoch": 3.1650021100014065, + "grad_norm": 3.625, + "learning_rate": 1.0147914666536718e-06, + "loss": 1.5096, + "step": 11250 + }, + { + "epoch": 3.166408777605852, + "grad_norm": 5.3125, + "learning_rate": 1.011524892786851e-06, + "loss": 1.5547, + "step": 11255 + }, + { + "epoch": 3.1678154452102967, + "grad_norm": 4.125, + "learning_rate": 1.0082628237318065e-06, + "loss": 1.6803, + "step": 11260 + }, + { + "epoch": 3.169222112814742, + "grad_norm": 3.34375, + "learning_rate": 1.0050052644057592e-06, + "loss": 1.6371, + "step": 11265 + }, + { + "epoch": 3.170628780419187, + "grad_norm": 3.640625, + "learning_rate": 1.00175221971914e-06, + "loss": 1.6701, + "step": 11270 + }, + { + "epoch": 3.172035448023632, + "grad_norm": 3.59375, + "learning_rate": 9.985036945755734e-07, + "loss": 1.7211, + "step": 11275 + }, + { + "epoch": 3.173442115628077, + "grad_norm": 3.609375, + "learning_rate": 9.952596938718648e-07, + "loss": 1.5572, + "step": 11280 + }, + { + "epoch": 3.1748487832325223, + "grad_norm": 3.21875, + "learning_rate": 9.920202224980072e-07, + "loss": 1.5249, + "step": 11285 + }, + { + "epoch": 3.176255450836967, + "grad_norm": 2.671875, + "learning_rate": 9.887852853371615e-07, + "loss": 1.4818, + "step": 11290 + }, + { + "epoch": 3.1776621184414124, + "grad_norm": 4.15625, + "learning_rate": 9.855548872656557e-07, + "loss": 1.5171, + "step": 11295 + }, + { + "epoch": 3.1790687860458573, + "grad_norm": 3.5625, + "learning_rate": 9.823290331529736e-07, + "loss": 1.7529, + "step": 11300 + }, + { + "epoch": 3.1804754536503026, + "grad_norm": 4.65625, + "learning_rate": 9.791077278617538e-07, + "loss": 1.546, + "step": 11305 + }, + { + "epoch": 3.1818821212547475, + "grad_norm": 3.421875, + "learning_rate": 9.758909762477717e-07, + "loss": 1.6178, + "step": 11310 + }, + { + "epoch": 3.1832887888591928, + "grad_norm": 3.578125, + "learning_rate": 9.726787831599436e-07, + "loss": 1.4941, + "step": 11315 + }, + { + "epoch": 3.1846954564636376, + "grad_norm": 2.734375, + "learning_rate": 9.694711534403128e-07, + "loss": 1.6382, + "step": 11320 + }, + { + "epoch": 3.186102124068083, + "grad_norm": 3.390625, + "learning_rate": 9.662680919240434e-07, + "loss": 1.4399, + "step": 11325 + }, + { + "epoch": 3.1875087916725278, + "grad_norm": 3.078125, + "learning_rate": 9.630696034394118e-07, + "loss": 1.2156, + "step": 11330 + }, + { + "epoch": 3.1889154592769726, + "grad_norm": 5.40625, + "learning_rate": 9.598756928078069e-07, + "loss": 1.7228, + "step": 11335 + }, + { + "epoch": 3.190322126881418, + "grad_norm": 2.53125, + "learning_rate": 9.56686364843708e-07, + "loss": 1.6953, + "step": 11340 + }, + { + "epoch": 3.191728794485863, + "grad_norm": 3.46875, + "learning_rate": 9.535016243546952e-07, + "loss": 1.586, + "step": 11345 + }, + { + "epoch": 3.193135462090308, + "grad_norm": 2.953125, + "learning_rate": 9.503214761414277e-07, + "loss": 1.7237, + "step": 11350 + }, + { + "epoch": 3.194542129694753, + "grad_norm": 3.421875, + "learning_rate": 9.471459249976446e-07, + "loss": 1.4558, + "step": 11355 + }, + { + "epoch": 3.1959487972991982, + "grad_norm": 4.5, + "learning_rate": 9.439749757101561e-07, + "loss": 1.5914, + "step": 11360 + }, + { + "epoch": 3.197355464903643, + "grad_norm": 3.140625, + "learning_rate": 9.408086330588343e-07, + "loss": 1.6958, + "step": 11365 + }, + { + "epoch": 3.1987621325080884, + "grad_norm": 4.9375, + "learning_rate": 9.376469018166071e-07, + "loss": 1.6927, + "step": 11370 + }, + { + "epoch": 3.2001688001125332, + "grad_norm": 2.8125, + "learning_rate": 9.344897867494515e-07, + "loss": 1.6201, + "step": 11375 + }, + { + "epoch": 3.2015754677169785, + "grad_norm": 4.1875, + "learning_rate": 9.313372926163867e-07, + "loss": 1.5303, + "step": 11380 + }, + { + "epoch": 3.2029821353214234, + "grad_norm": 4.15625, + "learning_rate": 9.28189424169465e-07, + "loss": 1.7088, + "step": 11385 + }, + { + "epoch": 3.2043888029258687, + "grad_norm": 3.359375, + "learning_rate": 9.250461861537684e-07, + "loss": 1.5421, + "step": 11390 + }, + { + "epoch": 3.2057954705303136, + "grad_norm": 2.96875, + "learning_rate": 9.21907583307397e-07, + "loss": 1.6744, + "step": 11395 + }, + { + "epoch": 3.207202138134759, + "grad_norm": 3.703125, + "learning_rate": 9.187736203614638e-07, + "loss": 1.5669, + "step": 11400 + }, + { + "epoch": 3.2086088057392037, + "grad_norm": 3.859375, + "learning_rate": 9.156443020400883e-07, + "loss": 1.6098, + "step": 11405 + }, + { + "epoch": 3.210015473343649, + "grad_norm": 4.21875, + "learning_rate": 9.125196330603877e-07, + "loss": 1.504, + "step": 11410 + }, + { + "epoch": 3.211422140948094, + "grad_norm": 9.0, + "learning_rate": 9.093996181324742e-07, + "loss": 1.4803, + "step": 11415 + }, + { + "epoch": 3.212828808552539, + "grad_norm": 3.296875, + "learning_rate": 9.062842619594402e-07, + "loss": 1.5536, + "step": 11420 + }, + { + "epoch": 3.214235476156984, + "grad_norm": 2.3125, + "learning_rate": 9.031735692373578e-07, + "loss": 1.8039, + "step": 11425 + }, + { + "epoch": 3.2156421437614293, + "grad_norm": 3.703125, + "learning_rate": 9.000675446552679e-07, + "loss": 1.7332, + "step": 11430 + }, + { + "epoch": 3.217048811365874, + "grad_norm": 3.0, + "learning_rate": 8.969661928951789e-07, + "loss": 1.3582, + "step": 11435 + }, + { + "epoch": 3.2184554789703195, + "grad_norm": 3.015625, + "learning_rate": 8.938695186320475e-07, + "loss": 1.5736, + "step": 11440 + }, + { + "epoch": 3.2198621465747643, + "grad_norm": 2.71875, + "learning_rate": 8.90777526533788e-07, + "loss": 1.5083, + "step": 11445 + }, + { + "epoch": 3.2212688141792096, + "grad_norm": 4.59375, + "learning_rate": 8.87690221261252e-07, + "loss": 1.5167, + "step": 11450 + }, + { + "epoch": 3.2226754817836545, + "grad_norm": 3.234375, + "learning_rate": 8.846076074682276e-07, + "loss": 1.7006, + "step": 11455 + }, + { + "epoch": 3.2240821493881, + "grad_norm": 2.671875, + "learning_rate": 8.815296898014293e-07, + "loss": 1.5955, + "step": 11460 + }, + { + "epoch": 3.2254888169925446, + "grad_norm": 3.40625, + "learning_rate": 8.784564729004978e-07, + "loss": 1.6101, + "step": 11465 + }, + { + "epoch": 3.22689548459699, + "grad_norm": 3.328125, + "learning_rate": 8.753879613979789e-07, + "loss": 1.6842, + "step": 11470 + }, + { + "epoch": 3.228302152201435, + "grad_norm": 4.34375, + "learning_rate": 8.723241599193349e-07, + "loss": 1.2444, + "step": 11475 + }, + { + "epoch": 3.2297088198058796, + "grad_norm": 4.46875, + "learning_rate": 8.692650730829232e-07, + "loss": 1.2935, + "step": 11480 + }, + { + "epoch": 3.231115487410325, + "grad_norm": 4.375, + "learning_rate": 8.662107054999936e-07, + "loss": 1.6327, + "step": 11485 + }, + { + "epoch": 3.23252215501477, + "grad_norm": 2.9375, + "learning_rate": 8.631610617746865e-07, + "loss": 1.7187, + "step": 11490 + }, + { + "epoch": 3.233928822619215, + "grad_norm": 2.78125, + "learning_rate": 8.601161465040179e-07, + "loss": 1.5457, + "step": 11495 + }, + { + "epoch": 3.23533549022366, + "grad_norm": 2.59375, + "learning_rate": 8.570759642778766e-07, + "loss": 1.6327, + "step": 11500 + }, + { + "epoch": 3.2367421578281053, + "grad_norm": 3.53125, + "learning_rate": 8.54040519679017e-07, + "loss": 1.6916, + "step": 11505 + }, + { + "epoch": 3.23814882543255, + "grad_norm": 2.953125, + "learning_rate": 8.510098172830553e-07, + "loss": 1.7347, + "step": 11510 + }, + { + "epoch": 3.2395554930369954, + "grad_norm": 3.265625, + "learning_rate": 8.479838616584523e-07, + "loss": 1.763, + "step": 11515 + }, + { + "epoch": 3.2409621606414403, + "grad_norm": 2.796875, + "learning_rate": 8.449626573665209e-07, + "loss": 1.791, + "step": 11520 + }, + { + "epoch": 3.2423688282458856, + "grad_norm": 3.359375, + "learning_rate": 8.419462089614073e-07, + "loss": 1.5492, + "step": 11525 + }, + { + "epoch": 3.2437754958503304, + "grad_norm": 2.6875, + "learning_rate": 8.389345209900907e-07, + "loss": 1.3123, + "step": 11530 + }, + { + "epoch": 3.2451821634547757, + "grad_norm": 4.53125, + "learning_rate": 8.359275979923723e-07, + "loss": 1.6477, + "step": 11535 + }, + { + "epoch": 3.2465888310592206, + "grad_norm": 4.15625, + "learning_rate": 8.329254445008755e-07, + "loss": 1.663, + "step": 11540 + }, + { + "epoch": 3.247995498663666, + "grad_norm": 3.390625, + "learning_rate": 8.299280650410265e-07, + "loss": 1.3256, + "step": 11545 + }, + { + "epoch": 3.2494021662681107, + "grad_norm": 4.0625, + "learning_rate": 8.269354641310627e-07, + "loss": 1.5578, + "step": 11550 + }, + { + "epoch": 3.250808833872556, + "grad_norm": 3.59375, + "learning_rate": 8.239476462820136e-07, + "loss": 1.6555, + "step": 11555 + }, + { + "epoch": 3.252215501477001, + "grad_norm": 2.390625, + "learning_rate": 8.209646159976999e-07, + "loss": 1.6347, + "step": 11560 + }, + { + "epoch": 3.253622169081446, + "grad_norm": 3.546875, + "learning_rate": 8.179863777747287e-07, + "loss": 1.4629, + "step": 11565 + }, + { + "epoch": 3.255028836685891, + "grad_norm": 3.5, + "learning_rate": 8.150129361024762e-07, + "loss": 1.6009, + "step": 11570 + }, + { + "epoch": 3.2564355042903363, + "grad_norm": 3.6875, + "learning_rate": 8.120442954630964e-07, + "loss": 1.3342, + "step": 11575 + }, + { + "epoch": 3.257842171894781, + "grad_norm": 3.34375, + "learning_rate": 8.090804603315016e-07, + "loss": 1.6218, + "step": 11580 + }, + { + "epoch": 3.2592488394992265, + "grad_norm": 4.75, + "learning_rate": 8.061214351753616e-07, + "loss": 1.6242, + "step": 11585 + }, + { + "epoch": 3.2606555071036714, + "grad_norm": 2.40625, + "learning_rate": 8.031672244550938e-07, + "loss": 1.5137, + "step": 11590 + }, + { + "epoch": 3.2620621747081167, + "grad_norm": 3.0625, + "learning_rate": 8.002178326238636e-07, + "loss": 1.6331, + "step": 11595 + }, + { + "epoch": 3.2634688423125615, + "grad_norm": 4.21875, + "learning_rate": 7.972732641275648e-07, + "loss": 1.2351, + "step": 11600 + }, + { + "epoch": 3.2648755099170064, + "grad_norm": 3.3125, + "learning_rate": 7.943335234048274e-07, + "loss": 1.539, + "step": 11605 + }, + { + "epoch": 3.2662821775214517, + "grad_norm": 5.375, + "learning_rate": 7.91398614887e-07, + "loss": 1.53, + "step": 11610 + }, + { + "epoch": 3.267688845125897, + "grad_norm": 3.53125, + "learning_rate": 7.88468542998149e-07, + "loss": 1.5524, + "step": 11615 + }, + { + "epoch": 3.269095512730342, + "grad_norm": 3.109375, + "learning_rate": 7.855433121550481e-07, + "loss": 1.4712, + "step": 11620 + }, + { + "epoch": 3.2705021803347867, + "grad_norm": 2.578125, + "learning_rate": 7.826229267671771e-07, + "loss": 1.4857, + "step": 11625 + }, + { + "epoch": 3.271908847939232, + "grad_norm": 4.4375, + "learning_rate": 7.797073912367085e-07, + "loss": 1.6786, + "step": 11630 + }, + { + "epoch": 3.2733155155436773, + "grad_norm": 3.53125, + "learning_rate": 7.767967099585044e-07, + "loss": 1.5137, + "step": 11635 + }, + { + "epoch": 3.274722183148122, + "grad_norm": 4.03125, + "learning_rate": 7.73890887320114e-07, + "loss": 1.7585, + "step": 11640 + }, + { + "epoch": 3.276128850752567, + "grad_norm": 2.8125, + "learning_rate": 7.709899277017546e-07, + "loss": 1.6304, + "step": 11645 + }, + { + "epoch": 3.2775355183570123, + "grad_norm": 3.296875, + "learning_rate": 7.680938354763205e-07, + "loss": 1.5074, + "step": 11650 + }, + { + "epoch": 3.278942185961457, + "grad_norm": 2.90625, + "learning_rate": 7.652026150093656e-07, + "loss": 1.6919, + "step": 11655 + }, + { + "epoch": 3.2803488535659024, + "grad_norm": 4.1875, + "learning_rate": 7.623162706591002e-07, + "loss": 1.5654, + "step": 11660 + }, + { + "epoch": 3.2817555211703473, + "grad_norm": 3.390625, + "learning_rate": 7.594348067763837e-07, + "loss": 1.5234, + "step": 11665 + }, + { + "epoch": 3.2831621887747926, + "grad_norm": 3.203125, + "learning_rate": 7.565582277047227e-07, + "loss": 1.4964, + "step": 11670 + }, + { + "epoch": 3.2845688563792375, + "grad_norm": 3.28125, + "learning_rate": 7.536865377802532e-07, + "loss": 1.5708, + "step": 11675 + }, + { + "epoch": 3.2859755239836828, + "grad_norm": 4.71875, + "learning_rate": 7.508197413317491e-07, + "loss": 1.4245, + "step": 11680 + }, + { + "epoch": 3.2873821915881276, + "grad_norm": 3.234375, + "learning_rate": 7.479578426806035e-07, + "loss": 1.4793, + "step": 11685 + }, + { + "epoch": 3.288788859192573, + "grad_norm": 3.109375, + "learning_rate": 7.45100846140827e-07, + "loss": 1.6957, + "step": 11690 + }, + { + "epoch": 3.2901955267970178, + "grad_norm": 3.203125, + "learning_rate": 7.422487560190407e-07, + "loss": 1.4461, + "step": 11695 + }, + { + "epoch": 3.291602194401463, + "grad_norm": 2.9375, + "learning_rate": 7.394015766144717e-07, + "loss": 1.7007, + "step": 11700 + }, + { + "epoch": 3.293008862005908, + "grad_norm": 4.0625, + "learning_rate": 7.365593122189428e-07, + "loss": 1.4112, + "step": 11705 + }, + { + "epoch": 3.294415529610353, + "grad_norm": 3.96875, + "learning_rate": 7.337219671168689e-07, + "loss": 1.5714, + "step": 11710 + }, + { + "epoch": 3.295822197214798, + "grad_norm": 2.828125, + "learning_rate": 7.308895455852484e-07, + "loss": 1.5193, + "step": 11715 + }, + { + "epoch": 3.2972288648192434, + "grad_norm": 3.421875, + "learning_rate": 7.280620518936582e-07, + "loss": 1.4024, + "step": 11720 + }, + { + "epoch": 3.2986355324236882, + "grad_norm": 3.078125, + "learning_rate": 7.252394903042498e-07, + "loss": 1.6127, + "step": 11725 + }, + { + "epoch": 3.3000422000281335, + "grad_norm": 3.8125, + "learning_rate": 7.224218650717361e-07, + "loss": 1.302, + "step": 11730 + }, + { + "epoch": 3.3014488676325784, + "grad_norm": 2.890625, + "learning_rate": 7.196091804433911e-07, + "loss": 1.7009, + "step": 11735 + }, + { + "epoch": 3.3028555352370237, + "grad_norm": 3.375, + "learning_rate": 7.168014406590405e-07, + "loss": 1.6338, + "step": 11740 + }, + { + "epoch": 3.3042622028414685, + "grad_norm": 3.84375, + "learning_rate": 7.139986499510575e-07, + "loss": 1.6938, + "step": 11745 + }, + { + "epoch": 3.3056688704459134, + "grad_norm": 3.5625, + "learning_rate": 7.112008125443524e-07, + "loss": 1.4994, + "step": 11750 + }, + { + "epoch": 3.3070755380503587, + "grad_norm": 3.421875, + "learning_rate": 7.084079326563728e-07, + "loss": 1.5465, + "step": 11755 + }, + { + "epoch": 3.308482205654804, + "grad_norm": 4.96875, + "learning_rate": 7.056200144970907e-07, + "loss": 1.2758, + "step": 11760 + }, + { + "epoch": 3.309888873259249, + "grad_norm": 4.25, + "learning_rate": 7.02837062268999e-07, + "loss": 1.6104, + "step": 11765 + }, + { + "epoch": 3.3112955408636937, + "grad_norm": 3.171875, + "learning_rate": 7.000590801671049e-07, + "loss": 1.3466, + "step": 11770 + }, + { + "epoch": 3.312702208468139, + "grad_norm": 4.03125, + "learning_rate": 6.972860723789243e-07, + "loss": 1.7502, + "step": 11775 + }, + { + "epoch": 3.314108876072584, + "grad_norm": 3.0625, + "learning_rate": 6.945180430844754e-07, + "loss": 1.7875, + "step": 11780 + }, + { + "epoch": 3.315515543677029, + "grad_norm": 3.03125, + "learning_rate": 6.917549964562712e-07, + "loss": 1.503, + "step": 11785 + }, + { + "epoch": 3.316922211281474, + "grad_norm": 4.09375, + "learning_rate": 6.889969366593127e-07, + "loss": 1.4491, + "step": 11790 + }, + { + "epoch": 3.3183288788859193, + "grad_norm": 4.78125, + "learning_rate": 6.862438678510849e-07, + "loss": 1.2961, + "step": 11795 + }, + { + "epoch": 3.319735546490364, + "grad_norm": 3.515625, + "learning_rate": 6.834957941815518e-07, + "loss": 1.6573, + "step": 11800 + }, + { + "epoch": 3.3211422140948095, + "grad_norm": 3.046875, + "learning_rate": 6.807527197931411e-07, + "loss": 1.3254, + "step": 11805 + }, + { + "epoch": 3.3225488816992543, + "grad_norm": 3.21875, + "learning_rate": 6.780146488207524e-07, + "loss": 1.5744, + "step": 11810 + }, + { + "epoch": 3.3239555493036996, + "grad_norm": 2.921875, + "learning_rate": 6.752815853917377e-07, + "loss": 1.5929, + "step": 11815 + }, + { + "epoch": 3.3253622169081445, + "grad_norm": 3.109375, + "learning_rate": 6.725535336259036e-07, + "loss": 1.687, + "step": 11820 + }, + { + "epoch": 3.32676888451259, + "grad_norm": 2.828125, + "learning_rate": 6.698304976354992e-07, + "loss": 1.5552, + "step": 11825 + }, + { + "epoch": 3.3281755521170346, + "grad_norm": 3.3125, + "learning_rate": 6.671124815252182e-07, + "loss": 1.4686, + "step": 11830 + }, + { + "epoch": 3.32958221972148, + "grad_norm": 3.890625, + "learning_rate": 6.643994893921801e-07, + "loss": 1.5944, + "step": 11835 + }, + { + "epoch": 3.330988887325925, + "grad_norm": 3.140625, + "learning_rate": 6.616915253259367e-07, + "loss": 1.3498, + "step": 11840 + }, + { + "epoch": 3.33239555493037, + "grad_norm": 3.234375, + "learning_rate": 6.589885934084609e-07, + "loss": 1.8226, + "step": 11845 + }, + { + "epoch": 3.333802222534815, + "grad_norm": 3.515625, + "learning_rate": 6.562906977141342e-07, + "loss": 1.5916, + "step": 11850 + }, + { + "epoch": 3.3352088901392603, + "grad_norm": 3.75, + "learning_rate": 6.535978423097535e-07, + "loss": 1.2771, + "step": 11855 + }, + { + "epoch": 3.336615557743705, + "grad_norm": 3.65625, + "learning_rate": 6.509100312545142e-07, + "loss": 1.7791, + "step": 11860 + }, + { + "epoch": 3.3380222253481504, + "grad_norm": 2.84375, + "learning_rate": 6.482272686000083e-07, + "loss": 1.5699, + "step": 11865 + }, + { + "epoch": 3.3394288929525953, + "grad_norm": 3.890625, + "learning_rate": 6.455495583902175e-07, + "loss": 1.5746, + "step": 11870 + }, + { + "epoch": 3.3408355605570406, + "grad_norm": 4.59375, + "learning_rate": 6.428769046615108e-07, + "loss": 1.7, + "step": 11875 + }, + { + "epoch": 3.3422422281614854, + "grad_norm": 3.6875, + "learning_rate": 6.402093114426291e-07, + "loss": 1.6023, + "step": 11880 + }, + { + "epoch": 3.3436488957659307, + "grad_norm": 3.859375, + "learning_rate": 6.375467827546908e-07, + "loss": 1.6378, + "step": 11885 + }, + { + "epoch": 3.3450555633703756, + "grad_norm": 4.71875, + "learning_rate": 6.348893226111775e-07, + "loss": 1.561, + "step": 11890 + }, + { + "epoch": 3.3464622309748204, + "grad_norm": 3.390625, + "learning_rate": 6.3223693501793e-07, + "loss": 1.6388, + "step": 11895 + }, + { + "epoch": 3.3478688985792657, + "grad_norm": 3.5625, + "learning_rate": 6.29589623973143e-07, + "loss": 1.6767, + "step": 11900 + }, + { + "epoch": 3.349275566183711, + "grad_norm": 2.953125, + "learning_rate": 6.269473934673617e-07, + "loss": 1.6953, + "step": 11905 + }, + { + "epoch": 3.350682233788156, + "grad_norm": 4.125, + "learning_rate": 6.243102474834679e-07, + "loss": 1.6556, + "step": 11910 + }, + { + "epoch": 3.3520889013926007, + "grad_norm": 3.203125, + "learning_rate": 6.21678189996683e-07, + "loss": 1.5383, + "step": 11915 + }, + { + "epoch": 3.353495568997046, + "grad_norm": 4.125, + "learning_rate": 6.19051224974557e-07, + "loss": 1.2658, + "step": 11920 + }, + { + "epoch": 3.354902236601491, + "grad_norm": 2.796875, + "learning_rate": 6.164293563769618e-07, + "loss": 1.6482, + "step": 11925 + }, + { + "epoch": 3.356308904205936, + "grad_norm": 3.875, + "learning_rate": 6.138125881560912e-07, + "loss": 1.7493, + "step": 11930 + }, + { + "epoch": 3.357715571810381, + "grad_norm": 4.5, + "learning_rate": 6.112009242564444e-07, + "loss": 1.4005, + "step": 11935 + }, + { + "epoch": 3.3591222394148263, + "grad_norm": 3.0, + "learning_rate": 6.085943686148329e-07, + "loss": 1.6748, + "step": 11940 + }, + { + "epoch": 3.360528907019271, + "grad_norm": 2.796875, + "learning_rate": 6.059929251603635e-07, + "loss": 1.4217, + "step": 11945 + }, + { + "epoch": 3.3619355746237165, + "grad_norm": 2.796875, + "learning_rate": 6.033965978144393e-07, + "loss": 1.7179, + "step": 11950 + }, + { + "epoch": 3.3633422422281614, + "grad_norm": 4.03125, + "learning_rate": 6.008053904907489e-07, + "loss": 1.5697, + "step": 11955 + }, + { + "epoch": 3.3647489098326067, + "grad_norm": 3.0, + "learning_rate": 5.982193070952677e-07, + "loss": 1.4438, + "step": 11960 + }, + { + "epoch": 3.3661555774370515, + "grad_norm": 4.15625, + "learning_rate": 5.956383515262411e-07, + "loss": 1.6678, + "step": 11965 + }, + { + "epoch": 3.367562245041497, + "grad_norm": 3.078125, + "learning_rate": 5.930625276741903e-07, + "loss": 1.66, + "step": 11970 + }, + { + "epoch": 3.3689689126459417, + "grad_norm": 3.5625, + "learning_rate": 5.904918394218978e-07, + "loss": 1.4462, + "step": 11975 + }, + { + "epoch": 3.370375580250387, + "grad_norm": 3.59375, + "learning_rate": 5.879262906444049e-07, + "loss": 1.6758, + "step": 11980 + }, + { + "epoch": 3.371782247854832, + "grad_norm": 3.09375, + "learning_rate": 5.853658852090082e-07, + "loss": 1.4858, + "step": 11985 + }, + { + "epoch": 3.373188915459277, + "grad_norm": 3.015625, + "learning_rate": 5.828106269752488e-07, + "loss": 1.3861, + "step": 11990 + }, + { + "epoch": 3.374595583063722, + "grad_norm": 2.578125, + "learning_rate": 5.802605197949093e-07, + "loss": 1.5823, + "step": 11995 + }, + { + "epoch": 3.3760022506681673, + "grad_norm": 2.984375, + "learning_rate": 5.777155675120071e-07, + "loss": 1.5136, + "step": 12000 + }, + { + "epoch": 3.377408918272612, + "grad_norm": 3.765625, + "learning_rate": 5.751757739627931e-07, + "loss": 1.4484, + "step": 12005 + }, + { + "epoch": 3.3788155858770574, + "grad_norm": 3.140625, + "learning_rate": 5.726411429757347e-07, + "loss": 1.6736, + "step": 12010 + }, + { + "epoch": 3.3802222534815023, + "grad_norm": 15.5625, + "learning_rate": 5.701116783715241e-07, + "loss": 1.399, + "step": 12015 + }, + { + "epoch": 3.381628921085947, + "grad_norm": 2.828125, + "learning_rate": 5.675873839630627e-07, + "loss": 1.7158, + "step": 12020 + }, + { + "epoch": 3.3830355886903924, + "grad_norm": 3.890625, + "learning_rate": 5.65068263555458e-07, + "loss": 1.6117, + "step": 12025 + }, + { + "epoch": 3.3844422562948377, + "grad_norm": 2.96875, + "learning_rate": 5.625543209460186e-07, + "loss": 1.592, + "step": 12030 + }, + { + "epoch": 3.3858489238992826, + "grad_norm": 3.203125, + "learning_rate": 5.60045559924251e-07, + "loss": 1.6539, + "step": 12035 + }, + { + "epoch": 3.3872555915037275, + "grad_norm": 2.75, + "learning_rate": 5.57541984271845e-07, + "loss": 1.5038, + "step": 12040 + }, + { + "epoch": 3.3886622591081728, + "grad_norm": 2.453125, + "learning_rate": 5.550435977626797e-07, + "loss": 1.4934, + "step": 12045 + }, + { + "epoch": 3.390068926712618, + "grad_norm": 3.484375, + "learning_rate": 5.525504041628095e-07, + "loss": 1.4347, + "step": 12050 + }, + { + "epoch": 3.391475594317063, + "grad_norm": 2.875, + "learning_rate": 5.5006240723046e-07, + "loss": 1.698, + "step": 12055 + }, + { + "epoch": 3.3928822619215078, + "grad_norm": 2.953125, + "learning_rate": 5.475796107160273e-07, + "loss": 1.5576, + "step": 12060 + }, + { + "epoch": 3.394288929525953, + "grad_norm": 4.25, + "learning_rate": 5.451020183620642e-07, + "loss": 1.5214, + "step": 12065 + }, + { + "epoch": 3.395695597130398, + "grad_norm": 4.875, + "learning_rate": 5.426296339032812e-07, + "loss": 1.5787, + "step": 12070 + }, + { + "epoch": 3.3971022647348432, + "grad_norm": 3.6875, + "learning_rate": 5.401624610665374e-07, + "loss": 1.5412, + "step": 12075 + }, + { + "epoch": 3.398508932339288, + "grad_norm": 3.4375, + "learning_rate": 5.377005035708362e-07, + "loss": 1.5295, + "step": 12080 + }, + { + "epoch": 3.3999155999437334, + "grad_norm": 3.296875, + "learning_rate": 5.352437651273183e-07, + "loss": 1.5275, + "step": 12085 + }, + { + "epoch": 3.4013222675481782, + "grad_norm": 2.984375, + "learning_rate": 5.32792249439261e-07, + "loss": 1.542, + "step": 12090 + }, + { + "epoch": 3.4027289351526235, + "grad_norm": 3.703125, + "learning_rate": 5.303459602020646e-07, + "loss": 1.3692, + "step": 12095 + }, + { + "epoch": 3.4041356027570684, + "grad_norm": 1.9296875, + "learning_rate": 5.279049011032533e-07, + "loss": 1.631, + "step": 12100 + }, + { + "epoch": 3.4055422703615137, + "grad_norm": 3.671875, + "learning_rate": 5.254690758224663e-07, + "loss": 1.7059, + "step": 12105 + }, + { + "epoch": 3.4069489379659585, + "grad_norm": 3.09375, + "learning_rate": 5.23038488031454e-07, + "loss": 1.3645, + "step": 12110 + }, + { + "epoch": 3.408355605570404, + "grad_norm": 3.8125, + "learning_rate": 5.206131413940711e-07, + "loss": 1.4428, + "step": 12115 + }, + { + "epoch": 3.4097622731748487, + "grad_norm": 2.875, + "learning_rate": 5.181930395662744e-07, + "loss": 1.6279, + "step": 12120 + }, + { + "epoch": 3.411168940779294, + "grad_norm": 3.359375, + "learning_rate": 5.157781861961115e-07, + "loss": 1.7865, + "step": 12125 + }, + { + "epoch": 3.412575608383739, + "grad_norm": 3.859375, + "learning_rate": 5.133685849237191e-07, + "loss": 1.6429, + "step": 12130 + }, + { + "epoch": 3.413982275988184, + "grad_norm": 3.6875, + "learning_rate": 5.109642393813201e-07, + "loss": 1.6585, + "step": 12135 + }, + { + "epoch": 3.415388943592629, + "grad_norm": 2.65625, + "learning_rate": 5.085651531932087e-07, + "loss": 1.6252, + "step": 12140 + }, + { + "epoch": 3.4167956111970743, + "grad_norm": 2.90625, + "learning_rate": 5.061713299757579e-07, + "loss": 1.647, + "step": 12145 + }, + { + "epoch": 3.418202278801519, + "grad_norm": 3.125, + "learning_rate": 5.037827733374031e-07, + "loss": 1.5837, + "step": 12150 + }, + { + "epoch": 3.4196089464059645, + "grad_norm": 5.40625, + "learning_rate": 5.013994868786429e-07, + "loss": 1.3133, + "step": 12155 + }, + { + "epoch": 3.4210156140104093, + "grad_norm": 3.375, + "learning_rate": 4.990214741920287e-07, + "loss": 1.81, + "step": 12160 + }, + { + "epoch": 3.422422281614854, + "grad_norm": 2.875, + "learning_rate": 4.966487388621679e-07, + "loss": 1.3786, + "step": 12165 + }, + { + "epoch": 3.4238289492192995, + "grad_norm": 3.515625, + "learning_rate": 4.942812844657061e-07, + "loss": 1.3051, + "step": 12170 + }, + { + "epoch": 3.4252356168237448, + "grad_norm": 4.4375, + "learning_rate": 4.919191145713335e-07, + "loss": 1.7033, + "step": 12175 + }, + { + "epoch": 3.4266422844281896, + "grad_norm": 2.875, + "learning_rate": 4.895622327397722e-07, + "loss": 1.54, + "step": 12180 + }, + { + "epoch": 3.4280489520326345, + "grad_norm": 3.921875, + "learning_rate": 4.872106425237734e-07, + "loss": 1.4154, + "step": 12185 + }, + { + "epoch": 3.42945561963708, + "grad_norm": 3.390625, + "learning_rate": 4.848643474681115e-07, + "loss": 1.3742, + "step": 12190 + }, + { + "epoch": 3.4308622872415246, + "grad_norm": 3.65625, + "learning_rate": 4.82523351109581e-07, + "loss": 1.5347, + "step": 12195 + }, + { + "epoch": 3.43226895484597, + "grad_norm": 3.328125, + "learning_rate": 4.801876569769865e-07, + "loss": 1.6441, + "step": 12200 + }, + { + "epoch": 3.433675622450415, + "grad_norm": 2.984375, + "learning_rate": 4.778572685911402e-07, + "loss": 1.4506, + "step": 12205 + }, + { + "epoch": 3.43508229005486, + "grad_norm": 2.78125, + "learning_rate": 4.7553218946486007e-07, + "loss": 1.4641, + "step": 12210 + }, + { + "epoch": 3.436488957659305, + "grad_norm": 4.1875, + "learning_rate": 4.732124231029546e-07, + "loss": 1.6343, + "step": 12215 + }, + { + "epoch": 3.4378956252637503, + "grad_norm": 3.734375, + "learning_rate": 4.708979730022307e-07, + "loss": 1.3519, + "step": 12220 + }, + { + "epoch": 3.439302292868195, + "grad_norm": 4.375, + "learning_rate": 4.6858884265147705e-07, + "loss": 1.6333, + "step": 12225 + }, + { + "epoch": 3.4407089604726404, + "grad_norm": 3.09375, + "learning_rate": 4.662850355314649e-07, + "loss": 1.4561, + "step": 12230 + }, + { + "epoch": 3.4421156280770853, + "grad_norm": 3.9375, + "learning_rate": 4.6398655511494e-07, + "loss": 1.569, + "step": 12235 + }, + { + "epoch": 3.4435222956815306, + "grad_norm": 3.3125, + "learning_rate": 4.6169340486662234e-07, + "loss": 1.684, + "step": 12240 + }, + { + "epoch": 3.4449289632859754, + "grad_norm": 2.609375, + "learning_rate": 4.594055882431913e-07, + "loss": 1.3547, + "step": 12245 + }, + { + "epoch": 3.4463356308904207, + "grad_norm": 3.90625, + "learning_rate": 4.571231086932923e-07, + "loss": 1.6273, + "step": 12250 + }, + { + "epoch": 3.4477422984948656, + "grad_norm": 3.234375, + "learning_rate": 4.54845969657522e-07, + "loss": 1.7027, + "step": 12255 + }, + { + "epoch": 3.449148966099311, + "grad_norm": 3.46875, + "learning_rate": 4.525741745684284e-07, + "loss": 1.5842, + "step": 12260 + }, + { + "epoch": 3.4505556337037557, + "grad_norm": 4.46875, + "learning_rate": 4.50307726850502e-07, + "loss": 1.6175, + "step": 12265 + }, + { + "epoch": 3.451962301308201, + "grad_norm": 3.140625, + "learning_rate": 4.480466299201766e-07, + "loss": 1.4567, + "step": 12270 + }, + { + "epoch": 3.453368968912646, + "grad_norm": 4.4375, + "learning_rate": 4.457908871858169e-07, + "loss": 1.4125, + "step": 12275 + }, + { + "epoch": 3.454775636517091, + "grad_norm": 3.25, + "learning_rate": 4.435405020477172e-07, + "loss": 1.5601, + "step": 12280 + }, + { + "epoch": 3.456182304121536, + "grad_norm": 4.28125, + "learning_rate": 4.412954778980968e-07, + "loss": 1.6602, + "step": 12285 + }, + { + "epoch": 3.4575889717259813, + "grad_norm": 2.765625, + "learning_rate": 4.390558181210928e-07, + "loss": 1.8053, + "step": 12290 + }, + { + "epoch": 3.458995639330426, + "grad_norm": 3.40625, + "learning_rate": 4.368215260927588e-07, + "loss": 1.5461, + "step": 12295 + }, + { + "epoch": 3.4604023069348715, + "grad_norm": 3.3125, + "learning_rate": 4.3459260518105134e-07, + "loss": 1.5155, + "step": 12300 + }, + { + "epoch": 3.4618089745393164, + "grad_norm": 4.5, + "learning_rate": 4.3236905874583704e-07, + "loss": 1.3204, + "step": 12305 + }, + { + "epoch": 3.463215642143761, + "grad_norm": 2.328125, + "learning_rate": 4.3015089013887753e-07, + "loss": 1.4738, + "step": 12310 + }, + { + "epoch": 3.4646223097482065, + "grad_norm": 3.390625, + "learning_rate": 4.279381027038278e-07, + "loss": 1.5011, + "step": 12315 + }, + { + "epoch": 3.466028977352652, + "grad_norm": 4.59375, + "learning_rate": 4.257306997762322e-07, + "loss": 1.3831, + "step": 12320 + }, + { + "epoch": 3.4674356449570967, + "grad_norm": 4.21875, + "learning_rate": 4.235286846835202e-07, + "loss": 1.5323, + "step": 12325 + }, + { + "epoch": 3.4688423125615415, + "grad_norm": 3.109375, + "learning_rate": 4.2133206074499527e-07, + "loss": 1.5963, + "step": 12330 + }, + { + "epoch": 3.470248980165987, + "grad_norm": 3.875, + "learning_rate": 4.191408312718385e-07, + "loss": 1.6031, + "step": 12335 + }, + { + "epoch": 3.4716556477704317, + "grad_norm": 3.453125, + "learning_rate": 4.169549995670971e-07, + "loss": 1.6792, + "step": 12340 + }, + { + "epoch": 3.473062315374877, + "grad_norm": 4.375, + "learning_rate": 4.147745689256821e-07, + "loss": 1.526, + "step": 12345 + }, + { + "epoch": 3.474468982979322, + "grad_norm": 2.96875, + "learning_rate": 4.1259954263436426e-07, + "loss": 1.6437, + "step": 12350 + }, + { + "epoch": 3.475875650583767, + "grad_norm": 4.125, + "learning_rate": 4.104299239717668e-07, + "loss": 1.5831, + "step": 12355 + }, + { + "epoch": 3.477282318188212, + "grad_norm": 3.203125, + "learning_rate": 4.082657162083607e-07, + "loss": 1.6606, + "step": 12360 + }, + { + "epoch": 3.4786889857926573, + "grad_norm": 5.875, + "learning_rate": 4.0610692260646085e-07, + "loss": 1.6935, + "step": 12365 + }, + { + "epoch": 3.480095653397102, + "grad_norm": 3.140625, + "learning_rate": 4.039535464202242e-07, + "loss": 1.6452, + "step": 12370 + }, + { + "epoch": 3.4815023210015474, + "grad_norm": 2.75, + "learning_rate": 4.018055908956355e-07, + "loss": 1.5527, + "step": 12375 + }, + { + "epoch": 3.4829089886059923, + "grad_norm": 3.4375, + "learning_rate": 3.9966305927051416e-07, + "loss": 1.5381, + "step": 12380 + }, + { + "epoch": 3.4843156562104376, + "grad_norm": 2.984375, + "learning_rate": 3.975259547744998e-07, + "loss": 1.6809, + "step": 12385 + }, + { + "epoch": 3.4857223238148825, + "grad_norm": 4.28125, + "learning_rate": 3.953942806290533e-07, + "loss": 1.3621, + "step": 12390 + }, + { + "epoch": 3.4871289914193278, + "grad_norm": 4.4375, + "learning_rate": 3.9326804004744794e-07, + "loss": 1.4407, + "step": 12395 + }, + { + "epoch": 3.4885356590237726, + "grad_norm": 3.609375, + "learning_rate": 3.911472362347701e-07, + "loss": 1.7222, + "step": 12400 + }, + { + "epoch": 3.489942326628218, + "grad_norm": 4.375, + "learning_rate": 3.8903187238790514e-07, + "loss": 1.7223, + "step": 12405 + }, + { + "epoch": 3.4913489942326628, + "grad_norm": 3.4375, + "learning_rate": 3.869219516955442e-07, + "loss": 1.7887, + "step": 12410 + }, + { + "epoch": 3.492755661837108, + "grad_norm": 4.0625, + "learning_rate": 3.8481747733816984e-07, + "loss": 1.5084, + "step": 12415 + }, + { + "epoch": 3.494162329441553, + "grad_norm": 3.25, + "learning_rate": 3.827184524880542e-07, + "loss": 1.6298, + "step": 12420 + }, + { + "epoch": 3.495568997045998, + "grad_norm": 2.46875, + "learning_rate": 3.8062488030925887e-07, + "loss": 1.5018, + "step": 12425 + }, + { + "epoch": 3.496975664650443, + "grad_norm": 3.296875, + "learning_rate": 3.785367639576225e-07, + "loss": 1.8452, + "step": 12430 + }, + { + "epoch": 3.498382332254888, + "grad_norm": 3.453125, + "learning_rate": 3.764541065807609e-07, + "loss": 1.562, + "step": 12435 + }, + { + "epoch": 3.4997889998593332, + "grad_norm": 3.0625, + "learning_rate": 3.7437691131806083e-07, + "loss": 1.6698, + "step": 12440 + }, + { + "epoch": 3.5011956674637785, + "grad_norm": 4.34375, + "learning_rate": 3.723051813006752e-07, + "loss": 1.2999, + "step": 12445 + }, + { + "epoch": 3.5026023350682234, + "grad_norm": 3.265625, + "learning_rate": 3.7023891965151853e-07, + "loss": 1.5353, + "step": 12450 + }, + { + "epoch": 3.5040090026726682, + "grad_norm": 3.546875, + "learning_rate": 3.6817812948526506e-07, + "loss": 1.4993, + "step": 12455 + }, + { + "epoch": 3.5054156702771135, + "grad_norm": 3.15625, + "learning_rate": 3.66122813908337e-07, + "loss": 1.4794, + "step": 12460 + }, + { + "epoch": 3.506822337881559, + "grad_norm": 2.953125, + "learning_rate": 3.6407297601890763e-07, + "loss": 1.6521, + "step": 12465 + }, + { + "epoch": 3.5082290054860037, + "grad_norm": 4.625, + "learning_rate": 3.6202861890689105e-07, + "loss": 1.4697, + "step": 12470 + }, + { + "epoch": 3.5096356730904485, + "grad_norm": 2.765625, + "learning_rate": 3.599897456539409e-07, + "loss": 1.6966, + "step": 12475 + }, + { + "epoch": 3.511042340694894, + "grad_norm": 2.75, + "learning_rate": 3.5795635933344313e-07, + "loss": 1.6049, + "step": 12480 + }, + { + "epoch": 3.5124490082993387, + "grad_norm": 3.5625, + "learning_rate": 3.5592846301051525e-07, + "loss": 1.5986, + "step": 12485 + }, + { + "epoch": 3.513855675903784, + "grad_norm": 4.1875, + "learning_rate": 3.5390605974199697e-07, + "loss": 1.4704, + "step": 12490 + }, + { + "epoch": 3.515262343508229, + "grad_norm": 3.203125, + "learning_rate": 3.518891525764474e-07, + "loss": 1.5354, + "step": 12495 + }, + { + "epoch": 3.516669011112674, + "grad_norm": 3.40625, + "learning_rate": 3.4987774455414434e-07, + "loss": 1.3862, + "step": 12500 + }, + { + "epoch": 3.518075678717119, + "grad_norm": 3.53125, + "learning_rate": 3.478718387070705e-07, + "loss": 1.7165, + "step": 12505 + }, + { + "epoch": 3.5194823463215643, + "grad_norm": 5.125, + "learning_rate": 3.458714380589205e-07, + "loss": 1.5756, + "step": 12510 + }, + { + "epoch": 3.520889013926009, + "grad_norm": 4.6875, + "learning_rate": 3.438765456250867e-07, + "loss": 1.5997, + "step": 12515 + }, + { + "epoch": 3.5222956815304545, + "grad_norm": 3.21875, + "learning_rate": 3.418871644126593e-07, + "loss": 1.5862, + "step": 12520 + }, + { + "epoch": 3.5237023491348993, + "grad_norm": 2.890625, + "learning_rate": 3.399032974204212e-07, + "loss": 1.647, + "step": 12525 + }, + { + "epoch": 3.5251090167393446, + "grad_norm": 3.484375, + "learning_rate": 3.3792494763884527e-07, + "loss": 1.6057, + "step": 12530 + }, + { + "epoch": 3.5265156843437895, + "grad_norm": 2.8125, + "learning_rate": 3.3595211805008193e-07, + "loss": 1.5112, + "step": 12535 + }, + { + "epoch": 3.527922351948235, + "grad_norm": 4.03125, + "learning_rate": 3.339848116279671e-07, + "loss": 1.3215, + "step": 12540 + }, + { + "epoch": 3.5293290195526796, + "grad_norm": 3.8125, + "learning_rate": 3.3202303133800724e-07, + "loss": 1.5533, + "step": 12545 + }, + { + "epoch": 3.530735687157125, + "grad_norm": 3.109375, + "learning_rate": 3.300667801373791e-07, + "loss": 1.6153, + "step": 12550 + }, + { + "epoch": 3.53214235476157, + "grad_norm": 2.953125, + "learning_rate": 3.281160609749265e-07, + "loss": 1.5961, + "step": 12555 + }, + { + "epoch": 3.5335490223660146, + "grad_norm": 3.265625, + "learning_rate": 3.261708767911533e-07, + "loss": 1.4118, + "step": 12560 + }, + { + "epoch": 3.53495568997046, + "grad_norm": 2.5, + "learning_rate": 3.242312305182193e-07, + "loss": 1.4728, + "step": 12565 + }, + { + "epoch": 3.5363623575749052, + "grad_norm": 3.9375, + "learning_rate": 3.222971250799373e-07, + "loss": 1.6069, + "step": 12570 + }, + { + "epoch": 3.53776902517935, + "grad_norm": 2.40625, + "learning_rate": 3.2036856339176897e-07, + "loss": 1.6743, + "step": 12575 + }, + { + "epoch": 3.539175692783795, + "grad_norm": 2.75, + "learning_rate": 3.1844554836081596e-07, + "loss": 1.7418, + "step": 12580 + }, + { + "epoch": 3.5405823603882403, + "grad_norm": 3.71875, + "learning_rate": 3.16528082885823e-07, + "loss": 1.7357, + "step": 12585 + }, + { + "epoch": 3.5419890279926856, + "grad_norm": 3.640625, + "learning_rate": 3.1461616985716655e-07, + "loss": 1.7516, + "step": 12590 + }, + { + "epoch": 3.5433956955971304, + "grad_norm": 3.234375, + "learning_rate": 3.12709812156855e-07, + "loss": 1.7343, + "step": 12595 + }, + { + "epoch": 3.5448023632015753, + "grad_norm": 2.921875, + "learning_rate": 3.1080901265852034e-07, + "loss": 1.5757, + "step": 12600 + }, + { + "epoch": 3.5462090308060206, + "grad_norm": 2.96875, + "learning_rate": 3.0891377422742084e-07, + "loss": 1.693, + "step": 12605 + }, + { + "epoch": 3.547615698410466, + "grad_norm": 3.09375, + "learning_rate": 3.070240997204254e-07, + "loss": 1.5975, + "step": 12610 + }, + { + "epoch": 3.5490223660149107, + "grad_norm": 4.625, + "learning_rate": 3.051399919860222e-07, + "loss": 1.6121, + "step": 12615 + }, + { + "epoch": 3.5504290336193556, + "grad_norm": 4.09375, + "learning_rate": 3.0326145386430433e-07, + "loss": 1.4582, + "step": 12620 + }, + { + "epoch": 3.551835701223801, + "grad_norm": 4.25, + "learning_rate": 3.013884881869702e-07, + "loss": 1.5848, + "step": 12625 + }, + { + "epoch": 3.5532423688282457, + "grad_norm": 3.109375, + "learning_rate": 2.9952109777731947e-07, + "loss": 1.6932, + "step": 12630 + }, + { + "epoch": 3.554649036432691, + "grad_norm": 4.84375, + "learning_rate": 2.97659285450246e-07, + "loss": 1.7294, + "step": 12635 + }, + { + "epoch": 3.556055704037136, + "grad_norm": 2.796875, + "learning_rate": 2.958030540122358e-07, + "loss": 1.7316, + "step": 12640 + }, + { + "epoch": 3.557462371641581, + "grad_norm": 2.671875, + "learning_rate": 2.9395240626136276e-07, + "loss": 1.7024, + "step": 12645 + }, + { + "epoch": 3.558869039246026, + "grad_norm": 3.203125, + "learning_rate": 2.9210734498728375e-07, + "loss": 1.6179, + "step": 12650 + }, + { + "epoch": 3.5602757068504713, + "grad_norm": 3.546875, + "learning_rate": 2.902678729712336e-07, + "loss": 1.6973, + "step": 12655 + }, + { + "epoch": 3.561682374454916, + "grad_norm": 3.140625, + "learning_rate": 2.8843399298602446e-07, + "loss": 1.3544, + "step": 12660 + }, + { + "epoch": 3.5630890420593615, + "grad_norm": 3.5625, + "learning_rate": 2.866057077960353e-07, + "loss": 1.7027, + "step": 12665 + }, + { + "epoch": 3.5644957096638064, + "grad_norm": 3.09375, + "learning_rate": 2.847830201572159e-07, + "loss": 1.9461, + "step": 12670 + }, + { + "epoch": 3.5659023772682517, + "grad_norm": 4.03125, + "learning_rate": 2.8296593281707457e-07, + "loss": 1.4954, + "step": 12675 + }, + { + "epoch": 3.5673090448726965, + "grad_norm": 4.09375, + "learning_rate": 2.811544485146804e-07, + "loss": 1.6632, + "step": 12680 + }, + { + "epoch": 3.568715712477142, + "grad_norm": 3.65625, + "learning_rate": 2.793485699806535e-07, + "loss": 1.5274, + "step": 12685 + }, + { + "epoch": 3.5701223800815867, + "grad_norm": 4.46875, + "learning_rate": 2.7754829993716876e-07, + "loss": 1.5179, + "step": 12690 + }, + { + "epoch": 3.571529047686032, + "grad_norm": 2.703125, + "learning_rate": 2.757536410979404e-07, + "loss": 1.404, + "step": 12695 + }, + { + "epoch": 3.572935715290477, + "grad_norm": 4.21875, + "learning_rate": 2.7396459616822974e-07, + "loss": 1.4433, + "step": 12700 + }, + { + "epoch": 3.5743423828949217, + "grad_norm": 3.65625, + "learning_rate": 2.721811678448347e-07, + "loss": 1.7107, + "step": 12705 + }, + { + "epoch": 3.575749050499367, + "grad_norm": 3.96875, + "learning_rate": 2.70403358816083e-07, + "loss": 1.3703, + "step": 12710 + }, + { + "epoch": 3.5771557181038123, + "grad_norm": 4.21875, + "learning_rate": 2.6863117176183727e-07, + "loss": 1.4843, + "step": 12715 + }, + { + "epoch": 3.578562385708257, + "grad_norm": 3.140625, + "learning_rate": 2.6686460935348187e-07, + "loss": 1.2866, + "step": 12720 + }, + { + "epoch": 3.579969053312702, + "grad_norm": 3.296875, + "learning_rate": 2.651036742539241e-07, + "loss": 1.511, + "step": 12725 + }, + { + "epoch": 3.5813757209171473, + "grad_norm": 4.0625, + "learning_rate": 2.633483691175877e-07, + "loss": 1.5051, + "step": 12730 + }, + { + "epoch": 3.5827823885215926, + "grad_norm": 3.1875, + "learning_rate": 2.6159869659041176e-07, + "loss": 1.5787, + "step": 12735 + }, + { + "epoch": 3.5841890561260374, + "grad_norm": 5.59375, + "learning_rate": 2.5985465930984163e-07, + "loss": 1.4837, + "step": 12740 + }, + { + "epoch": 3.5855957237304823, + "grad_norm": 3.453125, + "learning_rate": 2.5811625990483164e-07, + "loss": 1.4975, + "step": 12745 + }, + { + "epoch": 3.5870023913349276, + "grad_norm": 3.703125, + "learning_rate": 2.563835009958355e-07, + "loss": 1.624, + "step": 12750 + }, + { + "epoch": 3.588409058939373, + "grad_norm": 3.015625, + "learning_rate": 2.546563851948047e-07, + "loss": 1.5078, + "step": 12755 + }, + { + "epoch": 3.5898157265438178, + "grad_norm": 3.34375, + "learning_rate": 2.5293491510518425e-07, + "loss": 1.5151, + "step": 12760 + }, + { + "epoch": 3.5912223941482626, + "grad_norm": 3.640625, + "learning_rate": 2.5121909332191047e-07, + "loss": 1.4745, + "step": 12765 + }, + { + "epoch": 3.592629061752708, + "grad_norm": 3.90625, + "learning_rate": 2.4950892243140285e-07, + "loss": 1.6577, + "step": 12770 + }, + { + "epoch": 3.5940357293571528, + "grad_norm": 2.203125, + "learning_rate": 2.478044050115646e-07, + "loss": 1.5741, + "step": 12775 + }, + { + "epoch": 3.595442396961598, + "grad_norm": 2.921875, + "learning_rate": 2.4610554363177647e-07, + "loss": 1.4498, + "step": 12780 + }, + { + "epoch": 3.596849064566043, + "grad_norm": 4.21875, + "learning_rate": 2.4441234085289264e-07, + "loss": 1.7303, + "step": 12785 + }, + { + "epoch": 3.598255732170488, + "grad_norm": 3.40625, + "learning_rate": 2.4272479922723897e-07, + "loss": 1.6942, + "step": 12790 + }, + { + "epoch": 3.599662399774933, + "grad_norm": 3.0, + "learning_rate": 2.410429212986065e-07, + "loss": 1.4784, + "step": 12795 + }, + { + "epoch": 3.6010690673793784, + "grad_norm": 3.03125, + "learning_rate": 2.3936670960224935e-07, + "loss": 1.6887, + "step": 12800 + }, + { + "epoch": 3.6024757349838232, + "grad_norm": 3.671875, + "learning_rate": 2.3769616666488024e-07, + "loss": 1.6325, + "step": 12805 + }, + { + "epoch": 3.6038824025882685, + "grad_norm": 2.828125, + "learning_rate": 2.36031295004667e-07, + "loss": 1.6284, + "step": 12810 + }, + { + "epoch": 3.6052890701927134, + "grad_norm": 4.4375, + "learning_rate": 2.3437209713122707e-07, + "loss": 1.458, + "step": 12815 + }, + { + "epoch": 3.6066957377971587, + "grad_norm": 4.46875, + "learning_rate": 2.3271857554562914e-07, + "loss": 1.4742, + "step": 12820 + }, + { + "epoch": 3.6081024054016035, + "grad_norm": 3.46875, + "learning_rate": 2.3107073274038157e-07, + "loss": 1.5134, + "step": 12825 + }, + { + "epoch": 3.6095090730060484, + "grad_norm": 3.375, + "learning_rate": 2.2942857119943392e-07, + "loss": 1.6525, + "step": 12830 + }, + { + "epoch": 3.6109157406104937, + "grad_norm": 2.71875, + "learning_rate": 2.277920933981723e-07, + "loss": 1.8801, + "step": 12835 + }, + { + "epoch": 3.612322408214939, + "grad_norm": 3.84375, + "learning_rate": 2.2616130180341408e-07, + "loss": 1.4974, + "step": 12840 + }, + { + "epoch": 3.613729075819384, + "grad_norm": 2.3125, + "learning_rate": 2.245361988734076e-07, + "loss": 1.6507, + "step": 12845 + }, + { + "epoch": 3.6151357434238287, + "grad_norm": 3.15625, + "learning_rate": 2.2291678705782303e-07, + "loss": 1.811, + "step": 12850 + }, + { + "epoch": 3.616542411028274, + "grad_norm": 3.859375, + "learning_rate": 2.2130306879775396e-07, + "loss": 1.6014, + "step": 12855 + }, + { + "epoch": 3.6179490786327193, + "grad_norm": 3.28125, + "learning_rate": 2.1969504652571014e-07, + "loss": 1.7031, + "step": 12860 + }, + { + "epoch": 3.619355746237164, + "grad_norm": 2.625, + "learning_rate": 2.1809272266561796e-07, + "loss": 1.7339, + "step": 12865 + }, + { + "epoch": 3.620762413841609, + "grad_norm": 4.34375, + "learning_rate": 2.1649609963280892e-07, + "loss": 1.6556, + "step": 12870 + }, + { + "epoch": 3.6221690814460543, + "grad_norm": 2.390625, + "learning_rate": 2.1490517983402667e-07, + "loss": 1.6822, + "step": 12875 + }, + { + "epoch": 3.6235757490504996, + "grad_norm": 3.640625, + "learning_rate": 2.1331996566741473e-07, + "loss": 1.4878, + "step": 12880 + }, + { + "epoch": 3.6249824166549445, + "grad_norm": 2.71875, + "learning_rate": 2.1174045952251674e-07, + "loss": 1.7963, + "step": 12885 + }, + { + "epoch": 3.6263890842593893, + "grad_norm": 2.796875, + "learning_rate": 2.1016666378027127e-07, + "loss": 1.6533, + "step": 12890 + }, + { + "epoch": 3.6277957518638346, + "grad_norm": 3.734375, + "learning_rate": 2.085985808130113e-07, + "loss": 1.6496, + "step": 12895 + }, + { + "epoch": 3.6292024194682795, + "grad_norm": 3.5625, + "learning_rate": 2.070362129844554e-07, + "loss": 1.4314, + "step": 12900 + }, + { + "epoch": 3.630609087072725, + "grad_norm": 3.984375, + "learning_rate": 2.0547956264970946e-07, + "loss": 1.3795, + "step": 12905 + }, + { + "epoch": 3.6320157546771696, + "grad_norm": 3.140625, + "learning_rate": 2.0392863215525957e-07, + "loss": 1.4565, + "step": 12910 + }, + { + "epoch": 3.633422422281615, + "grad_norm": 3.15625, + "learning_rate": 2.0238342383897032e-07, + "loss": 1.4031, + "step": 12915 + }, + { + "epoch": 3.63482908988606, + "grad_norm": 4.09375, + "learning_rate": 2.0084394003008165e-07, + "loss": 1.5884, + "step": 12920 + }, + { + "epoch": 3.636235757490505, + "grad_norm": 4.0625, + "learning_rate": 1.9931018304920256e-07, + "loss": 1.7368, + "step": 12925 + }, + { + "epoch": 3.63764242509495, + "grad_norm": 2.9375, + "learning_rate": 1.9778215520831076e-07, + "loss": 1.5598, + "step": 12930 + }, + { + "epoch": 3.6390490926993952, + "grad_norm": 3.46875, + "learning_rate": 1.9625985881074603e-07, + "loss": 1.4654, + "step": 12935 + }, + { + "epoch": 3.64045576030384, + "grad_norm": 2.96875, + "learning_rate": 1.9474329615121232e-07, + "loss": 1.6227, + "step": 12940 + }, + { + "epoch": 3.6418624279082854, + "grad_norm": 3.5625, + "learning_rate": 1.9323246951576633e-07, + "loss": 1.4179, + "step": 12945 + }, + { + "epoch": 3.6432690955127303, + "grad_norm": 4.0625, + "learning_rate": 1.9172738118182098e-07, + "loss": 1.417, + "step": 12950 + }, + { + "epoch": 3.6446757631171756, + "grad_norm": 3.359375, + "learning_rate": 1.902280334181392e-07, + "loss": 1.65, + "step": 12955 + }, + { + "epoch": 3.6460824307216204, + "grad_norm": 3.21875, + "learning_rate": 1.8873442848482868e-07, + "loss": 1.5986, + "step": 12960 + }, + { + "epoch": 3.6474890983260657, + "grad_norm": 3.65625, + "learning_rate": 1.872465686333422e-07, + "loss": 1.6104, + "step": 12965 + }, + { + "epoch": 3.6488957659305106, + "grad_norm": 3.703125, + "learning_rate": 1.857644561064733e-07, + "loss": 1.5107, + "step": 12970 + }, + { + "epoch": 3.6503024335349554, + "grad_norm": 3.453125, + "learning_rate": 1.842880931383477e-07, + "loss": 1.5454, + "step": 12975 + }, + { + "epoch": 3.6517091011394007, + "grad_norm": 3.28125, + "learning_rate": 1.8281748195443015e-07, + "loss": 1.416, + "step": 12980 + }, + { + "epoch": 3.653115768743846, + "grad_norm": 3.46875, + "learning_rate": 1.8135262477151092e-07, + "loss": 1.6158, + "step": 12985 + }, + { + "epoch": 3.654522436348291, + "grad_norm": 3.046875, + "learning_rate": 1.7989352379770773e-07, + "loss": 1.5509, + "step": 12990 + }, + { + "epoch": 3.6559291039527357, + "grad_norm": 3.65625, + "learning_rate": 1.7844018123246295e-07, + "loss": 1.493, + "step": 12995 + }, + { + "epoch": 3.657335771557181, + "grad_norm": 2.734375, + "learning_rate": 1.7699259926653665e-07, + "loss": 1.7625, + "step": 13000 + }, + { + "epoch": 3.6587424391616263, + "grad_norm": 4.96875, + "learning_rate": 1.7555078008200685e-07, + "loss": 1.3462, + "step": 13005 + }, + { + "epoch": 3.660149106766071, + "grad_norm": 2.90625, + "learning_rate": 1.741147258522635e-07, + "loss": 1.5288, + "step": 13010 + }, + { + "epoch": 3.661555774370516, + "grad_norm": 3.21875, + "learning_rate": 1.7268443874200834e-07, + "loss": 1.7067, + "step": 13015 + }, + { + "epoch": 3.6629624419749613, + "grad_norm": 2.75, + "learning_rate": 1.712599209072474e-07, + "loss": 1.7385, + "step": 13020 + }, + { + "epoch": 3.6643691095794066, + "grad_norm": 3.984375, + "learning_rate": 1.6984117449529324e-07, + "loss": 1.5687, + "step": 13025 + }, + { + "epoch": 3.6657757771838515, + "grad_norm": 4.03125, + "learning_rate": 1.684282016447547e-07, + "loss": 1.7137, + "step": 13030 + }, + { + "epoch": 3.6671824447882964, + "grad_norm": 3.484375, + "learning_rate": 1.670210044855409e-07, + "loss": 1.5094, + "step": 13035 + }, + { + "epoch": 3.6685891123927417, + "grad_norm": 2.4375, + "learning_rate": 1.6561958513885332e-07, + "loss": 1.7145, + "step": 13040 + }, + { + "epoch": 3.6699957799971865, + "grad_norm": 3.828125, + "learning_rate": 1.6422394571718435e-07, + "loss": 1.4869, + "step": 13045 + }, + { + "epoch": 3.671402447601632, + "grad_norm": 2.984375, + "learning_rate": 1.628340883243129e-07, + "loss": 1.6588, + "step": 13050 + }, + { + "epoch": 3.6728091152060767, + "grad_norm": 3.140625, + "learning_rate": 1.6145001505530353e-07, + "loss": 1.6234, + "step": 13055 + }, + { + "epoch": 3.674215782810522, + "grad_norm": 3.03125, + "learning_rate": 1.6007172799650027e-07, + "loss": 1.4941, + "step": 13060 + }, + { + "epoch": 3.675622450414967, + "grad_norm": 4.6875, + "learning_rate": 1.5869922922552649e-07, + "loss": 1.6677, + "step": 13065 + }, + { + "epoch": 3.677029118019412, + "grad_norm": 3.15625, + "learning_rate": 1.573325208112801e-07, + "loss": 1.5113, + "step": 13070 + }, + { + "epoch": 3.678435785623857, + "grad_norm": 3.09375, + "learning_rate": 1.5597160481392834e-07, + "loss": 1.4624, + "step": 13075 + }, + { + "epoch": 3.6798424532283023, + "grad_norm": 2.96875, + "learning_rate": 1.5461648328491106e-07, + "loss": 1.333, + "step": 13080 + }, + { + "epoch": 3.681249120832747, + "grad_norm": 3.8125, + "learning_rate": 1.5326715826693027e-07, + "loss": 1.5943, + "step": 13085 + }, + { + "epoch": 3.6826557884371924, + "grad_norm": 3.65625, + "learning_rate": 1.519236317939514e-07, + "loss": 1.5913, + "step": 13090 + }, + { + "epoch": 3.6840624560416373, + "grad_norm": 4.03125, + "learning_rate": 1.5058590589119936e-07, + "loss": 1.59, + "step": 13095 + }, + { + "epoch": 3.6854691236460826, + "grad_norm": 3.359375, + "learning_rate": 1.492539825751562e-07, + "loss": 1.6069, + "step": 13100 + }, + { + "epoch": 3.6868757912505274, + "grad_norm": 3.25, + "learning_rate": 1.4792786385355415e-07, + "loss": 1.8708, + "step": 13105 + }, + { + "epoch": 3.6882824588549727, + "grad_norm": 3.21875, + "learning_rate": 1.4660755172537953e-07, + "loss": 1.4281, + "step": 13110 + }, + { + "epoch": 3.6896891264594176, + "grad_norm": 2.484375, + "learning_rate": 1.4529304818086297e-07, + "loss": 1.8313, + "step": 13115 + }, + { + "epoch": 3.6910957940638625, + "grad_norm": 3.375, + "learning_rate": 1.4398435520147988e-07, + "loss": 1.4211, + "step": 13120 + }, + { + "epoch": 3.6925024616683078, + "grad_norm": 4.03125, + "learning_rate": 1.426814747599483e-07, + "loss": 1.5624, + "step": 13125 + }, + { + "epoch": 3.693909129272753, + "grad_norm": 3.578125, + "learning_rate": 1.4138440882022297e-07, + "loss": 1.5928, + "step": 13130 + }, + { + "epoch": 3.695315796877198, + "grad_norm": 2.9375, + "learning_rate": 1.4009315933749411e-07, + "loss": 1.6827, + "step": 13135 + }, + { + "epoch": 3.6967224644816428, + "grad_norm": 3.109375, + "learning_rate": 1.388077282581852e-07, + "loss": 1.6348, + "step": 13140 + }, + { + "epoch": 3.698129132086088, + "grad_norm": 3.71875, + "learning_rate": 1.375281175199472e-07, + "loss": 1.5618, + "step": 13145 + }, + { + "epoch": 3.6995357996905334, + "grad_norm": 3.671875, + "learning_rate": 1.362543290516589e-07, + "loss": 1.621, + "step": 13150 + }, + { + "epoch": 3.700942467294978, + "grad_norm": 3.78125, + "learning_rate": 1.3498636477342307e-07, + "loss": 1.6355, + "step": 13155 + }, + { + "epoch": 3.702349134899423, + "grad_norm": 3.375, + "learning_rate": 1.337242265965619e-07, + "loss": 1.584, + "step": 13160 + }, + { + "epoch": 3.7037558025038684, + "grad_norm": 3.640625, + "learning_rate": 1.3246791642361622e-07, + "loss": 1.5461, + "step": 13165 + }, + { + "epoch": 3.7051624701083137, + "grad_norm": 3.90625, + "learning_rate": 1.3121743614834135e-07, + "loss": 1.7194, + "step": 13170 + }, + { + "epoch": 3.7065691377127585, + "grad_norm": 4.59375, + "learning_rate": 1.2997278765570463e-07, + "loss": 1.1595, + "step": 13175 + }, + { + "epoch": 3.7079758053172034, + "grad_norm": 2.9375, + "learning_rate": 1.2873397282188215e-07, + "loss": 1.7544, + "step": 13180 + }, + { + "epoch": 3.7093824729216487, + "grad_norm": 2.15625, + "learning_rate": 1.2750099351425792e-07, + "loss": 1.6543, + "step": 13185 + }, + { + "epoch": 3.7107891405260935, + "grad_norm": 6.375, + "learning_rate": 1.2627385159141812e-07, + "loss": 1.4372, + "step": 13190 + }, + { + "epoch": 3.712195808130539, + "grad_norm": 3.109375, + "learning_rate": 1.250525489031493e-07, + "loss": 1.525, + "step": 13195 + }, + { + "epoch": 3.7136024757349837, + "grad_norm": 3.78125, + "learning_rate": 1.2383708729043886e-07, + "loss": 1.7147, + "step": 13200 + }, + { + "epoch": 3.715009143339429, + "grad_norm": 3.75, + "learning_rate": 1.2262746858546468e-07, + "loss": 1.4541, + "step": 13205 + }, + { + "epoch": 3.716415810943874, + "grad_norm": 2.9375, + "learning_rate": 1.214236946116012e-07, + "loss": 1.696, + "step": 13210 + }, + { + "epoch": 3.717822478548319, + "grad_norm": 3.1875, + "learning_rate": 1.2022576718341104e-07, + "loss": 1.614, + "step": 13215 + }, + { + "epoch": 3.719229146152764, + "grad_norm": 3.03125, + "learning_rate": 1.1903368810664315e-07, + "loss": 1.7279, + "step": 13220 + }, + { + "epoch": 3.7206358137572093, + "grad_norm": 6.03125, + "learning_rate": 1.1784745917823169e-07, + "loss": 1.4992, + "step": 13225 + }, + { + "epoch": 3.722042481361654, + "grad_norm": 2.515625, + "learning_rate": 1.1666708218629206e-07, + "loss": 1.4643, + "step": 13230 + }, + { + "epoch": 3.7234491489660995, + "grad_norm": 3.375, + "learning_rate": 1.1549255891011788e-07, + "loss": 1.3531, + "step": 13235 + }, + { + "epoch": 3.7248558165705443, + "grad_norm": 2.984375, + "learning_rate": 1.1432389112017959e-07, + "loss": 1.5313, + "step": 13240 + }, + { + "epoch": 3.726262484174989, + "grad_norm": 3.03125, + "learning_rate": 1.1316108057812135e-07, + "loss": 1.5671, + "step": 13245 + }, + { + "epoch": 3.7276691517794345, + "grad_norm": 3.34375, + "learning_rate": 1.1200412903675749e-07, + "loss": 1.5695, + "step": 13250 + }, + { + "epoch": 3.7290758193838798, + "grad_norm": 3.703125, + "learning_rate": 1.1085303824006986e-07, + "loss": 1.505, + "step": 13255 + }, + { + "epoch": 3.7304824869883246, + "grad_norm": 4.15625, + "learning_rate": 1.0970780992320871e-07, + "loss": 1.4891, + "step": 13260 + }, + { + "epoch": 3.7318891545927695, + "grad_norm": 3.390625, + "learning_rate": 1.0856844581248292e-07, + "loss": 1.6268, + "step": 13265 + }, + { + "epoch": 3.733295822197215, + "grad_norm": 3.78125, + "learning_rate": 1.0743494762536486e-07, + "loss": 1.4452, + "step": 13270 + }, + { + "epoch": 3.73470248980166, + "grad_norm": 3.75, + "learning_rate": 1.0630731707048513e-07, + "loss": 1.5661, + "step": 13275 + }, + { + "epoch": 3.736109157406105, + "grad_norm": 3.5625, + "learning_rate": 1.0518555584762578e-07, + "loss": 1.5987, + "step": 13280 + }, + { + "epoch": 3.73751582501055, + "grad_norm": 3.40625, + "learning_rate": 1.0406966564772578e-07, + "loss": 1.6404, + "step": 13285 + }, + { + "epoch": 3.738922492614995, + "grad_norm": 3.953125, + "learning_rate": 1.0295964815287117e-07, + "loss": 1.5056, + "step": 13290 + }, + { + "epoch": 3.7403291602194404, + "grad_norm": 3.140625, + "learning_rate": 1.0185550503629725e-07, + "loss": 1.595, + "step": 13295 + }, + { + "epoch": 3.7417358278238853, + "grad_norm": 3.171875, + "learning_rate": 1.0075723796238244e-07, + "loss": 1.3048, + "step": 13300 + }, + { + "epoch": 3.74314249542833, + "grad_norm": 3.703125, + "learning_rate": 9.966484858665003e-08, + "loss": 1.5691, + "step": 13305 + }, + { + "epoch": 3.7445491630327754, + "grad_norm": 3.0625, + "learning_rate": 9.857833855576103e-08, + "loss": 1.504, + "step": 13310 + }, + { + "epoch": 3.7459558306372203, + "grad_norm": 3.203125, + "learning_rate": 9.749770950751601e-08, + "loss": 1.6968, + "step": 13315 + }, + { + "epoch": 3.7473624982416656, + "grad_norm": 3.140625, + "learning_rate": 9.642296307084885e-08, + "loss": 1.7236, + "step": 13320 + }, + { + "epoch": 3.7487691658461104, + "grad_norm": 3.609375, + "learning_rate": 9.535410086582718e-08, + "loss": 1.3874, + "step": 13325 + }, + { + "epoch": 3.7501758334505557, + "grad_norm": 4.78125, + "learning_rate": 9.429112450364707e-08, + "loss": 1.319, + "step": 13330 + }, + { + "epoch": 3.7515825010550006, + "grad_norm": 3.0, + "learning_rate": 9.323403558663523e-08, + "loss": 1.6897, + "step": 13335 + }, + { + "epoch": 3.752989168659446, + "grad_norm": 3.703125, + "learning_rate": 9.218283570824149e-08, + "loss": 1.5377, + "step": 13340 + }, + { + "epoch": 3.7543958362638907, + "grad_norm": 3.359375, + "learning_rate": 9.113752645303829e-08, + "loss": 1.4597, + "step": 13345 + }, + { + "epoch": 3.755802503868336, + "grad_norm": 3.234375, + "learning_rate": 9.009810939671991e-08, + "loss": 1.3243, + "step": 13350 + }, + { + "epoch": 3.757209171472781, + "grad_norm": 3.75, + "learning_rate": 8.906458610609791e-08, + "loss": 1.6396, + "step": 13355 + }, + { + "epoch": 3.758615839077226, + "grad_norm": 3.53125, + "learning_rate": 8.803695813910072e-08, + "loss": 1.7091, + "step": 13360 + }, + { + "epoch": 3.760022506681671, + "grad_norm": 4.03125, + "learning_rate": 8.701522704476838e-08, + "loss": 1.6138, + "step": 13365 + }, + { + "epoch": 3.7614291742861163, + "grad_norm": 3.328125, + "learning_rate": 8.599939436325376e-08, + "loss": 1.5434, + "step": 13370 + }, + { + "epoch": 3.762835841890561, + "grad_norm": 3.140625, + "learning_rate": 8.498946162581732e-08, + "loss": 1.6085, + "step": 13375 + }, + { + "epoch": 3.7642425094950065, + "grad_norm": 3.75, + "learning_rate": 8.39854303548262e-08, + "loss": 1.4269, + "step": 13380 + }, + { + "epoch": 3.7656491770994514, + "grad_norm": 4.40625, + "learning_rate": 8.298730206375237e-08, + "loss": 1.65, + "step": 13385 + }, + { + "epoch": 3.767055844703896, + "grad_norm": 3.046875, + "learning_rate": 8.199507825716923e-08, + "loss": 1.6611, + "step": 13390 + }, + { + "epoch": 3.7684625123083415, + "grad_norm": 3.421875, + "learning_rate": 8.100876043074878e-08, + "loss": 1.6301, + "step": 13395 + }, + { + "epoch": 3.769869179912787, + "grad_norm": 2.640625, + "learning_rate": 8.002835007126263e-08, + "loss": 1.7159, + "step": 13400 + }, + { + "epoch": 3.7712758475172317, + "grad_norm": 4.03125, + "learning_rate": 7.905384865657572e-08, + "loss": 1.6362, + "step": 13405 + }, + { + "epoch": 3.7726825151216765, + "grad_norm": 3.46875, + "learning_rate": 7.808525765564634e-08, + "loss": 1.5149, + "step": 13410 + }, + { + "epoch": 3.774089182726122, + "grad_norm": 2.90625, + "learning_rate": 7.712257852852344e-08, + "loss": 1.7875, + "step": 13415 + }, + { + "epoch": 3.775495850330567, + "grad_norm": 3.015625, + "learning_rate": 7.616581272634493e-08, + "loss": 1.5831, + "step": 13420 + }, + { + "epoch": 3.776902517935012, + "grad_norm": 3.296875, + "learning_rate": 7.521496169133445e-08, + "loss": 1.75, + "step": 13425 + }, + { + "epoch": 3.778309185539457, + "grad_norm": 2.53125, + "learning_rate": 7.427002685679884e-08, + "loss": 1.7126, + "step": 13430 + }, + { + "epoch": 3.779715853143902, + "grad_norm": 3.140625, + "learning_rate": 7.33310096471298e-08, + "loss": 1.453, + "step": 13435 + }, + { + "epoch": 3.7811225207483474, + "grad_norm": 5.15625, + "learning_rate": 7.23979114777955e-08, + "loss": 1.6371, + "step": 13440 + }, + { + "epoch": 3.7825291883527923, + "grad_norm": 3.03125, + "learning_rate": 7.147073375534374e-08, + "loss": 1.7193, + "step": 13445 + }, + { + "epoch": 3.783935855957237, + "grad_norm": 2.9375, + "learning_rate": 7.054947787739785e-08, + "loss": 1.4624, + "step": 13450 + }, + { + "epoch": 3.7853425235616824, + "grad_norm": 3.21875, + "learning_rate": 6.963414523265321e-08, + "loss": 1.6463, + "step": 13455 + }, + { + "epoch": 3.7867491911661273, + "grad_norm": 3.5625, + "learning_rate": 6.872473720087768e-08, + "loss": 1.6806, + "step": 13460 + }, + { + "epoch": 3.7881558587705726, + "grad_norm": 2.953125, + "learning_rate": 6.782125515290937e-08, + "loss": 1.5029, + "step": 13465 + }, + { + "epoch": 3.7895625263750174, + "grad_norm": 2.71875, + "learning_rate": 6.692370045065043e-08, + "loss": 1.4976, + "step": 13470 + }, + { + "epoch": 3.7909691939794627, + "grad_norm": 3.0, + "learning_rate": 6.603207444707149e-08, + "loss": 1.4385, + "step": 13475 + }, + { + "epoch": 3.7923758615839076, + "grad_norm": 3.09375, + "learning_rate": 6.514637848620497e-08, + "loss": 1.4465, + "step": 13480 + }, + { + "epoch": 3.793782529188353, + "grad_norm": 3.0625, + "learning_rate": 6.426661390314336e-08, + "loss": 1.6002, + "step": 13485 + }, + { + "epoch": 3.7951891967927978, + "grad_norm": 5.125, + "learning_rate": 6.339278202404009e-08, + "loss": 1.43, + "step": 13490 + }, + { + "epoch": 3.796595864397243, + "grad_norm": 3.453125, + "learning_rate": 6.252488416610458e-08, + "loss": 1.6215, + "step": 13495 + }, + { + "epoch": 3.798002532001688, + "grad_norm": 3.25, + "learning_rate": 6.166292163760145e-08, + "loss": 1.7022, + "step": 13500 + }, + { + "epoch": 3.799409199606133, + "grad_norm": 2.96875, + "learning_rate": 6.080689573784826e-08, + "loss": 1.4709, + "step": 13505 + }, + { + "epoch": 3.800815867210578, + "grad_norm": 2.828125, + "learning_rate": 5.995680775721457e-08, + "loss": 1.5759, + "step": 13510 + }, + { + "epoch": 3.8022225348150234, + "grad_norm": 3.0625, + "learning_rate": 5.911265897711759e-08, + "loss": 1.6665, + "step": 13515 + }, + { + "epoch": 3.8036292024194682, + "grad_norm": 4.5, + "learning_rate": 5.82744506700239e-08, + "loss": 1.7009, + "step": 13520 + }, + { + "epoch": 3.8050358700239135, + "grad_norm": 2.953125, + "learning_rate": 5.744218409944412e-08, + "loss": 1.5255, + "step": 13525 + }, + { + "epoch": 3.8064425376283584, + "grad_norm": 3.09375, + "learning_rate": 5.6615860519932054e-08, + "loss": 1.6446, + "step": 13530 + }, + { + "epoch": 3.8078492052328032, + "grad_norm": 4.53125, + "learning_rate": 5.5795481177083324e-08, + "loss": 1.5484, + "step": 13535 + }, + { + "epoch": 3.8092558728372485, + "grad_norm": 3.46875, + "learning_rate": 5.49810473075345e-08, + "loss": 1.6404, + "step": 13540 + }, + { + "epoch": 3.810662540441694, + "grad_norm": 5.59375, + "learning_rate": 5.417256013895777e-08, + "loss": 1.5426, + "step": 13545 + }, + { + "epoch": 3.8120692080461387, + "grad_norm": 3.234375, + "learning_rate": 5.337002089006315e-08, + "loss": 1.681, + "step": 13550 + }, + { + "epoch": 3.8134758756505835, + "grad_norm": 3.609375, + "learning_rate": 5.2573430770594505e-08, + "loss": 1.7098, + "step": 13555 + }, + { + "epoch": 3.814882543255029, + "grad_norm": 7.40625, + "learning_rate": 5.178279098132643e-08, + "loss": 1.7041, + "step": 13560 + }, + { + "epoch": 3.816289210859474, + "grad_norm": 3.65625, + "learning_rate": 5.099810271406646e-08, + "loss": 1.6355, + "step": 13565 + }, + { + "epoch": 3.817695878463919, + "grad_norm": 4.09375, + "learning_rate": 5.021936715164843e-08, + "loss": 1.6715, + "step": 13570 + }, + { + "epoch": 3.819102546068364, + "grad_norm": 6.0, + "learning_rate": 4.9446585467935566e-08, + "loss": 1.4218, + "step": 13575 + }, + { + "epoch": 3.820509213672809, + "grad_norm": 3.171875, + "learning_rate": 4.8679758827813835e-08, + "loss": 1.7012, + "step": 13580 + }, + { + "epoch": 3.8219158812772545, + "grad_norm": 3.0625, + "learning_rate": 4.791888838719416e-08, + "loss": 1.4577, + "step": 13585 + }, + { + "epoch": 3.8233225488816993, + "grad_norm": 3.046875, + "learning_rate": 4.7163975293008416e-08, + "loss": 1.6121, + "step": 13590 + }, + { + "epoch": 3.824729216486144, + "grad_norm": 3.171875, + "learning_rate": 4.641502068320946e-08, + "loss": 1.3883, + "step": 13595 + }, + { + "epoch": 3.8261358840905895, + "grad_norm": 2.9375, + "learning_rate": 4.567202568676665e-08, + "loss": 1.6198, + "step": 13600 + }, + { + "epoch": 3.8275425516950343, + "grad_norm": 3.484375, + "learning_rate": 4.493499142366719e-08, + "loss": 1.8679, + "step": 13605 + }, + { + "epoch": 3.8289492192994796, + "grad_norm": 3.21875, + "learning_rate": 4.4203919004912606e-08, + "loss": 1.5141, + "step": 13610 + }, + { + "epoch": 3.8303558869039245, + "grad_norm": 2.65625, + "learning_rate": 4.347880953251737e-08, + "loss": 1.5901, + "step": 13615 + }, + { + "epoch": 3.83176255450837, + "grad_norm": 3.125, + "learning_rate": 4.275966409950804e-08, + "loss": 1.5317, + "step": 13620 + }, + { + "epoch": 3.8331692221128146, + "grad_norm": 3.625, + "learning_rate": 4.2046483789920596e-08, + "loss": 1.4834, + "step": 13625 + }, + { + "epoch": 3.83457588971726, + "grad_norm": 5.25, + "learning_rate": 4.1339269678799525e-08, + "loss": 1.6823, + "step": 13630 + }, + { + "epoch": 3.835982557321705, + "grad_norm": 3.421875, + "learning_rate": 4.0638022832195197e-08, + "loss": 1.7195, + "step": 13635 + }, + { + "epoch": 3.83738922492615, + "grad_norm": 2.5625, + "learning_rate": 3.994274430716427e-08, + "loss": 1.8747, + "step": 13640 + }, + { + "epoch": 3.838795892530595, + "grad_norm": 2.296875, + "learning_rate": 3.925343515176482e-08, + "loss": 1.5168, + "step": 13645 + }, + { + "epoch": 3.8402025601350402, + "grad_norm": 3.3125, + "learning_rate": 3.857009640505859e-08, + "loss": 1.7263, + "step": 13650 + }, + { + "epoch": 3.841609227739485, + "grad_norm": 3.421875, + "learning_rate": 3.7892729097106944e-08, + "loss": 1.3747, + "step": 13655 + }, + { + "epoch": 3.84301589534393, + "grad_norm": 3.9375, + "learning_rate": 3.722133424896956e-08, + "loss": 1.4349, + "step": 13660 + }, + { + "epoch": 3.8444225629483753, + "grad_norm": 4.8125, + "learning_rate": 3.655591287270354e-08, + "loss": 1.7337, + "step": 13665 + }, + { + "epoch": 3.8458292305528206, + "grad_norm": 2.609375, + "learning_rate": 3.589646597136209e-08, + "loss": 1.647, + "step": 13670 + }, + { + "epoch": 3.8472358981572654, + "grad_norm": 3.5625, + "learning_rate": 3.524299453899093e-08, + "loss": 1.7049, + "step": 13675 + }, + { + "epoch": 3.8486425657617103, + "grad_norm": 3.015625, + "learning_rate": 3.459549956063013e-08, + "loss": 1.4334, + "step": 13680 + }, + { + "epoch": 3.8500492333661556, + "grad_norm": 2.734375, + "learning_rate": 3.395398201231048e-08, + "loss": 1.685, + "step": 13685 + }, + { + "epoch": 3.851455900970601, + "grad_norm": 3.46875, + "learning_rate": 3.331844286105179e-08, + "loss": 1.6998, + "step": 13690 + }, + { + "epoch": 3.8528625685750457, + "grad_norm": 3.0, + "learning_rate": 3.268888306486284e-08, + "loss": 1.5654, + "step": 13695 + }, + { + "epoch": 3.8542692361794906, + "grad_norm": 4.46875, + "learning_rate": 3.206530357273829e-08, + "loss": 1.623, + "step": 13700 + }, + { + "epoch": 3.855675903783936, + "grad_norm": 2.453125, + "learning_rate": 3.1447705324659126e-08, + "loss": 1.7226, + "step": 13705 + }, + { + "epoch": 3.857082571388381, + "grad_norm": 4.34375, + "learning_rate": 3.0836089251589535e-08, + "loss": 1.5045, + "step": 13710 + }, + { + "epoch": 3.858489238992826, + "grad_norm": 2.3125, + "learning_rate": 3.0230456275476045e-08, + "loss": 1.6649, + "step": 13715 + }, + { + "epoch": 3.859895906597271, + "grad_norm": 3.5, + "learning_rate": 2.963080730924705e-08, + "loss": 1.6491, + "step": 13720 + }, + { + "epoch": 3.861302574201716, + "grad_norm": 2.65625, + "learning_rate": 2.903714325681017e-08, + "loss": 1.5911, + "step": 13725 + }, + { + "epoch": 3.862709241806161, + "grad_norm": 3.078125, + "learning_rate": 2.8449465013051343e-08, + "loss": 1.5251, + "step": 13730 + }, + { + "epoch": 3.8641159094106063, + "grad_norm": 2.96875, + "learning_rate": 2.7867773463833954e-08, + "loss": 1.6151, + "step": 13735 + }, + { + "epoch": 3.865522577015051, + "grad_norm": 2.609375, + "learning_rate": 2.7292069485996604e-08, + "loss": 1.7493, + "step": 13740 + }, + { + "epoch": 3.8669292446194965, + "grad_norm": 4.75, + "learning_rate": 2.6722353947352227e-08, + "loss": 1.5258, + "step": 13745 + }, + { + "epoch": 3.8683359122239414, + "grad_norm": 3.21875, + "learning_rate": 2.615862770668764e-08, + "loss": 1.4538, + "step": 13750 + }, + { + "epoch": 3.8697425798283867, + "grad_norm": 3.71875, + "learning_rate": 2.5600891613760445e-08, + "loss": 1.5134, + "step": 13755 + }, + { + "epoch": 3.8711492474328315, + "grad_norm": 3.859375, + "learning_rate": 2.5049146509299012e-08, + "loss": 1.4722, + "step": 13760 + }, + { + "epoch": 3.872555915037277, + "grad_norm": 3.0625, + "learning_rate": 2.450339322500161e-08, + "loss": 1.608, + "step": 13765 + }, + { + "epoch": 3.8739625826417217, + "grad_norm": 2.6875, + "learning_rate": 2.3963632583533733e-08, + "loss": 1.6941, + "step": 13770 + }, + { + "epoch": 3.875369250246167, + "grad_norm": 2.40625, + "learning_rate": 2.342986539852676e-08, + "loss": 1.4776, + "step": 13775 + }, + { + "epoch": 3.876775917850612, + "grad_norm": 4.0, + "learning_rate": 2.2902092474579747e-08, + "loss": 1.5991, + "step": 13780 + }, + { + "epoch": 3.878182585455057, + "grad_norm": 3.296875, + "learning_rate": 2.2380314607254536e-08, + "loss": 1.542, + "step": 13785 + }, + { + "epoch": 3.879589253059502, + "grad_norm": 3.4375, + "learning_rate": 2.186453258307619e-08, + "loss": 1.6427, + "step": 13790 + }, + { + "epoch": 3.8809959206639473, + "grad_norm": 3.125, + "learning_rate": 2.1354747179531674e-08, + "loss": 1.5292, + "step": 13795 + }, + { + "epoch": 3.882402588268392, + "grad_norm": 3.171875, + "learning_rate": 2.0850959165069403e-08, + "loss": 1.4787, + "step": 13800 + }, + { + "epoch": 3.883809255872837, + "grad_norm": 3.03125, + "learning_rate": 2.035316929909614e-08, + "loss": 1.6944, + "step": 13805 + }, + { + "epoch": 3.8852159234772823, + "grad_norm": 3.015625, + "learning_rate": 1.9861378331978318e-08, + "loss": 1.5436, + "step": 13810 + }, + { + "epoch": 3.8866225910817276, + "grad_norm": 2.96875, + "learning_rate": 1.937558700503894e-08, + "loss": 1.8179, + "step": 13815 + }, + { + "epoch": 3.8880292586861724, + "grad_norm": 3.765625, + "learning_rate": 1.8895796050557134e-08, + "loss": 1.3398, + "step": 13820 + }, + { + "epoch": 3.8894359262906173, + "grad_norm": 3.0625, + "learning_rate": 1.8422006191766813e-08, + "loss": 1.6199, + "step": 13825 + }, + { + "epoch": 3.8908425938950626, + "grad_norm": 3.296875, + "learning_rate": 1.795421814285758e-08, + "loss": 1.4014, + "step": 13830 + }, + { + "epoch": 3.892249261499508, + "grad_norm": 3.171875, + "learning_rate": 1.7492432608969375e-08, + "loss": 1.7273, + "step": 13835 + }, + { + "epoch": 3.8936559291039528, + "grad_norm": 4.96875, + "learning_rate": 1.7036650286196498e-08, + "loss": 1.3763, + "step": 13840 + }, + { + "epoch": 3.8950625967083976, + "grad_norm": 3.640625, + "learning_rate": 1.6586871861581807e-08, + "loss": 1.6194, + "step": 13845 + }, + { + "epoch": 3.896469264312843, + "grad_norm": 3.53125, + "learning_rate": 1.6143098013119415e-08, + "loss": 1.4738, + "step": 13850 + }, + { + "epoch": 3.897875931917288, + "grad_norm": 3.46875, + "learning_rate": 1.5705329409751556e-08, + "loss": 1.6051, + "step": 13855 + }, + { + "epoch": 3.899282599521733, + "grad_norm": 3.734375, + "learning_rate": 1.5273566711369036e-08, + "loss": 1.6372, + "step": 13860 + }, + { + "epoch": 3.900689267126178, + "grad_norm": 3.484375, + "learning_rate": 1.4847810568807683e-08, + "loss": 1.5585, + "step": 13865 + }, + { + "epoch": 3.902095934730623, + "grad_norm": 3.953125, + "learning_rate": 1.4428061623850573e-08, + "loss": 1.4838, + "step": 13870 + }, + { + "epoch": 3.903502602335068, + "grad_norm": 3.25, + "learning_rate": 1.4014320509224909e-08, + "loss": 1.6295, + "step": 13875 + }, + { + "epoch": 3.9049092699395134, + "grad_norm": 3.296875, + "learning_rate": 1.3606587848602024e-08, + "loss": 1.6314, + "step": 13880 + }, + { + "epoch": 3.9063159375439582, + "grad_norm": 3.390625, + "learning_rate": 1.3204864256596504e-08, + "loss": 1.5786, + "step": 13885 + }, + { + "epoch": 3.9077226051484035, + "grad_norm": 3.375, + "learning_rate": 1.2809150338763064e-08, + "loss": 1.345, + "step": 13890 + }, + { + "epoch": 3.9091292727528484, + "grad_norm": 3.25, + "learning_rate": 1.241944669160011e-08, + "loss": 1.7872, + "step": 13895 + }, + { + "epoch": 3.9105359403572937, + "grad_norm": 3.046875, + "learning_rate": 1.203575390254441e-08, + "loss": 1.6342, + "step": 13900 + }, + { + "epoch": 3.9119426079617385, + "grad_norm": 3.09375, + "learning_rate": 1.1658072549971975e-08, + "loss": 1.3083, + "step": 13905 + }, + { + "epoch": 3.913349275566184, + "grad_norm": 3.375, + "learning_rate": 1.1286403203198513e-08, + "loss": 1.6906, + "step": 13910 + }, + { + "epoch": 3.9147559431706287, + "grad_norm": 4.71875, + "learning_rate": 1.0920746422476313e-08, + "loss": 1.3366, + "step": 13915 + }, + { + "epoch": 3.916162610775074, + "grad_norm": 3.046875, + "learning_rate": 1.05611027589938e-08, + "loss": 1.4893, + "step": 13920 + }, + { + "epoch": 3.917569278379519, + "grad_norm": 3.109375, + "learning_rate": 1.0207472754876878e-08, + "loss": 1.844, + "step": 13925 + }, + { + "epoch": 3.918975945983964, + "grad_norm": 4.21875, + "learning_rate": 9.859856943184919e-09, + "loss": 1.7505, + "step": 13930 + }, + { + "epoch": 3.920382613588409, + "grad_norm": 3.984375, + "learning_rate": 9.518255847912548e-09, + "loss": 1.6995, + "step": 13935 + }, + { + "epoch": 3.9217892811928543, + "grad_norm": 3.40625, + "learning_rate": 9.182669983986979e-09, + "loss": 1.4661, + "step": 13940 + }, + { + "epoch": 3.923195948797299, + "grad_norm": 3.671875, + "learning_rate": 8.853099857269342e-09, + "loss": 1.2838, + "step": 13945 + }, + { + "epoch": 3.924602616401744, + "grad_norm": 3.6875, + "learning_rate": 8.529545964551577e-09, + "loss": 1.5647, + "step": 13950 + }, + { + "epoch": 3.9260092840061893, + "grad_norm": 2.65625, + "learning_rate": 8.212008793556436e-09, + "loss": 1.657, + "step": 13955 + }, + { + "epoch": 3.9274159516106346, + "grad_norm": 4.0, + "learning_rate": 7.900488822939255e-09, + "loss": 1.4927, + "step": 13960 + }, + { + "epoch": 3.9288226192150795, + "grad_norm": 3.046875, + "learning_rate": 7.594986522282188e-09, + "loss": 1.604, + "step": 13965 + }, + { + "epoch": 3.9302292868195243, + "grad_norm": 3.765625, + "learning_rate": 7.295502352098637e-09, + "loss": 1.5973, + "step": 13970 + }, + { + "epoch": 3.9316359544239696, + "grad_norm": 3.53125, + "learning_rate": 7.002036763829267e-09, + "loss": 1.6541, + "step": 13975 + }, + { + "epoch": 3.933042622028415, + "grad_norm": 3.15625, + "learning_rate": 6.7145901998424404e-09, + "loss": 1.6558, + "step": 13980 + }, + { + "epoch": 3.93444928963286, + "grad_norm": 2.90625, + "learning_rate": 6.43316309343378e-09, + "loss": 1.7133, + "step": 13985 + }, + { + "epoch": 3.9358559572373046, + "grad_norm": 3.921875, + "learning_rate": 6.157755868824832e-09, + "loss": 1.5935, + "step": 13990 + }, + { + "epoch": 3.93726262484175, + "grad_norm": 2.515625, + "learning_rate": 5.888368941163513e-09, + "loss": 1.4415, + "step": 13995 + }, + { + "epoch": 3.9386692924461952, + "grad_norm": 3.234375, + "learning_rate": 5.625002716521887e-09, + "loss": 1.6804, + "step": 14000 + }, + { + "epoch": 3.94007596005064, + "grad_norm": 3.8125, + "learning_rate": 5.3676575918966125e-09, + "loss": 1.4467, + "step": 14005 + }, + { + "epoch": 3.941482627655085, + "grad_norm": 2.71875, + "learning_rate": 5.1163339552084964e-09, + "loss": 1.4877, + "step": 14010 + }, + { + "epoch": 3.9428892952595302, + "grad_norm": 3.75, + "learning_rate": 4.871032185302048e-09, + "loss": 1.4424, + "step": 14015 + }, + { + "epoch": 3.944295962863975, + "grad_norm": 3.625, + "learning_rate": 4.631752651943266e-09, + "loss": 1.6061, + "step": 14020 + }, + { + "epoch": 3.9457026304684204, + "grad_norm": 2.515625, + "learning_rate": 4.39849571582096e-09, + "loss": 1.4244, + "step": 14025 + }, + { + "epoch": 3.9471092980728653, + "grad_norm": 4.15625, + "learning_rate": 4.171261728545428e-09, + "loss": 1.4968, + "step": 14030 + }, + { + "epoch": 3.9485159656773106, + "grad_norm": 2.375, + "learning_rate": 3.950051032648449e-09, + "loss": 1.5433, + "step": 14035 + }, + { + "epoch": 3.9499226332817554, + "grad_norm": 4.4375, + "learning_rate": 3.734863961581069e-09, + "loss": 1.6523, + "step": 14040 + }, + { + "epoch": 3.9513293008862007, + "grad_norm": 4.6875, + "learning_rate": 3.525700839715817e-09, + "loss": 1.3554, + "step": 14045 + }, + { + "epoch": 3.9527359684906456, + "grad_norm": 3.65625, + "learning_rate": 3.322561982343597e-09, + "loss": 1.5458, + "step": 14050 + }, + { + "epoch": 3.954142636095091, + "grad_norm": 2.625, + "learning_rate": 3.1254476956750207e-09, + "loss": 1.5498, + "step": 14055 + }, + { + "epoch": 3.9555493036995357, + "grad_norm": 3.796875, + "learning_rate": 2.9343582768395215e-09, + "loss": 1.5235, + "step": 14060 + }, + { + "epoch": 3.956955971303981, + "grad_norm": 2.703125, + "learning_rate": 2.7492940138840183e-09, + "loss": 1.4832, + "step": 14065 + }, + { + "epoch": 3.958362638908426, + "grad_norm": 2.609375, + "learning_rate": 2.5702551857733623e-09, + "loss": 1.6262, + "step": 14070 + }, + { + "epoch": 3.9597693065128707, + "grad_norm": 3.203125, + "learning_rate": 2.3972420623898927e-09, + "loss": 1.6804, + "step": 14075 + }, + { + "epoch": 3.961175974117316, + "grad_norm": 3.0625, + "learning_rate": 2.230254904532547e-09, + "loss": 1.3863, + "step": 14080 + }, + { + "epoch": 3.9625826417217613, + "grad_norm": 5.03125, + "learning_rate": 2.069293963916419e-09, + "loss": 1.4973, + "step": 14085 + }, + { + "epoch": 3.963989309326206, + "grad_norm": 3.1875, + "learning_rate": 1.9143594831740882e-09, + "loss": 1.6832, + "step": 14090 + }, + { + "epoch": 3.965395976930651, + "grad_norm": 3.640625, + "learning_rate": 1.7654516958525156e-09, + "loss": 1.3712, + "step": 14095 + }, + { + "epoch": 3.9668026445350963, + "grad_norm": 3.5625, + "learning_rate": 1.6225708264148153e-09, + "loss": 1.4175, + "step": 14100 + }, + { + "epoch": 3.9682093121395416, + "grad_norm": 4.125, + "learning_rate": 1.4857170902384807e-09, + "loss": 1.5806, + "step": 14105 + }, + { + "epoch": 3.9696159797439865, + "grad_norm": 2.875, + "learning_rate": 1.354890693616273e-09, + "loss": 1.7819, + "step": 14110 + }, + { + "epoch": 3.9710226473484314, + "grad_norm": 5.59375, + "learning_rate": 1.2300918337553312e-09, + "loss": 1.413, + "step": 14115 + }, + { + "epoch": 3.9724293149528767, + "grad_norm": 2.875, + "learning_rate": 1.1113206987767298e-09, + "loss": 1.66, + "step": 14120 + }, + { + "epoch": 3.973835982557322, + "grad_norm": 2.859375, + "learning_rate": 9.985774677150339e-10, + "loss": 1.5321, + "step": 14125 + }, + { + "epoch": 3.975242650161767, + "grad_norm": 3.0625, + "learning_rate": 8.91862310519631e-10, + "loss": 1.4343, + "step": 14130 + }, + { + "epoch": 3.9766493177662117, + "grad_norm": 3.0, + "learning_rate": 7.911753880516236e-10, + "loss": 1.6013, + "step": 14135 + }, + { + "epoch": 3.978055985370657, + "grad_norm": 3.4375, + "learning_rate": 6.965168520864928e-10, + "loss": 1.517, + "step": 14140 + }, + { + "epoch": 3.979462652975102, + "grad_norm": 3.6875, + "learning_rate": 6.07886845311878e-10, + "loss": 1.6839, + "step": 14145 + }, + { + "epoch": 3.980869320579547, + "grad_norm": 3.625, + "learning_rate": 5.252855013280211e-10, + "loss": 1.5155, + "step": 14150 + }, + { + "epoch": 3.982275988183992, + "grad_norm": 2.609375, + "learning_rate": 4.487129446477667e-10, + "loss": 1.6638, + "step": 14155 + }, + { + "epoch": 3.9836826557884373, + "grad_norm": 2.953125, + "learning_rate": 3.7816929069656165e-10, + "loss": 1.6047, + "step": 14160 + }, + { + "epoch": 3.985089323392882, + "grad_norm": 3.25, + "learning_rate": 3.1365464581112334e-10, + "loss": 1.5561, + "step": 14165 + }, + { + "epoch": 3.9864959909973274, + "grad_norm": 3.53125, + "learning_rate": 2.5516910724077133e-10, + "loss": 1.7709, + "step": 14170 + }, + { + "epoch": 3.9879026586017723, + "grad_norm": 3.15625, + "learning_rate": 2.0271276314565155e-10, + "loss": 1.612, + "step": 14175 + }, + { + "epoch": 3.9893093262062176, + "grad_norm": 3.203125, + "learning_rate": 1.5628569259940049e-10, + "loss": 1.4502, + "step": 14180 + }, + { + "epoch": 3.9907159938106624, + "grad_norm": 3.109375, + "learning_rate": 1.1588796558470449e-10, + "loss": 1.6444, + "step": 14185 + }, + { + "epoch": 3.9921226614151077, + "grad_norm": 3.296875, + "learning_rate": 8.15196429977405e-11, + "loss": 1.2351, + "step": 14190 + }, + { + "epoch": 3.9935293290195526, + "grad_norm": 4.125, + "learning_rate": 5.3180776644623505e-11, + "loss": 1.6336, + "step": 14195 + }, + { + "epoch": 3.994935996623998, + "grad_norm": 3.421875, + "learning_rate": 3.087140924318277e-11, + "loss": 1.638, + "step": 14200 + }, + { + "epoch": 3.9963426642284428, + "grad_norm": 4.0625, + "learning_rate": 1.4591574422961883e-11, + "loss": 1.7758, + "step": 14205 + }, + { + "epoch": 3.997749331832888, + "grad_norm": 3.484375, + "learning_rate": 4.3412967238865004e-12, + "loss": 1.8328, + "step": 14210 + }, + { + "epoch": 3.999155999437333, + "grad_norm": 3.75, + "learning_rate": 1.2059159670840813e-13, + "loss": 1.8058, + "step": 14215 + }, + { + "epoch": 3.9994373329582222, + "eval_loss": 1.576697826385498, + "eval_runtime": 330.563, + "eval_samples_per_second": 9.553, + "eval_steps_per_second": 4.777, + "step": 14216 + }, + { + "epoch": 3.9994373329582222, + "step": 14216, + "total_flos": 1.684239053169361e+18, + "train_loss": 1.6145593146796164, + "train_runtime": 58184.933, + "train_samples_per_second": 1.955, + "train_steps_per_second": 0.244 + } + ], + "logging_steps": 5, + "max_steps": 14216, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 1, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.684239053169361e+18, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}