{ "best_metric": 0.3030637502670288, "best_model_checkpoint": "data/hansken_human_hql_v2/checkpoint-3154", "epoch": 8.999365884590995, "eval_steps": 500, "global_step": 7096, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012682308180088776, "grad_norm": 0.8203513622283936, "learning_rate": 8.460236886632826e-08, "loss": 1.5999, "step": 1 }, { "epoch": 0.0063411540900443885, "grad_norm": 0.7799253463745117, "learning_rate": 4.230118443316413e-07, "loss": 1.6165, "step": 5 }, { "epoch": 0.012682308180088777, "grad_norm": 0.7865122556686401, "learning_rate": 8.460236886632826e-07, "loss": 1.5683, "step": 10 }, { "epoch": 0.019023462270133164, "grad_norm": 0.7803131341934204, "learning_rate": 1.2690355329949238e-06, "loss": 1.5978, "step": 15 }, { "epoch": 0.025364616360177554, "grad_norm": 0.9768760204315186, "learning_rate": 1.6920473773265652e-06, "loss": 1.5416, "step": 20 }, { "epoch": 0.03170577045022194, "grad_norm": 0.7657057046890259, "learning_rate": 2.1150592216582065e-06, "loss": 1.5549, "step": 25 }, { "epoch": 0.03804692454026633, "grad_norm": 0.8321160674095154, "learning_rate": 2.5380710659898476e-06, "loss": 1.5671, "step": 30 }, { "epoch": 0.04438807863031072, "grad_norm": 0.8168915510177612, "learning_rate": 2.961082910321489e-06, "loss": 1.6318, "step": 35 }, { "epoch": 0.05072923272035511, "grad_norm": 0.8272048234939575, "learning_rate": 3.3840947546531303e-06, "loss": 1.4833, "step": 40 }, { "epoch": 0.05707038681039949, "grad_norm": 0.8651142716407776, "learning_rate": 3.807106598984772e-06, "loss": 1.4499, "step": 45 }, { "epoch": 0.06341154090044387, "grad_norm": 0.7288665771484375, "learning_rate": 4.230118443316413e-06, "loss": 1.4514, "step": 50 }, { "epoch": 0.06975269499048826, "grad_norm": 0.5823898911476135, "learning_rate": 4.6531302876480546e-06, "loss": 1.4289, "step": 55 }, { "epoch": 0.07609384908053266, "grad_norm": 0.507546603679657, "learning_rate": 5.076142131979695e-06, "loss": 1.4807, "step": 60 }, { "epoch": 0.08243500317057705, "grad_norm": 0.4384841024875641, "learning_rate": 5.499153976311338e-06, "loss": 1.3578, "step": 65 }, { "epoch": 0.08877615726062144, "grad_norm": 0.3931611180305481, "learning_rate": 5.922165820642978e-06, "loss": 1.3892, "step": 70 }, { "epoch": 0.09511731135066583, "grad_norm": 0.39905428886413574, "learning_rate": 6.345177664974619e-06, "loss": 1.2954, "step": 75 }, { "epoch": 0.10145846544071022, "grad_norm": 0.3486492335796356, "learning_rate": 6.768189509306261e-06, "loss": 1.2869, "step": 80 }, { "epoch": 0.10779961953075459, "grad_norm": 0.37722063064575195, "learning_rate": 7.191201353637902e-06, "loss": 1.2925, "step": 85 }, { "epoch": 0.11414077362079898, "grad_norm": 0.4812447130680084, "learning_rate": 7.614213197969544e-06, "loss": 1.307, "step": 90 }, { "epoch": 0.12048192771084337, "grad_norm": 0.3943069279193878, "learning_rate": 8.037225042301184e-06, "loss": 1.2716, "step": 95 }, { "epoch": 0.12682308180088775, "grad_norm": 0.3306090533733368, "learning_rate": 8.460236886632826e-06, "loss": 1.2246, "step": 100 }, { "epoch": 0.13316423589093215, "grad_norm": 0.34389835596084595, "learning_rate": 8.883248730964468e-06, "loss": 1.208, "step": 105 }, { "epoch": 0.13950538998097653, "grad_norm": 0.3812473714351654, "learning_rate": 9.306260575296109e-06, "loss": 1.3009, "step": 110 }, { "epoch": 0.14584654407102093, "grad_norm": 0.2907530963420868, "learning_rate": 9.729272419627749e-06, "loss": 1.1474, "step": 115 }, { "epoch": 0.1521876981610653, "grad_norm": 0.27222707867622375, "learning_rate": 1.015228426395939e-05, "loss": 1.1206, "step": 120 }, { "epoch": 0.15852885225110971, "grad_norm": 0.3077019453048706, "learning_rate": 1.0575296108291032e-05, "loss": 1.1131, "step": 125 }, { "epoch": 0.1648700063411541, "grad_norm": 0.2848523259162903, "learning_rate": 1.0998307952622675e-05, "loss": 1.1683, "step": 130 }, { "epoch": 0.17121116043119847, "grad_norm": 0.3003387749195099, "learning_rate": 1.1421319796954315e-05, "loss": 1.0797, "step": 135 }, { "epoch": 0.17755231452124287, "grad_norm": 0.3172609508037567, "learning_rate": 1.1844331641285957e-05, "loss": 1.0181, "step": 140 }, { "epoch": 0.18389346861128725, "grad_norm": 0.2948904037475586, "learning_rate": 1.2267343485617598e-05, "loss": 0.9826, "step": 145 }, { "epoch": 0.19023462270133165, "grad_norm": 0.2865538001060486, "learning_rate": 1.2690355329949238e-05, "loss": 0.9901, "step": 150 }, { "epoch": 0.19657577679137603, "grad_norm": 0.2940317690372467, "learning_rate": 1.311336717428088e-05, "loss": 0.9767, "step": 155 }, { "epoch": 0.20291693088142043, "grad_norm": 0.3364640176296234, "learning_rate": 1.3536379018612521e-05, "loss": 0.9626, "step": 160 }, { "epoch": 0.2092580849714648, "grad_norm": 0.3012336492538452, "learning_rate": 1.3959390862944163e-05, "loss": 0.9114, "step": 165 }, { "epoch": 0.21559923906150918, "grad_norm": 0.31542471051216125, "learning_rate": 1.4382402707275804e-05, "loss": 0.8821, "step": 170 }, { "epoch": 0.2219403931515536, "grad_norm": 0.3387998938560486, "learning_rate": 1.4805414551607446e-05, "loss": 0.887, "step": 175 }, { "epoch": 0.22828154724159797, "grad_norm": 0.3657248616218567, "learning_rate": 1.5228426395939088e-05, "loss": 0.8154, "step": 180 }, { "epoch": 0.23462270133164237, "grad_norm": 0.35051897168159485, "learning_rate": 1.5651438240270726e-05, "loss": 0.8332, "step": 185 }, { "epoch": 0.24096385542168675, "grad_norm": 0.2987581193447113, "learning_rate": 1.607445008460237e-05, "loss": 0.7817, "step": 190 }, { "epoch": 0.24730500951173112, "grad_norm": 0.25834420323371887, "learning_rate": 1.649746192893401e-05, "loss": 0.7543, "step": 195 }, { "epoch": 0.2536461636017755, "grad_norm": 0.28276240825653076, "learning_rate": 1.6920473773265652e-05, "loss": 0.6852, "step": 200 }, { "epoch": 0.25998731769181993, "grad_norm": 0.289991170167923, "learning_rate": 1.7343485617597295e-05, "loss": 0.7414, "step": 205 }, { "epoch": 0.2663284717818643, "grad_norm": 0.29507017135620117, "learning_rate": 1.7766497461928935e-05, "loss": 0.7004, "step": 210 }, { "epoch": 0.2726696258719087, "grad_norm": 0.3034938871860504, "learning_rate": 1.818950930626058e-05, "loss": 0.6837, "step": 215 }, { "epoch": 0.27901077996195306, "grad_norm": 0.26063042879104614, "learning_rate": 1.8612521150592218e-05, "loss": 0.6134, "step": 220 }, { "epoch": 0.2853519340519975, "grad_norm": 0.26957449316978455, "learning_rate": 1.9035532994923858e-05, "loss": 0.6651, "step": 225 }, { "epoch": 0.29169308814204187, "grad_norm": 0.28434062004089355, "learning_rate": 1.9458544839255498e-05, "loss": 0.591, "step": 230 }, { "epoch": 0.29803424223208624, "grad_norm": 0.27161988615989685, "learning_rate": 1.988155668358714e-05, "loss": 0.6384, "step": 235 }, { "epoch": 0.3043753963221306, "grad_norm": 0.266771137714386, "learning_rate": 2.030456852791878e-05, "loss": 0.6098, "step": 240 }, { "epoch": 0.310716550412175, "grad_norm": 0.2873334586620331, "learning_rate": 2.0727580372250424e-05, "loss": 0.5657, "step": 245 }, { "epoch": 0.31705770450221943, "grad_norm": 0.3301062285900116, "learning_rate": 2.1150592216582064e-05, "loss": 0.6102, "step": 250 }, { "epoch": 0.3233988585922638, "grad_norm": 0.29614776372909546, "learning_rate": 2.1573604060913707e-05, "loss": 0.621, "step": 255 }, { "epoch": 0.3297400126823082, "grad_norm": 0.3532796800136566, "learning_rate": 2.199661590524535e-05, "loss": 0.572, "step": 260 }, { "epoch": 0.33608116677235256, "grad_norm": 0.3184033930301666, "learning_rate": 2.2419627749576987e-05, "loss": 0.5943, "step": 265 }, { "epoch": 0.34242232086239693, "grad_norm": 0.3125542402267456, "learning_rate": 2.284263959390863e-05, "loss": 0.6029, "step": 270 }, { "epoch": 0.34876347495244137, "grad_norm": 0.2835546135902405, "learning_rate": 2.326565143824027e-05, "loss": 0.5386, "step": 275 }, { "epoch": 0.35510462904248574, "grad_norm": 0.3134188652038574, "learning_rate": 2.3688663282571914e-05, "loss": 0.525, "step": 280 }, { "epoch": 0.3614457831325301, "grad_norm": 0.33431121706962585, "learning_rate": 2.4111675126903553e-05, "loss": 0.5774, "step": 285 }, { "epoch": 0.3677869372225745, "grad_norm": 0.2846366763114929, "learning_rate": 2.4534686971235197e-05, "loss": 0.5067, "step": 290 }, { "epoch": 0.37412809131261887, "grad_norm": 0.3202008306980133, "learning_rate": 2.4957698815566837e-05, "loss": 0.5269, "step": 295 }, { "epoch": 0.3804692454026633, "grad_norm": 0.325595885515213, "learning_rate": 2.5380710659898476e-05, "loss": 0.5891, "step": 300 }, { "epoch": 0.3868103994927077, "grad_norm": 0.2878698706626892, "learning_rate": 2.580372250423012e-05, "loss": 0.5242, "step": 305 }, { "epoch": 0.39315155358275206, "grad_norm": 0.3068866729736328, "learning_rate": 2.622673434856176e-05, "loss": 0.5805, "step": 310 }, { "epoch": 0.39949270767279643, "grad_norm": 0.31244412064552307, "learning_rate": 2.6649746192893406e-05, "loss": 0.5242, "step": 315 }, { "epoch": 0.40583386176284086, "grad_norm": 0.3235008120536804, "learning_rate": 2.7072758037225043e-05, "loss": 0.5021, "step": 320 }, { "epoch": 0.41217501585288524, "grad_norm": 0.3150944709777832, "learning_rate": 2.7495769881556682e-05, "loss": 0.4843, "step": 325 }, { "epoch": 0.4185161699429296, "grad_norm": 0.32756704092025757, "learning_rate": 2.7918781725888326e-05, "loss": 0.5039, "step": 330 }, { "epoch": 0.424857324032974, "grad_norm": 0.35633939504623413, "learning_rate": 2.8341793570219966e-05, "loss": 0.5234, "step": 335 }, { "epoch": 0.43119847812301837, "grad_norm": 0.352976530790329, "learning_rate": 2.876480541455161e-05, "loss": 0.5211, "step": 340 }, { "epoch": 0.4375396322130628, "grad_norm": 0.3197077810764313, "learning_rate": 2.918781725888325e-05, "loss": 0.5264, "step": 345 }, { "epoch": 0.4438807863031072, "grad_norm": 0.3268304467201233, "learning_rate": 2.9610829103214892e-05, "loss": 0.5131, "step": 350 }, { "epoch": 0.45022194039315155, "grad_norm": 0.31383514404296875, "learning_rate": 3.0033840947546532e-05, "loss": 0.4621, "step": 355 }, { "epoch": 0.45656309448319593, "grad_norm": 0.39162302017211914, "learning_rate": 3.0456852791878175e-05, "loss": 0.5187, "step": 360 }, { "epoch": 0.4629042485732403, "grad_norm": 0.3349419832229614, "learning_rate": 3.087986463620982e-05, "loss": 0.5129, "step": 365 }, { "epoch": 0.46924540266328474, "grad_norm": 0.3417114019393921, "learning_rate": 3.130287648054145e-05, "loss": 0.4637, "step": 370 }, { "epoch": 0.4755865567533291, "grad_norm": 0.3470960259437561, "learning_rate": 3.17258883248731e-05, "loss": 0.4843, "step": 375 }, { "epoch": 0.4819277108433735, "grad_norm": 0.3389221727848053, "learning_rate": 3.214890016920474e-05, "loss": 0.5111, "step": 380 }, { "epoch": 0.48826886493341787, "grad_norm": 0.33506959676742554, "learning_rate": 3.2571912013536385e-05, "loss": 0.4714, "step": 385 }, { "epoch": 0.49461001902346224, "grad_norm": 0.340305894613266, "learning_rate": 3.299492385786802e-05, "loss": 0.4933, "step": 390 }, { "epoch": 0.5009511731135067, "grad_norm": 0.34868305921554565, "learning_rate": 3.3417935702199664e-05, "loss": 0.4626, "step": 395 }, { "epoch": 0.507292327203551, "grad_norm": 0.3472588360309601, "learning_rate": 3.3840947546531304e-05, "loss": 0.5079, "step": 400 }, { "epoch": 0.5136334812935954, "grad_norm": 0.3772400915622711, "learning_rate": 3.4263959390862944e-05, "loss": 0.4868, "step": 405 }, { "epoch": 0.5199746353836399, "grad_norm": 0.3427276909351349, "learning_rate": 3.468697123519459e-05, "loss": 0.4814, "step": 410 }, { "epoch": 0.5263157894736842, "grad_norm": 0.3400866687297821, "learning_rate": 3.5109983079526224e-05, "loss": 0.5828, "step": 415 }, { "epoch": 0.5326569435637286, "grad_norm": 0.40560877323150635, "learning_rate": 3.553299492385787e-05, "loss": 0.4782, "step": 420 }, { "epoch": 0.5389980976537729, "grad_norm": 0.35967177152633667, "learning_rate": 3.595600676818951e-05, "loss": 0.4646, "step": 425 }, { "epoch": 0.5453392517438174, "grad_norm": 0.38135913014411926, "learning_rate": 3.637901861252116e-05, "loss": 0.4544, "step": 430 }, { "epoch": 0.5516804058338618, "grad_norm": 0.33531418442726135, "learning_rate": 3.680203045685279e-05, "loss": 0.4737, "step": 435 }, { "epoch": 0.5580215599239061, "grad_norm": 0.37541505694389343, "learning_rate": 3.7225042301184437e-05, "loss": 0.4709, "step": 440 }, { "epoch": 0.5643627140139506, "grad_norm": 0.36099573969841003, "learning_rate": 3.7648054145516076e-05, "loss": 0.4458, "step": 445 }, { "epoch": 0.570703868103995, "grad_norm": 0.3607345223426819, "learning_rate": 3.8071065989847716e-05, "loss": 0.4362, "step": 450 }, { "epoch": 0.5770450221940393, "grad_norm": 0.3462175130844116, "learning_rate": 3.8494077834179356e-05, "loss": 0.4576, "step": 455 }, { "epoch": 0.5833861762840837, "grad_norm": 0.44858184456825256, "learning_rate": 3.8917089678510996e-05, "loss": 0.4522, "step": 460 }, { "epoch": 0.5897273303741281, "grad_norm": 0.37734973430633545, "learning_rate": 3.934010152284264e-05, "loss": 0.4247, "step": 465 }, { "epoch": 0.5960684844641725, "grad_norm": 0.3667463958263397, "learning_rate": 3.976311336717428e-05, "loss": 0.4458, "step": 470 }, { "epoch": 0.6024096385542169, "grad_norm": 0.3749213218688965, "learning_rate": 4.018612521150593e-05, "loss": 0.4431, "step": 475 }, { "epoch": 0.6087507926442612, "grad_norm": 0.5783745646476746, "learning_rate": 4.060913705583756e-05, "loss": 0.4567, "step": 480 }, { "epoch": 0.6150919467343057, "grad_norm": 0.3504064977169037, "learning_rate": 4.103214890016921e-05, "loss": 0.4361, "step": 485 }, { "epoch": 0.62143310082435, "grad_norm": 0.37889033555984497, "learning_rate": 4.145516074450085e-05, "loss": 0.4672, "step": 490 }, { "epoch": 0.6277742549143944, "grad_norm": 0.3864586055278778, "learning_rate": 4.187817258883249e-05, "loss": 0.445, "step": 495 }, { "epoch": 0.6341154090044389, "grad_norm": 0.4208654463291168, "learning_rate": 4.230118443316413e-05, "loss": 0.4256, "step": 500 }, { "epoch": 0.6404565630944832, "grad_norm": 0.48186224699020386, "learning_rate": 4.272419627749577e-05, "loss": 0.3888, "step": 505 }, { "epoch": 0.6467977171845276, "grad_norm": 0.4014621376991272, "learning_rate": 4.3147208121827415e-05, "loss": 0.4016, "step": 510 }, { "epoch": 0.6531388712745719, "grad_norm": 0.3794398605823517, "learning_rate": 4.3570219966159055e-05, "loss": 0.3891, "step": 515 }, { "epoch": 0.6594800253646164, "grad_norm": 0.34651198983192444, "learning_rate": 4.39932318104907e-05, "loss": 0.4338, "step": 520 }, { "epoch": 0.6658211794546608, "grad_norm": 0.41434913873672485, "learning_rate": 4.4416243654822335e-05, "loss": 0.3963, "step": 525 }, { "epoch": 0.6721623335447051, "grad_norm": 0.4929732084274292, "learning_rate": 4.4839255499153974e-05, "loss": 0.4065, "step": 530 }, { "epoch": 0.6785034876347495, "grad_norm": 0.3965291678905487, "learning_rate": 4.526226734348562e-05, "loss": 0.3902, "step": 535 }, { "epoch": 0.6848446417247939, "grad_norm": 0.5421135425567627, "learning_rate": 4.568527918781726e-05, "loss": 0.3845, "step": 540 }, { "epoch": 0.6911857958148383, "grad_norm": 0.4427511394023895, "learning_rate": 4.61082910321489e-05, "loss": 0.4044, "step": 545 }, { "epoch": 0.6975269499048827, "grad_norm": 0.4127551019191742, "learning_rate": 4.653130287648054e-05, "loss": 0.4106, "step": 550 }, { "epoch": 0.703868103994927, "grad_norm": 0.3919977843761444, "learning_rate": 4.695431472081219e-05, "loss": 0.4019, "step": 555 }, { "epoch": 0.7102092580849715, "grad_norm": 0.44385024905204773, "learning_rate": 4.737732656514383e-05, "loss": 0.3714, "step": 560 }, { "epoch": 0.7165504121750158, "grad_norm": 0.4373217821121216, "learning_rate": 4.780033840947547e-05, "loss": 0.4071, "step": 565 }, { "epoch": 0.7228915662650602, "grad_norm": 0.6990036368370056, "learning_rate": 4.822335025380711e-05, "loss": 0.4071, "step": 570 }, { "epoch": 0.7292327203551047, "grad_norm": 0.5950965881347656, "learning_rate": 4.864636209813875e-05, "loss": 0.4011, "step": 575 }, { "epoch": 0.735573874445149, "grad_norm": 0.6065270304679871, "learning_rate": 4.906937394247039e-05, "loss": 0.4358, "step": 580 }, { "epoch": 0.7419150285351934, "grad_norm": 0.35955673456192017, "learning_rate": 4.949238578680203e-05, "loss": 0.3949, "step": 585 }, { "epoch": 0.7482561826252377, "grad_norm": 0.39565518498420715, "learning_rate": 4.991539763113367e-05, "loss": 0.4076, "step": 590 }, { "epoch": 0.7545973367152822, "grad_norm": 0.43297094106674194, "learning_rate": 5.033840947546532e-05, "loss": 0.3834, "step": 595 }, { "epoch": 0.7609384908053266, "grad_norm": 0.7051054239273071, "learning_rate": 5.076142131979695e-05, "loss": 0.3877, "step": 600 }, { "epoch": 0.7672796448953709, "grad_norm": 0.503436803817749, "learning_rate": 5.11844331641286e-05, "loss": 0.3545, "step": 605 }, { "epoch": 0.7736207989854154, "grad_norm": 0.38463765382766724, "learning_rate": 5.160744500846024e-05, "loss": 0.3493, "step": 610 }, { "epoch": 0.7799619530754597, "grad_norm": 0.405073344707489, "learning_rate": 5.203045685279187e-05, "loss": 0.4395, "step": 615 }, { "epoch": 0.7863031071655041, "grad_norm": 0.4221075177192688, "learning_rate": 5.245346869712352e-05, "loss": 0.397, "step": 620 }, { "epoch": 0.7926442612555485, "grad_norm": 0.43135079741477966, "learning_rate": 5.2876480541455166e-05, "loss": 0.4177, "step": 625 }, { "epoch": 0.7989854153455929, "grad_norm": 0.44515150785446167, "learning_rate": 5.329949238578681e-05, "loss": 0.3809, "step": 630 }, { "epoch": 0.8053265694356373, "grad_norm": 0.44613438844680786, "learning_rate": 5.3722504230118445e-05, "loss": 0.3868, "step": 635 }, { "epoch": 0.8116677235256817, "grad_norm": 0.4737761914730072, "learning_rate": 5.4145516074450085e-05, "loss": 0.3814, "step": 640 }, { "epoch": 0.818008877615726, "grad_norm": 0.5453035235404968, "learning_rate": 5.456852791878173e-05, "loss": 0.3516, "step": 645 }, { "epoch": 0.8243500317057705, "grad_norm": 0.3765818774700165, "learning_rate": 5.4991539763113365e-05, "loss": 0.3472, "step": 650 }, { "epoch": 0.8306911857958148, "grad_norm": 0.3984309136867523, "learning_rate": 5.541455160744501e-05, "loss": 0.3736, "step": 655 }, { "epoch": 0.8370323398858592, "grad_norm": 0.4183863401412964, "learning_rate": 5.583756345177665e-05, "loss": 0.398, "step": 660 }, { "epoch": 0.8433734939759037, "grad_norm": 0.4257148802280426, "learning_rate": 5.62605752961083e-05, "loss": 0.4104, "step": 665 }, { "epoch": 0.849714648065948, "grad_norm": 0.3823991119861603, "learning_rate": 5.668358714043993e-05, "loss": 0.3585, "step": 670 }, { "epoch": 0.8560558021559924, "grad_norm": 0.4168572723865509, "learning_rate": 5.710659898477158e-05, "loss": 0.3529, "step": 675 }, { "epoch": 0.8623969562460367, "grad_norm": 0.3914043605327606, "learning_rate": 5.752961082910322e-05, "loss": 0.4033, "step": 680 }, { "epoch": 0.8687381103360812, "grad_norm": 0.3990406394004822, "learning_rate": 5.7952622673434864e-05, "loss": 0.4192, "step": 685 }, { "epoch": 0.8750792644261256, "grad_norm": 0.40446627140045166, "learning_rate": 5.83756345177665e-05, "loss": 0.3507, "step": 690 }, { "epoch": 0.8814204185161699, "grad_norm": 0.39247220754623413, "learning_rate": 5.8798646362098144e-05, "loss": 0.3837, "step": 695 }, { "epoch": 0.8877615726062144, "grad_norm": 0.40498554706573486, "learning_rate": 5.9221658206429784e-05, "loss": 0.441, "step": 700 }, { "epoch": 0.8941027266962587, "grad_norm": 0.3935602605342865, "learning_rate": 5.964467005076142e-05, "loss": 0.3534, "step": 705 }, { "epoch": 0.9004438807863031, "grad_norm": 0.42168527841567993, "learning_rate": 6.0067681895093064e-05, "loss": 0.3925, "step": 710 }, { "epoch": 0.9067850348763475, "grad_norm": 0.3969208300113678, "learning_rate": 6.049069373942471e-05, "loss": 0.3534, "step": 715 }, { "epoch": 0.9131261889663919, "grad_norm": 0.5047036409378052, "learning_rate": 6.091370558375635e-05, "loss": 0.3734, "step": 720 }, { "epoch": 0.9194673430564363, "grad_norm": 0.334219366312027, "learning_rate": 6.133671742808799e-05, "loss": 0.3653, "step": 725 }, { "epoch": 0.9258084971464806, "grad_norm": 0.39415931701660156, "learning_rate": 6.175972927241964e-05, "loss": 0.3605, "step": 730 }, { "epoch": 0.932149651236525, "grad_norm": 0.44251030683517456, "learning_rate": 6.218274111675127e-05, "loss": 0.3798, "step": 735 }, { "epoch": 0.9384908053265695, "grad_norm": 0.38705533742904663, "learning_rate": 6.26057529610829e-05, "loss": 0.3726, "step": 740 }, { "epoch": 0.9448319594166138, "grad_norm": 0.4148578345775604, "learning_rate": 6.302876480541455e-05, "loss": 0.3501, "step": 745 }, { "epoch": 0.9511731135066582, "grad_norm": 0.4429895877838135, "learning_rate": 6.34517766497462e-05, "loss": 0.3565, "step": 750 }, { "epoch": 0.9575142675967026, "grad_norm": 0.3851292133331299, "learning_rate": 6.387478849407784e-05, "loss": 0.3684, "step": 755 }, { "epoch": 0.963855421686747, "grad_norm": 0.41379883885383606, "learning_rate": 6.429780033840948e-05, "loss": 0.3741, "step": 760 }, { "epoch": 0.9701965757767914, "grad_norm": 0.3557893633842468, "learning_rate": 6.472081218274112e-05, "loss": 0.3284, "step": 765 }, { "epoch": 0.9765377298668357, "grad_norm": 0.3535398542881012, "learning_rate": 6.514382402707277e-05, "loss": 0.3741, "step": 770 }, { "epoch": 0.9828788839568802, "grad_norm": 0.4005914330482483, "learning_rate": 6.55668358714044e-05, "loss": 0.3512, "step": 775 }, { "epoch": 0.9892200380469245, "grad_norm": 0.42649465799331665, "learning_rate": 6.598984771573604e-05, "loss": 0.3879, "step": 780 }, { "epoch": 0.9955611921369689, "grad_norm": 0.4480552077293396, "learning_rate": 6.641285956006768e-05, "loss": 0.3676, "step": 785 }, { "epoch": 0.9993658845909955, "eval_loss": 0.37959593534469604, "eval_runtime": 1446.4681, "eval_samples_per_second": 1.09, "eval_steps_per_second": 1.09, "step": 788 }, { "epoch": 1.0019023462270134, "grad_norm": 0.36123502254486084, "learning_rate": 6.683587140439933e-05, "loss": 0.3936, "step": 790 }, { "epoch": 1.0082435003170578, "grad_norm": 0.34520888328552246, "learning_rate": 6.725888324873096e-05, "loss": 0.3253, "step": 795 }, { "epoch": 1.014584654407102, "grad_norm": 0.3713197410106659, "learning_rate": 6.768189509306261e-05, "loss": 0.367, "step": 800 }, { "epoch": 1.0209258084971464, "grad_norm": 0.3739059269428253, "learning_rate": 6.810490693739425e-05, "loss": 0.3585, "step": 805 }, { "epoch": 1.0272669625871909, "grad_norm": 0.3589707612991333, "learning_rate": 6.852791878172589e-05, "loss": 0.3185, "step": 810 }, { "epoch": 1.0336081166772353, "grad_norm": 0.3680344521999359, "learning_rate": 6.895093062605753e-05, "loss": 0.4076, "step": 815 }, { "epoch": 1.0399492707672797, "grad_norm": 0.39016789197921753, "learning_rate": 6.937394247038918e-05, "loss": 0.3771, "step": 820 }, { "epoch": 1.046290424857324, "grad_norm": 0.3454945683479309, "learning_rate": 6.979695431472081e-05, "loss": 0.3726, "step": 825 }, { "epoch": 1.0526315789473684, "grad_norm": 0.45799320936203003, "learning_rate": 7.021996615905245e-05, "loss": 0.3342, "step": 830 }, { "epoch": 1.0589727330374128, "grad_norm": 0.3723289668560028, "learning_rate": 7.06429780033841e-05, "loss": 0.3739, "step": 835 }, { "epoch": 1.0653138871274572, "grad_norm": 0.4028124213218689, "learning_rate": 7.106598984771574e-05, "loss": 0.3756, "step": 840 }, { "epoch": 1.0716550412175017, "grad_norm": 0.34121423959732056, "learning_rate": 7.148900169204739e-05, "loss": 0.3822, "step": 845 }, { "epoch": 1.0779961953075459, "grad_norm": 0.43818050622940063, "learning_rate": 7.191201353637902e-05, "loss": 0.3383, "step": 850 }, { "epoch": 1.0843373493975903, "grad_norm": 0.5434086322784424, "learning_rate": 7.233502538071067e-05, "loss": 0.3494, "step": 855 }, { "epoch": 1.0906785034876347, "grad_norm": 0.46009552478790283, "learning_rate": 7.275803722504231e-05, "loss": 0.3231, "step": 860 }, { "epoch": 1.0970196575776792, "grad_norm": 0.35976865887641907, "learning_rate": 7.318104906937395e-05, "loss": 0.3772, "step": 865 }, { "epoch": 1.1033608116677236, "grad_norm": 0.36262550950050354, "learning_rate": 7.360406091370558e-05, "loss": 0.3639, "step": 870 }, { "epoch": 1.1097019657577678, "grad_norm": 0.308373361825943, "learning_rate": 7.402707275803723e-05, "loss": 0.3444, "step": 875 }, { "epoch": 1.1160431198478122, "grad_norm": 0.37242528796195984, "learning_rate": 7.445008460236887e-05, "loss": 0.3646, "step": 880 }, { "epoch": 1.1223842739378567, "grad_norm": 0.3745775520801544, "learning_rate": 7.48730964467005e-05, "loss": 0.3409, "step": 885 }, { "epoch": 1.128725428027901, "grad_norm": 0.4131007194519043, "learning_rate": 7.529610829103215e-05, "loss": 0.3632, "step": 890 }, { "epoch": 1.1350665821179455, "grad_norm": 0.4335881173610687, "learning_rate": 7.57191201353638e-05, "loss": 0.4273, "step": 895 }, { "epoch": 1.1414077362079897, "grad_norm": 0.3967609405517578, "learning_rate": 7.614213197969543e-05, "loss": 0.3436, "step": 900 }, { "epoch": 1.1477488902980342, "grad_norm": 0.3518299460411072, "learning_rate": 7.656514382402708e-05, "loss": 0.3282, "step": 905 }, { "epoch": 1.1540900443880786, "grad_norm": 0.3778150975704193, "learning_rate": 7.698815566835871e-05, "loss": 0.3222, "step": 910 }, { "epoch": 1.160431198478123, "grad_norm": 0.35661545395851135, "learning_rate": 7.741116751269036e-05, "loss": 0.3301, "step": 915 }, { "epoch": 1.1667723525681675, "grad_norm": 0.363803893327713, "learning_rate": 7.783417935702199e-05, "loss": 0.3245, "step": 920 }, { "epoch": 1.1731135066582117, "grad_norm": 0.3545622229576111, "learning_rate": 7.825719120135364e-05, "loss": 0.3391, "step": 925 }, { "epoch": 1.1794546607482561, "grad_norm": 0.36038938164711, "learning_rate": 7.868020304568529e-05, "loss": 0.384, "step": 930 }, { "epoch": 1.1857958148383005, "grad_norm": 0.4018946886062622, "learning_rate": 7.910321489001692e-05, "loss": 0.3218, "step": 935 }, { "epoch": 1.192136968928345, "grad_norm": 0.34724465012550354, "learning_rate": 7.952622673434857e-05, "loss": 0.3167, "step": 940 }, { "epoch": 1.1984781230183894, "grad_norm": 0.35832861065864563, "learning_rate": 7.994923857868021e-05, "loss": 0.3198, "step": 945 }, { "epoch": 1.2048192771084336, "grad_norm": 0.4078144133090973, "learning_rate": 8.037225042301186e-05, "loss": 0.3166, "step": 950 }, { "epoch": 1.211160431198478, "grad_norm": 0.36026251316070557, "learning_rate": 8.079526226734349e-05, "loss": 0.3404, "step": 955 }, { "epoch": 1.2175015852885225, "grad_norm": 0.3477899730205536, "learning_rate": 8.121827411167512e-05, "loss": 0.314, "step": 960 }, { "epoch": 1.223842739378567, "grad_norm": 0.4439810514450073, "learning_rate": 8.164128595600677e-05, "loss": 0.3604, "step": 965 }, { "epoch": 1.2301838934686113, "grad_norm": 0.33793196082115173, "learning_rate": 8.206429780033842e-05, "loss": 0.3491, "step": 970 }, { "epoch": 1.2365250475586556, "grad_norm": 0.35255393385887146, "learning_rate": 8.248730964467005e-05, "loss": 0.3462, "step": 975 }, { "epoch": 1.2428662016487, "grad_norm": 0.3338475525379181, "learning_rate": 8.29103214890017e-05, "loss": 0.3291, "step": 980 }, { "epoch": 1.2492073557387444, "grad_norm": 0.3785984218120575, "learning_rate": 8.333333333333334e-05, "loss": 0.3673, "step": 985 }, { "epoch": 1.2555485098287889, "grad_norm": 0.3658965528011322, "learning_rate": 8.375634517766498e-05, "loss": 0.3262, "step": 990 }, { "epoch": 1.2618896639188333, "grad_norm": 0.4075332283973694, "learning_rate": 8.417935702199662e-05, "loss": 0.328, "step": 995 }, { "epoch": 1.2682308180088775, "grad_norm": 0.32019421458244324, "learning_rate": 8.460236886632826e-05, "loss": 0.3084, "step": 1000 }, { "epoch": 1.274571972098922, "grad_norm": 0.44961005449295044, "learning_rate": 8.50253807106599e-05, "loss": 0.3271, "step": 1005 }, { "epoch": 1.2809131261889664, "grad_norm": 0.439188688993454, "learning_rate": 8.544839255499154e-05, "loss": 0.3228, "step": 1010 }, { "epoch": 1.2872542802790108, "grad_norm": 0.3486860990524292, "learning_rate": 8.587140439932318e-05, "loss": 0.3552, "step": 1015 }, { "epoch": 1.2935954343690552, "grad_norm": 0.32728904485702515, "learning_rate": 8.629441624365483e-05, "loss": 0.33, "step": 1020 }, { "epoch": 1.2999365884590994, "grad_norm": 0.3614301085472107, "learning_rate": 8.671742808798646e-05, "loss": 0.3123, "step": 1025 }, { "epoch": 1.3062777425491439, "grad_norm": 0.3490604758262634, "learning_rate": 8.714043993231811e-05, "loss": 0.3064, "step": 1030 }, { "epoch": 1.3126188966391883, "grad_norm": 0.35223057866096497, "learning_rate": 8.756345177664976e-05, "loss": 0.3397, "step": 1035 }, { "epoch": 1.3189600507292327, "grad_norm": 0.31061244010925293, "learning_rate": 8.79864636209814e-05, "loss": 0.3132, "step": 1040 }, { "epoch": 1.3253012048192772, "grad_norm": 0.33759045600891113, "learning_rate": 8.840947546531304e-05, "loss": 0.3097, "step": 1045 }, { "epoch": 1.3316423589093214, "grad_norm": 0.37843048572540283, "learning_rate": 8.883248730964467e-05, "loss": 0.3311, "step": 1050 }, { "epoch": 1.337983512999366, "grad_norm": 0.3383367359638214, "learning_rate": 8.925549915397632e-05, "loss": 0.3082, "step": 1055 }, { "epoch": 1.3443246670894102, "grad_norm": 0.3539436161518097, "learning_rate": 8.967851099830795e-05, "loss": 0.3482, "step": 1060 }, { "epoch": 1.3506658211794547, "grad_norm": 0.3327707350254059, "learning_rate": 9.01015228426396e-05, "loss": 0.3128, "step": 1065 }, { "epoch": 1.357006975269499, "grad_norm": 0.3425014317035675, "learning_rate": 9.052453468697124e-05, "loss": 0.3147, "step": 1070 }, { "epoch": 1.3633481293595433, "grad_norm": 0.37500500679016113, "learning_rate": 9.094754653130289e-05, "loss": 0.3393, "step": 1075 }, { "epoch": 1.369689283449588, "grad_norm": 0.3452490568161011, "learning_rate": 9.137055837563452e-05, "loss": 0.3511, "step": 1080 }, { "epoch": 1.3760304375396322, "grad_norm": 0.35497164726257324, "learning_rate": 9.179357021996617e-05, "loss": 0.3372, "step": 1085 }, { "epoch": 1.3823715916296766, "grad_norm": 0.35102227330207825, "learning_rate": 9.22165820642978e-05, "loss": 0.3039, "step": 1090 }, { "epoch": 1.388712745719721, "grad_norm": 0.3348008692264557, "learning_rate": 9.263959390862943e-05, "loss": 0.386, "step": 1095 }, { "epoch": 1.3950538998097652, "grad_norm": 0.3521282374858856, "learning_rate": 9.306260575296108e-05, "loss": 0.3354, "step": 1100 }, { "epoch": 1.40139505389981, "grad_norm": 0.35749271512031555, "learning_rate": 9.348561759729273e-05, "loss": 0.292, "step": 1105 }, { "epoch": 1.407736207989854, "grad_norm": 0.2867288589477539, "learning_rate": 9.390862944162437e-05, "loss": 0.3338, "step": 1110 }, { "epoch": 1.4140773620798985, "grad_norm": 0.3310149013996124, "learning_rate": 9.433164128595601e-05, "loss": 0.3731, "step": 1115 }, { "epoch": 1.420418516169943, "grad_norm": 0.37106752395629883, "learning_rate": 9.475465313028765e-05, "loss": 0.3352, "step": 1120 }, { "epoch": 1.4267596702599874, "grad_norm": 0.3293766379356384, "learning_rate": 9.51776649746193e-05, "loss": 0.3117, "step": 1125 }, { "epoch": 1.4331008243500318, "grad_norm": 0.3193541467189789, "learning_rate": 9.560067681895093e-05, "loss": 0.3073, "step": 1130 }, { "epoch": 1.439441978440076, "grad_norm": 0.31949228048324585, "learning_rate": 9.602368866328258e-05, "loss": 0.3265, "step": 1135 }, { "epoch": 1.4457831325301205, "grad_norm": 0.3485932946205139, "learning_rate": 9.644670050761421e-05, "loss": 0.3379, "step": 1140 }, { "epoch": 1.452124286620165, "grad_norm": 0.373869389295578, "learning_rate": 9.686971235194586e-05, "loss": 0.3289, "step": 1145 }, { "epoch": 1.4584654407102093, "grad_norm": 0.34280332922935486, "learning_rate": 9.72927241962775e-05, "loss": 0.3187, "step": 1150 }, { "epoch": 1.4648065948002538, "grad_norm": 0.3661528527736664, "learning_rate": 9.771573604060914e-05, "loss": 0.3352, "step": 1155 }, { "epoch": 1.471147748890298, "grad_norm": 0.2989208996295929, "learning_rate": 9.813874788494079e-05, "loss": 0.3223, "step": 1160 }, { "epoch": 1.4774889029803424, "grad_norm": 0.35840898752212524, "learning_rate": 9.856175972927243e-05, "loss": 0.3085, "step": 1165 }, { "epoch": 1.4838300570703868, "grad_norm": 0.34834158420562744, "learning_rate": 9.898477157360407e-05, "loss": 0.3415, "step": 1170 }, { "epoch": 1.4901712111604313, "grad_norm": 0.2929879426956177, "learning_rate": 9.940778341793571e-05, "loss": 0.3211, "step": 1175 }, { "epoch": 1.4965123652504757, "grad_norm": 0.320539265871048, "learning_rate": 9.983079526226735e-05, "loss": 0.3287, "step": 1180 }, { "epoch": 1.50285351934052, "grad_norm": 0.32609283924102783, "learning_rate": 0.00010025380710659899, "loss": 0.3203, "step": 1185 }, { "epoch": 1.5091946734305643, "grad_norm": 0.3080826997756958, "learning_rate": 0.00010067681895093064, "loss": 0.3051, "step": 1190 }, { "epoch": 1.5155358275206088, "grad_norm": 0.3212393820285797, "learning_rate": 0.00010109983079526226, "loss": 0.3207, "step": 1195 }, { "epoch": 1.521876981610653, "grad_norm": 0.33465152978897095, "learning_rate": 0.0001015228426395939, "loss": 0.3358, "step": 1200 }, { "epoch": 1.5282181357006976, "grad_norm": 0.31196466088294983, "learning_rate": 0.00010194585448392555, "loss": 0.3264, "step": 1205 }, { "epoch": 1.5345592897907419, "grad_norm": 0.2974426746368408, "learning_rate": 0.0001023688663282572, "loss": 0.3191, "step": 1210 }, { "epoch": 1.5409004438807863, "grad_norm": 0.27661287784576416, "learning_rate": 0.00010279187817258885, "loss": 0.3047, "step": 1215 }, { "epoch": 1.5472415979708307, "grad_norm": 0.3493057191371918, "learning_rate": 0.00010321489001692048, "loss": 0.3135, "step": 1220 }, { "epoch": 1.553582752060875, "grad_norm": 0.3246723711490631, "learning_rate": 0.00010363790186125213, "loss": 0.3332, "step": 1225 }, { "epoch": 1.5599239061509196, "grad_norm": 0.31354302167892456, "learning_rate": 0.00010406091370558374, "loss": 0.3261, "step": 1230 }, { "epoch": 1.5662650602409638, "grad_norm": 0.268210768699646, "learning_rate": 0.00010448392554991539, "loss": 0.2889, "step": 1235 }, { "epoch": 1.5726062143310082, "grad_norm": 0.4062238037586212, "learning_rate": 0.00010490693739424704, "loss": 0.3801, "step": 1240 }, { "epoch": 1.5789473684210527, "grad_norm": 0.5570194721221924, "learning_rate": 0.00010532994923857868, "loss": 0.3671, "step": 1245 }, { "epoch": 1.5852885225110969, "grad_norm": 0.3068524897098541, "learning_rate": 0.00010575296108291033, "loss": 0.368, "step": 1250 }, { "epoch": 1.5916296766011415, "grad_norm": 0.3504703938961029, "learning_rate": 0.00010617597292724198, "loss": 0.3357, "step": 1255 }, { "epoch": 1.5979708306911857, "grad_norm": 0.33704885840415955, "learning_rate": 0.00010659898477157362, "loss": 0.3006, "step": 1260 }, { "epoch": 1.6043119847812302, "grad_norm": 0.3148198425769806, "learning_rate": 0.00010702199661590524, "loss": 0.369, "step": 1265 }, { "epoch": 1.6106531388712746, "grad_norm": 0.3036929666996002, "learning_rate": 0.00010744500846023689, "loss": 0.3167, "step": 1270 }, { "epoch": 1.6169942929613188, "grad_norm": 0.2927647829055786, "learning_rate": 0.00010786802030456852, "loss": 0.3243, "step": 1275 }, { "epoch": 1.6233354470513635, "grad_norm": 0.28257298469543457, "learning_rate": 0.00010829103214890017, "loss": 0.3033, "step": 1280 }, { "epoch": 1.6296766011414077, "grad_norm": 0.3285750448703766, "learning_rate": 0.00010871404399323182, "loss": 0.3006, "step": 1285 }, { "epoch": 1.636017755231452, "grad_norm": 0.4006665050983429, "learning_rate": 0.00010913705583756346, "loss": 0.3255, "step": 1290 }, { "epoch": 1.6423589093214965, "grad_norm": 0.31462082266807556, "learning_rate": 0.00010956006768189511, "loss": 0.3065, "step": 1295 }, { "epoch": 1.6487000634115407, "grad_norm": 0.3113979697227478, "learning_rate": 0.00010998307952622673, "loss": 0.3249, "step": 1300 }, { "epoch": 1.6550412175015854, "grad_norm": 0.3266550600528717, "learning_rate": 0.00011040609137055838, "loss": 0.2948, "step": 1305 }, { "epoch": 1.6613823715916296, "grad_norm": 0.3315008878707886, "learning_rate": 0.00011082910321489002, "loss": 0.3239, "step": 1310 }, { "epoch": 1.667723525681674, "grad_norm": 0.28523266315460205, "learning_rate": 0.00011125211505922166, "loss": 0.3163, "step": 1315 }, { "epoch": 1.6740646797717185, "grad_norm": 0.2836901843547821, "learning_rate": 0.0001116751269035533, "loss": 0.2959, "step": 1320 }, { "epoch": 1.6804058338617627, "grad_norm": 0.3455132842063904, "learning_rate": 0.00011209813874788495, "loss": 0.3124, "step": 1325 }, { "epoch": 1.6867469879518073, "grad_norm": 0.32125145196914673, "learning_rate": 0.0001125211505922166, "loss": 0.3255, "step": 1330 }, { "epoch": 1.6930881420418515, "grad_norm": 0.34217342734336853, "learning_rate": 0.00011294416243654824, "loss": 0.3519, "step": 1335 }, { "epoch": 1.699429296131896, "grad_norm": 0.30246150493621826, "learning_rate": 0.00011336717428087986, "loss": 0.3264, "step": 1340 }, { "epoch": 1.7057704502219404, "grad_norm": 0.3406198024749756, "learning_rate": 0.00011379018612521151, "loss": 0.3095, "step": 1345 }, { "epoch": 1.7121116043119848, "grad_norm": 0.34308096766471863, "learning_rate": 0.00011421319796954316, "loss": 0.2869, "step": 1350 }, { "epoch": 1.7184527584020293, "grad_norm": 0.2883923351764679, "learning_rate": 0.00011463620981387479, "loss": 0.3279, "step": 1355 }, { "epoch": 1.7247939124920735, "grad_norm": 0.34927430748939514, "learning_rate": 0.00011505922165820644, "loss": 0.3521, "step": 1360 }, { "epoch": 1.731135066582118, "grad_norm": 0.34140199422836304, "learning_rate": 0.00011548223350253808, "loss": 0.3179, "step": 1365 }, { "epoch": 1.7374762206721623, "grad_norm": 0.2626616656780243, "learning_rate": 0.00011590524534686973, "loss": 0.2854, "step": 1370 }, { "epoch": 1.7438173747622068, "grad_norm": 0.2928195595741272, "learning_rate": 0.00011632825719120135, "loss": 0.2823, "step": 1375 }, { "epoch": 1.7501585288522512, "grad_norm": 0.2925654947757721, "learning_rate": 0.000116751269035533, "loss": 0.3209, "step": 1380 }, { "epoch": 1.7564996829422954, "grad_norm": 0.328232079744339, "learning_rate": 0.00011717428087986464, "loss": 0.2864, "step": 1385 }, { "epoch": 1.7628408370323398, "grad_norm": 0.31291720271110535, "learning_rate": 0.00011759729272419629, "loss": 0.32, "step": 1390 }, { "epoch": 1.7691819911223843, "grad_norm": 0.35494890809059143, "learning_rate": 0.00011802030456852793, "loss": 0.3552, "step": 1395 }, { "epoch": 1.7755231452124287, "grad_norm": 0.32256317138671875, "learning_rate": 0.00011844331641285957, "loss": 0.3199, "step": 1400 }, { "epoch": 1.7818642993024731, "grad_norm": 0.32185688614845276, "learning_rate": 0.00011886632825719121, "loss": 0.2944, "step": 1405 }, { "epoch": 1.7882054533925174, "grad_norm": 0.31314074993133545, "learning_rate": 0.00011928934010152283, "loss": 0.3019, "step": 1410 }, { "epoch": 1.7945466074825618, "grad_norm": 0.3140206038951874, "learning_rate": 0.00011971235194585448, "loss": 0.3147, "step": 1415 }, { "epoch": 1.8008877615726062, "grad_norm": 0.33574074506759644, "learning_rate": 0.00012013536379018613, "loss": 0.3215, "step": 1420 }, { "epoch": 1.8072289156626506, "grad_norm": 0.30786749720573425, "learning_rate": 0.00012055837563451777, "loss": 0.291, "step": 1425 }, { "epoch": 1.813570069752695, "grad_norm": 0.27404117584228516, "learning_rate": 0.00012098138747884942, "loss": 0.3205, "step": 1430 }, { "epoch": 1.8199112238427393, "grad_norm": 0.2908042073249817, "learning_rate": 0.00012140439932318107, "loss": 0.3382, "step": 1435 }, { "epoch": 1.8262523779327837, "grad_norm": 0.3181898891925812, "learning_rate": 0.0001218274111675127, "loss": 0.3109, "step": 1440 }, { "epoch": 1.8325935320228282, "grad_norm": 0.2778260111808777, "learning_rate": 0.00012225042301184432, "loss": 0.2804, "step": 1445 }, { "epoch": 1.8389346861128726, "grad_norm": 0.31235480308532715, "learning_rate": 0.00012267343485617598, "loss": 0.35, "step": 1450 }, { "epoch": 1.845275840202917, "grad_norm": 0.28054380416870117, "learning_rate": 0.0001230964467005076, "loss": 0.2927, "step": 1455 }, { "epoch": 1.8516169942929612, "grad_norm": 0.3065265715122223, "learning_rate": 0.00012351945854483927, "loss": 0.2997, "step": 1460 }, { "epoch": 1.8579581483830057, "grad_norm": 0.2624269425868988, "learning_rate": 0.0001239424703891709, "loss": 0.2883, "step": 1465 }, { "epoch": 1.86429930247305, "grad_norm": 0.2894079387187958, "learning_rate": 0.00012436548223350254, "loss": 0.2962, "step": 1470 }, { "epoch": 1.8706404565630945, "grad_norm": 0.28862282633781433, "learning_rate": 0.0001247884940778342, "loss": 0.345, "step": 1475 }, { "epoch": 1.876981610653139, "grad_norm": 0.26577141880989075, "learning_rate": 0.0001252115059221658, "loss": 0.345, "step": 1480 }, { "epoch": 1.8833227647431832, "grad_norm": 0.29924800992012024, "learning_rate": 0.00012563451776649747, "loss": 0.2961, "step": 1485 }, { "epoch": 1.8896639188332276, "grad_norm": 0.32895246148109436, "learning_rate": 0.0001260575296108291, "loss": 0.2963, "step": 1490 }, { "epoch": 1.896005072923272, "grad_norm": 0.3141741454601288, "learning_rate": 0.00012648054145516076, "loss": 0.321, "step": 1495 }, { "epoch": 1.9023462270133165, "grad_norm": 0.30705851316452026, "learning_rate": 0.0001269035532994924, "loss": 0.2904, "step": 1500 }, { "epoch": 1.908687381103361, "grad_norm": 0.31538137793540955, "learning_rate": 0.00012732656514382405, "loss": 0.3673, "step": 1505 }, { "epoch": 1.915028535193405, "grad_norm": 0.2542174160480499, "learning_rate": 0.00012774957698815569, "loss": 0.326, "step": 1510 }, { "epoch": 1.9213696892834495, "grad_norm": 0.31089675426483154, "learning_rate": 0.00012817258883248732, "loss": 0.31, "step": 1515 }, { "epoch": 1.927710843373494, "grad_norm": 0.3491573631763458, "learning_rate": 0.00012859560067681895, "loss": 0.3017, "step": 1520 }, { "epoch": 1.9340519974635384, "grad_norm": 0.2782622277736664, "learning_rate": 0.00012901861252115058, "loss": 0.3378, "step": 1525 }, { "epoch": 1.9403931515535828, "grad_norm": 0.2578594386577606, "learning_rate": 0.00012944162436548224, "loss": 0.3032, "step": 1530 }, { "epoch": 1.946734305643627, "grad_norm": 0.30477938055992126, "learning_rate": 0.00012986463620981388, "loss": 0.3033, "step": 1535 }, { "epoch": 1.9530754597336717, "grad_norm": 0.2516118586063385, "learning_rate": 0.00013028764805414554, "loss": 0.2952, "step": 1540 }, { "epoch": 1.959416613823716, "grad_norm": 0.27054014801979065, "learning_rate": 0.00013071065989847717, "loss": 0.3075, "step": 1545 }, { "epoch": 1.9657577679137603, "grad_norm": 0.2583960294723511, "learning_rate": 0.0001311336717428088, "loss": 0.2908, "step": 1550 }, { "epoch": 1.9720989220038048, "grad_norm": 0.354736328125, "learning_rate": 0.00013155668358714044, "loss": 0.3236, "step": 1555 }, { "epoch": 1.978440076093849, "grad_norm": 0.2985624372959137, "learning_rate": 0.00013197969543147207, "loss": 0.2893, "step": 1560 }, { "epoch": 1.9847812301838936, "grad_norm": 0.2574755549430847, "learning_rate": 0.00013240270727580373, "loss": 0.3202, "step": 1565 }, { "epoch": 1.9911223842739378, "grad_norm": 0.28396016359329224, "learning_rate": 0.00013282571912013536, "loss": 0.294, "step": 1570 }, { "epoch": 1.9974635383639823, "grad_norm": 0.2647438943386078, "learning_rate": 0.00013324873096446702, "loss": 0.2968, "step": 1575 }, { "epoch": 2.0, "eval_loss": 0.3381074368953705, "eval_runtime": 1444.3134, "eval_samples_per_second": 1.092, "eval_steps_per_second": 1.092, "step": 1577 }, { "epoch": 2.0038046924540267, "grad_norm": 0.2823465168476105, "learning_rate": 0.00013367174280879866, "loss": 0.3023, "step": 1580 }, { "epoch": 2.010145846544071, "grad_norm": 0.24398605525493622, "learning_rate": 0.0001340947546531303, "loss": 0.3073, "step": 1585 }, { "epoch": 2.0164870006341156, "grad_norm": 0.30086255073547363, "learning_rate": 0.00013451776649746192, "loss": 0.2813, "step": 1590 }, { "epoch": 2.0228281547241598, "grad_norm": 0.2933647632598877, "learning_rate": 0.00013494077834179358, "loss": 0.3563, "step": 1595 }, { "epoch": 2.029169308814204, "grad_norm": 0.26003360748291016, "learning_rate": 0.00013536379018612522, "loss": 0.3003, "step": 1600 }, { "epoch": 2.0355104629042486, "grad_norm": 0.26458802819252014, "learning_rate": 0.00013578680203045685, "loss": 0.2759, "step": 1605 }, { "epoch": 2.041851616994293, "grad_norm": 0.35564297437667847, "learning_rate": 0.0001362098138747885, "loss": 0.2774, "step": 1610 }, { "epoch": 2.0481927710843375, "grad_norm": 0.24942746758460999, "learning_rate": 0.00013663282571912014, "loss": 0.314, "step": 1615 }, { "epoch": 2.0545339251743817, "grad_norm": 0.3398435115814209, "learning_rate": 0.00013705583756345178, "loss": 0.3038, "step": 1620 }, { "epoch": 2.060875079264426, "grad_norm": 0.2435198873281479, "learning_rate": 0.0001374788494077834, "loss": 0.2691, "step": 1625 }, { "epoch": 2.0672162333544706, "grad_norm": 0.27647852897644043, "learning_rate": 0.00013790186125211507, "loss": 0.3356, "step": 1630 }, { "epoch": 2.073557387444515, "grad_norm": 0.26509585976600647, "learning_rate": 0.0001383248730964467, "loss": 0.286, "step": 1635 }, { "epoch": 2.0798985415345594, "grad_norm": 0.2631427049636841, "learning_rate": 0.00013874788494077836, "loss": 0.2705, "step": 1640 }, { "epoch": 2.0862396956246037, "grad_norm": 0.26194998621940613, "learning_rate": 0.00013917089678511, "loss": 0.2786, "step": 1645 }, { "epoch": 2.092580849714648, "grad_norm": 0.2973628342151642, "learning_rate": 0.00013959390862944163, "loss": 0.2899, "step": 1650 }, { "epoch": 2.0989220038046925, "grad_norm": 0.28257042169570923, "learning_rate": 0.0001400169204737733, "loss": 0.2976, "step": 1655 }, { "epoch": 2.1052631578947367, "grad_norm": 0.27788791060447693, "learning_rate": 0.0001404399323181049, "loss": 0.2859, "step": 1660 }, { "epoch": 2.1116043119847814, "grad_norm": 0.2781310975551605, "learning_rate": 0.00014086294416243656, "loss": 0.3062, "step": 1665 }, { "epoch": 2.1179454660748256, "grad_norm": 0.2550041079521179, "learning_rate": 0.0001412859560067682, "loss": 0.2651, "step": 1670 }, { "epoch": 2.12428662016487, "grad_norm": 0.28445178270339966, "learning_rate": 0.00014170896785109985, "loss": 0.3137, "step": 1675 }, { "epoch": 2.1306277742549145, "grad_norm": 0.2765163779258728, "learning_rate": 0.00014213197969543148, "loss": 0.2653, "step": 1680 }, { "epoch": 2.1369689283449587, "grad_norm": 0.27409252524375916, "learning_rate": 0.00014255499153976311, "loss": 0.3002, "step": 1685 }, { "epoch": 2.1433100824350033, "grad_norm": 0.29509592056274414, "learning_rate": 0.00014297800338409477, "loss": 0.3053, "step": 1690 }, { "epoch": 2.1496512365250475, "grad_norm": 0.24936360120773315, "learning_rate": 0.00014340101522842638, "loss": 0.2746, "step": 1695 }, { "epoch": 2.1559923906150917, "grad_norm": 0.3168596029281616, "learning_rate": 0.00014382402707275804, "loss": 0.2698, "step": 1700 }, { "epoch": 2.1623335447051364, "grad_norm": 0.26993608474731445, "learning_rate": 0.00014424703891708967, "loss": 0.2717, "step": 1705 }, { "epoch": 2.1686746987951806, "grad_norm": 0.2598145306110382, "learning_rate": 0.00014467005076142133, "loss": 0.2856, "step": 1710 }, { "epoch": 2.1750158528852253, "grad_norm": 0.2613801956176758, "learning_rate": 0.00014509306260575297, "loss": 0.2986, "step": 1715 }, { "epoch": 2.1813570069752695, "grad_norm": 0.2724541127681732, "learning_rate": 0.00014551607445008463, "loss": 0.3055, "step": 1720 }, { "epoch": 2.187698161065314, "grad_norm": 0.30996379256248474, "learning_rate": 0.00014593908629441626, "loss": 0.3015, "step": 1725 }, { "epoch": 2.1940393151553583, "grad_norm": 0.2557225823402405, "learning_rate": 0.0001463620981387479, "loss": 0.2771, "step": 1730 }, { "epoch": 2.2003804692454025, "grad_norm": 0.32113954424858093, "learning_rate": 0.00014678510998307953, "loss": 0.2707, "step": 1735 }, { "epoch": 2.206721623335447, "grad_norm": 0.2548729181289673, "learning_rate": 0.00014720812182741116, "loss": 0.3035, "step": 1740 }, { "epoch": 2.2130627774254914, "grad_norm": 0.3441978693008423, "learning_rate": 0.00014763113367174282, "loss": 0.3068, "step": 1745 }, { "epoch": 2.2194039315155356, "grad_norm": 0.2538011372089386, "learning_rate": 0.00014805414551607445, "loss": 0.2783, "step": 1750 }, { "epoch": 2.2257450856055803, "grad_norm": 0.23306721448898315, "learning_rate": 0.0001484771573604061, "loss": 0.26, "step": 1755 }, { "epoch": 2.2320862396956245, "grad_norm": 0.2583463490009308, "learning_rate": 0.00014890016920473775, "loss": 0.2781, "step": 1760 }, { "epoch": 2.238427393785669, "grad_norm": 0.25331056118011475, "learning_rate": 0.00014932318104906938, "loss": 0.2812, "step": 1765 }, { "epoch": 2.2447685478757133, "grad_norm": 0.24994800984859467, "learning_rate": 0.000149746192893401, "loss": 0.2656, "step": 1770 }, { "epoch": 2.251109701965758, "grad_norm": 0.28956037759780884, "learning_rate": 0.00015016920473773267, "loss": 0.3032, "step": 1775 }, { "epoch": 2.257450856055802, "grad_norm": 0.2774043679237366, "learning_rate": 0.0001505922165820643, "loss": 0.3067, "step": 1780 }, { "epoch": 2.2637920101458464, "grad_norm": 0.25354713201522827, "learning_rate": 0.00015101522842639594, "loss": 0.306, "step": 1785 }, { "epoch": 2.270133164235891, "grad_norm": 0.26151514053344727, "learning_rate": 0.0001514382402707276, "loss": 0.2679, "step": 1790 }, { "epoch": 2.2764743183259353, "grad_norm": 0.2814745008945465, "learning_rate": 0.00015186125211505923, "loss": 0.2987, "step": 1795 }, { "epoch": 2.2828154724159795, "grad_norm": 0.2938013970851898, "learning_rate": 0.00015228426395939087, "loss": 0.3113, "step": 1800 }, { "epoch": 2.289156626506024, "grad_norm": 0.2925238311290741, "learning_rate": 0.0001527072758037225, "loss": 0.2816, "step": 1805 }, { "epoch": 2.2954977805960683, "grad_norm": 0.3136110305786133, "learning_rate": 0.00015313028764805416, "loss": 0.3122, "step": 1810 }, { "epoch": 2.301838934686113, "grad_norm": 0.3624803125858307, "learning_rate": 0.0001535532994923858, "loss": 0.2974, "step": 1815 }, { "epoch": 2.308180088776157, "grad_norm": 0.2709241211414337, "learning_rate": 0.00015397631133671742, "loss": 0.2954, "step": 1820 }, { "epoch": 2.314521242866202, "grad_norm": 0.2335352599620819, "learning_rate": 0.00015439932318104908, "loss": 0.2984, "step": 1825 }, { "epoch": 2.320862396956246, "grad_norm": 0.23858477175235748, "learning_rate": 0.00015482233502538072, "loss": 0.2874, "step": 1830 }, { "epoch": 2.3272035510462903, "grad_norm": 0.2193860113620758, "learning_rate": 0.00015524534686971235, "loss": 0.2828, "step": 1835 }, { "epoch": 2.333544705136335, "grad_norm": 0.2635989487171173, "learning_rate": 0.00015566835871404398, "loss": 0.3162, "step": 1840 }, { "epoch": 2.339885859226379, "grad_norm": 0.23063664138317108, "learning_rate": 0.00015609137055837564, "loss": 0.2771, "step": 1845 }, { "epoch": 2.3462270133164234, "grad_norm": 0.2778102159500122, "learning_rate": 0.00015651438240270728, "loss": 0.3032, "step": 1850 }, { "epoch": 2.352568167406468, "grad_norm": 0.3227561116218567, "learning_rate": 0.00015693739424703894, "loss": 0.308, "step": 1855 }, { "epoch": 2.3589093214965122, "grad_norm": 0.2782094478607178, "learning_rate": 0.00015736040609137057, "loss": 0.2702, "step": 1860 }, { "epoch": 2.365250475586557, "grad_norm": 0.2572447657585144, "learning_rate": 0.0001577834179357022, "loss": 0.3029, "step": 1865 }, { "epoch": 2.371591629676601, "grad_norm": 0.24774974584579468, "learning_rate": 0.00015820642978003384, "loss": 0.2784, "step": 1870 }, { "epoch": 2.3779327837666457, "grad_norm": 0.2644084095954895, "learning_rate": 0.00015862944162436547, "loss": 0.2832, "step": 1875 }, { "epoch": 2.38427393785669, "grad_norm": 0.2516373097896576, "learning_rate": 0.00015905245346869713, "loss": 0.2988, "step": 1880 }, { "epoch": 2.390615091946734, "grad_norm": 0.24512116611003876, "learning_rate": 0.00015947546531302876, "loss": 0.2762, "step": 1885 }, { "epoch": 2.396956246036779, "grad_norm": 0.23743340373039246, "learning_rate": 0.00015989847715736042, "loss": 0.2859, "step": 1890 }, { "epoch": 2.403297400126823, "grad_norm": 0.21675048768520355, "learning_rate": 0.00016032148900169206, "loss": 0.283, "step": 1895 }, { "epoch": 2.4096385542168672, "grad_norm": 0.2559918761253357, "learning_rate": 0.00016074450084602372, "loss": 0.2979, "step": 1900 }, { "epoch": 2.415979708306912, "grad_norm": 0.2581586539745331, "learning_rate": 0.00016116751269035532, "loss": 0.2825, "step": 1905 }, { "epoch": 2.422320862396956, "grad_norm": 0.19741621613502502, "learning_rate": 0.00016159052453468698, "loss": 0.2867, "step": 1910 }, { "epoch": 2.4286620164870008, "grad_norm": 0.21792180836200714, "learning_rate": 0.00016201353637901862, "loss": 0.313, "step": 1915 }, { "epoch": 2.435003170577045, "grad_norm": 0.2565922141075134, "learning_rate": 0.00016243654822335025, "loss": 0.2799, "step": 1920 }, { "epoch": 2.4413443246670896, "grad_norm": 0.24959257245063782, "learning_rate": 0.0001628595600676819, "loss": 0.2739, "step": 1925 }, { "epoch": 2.447685478757134, "grad_norm": 0.23778261244297028, "learning_rate": 0.00016328257191201354, "loss": 0.293, "step": 1930 }, { "epoch": 2.454026632847178, "grad_norm": 0.26416465640068054, "learning_rate": 0.0001637055837563452, "loss": 0.2882, "step": 1935 }, { "epoch": 2.4603677869372227, "grad_norm": 0.21522019803524017, "learning_rate": 0.00016412859560067684, "loss": 0.2657, "step": 1940 }, { "epoch": 2.466708941027267, "grad_norm": 0.23454619944095612, "learning_rate": 0.00016455160744500847, "loss": 0.2785, "step": 1945 }, { "epoch": 2.473050095117311, "grad_norm": 0.2349841147661209, "learning_rate": 0.0001649746192893401, "loss": 0.2591, "step": 1950 }, { "epoch": 2.4793912492073558, "grad_norm": 0.2514456808567047, "learning_rate": 0.00016539763113367176, "loss": 0.2691, "step": 1955 }, { "epoch": 2.4857324032974, "grad_norm": 0.221239373087883, "learning_rate": 0.0001658206429780034, "loss": 0.2866, "step": 1960 }, { "epoch": 2.4920735573874446, "grad_norm": 0.24372516572475433, "learning_rate": 0.00016624365482233503, "loss": 0.297, "step": 1965 }, { "epoch": 2.498414711477489, "grad_norm": 0.2298443466424942, "learning_rate": 0.0001666666666666667, "loss": 0.2703, "step": 1970 }, { "epoch": 2.5047558655675335, "grad_norm": 0.2979632616043091, "learning_rate": 0.00016708967851099832, "loss": 0.3037, "step": 1975 }, { "epoch": 2.5110970196575777, "grad_norm": 0.21895724534988403, "learning_rate": 0.00016751269035532995, "loss": 0.2846, "step": 1980 }, { "epoch": 2.517438173747622, "grad_norm": 0.28638067841529846, "learning_rate": 0.0001679357021996616, "loss": 0.3385, "step": 1985 }, { "epoch": 2.5237793278376666, "grad_norm": 0.22999686002731323, "learning_rate": 0.00016835871404399325, "loss": 0.2905, "step": 1990 }, { "epoch": 2.5301204819277108, "grad_norm": 0.25732505321502686, "learning_rate": 0.00016878172588832488, "loss": 0.2597, "step": 1995 }, { "epoch": 2.536461636017755, "grad_norm": 0.22246240079402924, "learning_rate": 0.00016920473773265651, "loss": 0.2801, "step": 2000 }, { "epoch": 2.5428027901077996, "grad_norm": 0.2410714477300644, "learning_rate": 0.00016962774957698817, "loss": 0.2824, "step": 2005 }, { "epoch": 2.549143944197844, "grad_norm": 0.21686236560344696, "learning_rate": 0.0001700507614213198, "loss": 0.267, "step": 2010 }, { "epoch": 2.5554850982878885, "grad_norm": 0.2855628728866577, "learning_rate": 0.00017047377326565144, "loss": 0.2833, "step": 2015 }, { "epoch": 2.5618262523779327, "grad_norm": 0.22587080299854279, "learning_rate": 0.00017089678510998307, "loss": 0.271, "step": 2020 }, { "epoch": 2.5681674064679774, "grad_norm": 0.29146429896354675, "learning_rate": 0.00017131979695431473, "loss": 0.3015, "step": 2025 }, { "epoch": 2.5745085605580216, "grad_norm": 0.2501736283302307, "learning_rate": 0.00017174280879864637, "loss": 0.2704, "step": 2030 }, { "epoch": 2.580849714648066, "grad_norm": 0.2167421281337738, "learning_rate": 0.00017216582064297803, "loss": 0.2932, "step": 2035 }, { "epoch": 2.5871908687381104, "grad_norm": 0.260665625333786, "learning_rate": 0.00017258883248730966, "loss": 0.2858, "step": 2040 }, { "epoch": 2.5935320228281546, "grad_norm": 0.21929213404655457, "learning_rate": 0.0001730118443316413, "loss": 0.2644, "step": 2045 }, { "epoch": 2.599873176918199, "grad_norm": 0.2527408301830292, "learning_rate": 0.00017343485617597293, "loss": 0.2994, "step": 2050 }, { "epoch": 2.6062143310082435, "grad_norm": 0.2771224081516266, "learning_rate": 0.00017385786802030456, "loss": 0.2892, "step": 2055 }, { "epoch": 2.6125554850982877, "grad_norm": 0.24776731431484222, "learning_rate": 0.00017428087986463622, "loss": 0.3257, "step": 2060 }, { "epoch": 2.6188966391883324, "grad_norm": 0.2014625370502472, "learning_rate": 0.00017470389170896785, "loss": 0.2897, "step": 2065 }, { "epoch": 2.6252377932783766, "grad_norm": 0.21169061958789825, "learning_rate": 0.0001751269035532995, "loss": 0.2796, "step": 2070 }, { "epoch": 2.6315789473684212, "grad_norm": 0.24923637509346008, "learning_rate": 0.00017554991539763115, "loss": 0.2709, "step": 2075 }, { "epoch": 2.6379201014584654, "grad_norm": 0.23817142844200134, "learning_rate": 0.0001759729272419628, "loss": 0.3099, "step": 2080 }, { "epoch": 2.6442612555485097, "grad_norm": 0.2860507071018219, "learning_rate": 0.0001763959390862944, "loss": 0.273, "step": 2085 }, { "epoch": 2.6506024096385543, "grad_norm": 0.2566317021846771, "learning_rate": 0.00017681895093062607, "loss": 0.2942, "step": 2090 }, { "epoch": 2.6569435637285985, "grad_norm": 0.19406455755233765, "learning_rate": 0.0001772419627749577, "loss": 0.2653, "step": 2095 }, { "epoch": 2.6632847178186427, "grad_norm": 0.20335452258586884, "learning_rate": 0.00017766497461928934, "loss": 0.2679, "step": 2100 }, { "epoch": 2.6696258719086874, "grad_norm": 0.22100548446178436, "learning_rate": 0.000178087986463621, "loss": 0.2716, "step": 2105 }, { "epoch": 2.675967025998732, "grad_norm": 0.23922790586948395, "learning_rate": 0.00017851099830795263, "loss": 0.2673, "step": 2110 }, { "epoch": 2.6823081800887763, "grad_norm": 0.36680835485458374, "learning_rate": 0.0001789340101522843, "loss": 0.3166, "step": 2115 }, { "epoch": 2.6886493341788205, "grad_norm": 0.20901376008987427, "learning_rate": 0.0001793570219966159, "loss": 0.2563, "step": 2120 }, { "epoch": 2.694990488268865, "grad_norm": 0.4551635980606079, "learning_rate": 0.00017978003384094756, "loss": 0.2905, "step": 2125 }, { "epoch": 2.7013316423589093, "grad_norm": 0.21061168611049652, "learning_rate": 0.0001802030456852792, "loss": 0.2536, "step": 2130 }, { "epoch": 2.7076727964489535, "grad_norm": 0.2249498814344406, "learning_rate": 0.00018062605752961082, "loss": 0.2843, "step": 2135 }, { "epoch": 2.714013950538998, "grad_norm": 0.21941763162612915, "learning_rate": 0.00018104906937394248, "loss": 0.2596, "step": 2140 }, { "epoch": 2.7203551046290424, "grad_norm": 0.24866026639938354, "learning_rate": 0.00018147208121827412, "loss": 0.2616, "step": 2145 }, { "epoch": 2.7266962587190866, "grad_norm": 0.22482390701770782, "learning_rate": 0.00018189509306260578, "loss": 0.2765, "step": 2150 }, { "epoch": 2.7330374128091313, "grad_norm": 0.32635489106178284, "learning_rate": 0.00018231810490693738, "loss": 0.2833, "step": 2155 }, { "epoch": 2.739378566899176, "grad_norm": 0.21408367156982422, "learning_rate": 0.00018274111675126904, "loss": 0.2651, "step": 2160 }, { "epoch": 2.74571972098922, "grad_norm": 0.3729408383369446, "learning_rate": 0.00018316412859560068, "loss": 0.2685, "step": 2165 }, { "epoch": 2.7520608750792643, "grad_norm": 0.23560300469398499, "learning_rate": 0.00018358714043993234, "loss": 0.2597, "step": 2170 }, { "epoch": 2.758402029169309, "grad_norm": 0.25893548130989075, "learning_rate": 0.00018401015228426397, "loss": 0.2656, "step": 2175 }, { "epoch": 2.764743183259353, "grad_norm": 0.2248678356409073, "learning_rate": 0.0001844331641285956, "loss": 0.2533, "step": 2180 }, { "epoch": 2.7710843373493974, "grad_norm": 0.20640915632247925, "learning_rate": 0.00018485617597292726, "loss": 0.2979, "step": 2185 }, { "epoch": 2.777425491439442, "grad_norm": 0.2395174354314804, "learning_rate": 0.00018527918781725887, "loss": 0.3485, "step": 2190 }, { "epoch": 2.7837666455294863, "grad_norm": 0.272970587015152, "learning_rate": 0.00018570219966159053, "loss": 0.3132, "step": 2195 }, { "epoch": 2.7901077996195305, "grad_norm": 0.23604224622249603, "learning_rate": 0.00018612521150592216, "loss": 0.2581, "step": 2200 }, { "epoch": 2.796448953709575, "grad_norm": 0.21665266156196594, "learning_rate": 0.00018654822335025382, "loss": 0.2775, "step": 2205 }, { "epoch": 2.80279010779962, "grad_norm": 0.26417961716651917, "learning_rate": 0.00018697123519458546, "loss": 0.2937, "step": 2210 }, { "epoch": 2.809131261889664, "grad_norm": 0.2066638171672821, "learning_rate": 0.00018739424703891712, "loss": 0.2738, "step": 2215 }, { "epoch": 2.815472415979708, "grad_norm": 0.2561461329460144, "learning_rate": 0.00018781725888324875, "loss": 0.2644, "step": 2220 }, { "epoch": 2.821813570069753, "grad_norm": 0.2183157354593277, "learning_rate": 0.00018824027072758038, "loss": 0.2666, "step": 2225 }, { "epoch": 2.828154724159797, "grad_norm": 0.2652731239795685, "learning_rate": 0.00018866328257191202, "loss": 0.2907, "step": 2230 }, { "epoch": 2.8344958782498413, "grad_norm": 0.20578616857528687, "learning_rate": 0.00018908629441624365, "loss": 0.2668, "step": 2235 }, { "epoch": 2.840837032339886, "grad_norm": 0.25196900963783264, "learning_rate": 0.0001895093062605753, "loss": 0.3191, "step": 2240 }, { "epoch": 2.84717818642993, "grad_norm": 0.19236379861831665, "learning_rate": 0.00018993231810490694, "loss": 0.2523, "step": 2245 }, { "epoch": 2.853519340519975, "grad_norm": 0.20920859277248383, "learning_rate": 0.0001903553299492386, "loss": 0.2691, "step": 2250 }, { "epoch": 2.859860494610019, "grad_norm": 0.22025133669376373, "learning_rate": 0.00019077834179357023, "loss": 0.2785, "step": 2255 }, { "epoch": 2.8662016487000637, "grad_norm": 0.21243880689144135, "learning_rate": 0.00019120135363790187, "loss": 0.2869, "step": 2260 }, { "epoch": 2.872542802790108, "grad_norm": 0.19683094322681427, "learning_rate": 0.0001916243654822335, "loss": 0.2586, "step": 2265 }, { "epoch": 2.878883956880152, "grad_norm": 0.19929291307926178, "learning_rate": 0.00019204737732656516, "loss": 0.2693, "step": 2270 }, { "epoch": 2.8852251109701967, "grad_norm": 0.3156428039073944, "learning_rate": 0.0001924703891708968, "loss": 0.3124, "step": 2275 }, { "epoch": 2.891566265060241, "grad_norm": 0.20877346396446228, "learning_rate": 0.00019289340101522843, "loss": 0.2798, "step": 2280 }, { "epoch": 2.897907419150285, "grad_norm": 0.2895923852920532, "learning_rate": 0.0001933164128595601, "loss": 0.2985, "step": 2285 }, { "epoch": 2.90424857324033, "grad_norm": 0.21042507886886597, "learning_rate": 0.00019373942470389172, "loss": 0.2904, "step": 2290 }, { "epoch": 2.910589727330374, "grad_norm": 0.20666979253292084, "learning_rate": 0.00019416243654822338, "loss": 0.2501, "step": 2295 }, { "epoch": 2.9169308814204187, "grad_norm": 0.21271313726902008, "learning_rate": 0.000194585448392555, "loss": 0.257, "step": 2300 }, { "epoch": 2.923272035510463, "grad_norm": 0.21407407522201538, "learning_rate": 0.00019500846023688665, "loss": 0.2587, "step": 2305 }, { "epoch": 2.9296131896005075, "grad_norm": 0.3173685073852539, "learning_rate": 0.00019543147208121828, "loss": 0.314, "step": 2310 }, { "epoch": 2.9359543436905517, "grad_norm": 0.1981971561908722, "learning_rate": 0.0001958544839255499, "loss": 0.2547, "step": 2315 }, { "epoch": 2.942295497780596, "grad_norm": 0.1785871833562851, "learning_rate": 0.00019627749576988157, "loss": 0.2736, "step": 2320 }, { "epoch": 2.9486366518706406, "grad_norm": 0.3197482228279114, "learning_rate": 0.0001967005076142132, "loss": 0.274, "step": 2325 }, { "epoch": 2.954977805960685, "grad_norm": 0.2923840880393982, "learning_rate": 0.00019712351945854487, "loss": 0.2786, "step": 2330 }, { "epoch": 2.961318960050729, "grad_norm": 0.2850870192050934, "learning_rate": 0.00019754653130287647, "loss": 0.2953, "step": 2335 }, { "epoch": 2.9676601141407737, "grad_norm": 0.28410395979881287, "learning_rate": 0.00019796954314720813, "loss": 0.2691, "step": 2340 }, { "epoch": 2.974001268230818, "grad_norm": 0.20453417301177979, "learning_rate": 0.00019839255499153977, "loss": 0.2653, "step": 2345 }, { "epoch": 2.9803424223208625, "grad_norm": 0.21030914783477783, "learning_rate": 0.00019881556683587143, "loss": 0.2602, "step": 2350 }, { "epoch": 2.9866835764109068, "grad_norm": 0.18885819613933563, "learning_rate": 0.00019923857868020306, "loss": 0.2821, "step": 2355 }, { "epoch": 2.9930247305009514, "grad_norm": 0.24666975438594818, "learning_rate": 0.0001996615905245347, "loss": 0.2911, "step": 2360 }, { "epoch": 2.9993658845909956, "grad_norm": 0.2607840895652771, "learning_rate": 0.00019999999890984115, "loss": 0.2658, "step": 2365 }, { "epoch": 2.9993658845909956, "eval_loss": 0.3185872435569763, "eval_runtime": 1444.2327, "eval_samples_per_second": 1.092, "eval_steps_per_second": 1.092, "step": 2365 }, { "epoch": 3.00570703868104, "grad_norm": 0.19379326701164246, "learning_rate": 0.00019999996075428382, "loss": 0.2413, "step": 2370 }, { "epoch": 3.0120481927710845, "grad_norm": 0.29591313004493713, "learning_rate": 0.00019999986809080767, "loss": 0.2548, "step": 2375 }, { "epoch": 3.0183893468611287, "grad_norm": 0.1889077126979828, "learning_rate": 0.00019999972091946314, "loss": 0.2336, "step": 2380 }, { "epoch": 3.024730500951173, "grad_norm": 0.26883465051651, "learning_rate": 0.00019999951924033055, "loss": 0.2653, "step": 2385 }, { "epoch": 3.0310716550412176, "grad_norm": 0.20499426126480103, "learning_rate": 0.00019999926305351977, "loss": 0.2416, "step": 2390 }, { "epoch": 3.0374128091312618, "grad_norm": 0.21682646870613098, "learning_rate": 0.00019999895235917043, "loss": 0.3044, "step": 2395 }, { "epoch": 3.0437539632213064, "grad_norm": 0.200510635972023, "learning_rate": 0.00019999858715745195, "loss": 0.2431, "step": 2400 }, { "epoch": 3.0500951173113506, "grad_norm": 0.2255992293357849, "learning_rate": 0.0001999981674485633, "loss": 0.2817, "step": 2405 }, { "epoch": 3.056436271401395, "grad_norm": 0.39606860280036926, "learning_rate": 0.00019999769323273333, "loss": 0.2543, "step": 2410 }, { "epoch": 3.0627774254914395, "grad_norm": 0.20807872712612152, "learning_rate": 0.0001999971645102205, "loss": 0.2722, "step": 2415 }, { "epoch": 3.0691185795814837, "grad_norm": 0.23841217160224915, "learning_rate": 0.000199996581281313, "loss": 0.2661, "step": 2420 }, { "epoch": 3.0754597336715284, "grad_norm": 0.2552984654903412, "learning_rate": 0.00019999594354632873, "loss": 0.2791, "step": 2425 }, { "epoch": 3.0818008877615726, "grad_norm": 0.22511588037014008, "learning_rate": 0.00019999525130561533, "loss": 0.2435, "step": 2430 }, { "epoch": 3.088142041851617, "grad_norm": 0.2062397599220276, "learning_rate": 0.00019999450455955014, "loss": 0.2471, "step": 2435 }, { "epoch": 3.0944831959416614, "grad_norm": 0.1969393938779831, "learning_rate": 0.00019999370330854013, "loss": 0.2356, "step": 2440 }, { "epoch": 3.1008243500317056, "grad_norm": 0.23209957778453827, "learning_rate": 0.00019999284755302211, "loss": 0.2429, "step": 2445 }, { "epoch": 3.1071655041217503, "grad_norm": 0.22277694940567017, "learning_rate": 0.0001999919372934625, "loss": 0.2567, "step": 2450 }, { "epoch": 3.1135066582117945, "grad_norm": 0.22833660244941711, "learning_rate": 0.00019999097253035748, "loss": 0.2607, "step": 2455 }, { "epoch": 3.119847812301839, "grad_norm": 0.22441290318965912, "learning_rate": 0.0001999899532642329, "loss": 0.2359, "step": 2460 }, { "epoch": 3.1261889663918834, "grad_norm": 0.3228180706501007, "learning_rate": 0.00019998887949564437, "loss": 0.2879, "step": 2465 }, { "epoch": 3.1325301204819276, "grad_norm": 0.28033703565597534, "learning_rate": 0.0001999877512251772, "loss": 0.2625, "step": 2470 }, { "epoch": 3.1388712745719722, "grad_norm": 0.17552833259105682, "learning_rate": 0.00019998656845344632, "loss": 0.2639, "step": 2475 }, { "epoch": 3.1452124286620164, "grad_norm": 0.19929875433444977, "learning_rate": 0.00019998533118109645, "loss": 0.254, "step": 2480 }, { "epoch": 3.1515535827520607, "grad_norm": 0.23680488765239716, "learning_rate": 0.00019998403940880204, "loss": 0.2395, "step": 2485 }, { "epoch": 3.1578947368421053, "grad_norm": 0.2649296522140503, "learning_rate": 0.0001999826931372672, "loss": 0.2838, "step": 2490 }, { "epoch": 3.1642358909321495, "grad_norm": 0.21219000220298767, "learning_rate": 0.00019998129236722573, "loss": 0.2495, "step": 2495 }, { "epoch": 3.170577045022194, "grad_norm": 0.1842680722475052, "learning_rate": 0.00019997983709944117, "loss": 0.267, "step": 2500 }, { "epoch": 3.1769181991122384, "grad_norm": 0.20653505623340607, "learning_rate": 0.00019997832733470677, "loss": 0.2369, "step": 2505 }, { "epoch": 3.183259353202283, "grad_norm": 0.22465841472148895, "learning_rate": 0.00019997676307384547, "loss": 0.2397, "step": 2510 }, { "epoch": 3.1896005072923272, "grad_norm": 0.2130175679922104, "learning_rate": 0.0001999751443177099, "loss": 0.2538, "step": 2515 }, { "epoch": 3.1959416613823715, "grad_norm": 0.42864325642585754, "learning_rate": 0.0001999734710671824, "loss": 0.2607, "step": 2520 }, { "epoch": 3.202282815472416, "grad_norm": 0.19185760617256165, "learning_rate": 0.00019997174332317505, "loss": 0.2418, "step": 2525 }, { "epoch": 3.2086239695624603, "grad_norm": 0.2186804860830307, "learning_rate": 0.00019996996108662964, "loss": 0.2591, "step": 2530 }, { "epoch": 3.2149651236525045, "grad_norm": 0.1866760402917862, "learning_rate": 0.00019996812435851756, "loss": 0.2466, "step": 2535 }, { "epoch": 3.221306277742549, "grad_norm": 0.2104412466287613, "learning_rate": 0.00019996623313984, "loss": 0.2488, "step": 2540 }, { "epoch": 3.2276474318325934, "grad_norm": 0.1959371566772461, "learning_rate": 0.00019996428743162783, "loss": 0.23, "step": 2545 }, { "epoch": 3.233988585922638, "grad_norm": 0.197498619556427, "learning_rate": 0.00019996228723494165, "loss": 0.2366, "step": 2550 }, { "epoch": 3.2403297400126823, "grad_norm": 0.17135101556777954, "learning_rate": 0.00019996023255087163, "loss": 0.2416, "step": 2555 }, { "epoch": 3.246670894102727, "grad_norm": 0.3009037673473358, "learning_rate": 0.00019995812338053783, "loss": 0.2645, "step": 2560 }, { "epoch": 3.253012048192771, "grad_norm": 0.19022932648658752, "learning_rate": 0.00019995595972508988, "loss": 0.249, "step": 2565 }, { "epoch": 3.2593532022828153, "grad_norm": 0.2203480452299118, "learning_rate": 0.00019995374158570718, "loss": 0.2431, "step": 2570 }, { "epoch": 3.26569435637286, "grad_norm": 0.22309836745262146, "learning_rate": 0.00019995146896359872, "loss": 0.2357, "step": 2575 }, { "epoch": 3.272035510462904, "grad_norm": 0.23879647254943848, "learning_rate": 0.00019994914186000328, "loss": 0.2452, "step": 2580 }, { "epoch": 3.2783766645529484, "grad_norm": 0.18533390760421753, "learning_rate": 0.00019994676027618936, "loss": 0.2633, "step": 2585 }, { "epoch": 3.284717818642993, "grad_norm": 0.23272106051445007, "learning_rate": 0.00019994432421345507, "loss": 0.2407, "step": 2590 }, { "epoch": 3.2910589727330373, "grad_norm": 0.26440662145614624, "learning_rate": 0.00019994183367312828, "loss": 0.2611, "step": 2595 }, { "epoch": 3.297400126823082, "grad_norm": 0.18213769793510437, "learning_rate": 0.00019993928865656653, "loss": 0.2748, "step": 2600 }, { "epoch": 3.303741280913126, "grad_norm": 0.21100805699825287, "learning_rate": 0.00019993668916515705, "loss": 0.2396, "step": 2605 }, { "epoch": 3.310082435003171, "grad_norm": 0.18831108510494232, "learning_rate": 0.00019993403520031678, "loss": 0.2413, "step": 2610 }, { "epoch": 3.316423589093215, "grad_norm": 0.20541886985301971, "learning_rate": 0.0001999313267634923, "loss": 0.267, "step": 2615 }, { "epoch": 3.322764743183259, "grad_norm": 0.1734210103750229, "learning_rate": 0.00019992856385616, "loss": 0.2689, "step": 2620 }, { "epoch": 3.329105897273304, "grad_norm": 0.3569490909576416, "learning_rate": 0.0001999257464798258, "loss": 0.2577, "step": 2625 }, { "epoch": 3.335447051363348, "grad_norm": 0.20772813260555267, "learning_rate": 0.00019992287463602547, "loss": 0.2347, "step": 2630 }, { "epoch": 3.3417882054533923, "grad_norm": 0.3979608118534088, "learning_rate": 0.00019991994832632432, "loss": 0.2673, "step": 2635 }, { "epoch": 3.348129359543437, "grad_norm": 0.29580190777778625, "learning_rate": 0.00019991696755231747, "loss": 0.2397, "step": 2640 }, { "epoch": 3.354470513633481, "grad_norm": 0.18281158804893494, "learning_rate": 0.0001999139323156297, "loss": 0.2408, "step": 2645 }, { "epoch": 3.360811667723526, "grad_norm": 0.2271375060081482, "learning_rate": 0.00019991084261791539, "loss": 0.242, "step": 2650 }, { "epoch": 3.36715282181357, "grad_norm": 0.2104540318250656, "learning_rate": 0.00019990769846085868, "loss": 0.2434, "step": 2655 }, { "epoch": 3.3734939759036147, "grad_norm": 0.19053971767425537, "learning_rate": 0.00019990449984617342, "loss": 0.2485, "step": 2660 }, { "epoch": 3.379835129993659, "grad_norm": 0.20505011081695557, "learning_rate": 0.00019990124677560312, "loss": 0.2417, "step": 2665 }, { "epoch": 3.386176284083703, "grad_norm": 0.18881699442863464, "learning_rate": 0.0001998979392509209, "loss": 0.2642, "step": 2670 }, { "epoch": 3.3925174381737477, "grad_norm": 0.20456953346729279, "learning_rate": 0.00019989457727392972, "loss": 0.2514, "step": 2675 }, { "epoch": 3.398858592263792, "grad_norm": 0.19508343935012817, "learning_rate": 0.00019989116084646202, "loss": 0.2488, "step": 2680 }, { "epoch": 3.405199746353836, "grad_norm": 0.2118864208459854, "learning_rate": 0.00019988768997038006, "loss": 0.2368, "step": 2685 }, { "epoch": 3.411540900443881, "grad_norm": 0.19977030158042908, "learning_rate": 0.00019988416464757576, "loss": 0.2377, "step": 2690 }, { "epoch": 3.417882054533925, "grad_norm": 0.2753262519836426, "learning_rate": 0.00019988058487997073, "loss": 0.2629, "step": 2695 }, { "epoch": 3.4242232086239697, "grad_norm": 0.24887989461421967, "learning_rate": 0.00019987695066951618, "loss": 0.2518, "step": 2700 }, { "epoch": 3.430564362714014, "grad_norm": 0.21111193299293518, "learning_rate": 0.00019987326201819302, "loss": 0.2352, "step": 2705 }, { "epoch": 3.4369055168040585, "grad_norm": 0.2114241123199463, "learning_rate": 0.00019986951892801192, "loss": 0.26, "step": 2710 }, { "epoch": 3.4432466708941027, "grad_norm": 0.17171643674373627, "learning_rate": 0.00019986572140101312, "loss": 0.235, "step": 2715 }, { "epoch": 3.449587824984147, "grad_norm": 0.1984821856021881, "learning_rate": 0.00019986186943926658, "loss": 0.2518, "step": 2720 }, { "epoch": 3.4559289790741916, "grad_norm": 0.1960582286119461, "learning_rate": 0.00019985796304487194, "loss": 0.2577, "step": 2725 }, { "epoch": 3.462270133164236, "grad_norm": 0.18548458814620972, "learning_rate": 0.00019985400221995853, "loss": 0.2646, "step": 2730 }, { "epoch": 3.4686112872542805, "grad_norm": 0.18605677783489227, "learning_rate": 0.0001998499869666852, "loss": 0.2654, "step": 2735 }, { "epoch": 3.4749524413443247, "grad_norm": 0.17846783995628357, "learning_rate": 0.00019984591728724072, "loss": 0.2313, "step": 2740 }, { "epoch": 3.481293595434369, "grad_norm": 0.17410464584827423, "learning_rate": 0.00019984179318384326, "loss": 0.2557, "step": 2745 }, { "epoch": 3.4876347495244135, "grad_norm": 0.26158639788627625, "learning_rate": 0.0001998376146587409, "loss": 0.2375, "step": 2750 }, { "epoch": 3.4939759036144578, "grad_norm": 0.19227950274944305, "learning_rate": 0.00019983338171421117, "loss": 0.2351, "step": 2755 }, { "epoch": 3.5003170577045024, "grad_norm": 0.18109232187271118, "learning_rate": 0.00019982909435256144, "loss": 0.2432, "step": 2760 }, { "epoch": 3.5066582117945466, "grad_norm": 0.17076869308948517, "learning_rate": 0.0001998247525761286, "loss": 0.235, "step": 2765 }, { "epoch": 3.512999365884591, "grad_norm": 0.2036367505788803, "learning_rate": 0.0001998203563872793, "loss": 0.2427, "step": 2770 }, { "epoch": 3.5193405199746355, "grad_norm": 0.2593757212162018, "learning_rate": 0.0001998159057884098, "loss": 0.2648, "step": 2775 }, { "epoch": 3.5256816740646797, "grad_norm": 0.1747037023305893, "learning_rate": 0.00019981140078194606, "loss": 0.2507, "step": 2780 }, { "epoch": 3.532022828154724, "grad_norm": 0.18372654914855957, "learning_rate": 0.0001998068413703436, "loss": 0.2397, "step": 2785 }, { "epoch": 3.5383639822447686, "grad_norm": 0.20845967531204224, "learning_rate": 0.0001998022275560877, "loss": 0.2587, "step": 2790 }, { "epoch": 3.544705136334813, "grad_norm": 0.2604580819606781, "learning_rate": 0.00019979755934169325, "loss": 0.2476, "step": 2795 }, { "epoch": 3.5510462904248574, "grad_norm": 0.17546501755714417, "learning_rate": 0.00019979283672970482, "loss": 0.2592, "step": 2800 }, { "epoch": 3.5573874445149016, "grad_norm": 0.5025832653045654, "learning_rate": 0.0001997880597226966, "loss": 0.2746, "step": 2805 }, { "epoch": 3.5637285986049463, "grad_norm": 0.2474837452173233, "learning_rate": 0.0001997832283232724, "loss": 0.2474, "step": 2810 }, { "epoch": 3.5700697526949905, "grad_norm": 0.21203100681304932, "learning_rate": 0.00019977834253406577, "loss": 0.2651, "step": 2815 }, { "epoch": 3.5764109067850347, "grad_norm": 0.2845933139324188, "learning_rate": 0.00019977340235773983, "loss": 0.2399, "step": 2820 }, { "epoch": 3.5827520608750794, "grad_norm": 0.19890236854553223, "learning_rate": 0.00019976840779698733, "loss": 0.2493, "step": 2825 }, { "epoch": 3.5890932149651236, "grad_norm": 0.18368743360042572, "learning_rate": 0.00019976335885453077, "loss": 0.2502, "step": 2830 }, { "epoch": 3.595434369055168, "grad_norm": 0.19258390367031097, "learning_rate": 0.0001997582555331222, "loss": 0.2863, "step": 2835 }, { "epoch": 3.6017755231452124, "grad_norm": 0.18293575942516327, "learning_rate": 0.0001997530978355433, "loss": 0.235, "step": 2840 }, { "epoch": 3.608116677235257, "grad_norm": 0.23059338331222534, "learning_rate": 0.00019974788576460547, "loss": 0.2504, "step": 2845 }, { "epoch": 3.6144578313253013, "grad_norm": 0.16231469810009003, "learning_rate": 0.00019974261932314972, "loss": 0.2378, "step": 2850 }, { "epoch": 3.6207989854153455, "grad_norm": 0.1730029582977295, "learning_rate": 0.0001997372985140466, "loss": 0.2364, "step": 2855 }, { "epoch": 3.62714013950539, "grad_norm": 0.35427993535995483, "learning_rate": 0.0001997319233401964, "loss": 0.261, "step": 2860 }, { "epoch": 3.6334812935954344, "grad_norm": 0.1841244399547577, "learning_rate": 0.0001997264938045291, "loss": 0.2525, "step": 2865 }, { "epoch": 3.6398224476854786, "grad_norm": 0.28419819474220276, "learning_rate": 0.0001997210099100041, "loss": 0.2406, "step": 2870 }, { "epoch": 3.6461636017755232, "grad_norm": 0.1965455561876297, "learning_rate": 0.00019971547165961064, "loss": 0.2323, "step": 2875 }, { "epoch": 3.6525047558655674, "grad_norm": 0.18513832986354828, "learning_rate": 0.0001997098790563675, "loss": 0.2297, "step": 2880 }, { "epoch": 3.6588459099556117, "grad_norm": 0.1846759021282196, "learning_rate": 0.00019970423210332306, "loss": 0.2392, "step": 2885 }, { "epoch": 3.6651870640456563, "grad_norm": 0.22224122285842896, "learning_rate": 0.00019969853080355538, "loss": 0.2326, "step": 2890 }, { "epoch": 3.671528218135701, "grad_norm": 0.17456310987472534, "learning_rate": 0.00019969277516017212, "loss": 0.2358, "step": 2895 }, { "epoch": 3.677869372225745, "grad_norm": 0.2508101463317871, "learning_rate": 0.00019968696517631056, "loss": 0.2401, "step": 2900 }, { "epoch": 3.6842105263157894, "grad_norm": 0.20836228132247925, "learning_rate": 0.00019968110085513764, "loss": 0.2372, "step": 2905 }, { "epoch": 3.690551680405834, "grad_norm": 0.19365110993385315, "learning_rate": 0.0001996751821998498, "loss": 0.2511, "step": 2910 }, { "epoch": 3.6968928344958782, "grad_norm": 0.18440744280815125, "learning_rate": 0.00019966920921367322, "loss": 0.2367, "step": 2915 }, { "epoch": 3.7032339885859225, "grad_norm": 0.16461604833602905, "learning_rate": 0.00019966318189986368, "loss": 0.2363, "step": 2920 }, { "epoch": 3.709575142675967, "grad_norm": 0.2054930031299591, "learning_rate": 0.0001996571002617065, "loss": 0.2374, "step": 2925 }, { "epoch": 3.7159162967660113, "grad_norm": 0.18506033718585968, "learning_rate": 0.00019965096430251666, "loss": 0.2519, "step": 2930 }, { "epoch": 3.7222574508560555, "grad_norm": 0.17987391352653503, "learning_rate": 0.0001996447740256388, "loss": 0.2324, "step": 2935 }, { "epoch": 3.7285986049461, "grad_norm": 0.1654725968837738, "learning_rate": 0.00019963852943444702, "loss": 0.2335, "step": 2940 }, { "epoch": 3.734939759036145, "grad_norm": 0.39607933163642883, "learning_rate": 0.0001996322305323452, "loss": 0.2469, "step": 2945 }, { "epoch": 3.741280913126189, "grad_norm": 0.3621523678302765, "learning_rate": 0.00019962587732276672, "loss": 0.2413, "step": 2950 }, { "epoch": 3.7476220672162333, "grad_norm": 0.2711822986602783, "learning_rate": 0.00019961946980917456, "loss": 0.2501, "step": 2955 }, { "epoch": 3.753963221306278, "grad_norm": 0.27202561497688293, "learning_rate": 0.00019961300799506133, "loss": 0.2354, "step": 2960 }, { "epoch": 3.760304375396322, "grad_norm": 0.18990547955036163, "learning_rate": 0.00019960649188394927, "loss": 0.2353, "step": 2965 }, { "epoch": 3.7666455294863663, "grad_norm": 0.21755172312259674, "learning_rate": 0.00019959992147939012, "loss": 0.2336, "step": 2970 }, { "epoch": 3.772986683576411, "grad_norm": 0.22083641588687897, "learning_rate": 0.0001995932967849653, "loss": 0.2404, "step": 2975 }, { "epoch": 3.779327837666455, "grad_norm": 0.20415766537189484, "learning_rate": 0.0001995866178042858, "loss": 0.2407, "step": 2980 }, { "epoch": 3.7856689917565, "grad_norm": 0.26714959740638733, "learning_rate": 0.00019957988454099218, "loss": 0.2562, "step": 2985 }, { "epoch": 3.792010145846544, "grad_norm": 0.17150932550430298, "learning_rate": 0.00019957309699875463, "loss": 0.2416, "step": 2990 }, { "epoch": 3.7983512999365887, "grad_norm": 0.38623398542404175, "learning_rate": 0.00019956625518127287, "loss": 0.2804, "step": 2995 }, { "epoch": 3.804692454026633, "grad_norm": 0.16458174586296082, "learning_rate": 0.00019955935909227624, "loss": 0.2528, "step": 3000 }, { "epoch": 3.811033608116677, "grad_norm": 0.1877012550830841, "learning_rate": 0.00019955240873552367, "loss": 0.2286, "step": 3005 }, { "epoch": 3.817374762206722, "grad_norm": 0.18321797251701355, "learning_rate": 0.00019954540411480363, "loss": 0.2507, "step": 3010 }, { "epoch": 3.823715916296766, "grad_norm": 0.17943665385246277, "learning_rate": 0.00019953834523393424, "loss": 0.2376, "step": 3015 }, { "epoch": 3.83005707038681, "grad_norm": 0.20930522680282593, "learning_rate": 0.0001995312320967631, "loss": 0.2371, "step": 3020 }, { "epoch": 3.836398224476855, "grad_norm": 0.18790116906166077, "learning_rate": 0.0001995240647071675, "loss": 0.2514, "step": 3025 }, { "epoch": 3.842739378566899, "grad_norm": 0.1652878224849701, "learning_rate": 0.00019951684306905416, "loss": 0.2585, "step": 3030 }, { "epoch": 3.8490805326569437, "grad_norm": 0.1846035271883011, "learning_rate": 0.00019950956718635946, "loss": 0.2311, "step": 3035 }, { "epoch": 3.855421686746988, "grad_norm": 0.20467427372932434, "learning_rate": 0.0001995022370630494, "loss": 0.2263, "step": 3040 }, { "epoch": 3.8617628408370326, "grad_norm": 0.17810489237308502, "learning_rate": 0.00019949485270311942, "loss": 0.2482, "step": 3045 }, { "epoch": 3.868103994927077, "grad_norm": 0.17590197920799255, "learning_rate": 0.00019948741411059457, "loss": 0.235, "step": 3050 }, { "epoch": 3.874445149017121, "grad_norm": 0.2845124304294586, "learning_rate": 0.00019947992128952952, "loss": 0.2388, "step": 3055 }, { "epoch": 3.8807863031071657, "grad_norm": 0.25081172585487366, "learning_rate": 0.0001994723742440084, "loss": 0.2379, "step": 3060 }, { "epoch": 3.88712745719721, "grad_norm": 0.17248068749904633, "learning_rate": 0.000199464772978145, "loss": 0.2329, "step": 3065 }, { "epoch": 3.893468611287254, "grad_norm": 0.1765974760055542, "learning_rate": 0.00019945711749608263, "loss": 0.227, "step": 3070 }, { "epoch": 3.8998097653772987, "grad_norm": 0.23323915898799896, "learning_rate": 0.00019944940780199409, "loss": 0.2269, "step": 3075 }, { "epoch": 3.906150919467343, "grad_norm": 0.27264583110809326, "learning_rate": 0.00019944164390008174, "loss": 0.2383, "step": 3080 }, { "epoch": 3.9124920735573876, "grad_norm": 0.1800317019224167, "learning_rate": 0.00019943382579457758, "loss": 0.2475, "step": 3085 }, { "epoch": 3.918833227647432, "grad_norm": 0.2866079807281494, "learning_rate": 0.00019942595348974312, "loss": 0.2379, "step": 3090 }, { "epoch": 3.9251743817374765, "grad_norm": 0.17202016711235046, "learning_rate": 0.00019941802698986933, "loss": 0.2384, "step": 3095 }, { "epoch": 3.9315155358275207, "grad_norm": 0.18656712770462036, "learning_rate": 0.00019941004629927682, "loss": 0.2327, "step": 3100 }, { "epoch": 3.937856689917565, "grad_norm": 0.17708571255207062, "learning_rate": 0.00019940201142231568, "loss": 0.2363, "step": 3105 }, { "epoch": 3.9441978440076095, "grad_norm": 0.22036848962306976, "learning_rate": 0.00019939392236336558, "loss": 0.2403, "step": 3110 }, { "epoch": 3.9505389980976537, "grad_norm": 0.17269612848758698, "learning_rate": 0.00019938577912683567, "loss": 0.2415, "step": 3115 }, { "epoch": 3.956880152187698, "grad_norm": 0.15976901352405548, "learning_rate": 0.00019937758171716468, "loss": 0.2318, "step": 3120 }, { "epoch": 3.9632213062777426, "grad_norm": 0.1768805831670761, "learning_rate": 0.00019936933013882084, "loss": 0.2378, "step": 3125 }, { "epoch": 3.969562460367787, "grad_norm": 0.19822637736797333, "learning_rate": 0.00019936102439630193, "loss": 0.234, "step": 3130 }, { "epoch": 3.9759036144578315, "grad_norm": 0.21476231515407562, "learning_rate": 0.00019935266449413522, "loss": 0.2282, "step": 3135 }, { "epoch": 3.9822447685478757, "grad_norm": 0.15989047288894653, "learning_rate": 0.00019934425043687754, "loss": 0.2295, "step": 3140 }, { "epoch": 3.9885859226379203, "grad_norm": 0.18053029477596283, "learning_rate": 0.00019933578222911522, "loss": 0.2315, "step": 3145 }, { "epoch": 3.9949270767279645, "grad_norm": 0.2666521966457367, "learning_rate": 0.00019932725987546407, "loss": 0.2389, "step": 3150 }, { "epoch": 4.0, "eval_loss": 0.3030637502670288, "eval_runtime": 1444.3891, "eval_samples_per_second": 1.092, "eval_steps_per_second": 1.092, "step": 3154 }, { "epoch": 4.001268230818009, "grad_norm": 0.1614377796649933, "learning_rate": 0.00019931868338056948, "loss": 0.2249, "step": 3155 }, { "epoch": 4.007609384908053, "grad_norm": 0.22796852886676788, "learning_rate": 0.0001993100527491063, "loss": 0.2154, "step": 3160 }, { "epoch": 4.013950538998098, "grad_norm": 0.1922776699066162, "learning_rate": 0.00019930136798577896, "loss": 0.2116, "step": 3165 }, { "epoch": 4.020291693088142, "grad_norm": 0.17657621204853058, "learning_rate": 0.0001992926290953213, "loss": 0.2099, "step": 3170 }, { "epoch": 4.0266328471781865, "grad_norm": 0.19676920771598816, "learning_rate": 0.0001992838360824967, "loss": 0.2207, "step": 3175 }, { "epoch": 4.032974001268231, "grad_norm": 0.19092674553394318, "learning_rate": 0.00019927498895209807, "loss": 0.2072, "step": 3180 }, { "epoch": 4.039315155358275, "grad_norm": 0.17013514041900635, "learning_rate": 0.0001992660877089478, "loss": 0.2179, "step": 3185 }, { "epoch": 4.0456563094483196, "grad_norm": 0.23439811170101166, "learning_rate": 0.00019925713235789777, "loss": 0.2192, "step": 3190 }, { "epoch": 4.051997463538364, "grad_norm": 0.1818152219057083, "learning_rate": 0.00019924812290382938, "loss": 0.2114, "step": 3195 }, { "epoch": 4.058338617628408, "grad_norm": 0.19560812413692474, "learning_rate": 0.00019923905935165346, "loss": 0.2177, "step": 3200 }, { "epoch": 4.064679771718453, "grad_norm": 0.17303204536437988, "learning_rate": 0.00019922994170631038, "loss": 0.2162, "step": 3205 }, { "epoch": 4.071020925808497, "grad_norm": 0.18479858338832855, "learning_rate": 0.00019922076997276998, "loss": 0.2105, "step": 3210 }, { "epoch": 4.077362079898542, "grad_norm": 0.21143008768558502, "learning_rate": 0.0001992115441560316, "loss": 0.2129, "step": 3215 }, { "epoch": 4.083703233988586, "grad_norm": 0.17805492877960205, "learning_rate": 0.00019920226426112402, "loss": 0.2111, "step": 3220 }, { "epoch": 4.09004438807863, "grad_norm": 0.18527206778526306, "learning_rate": 0.00019919293029310552, "loss": 0.2096, "step": 3225 }, { "epoch": 4.096385542168675, "grad_norm": 0.17861348390579224, "learning_rate": 0.0001991835422570639, "loss": 0.2155, "step": 3230 }, { "epoch": 4.102726696258719, "grad_norm": 0.17721214890480042, "learning_rate": 0.0001991741001581163, "loss": 0.2244, "step": 3235 }, { "epoch": 4.109067850348763, "grad_norm": 0.24097007513046265, "learning_rate": 0.0001991646040014095, "loss": 0.215, "step": 3240 }, { "epoch": 4.115409004438808, "grad_norm": 0.3572041392326355, "learning_rate": 0.00019915505379211962, "loss": 0.2197, "step": 3245 }, { "epoch": 4.121750158528852, "grad_norm": 0.19363339245319366, "learning_rate": 0.00019914544953545226, "loss": 0.2143, "step": 3250 }, { "epoch": 4.1280913126188965, "grad_norm": 0.5962703227996826, "learning_rate": 0.00019913579123664253, "loss": 0.2372, "step": 3255 }, { "epoch": 4.134432466708941, "grad_norm": 0.18855033814907074, "learning_rate": 0.00019912607890095496, "loss": 0.216, "step": 3260 }, { "epoch": 4.140773620798986, "grad_norm": 0.18019284307956696, "learning_rate": 0.00019911631253368355, "loss": 0.2159, "step": 3265 }, { "epoch": 4.14711477488903, "grad_norm": 0.20620961487293243, "learning_rate": 0.00019910649214015177, "loss": 0.2148, "step": 3270 }, { "epoch": 4.153455928979074, "grad_norm": 0.4239434003829956, "learning_rate": 0.00019909661772571244, "loss": 0.2158, "step": 3275 }, { "epoch": 4.159797083069119, "grad_norm": 0.1988591104745865, "learning_rate": 0.00019908668929574801, "loss": 0.2363, "step": 3280 }, { "epoch": 4.166138237159163, "grad_norm": 0.23416364192962646, "learning_rate": 0.00019907670685567018, "loss": 0.2246, "step": 3285 }, { "epoch": 4.172479391249207, "grad_norm": 0.212542325258255, "learning_rate": 0.00019906667041092019, "loss": 0.219, "step": 3290 }, { "epoch": 4.178820545339252, "grad_norm": 0.1844007670879364, "learning_rate": 0.00019905657996696868, "loss": 0.2198, "step": 3295 }, { "epoch": 4.185161699429296, "grad_norm": 0.1967695951461792, "learning_rate": 0.0001990464355293158, "loss": 0.2344, "step": 3300 }, { "epoch": 4.19150285351934, "grad_norm": 0.19594764709472656, "learning_rate": 0.00019903623710349099, "loss": 0.2222, "step": 3305 }, { "epoch": 4.197844007609385, "grad_norm": 0.20161382853984833, "learning_rate": 0.00019902598469505328, "loss": 0.2293, "step": 3310 }, { "epoch": 4.20418516169943, "grad_norm": 0.1726006269454956, "learning_rate": 0.00019901567830959104, "loss": 0.2088, "step": 3315 }, { "epoch": 4.2105263157894735, "grad_norm": 0.19265885651111603, "learning_rate": 0.00019900531795272205, "loss": 0.2171, "step": 3320 }, { "epoch": 4.216867469879518, "grad_norm": 0.20152045786380768, "learning_rate": 0.0001989949036300935, "loss": 0.2228, "step": 3325 }, { "epoch": 4.223208623969563, "grad_norm": 0.20607762038707733, "learning_rate": 0.00019898443534738206, "loss": 0.2108, "step": 3330 }, { "epoch": 4.2295497780596065, "grad_norm": 0.19238682091236115, "learning_rate": 0.00019897391311029375, "loss": 0.2144, "step": 3335 }, { "epoch": 4.235890932149651, "grad_norm": 0.18624347448349, "learning_rate": 0.00019896333692456407, "loss": 0.2277, "step": 3340 }, { "epoch": 4.242232086239696, "grad_norm": 0.20221658051013947, "learning_rate": 0.00019895270679595782, "loss": 0.2118, "step": 3345 }, { "epoch": 4.24857324032974, "grad_norm": 0.17318499088287354, "learning_rate": 0.00019894202273026927, "loss": 0.2134, "step": 3350 }, { "epoch": 4.254914394419784, "grad_norm": 0.1885792315006256, "learning_rate": 0.00019893128473332214, "loss": 0.2173, "step": 3355 }, { "epoch": 4.261255548509829, "grad_norm": 0.1968296319246292, "learning_rate": 0.00019892049281096944, "loss": 0.2147, "step": 3360 }, { "epoch": 4.267596702599874, "grad_norm": 0.1656263917684555, "learning_rate": 0.00019890964696909368, "loss": 0.21, "step": 3365 }, { "epoch": 4.273937856689917, "grad_norm": 0.18949894607067108, "learning_rate": 0.00019889874721360664, "loss": 0.2162, "step": 3370 }, { "epoch": 4.280279010779962, "grad_norm": 0.19511955976486206, "learning_rate": 0.00019888779355044958, "loss": 0.2172, "step": 3375 }, { "epoch": 4.286620164870007, "grad_norm": 0.18598318099975586, "learning_rate": 0.0001988767859855931, "loss": 0.2077, "step": 3380 }, { "epoch": 4.29296131896005, "grad_norm": 0.28125661611557007, "learning_rate": 0.00019886572452503724, "loss": 0.2147, "step": 3385 }, { "epoch": 4.299302473050095, "grad_norm": 0.2732760012149811, "learning_rate": 0.00019885460917481136, "loss": 0.2185, "step": 3390 }, { "epoch": 4.30564362714014, "grad_norm": 0.19608043134212494, "learning_rate": 0.00019884343994097417, "loss": 0.2204, "step": 3395 }, { "epoch": 4.3119847812301835, "grad_norm": 0.1970449686050415, "learning_rate": 0.0001988322168296138, "loss": 0.2117, "step": 3400 }, { "epoch": 4.318325935320228, "grad_norm": 0.19621534645557404, "learning_rate": 0.00019882093984684778, "loss": 0.2178, "step": 3405 }, { "epoch": 4.324667089410273, "grad_norm": 0.17943595349788666, "learning_rate": 0.0001988096089988229, "loss": 0.2123, "step": 3410 }, { "epoch": 4.331008243500317, "grad_norm": 0.19156089425086975, "learning_rate": 0.0001987982242917154, "loss": 0.2164, "step": 3415 }, { "epoch": 4.337349397590361, "grad_norm": 0.1807197630405426, "learning_rate": 0.0001987867857317309, "loss": 0.2146, "step": 3420 }, { "epoch": 4.343690551680406, "grad_norm": 0.18102240562438965, "learning_rate": 0.00019877529332510427, "loss": 0.2133, "step": 3425 }, { "epoch": 4.3500317057704505, "grad_norm": 0.24060651659965515, "learning_rate": 0.00019876374707809976, "loss": 0.218, "step": 3430 }, { "epoch": 4.356372859860494, "grad_norm": 0.19333739578723907, "learning_rate": 0.00019875214699701106, "loss": 0.2142, "step": 3435 }, { "epoch": 4.362714013950539, "grad_norm": 0.17184320092201233, "learning_rate": 0.00019874049308816108, "loss": 0.2142, "step": 3440 }, { "epoch": 4.369055168040584, "grad_norm": 0.17979229986667633, "learning_rate": 0.00019872878535790212, "loss": 0.2113, "step": 3445 }, { "epoch": 4.375396322130628, "grad_norm": 0.1780640333890915, "learning_rate": 0.0001987170238126159, "loss": 0.2134, "step": 3450 }, { "epoch": 4.381737476220672, "grad_norm": 0.18288017809391022, "learning_rate": 0.00019870520845871327, "loss": 0.2199, "step": 3455 }, { "epoch": 4.388078630310717, "grad_norm": 0.22124269604682922, "learning_rate": 0.00019869333930263468, "loss": 0.2175, "step": 3460 }, { "epoch": 4.394419784400761, "grad_norm": 0.22013512253761292, "learning_rate": 0.00019868141635084963, "loss": 0.2086, "step": 3465 }, { "epoch": 4.400760938490805, "grad_norm": 0.19169054925441742, "learning_rate": 0.00019866943960985718, "loss": 0.2077, "step": 3470 }, { "epoch": 4.40710209258085, "grad_norm": 0.1731882095336914, "learning_rate": 0.00019865740908618553, "loss": 0.221, "step": 3475 }, { "epoch": 4.413443246670894, "grad_norm": 0.18289774656295776, "learning_rate": 0.00019864532478639234, "loss": 0.214, "step": 3480 }, { "epoch": 4.419784400760938, "grad_norm": 0.18588317930698395, "learning_rate": 0.00019863318671706442, "loss": 0.2143, "step": 3485 }, { "epoch": 4.426125554850983, "grad_norm": 0.18272556364536285, "learning_rate": 0.00019862099488481808, "loss": 0.2229, "step": 3490 }, { "epoch": 4.4324667089410275, "grad_norm": 0.18219348788261414, "learning_rate": 0.00019860874929629877, "loss": 0.2124, "step": 3495 }, { "epoch": 4.438807863031071, "grad_norm": 0.26763269305229187, "learning_rate": 0.00019859644995818133, "loss": 0.2155, "step": 3500 }, { "epoch": 4.445149017121116, "grad_norm": 0.20787706971168518, "learning_rate": 0.0001985840968771699, "loss": 0.2108, "step": 3505 }, { "epoch": 4.4514901712111605, "grad_norm": 0.16134731471538544, "learning_rate": 0.00019857169005999787, "loss": 0.2114, "step": 3510 }, { "epoch": 4.457831325301205, "grad_norm": 0.17464995384216309, "learning_rate": 0.0001985592295134279, "loss": 0.2215, "step": 3515 }, { "epoch": 4.464172479391249, "grad_norm": 0.20766876637935638, "learning_rate": 0.00019854671524425206, "loss": 0.2115, "step": 3520 }, { "epoch": 4.470513633481294, "grad_norm": 0.17349757254123688, "learning_rate": 0.00019853414725929156, "loss": 0.2198, "step": 3525 }, { "epoch": 4.476854787571338, "grad_norm": 0.2001785933971405, "learning_rate": 0.00019852152556539693, "loss": 0.2098, "step": 3530 }, { "epoch": 4.483195941661382, "grad_norm": 0.1968441903591156, "learning_rate": 0.00019850885016944807, "loss": 0.2151, "step": 3535 }, { "epoch": 4.489537095751427, "grad_norm": 0.1814134269952774, "learning_rate": 0.000198496121078354, "loss": 0.2095, "step": 3540 }, { "epoch": 4.495878249841471, "grad_norm": 0.17872226238250732, "learning_rate": 0.00019848333829905316, "loss": 0.2157, "step": 3545 }, { "epoch": 4.502219403931516, "grad_norm": 0.22608976066112518, "learning_rate": 0.00019847050183851312, "loss": 0.2162, "step": 3550 }, { "epoch": 4.50856055802156, "grad_norm": 0.18943296372890472, "learning_rate": 0.0001984576117037308, "loss": 0.213, "step": 3555 }, { "epoch": 4.514901712111604, "grad_norm": 0.1768791675567627, "learning_rate": 0.00019844466790173236, "loss": 0.2119, "step": 3560 }, { "epoch": 4.521242866201649, "grad_norm": 0.3802310824394226, "learning_rate": 0.00019843167043957316, "loss": 0.2164, "step": 3565 }, { "epoch": 4.527584020291693, "grad_norm": 0.18943294882774353, "learning_rate": 0.00019841861932433786, "loss": 0.212, "step": 3570 }, { "epoch": 4.5339251743817375, "grad_norm": 0.16153913736343384, "learning_rate": 0.00019840551456314036, "loss": 0.2122, "step": 3575 }, { "epoch": 4.540266328471782, "grad_norm": 0.19486777484416962, "learning_rate": 0.00019839235616312385, "loss": 0.222, "step": 3580 }, { "epoch": 4.546607482561826, "grad_norm": 0.19401216506958008, "learning_rate": 0.00019837914413146058, "loss": 0.2142, "step": 3585 }, { "epoch": 4.5529486366518706, "grad_norm": 0.1667824536561966, "learning_rate": 0.00019836587847535226, "loss": 0.21, "step": 3590 }, { "epoch": 4.559289790741915, "grad_norm": 0.17908911406993866, "learning_rate": 0.00019835255920202968, "loss": 0.2178, "step": 3595 }, { "epoch": 4.565630944831959, "grad_norm": 0.24865512549877167, "learning_rate": 0.00019833918631875292, "loss": 0.2128, "step": 3600 }, { "epoch": 4.571972098922004, "grad_norm": 0.16716158390045166, "learning_rate": 0.00019832575983281126, "loss": 0.2119, "step": 3605 }, { "epoch": 4.578313253012048, "grad_norm": 0.22156748175621033, "learning_rate": 0.0001983122797515232, "loss": 0.2106, "step": 3610 }, { "epoch": 4.584654407102093, "grad_norm": 0.18539752066135406, "learning_rate": 0.00019829874608223646, "loss": 0.2088, "step": 3615 }, { "epoch": 4.590995561192137, "grad_norm": 0.1889573186635971, "learning_rate": 0.00019828515883232796, "loss": 0.2117, "step": 3620 }, { "epoch": 4.597336715282181, "grad_norm": 0.17657649517059326, "learning_rate": 0.0001982715180092038, "loss": 0.212, "step": 3625 }, { "epoch": 4.603677869372226, "grad_norm": 0.16492141783237457, "learning_rate": 0.00019825782362029937, "loss": 0.2094, "step": 3630 }, { "epoch": 4.61001902346227, "grad_norm": 0.16165238618850708, "learning_rate": 0.00019824407567307914, "loss": 0.2153, "step": 3635 }, { "epoch": 4.616360177552314, "grad_norm": 0.18977701663970947, "learning_rate": 0.0001982302741750369, "loss": 0.2109, "step": 3640 }, { "epoch": 4.622701331642359, "grad_norm": 0.18218207359313965, "learning_rate": 0.0001982164191336955, "loss": 0.212, "step": 3645 }, { "epoch": 4.629042485732404, "grad_norm": 0.17536284029483795, "learning_rate": 0.00019820251055660703, "loss": 0.2189, "step": 3650 }, { "epoch": 4.6353836398224475, "grad_norm": 0.16788943111896515, "learning_rate": 0.00019818854845135285, "loss": 0.2116, "step": 3655 }, { "epoch": 4.641724793912492, "grad_norm": 0.2326701581478119, "learning_rate": 0.00019817453282554333, "loss": 0.2273, "step": 3660 }, { "epoch": 4.648065948002537, "grad_norm": 0.173734650015831, "learning_rate": 0.00019816046368681818, "loss": 0.2155, "step": 3665 }, { "epoch": 4.654407102092581, "grad_norm": 0.17680080235004425, "learning_rate": 0.00019814634104284613, "loss": 0.2103, "step": 3670 }, { "epoch": 4.660748256182625, "grad_norm": 0.18231511116027832, "learning_rate": 0.00019813216490132516, "loss": 0.2076, "step": 3675 }, { "epoch": 4.66708941027267, "grad_norm": 0.22735215723514557, "learning_rate": 0.00019811793526998238, "loss": 0.2193, "step": 3680 }, { "epoch": 4.673430564362714, "grad_norm": 0.29292428493499756, "learning_rate": 0.00019810365215657412, "loss": 0.2181, "step": 3685 }, { "epoch": 4.679771718452758, "grad_norm": 0.2138868123292923, "learning_rate": 0.00019808931556888575, "loss": 0.2121, "step": 3690 }, { "epoch": 4.686112872542803, "grad_norm": 0.17052961885929108, "learning_rate": 0.0001980749255147319, "loss": 0.211, "step": 3695 }, { "epoch": 4.692454026632847, "grad_norm": 0.19157268106937408, "learning_rate": 0.00019806048200195626, "loss": 0.2145, "step": 3700 }, { "epoch": 4.698795180722891, "grad_norm": 0.20984715223312378, "learning_rate": 0.0001980459850384317, "loss": 0.2161, "step": 3705 }, { "epoch": 4.705136334812936, "grad_norm": 0.17019401490688324, "learning_rate": 0.00019803143463206023, "loss": 0.2124, "step": 3710 }, { "epoch": 4.711477488902981, "grad_norm": 0.1725141555070877, "learning_rate": 0.00019801683079077295, "loss": 0.2141, "step": 3715 }, { "epoch": 4.7178186429930244, "grad_norm": 0.2576580345630646, "learning_rate": 0.00019800217352253013, "loss": 0.2076, "step": 3720 }, { "epoch": 4.724159797083069, "grad_norm": 0.18043720722198486, "learning_rate": 0.00019798746283532112, "loss": 0.2069, "step": 3725 }, { "epoch": 4.730500951173114, "grad_norm": 0.18270549178123474, "learning_rate": 0.00019797269873716444, "loss": 0.2084, "step": 3730 }, { "epoch": 4.7368421052631575, "grad_norm": 0.17066051065921783, "learning_rate": 0.00019795788123610773, "loss": 0.2129, "step": 3735 }, { "epoch": 4.743183259353202, "grad_norm": 0.1815827190876007, "learning_rate": 0.00019794301034022763, "loss": 0.2117, "step": 3740 }, { "epoch": 4.749524413443247, "grad_norm": 0.1916676163673401, "learning_rate": 0.00019792808605762998, "loss": 0.2079, "step": 3745 }, { "epoch": 4.7558655675332915, "grad_norm": 0.18342970311641693, "learning_rate": 0.00019791310839644975, "loss": 0.2059, "step": 3750 }, { "epoch": 4.762206721623335, "grad_norm": 0.15993835031986237, "learning_rate": 0.0001978980773648509, "loss": 0.2093, "step": 3755 }, { "epoch": 4.76854787571338, "grad_norm": 0.17355243861675262, "learning_rate": 0.00019788299297102653, "loss": 0.2132, "step": 3760 }, { "epoch": 4.774889029803425, "grad_norm": 0.21971476078033447, "learning_rate": 0.00019786785522319885, "loss": 0.2125, "step": 3765 }, { "epoch": 4.781230183893468, "grad_norm": 0.1786206066608429, "learning_rate": 0.00019785266412961918, "loss": 0.2108, "step": 3770 }, { "epoch": 4.787571337983513, "grad_norm": 0.178743377327919, "learning_rate": 0.0001978374196985678, "loss": 0.2103, "step": 3775 }, { "epoch": 4.793912492073558, "grad_norm": 0.16480551660060883, "learning_rate": 0.00019782212193835422, "loss": 0.2107, "step": 3780 }, { "epoch": 4.800253646163601, "grad_norm": 0.20611509680747986, "learning_rate": 0.00019780677085731683, "loss": 0.2126, "step": 3785 }, { "epoch": 4.806594800253646, "grad_norm": 0.21283523738384247, "learning_rate": 0.00019779136646382327, "loss": 0.2126, "step": 3790 }, { "epoch": 4.812935954343691, "grad_norm": 0.28413909673690796, "learning_rate": 0.0001977759087662701, "loss": 0.2149, "step": 3795 }, { "epoch": 4.8192771084337345, "grad_norm": 0.1610143482685089, "learning_rate": 0.00019776039777308303, "loss": 0.2087, "step": 3800 }, { "epoch": 4.825618262523779, "grad_norm": 0.18190249800682068, "learning_rate": 0.00019774483349271676, "loss": 0.2136, "step": 3805 }, { "epoch": 4.831959416613824, "grad_norm": 0.1742052286863327, "learning_rate": 0.00019772921593365507, "loss": 0.2038, "step": 3810 }, { "epoch": 4.838300570703868, "grad_norm": 0.25073137879371643, "learning_rate": 0.0001977135451044108, "loss": 0.2123, "step": 3815 }, { "epoch": 4.844641724793912, "grad_norm": 0.16550874710083008, "learning_rate": 0.0001976978210135257, "loss": 0.2086, "step": 3820 }, { "epoch": 4.850982878883957, "grad_norm": 0.17842721939086914, "learning_rate": 0.0001976820436695708, "loss": 0.2089, "step": 3825 }, { "epoch": 4.8573240329740015, "grad_norm": 0.2961059808731079, "learning_rate": 0.00019766621308114587, "loss": 0.2067, "step": 3830 }, { "epoch": 4.863665187064045, "grad_norm": 0.1809186190366745, "learning_rate": 0.0001976503292568799, "loss": 0.2111, "step": 3835 }, { "epoch": 4.87000634115409, "grad_norm": 0.19864125549793243, "learning_rate": 0.00019763439220543084, "loss": 0.2105, "step": 3840 }, { "epoch": 4.876347495244135, "grad_norm": 0.17953461408615112, "learning_rate": 0.0001976184019354856, "loss": 0.2128, "step": 3845 }, { "epoch": 4.882688649334179, "grad_norm": 0.17933392524719238, "learning_rate": 0.0001976023584557602, "loss": 0.2103, "step": 3850 }, { "epoch": 4.889029803424223, "grad_norm": 0.19216038286685944, "learning_rate": 0.0001975862617749996, "loss": 0.2113, "step": 3855 }, { "epoch": 4.895370957514268, "grad_norm": 0.17764465510845184, "learning_rate": 0.00019757011190197772, "loss": 0.2061, "step": 3860 }, { "epoch": 4.901712111604312, "grad_norm": 0.19165575504302979, "learning_rate": 0.00019755390884549758, "loss": 0.213, "step": 3865 }, { "epoch": 4.908053265694356, "grad_norm": 0.16639097034931183, "learning_rate": 0.00019753765261439112, "loss": 0.2079, "step": 3870 }, { "epoch": 4.914394419784401, "grad_norm": 0.17730718851089478, "learning_rate": 0.00019752134321751927, "loss": 0.2167, "step": 3875 }, { "epoch": 4.920735573874445, "grad_norm": 0.16640359163284302, "learning_rate": 0.00019750498066377193, "loss": 0.2084, "step": 3880 }, { "epoch": 4.92707672796449, "grad_norm": 0.1850304752588272, "learning_rate": 0.00019748856496206802, "loss": 0.2053, "step": 3885 }, { "epoch": 4.933417882054534, "grad_norm": 0.17670786380767822, "learning_rate": 0.00019747209612135536, "loss": 0.2057, "step": 3890 }, { "epoch": 4.9397590361445785, "grad_norm": 0.16377036273479462, "learning_rate": 0.00019745557415061087, "loss": 0.2042, "step": 3895 }, { "epoch": 4.946100190234622, "grad_norm": 0.19302432239055634, "learning_rate": 0.00019743899905884022, "loss": 0.2107, "step": 3900 }, { "epoch": 4.952441344324667, "grad_norm": 0.16383332014083862, "learning_rate": 0.00019742237085507822, "loss": 0.2072, "step": 3905 }, { "epoch": 4.9587824984147115, "grad_norm": 0.1721595972776413, "learning_rate": 0.00019740568954838855, "loss": 0.2104, "step": 3910 }, { "epoch": 4.965123652504756, "grad_norm": 0.17714591324329376, "learning_rate": 0.00019738895514786382, "loss": 0.2077, "step": 3915 }, { "epoch": 4.9714648065948, "grad_norm": 0.18949434161186218, "learning_rate": 0.00019737216766262564, "loss": 0.2123, "step": 3920 }, { "epoch": 4.977805960684845, "grad_norm": 0.18523865938186646, "learning_rate": 0.00019735532710182446, "loss": 0.2055, "step": 3925 }, { "epoch": 4.984147114774889, "grad_norm": 0.16201648116111755, "learning_rate": 0.00019733843347463984, "loss": 0.2109, "step": 3930 }, { "epoch": 4.990488268864933, "grad_norm": 0.22933132946491241, "learning_rate": 0.00019732148679028005, "loss": 0.2134, "step": 3935 }, { "epoch": 4.996829422954978, "grad_norm": 0.17929863929748535, "learning_rate": 0.00019730448705798239, "loss": 0.2098, "step": 3940 }, { "epoch": 4.999365884590995, "eval_loss": 0.3035367429256439, "eval_runtime": 1445.5355, "eval_samples_per_second": 1.091, "eval_steps_per_second": 1.091, "step": 3942 }, { "epoch": 5.003170577045022, "grad_norm": 0.1531272530555725, "learning_rate": 0.0001972874342870131, "loss": 0.1935, "step": 3945 }, { "epoch": 5.009511731135067, "grad_norm": 0.19690929353237152, "learning_rate": 0.00019727032848666727, "loss": 0.1832, "step": 3950 }, { "epoch": 5.015852885225111, "grad_norm": 0.18265408277511597, "learning_rate": 0.00019725316966626895, "loss": 0.1835, "step": 3955 }, { "epoch": 5.022194039315155, "grad_norm": 0.19143721461296082, "learning_rate": 0.00019723595783517102, "loss": 0.1828, "step": 3960 }, { "epoch": 5.0285351934052, "grad_norm": 0.1983107328414917, "learning_rate": 0.00019721869300275531, "loss": 0.1891, "step": 3965 }, { "epoch": 5.034876347495244, "grad_norm": 0.19277577102184296, "learning_rate": 0.00019720137517843252, "loss": 0.1863, "step": 3970 }, { "epoch": 5.0412175015852885, "grad_norm": 0.1870567798614502, "learning_rate": 0.00019718400437164225, "loss": 0.1907, "step": 3975 }, { "epoch": 5.047558655675333, "grad_norm": 0.17584681510925293, "learning_rate": 0.00019716658059185294, "loss": 0.1928, "step": 3980 }, { "epoch": 5.053899809765377, "grad_norm": 0.19533997774124146, "learning_rate": 0.000197149103848562, "loss": 0.1893, "step": 3985 }, { "epoch": 5.0602409638554215, "grad_norm": 0.1852300614118576, "learning_rate": 0.00019713157415129557, "loss": 0.1864, "step": 3990 }, { "epoch": 5.066582117945466, "grad_norm": 0.20190046727657318, "learning_rate": 0.00019711399150960875, "loss": 0.1828, "step": 3995 }, { "epoch": 5.072923272035511, "grad_norm": 0.2005247175693512, "learning_rate": 0.00019709635593308548, "loss": 0.1921, "step": 4000 }, { "epoch": 5.079264426125555, "grad_norm": 0.20791403949260712, "learning_rate": 0.00019707866743133858, "loss": 0.1927, "step": 4005 }, { "epoch": 5.085605580215599, "grad_norm": 0.1999587118625641, "learning_rate": 0.00019706092601400963, "loss": 0.1894, "step": 4010 }, { "epoch": 5.091946734305644, "grad_norm": 0.20208244025707245, "learning_rate": 0.00019704313169076917, "loss": 0.1921, "step": 4015 }, { "epoch": 5.098287888395688, "grad_norm": 0.18083013594150543, "learning_rate": 0.00019702528447131646, "loss": 0.1887, "step": 4020 }, { "epoch": 5.104629042485732, "grad_norm": 0.19319649040699005, "learning_rate": 0.0001970073843653797, "loss": 0.1895, "step": 4025 }, { "epoch": 5.110970196575777, "grad_norm": 0.20358404517173767, "learning_rate": 0.00019698943138271585, "loss": 0.1905, "step": 4030 }, { "epoch": 5.117311350665821, "grad_norm": 0.1990876942873001, "learning_rate": 0.00019697142553311072, "loss": 0.1897, "step": 4035 }, { "epoch": 5.123652504755865, "grad_norm": 0.1871253252029419, "learning_rate": 0.0001969533668263789, "loss": 0.1895, "step": 4040 }, { "epoch": 5.12999365884591, "grad_norm": 0.17783567309379578, "learning_rate": 0.00019693525527236382, "loss": 0.1862, "step": 4045 }, { "epoch": 5.136334812935955, "grad_norm": 0.1865890920162201, "learning_rate": 0.00019691709088093777, "loss": 0.193, "step": 4050 }, { "epoch": 5.1426759670259985, "grad_norm": 0.19084204733371735, "learning_rate": 0.00019689887366200175, "loss": 0.1936, "step": 4055 }, { "epoch": 5.149017121116043, "grad_norm": 0.1816391497850418, "learning_rate": 0.00019688060362548556, "loss": 0.1906, "step": 4060 }, { "epoch": 5.155358275206088, "grad_norm": 0.1925617754459381, "learning_rate": 0.00019686228078134787, "loss": 0.1865, "step": 4065 }, { "epoch": 5.161699429296132, "grad_norm": 0.19281330704689026, "learning_rate": 0.00019684390513957607, "loss": 0.1894, "step": 4070 }, { "epoch": 5.168040583386176, "grad_norm": 0.20676252245903015, "learning_rate": 0.00019682547671018635, "loss": 0.1912, "step": 4075 }, { "epoch": 5.174381737476221, "grad_norm": 0.16814260184764862, "learning_rate": 0.0001968069955032236, "loss": 0.1872, "step": 4080 }, { "epoch": 5.180722891566265, "grad_norm": 0.19016312062740326, "learning_rate": 0.00019678846152876165, "loss": 0.1897, "step": 4085 }, { "epoch": 5.187064045656309, "grad_norm": 0.18751539289951324, "learning_rate": 0.00019676987479690295, "loss": 0.1942, "step": 4090 }, { "epoch": 5.193405199746354, "grad_norm": 0.2285935878753662, "learning_rate": 0.0001967512353177787, "loss": 0.1909, "step": 4095 }, { "epoch": 5.199746353836399, "grad_norm": 0.1692272573709488, "learning_rate": 0.00019673254310154895, "loss": 0.194, "step": 4100 }, { "epoch": 5.206087507926442, "grad_norm": 0.17639543116092682, "learning_rate": 0.0001967137981584024, "loss": 0.1916, "step": 4105 }, { "epoch": 5.212428662016487, "grad_norm": 0.1890820413827896, "learning_rate": 0.00019669500049855656, "loss": 0.1909, "step": 4110 }, { "epoch": 5.218769816106532, "grad_norm": 0.16969987750053406, "learning_rate": 0.00019667615013225763, "loss": 0.1938, "step": 4115 }, { "epoch": 5.225110970196575, "grad_norm": 0.17386728525161743, "learning_rate": 0.0001966572470697806, "loss": 0.191, "step": 4120 }, { "epoch": 5.23145212428662, "grad_norm": 0.18629340827465057, "learning_rate": 0.00019663829132142912, "loss": 0.1918, "step": 4125 }, { "epoch": 5.237793278376665, "grad_norm": 0.1844152808189392, "learning_rate": 0.00019661928289753554, "loss": 0.1912, "step": 4130 }, { "epoch": 5.2441344324667085, "grad_norm": 0.18697965145111084, "learning_rate": 0.00019660022180846098, "loss": 0.1918, "step": 4135 }, { "epoch": 5.250475586556753, "grad_norm": 0.21600137650966644, "learning_rate": 0.00019658110806459526, "loss": 0.1929, "step": 4140 }, { "epoch": 5.256816740646798, "grad_norm": 0.1842968612909317, "learning_rate": 0.0001965619416763569, "loss": 0.1918, "step": 4145 }, { "epoch": 5.2631578947368425, "grad_norm": 0.20122699439525604, "learning_rate": 0.00019654272265419307, "loss": 0.1941, "step": 4150 }, { "epoch": 5.269499048826886, "grad_norm": 0.19417595863342285, "learning_rate": 0.00019652345100857965, "loss": 0.1913, "step": 4155 }, { "epoch": 5.275840202916931, "grad_norm": 0.1983565241098404, "learning_rate": 0.00019650412675002127, "loss": 0.188, "step": 4160 }, { "epoch": 5.2821813570069756, "grad_norm": 0.1754962056875229, "learning_rate": 0.0001964847498890512, "loss": 0.1942, "step": 4165 }, { "epoch": 5.288522511097019, "grad_norm": 0.20699192583560944, "learning_rate": 0.00019646532043623126, "loss": 0.192, "step": 4170 }, { "epoch": 5.294863665187064, "grad_norm": 0.1981319934129715, "learning_rate": 0.00019644583840215213, "loss": 0.194, "step": 4175 }, { "epoch": 5.301204819277109, "grad_norm": 0.1924477219581604, "learning_rate": 0.00019642630379743307, "loss": 0.1925, "step": 4180 }, { "epoch": 5.307545973367153, "grad_norm": 0.1888171285390854, "learning_rate": 0.00019640671663272196, "loss": 0.1919, "step": 4185 }, { "epoch": 5.313887127457197, "grad_norm": 0.19581562280654907, "learning_rate": 0.0001963870769186954, "loss": 0.1907, "step": 4190 }, { "epoch": 5.320228281547242, "grad_norm": 0.1881704032421112, "learning_rate": 0.00019636738466605853, "loss": 0.1877, "step": 4195 }, { "epoch": 5.326569435637286, "grad_norm": 0.1843928098678589, "learning_rate": 0.00019634763988554522, "loss": 0.1914, "step": 4200 }, { "epoch": 5.33291058972733, "grad_norm": 0.2039584368467331, "learning_rate": 0.00019632784258791794, "loss": 0.1971, "step": 4205 }, { "epoch": 5.339251743817375, "grad_norm": 0.19955579936504364, "learning_rate": 0.0001963079927839678, "loss": 0.1918, "step": 4210 }, { "epoch": 5.345592897907419, "grad_norm": 0.1702607423067093, "learning_rate": 0.00019628809048451452, "loss": 0.1884, "step": 4215 }, { "epoch": 5.351934051997463, "grad_norm": 0.1859121024608612, "learning_rate": 0.00019626813570040646, "loss": 0.1921, "step": 4220 }, { "epoch": 5.358275206087508, "grad_norm": 0.1925940215587616, "learning_rate": 0.00019624812844252053, "loss": 0.1928, "step": 4225 }, { "epoch": 5.3646163601775525, "grad_norm": 0.18263481557369232, "learning_rate": 0.00019622806872176223, "loss": 0.1926, "step": 4230 }, { "epoch": 5.370957514267596, "grad_norm": 0.19940592348575592, "learning_rate": 0.00019620795654906576, "loss": 0.1908, "step": 4235 }, { "epoch": 5.377298668357641, "grad_norm": 0.2047148495912552, "learning_rate": 0.00019618779193539382, "loss": 0.195, "step": 4240 }, { "epoch": 5.383639822447686, "grad_norm": 0.18376454710960388, "learning_rate": 0.00019616757489173778, "loss": 0.1946, "step": 4245 }, { "epoch": 5.38998097653773, "grad_norm": 0.20889529585838318, "learning_rate": 0.0001961473054291175, "loss": 0.1902, "step": 4250 }, { "epoch": 5.396322130627774, "grad_norm": 0.18061743676662445, "learning_rate": 0.00019612698355858142, "loss": 0.1914, "step": 4255 }, { "epoch": 5.402663284717819, "grad_norm": 0.1830766350030899, "learning_rate": 0.00019610660929120658, "loss": 0.1932, "step": 4260 }, { "epoch": 5.409004438807863, "grad_norm": 0.18366467952728271, "learning_rate": 0.00019608618263809862, "loss": 0.1958, "step": 4265 }, { "epoch": 5.415345592897907, "grad_norm": 0.20729288458824158, "learning_rate": 0.00019606570361039164, "loss": 0.1952, "step": 4270 }, { "epoch": 5.421686746987952, "grad_norm": 0.18449997901916504, "learning_rate": 0.00019604517221924835, "loss": 0.1884, "step": 4275 }, { "epoch": 5.428027901077996, "grad_norm": 0.1771128922700882, "learning_rate": 0.00019602458847586002, "loss": 0.1887, "step": 4280 }, { "epoch": 5.434369055168041, "grad_norm": 0.16689437627792358, "learning_rate": 0.00019600395239144637, "loss": 0.1965, "step": 4285 }, { "epoch": 5.440710209258085, "grad_norm": 0.1994578242301941, "learning_rate": 0.00019598326397725574, "loss": 0.1913, "step": 4290 }, { "epoch": 5.4470513633481294, "grad_norm": 0.19402861595153809, "learning_rate": 0.00019596252324456494, "loss": 0.1892, "step": 4295 }, { "epoch": 5.453392517438174, "grad_norm": 0.17431293427944183, "learning_rate": 0.00019594173020467934, "loss": 0.1885, "step": 4300 }, { "epoch": 5.459733671528218, "grad_norm": 0.19461782276630402, "learning_rate": 0.00019592088486893277, "loss": 0.1906, "step": 4305 }, { "epoch": 5.4660748256182625, "grad_norm": 0.18617475032806396, "learning_rate": 0.00019589998724868762, "loss": 0.1881, "step": 4310 }, { "epoch": 5.472415979708307, "grad_norm": 0.18781709671020508, "learning_rate": 0.00019587903735533474, "loss": 0.192, "step": 4315 }, { "epoch": 5.478757133798351, "grad_norm": 0.18870596587657928, "learning_rate": 0.00019585803520029348, "loss": 0.1908, "step": 4320 }, { "epoch": 5.485098287888396, "grad_norm": 0.1798035055398941, "learning_rate": 0.00019583698079501168, "loss": 0.1902, "step": 4325 }, { "epoch": 5.49143944197844, "grad_norm": 0.17604327201843262, "learning_rate": 0.00019581587415096568, "loss": 0.1914, "step": 4330 }, { "epoch": 5.497780596068484, "grad_norm": 0.17630285024642944, "learning_rate": 0.00019579471527966024, "loss": 0.1939, "step": 4335 }, { "epoch": 5.504121750158529, "grad_norm": 0.1765008121728897, "learning_rate": 0.00019577350419262867, "loss": 0.1934, "step": 4340 }, { "epoch": 5.510462904248573, "grad_norm": 0.18374872207641602, "learning_rate": 0.0001957522409014327, "loss": 0.1944, "step": 4345 }, { "epoch": 5.516804058338618, "grad_norm": 0.18187780678272247, "learning_rate": 0.00019573092541766245, "loss": 0.1912, "step": 4350 }, { "epoch": 5.523145212428662, "grad_norm": 0.18066410720348358, "learning_rate": 0.0001957095577529366, "loss": 0.1897, "step": 4355 }, { "epoch": 5.529486366518706, "grad_norm": 0.1881483942270279, "learning_rate": 0.00019568813791890222, "loss": 0.1899, "step": 4360 }, { "epoch": 5.535827520608751, "grad_norm": 0.18128935992717743, "learning_rate": 0.00019566666592723482, "loss": 0.1926, "step": 4365 }, { "epoch": 5.542168674698795, "grad_norm": 0.18244339525699615, "learning_rate": 0.00019564514178963834, "loss": 0.1903, "step": 4370 }, { "epoch": 5.5485098287888395, "grad_norm": 0.19774708151817322, "learning_rate": 0.0001956235655178451, "loss": 0.1951, "step": 4375 }, { "epoch": 5.554850982878884, "grad_norm": 0.1943703293800354, "learning_rate": 0.00019560193712361596, "loss": 0.1931, "step": 4380 }, { "epoch": 5.561192136968929, "grad_norm": 0.19823729991912842, "learning_rate": 0.0001955802566187401, "loss": 0.1917, "step": 4385 }, { "epoch": 5.5675332910589725, "grad_norm": 0.17945356667041779, "learning_rate": 0.00019555852401503503, "loss": 0.1919, "step": 4390 }, { "epoch": 5.573874445149017, "grad_norm": 0.20048396289348602, "learning_rate": 0.0001955367393243468, "loss": 0.1969, "step": 4395 }, { "epoch": 5.580215599239062, "grad_norm": 0.18237970769405365, "learning_rate": 0.00019551490255854986, "loss": 0.193, "step": 4400 }, { "epoch": 5.586556753329106, "grad_norm": 0.1843222975730896, "learning_rate": 0.00019549301372954688, "loss": 0.1871, "step": 4405 }, { "epoch": 5.59289790741915, "grad_norm": 0.18752700090408325, "learning_rate": 0.00019547107284926906, "loss": 0.1939, "step": 4410 }, { "epoch": 5.599239061509195, "grad_norm": 0.17793463170528412, "learning_rate": 0.0001954490799296759, "loss": 0.1928, "step": 4415 }, { "epoch": 5.605580215599239, "grad_norm": 0.20990607142448425, "learning_rate": 0.00019542703498275532, "loss": 0.1887, "step": 4420 }, { "epoch": 5.611921369689283, "grad_norm": 0.1844872087240219, "learning_rate": 0.00019540493802052352, "loss": 0.1908, "step": 4425 }, { "epoch": 5.618262523779328, "grad_norm": 0.17690935730934143, "learning_rate": 0.0001953827890550252, "loss": 0.1913, "step": 4430 }, { "epoch": 5.624603677869372, "grad_norm": 0.1593828797340393, "learning_rate": 0.00019536058809833316, "loss": 0.192, "step": 4435 }, { "epoch": 5.630944831959416, "grad_norm": 0.17943666875362396, "learning_rate": 0.00019533833516254876, "loss": 0.1892, "step": 4440 }, { "epoch": 5.637285986049461, "grad_norm": 0.19799686968326569, "learning_rate": 0.00019531603025980162, "loss": 0.1898, "step": 4445 }, { "epoch": 5.643627140139506, "grad_norm": 0.20502060651779175, "learning_rate": 0.00019529367340224968, "loss": 0.1913, "step": 4450 }, { "epoch": 5.6499682942295495, "grad_norm": 0.185252845287323, "learning_rate": 0.00019527126460207921, "loss": 0.1909, "step": 4455 }, { "epoch": 5.656309448319594, "grad_norm": 0.18116562068462372, "learning_rate": 0.00019524880387150476, "loss": 0.1917, "step": 4460 }, { "epoch": 5.662650602409639, "grad_norm": 0.18006958067417145, "learning_rate": 0.00019522629122276924, "loss": 0.1919, "step": 4465 }, { "epoch": 5.668991756499683, "grad_norm": 0.1922648549079895, "learning_rate": 0.00019520372666814378, "loss": 0.1916, "step": 4470 }, { "epoch": 5.675332910589727, "grad_norm": 0.18379157781600952, "learning_rate": 0.00019518111021992794, "loss": 0.1909, "step": 4475 }, { "epoch": 5.681674064679772, "grad_norm": 0.18128110468387604, "learning_rate": 0.00019515844189044938, "loss": 0.1908, "step": 4480 }, { "epoch": 5.6880152187698165, "grad_norm": 0.19395412504673004, "learning_rate": 0.00019513572169206422, "loss": 0.189, "step": 4485 }, { "epoch": 5.69435637285986, "grad_norm": 0.17345359921455383, "learning_rate": 0.00019511294963715675, "loss": 0.186, "step": 4490 }, { "epoch": 5.700697526949905, "grad_norm": 0.16492041945457458, "learning_rate": 0.00019509012573813952, "loss": 0.1892, "step": 4495 }, { "epoch": 5.70703868103995, "grad_norm": 0.18284818530082703, "learning_rate": 0.00019506725000745337, "loss": 0.1911, "step": 4500 }, { "epoch": 5.713379835129993, "grad_norm": 0.17734254896640778, "learning_rate": 0.00019504432245756744, "loss": 0.1938, "step": 4505 }, { "epoch": 5.719720989220038, "grad_norm": 0.19504351913928986, "learning_rate": 0.00019502134310097903, "loss": 0.1886, "step": 4510 }, { "epoch": 5.726062143310083, "grad_norm": 0.18341602385044098, "learning_rate": 0.0001949983119502137, "loss": 0.1912, "step": 4515 }, { "epoch": 5.732403297400126, "grad_norm": 0.17228391766548157, "learning_rate": 0.00019497522901782528, "loss": 0.1929, "step": 4520 }, { "epoch": 5.738744451490171, "grad_norm": 0.19264021515846252, "learning_rate": 0.00019495209431639578, "loss": 0.189, "step": 4525 }, { "epoch": 5.745085605580216, "grad_norm": 0.18831369280815125, "learning_rate": 0.00019492890785853545, "loss": 0.1883, "step": 4530 }, { "epoch": 5.7514267596702595, "grad_norm": 0.1889246255159378, "learning_rate": 0.0001949056696568828, "loss": 0.1923, "step": 4535 }, { "epoch": 5.757767913760304, "grad_norm": 0.1777929812669754, "learning_rate": 0.00019488237972410444, "loss": 0.1855, "step": 4540 }, { "epoch": 5.764109067850349, "grad_norm": 0.18392111361026764, "learning_rate": 0.00019485903807289529, "loss": 0.1912, "step": 4545 }, { "epoch": 5.7704502219403935, "grad_norm": 0.17921772599220276, "learning_rate": 0.0001948356447159783, "loss": 0.1906, "step": 4550 }, { "epoch": 5.776791376030437, "grad_norm": 0.18110308051109314, "learning_rate": 0.0001948121996661048, "loss": 0.1967, "step": 4555 }, { "epoch": 5.783132530120482, "grad_norm": 0.17326200008392334, "learning_rate": 0.00019478870293605416, "loss": 0.1899, "step": 4560 }, { "epoch": 5.7894736842105265, "grad_norm": 0.16777203977108002, "learning_rate": 0.000194765154538634, "loss": 0.1918, "step": 4565 }, { "epoch": 5.79581483830057, "grad_norm": 0.18985359370708466, "learning_rate": 0.00019474155448668, "loss": 0.1902, "step": 4570 }, { "epoch": 5.802155992390615, "grad_norm": 0.18623381853103638, "learning_rate": 0.00019471790279305615, "loss": 0.1929, "step": 4575 }, { "epoch": 5.80849714648066, "grad_norm": 0.19313745200634003, "learning_rate": 0.00019469419947065446, "loss": 0.1895, "step": 4580 }, { "epoch": 5.814838300570704, "grad_norm": 0.18372663855552673, "learning_rate": 0.00019467044453239508, "loss": 0.1928, "step": 4585 }, { "epoch": 5.821179454660748, "grad_norm": 0.1851940006017685, "learning_rate": 0.0001946466379912264, "loss": 0.1875, "step": 4590 }, { "epoch": 5.827520608750793, "grad_norm": 0.1928340345621109, "learning_rate": 0.00019462277986012486, "loss": 0.1914, "step": 4595 }, { "epoch": 5.833861762840837, "grad_norm": 0.20064860582351685, "learning_rate": 0.00019459887015209503, "loss": 0.1919, "step": 4600 }, { "epoch": 5.840202916930881, "grad_norm": 0.16894268989562988, "learning_rate": 0.0001945749088801696, "loss": 0.1908, "step": 4605 }, { "epoch": 5.846544071020926, "grad_norm": 0.19123098254203796, "learning_rate": 0.00019455089605740936, "loss": 0.1882, "step": 4610 }, { "epoch": 5.85288522511097, "grad_norm": 0.16322028636932373, "learning_rate": 0.0001945268316969032, "loss": 0.1919, "step": 4615 }, { "epoch": 5.859226379201015, "grad_norm": 0.19259871542453766, "learning_rate": 0.0001945027158117681, "loss": 0.1881, "step": 4620 }, { "epoch": 5.865567533291059, "grad_norm": 0.1822081059217453, "learning_rate": 0.0001944785484151492, "loss": 0.1926, "step": 4625 }, { "epoch": 5.8719086873811035, "grad_norm": 0.1963387429714203, "learning_rate": 0.00019445432952021956, "loss": 0.1879, "step": 4630 }, { "epoch": 5.878249841471147, "grad_norm": 0.17558357119560242, "learning_rate": 0.00019443005914018043, "loss": 0.1904, "step": 4635 }, { "epoch": 5.884590995561192, "grad_norm": 0.18023225665092468, "learning_rate": 0.00019440573728826116, "loss": 0.1909, "step": 4640 }, { "epoch": 5.890932149651237, "grad_norm": 0.19065885245800018, "learning_rate": 0.00019438136397771896, "loss": 0.1884, "step": 4645 }, { "epoch": 5.897273303741281, "grad_norm": 0.17930124700069427, "learning_rate": 0.00019435693922183935, "loss": 0.1918, "step": 4650 }, { "epoch": 5.903614457831325, "grad_norm": 0.1738729327917099, "learning_rate": 0.00019433246303393568, "loss": 0.1874, "step": 4655 }, { "epoch": 5.90995561192137, "grad_norm": 0.18886950612068176, "learning_rate": 0.00019430793542734944, "loss": 0.1885, "step": 4660 }, { "epoch": 5.916296766011414, "grad_norm": 0.16737911105155945, "learning_rate": 0.00019428335641545011, "loss": 0.1889, "step": 4665 }, { "epoch": 5.922637920101458, "grad_norm": 0.18907523155212402, "learning_rate": 0.00019425872601163527, "loss": 0.1862, "step": 4670 }, { "epoch": 5.928979074191503, "grad_norm": 0.21261325478553772, "learning_rate": 0.00019423404422933035, "loss": 0.1917, "step": 4675 }, { "epoch": 5.935320228281547, "grad_norm": 0.17045345902442932, "learning_rate": 0.00019420931108198893, "loss": 0.189, "step": 4680 }, { "epoch": 5.941661382371592, "grad_norm": 0.1847270429134369, "learning_rate": 0.00019418452658309253, "loss": 0.1923, "step": 4685 }, { "epoch": 5.948002536461636, "grad_norm": 0.17452391982078552, "learning_rate": 0.0001941596907461507, "loss": 0.1941, "step": 4690 }, { "epoch": 5.9543436905516804, "grad_norm": 0.16806824505329132, "learning_rate": 0.00019413480358470087, "loss": 0.1892, "step": 4695 }, { "epoch": 5.960684844641725, "grad_norm": 0.19507285952568054, "learning_rate": 0.0001941098651123086, "loss": 0.1883, "step": 4700 }, { "epoch": 5.967025998731769, "grad_norm": 0.1816890686750412, "learning_rate": 0.0001940848753425673, "loss": 0.1909, "step": 4705 }, { "epoch": 5.9733671528218135, "grad_norm": 0.1882811039686203, "learning_rate": 0.00019405983428909837, "loss": 0.1854, "step": 4710 }, { "epoch": 5.979708306911858, "grad_norm": 0.16969385743141174, "learning_rate": 0.0001940347419655512, "loss": 0.1873, "step": 4715 }, { "epoch": 5.986049461001903, "grad_norm": 0.17581771314144135, "learning_rate": 0.00019400959838560306, "loss": 0.1927, "step": 4720 }, { "epoch": 5.992390615091947, "grad_norm": 0.16098125278949738, "learning_rate": 0.00019398440356295925, "loss": 0.1892, "step": 4725 }, { "epoch": 5.998731769181991, "grad_norm": 0.20809268951416016, "learning_rate": 0.0001939591575113529, "loss": 0.185, "step": 4730 }, { "epoch": 6.0, "eval_loss": 0.30793845653533936, "eval_runtime": 1444.4432, "eval_samples_per_second": 1.092, "eval_steps_per_second": 1.092, "step": 4731 }, { "epoch": 6.005072923272036, "grad_norm": 0.17869412899017334, "learning_rate": 0.00019393386024454511, "loss": 0.1698, "step": 4735 }, { "epoch": 6.01141407736208, "grad_norm": 0.21704988181591034, "learning_rate": 0.00019390851177632497, "loss": 0.1672, "step": 4740 }, { "epoch": 6.017755231452124, "grad_norm": 0.20210865139961243, "learning_rate": 0.0001938831121205093, "loss": 0.1629, "step": 4745 }, { "epoch": 6.024096385542169, "grad_norm": 0.1876503974199295, "learning_rate": 0.00019385766129094304, "loss": 0.1653, "step": 4750 }, { "epoch": 6.030437539632213, "grad_norm": 0.1624627411365509, "learning_rate": 0.00019383215930149884, "loss": 0.1617, "step": 4755 }, { "epoch": 6.036778693722257, "grad_norm": 0.19057364761829376, "learning_rate": 0.00019380660616607736, "loss": 0.1667, "step": 4760 }, { "epoch": 6.043119847812302, "grad_norm": 0.20077063143253326, "learning_rate": 0.00019378100189860705, "loss": 0.1682, "step": 4765 }, { "epoch": 6.049461001902346, "grad_norm": 0.21835148334503174, "learning_rate": 0.00019375534651304424, "loss": 0.164, "step": 4770 }, { "epoch": 6.0558021559923905, "grad_norm": 0.20760078728199005, "learning_rate": 0.00019372964002337318, "loss": 0.1617, "step": 4775 }, { "epoch": 6.062143310082435, "grad_norm": 0.18048901855945587, "learning_rate": 0.00019370388244360602, "loss": 0.165, "step": 4780 }, { "epoch": 6.06848446417248, "grad_norm": 0.23527438938617706, "learning_rate": 0.00019367807378778258, "loss": 0.1744, "step": 4785 }, { "epoch": 6.0748256182625235, "grad_norm": 0.1975928246974945, "learning_rate": 0.00019365221406997068, "loss": 0.1716, "step": 4790 }, { "epoch": 6.081166772352568, "grad_norm": 0.1958836168050766, "learning_rate": 0.00019362630330426592, "loss": 0.169, "step": 4795 }, { "epoch": 6.087507926442613, "grad_norm": 0.19794446229934692, "learning_rate": 0.0001936003415047917, "loss": 0.1688, "step": 4800 }, { "epoch": 6.093849080532657, "grad_norm": 0.20192603766918182, "learning_rate": 0.00019357432868569928, "loss": 0.1683, "step": 4805 }, { "epoch": 6.100190234622701, "grad_norm": 0.22243689000606537, "learning_rate": 0.0001935482648611677, "loss": 0.1666, "step": 4810 }, { "epoch": 6.106531388712746, "grad_norm": 0.20195354521274567, "learning_rate": 0.00019352215004540382, "loss": 0.1672, "step": 4815 }, { "epoch": 6.11287254280279, "grad_norm": 0.20959386229515076, "learning_rate": 0.0001934959842526423, "loss": 0.1686, "step": 4820 }, { "epoch": 6.119213696892834, "grad_norm": 0.19737522304058075, "learning_rate": 0.00019346976749714558, "loss": 0.1667, "step": 4825 }, { "epoch": 6.125554850982879, "grad_norm": 0.19156497716903687, "learning_rate": 0.00019344349979320385, "loss": 0.1697, "step": 4830 }, { "epoch": 6.131896005072924, "grad_norm": 0.19894227385520935, "learning_rate": 0.00019341718115513506, "loss": 0.1684, "step": 4835 }, { "epoch": 6.138237159162967, "grad_norm": 0.1929052770137787, "learning_rate": 0.00019339081159728508, "loss": 0.1702, "step": 4840 }, { "epoch": 6.144578313253012, "grad_norm": 0.19585652649402618, "learning_rate": 0.00019336439113402728, "loss": 0.1738, "step": 4845 }, { "epoch": 6.150919467343057, "grad_norm": 0.19753801822662354, "learning_rate": 0.00019333791977976298, "loss": 0.1704, "step": 4850 }, { "epoch": 6.1572606214331005, "grad_norm": 0.2030104398727417, "learning_rate": 0.00019331139754892113, "loss": 0.1656, "step": 4855 }, { "epoch": 6.163601775523145, "grad_norm": 0.19860798120498657, "learning_rate": 0.0001932848244559585, "loss": 0.1677, "step": 4860 }, { "epoch": 6.16994292961319, "grad_norm": 0.17280183732509613, "learning_rate": 0.0001932582005153595, "loss": 0.1664, "step": 4865 }, { "epoch": 6.176284083703234, "grad_norm": 0.19971050322055817, "learning_rate": 0.0001932315257416363, "loss": 0.1661, "step": 4870 }, { "epoch": 6.182625237793278, "grad_norm": 0.18800394237041473, "learning_rate": 0.00019320480014932878, "loss": 0.1692, "step": 4875 }, { "epoch": 6.188966391883323, "grad_norm": 0.19015033543109894, "learning_rate": 0.0001931780237530045, "loss": 0.17, "step": 4880 }, { "epoch": 6.1953075459733675, "grad_norm": 0.1927490085363388, "learning_rate": 0.00019315119656725872, "loss": 0.1677, "step": 4885 }, { "epoch": 6.201648700063411, "grad_norm": 0.2142195850610733, "learning_rate": 0.00019312431860671442, "loss": 0.1732, "step": 4890 }, { "epoch": 6.207989854153456, "grad_norm": 0.19102154672145844, "learning_rate": 0.00019309738988602216, "loss": 0.1752, "step": 4895 }, { "epoch": 6.214331008243501, "grad_norm": 0.20553378760814667, "learning_rate": 0.00019307041041986028, "loss": 0.1685, "step": 4900 }, { "epoch": 6.220672162333544, "grad_norm": 0.19381991028785706, "learning_rate": 0.00019304338022293468, "loss": 0.1719, "step": 4905 }, { "epoch": 6.227013316423589, "grad_norm": 0.20748937129974365, "learning_rate": 0.00019301629930997907, "loss": 0.1706, "step": 4910 }, { "epoch": 6.233354470513634, "grad_norm": 0.21351176500320435, "learning_rate": 0.00019298916769575457, "loss": 0.1748, "step": 4915 }, { "epoch": 6.239695624603678, "grad_norm": 0.2113640010356903, "learning_rate": 0.00019296198539505013, "loss": 0.1723, "step": 4920 }, { "epoch": 6.246036778693722, "grad_norm": 0.19443054497241974, "learning_rate": 0.00019293475242268223, "loss": 0.1692, "step": 4925 }, { "epoch": 6.252377932783767, "grad_norm": 0.1807393878698349, "learning_rate": 0.00019290746879349507, "loss": 0.1663, "step": 4930 }, { "epoch": 6.258719086873811, "grad_norm": 0.19841726124286652, "learning_rate": 0.00019288013452236036, "loss": 0.1742, "step": 4935 }, { "epoch": 6.265060240963855, "grad_norm": 0.18737393617630005, "learning_rate": 0.00019285274962417738, "loss": 0.1721, "step": 4940 }, { "epoch": 6.2714013950539, "grad_norm": 0.1896350234746933, "learning_rate": 0.00019282531411387316, "loss": 0.1693, "step": 4945 }, { "epoch": 6.2777425491439445, "grad_norm": 0.2064460963010788, "learning_rate": 0.00019279782800640222, "loss": 0.1719, "step": 4950 }, { "epoch": 6.284083703233988, "grad_norm": 0.19095759093761444, "learning_rate": 0.00019277029131674664, "loss": 0.1731, "step": 4955 }, { "epoch": 6.290424857324033, "grad_norm": 0.19461022317409515, "learning_rate": 0.0001927427040599161, "loss": 0.1709, "step": 4960 }, { "epoch": 6.2967660114140775, "grad_norm": 0.19708266854286194, "learning_rate": 0.0001927150662509479, "loss": 0.1761, "step": 4965 }, { "epoch": 6.303107165504121, "grad_norm": 0.18199646472930908, "learning_rate": 0.00019268737790490678, "loss": 0.1722, "step": 4970 }, { "epoch": 6.309448319594166, "grad_norm": 0.19694223999977112, "learning_rate": 0.00019265963903688512, "loss": 0.1721, "step": 4975 }, { "epoch": 6.315789473684211, "grad_norm": 0.20176419615745544, "learning_rate": 0.00019263184966200278, "loss": 0.1707, "step": 4980 }, { "epoch": 6.322130627774255, "grad_norm": 0.18807032704353333, "learning_rate": 0.00019260400979540722, "loss": 0.1691, "step": 4985 }, { "epoch": 6.328471781864299, "grad_norm": 0.2108924686908722, "learning_rate": 0.00019257611945227332, "loss": 0.1685, "step": 4990 }, { "epoch": 6.334812935954344, "grad_norm": 0.1884462982416153, "learning_rate": 0.00019254817864780357, "loss": 0.172, "step": 4995 }, { "epoch": 6.341154090044388, "grad_norm": 0.18567998707294464, "learning_rate": 0.0001925201873972279, "loss": 0.1732, "step": 5000 }, { "epoch": 6.347495244134432, "grad_norm": 0.22390568256378174, "learning_rate": 0.00019249214571580382, "loss": 0.1723, "step": 5005 }, { "epoch": 6.353836398224477, "grad_norm": 0.2352142184972763, "learning_rate": 0.00019246405361881622, "loss": 0.1714, "step": 5010 }, { "epoch": 6.360177552314521, "grad_norm": 0.18142126500606537, "learning_rate": 0.00019243591112157751, "loss": 0.1704, "step": 5015 }, { "epoch": 6.366518706404566, "grad_norm": 0.17922544479370117, "learning_rate": 0.00019240771823942764, "loss": 0.1724, "step": 5020 }, { "epoch": 6.37285986049461, "grad_norm": 0.1944347769021988, "learning_rate": 0.00019237947498773394, "loss": 0.1699, "step": 5025 }, { "epoch": 6.3792010145846545, "grad_norm": 0.19995710253715515, "learning_rate": 0.00019235118138189123, "loss": 0.1695, "step": 5030 }, { "epoch": 6.385542168674699, "grad_norm": 0.197807177901268, "learning_rate": 0.00019232283743732175, "loss": 0.1736, "step": 5035 }, { "epoch": 6.391883322764743, "grad_norm": 0.20247666537761688, "learning_rate": 0.00019229444316947524, "loss": 0.174, "step": 5040 }, { "epoch": 6.398224476854788, "grad_norm": 0.18536652624607086, "learning_rate": 0.0001922659985938288, "loss": 0.1723, "step": 5045 }, { "epoch": 6.404565630944832, "grad_norm": 0.1977190375328064, "learning_rate": 0.000192237503725887, "loss": 0.1733, "step": 5050 }, { "epoch": 6.410906785034876, "grad_norm": 0.18622733652591705, "learning_rate": 0.0001922089585811818, "loss": 0.168, "step": 5055 }, { "epoch": 6.417247939124921, "grad_norm": 0.18643243610858917, "learning_rate": 0.00019218036317527256, "loss": 0.1724, "step": 5060 }, { "epoch": 6.423589093214965, "grad_norm": 0.1913641095161438, "learning_rate": 0.0001921517175237461, "loss": 0.1735, "step": 5065 }, { "epoch": 6.429930247305009, "grad_norm": 0.1839703917503357, "learning_rate": 0.00019212302164221646, "loss": 0.1725, "step": 5070 }, { "epoch": 6.436271401395054, "grad_norm": 0.19428937137126923, "learning_rate": 0.00019209427554632525, "loss": 0.1743, "step": 5075 }, { "epoch": 6.442612555485098, "grad_norm": 0.20090366899967194, "learning_rate": 0.00019206547925174143, "loss": 0.1739, "step": 5080 }, { "epoch": 6.448953709575143, "grad_norm": 0.20825113356113434, "learning_rate": 0.00019203663277416117, "loss": 0.1744, "step": 5085 }, { "epoch": 6.455294863665187, "grad_norm": 0.20592446625232697, "learning_rate": 0.00019200773612930812, "loss": 0.1779, "step": 5090 }, { "epoch": 6.461636017755231, "grad_norm": 0.19911469519138336, "learning_rate": 0.00019197878933293328, "loss": 0.1742, "step": 5095 }, { "epoch": 6.467977171845276, "grad_norm": 0.20088796317577362, "learning_rate": 0.0001919497924008149, "loss": 0.172, "step": 5100 }, { "epoch": 6.47431832593532, "grad_norm": 0.19776076078414917, "learning_rate": 0.00019192074534875864, "loss": 0.1715, "step": 5105 }, { "epoch": 6.4806594800253645, "grad_norm": 0.19061751663684845, "learning_rate": 0.0001918916481925974, "loss": 0.1718, "step": 5110 }, { "epoch": 6.487000634115409, "grad_norm": 0.20752453804016113, "learning_rate": 0.00019186250094819157, "loss": 0.1758, "step": 5115 }, { "epoch": 6.493341788205454, "grad_norm": 0.19058051705360413, "learning_rate": 0.0001918333036314286, "loss": 0.1717, "step": 5120 }, { "epoch": 6.499682942295498, "grad_norm": 0.20934075117111206, "learning_rate": 0.00019180405625822332, "loss": 0.1729, "step": 5125 }, { "epoch": 6.506024096385542, "grad_norm": 0.18336834013462067, "learning_rate": 0.000191774758844518, "loss": 0.1775, "step": 5130 }, { "epoch": 6.512365250475587, "grad_norm": 0.18896086513996124, "learning_rate": 0.00019174541140628195, "loss": 0.1677, "step": 5135 }, { "epoch": 6.518706404565631, "grad_norm": 0.1955013871192932, "learning_rate": 0.00019171601395951188, "loss": 0.1767, "step": 5140 }, { "epoch": 6.525047558655675, "grad_norm": 0.205348938703537, "learning_rate": 0.00019168656652023173, "loss": 0.1729, "step": 5145 }, { "epoch": 6.53138871274572, "grad_norm": 0.21883191168308258, "learning_rate": 0.00019165706910449272, "loss": 0.1726, "step": 5150 }, { "epoch": 6.537729866835764, "grad_norm": 0.19722765684127808, "learning_rate": 0.00019162752172837327, "loss": 0.1721, "step": 5155 }, { "epoch": 6.544071020925808, "grad_norm": 0.20495383441448212, "learning_rate": 0.00019159792440797904, "loss": 0.1717, "step": 5160 }, { "epoch": 6.550412175015853, "grad_norm": 0.18866901099681854, "learning_rate": 0.00019156827715944292, "loss": 0.1718, "step": 5165 }, { "epoch": 6.556753329105897, "grad_norm": 0.19127093255519867, "learning_rate": 0.00019153857999892502, "loss": 0.1695, "step": 5170 }, { "epoch": 6.5630944831959415, "grad_norm": 0.1837748885154724, "learning_rate": 0.00019150883294261263, "loss": 0.1745, "step": 5175 }, { "epoch": 6.569435637285986, "grad_norm": 0.205242320895195, "learning_rate": 0.00019147903600672027, "loss": 0.1751, "step": 5180 }, { "epoch": 6.575776791376031, "grad_norm": 0.20338760316371918, "learning_rate": 0.00019144918920748963, "loss": 0.1726, "step": 5185 }, { "epoch": 6.5821179454660745, "grad_norm": 0.18401671946048737, "learning_rate": 0.00019141929256118962, "loss": 0.1715, "step": 5190 }, { "epoch": 6.588459099556119, "grad_norm": 0.1876956820487976, "learning_rate": 0.00019138934608411626, "loss": 0.1766, "step": 5195 }, { "epoch": 6.594800253646164, "grad_norm": 0.19667501747608185, "learning_rate": 0.00019135934979259273, "loss": 0.1737, "step": 5200 }, { "epoch": 6.601141407736208, "grad_norm": 0.18518556654453278, "learning_rate": 0.00019132930370296947, "loss": 0.1697, "step": 5205 }, { "epoch": 6.607482561826252, "grad_norm": 0.20856086909770966, "learning_rate": 0.0001912992078316239, "loss": 0.1701, "step": 5210 }, { "epoch": 6.613823715916297, "grad_norm": 0.19420833885669708, "learning_rate": 0.0001912690621949607, "loss": 0.1745, "step": 5215 }, { "epoch": 6.620164870006342, "grad_norm": 0.20611417293548584, "learning_rate": 0.00019123886680941162, "loss": 0.1719, "step": 5220 }, { "epoch": 6.626506024096385, "grad_norm": 0.18236397206783295, "learning_rate": 0.00019120862169143556, "loss": 0.1665, "step": 5225 }, { "epoch": 6.63284717818643, "grad_norm": 0.19114331901073456, "learning_rate": 0.0001911783268575185, "loss": 0.1718, "step": 5230 }, { "epoch": 6.639188332276475, "grad_norm": 0.18286781013011932, "learning_rate": 0.00019114798232417355, "loss": 0.1723, "step": 5235 }, { "epoch": 6.645529486366518, "grad_norm": 0.19397655129432678, "learning_rate": 0.00019111758810794085, "loss": 0.1701, "step": 5240 }, { "epoch": 6.651870640456563, "grad_norm": 0.19499529898166656, "learning_rate": 0.0001910871442253877, "loss": 0.176, "step": 5245 }, { "epoch": 6.658211794546608, "grad_norm": 0.18763020634651184, "learning_rate": 0.0001910566506931084, "loss": 0.1741, "step": 5250 }, { "epoch": 6.6645529486366515, "grad_norm": 0.20211605727672577, "learning_rate": 0.0001910261075277244, "loss": 0.1764, "step": 5255 }, { "epoch": 6.670894102726696, "grad_norm": 0.18774905800819397, "learning_rate": 0.00019099551474588406, "loss": 0.1729, "step": 5260 }, { "epoch": 6.677235256816741, "grad_norm": 0.18556103110313416, "learning_rate": 0.00019096487236426298, "loss": 0.1758, "step": 5265 }, { "epoch": 6.683576410906785, "grad_norm": 0.1864437758922577, "learning_rate": 0.00019093418039956359, "loss": 0.1723, "step": 5270 }, { "epoch": 6.689917564996829, "grad_norm": 0.19958314299583435, "learning_rate": 0.00019090343886851553, "loss": 0.1726, "step": 5275 }, { "epoch": 6.696258719086874, "grad_norm": 0.19806356728076935, "learning_rate": 0.00019087264778787534, "loss": 0.1687, "step": 5280 }, { "epoch": 6.7025998731769185, "grad_norm": 0.1977902352809906, "learning_rate": 0.0001908418071744266, "loss": 0.1754, "step": 5285 }, { "epoch": 6.708941027266962, "grad_norm": 0.19435393810272217, "learning_rate": 0.00019081091704497994, "loss": 0.1766, "step": 5290 }, { "epoch": 6.715282181357007, "grad_norm": 0.2176521271467209, "learning_rate": 0.00019077997741637282, "loss": 0.1727, "step": 5295 }, { "epoch": 6.721623335447052, "grad_norm": 0.1980164647102356, "learning_rate": 0.00019074898830546993, "loss": 0.1724, "step": 5300 }, { "epoch": 6.727964489537095, "grad_norm": 0.19032271206378937, "learning_rate": 0.00019071794972916272, "loss": 0.1755, "step": 5305 }, { "epoch": 6.73430564362714, "grad_norm": 0.19116751849651337, "learning_rate": 0.0001906868617043697, "loss": 0.1748, "step": 5310 }, { "epoch": 6.740646797717185, "grad_norm": 0.21168269217014313, "learning_rate": 0.00019065572424803627, "loss": 0.1719, "step": 5315 }, { "epoch": 6.746987951807229, "grad_norm": 0.18788640201091766, "learning_rate": 0.00019062453737713487, "loss": 0.1763, "step": 5320 }, { "epoch": 6.753329105897273, "grad_norm": 0.20749929547309875, "learning_rate": 0.0001905933011086648, "loss": 0.168, "step": 5325 }, { "epoch": 6.759670259987318, "grad_norm": 0.18825682997703552, "learning_rate": 0.0001905620154596523, "loss": 0.1755, "step": 5330 }, { "epoch": 6.766011414077362, "grad_norm": 0.184580460190773, "learning_rate": 0.00019053068044715056, "loss": 0.1737, "step": 5335 }, { "epoch": 6.772352568167406, "grad_norm": 0.19419164955615997, "learning_rate": 0.00019049929608823963, "loss": 0.1726, "step": 5340 }, { "epoch": 6.778693722257451, "grad_norm": 0.18139266967773438, "learning_rate": 0.00019046786240002646, "loss": 0.1695, "step": 5345 }, { "epoch": 6.7850348763474955, "grad_norm": 0.18364496529102325, "learning_rate": 0.00019043637939964494, "loss": 0.1726, "step": 5350 }, { "epoch": 6.79137603043754, "grad_norm": 0.234736829996109, "learning_rate": 0.0001904048471042558, "loss": 0.1702, "step": 5355 }, { "epoch": 6.797717184527584, "grad_norm": 0.18215778470039368, "learning_rate": 0.00019037326553104661, "loss": 0.1735, "step": 5360 }, { "epoch": 6.8040583386176285, "grad_norm": 0.17532981932163239, "learning_rate": 0.00019034163469723188, "loss": 0.1716, "step": 5365 }, { "epoch": 6.810399492707672, "grad_norm": 0.21182258427143097, "learning_rate": 0.0001903099546200529, "loss": 0.1757, "step": 5370 }, { "epoch": 6.816740646797717, "grad_norm": 0.1960773915052414, "learning_rate": 0.00019027822531677782, "loss": 0.1722, "step": 5375 }, { "epoch": 6.823081800887762, "grad_norm": 0.17683179676532745, "learning_rate": 0.00019024644680470164, "loss": 0.169, "step": 5380 }, { "epoch": 6.829422954977806, "grad_norm": 0.2136959284543991, "learning_rate": 0.0001902146191011462, "loss": 0.1734, "step": 5385 }, { "epoch": 6.83576410906785, "grad_norm": 0.22185508906841278, "learning_rate": 0.00019018274222346008, "loss": 0.1722, "step": 5390 }, { "epoch": 6.842105263157895, "grad_norm": 0.2034195214509964, "learning_rate": 0.00019015081618901873, "loss": 0.1755, "step": 5395 }, { "epoch": 6.848446417247939, "grad_norm": 0.18836680054664612, "learning_rate": 0.00019011884101522437, "loss": 0.1716, "step": 5400 }, { "epoch": 6.854787571337983, "grad_norm": 0.22345790266990662, "learning_rate": 0.00019008681671950603, "loss": 0.1749, "step": 5405 }, { "epoch": 6.861128725428028, "grad_norm": 0.17723453044891357, "learning_rate": 0.00019005474331931947, "loss": 0.1748, "step": 5410 }, { "epoch": 6.867469879518072, "grad_norm": 0.19350096583366394, "learning_rate": 0.00019002262083214725, "loss": 0.1757, "step": 5415 }, { "epoch": 6.873811033608117, "grad_norm": 0.1992489993572235, "learning_rate": 0.00018999044927549866, "loss": 0.1723, "step": 5420 }, { "epoch": 6.880152187698161, "grad_norm": 0.16827121376991272, "learning_rate": 0.0001899582286669098, "loss": 0.1745, "step": 5425 }, { "epoch": 6.8864933417882055, "grad_norm": 0.2135612964630127, "learning_rate": 0.00018992595902394337, "loss": 0.1757, "step": 5430 }, { "epoch": 6.89283449587825, "grad_norm": 0.18933072686195374, "learning_rate": 0.00018989364036418898, "loss": 0.1713, "step": 5435 }, { "epoch": 6.899175649968294, "grad_norm": 0.19649459421634674, "learning_rate": 0.00018986127270526282, "loss": 0.173, "step": 5440 }, { "epoch": 6.905516804058339, "grad_norm": 0.18803948163986206, "learning_rate": 0.0001898288560648078, "loss": 0.1717, "step": 5445 }, { "epoch": 6.911857958148383, "grad_norm": 0.1802651286125183, "learning_rate": 0.0001897963904604937, "loss": 0.1705, "step": 5450 }, { "epoch": 6.918199112238428, "grad_norm": 0.19817164540290833, "learning_rate": 0.0001897638759100167, "loss": 0.1687, "step": 5455 }, { "epoch": 6.924540266328472, "grad_norm": 0.19307886064052582, "learning_rate": 0.00018973131243109988, "loss": 0.1745, "step": 5460 }, { "epoch": 6.930881420418516, "grad_norm": 0.19789700210094452, "learning_rate": 0.00018969870004149287, "loss": 0.1735, "step": 5465 }, { "epoch": 6.937222574508561, "grad_norm": 0.21038688719272614, "learning_rate": 0.00018966603875897212, "loss": 0.1735, "step": 5470 }, { "epoch": 6.943563728598605, "grad_norm": 0.18712224066257477, "learning_rate": 0.00018963332860134053, "loss": 0.1695, "step": 5475 }, { "epoch": 6.949904882688649, "grad_norm": 0.2119869738817215, "learning_rate": 0.00018960056958642775, "loss": 0.1795, "step": 5480 }, { "epoch": 6.956246036778694, "grad_norm": 0.19623573124408722, "learning_rate": 0.00018956776173209005, "loss": 0.1732, "step": 5485 }, { "epoch": 6.962587190868738, "grad_norm": 0.1935732215642929, "learning_rate": 0.00018953490505621033, "loss": 0.1705, "step": 5490 }, { "epoch": 6.968928344958782, "grad_norm": 0.1920432299375534, "learning_rate": 0.00018950199957669807, "loss": 0.1737, "step": 5495 }, { "epoch": 6.975269499048827, "grad_norm": 0.196640744805336, "learning_rate": 0.0001894690453114894, "loss": 0.1754, "step": 5500 }, { "epoch": 6.981610653138871, "grad_norm": 0.21967598795890808, "learning_rate": 0.00018943604227854698, "loss": 0.1737, "step": 5505 }, { "epoch": 6.9879518072289155, "grad_norm": 0.18497136235237122, "learning_rate": 0.00018940299049586003, "loss": 0.1688, "step": 5510 }, { "epoch": 6.99429296131896, "grad_norm": 0.20006784796714783, "learning_rate": 0.00018936988998144452, "loss": 0.1707, "step": 5515 }, { "epoch": 6.999365884590995, "eval_loss": 0.3125212788581848, "eval_runtime": 1444.8796, "eval_samples_per_second": 1.091, "eval_steps_per_second": 1.091, "step": 5519 }, { "epoch": 7.000634115409005, "grad_norm": 0.1989293098449707, "learning_rate": 0.00018933674075334274, "loss": 0.1706, "step": 5520 }, { "epoch": 7.006975269499049, "grad_norm": 0.19384793937206268, "learning_rate": 0.00018930354282962374, "loss": 0.1471, "step": 5525 }, { "epoch": 7.013316423589093, "grad_norm": 0.2166214883327484, "learning_rate": 0.000189270296228383, "loss": 0.1511, "step": 5530 }, { "epoch": 7.019657577679138, "grad_norm": 0.18415993452072144, "learning_rate": 0.00018923700096774255, "loss": 0.1509, "step": 5535 }, { "epoch": 7.025998731769182, "grad_norm": 0.24180269241333008, "learning_rate": 0.00018920365706585096, "loss": 0.1473, "step": 5540 }, { "epoch": 7.032339885859226, "grad_norm": 0.20664559304714203, "learning_rate": 0.00018917026454088324, "loss": 0.1459, "step": 5545 }, { "epoch": 7.038681039949271, "grad_norm": 0.19181884825229645, "learning_rate": 0.00018913682341104105, "loss": 0.1446, "step": 5550 }, { "epoch": 7.045022194039315, "grad_norm": 0.19855549931526184, "learning_rate": 0.0001891033336945524, "loss": 0.1396, "step": 5555 }, { "epoch": 7.051363348129359, "grad_norm": 0.18266354501247406, "learning_rate": 0.00018906979540967193, "loss": 0.1456, "step": 5560 }, { "epoch": 7.057704502219404, "grad_norm": 0.21585144102573395, "learning_rate": 0.0001890362085746806, "loss": 0.1493, "step": 5565 }, { "epoch": 7.064045656309449, "grad_norm": 0.21209566295146942, "learning_rate": 0.0001890025732078859, "loss": 0.1478, "step": 5570 }, { "epoch": 7.0703868103994925, "grad_norm": 0.2531264126300812, "learning_rate": 0.0001889688893276218, "loss": 0.15, "step": 5575 }, { "epoch": 7.076727964489537, "grad_norm": 0.2012319266796112, "learning_rate": 0.0001889351569522487, "loss": 0.147, "step": 5580 }, { "epoch": 7.083069118579582, "grad_norm": 0.22575150430202484, "learning_rate": 0.00018890137610015338, "loss": 0.1494, "step": 5585 }, { "epoch": 7.0894102726696255, "grad_norm": 0.2081664502620697, "learning_rate": 0.0001888675467897491, "loss": 0.1516, "step": 5590 }, { "epoch": 7.09575142675967, "grad_norm": 0.22012066841125488, "learning_rate": 0.00018883366903947555, "loss": 0.1504, "step": 5595 }, { "epoch": 7.102092580849715, "grad_norm": 0.21323354542255402, "learning_rate": 0.00018879974286779876, "loss": 0.1498, "step": 5600 }, { "epoch": 7.108433734939759, "grad_norm": 0.2156241536140442, "learning_rate": 0.00018876576829321122, "loss": 0.1513, "step": 5605 }, { "epoch": 7.114774889029803, "grad_norm": 0.21562200784683228, "learning_rate": 0.00018873174533423176, "loss": 0.1485, "step": 5610 }, { "epoch": 7.121116043119848, "grad_norm": 0.18515565991401672, "learning_rate": 0.00018869767400940554, "loss": 0.1471, "step": 5615 }, { "epoch": 7.127457197209893, "grad_norm": 0.21599265933036804, "learning_rate": 0.00018866355433730421, "loss": 0.1508, "step": 5620 }, { "epoch": 7.133798351299936, "grad_norm": 0.20091302692890167, "learning_rate": 0.0001886293863365257, "loss": 0.1469, "step": 5625 }, { "epoch": 7.140139505389981, "grad_norm": 0.19008801877498627, "learning_rate": 0.00018859517002569421, "loss": 0.1507, "step": 5630 }, { "epoch": 7.146480659480026, "grad_norm": 0.22209949791431427, "learning_rate": 0.00018856090542346046, "loss": 0.151, "step": 5635 }, { "epoch": 7.152821813570069, "grad_norm": 0.21827732026576996, "learning_rate": 0.00018852659254850126, "loss": 0.1494, "step": 5640 }, { "epoch": 7.159162967660114, "grad_norm": 0.20055322349071503, "learning_rate": 0.00018849223141951992, "loss": 0.1459, "step": 5645 }, { "epoch": 7.165504121750159, "grad_norm": 0.21264611184597015, "learning_rate": 0.00018845782205524598, "loss": 0.1536, "step": 5650 }, { "epoch": 7.171845275840203, "grad_norm": 0.19409732520580292, "learning_rate": 0.00018842336447443526, "loss": 0.153, "step": 5655 }, { "epoch": 7.178186429930247, "grad_norm": 0.18462423980236053, "learning_rate": 0.00018838885869586986, "loss": 0.1503, "step": 5660 }, { "epoch": 7.184527584020292, "grad_norm": 0.19405901432037354, "learning_rate": 0.0001883543047383582, "loss": 0.1515, "step": 5665 }, { "epoch": 7.1908687381103364, "grad_norm": 0.2198774814605713, "learning_rate": 0.00018831970262073492, "loss": 0.151, "step": 5670 }, { "epoch": 7.19720989220038, "grad_norm": 0.20815542340278625, "learning_rate": 0.00018828505236186097, "loss": 0.1545, "step": 5675 }, { "epoch": 7.203551046290425, "grad_norm": 0.20187035202980042, "learning_rate": 0.00018825035398062337, "loss": 0.1542, "step": 5680 }, { "epoch": 7.2098922003804695, "grad_norm": 0.20356439054012299, "learning_rate": 0.00018821560749593562, "loss": 0.1488, "step": 5685 }, { "epoch": 7.216233354470513, "grad_norm": 0.21523556113243103, "learning_rate": 0.00018818081292673727, "loss": 0.1564, "step": 5690 }, { "epoch": 7.222574508560558, "grad_norm": 0.20374581217765808, "learning_rate": 0.0001881459702919941, "loss": 0.1501, "step": 5695 }, { "epoch": 7.228915662650603, "grad_norm": 0.2075379490852356, "learning_rate": 0.00018811107961069808, "loss": 0.1508, "step": 5700 }, { "epoch": 7.235256816740646, "grad_norm": 0.1995144486427307, "learning_rate": 0.0001880761409018675, "loss": 0.1507, "step": 5705 }, { "epoch": 7.241597970830691, "grad_norm": 0.2131168693304062, "learning_rate": 0.00018804115418454668, "loss": 0.1475, "step": 5710 }, { "epoch": 7.247939124920736, "grad_norm": 0.2024415135383606, "learning_rate": 0.00018800611947780615, "loss": 0.1519, "step": 5715 }, { "epoch": 7.25428027901078, "grad_norm": 0.21563947200775146, "learning_rate": 0.00018797103680074262, "loss": 0.1532, "step": 5720 }, { "epoch": 7.260621433100824, "grad_norm": 0.21354436874389648, "learning_rate": 0.0001879359061724789, "loss": 0.1501, "step": 5725 }, { "epoch": 7.266962587190869, "grad_norm": 0.20749351382255554, "learning_rate": 0.00018790072761216403, "loss": 0.1502, "step": 5730 }, { "epoch": 7.273303741280913, "grad_norm": 0.19911982119083405, "learning_rate": 0.00018786550113897307, "loss": 0.1512, "step": 5735 }, { "epoch": 7.279644895370957, "grad_norm": 0.2475031018257141, "learning_rate": 0.0001878302267721073, "loss": 0.1506, "step": 5740 }, { "epoch": 7.285986049461002, "grad_norm": 0.2125454992055893, "learning_rate": 0.000187794904530794, "loss": 0.1475, "step": 5745 }, { "epoch": 7.2923272035510465, "grad_norm": 0.22095218300819397, "learning_rate": 0.0001877595344342866, "loss": 0.153, "step": 5750 }, { "epoch": 7.298668357641091, "grad_norm": 0.19597214460372925, "learning_rate": 0.00018772411650186464, "loss": 0.1551, "step": 5755 }, { "epoch": 7.305009511731135, "grad_norm": 0.21204133331775665, "learning_rate": 0.00018768865075283366, "loss": 0.1525, "step": 5760 }, { "epoch": 7.3113506658211795, "grad_norm": 0.19843082129955292, "learning_rate": 0.00018765313720652538, "loss": 0.1562, "step": 5765 }, { "epoch": 7.317691819911224, "grad_norm": 0.24536220729351044, "learning_rate": 0.00018761757588229742, "loss": 0.1529, "step": 5770 }, { "epoch": 7.324032974001268, "grad_norm": 0.20897875726222992, "learning_rate": 0.00018758196679953358, "loss": 0.1522, "step": 5775 }, { "epoch": 7.330374128091313, "grad_norm": 0.20354749262332916, "learning_rate": 0.00018754630997764365, "loss": 0.1553, "step": 5780 }, { "epoch": 7.336715282181357, "grad_norm": 0.20935474336147308, "learning_rate": 0.00018751060543606337, "loss": 0.1514, "step": 5785 }, { "epoch": 7.343056436271401, "grad_norm": 0.19947344064712524, "learning_rate": 0.0001874748531942546, "loss": 0.1547, "step": 5790 }, { "epoch": 7.349397590361446, "grad_norm": 0.1891491711139679, "learning_rate": 0.00018743905327170515, "loss": 0.1535, "step": 5795 }, { "epoch": 7.35573874445149, "grad_norm": 0.2123071551322937, "learning_rate": 0.00018740320568792876, "loss": 0.1538, "step": 5800 }, { "epoch": 7.362079898541534, "grad_norm": 0.20945200324058533, "learning_rate": 0.00018736731046246528, "loss": 0.1533, "step": 5805 }, { "epoch": 7.368421052631579, "grad_norm": 0.19436801970005035, "learning_rate": 0.0001873313676148804, "loss": 0.1548, "step": 5810 }, { "epoch": 7.374762206721623, "grad_norm": 0.21055305004119873, "learning_rate": 0.00018729537716476592, "loss": 0.1527, "step": 5815 }, { "epoch": 7.381103360811668, "grad_norm": 0.21154020726680756, "learning_rate": 0.00018725933913173938, "loss": 0.1553, "step": 5820 }, { "epoch": 7.387444514901712, "grad_norm": 0.1909535974264145, "learning_rate": 0.00018722325353544444, "loss": 0.156, "step": 5825 }, { "epoch": 7.3937856689917565, "grad_norm": 0.2075398713350296, "learning_rate": 0.0001871871203955506, "loss": 0.1518, "step": 5830 }, { "epoch": 7.400126823081801, "grad_norm": 0.20339295268058777, "learning_rate": 0.0001871509397317533, "loss": 0.1554, "step": 5835 }, { "epoch": 7.406467977171845, "grad_norm": 0.19547177851200104, "learning_rate": 0.0001871147115637738, "loss": 0.153, "step": 5840 }, { "epoch": 7.41280913126189, "grad_norm": 0.2065151333808899, "learning_rate": 0.00018707843591135942, "loss": 0.1528, "step": 5845 }, { "epoch": 7.419150285351934, "grad_norm": 0.1928146332502365, "learning_rate": 0.00018704211279428325, "loss": 0.151, "step": 5850 }, { "epoch": 7.425491439441979, "grad_norm": 0.19940437376499176, "learning_rate": 0.00018700574223234426, "loss": 0.158, "step": 5855 }, { "epoch": 7.431832593532023, "grad_norm": 0.18590781092643738, "learning_rate": 0.0001869693242453673, "loss": 0.1544, "step": 5860 }, { "epoch": 7.438173747622067, "grad_norm": 0.21022003889083862, "learning_rate": 0.00018693285885320305, "loss": 0.1556, "step": 5865 }, { "epoch": 7.444514901712112, "grad_norm": 0.20190821588039398, "learning_rate": 0.00018689634607572806, "loss": 0.1504, "step": 5870 }, { "epoch": 7.450856055802156, "grad_norm": 0.23204441368579865, "learning_rate": 0.00018685978593284468, "loss": 0.1546, "step": 5875 }, { "epoch": 7.4571972098922, "grad_norm": 0.19375638663768768, "learning_rate": 0.0001868231784444811, "loss": 0.1536, "step": 5880 }, { "epoch": 7.463538363982245, "grad_norm": 0.20619122684001923, "learning_rate": 0.00018678652363059132, "loss": 0.1552, "step": 5885 }, { "epoch": 7.469879518072289, "grad_norm": 0.21565838158130646, "learning_rate": 0.0001867498215111551, "loss": 0.1546, "step": 5890 }, { "epoch": 7.476220672162333, "grad_norm": 0.19375625252723694, "learning_rate": 0.00018671307210617802, "loss": 0.1586, "step": 5895 }, { "epoch": 7.482561826252378, "grad_norm": 0.22896713018417358, "learning_rate": 0.00018667627543569144, "loss": 0.1591, "step": 5900 }, { "epoch": 7.488902980342422, "grad_norm": 0.2110198736190796, "learning_rate": 0.00018663943151975244, "loss": 0.1547, "step": 5905 }, { "epoch": 7.4952441344324665, "grad_norm": 0.21495884656906128, "learning_rate": 0.00018660254037844388, "loss": 0.1532, "step": 5910 }, { "epoch": 7.501585288522511, "grad_norm": 0.2226954698562622, "learning_rate": 0.00018656560203187436, "loss": 0.1553, "step": 5915 }, { "epoch": 7.507926442612556, "grad_norm": 0.19277793169021606, "learning_rate": 0.00018652861650017826, "loss": 0.157, "step": 5920 }, { "epoch": 7.5142675967026, "grad_norm": 0.22772695124149323, "learning_rate": 0.0001864915838035156, "loss": 0.1529, "step": 5925 }, { "epoch": 7.520608750792644, "grad_norm": 0.2202853262424469, "learning_rate": 0.0001864545039620721, "loss": 0.1576, "step": 5930 }, { "epoch": 7.526949904882689, "grad_norm": 0.20546339452266693, "learning_rate": 0.00018641737699605926, "loss": 0.1594, "step": 5935 }, { "epoch": 7.533291058972733, "grad_norm": 0.20915482938289642, "learning_rate": 0.00018638020292571422, "loss": 0.1584, "step": 5940 }, { "epoch": 7.539632213062777, "grad_norm": 0.2137264460325241, "learning_rate": 0.0001863429817712998, "loss": 0.1563, "step": 5945 }, { "epoch": 7.545973367152822, "grad_norm": 0.20398524403572083, "learning_rate": 0.0001863057135531045, "loss": 0.1559, "step": 5950 }, { "epoch": 7.552314521242867, "grad_norm": 0.21843913197517395, "learning_rate": 0.00018626839829144243, "loss": 0.1563, "step": 5955 }, { "epoch": 7.55865567533291, "grad_norm": 0.21201571822166443, "learning_rate": 0.00018623103600665336, "loss": 0.155, "step": 5960 }, { "epoch": 7.564996829422955, "grad_norm": 0.23190584778785706, "learning_rate": 0.00018619362671910275, "loss": 0.1551, "step": 5965 }, { "epoch": 7.571337983513, "grad_norm": 0.19844332337379456, "learning_rate": 0.00018615617044918158, "loss": 0.1556, "step": 5970 }, { "epoch": 7.5776791376030435, "grad_norm": 0.21056532859802246, "learning_rate": 0.00018611866721730653, "loss": 0.1553, "step": 5975 }, { "epoch": 7.584020291693088, "grad_norm": 0.22660580277442932, "learning_rate": 0.00018608111704391983, "loss": 0.1535, "step": 5980 }, { "epoch": 7.590361445783133, "grad_norm": 0.21001556515693665, "learning_rate": 0.0001860435199494893, "loss": 0.1565, "step": 5985 }, { "epoch": 7.5967025998731765, "grad_norm": 0.21099837124347687, "learning_rate": 0.00018600587595450833, "loss": 0.1595, "step": 5990 }, { "epoch": 7.603043753963221, "grad_norm": 0.2276591658592224, "learning_rate": 0.00018596818507949592, "loss": 0.1553, "step": 5995 }, { "epoch": 7.609384908053266, "grad_norm": 0.2051401138305664, "learning_rate": 0.00018593044734499655, "loss": 0.1553, "step": 6000 }, { "epoch": 7.61572606214331, "grad_norm": 0.19349893927574158, "learning_rate": 0.00018589266277158032, "loss": 0.157, "step": 6005 }, { "epoch": 7.622067216233354, "grad_norm": 0.21950533986091614, "learning_rate": 0.0001858548313798428, "loss": 0.1569, "step": 6010 }, { "epoch": 7.628408370323399, "grad_norm": 0.20209990441799164, "learning_rate": 0.00018581695319040508, "loss": 0.1597, "step": 6015 }, { "epoch": 7.634749524413444, "grad_norm": 0.20784875750541687, "learning_rate": 0.00018577902822391383, "loss": 0.1545, "step": 6020 }, { "epoch": 7.641090678503487, "grad_norm": 0.2058633416891098, "learning_rate": 0.00018574105650104113, "loss": 0.1568, "step": 6025 }, { "epoch": 7.647431832593532, "grad_norm": 0.20721520483493805, "learning_rate": 0.00018570303804248464, "loss": 0.1573, "step": 6030 }, { "epoch": 7.653772986683577, "grad_norm": 0.23835545778274536, "learning_rate": 0.00018566497286896737, "loss": 0.1543, "step": 6035 }, { "epoch": 7.66011414077362, "grad_norm": 0.19631311297416687, "learning_rate": 0.00018562686100123788, "loss": 0.155, "step": 6040 }, { "epoch": 7.666455294863665, "grad_norm": 0.20680221915245056, "learning_rate": 0.00018558870246007016, "loss": 0.1566, "step": 6045 }, { "epoch": 7.67279644895371, "grad_norm": 0.20779050886631012, "learning_rate": 0.0001855504972662637, "loss": 0.1597, "step": 6050 }, { "epoch": 7.679137603043754, "grad_norm": 0.1998825967311859, "learning_rate": 0.00018551224544064327, "loss": 0.1523, "step": 6055 }, { "epoch": 7.685478757133798, "grad_norm": 0.18663161993026733, "learning_rate": 0.00018547394700405925, "loss": 0.1551, "step": 6060 }, { "epoch": 7.691819911223843, "grad_norm": 0.2132645547389984, "learning_rate": 0.00018543560197738728, "loss": 0.1572, "step": 6065 }, { "epoch": 7.698161065313887, "grad_norm": 0.2167138308286667, "learning_rate": 0.00018539721038152843, "loss": 0.155, "step": 6070 }, { "epoch": 7.704502219403931, "grad_norm": 0.20640896260738373, "learning_rate": 0.0001853587722374092, "loss": 0.1577, "step": 6075 }, { "epoch": 7.710843373493976, "grad_norm": 0.2075803279876709, "learning_rate": 0.0001853202875659814, "loss": 0.1543, "step": 6080 }, { "epoch": 7.7171845275840205, "grad_norm": 0.22270505130290985, "learning_rate": 0.00018528175638822227, "loss": 0.1594, "step": 6085 }, { "epoch": 7.723525681674065, "grad_norm": 0.19470542669296265, "learning_rate": 0.00018524317872513433, "loss": 0.1547, "step": 6090 }, { "epoch": 7.729866835764109, "grad_norm": 0.2342950999736786, "learning_rate": 0.00018520455459774548, "loss": 0.1575, "step": 6095 }, { "epoch": 7.736207989854154, "grad_norm": 0.20525361597537994, "learning_rate": 0.00018516588402710891, "loss": 0.155, "step": 6100 }, { "epoch": 7.742549143944197, "grad_norm": 0.21330955624580383, "learning_rate": 0.00018512716703430324, "loss": 0.1581, "step": 6105 }, { "epoch": 7.748890298034242, "grad_norm": 0.22245383262634277, "learning_rate": 0.0001850884036404322, "loss": 0.1579, "step": 6110 }, { "epoch": 7.755231452124287, "grad_norm": 0.2045760601758957, "learning_rate": 0.00018504959386662494, "loss": 0.1559, "step": 6115 }, { "epoch": 7.761572606214331, "grad_norm": 0.20652806758880615, "learning_rate": 0.00018501073773403593, "loss": 0.1569, "step": 6120 }, { "epoch": 7.767913760304375, "grad_norm": 0.20066697895526886, "learning_rate": 0.00018497183526384477, "loss": 0.1579, "step": 6125 }, { "epoch": 7.77425491439442, "grad_norm": 0.18545429408550262, "learning_rate": 0.00018493288647725647, "loss": 0.1578, "step": 6130 }, { "epoch": 7.780596068484464, "grad_norm": 0.20205913484096527, "learning_rate": 0.00018489389139550113, "loss": 0.1545, "step": 6135 }, { "epoch": 7.786937222574508, "grad_norm": 0.20748645067214966, "learning_rate": 0.00018485485003983426, "loss": 0.1541, "step": 6140 }, { "epoch": 7.793278376664553, "grad_norm": 0.20185454189777374, "learning_rate": 0.0001848157624315364, "loss": 0.1575, "step": 6145 }, { "epoch": 7.7996195307545975, "grad_norm": 0.20392458140850067, "learning_rate": 0.00018477662859191346, "loss": 0.1547, "step": 6150 }, { "epoch": 7.805960684844642, "grad_norm": 0.2183723896741867, "learning_rate": 0.0001847374485422965, "loss": 0.1545, "step": 6155 }, { "epoch": 7.812301838934686, "grad_norm": 0.196157306432724, "learning_rate": 0.0001846982223040417, "loss": 0.158, "step": 6160 }, { "epoch": 7.8186429930247305, "grad_norm": 0.19561372697353363, "learning_rate": 0.00018465894989853053, "loss": 0.155, "step": 6165 }, { "epoch": 7.824984147114775, "grad_norm": 0.21836337447166443, "learning_rate": 0.00018461963134716952, "loss": 0.1574, "step": 6170 }, { "epoch": 7.831325301204819, "grad_norm": 0.19623330235481262, "learning_rate": 0.00018458026667139048, "loss": 0.1583, "step": 6175 }, { "epoch": 7.837666455294864, "grad_norm": 0.20231418311595917, "learning_rate": 0.0001845408558926502, "loss": 0.1597, "step": 6180 }, { "epoch": 7.844007609384908, "grad_norm": 0.19910742342472076, "learning_rate": 0.00018450139903243074, "loss": 0.156, "step": 6185 }, { "epoch": 7.850348763474953, "grad_norm": 0.2339324951171875, "learning_rate": 0.0001844618961122392, "loss": 0.1563, "step": 6190 }, { "epoch": 7.856689917564997, "grad_norm": 0.19209963083267212, "learning_rate": 0.0001844223471536078, "loss": 0.1539, "step": 6195 }, { "epoch": 7.863031071655041, "grad_norm": 0.19854125380516052, "learning_rate": 0.00018438275217809387, "loss": 0.1529, "step": 6200 }, { "epoch": 7.869372225745086, "grad_norm": 0.21074460446834564, "learning_rate": 0.0001843431112072798, "loss": 0.1621, "step": 6205 }, { "epoch": 7.87571337983513, "grad_norm": 0.2018590271472931, "learning_rate": 0.0001843034242627731, "loss": 0.1526, "step": 6210 }, { "epoch": 7.882054533925174, "grad_norm": 0.22279158234596252, "learning_rate": 0.0001842636913662063, "loss": 0.1584, "step": 6215 }, { "epoch": 7.888395688015219, "grad_norm": 0.2056560516357422, "learning_rate": 0.00018422391253923698, "loss": 0.1553, "step": 6220 }, { "epoch": 7.894736842105263, "grad_norm": 0.205663800239563, "learning_rate": 0.00018418408780354777, "loss": 0.1548, "step": 6225 }, { "epoch": 7.9010779961953075, "grad_norm": 0.20977124571800232, "learning_rate": 0.00018414421718084624, "loss": 0.1536, "step": 6230 }, { "epoch": 7.907419150285352, "grad_norm": 0.222980797290802, "learning_rate": 0.00018410430069286515, "loss": 0.1525, "step": 6235 }, { "epoch": 7.913760304375396, "grad_norm": 0.20202401280403137, "learning_rate": 0.00018406433836136205, "loss": 0.1601, "step": 6240 }, { "epoch": 7.920101458465441, "grad_norm": 0.2281368225812912, "learning_rate": 0.0001840243302081197, "loss": 0.1586, "step": 6245 }, { "epoch": 7.926442612555485, "grad_norm": 0.20891331136226654, "learning_rate": 0.00018398427625494561, "loss": 0.1552, "step": 6250 }, { "epoch": 7.93278376664553, "grad_norm": 0.21633101999759674, "learning_rate": 0.00018394417652367246, "loss": 0.1556, "step": 6255 }, { "epoch": 7.939124920735574, "grad_norm": 0.20824643969535828, "learning_rate": 0.00018390403103615773, "loss": 0.1568, "step": 6260 }, { "epoch": 7.945466074825618, "grad_norm": 0.1912243813276291, "learning_rate": 0.0001838638398142839, "loss": 0.1546, "step": 6265 }, { "epoch": 7.951807228915663, "grad_norm": 0.23373466730117798, "learning_rate": 0.0001838236028799584, "loss": 0.1549, "step": 6270 }, { "epoch": 7.958148383005707, "grad_norm": 0.21032807230949402, "learning_rate": 0.00018378332025511352, "loss": 0.1563, "step": 6275 }, { "epoch": 7.964489537095751, "grad_norm": 0.20116302371025085, "learning_rate": 0.00018374299196170655, "loss": 0.158, "step": 6280 }, { "epoch": 7.970830691185796, "grad_norm": 0.21573922038078308, "learning_rate": 0.00018370261802171952, "loss": 0.1572, "step": 6285 }, { "epoch": 7.977171845275841, "grad_norm": 0.21183468401432037, "learning_rate": 0.00018366219845715952, "loss": 0.1603, "step": 6290 }, { "epoch": 7.983512999365884, "grad_norm": 0.18994176387786865, "learning_rate": 0.00018362173329005842, "loss": 0.1573, "step": 6295 }, { "epoch": 7.989854153455929, "grad_norm": 0.19292685389518738, "learning_rate": 0.00018358122254247286, "loss": 0.1585, "step": 6300 }, { "epoch": 7.996195307545974, "grad_norm": 0.20079150795936584, "learning_rate": 0.00018354066623648451, "loss": 0.1578, "step": 6305 }, { "epoch": 8.0, "eval_loss": 0.3237058222293854, "eval_runtime": 1445.5576, "eval_samples_per_second": 1.091, "eval_steps_per_second": 1.091, "step": 6308 }, { "epoch": 8.002536461636018, "grad_norm": 0.1928432732820511, "learning_rate": 0.0001835000643941997, "loss": 0.1484, "step": 6310 }, { "epoch": 8.008877615726062, "grad_norm": 0.2337139993906021, "learning_rate": 0.00018345941703774973, "loss": 0.1299, "step": 6315 }, { "epoch": 8.015218769816107, "grad_norm": 0.21178649365901947, "learning_rate": 0.00018341872418929062, "loss": 0.1315, "step": 6320 }, { "epoch": 8.021559923906151, "grad_norm": 0.18957169353961945, "learning_rate": 0.00018337798587100314, "loss": 0.1274, "step": 6325 }, { "epoch": 8.027901077996196, "grad_norm": 0.23835398256778717, "learning_rate": 0.000183337202105093, "loss": 0.1274, "step": 6330 }, { "epoch": 8.034242232086239, "grad_norm": 0.19453467428684235, "learning_rate": 0.0001832963729137905, "loss": 0.1254, "step": 6335 }, { "epoch": 8.040583386176284, "grad_norm": 0.1994711309671402, "learning_rate": 0.0001832554983193509, "loss": 0.1286, "step": 6340 }, { "epoch": 8.046924540266328, "grad_norm": 0.19881466031074524, "learning_rate": 0.00018321457834405397, "loss": 0.1314, "step": 6345 }, { "epoch": 8.053265694356373, "grad_norm": 0.21651390194892883, "learning_rate": 0.00018317361301020446, "loss": 0.1316, "step": 6350 }, { "epoch": 8.059606848446418, "grad_norm": 0.20017582178115845, "learning_rate": 0.00018313260234013168, "loss": 0.1306, "step": 6355 }, { "epoch": 8.065948002536462, "grad_norm": 0.22035111486911774, "learning_rate": 0.00018309154635618965, "loss": 0.1295, "step": 6360 }, { "epoch": 8.072289156626505, "grad_norm": 0.20705905556678772, "learning_rate": 0.00018305044508075725, "loss": 0.13, "step": 6365 }, { "epoch": 8.07863031071655, "grad_norm": 0.2055715024471283, "learning_rate": 0.00018300929853623787, "loss": 0.1321, "step": 6370 }, { "epoch": 8.084971464806594, "grad_norm": 0.2200298309326172, "learning_rate": 0.00018296810674505966, "loss": 0.1323, "step": 6375 }, { "epoch": 8.091312618896639, "grad_norm": 0.2232968956232071, "learning_rate": 0.0001829268697296754, "loss": 0.1351, "step": 6380 }, { "epoch": 8.097653772986684, "grad_norm": 0.21767346560955048, "learning_rate": 0.00018288558751256256, "loss": 0.1303, "step": 6385 }, { "epoch": 8.103994927076728, "grad_norm": 0.23261016607284546, "learning_rate": 0.00018284426011622325, "loss": 0.1327, "step": 6390 }, { "epoch": 8.110336081166773, "grad_norm": 0.22258134186267853, "learning_rate": 0.0001828028875631841, "loss": 0.133, "step": 6395 }, { "epoch": 8.116677235256816, "grad_norm": 0.21947380900382996, "learning_rate": 0.00018276146987599652, "loss": 0.1334, "step": 6400 }, { "epoch": 8.12301838934686, "grad_norm": 0.2318253517150879, "learning_rate": 0.0001827200070772364, "loss": 0.1307, "step": 6405 }, { "epoch": 8.129359543436905, "grad_norm": 0.21170242130756378, "learning_rate": 0.00018267849918950427, "loss": 0.1359, "step": 6410 }, { "epoch": 8.13570069752695, "grad_norm": 0.21570248901844025, "learning_rate": 0.0001826369462354252, "loss": 0.1319, "step": 6415 }, { "epoch": 8.142041851616995, "grad_norm": 0.2363613396883011, "learning_rate": 0.00018259534823764886, "loss": 0.1335, "step": 6420 }, { "epoch": 8.14838300570704, "grad_norm": 0.20604030787944794, "learning_rate": 0.00018255370521884948, "loss": 0.1349, "step": 6425 }, { "epoch": 8.154724159797084, "grad_norm": 0.20395101606845856, "learning_rate": 0.0001825120172017258, "loss": 0.1355, "step": 6430 }, { "epoch": 8.161065313887127, "grad_norm": 0.24164056777954102, "learning_rate": 0.0001824702842090011, "loss": 0.1366, "step": 6435 }, { "epoch": 8.167406467977171, "grad_norm": 0.20743347704410553, "learning_rate": 0.0001824285062634232, "loss": 0.1348, "step": 6440 }, { "epoch": 8.173747622067216, "grad_norm": 0.2241652011871338, "learning_rate": 0.00018238668338776433, "loss": 0.1321, "step": 6445 }, { "epoch": 8.18008877615726, "grad_norm": 0.22267642617225647, "learning_rate": 0.00018234481560482134, "loss": 0.1322, "step": 6450 }, { "epoch": 8.186429930247305, "grad_norm": 0.21147051453590393, "learning_rate": 0.00018230290293741547, "loss": 0.1366, "step": 6455 }, { "epoch": 8.19277108433735, "grad_norm": 0.23828718066215515, "learning_rate": 0.00018226094540839247, "loss": 0.1363, "step": 6460 }, { "epoch": 8.199112238427393, "grad_norm": 0.20498618483543396, "learning_rate": 0.00018221894304062253, "loss": 0.1351, "step": 6465 }, { "epoch": 8.205453392517438, "grad_norm": 0.21847303211688995, "learning_rate": 0.00018217689585700022, "loss": 0.1339, "step": 6470 }, { "epoch": 8.211794546607482, "grad_norm": 0.20998454093933105, "learning_rate": 0.00018213480388044463, "loss": 0.1358, "step": 6475 }, { "epoch": 8.218135700697527, "grad_norm": 0.2062244415283203, "learning_rate": 0.00018209266713389928, "loss": 0.1365, "step": 6480 }, { "epoch": 8.224476854787572, "grad_norm": 0.23493388295173645, "learning_rate": 0.00018205048564033196, "loss": 0.138, "step": 6485 }, { "epoch": 8.230818008877616, "grad_norm": 0.22964544594287872, "learning_rate": 0.00018200825942273497, "loss": 0.1284, "step": 6490 }, { "epoch": 8.23715916296766, "grad_norm": 0.21370603144168854, "learning_rate": 0.00018196598850412493, "loss": 0.1325, "step": 6495 }, { "epoch": 8.243500317057704, "grad_norm": 0.21881423890590668, "learning_rate": 0.0001819236729075429, "loss": 0.1372, "step": 6500 }, { "epoch": 8.249841471147748, "grad_norm": 0.23206481337547302, "learning_rate": 0.0001818813126560542, "loss": 0.134, "step": 6505 }, { "epoch": 8.256182625237793, "grad_norm": 0.21674104034900665, "learning_rate": 0.00018183890777274855, "loss": 0.1348, "step": 6510 }, { "epoch": 8.262523779327838, "grad_norm": 0.20125320553779602, "learning_rate": 0.00018179645828073995, "loss": 0.1359, "step": 6515 }, { "epoch": 8.268864933417882, "grad_norm": 0.19258597493171692, "learning_rate": 0.00018175396420316674, "loss": 0.1361, "step": 6520 }, { "epoch": 8.275206087507927, "grad_norm": 0.20930828154087067, "learning_rate": 0.00018171142556319163, "loss": 0.1305, "step": 6525 }, { "epoch": 8.281547241597972, "grad_norm": 0.2351502925157547, "learning_rate": 0.00018166884238400148, "loss": 0.1325, "step": 6530 }, { "epoch": 8.287888395688014, "grad_norm": 0.20575560629367828, "learning_rate": 0.00018162621468880753, "loss": 0.138, "step": 6535 }, { "epoch": 8.29422954977806, "grad_norm": 0.214612677693367, "learning_rate": 0.00018158354250084527, "loss": 0.1354, "step": 6540 }, { "epoch": 8.300570703868104, "grad_norm": 0.22076138854026794, "learning_rate": 0.00018154082584337445, "loss": 0.1347, "step": 6545 }, { "epoch": 8.306911857958148, "grad_norm": 0.20074552297592163, "learning_rate": 0.000181498064739679, "loss": 0.1358, "step": 6550 }, { "epoch": 8.313253012048193, "grad_norm": 0.2210153043270111, "learning_rate": 0.00018145525921306712, "loss": 0.1358, "step": 6555 }, { "epoch": 8.319594166138238, "grad_norm": 0.2114107757806778, "learning_rate": 0.00018141240928687123, "loss": 0.1348, "step": 6560 }, { "epoch": 8.325935320228282, "grad_norm": 0.22477009892463684, "learning_rate": 0.00018136951498444795, "loss": 0.1392, "step": 6565 }, { "epoch": 8.332276474318325, "grad_norm": 0.21344219148159027, "learning_rate": 0.00018132657632917808, "loss": 0.1377, "step": 6570 }, { "epoch": 8.33861762840837, "grad_norm": 0.2258421629667282, "learning_rate": 0.0001812835933444666, "loss": 0.138, "step": 6575 }, { "epoch": 8.344958782498415, "grad_norm": 0.22514615952968597, "learning_rate": 0.00018124056605374264, "loss": 0.1385, "step": 6580 }, { "epoch": 8.35129993658846, "grad_norm": 0.21505369246006012, "learning_rate": 0.00018119749448045947, "loss": 0.1377, "step": 6585 }, { "epoch": 8.357641090678504, "grad_norm": 0.22354766726493835, "learning_rate": 0.00018115437864809456, "loss": 0.1385, "step": 6590 }, { "epoch": 8.363982244768549, "grad_norm": 0.2166944444179535, "learning_rate": 0.00018111121858014944, "loss": 0.138, "step": 6595 }, { "epoch": 8.370323398858591, "grad_norm": 0.23484566807746887, "learning_rate": 0.0001810680143001498, "loss": 0.1371, "step": 6600 }, { "epoch": 8.376664552948636, "grad_norm": 0.2131161391735077, "learning_rate": 0.00018102476583164534, "loss": 0.1382, "step": 6605 }, { "epoch": 8.38300570703868, "grad_norm": 0.22732941806316376, "learning_rate": 0.00018098147319820998, "loss": 0.1374, "step": 6610 }, { "epoch": 8.389346861128725, "grad_norm": 0.20924319326877594, "learning_rate": 0.00018093813642344156, "loss": 0.1365, "step": 6615 }, { "epoch": 8.39568801521877, "grad_norm": 0.2202252745628357, "learning_rate": 0.00018089475553096217, "loss": 0.1385, "step": 6620 }, { "epoch": 8.402029169308815, "grad_norm": 0.23223629593849182, "learning_rate": 0.00018085133054441776, "loss": 0.1355, "step": 6625 }, { "epoch": 8.40837032339886, "grad_norm": 0.23204581439495087, "learning_rate": 0.00018080786148747842, "loss": 0.14, "step": 6630 }, { "epoch": 8.414711477488902, "grad_norm": 0.19659708440303802, "learning_rate": 0.00018076434838383825, "loss": 0.1404, "step": 6635 }, { "epoch": 8.421052631578947, "grad_norm": 0.2020081877708435, "learning_rate": 0.00018072079125721534, "loss": 0.1377, "step": 6640 }, { "epoch": 8.427393785668992, "grad_norm": 0.21982277929782867, "learning_rate": 0.00018067719013135176, "loss": 0.1396, "step": 6645 }, { "epoch": 8.433734939759036, "grad_norm": 0.23543202877044678, "learning_rate": 0.00018063354503001358, "loss": 0.1389, "step": 6650 }, { "epoch": 8.44007609384908, "grad_norm": 0.21117927134037018, "learning_rate": 0.0001805898559769909, "loss": 0.1403, "step": 6655 }, { "epoch": 8.446417247939126, "grad_norm": 0.22728238999843597, "learning_rate": 0.0001805461229960977, "loss": 0.1388, "step": 6660 }, { "epoch": 8.452758402029168, "grad_norm": 0.19661027193069458, "learning_rate": 0.0001805023461111719, "loss": 0.1398, "step": 6665 }, { "epoch": 8.459099556119213, "grad_norm": 0.2136552333831787, "learning_rate": 0.0001804585253460754, "loss": 0.1361, "step": 6670 }, { "epoch": 8.465440710209258, "grad_norm": 0.23508967459201813, "learning_rate": 0.000180414660724694, "loss": 0.1386, "step": 6675 }, { "epoch": 8.471781864299302, "grad_norm": 0.2067343294620514, "learning_rate": 0.0001803707522709374, "loss": 0.1369, "step": 6680 }, { "epoch": 8.478123018389347, "grad_norm": 0.22101737558841705, "learning_rate": 0.00018032680000873915, "loss": 0.1351, "step": 6685 }, { "epoch": 8.484464172479392, "grad_norm": 0.234425351023674, "learning_rate": 0.00018028280396205682, "loss": 0.1376, "step": 6690 }, { "epoch": 8.490805326569436, "grad_norm": 0.19354897737503052, "learning_rate": 0.00018023876415487167, "loss": 0.1367, "step": 6695 }, { "epoch": 8.49714648065948, "grad_norm": 0.23648487031459808, "learning_rate": 0.00018019468061118888, "loss": 0.1425, "step": 6700 }, { "epoch": 8.503487634749524, "grad_norm": 0.23927363753318787, "learning_rate": 0.00018015055335503756, "loss": 0.1387, "step": 6705 }, { "epoch": 8.509828788839569, "grad_norm": 0.22322037816047668, "learning_rate": 0.0001801063824104705, "loss": 0.1371, "step": 6710 }, { "epoch": 8.516169942929613, "grad_norm": 0.21309080719947815, "learning_rate": 0.00018006216780156438, "loss": 0.1392, "step": 6715 }, { "epoch": 8.522511097019658, "grad_norm": 0.21818041801452637, "learning_rate": 0.00018001790955241972, "loss": 0.1386, "step": 6720 }, { "epoch": 8.528852251109702, "grad_norm": 0.2028253674507141, "learning_rate": 0.00017997360768716073, "loss": 0.1391, "step": 6725 }, { "epoch": 8.535193405199747, "grad_norm": 0.20757676661014557, "learning_rate": 0.00017992926222993544, "loss": 0.1408, "step": 6730 }, { "epoch": 8.54153455928979, "grad_norm": 0.20694056153297424, "learning_rate": 0.00017988487320491568, "loss": 0.1392, "step": 6735 }, { "epoch": 8.547875713379835, "grad_norm": 0.19931674003601074, "learning_rate": 0.00017984044063629702, "loss": 0.1405, "step": 6740 }, { "epoch": 8.55421686746988, "grad_norm": 0.20958775281906128, "learning_rate": 0.00017979596454829866, "loss": 0.1391, "step": 6745 }, { "epoch": 8.560558021559924, "grad_norm": 0.21511487662792206, "learning_rate": 0.00017975144496516364, "loss": 0.1403, "step": 6750 }, { "epoch": 8.566899175649969, "grad_norm": 0.21476149559020996, "learning_rate": 0.00017970688191115865, "loss": 0.1422, "step": 6755 }, { "epoch": 8.573240329740013, "grad_norm": 0.19213946163654327, "learning_rate": 0.00017966227541057412, "loss": 0.1428, "step": 6760 }, { "epoch": 8.579581483830058, "grad_norm": 0.21595697104930878, "learning_rate": 0.00017961762548772414, "loss": 0.1394, "step": 6765 }, { "epoch": 8.5859226379201, "grad_norm": 0.20870253443717957, "learning_rate": 0.0001795729321669464, "loss": 0.1387, "step": 6770 }, { "epoch": 8.592263792010145, "grad_norm": 0.21722225844860077, "learning_rate": 0.0001795281954726024, "loss": 0.1399, "step": 6775 }, { "epoch": 8.59860494610019, "grad_norm": 0.22180074453353882, "learning_rate": 0.00017948341542907713, "loss": 0.139, "step": 6780 }, { "epoch": 8.604946100190235, "grad_norm": 0.20733681321144104, "learning_rate": 0.00017943859206077928, "loss": 0.1377, "step": 6785 }, { "epoch": 8.61128725428028, "grad_norm": 0.20832906663417816, "learning_rate": 0.00017939372539214118, "loss": 0.1383, "step": 6790 }, { "epoch": 8.617628408370324, "grad_norm": 0.21679314970970154, "learning_rate": 0.00017934881544761865, "loss": 0.1403, "step": 6795 }, { "epoch": 8.623969562460367, "grad_norm": 0.2017892599105835, "learning_rate": 0.00017930386225169126, "loss": 0.1393, "step": 6800 }, { "epoch": 8.630310716550412, "grad_norm": 0.2310793399810791, "learning_rate": 0.00017925886582886201, "loss": 0.1386, "step": 6805 }, { "epoch": 8.636651870640456, "grad_norm": 0.19726167619228363, "learning_rate": 0.00017921382620365755, "loss": 0.1404, "step": 6810 }, { "epoch": 8.642993024730501, "grad_norm": 0.19933365285396576, "learning_rate": 0.00017916874340062806, "loss": 0.1389, "step": 6815 }, { "epoch": 8.649334178820546, "grad_norm": 0.2327439785003662, "learning_rate": 0.00017912361744434722, "loss": 0.1417, "step": 6820 }, { "epoch": 8.65567533291059, "grad_norm": 0.19539040327072144, "learning_rate": 0.00017907844835941228, "loss": 0.1379, "step": 6825 }, { "epoch": 8.662016487000635, "grad_norm": 0.2231414020061493, "learning_rate": 0.00017903323617044395, "loss": 0.1397, "step": 6830 }, { "epoch": 8.668357641090678, "grad_norm": 0.23892861604690552, "learning_rate": 0.00017898798090208652, "loss": 0.1376, "step": 6835 }, { "epoch": 8.674698795180722, "grad_norm": 0.19660015404224396, "learning_rate": 0.00017894268257900766, "loss": 0.1396, "step": 6840 }, { "epoch": 8.681039949270767, "grad_norm": 0.2007194608449936, "learning_rate": 0.00017889734122589858, "loss": 0.1392, "step": 6845 }, { "epoch": 8.687381103360812, "grad_norm": 0.20236362516880035, "learning_rate": 0.0001788519568674739, "loss": 0.1358, "step": 6850 }, { "epoch": 8.693722257450856, "grad_norm": 0.21106046438217163, "learning_rate": 0.00017880652952847163, "loss": 0.1413, "step": 6855 }, { "epoch": 8.700063411540901, "grad_norm": 0.2001102715730667, "learning_rate": 0.0001787610592336534, "loss": 0.1389, "step": 6860 }, { "epoch": 8.706404565630944, "grad_norm": 0.2033744752407074, "learning_rate": 0.0001787155460078041, "loss": 0.1372, "step": 6865 }, { "epoch": 8.712745719720989, "grad_norm": 0.2112102508544922, "learning_rate": 0.00017866998987573198, "loss": 0.1379, "step": 6870 }, { "epoch": 8.719086873811033, "grad_norm": 0.22294828295707703, "learning_rate": 0.00017862439086226885, "loss": 0.1395, "step": 6875 }, { "epoch": 8.725428027901078, "grad_norm": 0.21097150444984436, "learning_rate": 0.00017857874899226972, "loss": 0.1388, "step": 6880 }, { "epoch": 8.731769181991123, "grad_norm": 0.2113080471754074, "learning_rate": 0.00017853306429061301, "loss": 0.1433, "step": 6885 }, { "epoch": 8.738110336081167, "grad_norm": 0.21775498986244202, "learning_rate": 0.0001784873367822006, "loss": 0.1426, "step": 6890 }, { "epoch": 8.744451490171212, "grad_norm": 0.23071123659610748, "learning_rate": 0.00017844156649195759, "loss": 0.1426, "step": 6895 }, { "epoch": 8.750792644261256, "grad_norm": 0.2137385755777359, "learning_rate": 0.00017839575344483238, "loss": 0.1386, "step": 6900 }, { "epoch": 8.7571337983513, "grad_norm": 0.20723086595535278, "learning_rate": 0.00017834989766579674, "loss": 0.1407, "step": 6905 }, { "epoch": 8.763474952441344, "grad_norm": 0.21198566257953644, "learning_rate": 0.00017830399917984568, "loss": 0.1422, "step": 6910 }, { "epoch": 8.769816106531389, "grad_norm": 0.21262198686599731, "learning_rate": 0.00017825805801199756, "loss": 0.1397, "step": 6915 }, { "epoch": 8.776157260621433, "grad_norm": 0.2137686163187027, "learning_rate": 0.00017821207418729394, "loss": 0.1372, "step": 6920 }, { "epoch": 8.782498414711478, "grad_norm": 0.21698841452598572, "learning_rate": 0.00017816604773079973, "loss": 0.1441, "step": 6925 }, { "epoch": 8.788839568801523, "grad_norm": 0.20860496163368225, "learning_rate": 0.00017811997866760287, "loss": 0.1399, "step": 6930 }, { "epoch": 8.795180722891565, "grad_norm": 0.20779329538345337, "learning_rate": 0.00017807386702281476, "loss": 0.1413, "step": 6935 }, { "epoch": 8.80152187698161, "grad_norm": 0.2399662584066391, "learning_rate": 0.0001780277128215699, "loss": 0.1406, "step": 6940 }, { "epoch": 8.807863031071655, "grad_norm": 0.20318613946437836, "learning_rate": 0.00017798151608902597, "loss": 0.1412, "step": 6945 }, { "epoch": 8.8142041851617, "grad_norm": 0.2290705144405365, "learning_rate": 0.0001779352768503638, "loss": 0.138, "step": 6950 }, { "epoch": 8.820545339251744, "grad_norm": 0.2023320198059082, "learning_rate": 0.00017788899513078755, "loss": 0.1431, "step": 6955 }, { "epoch": 8.826886493341789, "grad_norm": 0.2113533318042755, "learning_rate": 0.00017784267095552437, "loss": 0.1378, "step": 6960 }, { "epoch": 8.833227647431833, "grad_norm": 0.22192510962486267, "learning_rate": 0.00017779630434982467, "loss": 0.1416, "step": 6965 }, { "epoch": 8.839568801521876, "grad_norm": 0.22244036197662354, "learning_rate": 0.00017774989533896185, "loss": 0.1396, "step": 6970 }, { "epoch": 8.845909955611921, "grad_norm": 0.21596921980381012, "learning_rate": 0.00017770344394823256, "loss": 0.1443, "step": 6975 }, { "epoch": 8.852251109701966, "grad_norm": 0.21304269134998322, "learning_rate": 0.0001776569502029565, "loss": 0.1396, "step": 6980 }, { "epoch": 8.85859226379201, "grad_norm": 0.2098553627729416, "learning_rate": 0.00017761041412847641, "loss": 0.1426, "step": 6985 }, { "epoch": 8.864933417882055, "grad_norm": 0.22711095213890076, "learning_rate": 0.0001775638357501582, "loss": 0.1411, "step": 6990 }, { "epoch": 8.8712745719721, "grad_norm": 0.21013690531253815, "learning_rate": 0.00017751721509339077, "loss": 0.1435, "step": 6995 }, { "epoch": 8.877615726062142, "grad_norm": 0.22348740696907043, "learning_rate": 0.00017747055218358604, "loss": 0.1402, "step": 7000 }, { "epoch": 8.883956880152187, "grad_norm": 0.21728843450546265, "learning_rate": 0.00017742384704617903, "loss": 0.1425, "step": 7005 }, { "epoch": 8.890298034242232, "grad_norm": 0.2051689326763153, "learning_rate": 0.00017737709970662774, "loss": 0.1408, "step": 7010 }, { "epoch": 8.896639188332276, "grad_norm": 0.2097233086824417, "learning_rate": 0.00017733031019041322, "loss": 0.1411, "step": 7015 }, { "epoch": 8.902980342422321, "grad_norm": 0.2114674150943756, "learning_rate": 0.00017728347852303942, "loss": 0.1417, "step": 7020 }, { "epoch": 8.909321496512366, "grad_norm": 0.22041675448417664, "learning_rate": 0.00017723660473003335, "loss": 0.1413, "step": 7025 }, { "epoch": 8.91566265060241, "grad_norm": 0.19942757487297058, "learning_rate": 0.0001771896888369449, "loss": 0.1423, "step": 7030 }, { "epoch": 8.922003804692453, "grad_norm": 0.2510841488838196, "learning_rate": 0.00017714273086934706, "loss": 0.1437, "step": 7035 }, { "epoch": 8.928344958782498, "grad_norm": 0.19419313967227936, "learning_rate": 0.00017709573085283555, "loss": 0.1382, "step": 7040 }, { "epoch": 8.934686112872543, "grad_norm": 0.22038325667381287, "learning_rate": 0.00017704868881302912, "loss": 0.1418, "step": 7045 }, { "epoch": 8.941027266962587, "grad_norm": 0.20312252640724182, "learning_rate": 0.00017700160477556948, "loss": 0.1422, "step": 7050 }, { "epoch": 8.947368421052632, "grad_norm": 0.21089406311511993, "learning_rate": 0.00017695447876612112, "loss": 0.1414, "step": 7055 }, { "epoch": 8.953709575142677, "grad_norm": 0.216489776968956, "learning_rate": 0.00017690731081037146, "loss": 0.1427, "step": 7060 }, { "epoch": 8.960050729232721, "grad_norm": 0.2159714549779892, "learning_rate": 0.0001768601009340308, "loss": 0.1387, "step": 7065 }, { "epoch": 8.966391883322764, "grad_norm": 0.2168179601430893, "learning_rate": 0.00017681284916283224, "loss": 0.1427, "step": 7070 }, { "epoch": 8.972733037412809, "grad_norm": 0.22179380059242249, "learning_rate": 0.00017676555552253182, "loss": 0.1422, "step": 7075 }, { "epoch": 8.979074191502853, "grad_norm": 0.21048948168754578, "learning_rate": 0.00017671822003890823, "loss": 0.1412, "step": 7080 }, { "epoch": 8.985415345592898, "grad_norm": 0.20671595633029938, "learning_rate": 0.00017667084273776315, "loss": 0.1406, "step": 7085 }, { "epoch": 8.991756499682943, "grad_norm": 0.22312676906585693, "learning_rate": 0.0001766234236449209, "loss": 0.1422, "step": 7090 }, { "epoch": 8.998097653772987, "grad_norm": 0.20762966573238373, "learning_rate": 0.00017657596278622872, "loss": 0.1426, "step": 7095 }, { "epoch": 8.999365884590995, "eval_loss": 0.33262965083122253, "eval_runtime": 1444.9791, "eval_samples_per_second": 1.091, "eval_steps_per_second": 1.091, "step": 7096 }, { "epoch": 8.999365884590995, "step": 7096, "total_flos": 5.264803297235042e+18, "train_loss": 0.26379992237908867, "train_runtime": 178382.3033, "train_samples_per_second": 1.061, "train_steps_per_second": 0.133 } ], "logging_steps": 5, "max_steps": 23640, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.264803297235042e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }