{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9977892409727342, "eval_steps": 500, "global_step": 2712, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001105379513633014, "grad_norm": 6.202141761779785, "learning_rate": 3.6764705882352945e-08, "loss": 0.9258, "step": 1 }, { "epoch": 0.002210759027266028, "grad_norm": 6.2503437995910645, "learning_rate": 7.352941176470589e-08, "loss": 0.8939, "step": 2 }, { "epoch": 0.003316138540899042, "grad_norm": 6.218397617340088, "learning_rate": 1.1029411764705884e-07, "loss": 0.943, "step": 3 }, { "epoch": 0.004421518054532056, "grad_norm": 6.271090984344482, "learning_rate": 1.4705882352941178e-07, "loss": 0.9192, "step": 4 }, { "epoch": 0.00552689756816507, "grad_norm": 6.510982513427734, "learning_rate": 1.8382352941176472e-07, "loss": 0.9211, "step": 5 }, { "epoch": 0.006632277081798084, "grad_norm": 6.131255626678467, "learning_rate": 2.2058823529411768e-07, "loss": 0.8763, "step": 6 }, { "epoch": 0.007737656595431098, "grad_norm": 5.9245452880859375, "learning_rate": 2.573529411764706e-07, "loss": 0.8715, "step": 7 }, { "epoch": 0.008843036109064112, "grad_norm": 6.351240634918213, "learning_rate": 2.9411764705882356e-07, "loss": 0.915, "step": 8 }, { "epoch": 0.009948415622697125, "grad_norm": 6.314542770385742, "learning_rate": 3.308823529411765e-07, "loss": 0.9309, "step": 9 }, { "epoch": 0.01105379513633014, "grad_norm": 6.022907257080078, "learning_rate": 3.6764705882352943e-07, "loss": 0.8831, "step": 10 }, { "epoch": 0.012159174649963155, "grad_norm": 6.318159103393555, "learning_rate": 4.044117647058824e-07, "loss": 0.9572, "step": 11 }, { "epoch": 0.013264554163596167, "grad_norm": 5.984519004821777, "learning_rate": 4.4117647058823536e-07, "loss": 0.909, "step": 12 }, { "epoch": 0.014369933677229182, "grad_norm": 5.847412109375, "learning_rate": 4.779411764705882e-07, "loss": 0.8688, "step": 13 }, { "epoch": 0.015475313190862197, "grad_norm": 5.818578243255615, "learning_rate": 5.147058823529412e-07, "loss": 0.8938, "step": 14 }, { "epoch": 0.01658069270449521, "grad_norm": 5.885476112365723, "learning_rate": 5.514705882352942e-07, "loss": 0.9133, "step": 15 }, { "epoch": 0.017686072218128224, "grad_norm": 5.651276588439941, "learning_rate": 5.882352941176471e-07, "loss": 0.9289, "step": 16 }, { "epoch": 0.01879145173176124, "grad_norm": 4.591658115386963, "learning_rate": 6.25e-07, "loss": 0.8333, "step": 17 }, { "epoch": 0.01989683124539425, "grad_norm": 4.73502779006958, "learning_rate": 6.61764705882353e-07, "loss": 0.8533, "step": 18 }, { "epoch": 0.021002210759027265, "grad_norm": 4.804879665374756, "learning_rate": 6.985294117647059e-07, "loss": 0.8767, "step": 19 }, { "epoch": 0.02210759027266028, "grad_norm": 4.49127197265625, "learning_rate": 7.352941176470589e-07, "loss": 0.862, "step": 20 }, { "epoch": 0.023212969786293294, "grad_norm": 4.516640663146973, "learning_rate": 7.720588235294119e-07, "loss": 0.8554, "step": 21 }, { "epoch": 0.02431834929992631, "grad_norm": 3.4016852378845215, "learning_rate": 8.088235294117648e-07, "loss": 0.8273, "step": 22 }, { "epoch": 0.025423728813559324, "grad_norm": 2.7037136554718018, "learning_rate": 8.455882352941178e-07, "loss": 0.8214, "step": 23 }, { "epoch": 0.026529108327192335, "grad_norm": 2.5232088565826416, "learning_rate": 8.823529411764707e-07, "loss": 0.7957, "step": 24 }, { "epoch": 0.02763448784082535, "grad_norm": 2.3901748657226562, "learning_rate": 9.191176470588237e-07, "loss": 0.7873, "step": 25 }, { "epoch": 0.028739867354458364, "grad_norm": 2.4370107650756836, "learning_rate": 9.558823529411764e-07, "loss": 0.7918, "step": 26 }, { "epoch": 0.02984524686809138, "grad_norm": 2.16312575340271, "learning_rate": 9.926470588235295e-07, "loss": 0.8047, "step": 27 }, { "epoch": 0.030950626381724394, "grad_norm": 2.096590995788574, "learning_rate": 1.0294117647058825e-06, "loss": 0.8122, "step": 28 }, { "epoch": 0.032056005895357405, "grad_norm": 2.082396984100342, "learning_rate": 1.0661764705882354e-06, "loss": 0.7882, "step": 29 }, { "epoch": 0.03316138540899042, "grad_norm": 1.5802079439163208, "learning_rate": 1.1029411764705884e-06, "loss": 0.7848, "step": 30 }, { "epoch": 0.034266764922623434, "grad_norm": 2.0999622344970703, "learning_rate": 1.1397058823529413e-06, "loss": 0.7818, "step": 31 }, { "epoch": 0.03537214443625645, "grad_norm": 2.450608730316162, "learning_rate": 1.1764705882352942e-06, "loss": 0.7708, "step": 32 }, { "epoch": 0.036477523949889464, "grad_norm": 2.783461570739746, "learning_rate": 1.2132352941176472e-06, "loss": 0.7839, "step": 33 }, { "epoch": 0.03758290346352248, "grad_norm": 2.839360237121582, "learning_rate": 1.25e-06, "loss": 0.7884, "step": 34 }, { "epoch": 0.03868828297715549, "grad_norm": 2.688298463821411, "learning_rate": 1.2867647058823528e-06, "loss": 0.7463, "step": 35 }, { "epoch": 0.0397936624907885, "grad_norm": 2.510631561279297, "learning_rate": 1.323529411764706e-06, "loss": 0.7583, "step": 36 }, { "epoch": 0.040899042004421515, "grad_norm": 2.2282369136810303, "learning_rate": 1.360294117647059e-06, "loss": 0.7734, "step": 37 }, { "epoch": 0.04200442151805453, "grad_norm": 1.88447904586792, "learning_rate": 1.3970588235294119e-06, "loss": 0.7595, "step": 38 }, { "epoch": 0.043109801031687545, "grad_norm": 1.5797384977340698, "learning_rate": 1.4338235294117648e-06, "loss": 0.7487, "step": 39 }, { "epoch": 0.04421518054532056, "grad_norm": 1.2249430418014526, "learning_rate": 1.4705882352941177e-06, "loss": 0.7241, "step": 40 }, { "epoch": 0.045320560058953574, "grad_norm": 0.9825201630592346, "learning_rate": 1.5073529411764707e-06, "loss": 0.7356, "step": 41 }, { "epoch": 0.04642593957258659, "grad_norm": 1.0709400177001953, "learning_rate": 1.5441176470588238e-06, "loss": 0.7589, "step": 42 }, { "epoch": 0.0475313190862196, "grad_norm": 1.1806859970092773, "learning_rate": 1.5808823529411765e-06, "loss": 0.7037, "step": 43 }, { "epoch": 0.04863669859985262, "grad_norm": 1.1590816974639893, "learning_rate": 1.6176470588235297e-06, "loss": 0.7111, "step": 44 }, { "epoch": 0.04974207811348563, "grad_norm": 1.1259855031967163, "learning_rate": 1.6544117647058824e-06, "loss": 0.695, "step": 45 }, { "epoch": 0.05084745762711865, "grad_norm": 1.037628173828125, "learning_rate": 1.6911764705882356e-06, "loss": 0.678, "step": 46 }, { "epoch": 0.051952837140751655, "grad_norm": 0.9054527878761292, "learning_rate": 1.7279411764705883e-06, "loss": 0.6947, "step": 47 }, { "epoch": 0.05305821665438467, "grad_norm": 0.8047701120376587, "learning_rate": 1.7647058823529414e-06, "loss": 0.665, "step": 48 }, { "epoch": 0.054163596168017684, "grad_norm": 0.9736095666885376, "learning_rate": 1.8014705882352942e-06, "loss": 0.7202, "step": 49 }, { "epoch": 0.0552689756816507, "grad_norm": 0.888812780380249, "learning_rate": 1.8382352941176473e-06, "loss": 0.7269, "step": 50 }, { "epoch": 0.056374355195283714, "grad_norm": 0.7443651556968689, "learning_rate": 1.8750000000000003e-06, "loss": 0.654, "step": 51 }, { "epoch": 0.05747973470891673, "grad_norm": 0.7068009972572327, "learning_rate": 1.9117647058823528e-06, "loss": 0.65, "step": 52 }, { "epoch": 0.05858511422254974, "grad_norm": 0.6925762891769409, "learning_rate": 1.948529411764706e-06, "loss": 0.6335, "step": 53 }, { "epoch": 0.05969049373618276, "grad_norm": 0.824958086013794, "learning_rate": 1.985294117647059e-06, "loss": 0.6667, "step": 54 }, { "epoch": 0.06079587324981577, "grad_norm": 0.8363823294639587, "learning_rate": 2.022058823529412e-06, "loss": 0.6748, "step": 55 }, { "epoch": 0.06190125276344879, "grad_norm": 0.7325060367584229, "learning_rate": 2.058823529411765e-06, "loss": 0.6538, "step": 56 }, { "epoch": 0.0630066322770818, "grad_norm": 0.7176344394683838, "learning_rate": 2.095588235294118e-06, "loss": 0.6614, "step": 57 }, { "epoch": 0.06411201179071481, "grad_norm": 0.599174976348877, "learning_rate": 2.132352941176471e-06, "loss": 0.6756, "step": 58 }, { "epoch": 0.06521739130434782, "grad_norm": 0.6026215553283691, "learning_rate": 2.1691176470588238e-06, "loss": 0.6665, "step": 59 }, { "epoch": 0.06632277081798084, "grad_norm": 0.6436171531677246, "learning_rate": 2.2058823529411767e-06, "loss": 0.6593, "step": 60 }, { "epoch": 0.06742815033161385, "grad_norm": 0.6254538297653198, "learning_rate": 2.2426470588235296e-06, "loss": 0.6593, "step": 61 }, { "epoch": 0.06853352984524687, "grad_norm": 0.6279752254486084, "learning_rate": 2.2794117647058826e-06, "loss": 0.6155, "step": 62 }, { "epoch": 0.06963890935887988, "grad_norm": 0.5987311005592346, "learning_rate": 2.3161764705882355e-06, "loss": 0.6269, "step": 63 }, { "epoch": 0.0707442888725129, "grad_norm": 0.5910629034042358, "learning_rate": 2.3529411764705885e-06, "loss": 0.634, "step": 64 }, { "epoch": 0.07184966838614591, "grad_norm": 0.5276488065719604, "learning_rate": 2.3897058823529414e-06, "loss": 0.6291, "step": 65 }, { "epoch": 0.07295504789977893, "grad_norm": 0.5081461668014526, "learning_rate": 2.4264705882352943e-06, "loss": 0.6303, "step": 66 }, { "epoch": 0.07406042741341194, "grad_norm": 0.47226476669311523, "learning_rate": 2.4632352941176473e-06, "loss": 0.6252, "step": 67 }, { "epoch": 0.07516580692704496, "grad_norm": 0.516169011592865, "learning_rate": 2.5e-06, "loss": 0.6296, "step": 68 }, { "epoch": 0.07627118644067797, "grad_norm": 0.4941953718662262, "learning_rate": 2.536764705882353e-06, "loss": 0.6291, "step": 69 }, { "epoch": 0.07737656595431099, "grad_norm": 0.4769359230995178, "learning_rate": 2.5735294117647057e-06, "loss": 0.601, "step": 70 }, { "epoch": 0.078481945467944, "grad_norm": 0.47221505641937256, "learning_rate": 2.610294117647059e-06, "loss": 0.5915, "step": 71 }, { "epoch": 0.079587324981577, "grad_norm": 0.48454976081848145, "learning_rate": 2.647058823529412e-06, "loss": 0.6266, "step": 72 }, { "epoch": 0.08069270449521002, "grad_norm": 0.5660485625267029, "learning_rate": 2.683823529411765e-06, "loss": 0.6247, "step": 73 }, { "epoch": 0.08179808400884303, "grad_norm": 0.49627426266670227, "learning_rate": 2.720588235294118e-06, "loss": 0.6536, "step": 74 }, { "epoch": 0.08290346352247605, "grad_norm": 0.5033668875694275, "learning_rate": 2.757352941176471e-06, "loss": 0.6491, "step": 75 }, { "epoch": 0.08400884303610906, "grad_norm": 0.4648224711418152, "learning_rate": 2.7941176470588237e-06, "loss": 0.5848, "step": 76 }, { "epoch": 0.08511422254974207, "grad_norm": 0.43396902084350586, "learning_rate": 2.8308823529411766e-06, "loss": 0.6338, "step": 77 }, { "epoch": 0.08621960206337509, "grad_norm": 0.45148423314094543, "learning_rate": 2.8676470588235296e-06, "loss": 0.6127, "step": 78 }, { "epoch": 0.0873249815770081, "grad_norm": 0.4692694842815399, "learning_rate": 2.904411764705883e-06, "loss": 0.6039, "step": 79 }, { "epoch": 0.08843036109064112, "grad_norm": 0.5423398613929749, "learning_rate": 2.9411764705882355e-06, "loss": 0.6187, "step": 80 }, { "epoch": 0.08953574060427413, "grad_norm": 0.43723583221435547, "learning_rate": 2.9779411764705884e-06, "loss": 0.6192, "step": 81 }, { "epoch": 0.09064112011790715, "grad_norm": 0.4498275816440582, "learning_rate": 3.0147058823529413e-06, "loss": 0.6195, "step": 82 }, { "epoch": 0.09174649963154016, "grad_norm": 0.5075950026512146, "learning_rate": 3.0514705882352947e-06, "loss": 0.5929, "step": 83 }, { "epoch": 0.09285187914517318, "grad_norm": 0.486625999212265, "learning_rate": 3.0882352941176476e-06, "loss": 0.6087, "step": 84 }, { "epoch": 0.09395725865880619, "grad_norm": 0.46693581342697144, "learning_rate": 3.125e-06, "loss": 0.5809, "step": 85 }, { "epoch": 0.0950626381724392, "grad_norm": 0.42462649941444397, "learning_rate": 3.161764705882353e-06, "loss": 0.6211, "step": 86 }, { "epoch": 0.09616801768607222, "grad_norm": 0.49524590373039246, "learning_rate": 3.198529411764706e-06, "loss": 0.6022, "step": 87 }, { "epoch": 0.09727339719970524, "grad_norm": 0.5189037919044495, "learning_rate": 3.2352941176470594e-06, "loss": 0.5997, "step": 88 }, { "epoch": 0.09837877671333825, "grad_norm": 0.42072102427482605, "learning_rate": 3.272058823529412e-06, "loss": 0.6108, "step": 89 }, { "epoch": 0.09948415622697127, "grad_norm": 0.4795837998390198, "learning_rate": 3.308823529411765e-06, "loss": 0.5977, "step": 90 }, { "epoch": 0.10058953574060428, "grad_norm": 0.4457264542579651, "learning_rate": 3.3455882352941178e-06, "loss": 0.573, "step": 91 }, { "epoch": 0.1016949152542373, "grad_norm": 0.5412639379501343, "learning_rate": 3.382352941176471e-06, "loss": 0.583, "step": 92 }, { "epoch": 0.1028002947678703, "grad_norm": 0.4543777406215668, "learning_rate": 3.419117647058824e-06, "loss": 0.5809, "step": 93 }, { "epoch": 0.10390567428150331, "grad_norm": 0.4360213577747345, "learning_rate": 3.4558823529411766e-06, "loss": 0.5715, "step": 94 }, { "epoch": 0.10501105379513632, "grad_norm": 0.4994044899940491, "learning_rate": 3.4926470588235295e-06, "loss": 0.5743, "step": 95 }, { "epoch": 0.10611643330876934, "grad_norm": 0.4798681437969208, "learning_rate": 3.529411764705883e-06, "loss": 0.5728, "step": 96 }, { "epoch": 0.10722181282240235, "grad_norm": 0.5267371535301208, "learning_rate": 3.566176470588236e-06, "loss": 0.5953, "step": 97 }, { "epoch": 0.10832719233603537, "grad_norm": 0.48274433612823486, "learning_rate": 3.6029411764705883e-06, "loss": 0.6019, "step": 98 }, { "epoch": 0.10943257184966838, "grad_norm": 0.507043719291687, "learning_rate": 3.6397058823529413e-06, "loss": 0.5998, "step": 99 }, { "epoch": 0.1105379513633014, "grad_norm": 0.47128158807754517, "learning_rate": 3.6764705882352946e-06, "loss": 0.5941, "step": 100 }, { "epoch": 0.11164333087693441, "grad_norm": 0.42746299505233765, "learning_rate": 3.7132352941176476e-06, "loss": 0.5631, "step": 101 }, { "epoch": 0.11274871039056743, "grad_norm": 0.4545954465866089, "learning_rate": 3.7500000000000005e-06, "loss": 0.5963, "step": 102 }, { "epoch": 0.11385408990420044, "grad_norm": 0.45454320311546326, "learning_rate": 3.786764705882353e-06, "loss": 0.6126, "step": 103 }, { "epoch": 0.11495946941783346, "grad_norm": 0.42147010564804077, "learning_rate": 3.8235294117647055e-06, "loss": 0.5989, "step": 104 }, { "epoch": 0.11606484893146647, "grad_norm": 0.45101892948150635, "learning_rate": 3.860294117647059e-06, "loss": 0.5817, "step": 105 }, { "epoch": 0.11717022844509949, "grad_norm": 0.4342888593673706, "learning_rate": 3.897058823529412e-06, "loss": 0.5783, "step": 106 }, { "epoch": 0.1182756079587325, "grad_norm": 0.5361742973327637, "learning_rate": 3.933823529411765e-06, "loss": 0.5818, "step": 107 }, { "epoch": 0.11938098747236552, "grad_norm": 0.4235663115978241, "learning_rate": 3.970588235294118e-06, "loss": 0.5875, "step": 108 }, { "epoch": 0.12048636698599853, "grad_norm": 0.4940057098865509, "learning_rate": 4.007352941176471e-06, "loss": 0.5791, "step": 109 }, { "epoch": 0.12159174649963155, "grad_norm": 0.4368382692337036, "learning_rate": 4.044117647058824e-06, "loss": 0.5958, "step": 110 }, { "epoch": 0.12269712601326456, "grad_norm": 0.5182525515556335, "learning_rate": 4.080882352941177e-06, "loss": 0.6064, "step": 111 }, { "epoch": 0.12380250552689757, "grad_norm": 0.45458662509918213, "learning_rate": 4.11764705882353e-06, "loss": 0.5614, "step": 112 }, { "epoch": 0.12490788504053059, "grad_norm": 0.4230400323867798, "learning_rate": 4.154411764705883e-06, "loss": 0.5765, "step": 113 }, { "epoch": 0.1260132645541636, "grad_norm": 0.48289769887924194, "learning_rate": 4.191176470588236e-06, "loss": 0.6065, "step": 114 }, { "epoch": 0.1271186440677966, "grad_norm": 0.4700344204902649, "learning_rate": 4.227941176470589e-06, "loss": 0.5775, "step": 115 }, { "epoch": 0.12822402358142962, "grad_norm": 0.5134259462356567, "learning_rate": 4.264705882352942e-06, "loss": 0.5759, "step": 116 }, { "epoch": 0.12932940309506263, "grad_norm": 0.5416266322135925, "learning_rate": 4.301470588235295e-06, "loss": 0.5848, "step": 117 }, { "epoch": 0.13043478260869565, "grad_norm": 0.45261380076408386, "learning_rate": 4.3382352941176475e-06, "loss": 0.5859, "step": 118 }, { "epoch": 0.13154016212232866, "grad_norm": 0.46047863364219666, "learning_rate": 4.3750000000000005e-06, "loss": 0.5686, "step": 119 }, { "epoch": 0.13264554163596168, "grad_norm": 0.49938341975212097, "learning_rate": 4.411764705882353e-06, "loss": 0.5634, "step": 120 }, { "epoch": 0.1337509211495947, "grad_norm": 0.4719489812850952, "learning_rate": 4.448529411764706e-06, "loss": 0.5688, "step": 121 }, { "epoch": 0.1348563006632277, "grad_norm": 0.4679771959781647, "learning_rate": 4.485294117647059e-06, "loss": 0.5482, "step": 122 }, { "epoch": 0.13596168017686072, "grad_norm": 0.4308927059173584, "learning_rate": 4.522058823529412e-06, "loss": 0.5693, "step": 123 }, { "epoch": 0.13706705969049374, "grad_norm": 0.4673250913619995, "learning_rate": 4.558823529411765e-06, "loss": 0.5775, "step": 124 }, { "epoch": 0.13817243920412675, "grad_norm": 0.4882560670375824, "learning_rate": 4.595588235294118e-06, "loss": 0.5405, "step": 125 }, { "epoch": 0.13927781871775977, "grad_norm": 0.4822829067707062, "learning_rate": 4.632352941176471e-06, "loss": 0.569, "step": 126 }, { "epoch": 0.14038319823139278, "grad_norm": 0.4564063549041748, "learning_rate": 4.669117647058824e-06, "loss": 0.5917, "step": 127 }, { "epoch": 0.1414885777450258, "grad_norm": 0.4588386118412018, "learning_rate": 4.705882352941177e-06, "loss": 0.5487, "step": 128 }, { "epoch": 0.1425939572586588, "grad_norm": 0.45219072699546814, "learning_rate": 4.74264705882353e-06, "loss": 0.565, "step": 129 }, { "epoch": 0.14369933677229182, "grad_norm": 0.4773537814617157, "learning_rate": 4.779411764705883e-06, "loss": 0.5805, "step": 130 }, { "epoch": 0.14480471628592484, "grad_norm": 0.43788325786590576, "learning_rate": 4.816176470588236e-06, "loss": 0.5536, "step": 131 }, { "epoch": 0.14591009579955785, "grad_norm": 0.425717294216156, "learning_rate": 4.852941176470589e-06, "loss": 0.5904, "step": 132 }, { "epoch": 0.14701547531319087, "grad_norm": 0.4549780488014221, "learning_rate": 4.889705882352942e-06, "loss": 0.5557, "step": 133 }, { "epoch": 0.14812085482682388, "grad_norm": 0.46649402379989624, "learning_rate": 4.9264705882352945e-06, "loss": 0.5636, "step": 134 }, { "epoch": 0.1492262343404569, "grad_norm": 0.45299869775772095, "learning_rate": 4.9632352941176475e-06, "loss": 0.5742, "step": 135 }, { "epoch": 0.1503316138540899, "grad_norm": 0.4986342787742615, "learning_rate": 5e-06, "loss": 0.5964, "step": 136 }, { "epoch": 0.15143699336772293, "grad_norm": 0.4568003714084625, "learning_rate": 5.036764705882353e-06, "loss": 0.5577, "step": 137 }, { "epoch": 0.15254237288135594, "grad_norm": 0.4697153568267822, "learning_rate": 5.073529411764706e-06, "loss": 0.5721, "step": 138 }, { "epoch": 0.15364775239498896, "grad_norm": 0.47089800238609314, "learning_rate": 5.110294117647059e-06, "loss": 0.5816, "step": 139 }, { "epoch": 0.15475313190862197, "grad_norm": 0.4529527723789215, "learning_rate": 5.147058823529411e-06, "loss": 0.5405, "step": 140 }, { "epoch": 0.15585851142225499, "grad_norm": 0.4688420593738556, "learning_rate": 5.183823529411766e-06, "loss": 0.569, "step": 141 }, { "epoch": 0.156963890935888, "grad_norm": 0.4281812906265259, "learning_rate": 5.220588235294118e-06, "loss": 0.5433, "step": 142 }, { "epoch": 0.15806927044952102, "grad_norm": 0.4788137972354889, "learning_rate": 5.257352941176471e-06, "loss": 0.5736, "step": 143 }, { "epoch": 0.159174649963154, "grad_norm": 0.49690037965774536, "learning_rate": 5.294117647058824e-06, "loss": 0.5437, "step": 144 }, { "epoch": 0.16028002947678702, "grad_norm": 0.4782974421977997, "learning_rate": 5.330882352941177e-06, "loss": 0.5706, "step": 145 }, { "epoch": 0.16138540899042003, "grad_norm": 0.49766674637794495, "learning_rate": 5.36764705882353e-06, "loss": 0.583, "step": 146 }, { "epoch": 0.16249078850405305, "grad_norm": 0.43575310707092285, "learning_rate": 5.404411764705883e-06, "loss": 0.5699, "step": 147 }, { "epoch": 0.16359616801768606, "grad_norm": 0.5305167436599731, "learning_rate": 5.441176470588236e-06, "loss": 0.561, "step": 148 }, { "epoch": 0.16470154753131908, "grad_norm": 0.47852370142936707, "learning_rate": 5.4779411764705894e-06, "loss": 0.5668, "step": 149 }, { "epoch": 0.1658069270449521, "grad_norm": 0.4962552785873413, "learning_rate": 5.514705882352942e-06, "loss": 0.5215, "step": 150 }, { "epoch": 0.1669123065585851, "grad_norm": 0.5402052402496338, "learning_rate": 5.5514705882352945e-06, "loss": 0.5534, "step": 151 }, { "epoch": 0.16801768607221812, "grad_norm": 0.4789322316646576, "learning_rate": 5.588235294117647e-06, "loss": 0.5216, "step": 152 }, { "epoch": 0.16912306558585113, "grad_norm": 0.46958643198013306, "learning_rate": 5.625e-06, "loss": 0.5518, "step": 153 }, { "epoch": 0.17022844509948415, "grad_norm": 0.47278931736946106, "learning_rate": 5.661764705882353e-06, "loss": 0.5829, "step": 154 }, { "epoch": 0.17133382461311716, "grad_norm": 0.4709634780883789, "learning_rate": 5.698529411764706e-06, "loss": 0.557, "step": 155 }, { "epoch": 0.17243920412675018, "grad_norm": 0.49254631996154785, "learning_rate": 5.735294117647059e-06, "loss": 0.5549, "step": 156 }, { "epoch": 0.1735445836403832, "grad_norm": 0.4257882833480835, "learning_rate": 5.772058823529412e-06, "loss": 0.5552, "step": 157 }, { "epoch": 0.1746499631540162, "grad_norm": 0.5098555684089661, "learning_rate": 5.808823529411766e-06, "loss": 0.5744, "step": 158 }, { "epoch": 0.17575534266764922, "grad_norm": 0.4555637836456299, "learning_rate": 5.845588235294119e-06, "loss": 0.5697, "step": 159 }, { "epoch": 0.17686072218128224, "grad_norm": 0.45549020171165466, "learning_rate": 5.882352941176471e-06, "loss": 0.5491, "step": 160 }, { "epoch": 0.17796610169491525, "grad_norm": 0.43787822127342224, "learning_rate": 5.919117647058824e-06, "loss": 0.5221, "step": 161 }, { "epoch": 0.17907148120854827, "grad_norm": 0.4425792694091797, "learning_rate": 5.955882352941177e-06, "loss": 0.5366, "step": 162 }, { "epoch": 0.18017686072218128, "grad_norm": 0.5634394288063049, "learning_rate": 5.99264705882353e-06, "loss": 0.5433, "step": 163 }, { "epoch": 0.1812822402358143, "grad_norm": 0.44385239481925964, "learning_rate": 6.029411764705883e-06, "loss": 0.5234, "step": 164 }, { "epoch": 0.1823876197494473, "grad_norm": 0.4794684946537018, "learning_rate": 6.066176470588236e-06, "loss": 0.555, "step": 165 }, { "epoch": 0.18349299926308033, "grad_norm": 0.44709575176239014, "learning_rate": 6.102941176470589e-06, "loss": 0.5522, "step": 166 }, { "epoch": 0.18459837877671334, "grad_norm": 0.4927119314670563, "learning_rate": 6.139705882352942e-06, "loss": 0.5642, "step": 167 }, { "epoch": 0.18570375829034635, "grad_norm": 0.46800270676612854, "learning_rate": 6.176470588235295e-06, "loss": 0.5639, "step": 168 }, { "epoch": 0.18680913780397937, "grad_norm": 0.5303888916969299, "learning_rate": 6.213235294117647e-06, "loss": 0.5568, "step": 169 }, { "epoch": 0.18791451731761238, "grad_norm": 0.5041533708572388, "learning_rate": 6.25e-06, "loss": 0.5596, "step": 170 }, { "epoch": 0.1890198968312454, "grad_norm": 0.4551161825656891, "learning_rate": 6.286764705882353e-06, "loss": 0.534, "step": 171 }, { "epoch": 0.1901252763448784, "grad_norm": 0.5476518869400024, "learning_rate": 6.323529411764706e-06, "loss": 0.5427, "step": 172 }, { "epoch": 0.19123065585851143, "grad_norm": 0.511348307132721, "learning_rate": 6.360294117647059e-06, "loss": 0.5632, "step": 173 }, { "epoch": 0.19233603537214444, "grad_norm": 0.5366650223731995, "learning_rate": 6.397058823529412e-06, "loss": 0.5521, "step": 174 }, { "epoch": 0.19344141488577746, "grad_norm": 0.5286919474601746, "learning_rate": 6.433823529411766e-06, "loss": 0.5224, "step": 175 }, { "epoch": 0.19454679439941047, "grad_norm": 0.49304676055908203, "learning_rate": 6.470588235294119e-06, "loss": 0.5569, "step": 176 }, { "epoch": 0.1956521739130435, "grad_norm": 0.5082322359085083, "learning_rate": 6.507352941176472e-06, "loss": 0.5421, "step": 177 }, { "epoch": 0.1967575534266765, "grad_norm": 0.5189738869667053, "learning_rate": 6.544117647058824e-06, "loss": 0.5472, "step": 178 }, { "epoch": 0.19786293294030952, "grad_norm": 0.46261143684387207, "learning_rate": 6.580882352941177e-06, "loss": 0.5507, "step": 179 }, { "epoch": 0.19896831245394253, "grad_norm": 0.5298084020614624, "learning_rate": 6.61764705882353e-06, "loss": 0.5369, "step": 180 }, { "epoch": 0.20007369196757555, "grad_norm": 0.5241328477859497, "learning_rate": 6.654411764705883e-06, "loss": 0.5452, "step": 181 }, { "epoch": 0.20117907148120856, "grad_norm": 0.5809250473976135, "learning_rate": 6.6911764705882356e-06, "loss": 0.5448, "step": 182 }, { "epoch": 0.20228445099484157, "grad_norm": 0.4426633417606354, "learning_rate": 6.727941176470589e-06, "loss": 0.5501, "step": 183 }, { "epoch": 0.2033898305084746, "grad_norm": 0.5121925473213196, "learning_rate": 6.764705882352942e-06, "loss": 0.5352, "step": 184 }, { "epoch": 0.2044952100221076, "grad_norm": 0.5188096165657043, "learning_rate": 6.801470588235295e-06, "loss": 0.5369, "step": 185 }, { "epoch": 0.2056005895357406, "grad_norm": 0.41698119044303894, "learning_rate": 6.838235294117648e-06, "loss": 0.5288, "step": 186 }, { "epoch": 0.2067059690493736, "grad_norm": 0.5744749307632446, "learning_rate": 6.875e-06, "loss": 0.5243, "step": 187 }, { "epoch": 0.20781134856300662, "grad_norm": 0.49398308992385864, "learning_rate": 6.911764705882353e-06, "loss": 0.5138, "step": 188 }, { "epoch": 0.20891672807663964, "grad_norm": 0.46186763048171997, "learning_rate": 6.948529411764706e-06, "loss": 0.554, "step": 189 }, { "epoch": 0.21002210759027265, "grad_norm": 0.5792961120605469, "learning_rate": 6.985294117647059e-06, "loss": 0.5144, "step": 190 }, { "epoch": 0.21112748710390566, "grad_norm": 0.5596435070037842, "learning_rate": 7.022058823529412e-06, "loss": 0.5592, "step": 191 }, { "epoch": 0.21223286661753868, "grad_norm": 0.45052850246429443, "learning_rate": 7.058823529411766e-06, "loss": 0.5423, "step": 192 }, { "epoch": 0.2133382461311717, "grad_norm": 0.5268111228942871, "learning_rate": 7.095588235294119e-06, "loss": 0.5466, "step": 193 }, { "epoch": 0.2144436256448047, "grad_norm": 0.5137447118759155, "learning_rate": 7.132352941176472e-06, "loss": 0.5034, "step": 194 }, { "epoch": 0.21554900515843772, "grad_norm": 0.5449075102806091, "learning_rate": 7.169117647058825e-06, "loss": 0.5283, "step": 195 }, { "epoch": 0.21665438467207074, "grad_norm": 0.4578010141849518, "learning_rate": 7.205882352941177e-06, "loss": 0.4956, "step": 196 }, { "epoch": 0.21775976418570375, "grad_norm": 0.5838969349861145, "learning_rate": 7.24264705882353e-06, "loss": 0.545, "step": 197 }, { "epoch": 0.21886514369933677, "grad_norm": 0.44667795300483704, "learning_rate": 7.2794117647058826e-06, "loss": 0.544, "step": 198 }, { "epoch": 0.21997052321296978, "grad_norm": 0.509713888168335, "learning_rate": 7.3161764705882355e-06, "loss": 0.5471, "step": 199 }, { "epoch": 0.2210759027266028, "grad_norm": 0.5461410284042358, "learning_rate": 7.352941176470589e-06, "loss": 0.5445, "step": 200 }, { "epoch": 0.2221812822402358, "grad_norm": 0.5272083282470703, "learning_rate": 7.389705882352942e-06, "loss": 0.5263, "step": 201 }, { "epoch": 0.22328666175386883, "grad_norm": 0.5492861866950989, "learning_rate": 7.426470588235295e-06, "loss": 0.5472, "step": 202 }, { "epoch": 0.22439204126750184, "grad_norm": 0.5130373239517212, "learning_rate": 7.463235294117648e-06, "loss": 0.5282, "step": 203 }, { "epoch": 0.22549742078113486, "grad_norm": 0.49408307671546936, "learning_rate": 7.500000000000001e-06, "loss": 0.5175, "step": 204 }, { "epoch": 0.22660280029476787, "grad_norm": 0.5356124639511108, "learning_rate": 7.536764705882353e-06, "loss": 0.515, "step": 205 }, { "epoch": 0.22770817980840088, "grad_norm": 0.6235688924789429, "learning_rate": 7.573529411764706e-06, "loss": 0.556, "step": 206 }, { "epoch": 0.2288135593220339, "grad_norm": 0.5164855122566223, "learning_rate": 7.610294117647059e-06, "loss": 0.5227, "step": 207 }, { "epoch": 0.22991893883566691, "grad_norm": 0.5702990889549255, "learning_rate": 7.647058823529411e-06, "loss": 0.5597, "step": 208 }, { "epoch": 0.23102431834929993, "grad_norm": 0.5412698984146118, "learning_rate": 7.683823529411766e-06, "loss": 0.5476, "step": 209 }, { "epoch": 0.23212969786293294, "grad_norm": 0.5291882157325745, "learning_rate": 7.720588235294119e-06, "loss": 0.5159, "step": 210 }, { "epoch": 0.23323507737656596, "grad_norm": 0.47034499049186707, "learning_rate": 7.757352941176472e-06, "loss": 0.5227, "step": 211 }, { "epoch": 0.23434045689019897, "grad_norm": 0.5935096740722656, "learning_rate": 7.794117647058825e-06, "loss": 0.5418, "step": 212 }, { "epoch": 0.235445836403832, "grad_norm": 0.5457180738449097, "learning_rate": 7.830882352941177e-06, "loss": 0.5551, "step": 213 }, { "epoch": 0.236551215917465, "grad_norm": 0.6419122815132141, "learning_rate": 7.86764705882353e-06, "loss": 0.5346, "step": 214 }, { "epoch": 0.23765659543109802, "grad_norm": 0.6282505989074707, "learning_rate": 7.904411764705883e-06, "loss": 0.5377, "step": 215 }, { "epoch": 0.23876197494473103, "grad_norm": 0.6698830723762512, "learning_rate": 7.941176470588236e-06, "loss": 0.5487, "step": 216 }, { "epoch": 0.23986735445836405, "grad_norm": 0.616992175579071, "learning_rate": 7.97794117647059e-06, "loss": 0.5322, "step": 217 }, { "epoch": 0.24097273397199706, "grad_norm": 0.5718163251876831, "learning_rate": 8.014705882352942e-06, "loss": 0.5219, "step": 218 }, { "epoch": 0.24207811348563008, "grad_norm": 0.5170466303825378, "learning_rate": 8.051470588235295e-06, "loss": 0.518, "step": 219 }, { "epoch": 0.2431834929992631, "grad_norm": 0.5410168170928955, "learning_rate": 8.088235294117648e-06, "loss": 0.5172, "step": 220 }, { "epoch": 0.2442888725128961, "grad_norm": 0.5024592280387878, "learning_rate": 8.125000000000001e-06, "loss": 0.5392, "step": 221 }, { "epoch": 0.24539425202652912, "grad_norm": 0.53636234998703, "learning_rate": 8.161764705882354e-06, "loss": 0.5398, "step": 222 }, { "epoch": 0.24649963154016213, "grad_norm": 0.5443093776702881, "learning_rate": 8.198529411764707e-06, "loss": 0.5304, "step": 223 }, { "epoch": 0.24760501105379515, "grad_norm": 0.5792282819747925, "learning_rate": 8.23529411764706e-06, "loss": 0.577, "step": 224 }, { "epoch": 0.24871039056742816, "grad_norm": 0.46914142370224, "learning_rate": 8.272058823529413e-06, "loss": 0.5405, "step": 225 }, { "epoch": 0.24981577008106118, "grad_norm": 0.5338611006736755, "learning_rate": 8.308823529411766e-06, "loss": 0.5322, "step": 226 }, { "epoch": 0.2509211495946942, "grad_norm": 0.5636720657348633, "learning_rate": 8.345588235294119e-06, "loss": 0.5427, "step": 227 }, { "epoch": 0.2520265291083272, "grad_norm": 0.5476045608520508, "learning_rate": 8.382352941176472e-06, "loss": 0.5552, "step": 228 }, { "epoch": 0.2531319086219602, "grad_norm": 0.5906997919082642, "learning_rate": 8.419117647058824e-06, "loss": 0.5416, "step": 229 }, { "epoch": 0.2542372881355932, "grad_norm": 0.46757903695106506, "learning_rate": 8.455882352941177e-06, "loss": 0.5245, "step": 230 }, { "epoch": 0.25534266764922625, "grad_norm": 0.5489746928215027, "learning_rate": 8.49264705882353e-06, "loss": 0.5077, "step": 231 }, { "epoch": 0.25644804716285924, "grad_norm": 0.6381259560585022, "learning_rate": 8.529411764705883e-06, "loss": 0.5517, "step": 232 }, { "epoch": 0.2575534266764923, "grad_norm": 0.5217357873916626, "learning_rate": 8.566176470588236e-06, "loss": 0.5643, "step": 233 }, { "epoch": 0.25865880619012527, "grad_norm": 0.5020130276679993, "learning_rate": 8.60294117647059e-06, "loss": 0.5092, "step": 234 }, { "epoch": 0.2597641857037583, "grad_norm": 0.5788955688476562, "learning_rate": 8.639705882352942e-06, "loss": 0.5122, "step": 235 }, { "epoch": 0.2608695652173913, "grad_norm": 0.6943607330322266, "learning_rate": 8.676470588235295e-06, "loss": 0.5487, "step": 236 }, { "epoch": 0.26197494473102434, "grad_norm": 0.5365936160087585, "learning_rate": 8.713235294117648e-06, "loss": 0.5578, "step": 237 }, { "epoch": 0.2630803242446573, "grad_norm": 0.5747148990631104, "learning_rate": 8.750000000000001e-06, "loss": 0.5599, "step": 238 }, { "epoch": 0.26418570375829037, "grad_norm": 0.6929720640182495, "learning_rate": 8.786764705882354e-06, "loss": 0.555, "step": 239 }, { "epoch": 0.26529108327192336, "grad_norm": 0.4534219801425934, "learning_rate": 8.823529411764707e-06, "loss": 0.5532, "step": 240 }, { "epoch": 0.2663964627855564, "grad_norm": 0.542911946773529, "learning_rate": 8.86029411764706e-06, "loss": 0.5342, "step": 241 }, { "epoch": 0.2675018422991894, "grad_norm": 0.5279428362846375, "learning_rate": 8.897058823529413e-06, "loss": 0.5515, "step": 242 }, { "epoch": 0.2686072218128224, "grad_norm": 0.47872450947761536, "learning_rate": 8.933823529411766e-06, "loss": 0.5361, "step": 243 }, { "epoch": 0.2697126013264554, "grad_norm": 0.5772373080253601, "learning_rate": 8.970588235294119e-06, "loss": 0.559, "step": 244 }, { "epoch": 0.27081798084008846, "grad_norm": 0.5207620859146118, "learning_rate": 9.007352941176471e-06, "loss": 0.5241, "step": 245 }, { "epoch": 0.27192336035372144, "grad_norm": 0.5398781299591064, "learning_rate": 9.044117647058824e-06, "loss": 0.5373, "step": 246 }, { "epoch": 0.27302873986735443, "grad_norm": 0.5014872550964355, "learning_rate": 9.080882352941177e-06, "loss": 0.5101, "step": 247 }, { "epoch": 0.2741341193809875, "grad_norm": 0.48251187801361084, "learning_rate": 9.11764705882353e-06, "loss": 0.5261, "step": 248 }, { "epoch": 0.27523949889462046, "grad_norm": 0.5711583495140076, "learning_rate": 9.154411764705883e-06, "loss": 0.5308, "step": 249 }, { "epoch": 0.2763448784082535, "grad_norm": 0.5239256620407104, "learning_rate": 9.191176470588236e-06, "loss": 0.5209, "step": 250 }, { "epoch": 0.2774502579218865, "grad_norm": 0.47207388281822205, "learning_rate": 9.227941176470589e-06, "loss": 0.5159, "step": 251 }, { "epoch": 0.27855563743551953, "grad_norm": 0.5202915072441101, "learning_rate": 9.264705882352942e-06, "loss": 0.5038, "step": 252 }, { "epoch": 0.2796610169491525, "grad_norm": 0.47947701811790466, "learning_rate": 9.301470588235295e-06, "loss": 0.5255, "step": 253 }, { "epoch": 0.28076639646278556, "grad_norm": 0.5310620665550232, "learning_rate": 9.338235294117648e-06, "loss": 0.5547, "step": 254 }, { "epoch": 0.28187177597641855, "grad_norm": 0.48968780040740967, "learning_rate": 9.375000000000001e-06, "loss": 0.5331, "step": 255 }, { "epoch": 0.2829771554900516, "grad_norm": 0.5163161754608154, "learning_rate": 9.411764705882354e-06, "loss": 0.5305, "step": 256 }, { "epoch": 0.2840825350036846, "grad_norm": 0.5558844804763794, "learning_rate": 9.448529411764707e-06, "loss": 0.5291, "step": 257 }, { "epoch": 0.2851879145173176, "grad_norm": 0.44736921787261963, "learning_rate": 9.48529411764706e-06, "loss": 0.5131, "step": 258 }, { "epoch": 0.2862932940309506, "grad_norm": 0.6266206502914429, "learning_rate": 9.522058823529413e-06, "loss": 0.5511, "step": 259 }, { "epoch": 0.28739867354458365, "grad_norm": 0.5110976099967957, "learning_rate": 9.558823529411766e-06, "loss": 0.5052, "step": 260 }, { "epoch": 0.28850405305821664, "grad_norm": 0.515807569026947, "learning_rate": 9.595588235294119e-06, "loss": 0.5424, "step": 261 }, { "epoch": 0.2896094325718497, "grad_norm": 0.5821594595909119, "learning_rate": 9.632352941176471e-06, "loss": 0.5424, "step": 262 }, { "epoch": 0.29071481208548267, "grad_norm": 0.45542535185813904, "learning_rate": 9.669117647058824e-06, "loss": 0.5238, "step": 263 }, { "epoch": 0.2918201915991157, "grad_norm": 0.6616759300231934, "learning_rate": 9.705882352941177e-06, "loss": 0.5252, "step": 264 }, { "epoch": 0.2929255711127487, "grad_norm": 0.5062791705131531, "learning_rate": 9.74264705882353e-06, "loss": 0.5138, "step": 265 }, { "epoch": 0.29403095062638174, "grad_norm": 0.5784790515899658, "learning_rate": 9.779411764705883e-06, "loss": 0.5624, "step": 266 }, { "epoch": 0.2951363301400147, "grad_norm": 0.5244966149330139, "learning_rate": 9.816176470588236e-06, "loss": 0.529, "step": 267 }, { "epoch": 0.29624170965364777, "grad_norm": 0.5308823585510254, "learning_rate": 9.852941176470589e-06, "loss": 0.5393, "step": 268 }, { "epoch": 0.29734708916728075, "grad_norm": 0.5184321403503418, "learning_rate": 9.889705882352942e-06, "loss": 0.5185, "step": 269 }, { "epoch": 0.2984524686809138, "grad_norm": 0.49477338790893555, "learning_rate": 9.926470588235295e-06, "loss": 0.5353, "step": 270 }, { "epoch": 0.2995578481945468, "grad_norm": 0.4364514648914337, "learning_rate": 9.963235294117648e-06, "loss": 0.5213, "step": 271 }, { "epoch": 0.3006632277081798, "grad_norm": 0.48450222611427307, "learning_rate": 1e-05, "loss": 0.5342, "step": 272 }, { "epoch": 0.3017686072218128, "grad_norm": 0.5775253176689148, "learning_rate": 9.999995855615494e-06, "loss": 0.5113, "step": 273 }, { "epoch": 0.30287398673544585, "grad_norm": 0.48574355244636536, "learning_rate": 9.999983422468849e-06, "loss": 0.5292, "step": 274 }, { "epoch": 0.30397936624907884, "grad_norm": 0.4984934329986572, "learning_rate": 9.99996270058067e-06, "loss": 0.5083, "step": 275 }, { "epoch": 0.3050847457627119, "grad_norm": 0.4925508499145508, "learning_rate": 9.999933689985315e-06, "loss": 0.524, "step": 276 }, { "epoch": 0.30619012527634487, "grad_norm": 0.47675901651382446, "learning_rate": 9.999896390730872e-06, "loss": 0.546, "step": 277 }, { "epoch": 0.3072955047899779, "grad_norm": 0.5151447057723999, "learning_rate": 9.999850802879177e-06, "loss": 0.5101, "step": 278 }, { "epoch": 0.3084008843036109, "grad_norm": 0.47942131757736206, "learning_rate": 9.999796926505803e-06, "loss": 0.5248, "step": 279 }, { "epoch": 0.30950626381724394, "grad_norm": 0.5642871260643005, "learning_rate": 9.999734761700061e-06, "loss": 0.5178, "step": 280 }, { "epoch": 0.31061164333087693, "grad_norm": 0.5059849619865417, "learning_rate": 9.999664308565009e-06, "loss": 0.5287, "step": 281 }, { "epoch": 0.31171702284450997, "grad_norm": 0.6773280501365662, "learning_rate": 9.99958556721744e-06, "loss": 0.5476, "step": 282 }, { "epoch": 0.31282240235814296, "grad_norm": 0.45705118775367737, "learning_rate": 9.999498537787884e-06, "loss": 0.5167, "step": 283 }, { "epoch": 0.313927781871776, "grad_norm": 0.5016194581985474, "learning_rate": 9.999403220420619e-06, "loss": 0.5501, "step": 284 }, { "epoch": 0.315033161385409, "grad_norm": 0.5689473152160645, "learning_rate": 9.999299615273655e-06, "loss": 0.5158, "step": 285 }, { "epoch": 0.31613854089904203, "grad_norm": 0.5179623961448669, "learning_rate": 9.999187722518747e-06, "loss": 0.5376, "step": 286 }, { "epoch": 0.317243920412675, "grad_norm": 1.1839812994003296, "learning_rate": 9.99906754234138e-06, "loss": 0.5136, "step": 287 }, { "epoch": 0.318349299926308, "grad_norm": 0.5192070007324219, "learning_rate": 9.998939074940788e-06, "loss": 0.5133, "step": 288 }, { "epoch": 0.31945467943994105, "grad_norm": 0.49143102765083313, "learning_rate": 9.998802320529938e-06, "loss": 0.5304, "step": 289 }, { "epoch": 0.32056005895357403, "grad_norm": 0.5001342296600342, "learning_rate": 9.99865727933553e-06, "loss": 0.5193, "step": 290 }, { "epoch": 0.3216654384672071, "grad_norm": 0.48756471276283264, "learning_rate": 9.998503951598015e-06, "loss": 0.5297, "step": 291 }, { "epoch": 0.32277081798084006, "grad_norm": 0.58963543176651, "learning_rate": 9.998342337571566e-06, "loss": 0.5152, "step": 292 }, { "epoch": 0.3238761974944731, "grad_norm": 0.5163702964782715, "learning_rate": 9.998172437524103e-06, "loss": 0.5171, "step": 293 }, { "epoch": 0.3249815770081061, "grad_norm": 0.4882372319698334, "learning_rate": 9.997994251737276e-06, "loss": 0.5294, "step": 294 }, { "epoch": 0.32608695652173914, "grad_norm": 0.512572169303894, "learning_rate": 9.997807780506473e-06, "loss": 0.5306, "step": 295 }, { "epoch": 0.3271923360353721, "grad_norm": 0.5832269191741943, "learning_rate": 9.99761302414082e-06, "loss": 0.5199, "step": 296 }, { "epoch": 0.32829771554900516, "grad_norm": 0.46874383091926575, "learning_rate": 9.997409982963173e-06, "loss": 0.495, "step": 297 }, { "epoch": 0.32940309506263815, "grad_norm": 0.5713374614715576, "learning_rate": 9.997198657310126e-06, "loss": 0.5273, "step": 298 }, { "epoch": 0.3305084745762712, "grad_norm": 0.47881534695625305, "learning_rate": 9.996979047532001e-06, "loss": 0.5112, "step": 299 }, { "epoch": 0.3316138540899042, "grad_norm": 0.6145876049995422, "learning_rate": 9.996751153992861e-06, "loss": 0.5227, "step": 300 }, { "epoch": 0.3327192336035372, "grad_norm": 0.5647861361503601, "learning_rate": 9.996514977070497e-06, "loss": 0.5472, "step": 301 }, { "epoch": 0.3338246131171702, "grad_norm": 0.5180700421333313, "learning_rate": 9.996270517156431e-06, "loss": 0.5116, "step": 302 }, { "epoch": 0.33492999263080325, "grad_norm": 0.5466155409812927, "learning_rate": 9.996017774655917e-06, "loss": 0.5232, "step": 303 }, { "epoch": 0.33603537214443624, "grad_norm": 0.556660532951355, "learning_rate": 9.995756749987942e-06, "loss": 0.5281, "step": 304 }, { "epoch": 0.3371407516580693, "grad_norm": 0.5523375868797302, "learning_rate": 9.995487443585217e-06, "loss": 0.5072, "step": 305 }, { "epoch": 0.33824613117170227, "grad_norm": 0.5007307529449463, "learning_rate": 9.995209855894191e-06, "loss": 0.5082, "step": 306 }, { "epoch": 0.3393515106853353, "grad_norm": 0.5231107473373413, "learning_rate": 9.994923987375029e-06, "loss": 0.5156, "step": 307 }, { "epoch": 0.3404568901989683, "grad_norm": 0.5620622038841248, "learning_rate": 9.994629838501637e-06, "loss": 0.5285, "step": 308 }, { "epoch": 0.34156226971260134, "grad_norm": 0.5452278256416321, "learning_rate": 9.994327409761637e-06, "loss": 0.51, "step": 309 }, { "epoch": 0.3426676492262343, "grad_norm": 0.5257782936096191, "learning_rate": 9.994016701656384e-06, "loss": 0.496, "step": 310 }, { "epoch": 0.34377302873986737, "grad_norm": 0.5400412082672119, "learning_rate": 9.993697714700957e-06, "loss": 0.5362, "step": 311 }, { "epoch": 0.34487840825350036, "grad_norm": 0.5909719467163086, "learning_rate": 9.993370449424153e-06, "loss": 0.5348, "step": 312 }, { "epoch": 0.3459837877671334, "grad_norm": 0.48313477635383606, "learning_rate": 9.993034906368502e-06, "loss": 0.5261, "step": 313 }, { "epoch": 0.3470891672807664, "grad_norm": 0.6287991404533386, "learning_rate": 9.992691086090249e-06, "loss": 0.5023, "step": 314 }, { "epoch": 0.34819454679439943, "grad_norm": 0.5219399929046631, "learning_rate": 9.992338989159363e-06, "loss": 0.5209, "step": 315 }, { "epoch": 0.3492999263080324, "grad_norm": 0.5161361694335938, "learning_rate": 9.991978616159535e-06, "loss": 0.5134, "step": 316 }, { "epoch": 0.35040530582166546, "grad_norm": 0.549393355846405, "learning_rate": 9.991609967688177e-06, "loss": 0.532, "step": 317 }, { "epoch": 0.35151068533529845, "grad_norm": 0.54290771484375, "learning_rate": 9.991233044356414e-06, "loss": 0.537, "step": 318 }, { "epoch": 0.3526160648489315, "grad_norm": 0.5386824607849121, "learning_rate": 9.990847846789093e-06, "loss": 0.5213, "step": 319 }, { "epoch": 0.3537214443625645, "grad_norm": 0.5484795570373535, "learning_rate": 9.990454375624778e-06, "loss": 0.4717, "step": 320 }, { "epoch": 0.3548268238761975, "grad_norm": 0.5217592716217041, "learning_rate": 9.990052631515746e-06, "loss": 0.5497, "step": 321 }, { "epoch": 0.3559322033898305, "grad_norm": 0.5892103910446167, "learning_rate": 9.98964261512799e-06, "loss": 0.5326, "step": 322 }, { "epoch": 0.35703758290346355, "grad_norm": 0.5677636861801147, "learning_rate": 9.989224327141215e-06, "loss": 0.5518, "step": 323 }, { "epoch": 0.35814296241709653, "grad_norm": 0.6619137525558472, "learning_rate": 9.988797768248844e-06, "loss": 0.5393, "step": 324 }, { "epoch": 0.3592483419307296, "grad_norm": 0.6067458391189575, "learning_rate": 9.988362939158e-06, "loss": 0.5155, "step": 325 }, { "epoch": 0.36035372144436256, "grad_norm": 0.5850074291229248, "learning_rate": 9.987919840589529e-06, "loss": 0.5212, "step": 326 }, { "epoch": 0.3614591009579956, "grad_norm": 0.6708207130432129, "learning_rate": 9.987468473277975e-06, "loss": 0.5471, "step": 327 }, { "epoch": 0.3625644804716286, "grad_norm": 0.45812103152275085, "learning_rate": 9.987008837971595e-06, "loss": 0.4967, "step": 328 }, { "epoch": 0.3636698599852616, "grad_norm": 0.551084041595459, "learning_rate": 9.986540935432354e-06, "loss": 0.5201, "step": 329 }, { "epoch": 0.3647752394988946, "grad_norm": 0.53420090675354, "learning_rate": 9.986064766435915e-06, "loss": 0.5234, "step": 330 }, { "epoch": 0.3658806190125276, "grad_norm": 0.5431454181671143, "learning_rate": 9.98558033177165e-06, "loss": 0.5219, "step": 331 }, { "epoch": 0.36698599852616065, "grad_norm": 0.4878302216529846, "learning_rate": 9.985087632242634e-06, "loss": 0.5332, "step": 332 }, { "epoch": 0.36809137803979364, "grad_norm": 0.593783438205719, "learning_rate": 9.984586668665641e-06, "loss": 0.5252, "step": 333 }, { "epoch": 0.3691967575534267, "grad_norm": 0.4893244206905365, "learning_rate": 9.984077441871144e-06, "loss": 0.5521, "step": 334 }, { "epoch": 0.37030213706705967, "grad_norm": 0.5851832032203674, "learning_rate": 9.983559952703316e-06, "loss": 0.514, "step": 335 }, { "epoch": 0.3714075165806927, "grad_norm": 0.5407524108886719, "learning_rate": 9.98303420202003e-06, "loss": 0.4965, "step": 336 }, { "epoch": 0.3725128960943257, "grad_norm": 0.6697810888290405, "learning_rate": 9.982500190692846e-06, "loss": 0.5323, "step": 337 }, { "epoch": 0.37361827560795874, "grad_norm": 0.5142413377761841, "learning_rate": 9.981957919607026e-06, "loss": 0.5364, "step": 338 }, { "epoch": 0.3747236551215917, "grad_norm": 0.5846256613731384, "learning_rate": 9.981407389661522e-06, "loss": 0.5229, "step": 339 }, { "epoch": 0.37582903463522477, "grad_norm": 0.5552329421043396, "learning_rate": 9.980848601768976e-06, "loss": 0.4926, "step": 340 }, { "epoch": 0.37693441414885775, "grad_norm": 0.5647974014282227, "learning_rate": 9.980281556855724e-06, "loss": 0.5328, "step": 341 }, { "epoch": 0.3780397936624908, "grad_norm": 0.5731161236763, "learning_rate": 9.97970625586178e-06, "loss": 0.5196, "step": 342 }, { "epoch": 0.3791451731761238, "grad_norm": 0.5248236656188965, "learning_rate": 9.97912269974086e-06, "loss": 0.5421, "step": 343 }, { "epoch": 0.3802505526897568, "grad_norm": 0.5064373016357422, "learning_rate": 9.978530889460351e-06, "loss": 0.4749, "step": 344 }, { "epoch": 0.3813559322033898, "grad_norm": 0.5121058225631714, "learning_rate": 9.977930826001328e-06, "loss": 0.5289, "step": 345 }, { "epoch": 0.38246131171702286, "grad_norm": 0.5517717599868774, "learning_rate": 9.977322510358552e-06, "loss": 0.4929, "step": 346 }, { "epoch": 0.38356669123065584, "grad_norm": 0.5280880928039551, "learning_rate": 9.976705943540458e-06, "loss": 0.5142, "step": 347 }, { "epoch": 0.3846720707442889, "grad_norm": 0.6166154146194458, "learning_rate": 9.976081126569164e-06, "loss": 0.514, "step": 348 }, { "epoch": 0.3857774502579219, "grad_norm": 0.5047816634178162, "learning_rate": 9.975448060480462e-06, "loss": 0.5578, "step": 349 }, { "epoch": 0.3868828297715549, "grad_norm": 0.49621471762657166, "learning_rate": 9.97480674632382e-06, "loss": 0.518, "step": 350 }, { "epoch": 0.3879882092851879, "grad_norm": 0.5390298962593079, "learning_rate": 9.974157185162377e-06, "loss": 0.5144, "step": 351 }, { "epoch": 0.38909358879882094, "grad_norm": 0.5356057286262512, "learning_rate": 9.973499378072947e-06, "loss": 0.5106, "step": 352 }, { "epoch": 0.39019896831245393, "grad_norm": 0.4981788396835327, "learning_rate": 9.97283332614601e-06, "loss": 0.5068, "step": 353 }, { "epoch": 0.391304347826087, "grad_norm": 0.5405673980712891, "learning_rate": 9.972159030485722e-06, "loss": 0.5389, "step": 354 }, { "epoch": 0.39240972733971996, "grad_norm": 0.5313906669616699, "learning_rate": 9.971476492209892e-06, "loss": 0.5321, "step": 355 }, { "epoch": 0.393515106853353, "grad_norm": 0.5733017921447754, "learning_rate": 9.970785712450007e-06, "loss": 0.5367, "step": 356 }, { "epoch": 0.394620486366986, "grad_norm": 0.5085213780403137, "learning_rate": 9.970086692351204e-06, "loss": 0.4888, "step": 357 }, { "epoch": 0.39572586588061903, "grad_norm": 0.5927191376686096, "learning_rate": 9.96937943307229e-06, "loss": 0.5475, "step": 358 }, { "epoch": 0.396831245394252, "grad_norm": 0.5176211595535278, "learning_rate": 9.968663935785724e-06, "loss": 0.5096, "step": 359 }, { "epoch": 0.39793662490788506, "grad_norm": 0.502750813961029, "learning_rate": 9.967940201677628e-06, "loss": 0.5352, "step": 360 }, { "epoch": 0.39904200442151805, "grad_norm": 0.5412819385528564, "learning_rate": 9.967208231947772e-06, "loss": 0.5067, "step": 361 }, { "epoch": 0.4001473839351511, "grad_norm": 0.5106357932090759, "learning_rate": 9.966468027809582e-06, "loss": 0.5186, "step": 362 }, { "epoch": 0.4012527634487841, "grad_norm": 0.5199390053749084, "learning_rate": 9.965719590490134e-06, "loss": 0.5199, "step": 363 }, { "epoch": 0.4023581429624171, "grad_norm": 0.5018960237503052, "learning_rate": 9.964962921230155e-06, "loss": 0.5219, "step": 364 }, { "epoch": 0.4034635224760501, "grad_norm": 0.4920346438884735, "learning_rate": 9.964198021284013e-06, "loss": 0.5332, "step": 365 }, { "epoch": 0.40456890198968315, "grad_norm": 0.6014273166656494, "learning_rate": 9.963424891919728e-06, "loss": 0.5397, "step": 366 }, { "epoch": 0.40567428150331614, "grad_norm": 0.4786156713962555, "learning_rate": 9.962643534418954e-06, "loss": 0.5408, "step": 367 }, { "epoch": 0.4067796610169492, "grad_norm": 0.4526278078556061, "learning_rate": 9.961853950076992e-06, "loss": 0.4973, "step": 368 }, { "epoch": 0.40788504053058217, "grad_norm": 0.5099552869796753, "learning_rate": 9.961056140202777e-06, "loss": 0.5246, "step": 369 }, { "epoch": 0.4089904200442152, "grad_norm": 0.4966123402118683, "learning_rate": 9.960250106118883e-06, "loss": 0.483, "step": 370 }, { "epoch": 0.4100957995578482, "grad_norm": 0.503221869468689, "learning_rate": 9.959435849161515e-06, "loss": 0.5074, "step": 371 }, { "epoch": 0.4112011790714812, "grad_norm": 0.5197461247444153, "learning_rate": 9.958613370680507e-06, "loss": 0.4879, "step": 372 }, { "epoch": 0.4123065585851142, "grad_norm": 0.4877639412879944, "learning_rate": 9.957782672039334e-06, "loss": 0.4852, "step": 373 }, { "epoch": 0.4134119380987472, "grad_norm": 0.48913413286209106, "learning_rate": 9.956943754615082e-06, "loss": 0.5073, "step": 374 }, { "epoch": 0.41451731761238025, "grad_norm": 0.44102615118026733, "learning_rate": 9.956096619798475e-06, "loss": 0.5139, "step": 375 }, { "epoch": 0.41562269712601324, "grad_norm": 0.4540201723575592, "learning_rate": 9.955241268993852e-06, "loss": 0.5226, "step": 376 }, { "epoch": 0.4167280766396463, "grad_norm": 0.626828670501709, "learning_rate": 9.954377703619171e-06, "loss": 0.5429, "step": 377 }, { "epoch": 0.41783345615327927, "grad_norm": 0.5536373853683472, "learning_rate": 9.953505925106016e-06, "loss": 0.5495, "step": 378 }, { "epoch": 0.4189388356669123, "grad_norm": 0.5399571061134338, "learning_rate": 9.952625934899578e-06, "loss": 0.5172, "step": 379 }, { "epoch": 0.4200442151805453, "grad_norm": 0.572642982006073, "learning_rate": 9.951737734458665e-06, "loss": 0.511, "step": 380 }, { "epoch": 0.42114959469417834, "grad_norm": 0.4361434578895569, "learning_rate": 9.950841325255695e-06, "loss": 0.4951, "step": 381 }, { "epoch": 0.42225497420781133, "grad_norm": 0.48561277985572815, "learning_rate": 9.949936708776692e-06, "loss": 0.5268, "step": 382 }, { "epoch": 0.42336035372144437, "grad_norm": 0.5045523643493652, "learning_rate": 9.94902388652129e-06, "loss": 0.5079, "step": 383 }, { "epoch": 0.42446573323507736, "grad_norm": 0.46930769085884094, "learning_rate": 9.94810286000272e-06, "loss": 0.5386, "step": 384 }, { "epoch": 0.4255711127487104, "grad_norm": 0.5199976563453674, "learning_rate": 9.947173630747822e-06, "loss": 0.5292, "step": 385 }, { "epoch": 0.4266764922623434, "grad_norm": 0.46365293860435486, "learning_rate": 9.946236200297025e-06, "loss": 0.5518, "step": 386 }, { "epoch": 0.42778187177597643, "grad_norm": 0.4703636169433594, "learning_rate": 9.945290570204361e-06, "loss": 0.5104, "step": 387 }, { "epoch": 0.4288872512896094, "grad_norm": 0.577120840549469, "learning_rate": 9.944336742037451e-06, "loss": 0.5045, "step": 388 }, { "epoch": 0.42999263080324246, "grad_norm": 0.512116014957428, "learning_rate": 9.943374717377505e-06, "loss": 0.5219, "step": 389 }, { "epoch": 0.43109801031687545, "grad_norm": 0.6117689609527588, "learning_rate": 9.942404497819324e-06, "loss": 0.5018, "step": 390 }, { "epoch": 0.4322033898305085, "grad_norm": 0.5136898756027222, "learning_rate": 9.941426084971296e-06, "loss": 0.5196, "step": 391 }, { "epoch": 0.4333087693441415, "grad_norm": 0.5661974549293518, "learning_rate": 9.940439480455386e-06, "loss": 0.5121, "step": 392 }, { "epoch": 0.4344141488577745, "grad_norm": 0.5574038028717041, "learning_rate": 9.939444685907142e-06, "loss": 0.5335, "step": 393 }, { "epoch": 0.4355195283714075, "grad_norm": 0.5164183378219604, "learning_rate": 9.938441702975689e-06, "loss": 0.5158, "step": 394 }, { "epoch": 0.43662490788504055, "grad_norm": 0.549483597278595, "learning_rate": 9.937430533323725e-06, "loss": 0.4905, "step": 395 }, { "epoch": 0.43773028739867353, "grad_norm": 0.5192741751670837, "learning_rate": 9.936411178627521e-06, "loss": 0.5177, "step": 396 }, { "epoch": 0.4388356669123066, "grad_norm": 0.584197461605072, "learning_rate": 9.935383640576915e-06, "loss": 0.503, "step": 397 }, { "epoch": 0.43994104642593956, "grad_norm": 0.4986412823200226, "learning_rate": 9.934347920875314e-06, "loss": 0.5167, "step": 398 }, { "epoch": 0.4410464259395726, "grad_norm": 0.5902361273765564, "learning_rate": 9.933304021239683e-06, "loss": 0.5134, "step": 399 }, { "epoch": 0.4421518054532056, "grad_norm": 0.579990565776825, "learning_rate": 9.932251943400554e-06, "loss": 0.5078, "step": 400 }, { "epoch": 0.44325718496683864, "grad_norm": 0.49263447523117065, "learning_rate": 9.93119168910201e-06, "loss": 0.5076, "step": 401 }, { "epoch": 0.4443625644804716, "grad_norm": 0.5411688089370728, "learning_rate": 9.930123260101697e-06, "loss": 0.4945, "step": 402 }, { "epoch": 0.44546794399410466, "grad_norm": 0.5461675524711609, "learning_rate": 9.9290466581708e-06, "loss": 0.5422, "step": 403 }, { "epoch": 0.44657332350773765, "grad_norm": 0.46478936076164246, "learning_rate": 9.927961885094065e-06, "loss": 0.5215, "step": 404 }, { "epoch": 0.4476787030213707, "grad_norm": 0.4920472502708435, "learning_rate": 9.926868942669776e-06, "loss": 0.4972, "step": 405 }, { "epoch": 0.4487840825350037, "grad_norm": 0.5353307127952576, "learning_rate": 9.925767832709765e-06, "loss": 0.5214, "step": 406 }, { "epoch": 0.4498894620486367, "grad_norm": 0.5005469918251038, "learning_rate": 9.9246585570394e-06, "loss": 0.5089, "step": 407 }, { "epoch": 0.4509948415622697, "grad_norm": 0.49084949493408203, "learning_rate": 9.923541117497586e-06, "loss": 0.5421, "step": 408 }, { "epoch": 0.45210022107590275, "grad_norm": 0.4659053683280945, "learning_rate": 9.922415515936763e-06, "loss": 0.5026, "step": 409 }, { "epoch": 0.45320560058953574, "grad_norm": 0.49350810050964355, "learning_rate": 9.921281754222903e-06, "loss": 0.502, "step": 410 }, { "epoch": 0.4543109801031688, "grad_norm": 0.4558250904083252, "learning_rate": 9.920139834235503e-06, "loss": 0.5184, "step": 411 }, { "epoch": 0.45541635961680177, "grad_norm": 0.40973055362701416, "learning_rate": 9.918989757867584e-06, "loss": 0.5044, "step": 412 }, { "epoch": 0.45652173913043476, "grad_norm": 0.5124444365501404, "learning_rate": 9.917831527025689e-06, "loss": 0.5031, "step": 413 }, { "epoch": 0.4576271186440678, "grad_norm": 0.4899892807006836, "learning_rate": 9.916665143629881e-06, "loss": 0.4881, "step": 414 }, { "epoch": 0.4587324981577008, "grad_norm": 0.5355746746063232, "learning_rate": 9.915490609613737e-06, "loss": 0.5098, "step": 415 }, { "epoch": 0.45983787767133383, "grad_norm": 0.47358977794647217, "learning_rate": 9.914307926924344e-06, "loss": 0.5261, "step": 416 }, { "epoch": 0.4609432571849668, "grad_norm": 0.6277471780776978, "learning_rate": 9.9131170975223e-06, "loss": 0.4846, "step": 417 }, { "epoch": 0.46204863669859986, "grad_norm": 0.48336535692214966, "learning_rate": 9.911918123381706e-06, "loss": 0.5082, "step": 418 }, { "epoch": 0.46315401621223284, "grad_norm": 0.4898737072944641, "learning_rate": 9.910711006490166e-06, "loss": 0.4895, "step": 419 }, { "epoch": 0.4642593957258659, "grad_norm": 0.6082646250724792, "learning_rate": 9.909495748848783e-06, "loss": 0.5131, "step": 420 }, { "epoch": 0.4653647752394989, "grad_norm": 0.5278540253639221, "learning_rate": 9.908272352472154e-06, "loss": 0.5212, "step": 421 }, { "epoch": 0.4664701547531319, "grad_norm": 0.5272799134254456, "learning_rate": 9.907040819388372e-06, "loss": 0.4908, "step": 422 }, { "epoch": 0.4675755342667649, "grad_norm": 0.5537007451057434, "learning_rate": 9.90580115163901e-06, "loss": 0.5101, "step": 423 }, { "epoch": 0.46868091378039795, "grad_norm": 0.603565514087677, "learning_rate": 9.904553351279139e-06, "loss": 0.5197, "step": 424 }, { "epoch": 0.46978629329403093, "grad_norm": 0.3959555923938751, "learning_rate": 9.903297420377297e-06, "loss": 0.5046, "step": 425 }, { "epoch": 0.470891672807664, "grad_norm": 0.544605553150177, "learning_rate": 9.902033361015515e-06, "loss": 0.5088, "step": 426 }, { "epoch": 0.47199705232129696, "grad_norm": 0.5008931756019592, "learning_rate": 9.90076117528929e-06, "loss": 0.5119, "step": 427 }, { "epoch": 0.47310243183493, "grad_norm": 0.5141536593437195, "learning_rate": 9.899480865307591e-06, "loss": 0.5283, "step": 428 }, { "epoch": 0.474207811348563, "grad_norm": 0.47151434421539307, "learning_rate": 9.898192433192859e-06, "loss": 0.491, "step": 429 }, { "epoch": 0.47531319086219603, "grad_norm": 0.47807303071022034, "learning_rate": 9.896895881080995e-06, "loss": 0.5272, "step": 430 }, { "epoch": 0.476418570375829, "grad_norm": 0.5555190443992615, "learning_rate": 9.895591211121366e-06, "loss": 0.5093, "step": 431 }, { "epoch": 0.47752394988946206, "grad_norm": 0.5975165367126465, "learning_rate": 9.89427842547679e-06, "loss": 0.5258, "step": 432 }, { "epoch": 0.47862932940309505, "grad_norm": 0.5332474708557129, "learning_rate": 9.892957526323545e-06, "loss": 0.4803, "step": 433 }, { "epoch": 0.4797347089167281, "grad_norm": 0.5132103562355042, "learning_rate": 9.891628515851358e-06, "loss": 0.4894, "step": 434 }, { "epoch": 0.4808400884303611, "grad_norm": 0.4935170114040375, "learning_rate": 9.890291396263396e-06, "loss": 0.4877, "step": 435 }, { "epoch": 0.4819454679439941, "grad_norm": 0.49289026856422424, "learning_rate": 9.88894616977628e-06, "loss": 0.4775, "step": 436 }, { "epoch": 0.4830508474576271, "grad_norm": 0.5260725617408752, "learning_rate": 9.88759283862006e-06, "loss": 0.5157, "step": 437 }, { "epoch": 0.48415622697126015, "grad_norm": 0.47882357239723206, "learning_rate": 9.88623140503823e-06, "loss": 0.5002, "step": 438 }, { "epoch": 0.48526160648489314, "grad_norm": 0.48658815026283264, "learning_rate": 9.884861871287706e-06, "loss": 0.4815, "step": 439 }, { "epoch": 0.4863669859985262, "grad_norm": 0.5217152237892151, "learning_rate": 9.883484239638842e-06, "loss": 0.5229, "step": 440 }, { "epoch": 0.48747236551215917, "grad_norm": 0.46965423226356506, "learning_rate": 9.882098512375411e-06, "loss": 0.523, "step": 441 }, { "epoch": 0.4885777450257922, "grad_norm": 0.542387843132019, "learning_rate": 9.880704691794608e-06, "loss": 0.5098, "step": 442 }, { "epoch": 0.4896831245394252, "grad_norm": 0.509696364402771, "learning_rate": 9.879302780207044e-06, "loss": 0.5064, "step": 443 }, { "epoch": 0.49078850405305824, "grad_norm": 0.5036212801933289, "learning_rate": 9.877892779936744e-06, "loss": 0.4905, "step": 444 }, { "epoch": 0.4918938835666912, "grad_norm": 0.5915627479553223, "learning_rate": 9.876474693321138e-06, "loss": 0.4937, "step": 445 }, { "epoch": 0.49299926308032427, "grad_norm": 0.5749354958534241, "learning_rate": 9.87504852271107e-06, "loss": 0.5126, "step": 446 }, { "epoch": 0.49410464259395726, "grad_norm": 0.5480256080627441, "learning_rate": 9.873614270470778e-06, "loss": 0.484, "step": 447 }, { "epoch": 0.4952100221075903, "grad_norm": 0.4503266215324402, "learning_rate": 9.872171938977895e-06, "loss": 0.4617, "step": 448 }, { "epoch": 0.4963154016212233, "grad_norm": 0.6611694097518921, "learning_rate": 9.870721530623455e-06, "loss": 0.4882, "step": 449 }, { "epoch": 0.4974207811348563, "grad_norm": 0.5618709325790405, "learning_rate": 9.869263047811877e-06, "loss": 0.499, "step": 450 }, { "epoch": 0.4985261606484893, "grad_norm": 0.4667017459869385, "learning_rate": 9.867796492960968e-06, "loss": 0.4901, "step": 451 }, { "epoch": 0.49963154016212236, "grad_norm": 0.6630564332008362, "learning_rate": 9.866321868501914e-06, "loss": 0.4846, "step": 452 }, { "epoch": 0.5007369196757553, "grad_norm": 0.5551646947860718, "learning_rate": 9.864839176879278e-06, "loss": 0.5052, "step": 453 }, { "epoch": 0.5018422991893884, "grad_norm": 0.40802398324012756, "learning_rate": 9.863348420550998e-06, "loss": 0.4852, "step": 454 }, { "epoch": 0.5029476787030214, "grad_norm": 0.6616340279579163, "learning_rate": 9.861849601988384e-06, "loss": 0.5205, "step": 455 }, { "epoch": 0.5040530582166544, "grad_norm": 0.5447099804878235, "learning_rate": 9.860342723676105e-06, "loss": 0.5215, "step": 456 }, { "epoch": 0.5051584377302873, "grad_norm": 0.5643423795700073, "learning_rate": 9.858827788112195e-06, "loss": 0.5264, "step": 457 }, { "epoch": 0.5062638172439204, "grad_norm": 0.6286356449127197, "learning_rate": 9.857304797808043e-06, "loss": 0.5367, "step": 458 }, { "epoch": 0.5073691967575534, "grad_norm": 0.5641555786132812, "learning_rate": 9.855773755288396e-06, "loss": 0.5275, "step": 459 }, { "epoch": 0.5084745762711864, "grad_norm": 0.5900198817253113, "learning_rate": 9.85423466309134e-06, "loss": 0.5012, "step": 460 }, { "epoch": 0.5095799557848194, "grad_norm": 0.5676926374435425, "learning_rate": 9.852687523768316e-06, "loss": 0.4886, "step": 461 }, { "epoch": 0.5106853352984525, "grad_norm": 0.46308156847953796, "learning_rate": 9.851132339884097e-06, "loss": 0.5128, "step": 462 }, { "epoch": 0.5117907148120855, "grad_norm": 0.4954703748226166, "learning_rate": 9.849569114016794e-06, "loss": 0.515, "step": 463 }, { "epoch": 0.5128960943257185, "grad_norm": 0.5706295371055603, "learning_rate": 9.847997848757855e-06, "loss": 0.5071, "step": 464 }, { "epoch": 0.5140014738393515, "grad_norm": 0.4420671761035919, "learning_rate": 9.84641854671205e-06, "loss": 0.519, "step": 465 }, { "epoch": 0.5151068533529846, "grad_norm": 0.44633108377456665, "learning_rate": 9.844831210497468e-06, "loss": 0.4807, "step": 466 }, { "epoch": 0.5162122328666175, "grad_norm": 0.5985791087150574, "learning_rate": 9.843235842745527e-06, "loss": 0.4951, "step": 467 }, { "epoch": 0.5173176123802505, "grad_norm": 0.4298173785209656, "learning_rate": 9.84163244610095e-06, "loss": 0.5168, "step": 468 }, { "epoch": 0.5184229918938835, "grad_norm": 0.45164477825164795, "learning_rate": 9.840021023221777e-06, "loss": 0.4856, "step": 469 }, { "epoch": 0.5195283714075166, "grad_norm": 0.5320607423782349, "learning_rate": 9.83840157677935e-06, "loss": 0.5072, "step": 470 }, { "epoch": 0.5206337509211496, "grad_norm": 0.4582657217979431, "learning_rate": 9.836774109458311e-06, "loss": 0.4748, "step": 471 }, { "epoch": 0.5217391304347826, "grad_norm": 0.6252796053886414, "learning_rate": 9.835138623956603e-06, "loss": 0.5278, "step": 472 }, { "epoch": 0.5228445099484156, "grad_norm": 0.45874759554862976, "learning_rate": 9.833495122985455e-06, "loss": 0.5036, "step": 473 }, { "epoch": 0.5239498894620487, "grad_norm": 0.49125081300735474, "learning_rate": 9.831843609269387e-06, "loss": 0.4743, "step": 474 }, { "epoch": 0.5250552689756817, "grad_norm": 0.6214786767959595, "learning_rate": 9.830184085546203e-06, "loss": 0.5405, "step": 475 }, { "epoch": 0.5261606484893147, "grad_norm": 0.5058581829071045, "learning_rate": 9.828516554566988e-06, "loss": 0.4844, "step": 476 }, { "epoch": 0.5272660280029476, "grad_norm": 0.5052692890167236, "learning_rate": 9.826841019096095e-06, "loss": 0.4946, "step": 477 }, { "epoch": 0.5283714075165807, "grad_norm": 0.6124356985092163, "learning_rate": 9.825157481911146e-06, "loss": 0.5099, "step": 478 }, { "epoch": 0.5294767870302137, "grad_norm": 0.47449296712875366, "learning_rate": 9.823465945803036e-06, "loss": 0.5028, "step": 479 }, { "epoch": 0.5305821665438467, "grad_norm": 0.5293912887573242, "learning_rate": 9.821766413575915e-06, "loss": 0.5061, "step": 480 }, { "epoch": 0.5316875460574797, "grad_norm": 0.5744134783744812, "learning_rate": 9.820058888047187e-06, "loss": 0.4888, "step": 481 }, { "epoch": 0.5327929255711128, "grad_norm": 0.510883092880249, "learning_rate": 9.818343372047509e-06, "loss": 0.4994, "step": 482 }, { "epoch": 0.5338983050847458, "grad_norm": 0.5081970691680908, "learning_rate": 9.816619868420785e-06, "loss": 0.4815, "step": 483 }, { "epoch": 0.5350036845983788, "grad_norm": 0.5020565986633301, "learning_rate": 9.814888380024161e-06, "loss": 0.5047, "step": 484 }, { "epoch": 0.5361090641120118, "grad_norm": 0.5322097539901733, "learning_rate": 9.813148909728016e-06, "loss": 0.4945, "step": 485 }, { "epoch": 0.5372144436256449, "grad_norm": 0.443344384431839, "learning_rate": 9.811401460415966e-06, "loss": 0.4898, "step": 486 }, { "epoch": 0.5383198231392778, "grad_norm": 0.5644372701644897, "learning_rate": 9.80964603498485e-06, "loss": 0.524, "step": 487 }, { "epoch": 0.5394252026529108, "grad_norm": 0.5414569973945618, "learning_rate": 9.80788263634473e-06, "loss": 0.5094, "step": 488 }, { "epoch": 0.5405305821665438, "grad_norm": 0.43462854623794556, "learning_rate": 9.80611126741889e-06, "loss": 0.5122, "step": 489 }, { "epoch": 0.5416359616801769, "grad_norm": 0.4668678045272827, "learning_rate": 9.804331931143822e-06, "loss": 0.4588, "step": 490 }, { "epoch": 0.5427413411938099, "grad_norm": 0.5473464131355286, "learning_rate": 9.802544630469227e-06, "loss": 0.5027, "step": 491 }, { "epoch": 0.5438467207074429, "grad_norm": 0.5007190704345703, "learning_rate": 9.80074936835801e-06, "loss": 0.483, "step": 492 }, { "epoch": 0.5449521002210759, "grad_norm": 0.5312074422836304, "learning_rate": 9.798946147786273e-06, "loss": 0.4966, "step": 493 }, { "epoch": 0.5460574797347089, "grad_norm": 0.4783822000026703, "learning_rate": 9.797134971743312e-06, "loss": 0.4762, "step": 494 }, { "epoch": 0.547162859248342, "grad_norm": 0.5033414959907532, "learning_rate": 9.795315843231613e-06, "loss": 0.4896, "step": 495 }, { "epoch": 0.548268238761975, "grad_norm": 0.455998033285141, "learning_rate": 9.793488765266838e-06, "loss": 0.5028, "step": 496 }, { "epoch": 0.5493736182756079, "grad_norm": 0.47610336542129517, "learning_rate": 9.79165374087784e-06, "loss": 0.4832, "step": 497 }, { "epoch": 0.5504789977892409, "grad_norm": 0.4619760811328888, "learning_rate": 9.789810773106632e-06, "loss": 0.5042, "step": 498 }, { "epoch": 0.551584377302874, "grad_norm": 0.44413551688194275, "learning_rate": 9.7879598650084e-06, "loss": 0.4711, "step": 499 }, { "epoch": 0.552689756816507, "grad_norm": 0.4892210364341736, "learning_rate": 9.786101019651499e-06, "loss": 0.4871, "step": 500 }, { "epoch": 0.55379513633014, "grad_norm": 0.5182769298553467, "learning_rate": 9.784234240117433e-06, "loss": 0.4931, "step": 501 }, { "epoch": 0.554900515843773, "grad_norm": 0.5005459785461426, "learning_rate": 9.782359529500867e-06, "loss": 0.526, "step": 502 }, { "epoch": 0.5560058953574061, "grad_norm": 0.575646162033081, "learning_rate": 9.780476890909605e-06, "loss": 0.5272, "step": 503 }, { "epoch": 0.5571112748710391, "grad_norm": 0.46679654717445374, "learning_rate": 9.7785863274646e-06, "loss": 0.4906, "step": 504 }, { "epoch": 0.558216654384672, "grad_norm": 0.5743274092674255, "learning_rate": 9.776687842299939e-06, "loss": 0.5134, "step": 505 }, { "epoch": 0.559322033898305, "grad_norm": 0.4349190294742584, "learning_rate": 9.774781438562846e-06, "loss": 0.5015, "step": 506 }, { "epoch": 0.5604274134119381, "grad_norm": 0.5719268918037415, "learning_rate": 9.772867119413667e-06, "loss": 0.5072, "step": 507 }, { "epoch": 0.5615327929255711, "grad_norm": 0.49311426281929016, "learning_rate": 9.770944888025874e-06, "loss": 0.5312, "step": 508 }, { "epoch": 0.5626381724392041, "grad_norm": 0.4549877345561981, "learning_rate": 9.769014747586052e-06, "loss": 0.4893, "step": 509 }, { "epoch": 0.5637435519528371, "grad_norm": 0.4449659287929535, "learning_rate": 9.767076701293898e-06, "loss": 0.4947, "step": 510 }, { "epoch": 0.5648489314664702, "grad_norm": 0.4313373863697052, "learning_rate": 9.765130752362217e-06, "loss": 0.5391, "step": 511 }, { "epoch": 0.5659543109801032, "grad_norm": 0.4769163131713867, "learning_rate": 9.763176904016914e-06, "loss": 0.5283, "step": 512 }, { "epoch": 0.5670596904937362, "grad_norm": 0.4155581593513489, "learning_rate": 9.761215159496985e-06, "loss": 0.5143, "step": 513 }, { "epoch": 0.5681650700073692, "grad_norm": 0.4437922537326813, "learning_rate": 9.759245522054523e-06, "loss": 0.4964, "step": 514 }, { "epoch": 0.5692704495210023, "grad_norm": 0.4707755446434021, "learning_rate": 9.7572679949547e-06, "loss": 0.5056, "step": 515 }, { "epoch": 0.5703758290346352, "grad_norm": 0.43742597103118896, "learning_rate": 9.755282581475769e-06, "loss": 0.5011, "step": 516 }, { "epoch": 0.5714812085482682, "grad_norm": 0.44067561626434326, "learning_rate": 9.753289284909058e-06, "loss": 0.4683, "step": 517 }, { "epoch": 0.5725865880619012, "grad_norm": 0.483964741230011, "learning_rate": 9.751288108558961e-06, "loss": 0.4889, "step": 518 }, { "epoch": 0.5736919675755343, "grad_norm": 0.4943755567073822, "learning_rate": 9.749279055742936e-06, "loss": 0.4937, "step": 519 }, { "epoch": 0.5747973470891673, "grad_norm": 0.42413976788520813, "learning_rate": 9.747262129791497e-06, "loss": 0.4963, "step": 520 }, { "epoch": 0.5759027266028003, "grad_norm": 0.5275388956069946, "learning_rate": 9.745237334048213e-06, "loss": 0.4782, "step": 521 }, { "epoch": 0.5770081061164333, "grad_norm": 0.4472026228904724, "learning_rate": 9.743204671869694e-06, "loss": 0.4911, "step": 522 }, { "epoch": 0.5781134856300664, "grad_norm": 0.4579602777957916, "learning_rate": 9.741164146625597e-06, "loss": 0.5431, "step": 523 }, { "epoch": 0.5792188651436994, "grad_norm": 0.4586956799030304, "learning_rate": 9.739115761698607e-06, "loss": 0.509, "step": 524 }, { "epoch": 0.5803242446573323, "grad_norm": 0.5318251848220825, "learning_rate": 9.737059520484444e-06, "loss": 0.5191, "step": 525 }, { "epoch": 0.5814296241709653, "grad_norm": 0.5058639049530029, "learning_rate": 9.73499542639185e-06, "loss": 0.4841, "step": 526 }, { "epoch": 0.5825350036845984, "grad_norm": 0.4750097692012787, "learning_rate": 9.73292348284258e-06, "loss": 0.4984, "step": 527 }, { "epoch": 0.5836403831982314, "grad_norm": 0.4803778827190399, "learning_rate": 9.730843693271413e-06, "loss": 0.494, "step": 528 }, { "epoch": 0.5847457627118644, "grad_norm": 0.4708038568496704, "learning_rate": 9.728756061126126e-06, "loss": 0.4928, "step": 529 }, { "epoch": 0.5858511422254974, "grad_norm": 0.5602582097053528, "learning_rate": 9.726660589867495e-06, "loss": 0.487, "step": 530 }, { "epoch": 0.5869565217391305, "grad_norm": 0.4646506607532501, "learning_rate": 9.724557282969302e-06, "loss": 0.4792, "step": 531 }, { "epoch": 0.5880619012527635, "grad_norm": 0.44431015849113464, "learning_rate": 9.722446143918307e-06, "loss": 0.4784, "step": 532 }, { "epoch": 0.5891672807663965, "grad_norm": 0.4894096255302429, "learning_rate": 9.720327176214262e-06, "loss": 0.5205, "step": 533 }, { "epoch": 0.5902726602800294, "grad_norm": 0.5127395987510681, "learning_rate": 9.718200383369891e-06, "loss": 0.5227, "step": 534 }, { "epoch": 0.5913780397936624, "grad_norm": 0.42986398935317993, "learning_rate": 9.716065768910895e-06, "loss": 0.4872, "step": 535 }, { "epoch": 0.5924834193072955, "grad_norm": 0.4994347095489502, "learning_rate": 9.713923336375936e-06, "loss": 0.5009, "step": 536 }, { "epoch": 0.5935887988209285, "grad_norm": 0.41778478026390076, "learning_rate": 9.711773089316645e-06, "loss": 0.4878, "step": 537 }, { "epoch": 0.5946941783345615, "grad_norm": 0.41869840025901794, "learning_rate": 9.709615031297598e-06, "loss": 0.4916, "step": 538 }, { "epoch": 0.5957995578481945, "grad_norm": 0.4940272569656372, "learning_rate": 9.707449165896328e-06, "loss": 0.4962, "step": 539 }, { "epoch": 0.5969049373618276, "grad_norm": 0.4439680278301239, "learning_rate": 9.705275496703302e-06, "loss": 0.5203, "step": 540 }, { "epoch": 0.5980103168754606, "grad_norm": 0.4309464395046234, "learning_rate": 9.70309402732193e-06, "loss": 0.4789, "step": 541 }, { "epoch": 0.5991156963890936, "grad_norm": 0.5458532571792603, "learning_rate": 9.70090476136855e-06, "loss": 0.4903, "step": 542 }, { "epoch": 0.6002210759027266, "grad_norm": 0.49766257405281067, "learning_rate": 9.69870770247243e-06, "loss": 0.4882, "step": 543 }, { "epoch": 0.6013264554163597, "grad_norm": 0.5295751690864563, "learning_rate": 9.69650285427575e-06, "loss": 0.4989, "step": 544 }, { "epoch": 0.6024318349299926, "grad_norm": 0.4698268175125122, "learning_rate": 9.694290220433603e-06, "loss": 0.4849, "step": 545 }, { "epoch": 0.6035372144436256, "grad_norm": 0.4883490204811096, "learning_rate": 9.692069804613995e-06, "loss": 0.4934, "step": 546 }, { "epoch": 0.6046425939572586, "grad_norm": 0.5819318890571594, "learning_rate": 9.689841610497828e-06, "loss": 0.5192, "step": 547 }, { "epoch": 0.6057479734708917, "grad_norm": 0.41133859753608704, "learning_rate": 9.687605641778899e-06, "loss": 0.508, "step": 548 }, { "epoch": 0.6068533529845247, "grad_norm": 0.49741798639297485, "learning_rate": 9.685361902163893e-06, "loss": 0.5358, "step": 549 }, { "epoch": 0.6079587324981577, "grad_norm": 0.5592443943023682, "learning_rate": 9.683110395372379e-06, "loss": 0.5084, "step": 550 }, { "epoch": 0.6090641120117907, "grad_norm": 0.4790709912776947, "learning_rate": 9.680851125136799e-06, "loss": 0.4695, "step": 551 }, { "epoch": 0.6101694915254238, "grad_norm": 0.42638182640075684, "learning_rate": 9.678584095202468e-06, "loss": 0.4754, "step": 552 }, { "epoch": 0.6112748710390568, "grad_norm": 0.48262834548950195, "learning_rate": 9.676309309327564e-06, "loss": 0.4942, "step": 553 }, { "epoch": 0.6123802505526897, "grad_norm": 0.49953779578208923, "learning_rate": 9.674026771283123e-06, "loss": 0.5018, "step": 554 }, { "epoch": 0.6134856300663227, "grad_norm": 0.5020205974578857, "learning_rate": 9.671736484853029e-06, "loss": 0.4905, "step": 555 }, { "epoch": 0.6145910095799558, "grad_norm": 0.463408499956131, "learning_rate": 9.669438453834014e-06, "loss": 0.5164, "step": 556 }, { "epoch": 0.6156963890935888, "grad_norm": 0.47316861152648926, "learning_rate": 9.667132682035646e-06, "loss": 0.4856, "step": 557 }, { "epoch": 0.6168017686072218, "grad_norm": 0.4807091951370239, "learning_rate": 9.664819173280328e-06, "loss": 0.5055, "step": 558 }, { "epoch": 0.6179071481208548, "grad_norm": 0.42022043466567993, "learning_rate": 9.66249793140329e-06, "loss": 0.5159, "step": 559 }, { "epoch": 0.6190125276344879, "grad_norm": 0.512110710144043, "learning_rate": 9.660168960252575e-06, "loss": 0.4824, "step": 560 }, { "epoch": 0.6201179071481209, "grad_norm": 0.4525811970233917, "learning_rate": 9.657832263689051e-06, "loss": 0.5245, "step": 561 }, { "epoch": 0.6212232866617539, "grad_norm": 0.45435631275177, "learning_rate": 9.655487845586378e-06, "loss": 0.4712, "step": 562 }, { "epoch": 0.6223286661753868, "grad_norm": 0.44853875041007996, "learning_rate": 9.653135709831028e-06, "loss": 0.4897, "step": 563 }, { "epoch": 0.6234340456890199, "grad_norm": 0.4534260928630829, "learning_rate": 9.650775860322263e-06, "loss": 0.4971, "step": 564 }, { "epoch": 0.6245394252026529, "grad_norm": 0.4517850875854492, "learning_rate": 9.64840830097213e-06, "loss": 0.4857, "step": 565 }, { "epoch": 0.6256448047162859, "grad_norm": 0.4814460277557373, "learning_rate": 9.646033035705462e-06, "loss": 0.4832, "step": 566 }, { "epoch": 0.6267501842299189, "grad_norm": 0.4203760325908661, "learning_rate": 9.643650068459863e-06, "loss": 0.4868, "step": 567 }, { "epoch": 0.627855563743552, "grad_norm": 0.44670718908309937, "learning_rate": 9.641259403185706e-06, "loss": 0.4915, "step": 568 }, { "epoch": 0.628960943257185, "grad_norm": 0.41987767815589905, "learning_rate": 9.638861043846125e-06, "loss": 0.4997, "step": 569 }, { "epoch": 0.630066322770818, "grad_norm": 0.412405401468277, "learning_rate": 9.636454994417013e-06, "loss": 0.5014, "step": 570 }, { "epoch": 0.631171702284451, "grad_norm": 0.42507413029670715, "learning_rate": 9.634041258887004e-06, "loss": 0.4977, "step": 571 }, { "epoch": 0.6322770817980841, "grad_norm": 0.540103018283844, "learning_rate": 9.631619841257477e-06, "loss": 0.4879, "step": 572 }, { "epoch": 0.633382461311717, "grad_norm": 0.468466192483902, "learning_rate": 9.629190745542546e-06, "loss": 0.4949, "step": 573 }, { "epoch": 0.63448784082535, "grad_norm": 0.4755018949508667, "learning_rate": 9.626753975769054e-06, "loss": 0.5063, "step": 574 }, { "epoch": 0.635593220338983, "grad_norm": 0.425984263420105, "learning_rate": 9.624309535976569e-06, "loss": 0.5391, "step": 575 }, { "epoch": 0.636698599852616, "grad_norm": 0.4315877854824066, "learning_rate": 9.621857430217366e-06, "loss": 0.4812, "step": 576 }, { "epoch": 0.6378039793662491, "grad_norm": 0.48628854751586914, "learning_rate": 9.619397662556434e-06, "loss": 0.4892, "step": 577 }, { "epoch": 0.6389093588798821, "grad_norm": 0.41636818647384644, "learning_rate": 9.616930237071464e-06, "loss": 0.5064, "step": 578 }, { "epoch": 0.6400147383935151, "grad_norm": 0.4280545115470886, "learning_rate": 9.614455157852836e-06, "loss": 0.4955, "step": 579 }, { "epoch": 0.6411201179071481, "grad_norm": 0.4578336179256439, "learning_rate": 9.611972429003626e-06, "loss": 0.5329, "step": 580 }, { "epoch": 0.6422254974207812, "grad_norm": 0.4270852506160736, "learning_rate": 9.609482054639586e-06, "loss": 0.5355, "step": 581 }, { "epoch": 0.6433308769344142, "grad_norm": 0.5284178256988525, "learning_rate": 9.606984038889142e-06, "loss": 0.5055, "step": 582 }, { "epoch": 0.6444362564480471, "grad_norm": 0.43940046429634094, "learning_rate": 9.60447838589339e-06, "loss": 0.5216, "step": 583 }, { "epoch": 0.6455416359616801, "grad_norm": 0.63617342710495, "learning_rate": 9.601965099806085e-06, "loss": 0.5248, "step": 584 }, { "epoch": 0.6466470154753132, "grad_norm": 0.517815887928009, "learning_rate": 9.59944418479364e-06, "loss": 0.4857, "step": 585 }, { "epoch": 0.6477523949889462, "grad_norm": 0.5134850740432739, "learning_rate": 9.596915645035107e-06, "loss": 0.4945, "step": 586 }, { "epoch": 0.6488577745025792, "grad_norm": 0.5380955934524536, "learning_rate": 9.594379484722185e-06, "loss": 0.5207, "step": 587 }, { "epoch": 0.6499631540162122, "grad_norm": 0.486435204744339, "learning_rate": 9.591835708059202e-06, "loss": 0.4876, "step": 588 }, { "epoch": 0.6510685335298453, "grad_norm": 0.5660929679870605, "learning_rate": 9.589284319263116e-06, "loss": 0.5241, "step": 589 }, { "epoch": 0.6521739130434783, "grad_norm": 0.4971397817134857, "learning_rate": 9.5867253225635e-06, "loss": 0.4929, "step": 590 }, { "epoch": 0.6532792925571113, "grad_norm": 0.4996197819709778, "learning_rate": 9.584158722202538e-06, "loss": 0.5124, "step": 591 }, { "epoch": 0.6543846720707442, "grad_norm": 0.6175948977470398, "learning_rate": 9.581584522435025e-06, "loss": 0.5102, "step": 592 }, { "epoch": 0.6554900515843773, "grad_norm": 0.49270105361938477, "learning_rate": 9.579002727528348e-06, "loss": 0.4792, "step": 593 }, { "epoch": 0.6565954310980103, "grad_norm": 0.5233104228973389, "learning_rate": 9.57641334176249e-06, "loss": 0.5186, "step": 594 }, { "epoch": 0.6577008106116433, "grad_norm": 0.5118619203567505, "learning_rate": 9.573816369430013e-06, "loss": 0.5023, "step": 595 }, { "epoch": 0.6588061901252763, "grad_norm": 0.5350534319877625, "learning_rate": 9.571211814836059e-06, "loss": 0.5296, "step": 596 }, { "epoch": 0.6599115696389094, "grad_norm": 0.48804426193237305, "learning_rate": 9.568599682298337e-06, "loss": 0.5233, "step": 597 }, { "epoch": 0.6610169491525424, "grad_norm": 0.521026611328125, "learning_rate": 9.56597997614712e-06, "loss": 0.5041, "step": 598 }, { "epoch": 0.6621223286661754, "grad_norm": 0.44309839606285095, "learning_rate": 9.563352700725235e-06, "loss": 0.5022, "step": 599 }, { "epoch": 0.6632277081798084, "grad_norm": 0.49812477827072144, "learning_rate": 9.560717860388061e-06, "loss": 0.5106, "step": 600 }, { "epoch": 0.6643330876934415, "grad_norm": 0.513699471950531, "learning_rate": 9.558075459503511e-06, "loss": 0.5056, "step": 601 }, { "epoch": 0.6654384672070744, "grad_norm": 0.5042475461959839, "learning_rate": 9.555425502452038e-06, "loss": 0.4994, "step": 602 }, { "epoch": 0.6665438467207074, "grad_norm": 0.5049284100532532, "learning_rate": 9.552767993626618e-06, "loss": 0.5285, "step": 603 }, { "epoch": 0.6676492262343404, "grad_norm": 0.4206513464450836, "learning_rate": 9.550102937432743e-06, "loss": 0.493, "step": 604 }, { "epoch": 0.6687546057479735, "grad_norm": 0.4712638258934021, "learning_rate": 9.547430338288423e-06, "loss": 0.5087, "step": 605 }, { "epoch": 0.6698599852616065, "grad_norm": 0.5565213561058044, "learning_rate": 9.544750200624169e-06, "loss": 0.5186, "step": 606 }, { "epoch": 0.6709653647752395, "grad_norm": 0.47286733984947205, "learning_rate": 9.542062528882989e-06, "loss": 0.492, "step": 607 }, { "epoch": 0.6720707442888725, "grad_norm": 0.5103026628494263, "learning_rate": 9.539367327520382e-06, "loss": 0.482, "step": 608 }, { "epoch": 0.6731761238025056, "grad_norm": 0.46373724937438965, "learning_rate": 9.536664601004326e-06, "loss": 0.465, "step": 609 }, { "epoch": 0.6742815033161386, "grad_norm": 0.5124532580375671, "learning_rate": 9.533954353815279e-06, "loss": 0.4847, "step": 610 }, { "epoch": 0.6753868828297716, "grad_norm": 0.497402548789978, "learning_rate": 9.531236590446162e-06, "loss": 0.5108, "step": 611 }, { "epoch": 0.6764922623434045, "grad_norm": 0.45893919467926025, "learning_rate": 9.528511315402358e-06, "loss": 0.5024, "step": 612 }, { "epoch": 0.6775976418570376, "grad_norm": 0.49103185534477234, "learning_rate": 9.525778533201702e-06, "loss": 0.5215, "step": 613 }, { "epoch": 0.6787030213706706, "grad_norm": 0.4280547797679901, "learning_rate": 9.523038248374474e-06, "loss": 0.4883, "step": 614 }, { "epoch": 0.6798084008843036, "grad_norm": 0.511462390422821, "learning_rate": 9.52029046546339e-06, "loss": 0.5123, "step": 615 }, { "epoch": 0.6809137803979366, "grad_norm": 0.42267224192619324, "learning_rate": 9.517535189023602e-06, "loss": 0.5, "step": 616 }, { "epoch": 0.6820191599115696, "grad_norm": 0.49845463037490845, "learning_rate": 9.514772423622675e-06, "loss": 0.5002, "step": 617 }, { "epoch": 0.6831245394252027, "grad_norm": 0.4965236186981201, "learning_rate": 9.512002173840597e-06, "loss": 0.4889, "step": 618 }, { "epoch": 0.6842299189388357, "grad_norm": 0.5159595012664795, "learning_rate": 9.50922444426976e-06, "loss": 0.4688, "step": 619 }, { "epoch": 0.6853352984524687, "grad_norm": 0.556387186050415, "learning_rate": 9.506439239514954e-06, "loss": 0.4791, "step": 620 }, { "epoch": 0.6864406779661016, "grad_norm": 0.47413209080696106, "learning_rate": 9.503646564193363e-06, "loss": 0.4826, "step": 621 }, { "epoch": 0.6875460574797347, "grad_norm": 0.4781365692615509, "learning_rate": 9.500846422934557e-06, "loss": 0.4913, "step": 622 }, { "epoch": 0.6886514369933677, "grad_norm": 0.45107683539390564, "learning_rate": 9.498038820380478e-06, "loss": 0.4707, "step": 623 }, { "epoch": 0.6897568165070007, "grad_norm": 0.44477614760398865, "learning_rate": 9.495223761185443e-06, "loss": 0.4846, "step": 624 }, { "epoch": 0.6908621960206337, "grad_norm": 0.48909062147140503, "learning_rate": 9.492401250016125e-06, "loss": 0.502, "step": 625 }, { "epoch": 0.6919675755342668, "grad_norm": 0.4580002427101135, "learning_rate": 9.489571291551553e-06, "loss": 0.4973, "step": 626 }, { "epoch": 0.6930729550478998, "grad_norm": 0.472623348236084, "learning_rate": 9.4867338904831e-06, "loss": 0.4996, "step": 627 }, { "epoch": 0.6941783345615328, "grad_norm": 0.441806823015213, "learning_rate": 9.483889051514483e-06, "loss": 0.4925, "step": 628 }, { "epoch": 0.6952837140751658, "grad_norm": 0.5248472690582275, "learning_rate": 9.481036779361738e-06, "loss": 0.5303, "step": 629 }, { "epoch": 0.6963890935887989, "grad_norm": 0.5406106114387512, "learning_rate": 9.478177078753236e-06, "loss": 0.4914, "step": 630 }, { "epoch": 0.6974944731024318, "grad_norm": 0.5258222818374634, "learning_rate": 9.475309954429654e-06, "loss": 0.5103, "step": 631 }, { "epoch": 0.6985998526160648, "grad_norm": 0.5886120200157166, "learning_rate": 9.472435411143979e-06, "loss": 0.499, "step": 632 }, { "epoch": 0.6997052321296978, "grad_norm": 0.4666534960269928, "learning_rate": 9.469553453661493e-06, "loss": 0.4762, "step": 633 }, { "epoch": 0.7008106116433309, "grad_norm": 0.506817638874054, "learning_rate": 9.466664086759777e-06, "loss": 0.4766, "step": 634 }, { "epoch": 0.7019159911569639, "grad_norm": 0.5043922066688538, "learning_rate": 9.463767315228687e-06, "loss": 0.5183, "step": 635 }, { "epoch": 0.7030213706705969, "grad_norm": 0.5371654629707336, "learning_rate": 9.460863143870355e-06, "loss": 0.4764, "step": 636 }, { "epoch": 0.7041267501842299, "grad_norm": 0.46658825874328613, "learning_rate": 9.457951577499187e-06, "loss": 0.4992, "step": 637 }, { "epoch": 0.705232129697863, "grad_norm": 0.493245929479599, "learning_rate": 9.45503262094184e-06, "loss": 0.4931, "step": 638 }, { "epoch": 0.706337509211496, "grad_norm": 0.48513203859329224, "learning_rate": 9.452106279037226e-06, "loss": 0.5125, "step": 639 }, { "epoch": 0.707442888725129, "grad_norm": 0.45463261008262634, "learning_rate": 9.4491725566365e-06, "loss": 0.4716, "step": 640 }, { "epoch": 0.7085482682387619, "grad_norm": 0.39873823523521423, "learning_rate": 9.446231458603051e-06, "loss": 0.48, "step": 641 }, { "epoch": 0.709653647752395, "grad_norm": 0.45242276787757874, "learning_rate": 9.443282989812494e-06, "loss": 0.5066, "step": 642 }, { "epoch": 0.710759027266028, "grad_norm": 0.4502450227737427, "learning_rate": 9.440327155152667e-06, "loss": 0.5088, "step": 643 }, { "epoch": 0.711864406779661, "grad_norm": 0.3912629783153534, "learning_rate": 9.437363959523613e-06, "loss": 0.4788, "step": 644 }, { "epoch": 0.712969786293294, "grad_norm": 0.44197404384613037, "learning_rate": 9.434393407837585e-06, "loss": 0.4762, "step": 645 }, { "epoch": 0.7140751658069271, "grad_norm": 0.4473787248134613, "learning_rate": 9.431415505019024e-06, "loss": 0.5145, "step": 646 }, { "epoch": 0.7151805453205601, "grad_norm": 0.45003071427345276, "learning_rate": 9.428430256004558e-06, "loss": 0.5243, "step": 647 }, { "epoch": 0.7162859248341931, "grad_norm": 0.40199312567710876, "learning_rate": 9.425437665742998e-06, "loss": 0.4948, "step": 648 }, { "epoch": 0.717391304347826, "grad_norm": 0.46728041768074036, "learning_rate": 9.42243773919532e-06, "loss": 0.4967, "step": 649 }, { "epoch": 0.7184966838614592, "grad_norm": 0.4916439652442932, "learning_rate": 9.419430481334663e-06, "loss": 0.519, "step": 650 }, { "epoch": 0.7196020633750921, "grad_norm": 0.39190107583999634, "learning_rate": 9.416415897146325e-06, "loss": 0.5379, "step": 651 }, { "epoch": 0.7207074428887251, "grad_norm": 0.45062732696533203, "learning_rate": 9.413393991627737e-06, "loss": 0.4868, "step": 652 }, { "epoch": 0.7218128224023581, "grad_norm": 0.4452456831932068, "learning_rate": 9.410364769788481e-06, "loss": 0.5213, "step": 653 }, { "epoch": 0.7229182019159912, "grad_norm": 0.4030410647392273, "learning_rate": 9.407328236650257e-06, "loss": 0.4662, "step": 654 }, { "epoch": 0.7240235814296242, "grad_norm": 0.45318660140037537, "learning_rate": 9.40428439724689e-06, "loss": 0.5007, "step": 655 }, { "epoch": 0.7251289609432572, "grad_norm": 0.45293989777565, "learning_rate": 9.401233256624318e-06, "loss": 0.4981, "step": 656 }, { "epoch": 0.7262343404568902, "grad_norm": 0.4332043528556824, "learning_rate": 9.398174819840577e-06, "loss": 0.5051, "step": 657 }, { "epoch": 0.7273397199705232, "grad_norm": 0.4818902909755707, "learning_rate": 9.395109091965808e-06, "loss": 0.5189, "step": 658 }, { "epoch": 0.7284450994841563, "grad_norm": 0.506685197353363, "learning_rate": 9.392036078082229e-06, "loss": 0.4794, "step": 659 }, { "epoch": 0.7295504789977892, "grad_norm": 0.5043912529945374, "learning_rate": 9.388955783284141e-06, "loss": 0.4874, "step": 660 }, { "epoch": 0.7306558585114222, "grad_norm": 0.4825596511363983, "learning_rate": 9.385868212677917e-06, "loss": 0.4932, "step": 661 }, { "epoch": 0.7317612380250552, "grad_norm": 0.46176213026046753, "learning_rate": 9.382773371381986e-06, "loss": 0.4937, "step": 662 }, { "epoch": 0.7328666175386883, "grad_norm": 0.4218919575214386, "learning_rate": 9.379671264526833e-06, "loss": 0.5039, "step": 663 }, { "epoch": 0.7339719970523213, "grad_norm": 0.48071998357772827, "learning_rate": 9.376561897254987e-06, "loss": 0.5414, "step": 664 }, { "epoch": 0.7350773765659543, "grad_norm": 0.4574851393699646, "learning_rate": 9.373445274721017e-06, "loss": 0.4874, "step": 665 }, { "epoch": 0.7361827560795873, "grad_norm": 0.46952536702156067, "learning_rate": 9.370321402091514e-06, "loss": 0.518, "step": 666 }, { "epoch": 0.7372881355932204, "grad_norm": 0.4091287851333618, "learning_rate": 9.367190284545087e-06, "loss": 0.4719, "step": 667 }, { "epoch": 0.7383935151068534, "grad_norm": 0.5122862458229065, "learning_rate": 9.36405192727236e-06, "loss": 0.4875, "step": 668 }, { "epoch": 0.7394988946204863, "grad_norm": 0.4446588158607483, "learning_rate": 9.360906335475959e-06, "loss": 0.5063, "step": 669 }, { "epoch": 0.7406042741341193, "grad_norm": 0.49027779698371887, "learning_rate": 9.357753514370497e-06, "loss": 0.4716, "step": 670 }, { "epoch": 0.7417096536477524, "grad_norm": 0.5438688397407532, "learning_rate": 9.354593469182577e-06, "loss": 0.5066, "step": 671 }, { "epoch": 0.7428150331613854, "grad_norm": 0.40905964374542236, "learning_rate": 9.351426205150778e-06, "loss": 0.4912, "step": 672 }, { "epoch": 0.7439204126750184, "grad_norm": 0.4557497203350067, "learning_rate": 9.348251727525639e-06, "loss": 0.5165, "step": 673 }, { "epoch": 0.7450257921886514, "grad_norm": 0.5191920399665833, "learning_rate": 9.345070041569666e-06, "loss": 0.5079, "step": 674 }, { "epoch": 0.7461311717022845, "grad_norm": 0.5078957676887512, "learning_rate": 9.341881152557311e-06, "loss": 0.5048, "step": 675 }, { "epoch": 0.7472365512159175, "grad_norm": 0.5229098796844482, "learning_rate": 9.338685065774964e-06, "loss": 0.5059, "step": 676 }, { "epoch": 0.7483419307295505, "grad_norm": 0.5136930346488953, "learning_rate": 9.335481786520955e-06, "loss": 0.4799, "step": 677 }, { "epoch": 0.7494473102431835, "grad_norm": 0.5401421785354614, "learning_rate": 9.332271320105527e-06, "loss": 0.4993, "step": 678 }, { "epoch": 0.7505526897568165, "grad_norm": 0.5049564242362976, "learning_rate": 9.329053671850847e-06, "loss": 0.4897, "step": 679 }, { "epoch": 0.7516580692704495, "grad_norm": 0.4710080027580261, "learning_rate": 9.32582884709098e-06, "loss": 0.4982, "step": 680 }, { "epoch": 0.7527634487840825, "grad_norm": 0.5606082081794739, "learning_rate": 9.322596851171894e-06, "loss": 0.4908, "step": 681 }, { "epoch": 0.7538688282977155, "grad_norm": 0.4388163089752197, "learning_rate": 9.319357689451444e-06, "loss": 0.49, "step": 682 }, { "epoch": 0.7549742078113486, "grad_norm": 0.44062986969947815, "learning_rate": 9.316111367299358e-06, "loss": 0.4862, "step": 683 }, { "epoch": 0.7560795873249816, "grad_norm": 0.5375773310661316, "learning_rate": 9.312857890097243e-06, "loss": 0.4891, "step": 684 }, { "epoch": 0.7571849668386146, "grad_norm": 0.45706865191459656, "learning_rate": 9.309597263238561e-06, "loss": 0.4984, "step": 685 }, { "epoch": 0.7582903463522476, "grad_norm": 0.4425821900367737, "learning_rate": 9.30632949212863e-06, "loss": 0.4697, "step": 686 }, { "epoch": 0.7593957258658807, "grad_norm": 0.5073388814926147, "learning_rate": 9.30305458218461e-06, "loss": 0.5038, "step": 687 }, { "epoch": 0.7605011053795137, "grad_norm": 0.49822574853897095, "learning_rate": 9.299772538835492e-06, "loss": 0.4787, "step": 688 }, { "epoch": 0.7616064848931466, "grad_norm": 0.6178840398788452, "learning_rate": 9.2964833675221e-06, "loss": 0.4797, "step": 689 }, { "epoch": 0.7627118644067796, "grad_norm": 0.45991382002830505, "learning_rate": 9.29318707369707e-06, "loss": 0.526, "step": 690 }, { "epoch": 0.7638172439204127, "grad_norm": 0.5744336247444153, "learning_rate": 9.289883662824844e-06, "loss": 0.487, "step": 691 }, { "epoch": 0.7649226234340457, "grad_norm": 0.508097767829895, "learning_rate": 9.286573140381663e-06, "loss": 0.5309, "step": 692 }, { "epoch": 0.7660280029476787, "grad_norm": 0.5551114678382874, "learning_rate": 9.28325551185556e-06, "loss": 0.4883, "step": 693 }, { "epoch": 0.7671333824613117, "grad_norm": 0.46240028738975525, "learning_rate": 9.279930782746346e-06, "loss": 0.4953, "step": 694 }, { "epoch": 0.7682387619749448, "grad_norm": 0.48190081119537354, "learning_rate": 9.276598958565602e-06, "loss": 0.4973, "step": 695 }, { "epoch": 0.7693441414885778, "grad_norm": 0.5606549382209778, "learning_rate": 9.273260044836675e-06, "loss": 0.4979, "step": 696 }, { "epoch": 0.7704495210022108, "grad_norm": 0.4515574872493744, "learning_rate": 9.26991404709466e-06, "loss": 0.4867, "step": 697 }, { "epoch": 0.7715549005158437, "grad_norm": 0.4516068696975708, "learning_rate": 9.266560970886397e-06, "loss": 0.4864, "step": 698 }, { "epoch": 0.7726602800294768, "grad_norm": 0.4740319550037384, "learning_rate": 9.263200821770462e-06, "loss": 0.4877, "step": 699 }, { "epoch": 0.7737656595431098, "grad_norm": 0.42985427379608154, "learning_rate": 9.259833605317155e-06, "loss": 0.4893, "step": 700 }, { "epoch": 0.7748710390567428, "grad_norm": 0.46830248832702637, "learning_rate": 9.25645932710849e-06, "loss": 0.5034, "step": 701 }, { "epoch": 0.7759764185703758, "grad_norm": 0.44326698780059814, "learning_rate": 9.253077992738193e-06, "loss": 0.4953, "step": 702 }, { "epoch": 0.7770817980840088, "grad_norm": 0.41992080211639404, "learning_rate": 9.24968960781168e-06, "loss": 0.5169, "step": 703 }, { "epoch": 0.7781871775976419, "grad_norm": 0.4288303256034851, "learning_rate": 9.246294177946062e-06, "loss": 0.5037, "step": 704 }, { "epoch": 0.7792925571112749, "grad_norm": 0.515893816947937, "learning_rate": 9.242891708770122e-06, "loss": 0.4774, "step": 705 }, { "epoch": 0.7803979366249079, "grad_norm": 0.5638248324394226, "learning_rate": 9.239482205924322e-06, "loss": 0.484, "step": 706 }, { "epoch": 0.7815033161385408, "grad_norm": 0.42334336042404175, "learning_rate": 9.236065675060775e-06, "loss": 0.4781, "step": 707 }, { "epoch": 0.782608695652174, "grad_norm": 0.5962066650390625, "learning_rate": 9.232642121843247e-06, "loss": 0.5151, "step": 708 }, { "epoch": 0.7837140751658069, "grad_norm": 0.5468141436576843, "learning_rate": 9.229211551947148e-06, "loss": 0.505, "step": 709 }, { "epoch": 0.7848194546794399, "grad_norm": 0.45399045944213867, "learning_rate": 9.225773971059518e-06, "loss": 0.5126, "step": 710 }, { "epoch": 0.7859248341930729, "grad_norm": 0.4050552248954773, "learning_rate": 9.22232938487902e-06, "loss": 0.4846, "step": 711 }, { "epoch": 0.787030213706706, "grad_norm": 0.43107879161834717, "learning_rate": 9.218877799115929e-06, "loss": 0.469, "step": 712 }, { "epoch": 0.788135593220339, "grad_norm": 0.42511773109436035, "learning_rate": 9.215419219492126e-06, "loss": 0.4615, "step": 713 }, { "epoch": 0.789240972733972, "grad_norm": 0.43296122550964355, "learning_rate": 9.21195365174108e-06, "loss": 0.4913, "step": 714 }, { "epoch": 0.790346352247605, "grad_norm": 0.41232484579086304, "learning_rate": 9.208481101607856e-06, "loss": 0.4893, "step": 715 }, { "epoch": 0.7914517317612381, "grad_norm": 0.4678559899330139, "learning_rate": 9.205001574849081e-06, "loss": 0.5147, "step": 716 }, { "epoch": 0.792557111274871, "grad_norm": 0.38720783591270447, "learning_rate": 9.201515077232958e-06, "loss": 0.4724, "step": 717 }, { "epoch": 0.793662490788504, "grad_norm": 0.44568225741386414, "learning_rate": 9.19802161453924e-06, "loss": 0.5141, "step": 718 }, { "epoch": 0.794767870302137, "grad_norm": 0.4007556140422821, "learning_rate": 9.19452119255923e-06, "loss": 0.4707, "step": 719 }, { "epoch": 0.7958732498157701, "grad_norm": 0.403889536857605, "learning_rate": 9.191013817095762e-06, "loss": 0.5107, "step": 720 }, { "epoch": 0.7969786293294031, "grad_norm": 0.4523642957210541, "learning_rate": 9.187499493963203e-06, "loss": 0.4999, "step": 721 }, { "epoch": 0.7980840088430361, "grad_norm": 0.431140273809433, "learning_rate": 9.183978228987436e-06, "loss": 0.4967, "step": 722 }, { "epoch": 0.7991893883566691, "grad_norm": 0.5181386470794678, "learning_rate": 9.18045002800585e-06, "loss": 0.4816, "step": 723 }, { "epoch": 0.8002947678703022, "grad_norm": 0.45666712522506714, "learning_rate": 9.176914896867335e-06, "loss": 0.4904, "step": 724 }, { "epoch": 0.8014001473839352, "grad_norm": 0.44938504695892334, "learning_rate": 9.173372841432268e-06, "loss": 0.4787, "step": 725 }, { "epoch": 0.8025055268975682, "grad_norm": 0.4733509421348572, "learning_rate": 9.169823867572505e-06, "loss": 0.4771, "step": 726 }, { "epoch": 0.8036109064112011, "grad_norm": 0.4370712637901306, "learning_rate": 9.166267981171369e-06, "loss": 0.4909, "step": 727 }, { "epoch": 0.8047162859248342, "grad_norm": 0.47236597537994385, "learning_rate": 9.162705188123647e-06, "loss": 0.5057, "step": 728 }, { "epoch": 0.8058216654384672, "grad_norm": 0.4470880925655365, "learning_rate": 9.159135494335571e-06, "loss": 0.4968, "step": 729 }, { "epoch": 0.8069270449521002, "grad_norm": 0.5168840289115906, "learning_rate": 9.155558905724815e-06, "loss": 0.4756, "step": 730 }, { "epoch": 0.8080324244657332, "grad_norm": 0.49397364258766174, "learning_rate": 9.151975428220483e-06, "loss": 0.5127, "step": 731 }, { "epoch": 0.8091378039793663, "grad_norm": 0.44057250022888184, "learning_rate": 9.148385067763094e-06, "loss": 0.4804, "step": 732 }, { "epoch": 0.8102431834929993, "grad_norm": 0.5466852188110352, "learning_rate": 9.144787830304589e-06, "loss": 0.4678, "step": 733 }, { "epoch": 0.8113485630066323, "grad_norm": 0.40236005187034607, "learning_rate": 9.141183721808298e-06, "loss": 0.4696, "step": 734 }, { "epoch": 0.8124539425202653, "grad_norm": 0.46221259236335754, "learning_rate": 9.137572748248943e-06, "loss": 0.4927, "step": 735 }, { "epoch": 0.8135593220338984, "grad_norm": 0.4742516577243805, "learning_rate": 9.133954915612635e-06, "loss": 0.4999, "step": 736 }, { "epoch": 0.8146647015475313, "grad_norm": 0.5002638697624207, "learning_rate": 9.130330229896846e-06, "loss": 0.4923, "step": 737 }, { "epoch": 0.8157700810611643, "grad_norm": 0.45516252517700195, "learning_rate": 9.126698697110414e-06, "loss": 0.486, "step": 738 }, { "epoch": 0.8168754605747973, "grad_norm": 0.4336082935333252, "learning_rate": 9.123060323273526e-06, "loss": 0.5133, "step": 739 }, { "epoch": 0.8179808400884304, "grad_norm": 0.4531625509262085, "learning_rate": 9.119415114417709e-06, "loss": 0.4734, "step": 740 }, { "epoch": 0.8190862196020634, "grad_norm": 0.4107901155948639, "learning_rate": 9.115763076585823e-06, "loss": 0.4936, "step": 741 }, { "epoch": 0.8201915991156964, "grad_norm": 0.38894474506378174, "learning_rate": 9.112104215832047e-06, "loss": 0.5164, "step": 742 }, { "epoch": 0.8212969786293294, "grad_norm": 0.4883616864681244, "learning_rate": 9.108438538221872e-06, "loss": 0.4895, "step": 743 }, { "epoch": 0.8224023581429624, "grad_norm": 0.4212181270122528, "learning_rate": 9.104766049832088e-06, "loss": 0.5025, "step": 744 }, { "epoch": 0.8235077376565955, "grad_norm": 0.4108904302120209, "learning_rate": 9.101086756750777e-06, "loss": 0.4934, "step": 745 }, { "epoch": 0.8246131171702284, "grad_norm": 0.48212993144989014, "learning_rate": 9.0974006650773e-06, "loss": 0.5066, "step": 746 }, { "epoch": 0.8257184966838614, "grad_norm": 0.4938141107559204, "learning_rate": 9.093707780922293e-06, "loss": 0.4812, "step": 747 }, { "epoch": 0.8268238761974944, "grad_norm": 0.446444034576416, "learning_rate": 9.090008110407646e-06, "loss": 0.4707, "step": 748 }, { "epoch": 0.8279292557111275, "grad_norm": 0.42565932869911194, "learning_rate": 9.086301659666504e-06, "loss": 0.5042, "step": 749 }, { "epoch": 0.8290346352247605, "grad_norm": 0.4551891088485718, "learning_rate": 9.082588434843244e-06, "loss": 0.4765, "step": 750 }, { "epoch": 0.8301400147383935, "grad_norm": 0.41815781593322754, "learning_rate": 9.078868442093486e-06, "loss": 0.4732, "step": 751 }, { "epoch": 0.8312453942520265, "grad_norm": 0.4539969563484192, "learning_rate": 9.075141687584056e-06, "loss": 0.4758, "step": 752 }, { "epoch": 0.8323507737656596, "grad_norm": 0.4504568874835968, "learning_rate": 9.071408177493002e-06, "loss": 0.51, "step": 753 }, { "epoch": 0.8334561532792926, "grad_norm": 0.3745630085468292, "learning_rate": 9.067667918009559e-06, "loss": 0.4768, "step": 754 }, { "epoch": 0.8345615327929256, "grad_norm": 0.45687437057495117, "learning_rate": 9.063920915334158e-06, "loss": 0.522, "step": 755 }, { "epoch": 0.8356669123065585, "grad_norm": 0.4192422926425934, "learning_rate": 9.060167175678407e-06, "loss": 0.4665, "step": 756 }, { "epoch": 0.8367722918201916, "grad_norm": 0.43837225437164307, "learning_rate": 9.056406705265084e-06, "loss": 0.4942, "step": 757 }, { "epoch": 0.8378776713338246, "grad_norm": 0.4310995638370514, "learning_rate": 9.05263951032812e-06, "loss": 0.4678, "step": 758 }, { "epoch": 0.8389830508474576, "grad_norm": 0.40962496399879456, "learning_rate": 9.048865597112598e-06, "loss": 0.5037, "step": 759 }, { "epoch": 0.8400884303610906, "grad_norm": 0.4668165147304535, "learning_rate": 9.045084971874738e-06, "loss": 0.5123, "step": 760 }, { "epoch": 0.8411938098747237, "grad_norm": 0.4415664076805115, "learning_rate": 9.041297640881885e-06, "loss": 0.5117, "step": 761 }, { "epoch": 0.8422991893883567, "grad_norm": 0.4182766377925873, "learning_rate": 9.037503610412502e-06, "loss": 0.4785, "step": 762 }, { "epoch": 0.8434045689019897, "grad_norm": 0.3995317220687866, "learning_rate": 9.033702886756155e-06, "loss": 0.4849, "step": 763 }, { "epoch": 0.8445099484156227, "grad_norm": 0.4744563400745392, "learning_rate": 9.02989547621351e-06, "loss": 0.5078, "step": 764 }, { "epoch": 0.8456153279292558, "grad_norm": 0.41242721676826477, "learning_rate": 9.026081385096317e-06, "loss": 0.4788, "step": 765 }, { "epoch": 0.8467207074428887, "grad_norm": 0.4069625437259674, "learning_rate": 9.022260619727401e-06, "loss": 0.4837, "step": 766 }, { "epoch": 0.8478260869565217, "grad_norm": 0.4492427408695221, "learning_rate": 9.018433186440648e-06, "loss": 0.4724, "step": 767 }, { "epoch": 0.8489314664701547, "grad_norm": 0.4087703824043274, "learning_rate": 9.014599091581e-06, "loss": 0.4903, "step": 768 }, { "epoch": 0.8500368459837878, "grad_norm": 0.42628926038742065, "learning_rate": 9.010758341504442e-06, "loss": 0.487, "step": 769 }, { "epoch": 0.8511422254974208, "grad_norm": 0.4472368657588959, "learning_rate": 9.006910942577995e-06, "loss": 0.4965, "step": 770 }, { "epoch": 0.8522476050110538, "grad_norm": 0.4118722975254059, "learning_rate": 9.003056901179696e-06, "loss": 0.467, "step": 771 }, { "epoch": 0.8533529845246868, "grad_norm": 0.4160650670528412, "learning_rate": 8.999196223698599e-06, "loss": 0.4883, "step": 772 }, { "epoch": 0.8544583640383199, "grad_norm": 0.4747214913368225, "learning_rate": 8.995328916534753e-06, "loss": 0.5267, "step": 773 }, { "epoch": 0.8555637435519529, "grad_norm": 0.4269624948501587, "learning_rate": 8.991454986099207e-06, "loss": 0.5016, "step": 774 }, { "epoch": 0.8566691230655858, "grad_norm": 0.40572571754455566, "learning_rate": 8.987574438813979e-06, "loss": 0.4958, "step": 775 }, { "epoch": 0.8577745025792188, "grad_norm": 0.5109791159629822, "learning_rate": 8.983687281112066e-06, "loss": 0.4817, "step": 776 }, { "epoch": 0.8588798820928519, "grad_norm": 0.49037817120552063, "learning_rate": 8.979793519437413e-06, "loss": 0.4937, "step": 777 }, { "epoch": 0.8599852616064849, "grad_norm": 0.4259691834449768, "learning_rate": 8.975893160244921e-06, "loss": 0.4822, "step": 778 }, { "epoch": 0.8610906411201179, "grad_norm": 0.4702318012714386, "learning_rate": 8.971986210000425e-06, "loss": 0.4783, "step": 779 }, { "epoch": 0.8621960206337509, "grad_norm": 0.4500643014907837, "learning_rate": 8.968072675180686e-06, "loss": 0.4925, "step": 780 }, { "epoch": 0.863301400147384, "grad_norm": 0.4135689437389374, "learning_rate": 8.964152562273383e-06, "loss": 0.4828, "step": 781 }, { "epoch": 0.864406779661017, "grad_norm": 0.4733305275440216, "learning_rate": 8.960225877777095e-06, "loss": 0.5077, "step": 782 }, { "epoch": 0.86551215917465, "grad_norm": 0.44539913535118103, "learning_rate": 8.956292628201302e-06, "loss": 0.4813, "step": 783 }, { "epoch": 0.866617538688283, "grad_norm": 0.45779839158058167, "learning_rate": 8.952352820066359e-06, "loss": 0.4607, "step": 784 }, { "epoch": 0.8677229182019159, "grad_norm": 0.417237788438797, "learning_rate": 8.948406459903503e-06, "loss": 0.4652, "step": 785 }, { "epoch": 0.868828297715549, "grad_norm": 0.41786548495292664, "learning_rate": 8.944453554254823e-06, "loss": 0.4845, "step": 786 }, { "epoch": 0.869933677229182, "grad_norm": 0.47990882396698, "learning_rate": 8.940494109673266e-06, "loss": 0.4885, "step": 787 }, { "epoch": 0.871039056742815, "grad_norm": 0.4268331825733185, "learning_rate": 8.936528132722616e-06, "loss": 0.4815, "step": 788 }, { "epoch": 0.872144436256448, "grad_norm": 0.45276498794555664, "learning_rate": 8.932555629977483e-06, "loss": 0.4825, "step": 789 }, { "epoch": 0.8732498157700811, "grad_norm": 0.440952867269516, "learning_rate": 8.928576608023305e-06, "loss": 0.4897, "step": 790 }, { "epoch": 0.8743551952837141, "grad_norm": 0.4220770001411438, "learning_rate": 8.924591073456316e-06, "loss": 0.4801, "step": 791 }, { "epoch": 0.8754605747973471, "grad_norm": 0.5337914228439331, "learning_rate": 8.920599032883553e-06, "loss": 0.4827, "step": 792 }, { "epoch": 0.8765659543109801, "grad_norm": 0.46392151713371277, "learning_rate": 8.916600492922835e-06, "loss": 0.5125, "step": 793 }, { "epoch": 0.8776713338246132, "grad_norm": 0.4204712510108948, "learning_rate": 8.912595460202758e-06, "loss": 0.4677, "step": 794 }, { "epoch": 0.8787767133382461, "grad_norm": 0.43567296862602234, "learning_rate": 8.90858394136268e-06, "loss": 0.508, "step": 795 }, { "epoch": 0.8798820928518791, "grad_norm": 0.46844470500946045, "learning_rate": 8.90456594305271e-06, "loss": 0.4938, "step": 796 }, { "epoch": 0.8809874723655121, "grad_norm": 0.48057666420936584, "learning_rate": 8.900541471933703e-06, "loss": 0.4932, "step": 797 }, { "epoch": 0.8820928518791452, "grad_norm": 0.43345582485198975, "learning_rate": 8.896510534677238e-06, "loss": 0.4828, "step": 798 }, { "epoch": 0.8831982313927782, "grad_norm": 0.5057882070541382, "learning_rate": 8.89247313796562e-06, "loss": 0.5169, "step": 799 }, { "epoch": 0.8843036109064112, "grad_norm": 0.44683605432510376, "learning_rate": 8.888429288491857e-06, "loss": 0.4889, "step": 800 }, { "epoch": 0.8854089904200442, "grad_norm": 0.4905654489994049, "learning_rate": 8.884378992959655e-06, "loss": 0.5115, "step": 801 }, { "epoch": 0.8865143699336773, "grad_norm": 0.4457947611808777, "learning_rate": 8.880322258083408e-06, "loss": 0.5003, "step": 802 }, { "epoch": 0.8876197494473103, "grad_norm": 0.45204317569732666, "learning_rate": 8.876259090588183e-06, "loss": 0.458, "step": 803 }, { "epoch": 0.8887251289609432, "grad_norm": 0.4536559581756592, "learning_rate": 8.872189497209712e-06, "loss": 0.4831, "step": 804 }, { "epoch": 0.8898305084745762, "grad_norm": 0.4714140295982361, "learning_rate": 8.868113484694378e-06, "loss": 0.5102, "step": 805 }, { "epoch": 0.8909358879882093, "grad_norm": 0.4172220528125763, "learning_rate": 8.864031059799208e-06, "loss": 0.4945, "step": 806 }, { "epoch": 0.8920412675018423, "grad_norm": 0.4215790331363678, "learning_rate": 8.859942229291856e-06, "loss": 0.5048, "step": 807 }, { "epoch": 0.8931466470154753, "grad_norm": 0.519412636756897, "learning_rate": 8.855846999950595e-06, "loss": 0.486, "step": 808 }, { "epoch": 0.8942520265291083, "grad_norm": 0.4318355917930603, "learning_rate": 8.85174537856431e-06, "loss": 0.5023, "step": 809 }, { "epoch": 0.8953574060427414, "grad_norm": 0.48124274611473083, "learning_rate": 8.847637371932478e-06, "loss": 0.4991, "step": 810 }, { "epoch": 0.8964627855563744, "grad_norm": 0.480027437210083, "learning_rate": 8.843522986865162e-06, "loss": 0.4988, "step": 811 }, { "epoch": 0.8975681650700074, "grad_norm": 0.47645479440689087, "learning_rate": 8.839402230183e-06, "loss": 0.5106, "step": 812 }, { "epoch": 0.8986735445836403, "grad_norm": 0.46411827206611633, "learning_rate": 8.835275108717194e-06, "loss": 0.4597, "step": 813 }, { "epoch": 0.8997789240972734, "grad_norm": 0.4392440617084503, "learning_rate": 8.831141629309492e-06, "loss": 0.4597, "step": 814 }, { "epoch": 0.9008843036109064, "grad_norm": 0.4804568290710449, "learning_rate": 8.827001798812186e-06, "loss": 0.4948, "step": 815 }, { "epoch": 0.9019896831245394, "grad_norm": 0.452887624502182, "learning_rate": 8.822855624088099e-06, "loss": 0.4743, "step": 816 }, { "epoch": 0.9030950626381724, "grad_norm": 0.49586859345436096, "learning_rate": 8.818703112010562e-06, "loss": 0.4792, "step": 817 }, { "epoch": 0.9042004421518055, "grad_norm": 0.4155937433242798, "learning_rate": 8.814544269463422e-06, "loss": 0.4766, "step": 818 }, { "epoch": 0.9053058216654385, "grad_norm": 0.4366413354873657, "learning_rate": 8.810379103341019e-06, "loss": 0.4925, "step": 819 }, { "epoch": 0.9064112011790715, "grad_norm": 0.40687376260757446, "learning_rate": 8.806207620548165e-06, "loss": 0.4736, "step": 820 }, { "epoch": 0.9075165806927045, "grad_norm": 0.3992072343826294, "learning_rate": 8.802029828000157e-06, "loss": 0.4902, "step": 821 }, { "epoch": 0.9086219602063376, "grad_norm": 0.39756298065185547, "learning_rate": 8.797845732622742e-06, "loss": 0.4848, "step": 822 }, { "epoch": 0.9097273397199706, "grad_norm": 0.439359575510025, "learning_rate": 8.793655341352127e-06, "loss": 0.485, "step": 823 }, { "epoch": 0.9108327192336035, "grad_norm": 0.4608912169933319, "learning_rate": 8.789458661134943e-06, "loss": 0.4806, "step": 824 }, { "epoch": 0.9119380987472365, "grad_norm": 0.4098968207836151, "learning_rate": 8.785255698928255e-06, "loss": 0.4735, "step": 825 }, { "epoch": 0.9130434782608695, "grad_norm": 0.4056512713432312, "learning_rate": 8.781046461699538e-06, "loss": 0.4887, "step": 826 }, { "epoch": 0.9141488577745026, "grad_norm": 0.42948803305625916, "learning_rate": 8.776830956426674e-06, "loss": 0.4805, "step": 827 }, { "epoch": 0.9152542372881356, "grad_norm": 0.4673222601413727, "learning_rate": 8.772609190097932e-06, "loss": 0.487, "step": 828 }, { "epoch": 0.9163596168017686, "grad_norm": 0.5111664533615112, "learning_rate": 8.768381169711959e-06, "loss": 0.5015, "step": 829 }, { "epoch": 0.9174649963154016, "grad_norm": 0.45832693576812744, "learning_rate": 8.764146902277773e-06, "loss": 0.4547, "step": 830 }, { "epoch": 0.9185703758290347, "grad_norm": 0.4617200791835785, "learning_rate": 8.759906394814747e-06, "loss": 0.4882, "step": 831 }, { "epoch": 0.9196757553426677, "grad_norm": 0.38526666164398193, "learning_rate": 8.755659654352599e-06, "loss": 0.476, "step": 832 }, { "epoch": 0.9207811348563006, "grad_norm": 0.43947833776474, "learning_rate": 8.751406687931381e-06, "loss": 0.4693, "step": 833 }, { "epoch": 0.9218865143699336, "grad_norm": 0.46694955229759216, "learning_rate": 8.747147502601458e-06, "loss": 0.4957, "step": 834 }, { "epoch": 0.9229918938835667, "grad_norm": 0.4974941909313202, "learning_rate": 8.742882105423518e-06, "loss": 0.5149, "step": 835 }, { "epoch": 0.9240972733971997, "grad_norm": 0.4303239583969116, "learning_rate": 8.738610503468534e-06, "loss": 0.5019, "step": 836 }, { "epoch": 0.9252026529108327, "grad_norm": 0.5279948711395264, "learning_rate": 8.734332703817771e-06, "loss": 0.508, "step": 837 }, { "epoch": 0.9263080324244657, "grad_norm": 0.5840197205543518, "learning_rate": 8.730048713562771e-06, "loss": 0.5103, "step": 838 }, { "epoch": 0.9274134119380988, "grad_norm": 0.42592623829841614, "learning_rate": 8.725758539805333e-06, "loss": 0.5009, "step": 839 }, { "epoch": 0.9285187914517318, "grad_norm": 0.483877956867218, "learning_rate": 8.72146218965751e-06, "loss": 0.4619, "step": 840 }, { "epoch": 0.9296241709653648, "grad_norm": 0.4268472194671631, "learning_rate": 8.71715967024159e-06, "loss": 0.4676, "step": 841 }, { "epoch": 0.9307295504789977, "grad_norm": 0.4554291069507599, "learning_rate": 8.712850988690094e-06, "loss": 0.4831, "step": 842 }, { "epoch": 0.9318349299926308, "grad_norm": 0.43877583742141724, "learning_rate": 8.708536152145755e-06, "loss": 0.4926, "step": 843 }, { "epoch": 0.9329403095062638, "grad_norm": 0.41033995151519775, "learning_rate": 8.704215167761506e-06, "loss": 0.4766, "step": 844 }, { "epoch": 0.9340456890198968, "grad_norm": 0.4164173901081085, "learning_rate": 8.69988804270048e-06, "loss": 0.4874, "step": 845 }, { "epoch": 0.9351510685335298, "grad_norm": 0.4802247881889343, "learning_rate": 8.695554784135982e-06, "loss": 0.4988, "step": 846 }, { "epoch": 0.9362564480471629, "grad_norm": 0.40767449140548706, "learning_rate": 8.691215399251489e-06, "loss": 0.5078, "step": 847 }, { "epoch": 0.9373618275607959, "grad_norm": 0.4829815924167633, "learning_rate": 8.686869895240631e-06, "loss": 0.4848, "step": 848 }, { "epoch": 0.9384672070744289, "grad_norm": 0.44986405968666077, "learning_rate": 8.682518279307188e-06, "loss": 0.5098, "step": 849 }, { "epoch": 0.9395725865880619, "grad_norm": 0.4226650297641754, "learning_rate": 8.678160558665063e-06, "loss": 0.4756, "step": 850 }, { "epoch": 0.940677966101695, "grad_norm": 0.4422720968723297, "learning_rate": 8.673796740538287e-06, "loss": 0.4711, "step": 851 }, { "epoch": 0.941783345615328, "grad_norm": 0.4236817955970764, "learning_rate": 8.669426832160997e-06, "loss": 0.467, "step": 852 }, { "epoch": 0.9428887251289609, "grad_norm": 0.42468002438545227, "learning_rate": 8.665050840777422e-06, "loss": 0.4904, "step": 853 }, { "epoch": 0.9439941046425939, "grad_norm": 0.43013420701026917, "learning_rate": 8.66066877364188e-06, "loss": 0.51, "step": 854 }, { "epoch": 0.945099484156227, "grad_norm": 0.4210425913333893, "learning_rate": 8.656280638018759e-06, "loss": 0.4777, "step": 855 }, { "epoch": 0.94620486366986, "grad_norm": 0.4078814685344696, "learning_rate": 8.651886441182509e-06, "loss": 0.5053, "step": 856 }, { "epoch": 0.947310243183493, "grad_norm": 0.4467892646789551, "learning_rate": 8.647486190417624e-06, "loss": 0.4779, "step": 857 }, { "epoch": 0.948415622697126, "grad_norm": 0.4591618478298187, "learning_rate": 8.64307989301864e-06, "loss": 0.4887, "step": 858 }, { "epoch": 0.9495210022107591, "grad_norm": 0.43053239583969116, "learning_rate": 8.638667556290108e-06, "loss": 0.4792, "step": 859 }, { "epoch": 0.9506263817243921, "grad_norm": 0.4689154028892517, "learning_rate": 8.634249187546601e-06, "loss": 0.4854, "step": 860 }, { "epoch": 0.951731761238025, "grad_norm": 0.4494744539260864, "learning_rate": 8.629824794112686e-06, "loss": 0.4598, "step": 861 }, { "epoch": 0.952837140751658, "grad_norm": 0.42166227102279663, "learning_rate": 8.625394383322914e-06, "loss": 0.48, "step": 862 }, { "epoch": 0.9539425202652911, "grad_norm": 0.5064461827278137, "learning_rate": 8.62095796252182e-06, "loss": 0.5001, "step": 863 }, { "epoch": 0.9550478997789241, "grad_norm": 0.5115634202957153, "learning_rate": 8.616515539063894e-06, "loss": 0.4924, "step": 864 }, { "epoch": 0.9561532792925571, "grad_norm": 0.47794032096862793, "learning_rate": 8.612067120313583e-06, "loss": 0.4834, "step": 865 }, { "epoch": 0.9572586588061901, "grad_norm": 0.4660879373550415, "learning_rate": 8.60761271364527e-06, "loss": 0.4754, "step": 866 }, { "epoch": 0.9583640383198231, "grad_norm": 0.4753507673740387, "learning_rate": 8.603152326443262e-06, "loss": 0.5147, "step": 867 }, { "epoch": 0.9594694178334562, "grad_norm": 0.4895542562007904, "learning_rate": 8.598685966101783e-06, "loss": 0.5004, "step": 868 }, { "epoch": 0.9605747973470892, "grad_norm": 0.39868831634521484, "learning_rate": 8.594213640024961e-06, "loss": 0.5072, "step": 869 }, { "epoch": 0.9616801768607222, "grad_norm": 0.4305480718612671, "learning_rate": 8.589735355626814e-06, "loss": 0.4729, "step": 870 }, { "epoch": 0.9627855563743551, "grad_norm": 0.4958736002445221, "learning_rate": 8.585251120331228e-06, "loss": 0.4934, "step": 871 }, { "epoch": 0.9638909358879882, "grad_norm": 0.4178593158721924, "learning_rate": 8.580760941571968e-06, "loss": 0.4813, "step": 872 }, { "epoch": 0.9649963154016212, "grad_norm": 0.5182307362556458, "learning_rate": 8.57626482679264e-06, "loss": 0.4895, "step": 873 }, { "epoch": 0.9661016949152542, "grad_norm": 0.456022173166275, "learning_rate": 8.571762783446696e-06, "loss": 0.4972, "step": 874 }, { "epoch": 0.9672070744288872, "grad_norm": 0.4462178647518158, "learning_rate": 8.56725481899742e-06, "loss": 0.5042, "step": 875 }, { "epoch": 0.9683124539425203, "grad_norm": 0.4184209704399109, "learning_rate": 8.562740940917901e-06, "loss": 0.4698, "step": 876 }, { "epoch": 0.9694178334561533, "grad_norm": 0.4680977761745453, "learning_rate": 8.55822115669104e-06, "loss": 0.4907, "step": 877 }, { "epoch": 0.9705232129697863, "grad_norm": 0.43977299332618713, "learning_rate": 8.55369547380953e-06, "loss": 0.4867, "step": 878 }, { "epoch": 0.9716285924834193, "grad_norm": 0.4160180687904358, "learning_rate": 8.549163899775834e-06, "loss": 0.4933, "step": 879 }, { "epoch": 0.9727339719970524, "grad_norm": 0.4132622480392456, "learning_rate": 8.544626442102188e-06, "loss": 0.4983, "step": 880 }, { "epoch": 0.9738393515106853, "grad_norm": 0.42725232243537903, "learning_rate": 8.540083108310579e-06, "loss": 0.4933, "step": 881 }, { "epoch": 0.9749447310243183, "grad_norm": 0.45787328481674194, "learning_rate": 8.535533905932739e-06, "loss": 0.4576, "step": 882 }, { "epoch": 0.9760501105379513, "grad_norm": 0.42389991879463196, "learning_rate": 8.53097884251012e-06, "loss": 0.5026, "step": 883 }, { "epoch": 0.9771554900515844, "grad_norm": 0.4036232531070709, "learning_rate": 8.526417925593901e-06, "loss": 0.4548, "step": 884 }, { "epoch": 0.9782608695652174, "grad_norm": 0.44081807136535645, "learning_rate": 8.521851162744958e-06, "loss": 0.4997, "step": 885 }, { "epoch": 0.9793662490788504, "grad_norm": 0.52150559425354, "learning_rate": 8.517278561533857e-06, "loss": 0.4808, "step": 886 }, { "epoch": 0.9804716285924834, "grad_norm": 0.4498632252216339, "learning_rate": 8.512700129540847e-06, "loss": 0.4976, "step": 887 }, { "epoch": 0.9815770081061165, "grad_norm": 0.4432542622089386, "learning_rate": 8.50811587435584e-06, "loss": 0.5019, "step": 888 }, { "epoch": 0.9826823876197495, "grad_norm": 0.42049771547317505, "learning_rate": 8.503525803578405e-06, "loss": 0.4586, "step": 889 }, { "epoch": 0.9837877671333825, "grad_norm": 0.423035204410553, "learning_rate": 8.498929924817745e-06, "loss": 0.4638, "step": 890 }, { "epoch": 0.9848931466470154, "grad_norm": 0.45330503582954407, "learning_rate": 8.4943282456927e-06, "loss": 0.5168, "step": 891 }, { "epoch": 0.9859985261606485, "grad_norm": 0.4168023467063904, "learning_rate": 8.489720773831717e-06, "loss": 0.4593, "step": 892 }, { "epoch": 0.9871039056742815, "grad_norm": 0.45019084215164185, "learning_rate": 8.485107516872854e-06, "loss": 0.5059, "step": 893 }, { "epoch": 0.9882092851879145, "grad_norm": 0.4491893947124481, "learning_rate": 8.480488482463753e-06, "loss": 0.5085, "step": 894 }, { "epoch": 0.9893146647015475, "grad_norm": 0.4503895342350006, "learning_rate": 8.475863678261638e-06, "loss": 0.4689, "step": 895 }, { "epoch": 0.9904200442151806, "grad_norm": 0.493648499250412, "learning_rate": 8.471233111933291e-06, "loss": 0.5049, "step": 896 }, { "epoch": 0.9915254237288136, "grad_norm": 0.4031809866428375, "learning_rate": 8.466596791155055e-06, "loss": 0.5034, "step": 897 }, { "epoch": 0.9926308032424466, "grad_norm": 0.4961598813533783, "learning_rate": 8.461954723612807e-06, "loss": 0.4941, "step": 898 }, { "epoch": 0.9937361827560796, "grad_norm": 0.42914876341819763, "learning_rate": 8.457306917001952e-06, "loss": 0.4595, "step": 899 }, { "epoch": 0.9948415622697127, "grad_norm": 0.451678603887558, "learning_rate": 8.45265337902741e-06, "loss": 0.4949, "step": 900 }, { "epoch": 0.9959469417833456, "grad_norm": 0.43319398164749146, "learning_rate": 8.447994117403601e-06, "loss": 0.4998, "step": 901 }, { "epoch": 0.9970523212969786, "grad_norm": 0.521942138671875, "learning_rate": 8.443329139854434e-06, "loss": 0.4785, "step": 902 }, { "epoch": 0.9981577008106116, "grad_norm": 0.4527154564857483, "learning_rate": 8.43865845411329e-06, "loss": 0.5013, "step": 903 }, { "epoch": 0.9992630803242447, "grad_norm": 0.4612230658531189, "learning_rate": 8.433982067923021e-06, "loss": 0.5089, "step": 904 }, { "epoch": 1.0003684598378777, "grad_norm": 0.6573925614356995, "learning_rate": 8.429299989035922e-06, "loss": 0.6202, "step": 905 }, { "epoch": 1.0014738393515106, "grad_norm": 0.42655786871910095, "learning_rate": 8.424612225213726e-06, "loss": 0.4373, "step": 906 }, { "epoch": 1.0025792188651437, "grad_norm": 0.47309955954551697, "learning_rate": 8.419918784227592e-06, "loss": 0.4856, "step": 907 }, { "epoch": 1.0036845983787768, "grad_norm": 0.5385839343070984, "learning_rate": 8.41521967385809e-06, "loss": 0.503, "step": 908 }, { "epoch": 1.0047899778924096, "grad_norm": 0.4585118591785431, "learning_rate": 8.410514901895188e-06, "loss": 0.3983, "step": 909 }, { "epoch": 1.0058953574060427, "grad_norm": 0.42308610677719116, "learning_rate": 8.405804476138239e-06, "loss": 0.4697, "step": 910 }, { "epoch": 1.0070007369196758, "grad_norm": 0.3993573784828186, "learning_rate": 8.401088404395969e-06, "loss": 0.4094, "step": 911 }, { "epoch": 1.0081061164333087, "grad_norm": 0.4705113470554352, "learning_rate": 8.396366694486466e-06, "loss": 0.4599, "step": 912 }, { "epoch": 1.0092114959469418, "grad_norm": 0.4073447287082672, "learning_rate": 8.39163935423716e-06, "loss": 0.4541, "step": 913 }, { "epoch": 1.0103168754605747, "grad_norm": 0.4799155592918396, "learning_rate": 8.386906391484819e-06, "loss": 0.4923, "step": 914 }, { "epoch": 1.0114222549742078, "grad_norm": 0.48811179399490356, "learning_rate": 8.38216781407553e-06, "loss": 0.4761, "step": 915 }, { "epoch": 1.0125276344878409, "grad_norm": 0.4845130741596222, "learning_rate": 8.377423629864686e-06, "loss": 0.4474, "step": 916 }, { "epoch": 1.0136330140014738, "grad_norm": 0.5137596726417542, "learning_rate": 8.372673846716977e-06, "loss": 0.4252, "step": 917 }, { "epoch": 1.0147383935151069, "grad_norm": 0.49790093302726746, "learning_rate": 8.367918472506375e-06, "loss": 0.4329, "step": 918 }, { "epoch": 1.01584377302874, "grad_norm": 0.4481281340122223, "learning_rate": 8.36315751511612e-06, "loss": 0.4591, "step": 919 }, { "epoch": 1.0169491525423728, "grad_norm": 0.4647185504436493, "learning_rate": 8.358390982438706e-06, "loss": 0.4477, "step": 920 }, { "epoch": 1.018054532056006, "grad_norm": 0.45475560426712036, "learning_rate": 8.35361888237587e-06, "loss": 0.4666, "step": 921 }, { "epoch": 1.0191599115696388, "grad_norm": 0.4328113794326782, "learning_rate": 8.348841222838579e-06, "loss": 0.4401, "step": 922 }, { "epoch": 1.020265291083272, "grad_norm": 0.4792473316192627, "learning_rate": 8.344058011747021e-06, "loss": 0.4545, "step": 923 }, { "epoch": 1.021370670596905, "grad_norm": 0.4722910523414612, "learning_rate": 8.339269257030576e-06, "loss": 0.474, "step": 924 }, { "epoch": 1.0224760501105379, "grad_norm": 0.45502522587776184, "learning_rate": 8.334474966627826e-06, "loss": 0.5111, "step": 925 }, { "epoch": 1.023581429624171, "grad_norm": 0.46602070331573486, "learning_rate": 8.329675148486518e-06, "loss": 0.4758, "step": 926 }, { "epoch": 1.024686809137804, "grad_norm": 0.3883390724658966, "learning_rate": 8.324869810563573e-06, "loss": 0.4227, "step": 927 }, { "epoch": 1.025792188651437, "grad_norm": 0.48001739382743835, "learning_rate": 8.32005896082506e-06, "loss": 0.4878, "step": 928 }, { "epoch": 1.02689756816507, "grad_norm": 0.41404756903648376, "learning_rate": 8.315242607246176e-06, "loss": 0.4608, "step": 929 }, { "epoch": 1.028002947678703, "grad_norm": 0.41535237431526184, "learning_rate": 8.310420757811258e-06, "loss": 0.4512, "step": 930 }, { "epoch": 1.029108327192336, "grad_norm": 0.46859246492385864, "learning_rate": 8.30559342051374e-06, "loss": 0.4772, "step": 931 }, { "epoch": 1.0302137067059691, "grad_norm": 0.46060824394226074, "learning_rate": 8.30076060335616e-06, "loss": 0.4731, "step": 932 }, { "epoch": 1.031319086219602, "grad_norm": 0.39975038170814514, "learning_rate": 8.295922314350137e-06, "loss": 0.4496, "step": 933 }, { "epoch": 1.032424465733235, "grad_norm": 0.4640049636363983, "learning_rate": 8.291078561516368e-06, "loss": 0.4453, "step": 934 }, { "epoch": 1.0335298452468682, "grad_norm": 0.7588776350021362, "learning_rate": 8.2862293528846e-06, "loss": 0.4815, "step": 935 }, { "epoch": 1.034635224760501, "grad_norm": 0.4440298080444336, "learning_rate": 8.281374696493628e-06, "loss": 0.4488, "step": 936 }, { "epoch": 1.0357406042741342, "grad_norm": 0.4251173734664917, "learning_rate": 8.276514600391272e-06, "loss": 0.4558, "step": 937 }, { "epoch": 1.036845983787767, "grad_norm": 0.3778236508369446, "learning_rate": 8.271649072634381e-06, "loss": 0.4473, "step": 938 }, { "epoch": 1.0379513633014001, "grad_norm": 0.4082000255584717, "learning_rate": 8.2667781212888e-06, "loss": 0.4569, "step": 939 }, { "epoch": 1.0390567428150332, "grad_norm": 0.42234501242637634, "learning_rate": 8.261901754429367e-06, "loss": 0.4532, "step": 940 }, { "epoch": 1.0401621223286661, "grad_norm": 0.40084636211395264, "learning_rate": 8.257019980139897e-06, "loss": 0.4563, "step": 941 }, { "epoch": 1.0412675018422992, "grad_norm": 0.41254594922065735, "learning_rate": 8.25213280651317e-06, "loss": 0.4642, "step": 942 }, { "epoch": 1.042372881355932, "grad_norm": 0.37419232726097107, "learning_rate": 8.247240241650918e-06, "loss": 0.4163, "step": 943 }, { "epoch": 1.0434782608695652, "grad_norm": 0.4025315046310425, "learning_rate": 8.24234229366381e-06, "loss": 0.4964, "step": 944 }, { "epoch": 1.0445836403831983, "grad_norm": 0.39143264293670654, "learning_rate": 8.237438970671434e-06, "loss": 0.4061, "step": 945 }, { "epoch": 1.0456890198968312, "grad_norm": 0.4959494471549988, "learning_rate": 8.232530280802296e-06, "loss": 0.4647, "step": 946 }, { "epoch": 1.0467943994104643, "grad_norm": 0.40898388624191284, "learning_rate": 8.227616232193794e-06, "loss": 0.4995, "step": 947 }, { "epoch": 1.0478997789240974, "grad_norm": 0.4148244559764862, "learning_rate": 8.222696832992208e-06, "loss": 0.4482, "step": 948 }, { "epoch": 1.0490051584377302, "grad_norm": 0.4869987964630127, "learning_rate": 8.217772091352696e-06, "loss": 0.4659, "step": 949 }, { "epoch": 1.0501105379513633, "grad_norm": 0.4471106231212616, "learning_rate": 8.212842015439263e-06, "loss": 0.4601, "step": 950 }, { "epoch": 1.0512159174649962, "grad_norm": 0.41724562644958496, "learning_rate": 8.207906613424763e-06, "loss": 0.413, "step": 951 }, { "epoch": 1.0523212969786293, "grad_norm": 0.438362181186676, "learning_rate": 8.202965893490877e-06, "loss": 0.4872, "step": 952 }, { "epoch": 1.0534266764922624, "grad_norm": 0.3890039026737213, "learning_rate": 8.198019863828102e-06, "loss": 0.4121, "step": 953 }, { "epoch": 1.0545320560058953, "grad_norm": 0.4416607916355133, "learning_rate": 8.193068532635737e-06, "loss": 0.4404, "step": 954 }, { "epoch": 1.0556374355195284, "grad_norm": 0.4113590121269226, "learning_rate": 8.188111908121874e-06, "loss": 0.407, "step": 955 }, { "epoch": 1.0567428150331615, "grad_norm": 0.4102858603000641, "learning_rate": 8.18314999850337e-06, "loss": 0.5272, "step": 956 }, { "epoch": 1.0578481945467944, "grad_norm": 0.43052491545677185, "learning_rate": 8.178182812005853e-06, "loss": 0.4662, "step": 957 }, { "epoch": 1.0589535740604274, "grad_norm": 0.39660048484802246, "learning_rate": 8.173210356863696e-06, "loss": 0.4505, "step": 958 }, { "epoch": 1.0600589535740603, "grad_norm": 0.3605460822582245, "learning_rate": 8.168232641320003e-06, "loss": 0.4121, "step": 959 }, { "epoch": 1.0611643330876934, "grad_norm": 0.4206472337245941, "learning_rate": 8.163249673626603e-06, "loss": 0.4423, "step": 960 }, { "epoch": 1.0622697126013265, "grad_norm": 0.4133666753768921, "learning_rate": 8.158261462044028e-06, "loss": 0.4491, "step": 961 }, { "epoch": 1.0633750921149594, "grad_norm": 0.4245593547821045, "learning_rate": 8.153268014841507e-06, "loss": 0.4602, "step": 962 }, { "epoch": 1.0644804716285925, "grad_norm": 0.5062248706817627, "learning_rate": 8.148269340296943e-06, "loss": 0.4343, "step": 963 }, { "epoch": 1.0655858511422256, "grad_norm": 0.45495492219924927, "learning_rate": 8.143265446696909e-06, "loss": 0.5059, "step": 964 }, { "epoch": 1.0666912306558585, "grad_norm": 0.4155999422073364, "learning_rate": 8.13825634233663e-06, "loss": 0.4226, "step": 965 }, { "epoch": 1.0677966101694916, "grad_norm": 0.4355109632015228, "learning_rate": 8.133242035519968e-06, "loss": 0.4473, "step": 966 }, { "epoch": 1.0689019896831244, "grad_norm": 0.4636383056640625, "learning_rate": 8.128222534559406e-06, "loss": 0.4749, "step": 967 }, { "epoch": 1.0700073691967575, "grad_norm": 0.4772741198539734, "learning_rate": 8.123197847776043e-06, "loss": 0.4442, "step": 968 }, { "epoch": 1.0711127487103906, "grad_norm": 0.4402015209197998, "learning_rate": 8.118167983499573e-06, "loss": 0.48, "step": 969 }, { "epoch": 1.0722181282240235, "grad_norm": 0.47390079498291016, "learning_rate": 8.113132950068272e-06, "loss": 0.4345, "step": 970 }, { "epoch": 1.0733235077376566, "grad_norm": 0.4544709622859955, "learning_rate": 8.108092755828984e-06, "loss": 0.4799, "step": 971 }, { "epoch": 1.0744288872512897, "grad_norm": 0.40329885482788086, "learning_rate": 8.103047409137114e-06, "loss": 0.4432, "step": 972 }, { "epoch": 1.0755342667649226, "grad_norm": 0.4449723958969116, "learning_rate": 8.097996918356603e-06, "loss": 0.4849, "step": 973 }, { "epoch": 1.0766396462785557, "grad_norm": 0.47747352719306946, "learning_rate": 8.09294129185992e-06, "loss": 0.4612, "step": 974 }, { "epoch": 1.0777450257921886, "grad_norm": 0.3962857723236084, "learning_rate": 8.08788053802805e-06, "loss": 0.4357, "step": 975 }, { "epoch": 1.0788504053058217, "grad_norm": 0.42324206233024597, "learning_rate": 8.082814665250476e-06, "loss": 0.4983, "step": 976 }, { "epoch": 1.0799557848194548, "grad_norm": 0.4634276330471039, "learning_rate": 8.07774368192517e-06, "loss": 0.4692, "step": 977 }, { "epoch": 1.0810611643330876, "grad_norm": 0.37491491436958313, "learning_rate": 8.072667596458573e-06, "loss": 0.4207, "step": 978 }, { "epoch": 1.0821665438467207, "grad_norm": 0.42738163471221924, "learning_rate": 8.067586417265584e-06, "loss": 0.4727, "step": 979 }, { "epoch": 1.0832719233603538, "grad_norm": 0.39351728558540344, "learning_rate": 8.062500152769547e-06, "loss": 0.4215, "step": 980 }, { "epoch": 1.0843773028739867, "grad_norm": 0.4577445089817047, "learning_rate": 8.057408811402239e-06, "loss": 0.5153, "step": 981 }, { "epoch": 1.0854826823876198, "grad_norm": 0.39283058047294617, "learning_rate": 8.052312401603848e-06, "loss": 0.4316, "step": 982 }, { "epoch": 1.0865880619012527, "grad_norm": 0.4522751271724701, "learning_rate": 8.047210931822967e-06, "loss": 0.512, "step": 983 }, { "epoch": 1.0876934414148858, "grad_norm": 0.44578850269317627, "learning_rate": 8.042104410516576e-06, "loss": 0.4107, "step": 984 }, { "epoch": 1.0887988209285189, "grad_norm": 0.42070865631103516, "learning_rate": 8.036992846150033e-06, "loss": 0.4597, "step": 985 }, { "epoch": 1.0899042004421517, "grad_norm": 0.4285004734992981, "learning_rate": 8.03187624719705e-06, "loss": 0.4612, "step": 986 }, { "epoch": 1.0910095799557848, "grad_norm": 0.4020606577396393, "learning_rate": 8.026754622139691e-06, "loss": 0.4412, "step": 987 }, { "epoch": 1.0921149594694177, "grad_norm": 0.4920555353164673, "learning_rate": 8.021627979468348e-06, "loss": 0.4753, "step": 988 }, { "epoch": 1.0932203389830508, "grad_norm": 0.39559850096702576, "learning_rate": 8.016496327681734e-06, "loss": 0.4292, "step": 989 }, { "epoch": 1.094325718496684, "grad_norm": 0.4804953932762146, "learning_rate": 8.01135967528686e-06, "loss": 0.4556, "step": 990 }, { "epoch": 1.0954310980103168, "grad_norm": 0.4527795910835266, "learning_rate": 8.006218030799038e-06, "loss": 0.4248, "step": 991 }, { "epoch": 1.09653647752395, "grad_norm": 0.43357619643211365, "learning_rate": 8.001071402741843e-06, "loss": 0.4725, "step": 992 }, { "epoch": 1.097641857037583, "grad_norm": 0.4797877073287964, "learning_rate": 7.995919799647118e-06, "loss": 0.4806, "step": 993 }, { "epoch": 1.0987472365512159, "grad_norm": 0.435132771730423, "learning_rate": 7.990763230054953e-06, "loss": 0.4667, "step": 994 }, { "epoch": 1.099852616064849, "grad_norm": 0.3868756890296936, "learning_rate": 7.985601702513675e-06, "loss": 0.447, "step": 995 }, { "epoch": 1.100957995578482, "grad_norm": 0.5021476149559021, "learning_rate": 7.980435225579819e-06, "loss": 0.4767, "step": 996 }, { "epoch": 1.102063375092115, "grad_norm": 0.43748944997787476, "learning_rate": 7.975263807818136e-06, "loss": 0.3819, "step": 997 }, { "epoch": 1.103168754605748, "grad_norm": 0.39699864387512207, "learning_rate": 7.970087457801563e-06, "loss": 0.5076, "step": 998 }, { "epoch": 1.104274134119381, "grad_norm": 0.4288889765739441, "learning_rate": 7.964906184111214e-06, "loss": 0.4322, "step": 999 }, { "epoch": 1.105379513633014, "grad_norm": 0.5798135995864868, "learning_rate": 7.959719995336364e-06, "loss": 0.4815, "step": 1000 }, { "epoch": 1.106484893146647, "grad_norm": 0.3861381411552429, "learning_rate": 7.954528900074438e-06, "loss": 0.4534, "step": 1001 }, { "epoch": 1.10759027266028, "grad_norm": 0.5081346035003662, "learning_rate": 7.949332906930995e-06, "loss": 0.4669, "step": 1002 }, { "epoch": 1.108695652173913, "grad_norm": 0.47156327962875366, "learning_rate": 7.94413202451971e-06, "loss": 0.4763, "step": 1003 }, { "epoch": 1.109801031687546, "grad_norm": 0.37115100026130676, "learning_rate": 7.938926261462366e-06, "loss": 0.4075, "step": 1004 }, { "epoch": 1.110906411201179, "grad_norm": 0.44342729449272156, "learning_rate": 7.933715626388838e-06, "loss": 0.4724, "step": 1005 }, { "epoch": 1.1120117907148122, "grad_norm": 0.4394318163394928, "learning_rate": 7.928500127937075e-06, "loss": 0.4761, "step": 1006 }, { "epoch": 1.113117170228445, "grad_norm": 0.4064244031906128, "learning_rate": 7.923279774753092e-06, "loss": 0.4464, "step": 1007 }, { "epoch": 1.1142225497420781, "grad_norm": 0.41119179129600525, "learning_rate": 7.918054575490943e-06, "loss": 0.4541, "step": 1008 }, { "epoch": 1.1153279292557112, "grad_norm": 0.4054982364177704, "learning_rate": 7.912824538812729e-06, "loss": 0.421, "step": 1009 }, { "epoch": 1.116433308769344, "grad_norm": 0.4360243082046509, "learning_rate": 7.90758967338856e-06, "loss": 0.4753, "step": 1010 }, { "epoch": 1.1175386882829772, "grad_norm": 0.3944956362247467, "learning_rate": 7.902349987896554e-06, "loss": 0.4632, "step": 1011 }, { "epoch": 1.11864406779661, "grad_norm": 0.4160965085029602, "learning_rate": 7.897105491022819e-06, "loss": 0.4466, "step": 1012 }, { "epoch": 1.1197494473102432, "grad_norm": 0.43728724122047424, "learning_rate": 7.891856191461441e-06, "loss": 0.4944, "step": 1013 }, { "epoch": 1.1208548268238763, "grad_norm": 0.44369879364967346, "learning_rate": 7.886602097914466e-06, "loss": 0.4721, "step": 1014 }, { "epoch": 1.1219602063375091, "grad_norm": 0.40321722626686096, "learning_rate": 7.881343219091887e-06, "loss": 0.4146, "step": 1015 }, { "epoch": 1.1230655858511422, "grad_norm": 0.5056473016738892, "learning_rate": 7.876079563711631e-06, "loss": 0.5143, "step": 1016 }, { "epoch": 1.1241709653647753, "grad_norm": 0.3835659325122833, "learning_rate": 7.870811140499543e-06, "loss": 0.4279, "step": 1017 }, { "epoch": 1.1252763448784082, "grad_norm": 0.49890124797821045, "learning_rate": 7.86553795818937e-06, "loss": 0.4631, "step": 1018 }, { "epoch": 1.1263817243920413, "grad_norm": 0.4587661027908325, "learning_rate": 7.86026002552275e-06, "loss": 0.445, "step": 1019 }, { "epoch": 1.1274871039056742, "grad_norm": 0.3993169665336609, "learning_rate": 7.854977351249199e-06, "loss": 0.4296, "step": 1020 }, { "epoch": 1.1285924834193073, "grad_norm": 0.39011457562446594, "learning_rate": 7.849689944126088e-06, "loss": 0.4312, "step": 1021 }, { "epoch": 1.1296978629329404, "grad_norm": 0.4394485056400299, "learning_rate": 7.844397812918637e-06, "loss": 0.4594, "step": 1022 }, { "epoch": 1.1308032424465733, "grad_norm": 0.4140453040599823, "learning_rate": 7.839100966399894e-06, "loss": 0.4795, "step": 1023 }, { "epoch": 1.1319086219602064, "grad_norm": 0.37442588806152344, "learning_rate": 7.833799413350732e-06, "loss": 0.4301, "step": 1024 }, { "epoch": 1.1330140014738395, "grad_norm": 0.4787435233592987, "learning_rate": 7.828493162559814e-06, "loss": 0.5049, "step": 1025 }, { "epoch": 1.1341193809874723, "grad_norm": 0.3800182342529297, "learning_rate": 7.823182222823603e-06, "loss": 0.4156, "step": 1026 }, { "epoch": 1.1352247605011054, "grad_norm": 0.3985009789466858, "learning_rate": 7.817866602946326e-06, "loss": 0.4886, "step": 1027 }, { "epoch": 1.1363301400147383, "grad_norm": 0.3471875786781311, "learning_rate": 7.812546311739976e-06, "loss": 0.4122, "step": 1028 }, { "epoch": 1.1374355195283714, "grad_norm": 0.3986920416355133, "learning_rate": 7.807221358024282e-06, "loss": 0.4747, "step": 1029 }, { "epoch": 1.1385408990420045, "grad_norm": 0.3859270513057709, "learning_rate": 7.801891750626706e-06, "loss": 0.4669, "step": 1030 }, { "epoch": 1.1396462785556374, "grad_norm": 0.39931902289390564, "learning_rate": 7.79655749838243e-06, "loss": 0.4154, "step": 1031 }, { "epoch": 1.1407516580692705, "grad_norm": 0.3783586323261261, "learning_rate": 7.791218610134324e-06, "loss": 0.4656, "step": 1032 }, { "epoch": 1.1418570375829034, "grad_norm": 0.4276062548160553, "learning_rate": 7.785875094732955e-06, "loss": 0.4738, "step": 1033 }, { "epoch": 1.1429624170965365, "grad_norm": 0.38978928327560425, "learning_rate": 7.780526961036556e-06, "loss": 0.4287, "step": 1034 }, { "epoch": 1.1440677966101696, "grad_norm": 0.3704260289669037, "learning_rate": 7.775174217911015e-06, "loss": 0.4505, "step": 1035 }, { "epoch": 1.1451731761238024, "grad_norm": 0.4558396637439728, "learning_rate": 7.769816874229862e-06, "loss": 0.5041, "step": 1036 }, { "epoch": 1.1462785556374355, "grad_norm": 0.4063161313533783, "learning_rate": 7.764454938874252e-06, "loss": 0.4328, "step": 1037 }, { "epoch": 1.1473839351510686, "grad_norm": 0.4459267556667328, "learning_rate": 7.759088420732958e-06, "loss": 0.4822, "step": 1038 }, { "epoch": 1.1484893146647015, "grad_norm": 0.42000794410705566, "learning_rate": 7.753717328702343e-06, "loss": 0.4632, "step": 1039 }, { "epoch": 1.1495946941783346, "grad_norm": 0.39485952258110046, "learning_rate": 7.748341671686355e-06, "loss": 0.4116, "step": 1040 }, { "epoch": 1.1507000736919677, "grad_norm": 0.40772002935409546, "learning_rate": 7.74296145859651e-06, "loss": 0.4937, "step": 1041 }, { "epoch": 1.1518054532056006, "grad_norm": 0.4053741991519928, "learning_rate": 7.737576698351878e-06, "loss": 0.4487, "step": 1042 }, { "epoch": 1.1529108327192337, "grad_norm": 0.4123914837837219, "learning_rate": 7.732187399879065e-06, "loss": 0.4951, "step": 1043 }, { "epoch": 1.1540162122328665, "grad_norm": 0.340718537569046, "learning_rate": 7.726793572112203e-06, "loss": 0.3749, "step": 1044 }, { "epoch": 1.1551215917464996, "grad_norm": 0.40388378500938416, "learning_rate": 7.721395223992926e-06, "loss": 0.4699, "step": 1045 }, { "epoch": 1.1562269712601327, "grad_norm": 0.4603058397769928, "learning_rate": 7.715992364470371e-06, "loss": 0.4797, "step": 1046 }, { "epoch": 1.1573323507737656, "grad_norm": 0.3819417655467987, "learning_rate": 7.710585002501145e-06, "loss": 0.448, "step": 1047 }, { "epoch": 1.1584377302873987, "grad_norm": 0.41214630007743835, "learning_rate": 7.705173147049326e-06, "loss": 0.4203, "step": 1048 }, { "epoch": 1.1595431098010316, "grad_norm": 0.42238011956214905, "learning_rate": 7.699756807086435e-06, "loss": 0.4603, "step": 1049 }, { "epoch": 1.1606484893146647, "grad_norm": 0.4293155074119568, "learning_rate": 7.694335991591431e-06, "loss": 0.455, "step": 1050 }, { "epoch": 1.1617538688282978, "grad_norm": 0.41166290640830994, "learning_rate": 7.688910709550693e-06, "loss": 0.4501, "step": 1051 }, { "epoch": 1.1628592483419307, "grad_norm": 0.5106882452964783, "learning_rate": 7.683480969958005e-06, "loss": 0.4473, "step": 1052 }, { "epoch": 1.1639646278555638, "grad_norm": 0.39939138293266296, "learning_rate": 7.67804678181453e-06, "loss": 0.4517, "step": 1053 }, { "epoch": 1.1650700073691969, "grad_norm": 0.4186530113220215, "learning_rate": 7.672608154128824e-06, "loss": 0.4419, "step": 1054 }, { "epoch": 1.1661753868828297, "grad_norm": 0.4475560784339905, "learning_rate": 7.667165095916787e-06, "loss": 0.4602, "step": 1055 }, { "epoch": 1.1672807663964628, "grad_norm": 0.41778939962387085, "learning_rate": 7.66171761620167e-06, "loss": 0.4627, "step": 1056 }, { "epoch": 1.1683861459100957, "grad_norm": 0.43467220664024353, "learning_rate": 7.656265724014054e-06, "loss": 0.4204, "step": 1057 }, { "epoch": 1.1694915254237288, "grad_norm": 0.40398141741752625, "learning_rate": 7.650809428391834e-06, "loss": 0.4185, "step": 1058 }, { "epoch": 1.170596904937362, "grad_norm": 0.428554892539978, "learning_rate": 7.645348738380207e-06, "loss": 0.461, "step": 1059 }, { "epoch": 1.1717022844509948, "grad_norm": 0.5255104899406433, "learning_rate": 7.63988366303165e-06, "loss": 0.4802, "step": 1060 }, { "epoch": 1.1728076639646279, "grad_norm": 0.43826889991760254, "learning_rate": 7.634414211405911e-06, "loss": 0.4433, "step": 1061 }, { "epoch": 1.1739130434782608, "grad_norm": 0.4056016504764557, "learning_rate": 7.628940392569995e-06, "loss": 0.4658, "step": 1062 }, { "epoch": 1.1750184229918939, "grad_norm": 0.3812839686870575, "learning_rate": 7.623462215598148e-06, "loss": 0.4355, "step": 1063 }, { "epoch": 1.176123802505527, "grad_norm": 0.3963984549045563, "learning_rate": 7.61797968957184e-06, "loss": 0.4474, "step": 1064 }, { "epoch": 1.1772291820191598, "grad_norm": 0.396513968706131, "learning_rate": 7.612492823579744e-06, "loss": 0.4313, "step": 1065 }, { "epoch": 1.178334561532793, "grad_norm": 0.4507621228694916, "learning_rate": 7.607001626717738e-06, "loss": 0.4739, "step": 1066 }, { "epoch": 1.179439941046426, "grad_norm": 0.39765697717666626, "learning_rate": 7.601506108088874e-06, "loss": 0.4307, "step": 1067 }, { "epoch": 1.180545320560059, "grad_norm": 0.4306188225746155, "learning_rate": 7.596006276803365e-06, "loss": 0.4765, "step": 1068 }, { "epoch": 1.181650700073692, "grad_norm": 0.42180269956588745, "learning_rate": 7.590502141978581e-06, "loss": 0.435, "step": 1069 }, { "epoch": 1.182756079587325, "grad_norm": 0.40602344274520874, "learning_rate": 7.58499371273902e-06, "loss": 0.4716, "step": 1070 }, { "epoch": 1.183861459100958, "grad_norm": 0.4175351858139038, "learning_rate": 7.579480998216304e-06, "loss": 0.4351, "step": 1071 }, { "epoch": 1.184966838614591, "grad_norm": 0.4109854996204376, "learning_rate": 7.5739640075491546e-06, "loss": 0.4446, "step": 1072 }, { "epoch": 1.186072218128224, "grad_norm": 0.4522290527820587, "learning_rate": 7.5684427498833836e-06, "loss": 0.4852, "step": 1073 }, { "epoch": 1.187177597641857, "grad_norm": 0.40911978483200073, "learning_rate": 7.562917234371879e-06, "loss": 0.476, "step": 1074 }, { "epoch": 1.1882829771554901, "grad_norm": 0.36169755458831787, "learning_rate": 7.557387470174584e-06, "loss": 0.4196, "step": 1075 }, { "epoch": 1.189388356669123, "grad_norm": 0.4266171455383301, "learning_rate": 7.551853466458486e-06, "loss": 0.4524, "step": 1076 }, { "epoch": 1.1904937361827561, "grad_norm": 0.3860180675983429, "learning_rate": 7.546315232397601e-06, "loss": 0.4285, "step": 1077 }, { "epoch": 1.191599115696389, "grad_norm": 0.5117107629776001, "learning_rate": 7.540772777172958e-06, "loss": 0.4805, "step": 1078 }, { "epoch": 1.192704495210022, "grad_norm": 0.4055849015712738, "learning_rate": 7.535226109972582e-06, "loss": 0.4525, "step": 1079 }, { "epoch": 1.1938098747236552, "grad_norm": 0.3842204809188843, "learning_rate": 7.529675239991483e-06, "loss": 0.449, "step": 1080 }, { "epoch": 1.194915254237288, "grad_norm": 0.52243572473526, "learning_rate": 7.524120176431636e-06, "loss": 0.5198, "step": 1081 }, { "epoch": 1.1960206337509212, "grad_norm": 0.44179287552833557, "learning_rate": 7.518560928501969e-06, "loss": 0.4663, "step": 1082 }, { "epoch": 1.1971260132645543, "grad_norm": 0.3928244709968567, "learning_rate": 7.512997505418347e-06, "loss": 0.4326, "step": 1083 }, { "epoch": 1.1982313927781871, "grad_norm": 0.49839723110198975, "learning_rate": 7.507429916403553e-06, "loss": 0.4905, "step": 1084 }, { "epoch": 1.1993367722918202, "grad_norm": 0.45600688457489014, "learning_rate": 7.5018581706872815e-06, "loss": 0.4539, "step": 1085 }, { "epoch": 1.2004421518054533, "grad_norm": 0.43748632073402405, "learning_rate": 7.496282277506115e-06, "loss": 0.4032, "step": 1086 }, { "epoch": 1.2015475313190862, "grad_norm": 0.4158378839492798, "learning_rate": 7.4907022461035125e-06, "loss": 0.3934, "step": 1087 }, { "epoch": 1.2026529108327193, "grad_norm": 0.446931928396225, "learning_rate": 7.48511808572979e-06, "loss": 0.434, "step": 1088 }, { "epoch": 1.2037582903463522, "grad_norm": 0.466039776802063, "learning_rate": 7.479529805642112e-06, "loss": 0.5015, "step": 1089 }, { "epoch": 1.2048636698599853, "grad_norm": 0.4409915506839752, "learning_rate": 7.473937415104471e-06, "loss": 0.4747, "step": 1090 }, { "epoch": 1.2059690493736182, "grad_norm": 0.48441535234451294, "learning_rate": 7.468340923387672e-06, "loss": 0.4233, "step": 1091 }, { "epoch": 1.2070744288872512, "grad_norm": 0.45815229415893555, "learning_rate": 7.462740339769323e-06, "loss": 0.4456, "step": 1092 }, { "epoch": 1.2081798084008843, "grad_norm": 0.42775869369506836, "learning_rate": 7.457135673533811e-06, "loss": 0.5042, "step": 1093 }, { "epoch": 1.2092851879145172, "grad_norm": 0.40356263518333435, "learning_rate": 7.451526933972294e-06, "loss": 0.4167, "step": 1094 }, { "epoch": 1.2103905674281503, "grad_norm": 0.5347933173179626, "learning_rate": 7.445914130382679e-06, "loss": 0.4714, "step": 1095 }, { "epoch": 1.2114959469417834, "grad_norm": 0.4342212975025177, "learning_rate": 7.440297272069615e-06, "loss": 0.4629, "step": 1096 }, { "epoch": 1.2126013264554163, "grad_norm": 0.47652867436408997, "learning_rate": 7.434676368344469e-06, "loss": 0.5002, "step": 1097 }, { "epoch": 1.2137067059690494, "grad_norm": 0.4299674332141876, "learning_rate": 7.429051428525318e-06, "loss": 0.4021, "step": 1098 }, { "epoch": 1.2148120854826825, "grad_norm": 0.45697999000549316, "learning_rate": 7.4234224619369235e-06, "loss": 0.4495, "step": 1099 }, { "epoch": 1.2159174649963154, "grad_norm": 0.4714875817298889, "learning_rate": 7.417789477910728e-06, "loss": 0.4501, "step": 1100 }, { "epoch": 1.2170228445099485, "grad_norm": 0.46141529083251953, "learning_rate": 7.412152485784834e-06, "loss": 0.4681, "step": 1101 }, { "epoch": 1.2181282240235813, "grad_norm": 0.43832409381866455, "learning_rate": 7.406511494903982e-06, "loss": 0.4125, "step": 1102 }, { "epoch": 1.2192336035372144, "grad_norm": 0.38011008501052856, "learning_rate": 7.400866514619551e-06, "loss": 0.456, "step": 1103 }, { "epoch": 1.2203389830508475, "grad_norm": 0.42274919152259827, "learning_rate": 7.395217554289524e-06, "loss": 0.4257, "step": 1104 }, { "epoch": 1.2214443625644804, "grad_norm": 0.3637048304080963, "learning_rate": 7.389564623278492e-06, "loss": 0.4017, "step": 1105 }, { "epoch": 1.2225497420781135, "grad_norm": 0.4011034071445465, "learning_rate": 7.383907730957618e-06, "loss": 0.4799, "step": 1106 }, { "epoch": 1.2236551215917464, "grad_norm": 0.4230492115020752, "learning_rate": 7.378246886704638e-06, "loss": 0.4427, "step": 1107 }, { "epoch": 1.2247605011053795, "grad_norm": 0.42504703998565674, "learning_rate": 7.372582099903841e-06, "loss": 0.4476, "step": 1108 }, { "epoch": 1.2258658806190126, "grad_norm": 0.4368453919887543, "learning_rate": 7.366913379946044e-06, "loss": 0.4701, "step": 1109 }, { "epoch": 1.2269712601326455, "grad_norm": 0.3973553478717804, "learning_rate": 7.361240736228594e-06, "loss": 0.4633, "step": 1110 }, { "epoch": 1.2280766396462786, "grad_norm": 0.4286786615848541, "learning_rate": 7.355564178155335e-06, "loss": 0.4258, "step": 1111 }, { "epoch": 1.2291820191599117, "grad_norm": 0.4013882279396057, "learning_rate": 7.349883715136601e-06, "loss": 0.4706, "step": 1112 }, { "epoch": 1.2302873986735445, "grad_norm": 0.4033791720867157, "learning_rate": 7.344199356589204e-06, "loss": 0.5045, "step": 1113 }, { "epoch": 1.2313927781871776, "grad_norm": 0.36437901854515076, "learning_rate": 7.3385111119364105e-06, "loss": 0.3975, "step": 1114 }, { "epoch": 1.2324981577008107, "grad_norm": 0.41906335949897766, "learning_rate": 7.332818990607929e-06, "loss": 0.4633, "step": 1115 }, { "epoch": 1.2336035372144436, "grad_norm": 0.37997010350227356, "learning_rate": 7.327123002039897e-06, "loss": 0.4279, "step": 1116 }, { "epoch": 1.2347089167280767, "grad_norm": 0.4413769543170929, "learning_rate": 7.321423155674858e-06, "loss": 0.518, "step": 1117 }, { "epoch": 1.2358142962417096, "grad_norm": 0.3519197702407837, "learning_rate": 7.315719460961757e-06, "loss": 0.426, "step": 1118 }, { "epoch": 1.2369196757553427, "grad_norm": 0.4014701247215271, "learning_rate": 7.310011927355913e-06, "loss": 0.4776, "step": 1119 }, { "epoch": 1.2380250552689758, "grad_norm": 0.42840901017189026, "learning_rate": 7.304300564319013e-06, "loss": 0.4968, "step": 1120 }, { "epoch": 1.2391304347826086, "grad_norm": 0.34531742334365845, "learning_rate": 7.2985853813190935e-06, "loss": 0.4519, "step": 1121 }, { "epoch": 1.2402358142962417, "grad_norm": 0.43007877469062805, "learning_rate": 7.292866387830515e-06, "loss": 0.4889, "step": 1122 }, { "epoch": 1.2413411938098746, "grad_norm": 0.3916306793689728, "learning_rate": 7.287143593333967e-06, "loss": 0.408, "step": 1123 }, { "epoch": 1.2424465733235077, "grad_norm": 0.4449232220649719, "learning_rate": 7.281417007316427e-06, "loss": 0.5036, "step": 1124 }, { "epoch": 1.2435519528371408, "grad_norm": 0.39692071080207825, "learning_rate": 7.275686639271171e-06, "loss": 0.4438, "step": 1125 }, { "epoch": 1.2446573323507737, "grad_norm": 0.40539419651031494, "learning_rate": 7.269952498697734e-06, "loss": 0.4394, "step": 1126 }, { "epoch": 1.2457627118644068, "grad_norm": 0.47820109128952026, "learning_rate": 7.264214595101913e-06, "loss": 0.4603, "step": 1127 }, { "epoch": 1.2468680913780399, "grad_norm": 0.3846510946750641, "learning_rate": 7.258472937995736e-06, "loss": 0.4387, "step": 1128 }, { "epoch": 1.2479734708916728, "grad_norm": 0.43571117520332336, "learning_rate": 7.252727536897459e-06, "loss": 0.455, "step": 1129 }, { "epoch": 1.2490788504053059, "grad_norm": 0.41596755385398865, "learning_rate": 7.246978401331543e-06, "loss": 0.4127, "step": 1130 }, { "epoch": 1.250184229918939, "grad_norm": 0.43149393796920776, "learning_rate": 7.241225540828638e-06, "loss": 0.4737, "step": 1131 }, { "epoch": 1.2512896094325718, "grad_norm": 0.41112470626831055, "learning_rate": 7.235468964925571e-06, "loss": 0.4467, "step": 1132 }, { "epoch": 1.252394988946205, "grad_norm": 0.460287868976593, "learning_rate": 7.229708683165326e-06, "loss": 0.463, "step": 1133 }, { "epoch": 1.2535003684598378, "grad_norm": 0.37186431884765625, "learning_rate": 7.223944705097035e-06, "loss": 0.4504, "step": 1134 }, { "epoch": 1.254605747973471, "grad_norm": 0.40952861309051514, "learning_rate": 7.218177040275951e-06, "loss": 0.4305, "step": 1135 }, { "epoch": 1.2557111274871038, "grad_norm": 0.4489339590072632, "learning_rate": 7.212405698263446e-06, "loss": 0.4672, "step": 1136 }, { "epoch": 1.2568165070007369, "grad_norm": 0.41202783584594727, "learning_rate": 7.206630688626981e-06, "loss": 0.4553, "step": 1137 }, { "epoch": 1.25792188651437, "grad_norm": 0.4717872142791748, "learning_rate": 7.200852020940102e-06, "loss": 0.4469, "step": 1138 }, { "epoch": 1.2590272660280029, "grad_norm": 0.37709513306617737, "learning_rate": 7.195069704782418e-06, "loss": 0.396, "step": 1139 }, { "epoch": 1.260132645541636, "grad_norm": 0.4893186390399933, "learning_rate": 7.189283749739584e-06, "loss": 0.4648, "step": 1140 }, { "epoch": 1.261238025055269, "grad_norm": 0.4218249022960663, "learning_rate": 7.183494165403288e-06, "loss": 0.456, "step": 1141 }, { "epoch": 1.262343404568902, "grad_norm": 0.4985559582710266, "learning_rate": 7.177700961371239e-06, "loss": 0.4859, "step": 1142 }, { "epoch": 1.263448784082535, "grad_norm": 0.385615736246109, "learning_rate": 7.1719041472471394e-06, "loss": 0.4335, "step": 1143 }, { "epoch": 1.2645541635961681, "grad_norm": 0.36673903465270996, "learning_rate": 7.1661037326406825e-06, "loss": 0.4476, "step": 1144 }, { "epoch": 1.265659543109801, "grad_norm": 0.4200752079486847, "learning_rate": 7.160299727167526e-06, "loss": 0.4671, "step": 1145 }, { "epoch": 1.266764922623434, "grad_norm": 0.3806777000427246, "learning_rate": 7.154492140449283e-06, "loss": 0.4351, "step": 1146 }, { "epoch": 1.2678703021370672, "grad_norm": 0.3654707968235016, "learning_rate": 7.148680982113502e-06, "loss": 0.4285, "step": 1147 }, { "epoch": 1.2689756816507, "grad_norm": 0.43061330914497375, "learning_rate": 7.142866261793651e-06, "loss": 0.4787, "step": 1148 }, { "epoch": 1.2700810611643332, "grad_norm": 0.3674818277359009, "learning_rate": 7.137047989129108e-06, "loss": 0.421, "step": 1149 }, { "epoch": 1.271186440677966, "grad_norm": 0.46230852603912354, "learning_rate": 7.1312261737651354e-06, "loss": 0.4917, "step": 1150 }, { "epoch": 1.2722918201915991, "grad_norm": 0.4178174138069153, "learning_rate": 7.125400825352869e-06, "loss": 0.4401, "step": 1151 }, { "epoch": 1.273397199705232, "grad_norm": 0.4159031808376312, "learning_rate": 7.119571953549305e-06, "loss": 0.4928, "step": 1152 }, { "epoch": 1.2745025792188651, "grad_norm": 0.44731152057647705, "learning_rate": 7.113739568017272e-06, "loss": 0.5128, "step": 1153 }, { "epoch": 1.2756079587324982, "grad_norm": 0.37007468938827515, "learning_rate": 7.107903678425436e-06, "loss": 0.427, "step": 1154 }, { "epoch": 1.276713338246131, "grad_norm": 0.47471725940704346, "learning_rate": 7.102064294448261e-06, "loss": 0.4737, "step": 1155 }, { "epoch": 1.2778187177597642, "grad_norm": 0.37566375732421875, "learning_rate": 7.09622142576601e-06, "loss": 0.448, "step": 1156 }, { "epoch": 1.2789240972733973, "grad_norm": 0.4613412916660309, "learning_rate": 7.0903750820647175e-06, "loss": 0.4765, "step": 1157 }, { "epoch": 1.2800294767870302, "grad_norm": 0.4201585054397583, "learning_rate": 7.084525273036187e-06, "loss": 0.4473, "step": 1158 }, { "epoch": 1.2811348563006633, "grad_norm": 0.41340765357017517, "learning_rate": 7.078672008377958e-06, "loss": 0.447, "step": 1159 }, { "epoch": 1.2822402358142964, "grad_norm": 0.40777838230133057, "learning_rate": 7.072815297793303e-06, "loss": 0.4539, "step": 1160 }, { "epoch": 1.2833456153279292, "grad_norm": 0.4346905052661896, "learning_rate": 7.066955150991207e-06, "loss": 0.4545, "step": 1161 }, { "epoch": 1.2844509948415623, "grad_norm": 0.39979392290115356, "learning_rate": 7.061091577686349e-06, "loss": 0.4612, "step": 1162 }, { "epoch": 1.2855563743551952, "grad_norm": 0.41313478350639343, "learning_rate": 7.055224587599092e-06, "loss": 0.4784, "step": 1163 }, { "epoch": 1.2866617538688283, "grad_norm": 0.3834293484687805, "learning_rate": 7.0493541904554605e-06, "loss": 0.4351, "step": 1164 }, { "epoch": 1.2877671333824612, "grad_norm": 0.39407458901405334, "learning_rate": 7.043480395987128e-06, "loss": 0.4922, "step": 1165 }, { "epoch": 1.2888725128960943, "grad_norm": 0.3778684735298157, "learning_rate": 7.037603213931397e-06, "loss": 0.4106, "step": 1166 }, { "epoch": 1.2899778924097274, "grad_norm": 0.39143848419189453, "learning_rate": 7.031722654031192e-06, "loss": 0.4622, "step": 1167 }, { "epoch": 1.2910832719233603, "grad_norm": 0.4490334689617157, "learning_rate": 7.025838726035032e-06, "loss": 0.4337, "step": 1168 }, { "epoch": 1.2921886514369934, "grad_norm": 0.39536529779434204, "learning_rate": 7.019951439697021e-06, "loss": 0.4417, "step": 1169 }, { "epoch": 1.2932940309506264, "grad_norm": 0.45170509815216064, "learning_rate": 7.014060804776831e-06, "loss": 0.4661, "step": 1170 }, { "epoch": 1.2943994104642593, "grad_norm": 0.36763206124305725, "learning_rate": 7.008166831039681e-06, "loss": 0.4424, "step": 1171 }, { "epoch": 1.2955047899778924, "grad_norm": 0.4107193052768707, "learning_rate": 7.002269528256334e-06, "loss": 0.4694, "step": 1172 }, { "epoch": 1.2966101694915255, "grad_norm": 0.4289482831954956, "learning_rate": 6.99636890620306e-06, "loss": 0.4714, "step": 1173 }, { "epoch": 1.2977155490051584, "grad_norm": 0.37017086148262024, "learning_rate": 6.990464974661644e-06, "loss": 0.454, "step": 1174 }, { "epoch": 1.2988209285187915, "grad_norm": 0.38968926668167114, "learning_rate": 6.984557743419347e-06, "loss": 0.387, "step": 1175 }, { "epoch": 1.2999263080324246, "grad_norm": 0.41898199915885925, "learning_rate": 6.978647222268904e-06, "loss": 0.453, "step": 1176 }, { "epoch": 1.3010316875460575, "grad_norm": 0.37095993757247925, "learning_rate": 6.972733421008505e-06, "loss": 0.4785, "step": 1177 }, { "epoch": 1.3021370670596906, "grad_norm": 0.34932854771614075, "learning_rate": 6.9668163494417775e-06, "loss": 0.4057, "step": 1178 }, { "epoch": 1.3032424465733234, "grad_norm": 0.4311746656894684, "learning_rate": 6.960896017377767e-06, "loss": 0.5042, "step": 1179 }, { "epoch": 1.3043478260869565, "grad_norm": 0.382388174533844, "learning_rate": 6.954972434630928e-06, "loss": 0.4363, "step": 1180 }, { "epoch": 1.3054532056005894, "grad_norm": 0.4223378896713257, "learning_rate": 6.949045611021102e-06, "loss": 0.4423, "step": 1181 }, { "epoch": 1.3065585851142225, "grad_norm": 0.45198631286621094, "learning_rate": 6.943115556373503e-06, "loss": 0.4429, "step": 1182 }, { "epoch": 1.3076639646278556, "grad_norm": 0.401805579662323, "learning_rate": 6.937182280518701e-06, "loss": 0.4409, "step": 1183 }, { "epoch": 1.3087693441414885, "grad_norm": 0.41899973154067993, "learning_rate": 6.93124579329261e-06, "loss": 0.4892, "step": 1184 }, { "epoch": 1.3098747236551216, "grad_norm": 0.41413789987564087, "learning_rate": 6.925306104536461e-06, "loss": 0.4387, "step": 1185 }, { "epoch": 1.3109801031687547, "grad_norm": 0.4049571752548218, "learning_rate": 6.919363224096797e-06, "loss": 0.4394, "step": 1186 }, { "epoch": 1.3120854826823876, "grad_norm": 0.39611250162124634, "learning_rate": 6.913417161825449e-06, "loss": 0.4642, "step": 1187 }, { "epoch": 1.3131908621960207, "grad_norm": 0.45123374462127686, "learning_rate": 6.907467927579528e-06, "loss": 0.4511, "step": 1188 }, { "epoch": 1.3142962417096538, "grad_norm": 0.4182085394859314, "learning_rate": 6.9015155312213945e-06, "loss": 0.4655, "step": 1189 }, { "epoch": 1.3154016212232866, "grad_norm": 0.4151879549026489, "learning_rate": 6.8955599826186606e-06, "loss": 0.4229, "step": 1190 }, { "epoch": 1.3165070007369197, "grad_norm": 0.37806180119514465, "learning_rate": 6.889601291644156e-06, "loss": 0.4527, "step": 1191 }, { "epoch": 1.3176123802505528, "grad_norm": 0.43100041151046753, "learning_rate": 6.883639468175926e-06, "loss": 0.4363, "step": 1192 }, { "epoch": 1.3187177597641857, "grad_norm": 0.4266984462738037, "learning_rate": 6.8776745220972054e-06, "loss": 0.4689, "step": 1193 }, { "epoch": 1.3198231392778186, "grad_norm": 0.40104153752326965, "learning_rate": 6.871706463296407e-06, "loss": 0.4602, "step": 1194 }, { "epoch": 1.3209285187914517, "grad_norm": 0.3745673894882202, "learning_rate": 6.865735301667101e-06, "loss": 0.4466, "step": 1195 }, { "epoch": 1.3220338983050848, "grad_norm": 0.39226603507995605, "learning_rate": 6.859761047108007e-06, "loss": 0.4753, "step": 1196 }, { "epoch": 1.3231392778187177, "grad_norm": 0.4076964855194092, "learning_rate": 6.853783709522963e-06, "loss": 0.4573, "step": 1197 }, { "epoch": 1.3242446573323507, "grad_norm": 0.4391975700855255, "learning_rate": 6.847803298820927e-06, "loss": 0.4527, "step": 1198 }, { "epoch": 1.3253500368459838, "grad_norm": 0.41713201999664307, "learning_rate": 6.841819824915947e-06, "loss": 0.4504, "step": 1199 }, { "epoch": 1.3264554163596167, "grad_norm": 0.39618778228759766, "learning_rate": 6.835833297727148e-06, "loss": 0.4177, "step": 1200 }, { "epoch": 1.3275607958732498, "grad_norm": 0.431017130613327, "learning_rate": 6.82984372717872e-06, "loss": 0.4897, "step": 1201 }, { "epoch": 1.328666175386883, "grad_norm": 0.4216987192630768, "learning_rate": 6.823851123199894e-06, "loss": 0.4554, "step": 1202 }, { "epoch": 1.3297715549005158, "grad_norm": 0.40307381749153137, "learning_rate": 6.817855495724934e-06, "loss": 0.4637, "step": 1203 }, { "epoch": 1.330876934414149, "grad_norm": 0.36674609780311584, "learning_rate": 6.811856854693114e-06, "loss": 0.4241, "step": 1204 }, { "epoch": 1.331982313927782, "grad_norm": 0.3713102638721466, "learning_rate": 6.805855210048704e-06, "loss": 0.433, "step": 1205 }, { "epoch": 1.3330876934414149, "grad_norm": 0.4308029115200043, "learning_rate": 6.799850571740955e-06, "loss": 0.477, "step": 1206 }, { "epoch": 1.334193072955048, "grad_norm": 0.4130851626396179, "learning_rate": 6.793842949724074e-06, "loss": 0.4503, "step": 1207 }, { "epoch": 1.3352984524686808, "grad_norm": 0.4645656645298004, "learning_rate": 6.787832353957225e-06, "loss": 0.488, "step": 1208 }, { "epoch": 1.336403831982314, "grad_norm": 0.38922467827796936, "learning_rate": 6.7818187944044915e-06, "loss": 0.4403, "step": 1209 }, { "epoch": 1.3375092114959468, "grad_norm": 0.3940206468105316, "learning_rate": 6.775802281034876e-06, "loss": 0.4915, "step": 1210 }, { "epoch": 1.33861459100958, "grad_norm": 0.37230926752090454, "learning_rate": 6.7697828238222784e-06, "loss": 0.4444, "step": 1211 }, { "epoch": 1.339719970523213, "grad_norm": 0.40626221895217896, "learning_rate": 6.763760432745475e-06, "loss": 0.4944, "step": 1212 }, { "epoch": 1.3408253500368459, "grad_norm": 0.3579564392566681, "learning_rate": 6.7577351177881065e-06, "loss": 0.4499, "step": 1213 }, { "epoch": 1.341930729550479, "grad_norm": 0.35179683566093445, "learning_rate": 6.751706888938665e-06, "loss": 0.4172, "step": 1214 }, { "epoch": 1.343036109064112, "grad_norm": 0.4297713041305542, "learning_rate": 6.7456757561904666e-06, "loss": 0.4688, "step": 1215 }, { "epoch": 1.344141488577745, "grad_norm": 0.4195805490016937, "learning_rate": 6.739641729541645e-06, "loss": 0.44, "step": 1216 }, { "epoch": 1.345246868091378, "grad_norm": 0.35681959986686707, "learning_rate": 6.733604818995133e-06, "loss": 0.4425, "step": 1217 }, { "epoch": 1.3463522476050112, "grad_norm": 0.4117702841758728, "learning_rate": 6.72756503455864e-06, "loss": 0.4041, "step": 1218 }, { "epoch": 1.347457627118644, "grad_norm": 0.4653003215789795, "learning_rate": 6.721522386244642e-06, "loss": 0.471, "step": 1219 }, { "epoch": 1.3485630066322771, "grad_norm": 0.4609624445438385, "learning_rate": 6.715476884070362e-06, "loss": 0.4605, "step": 1220 }, { "epoch": 1.3496683861459102, "grad_norm": 0.460163950920105, "learning_rate": 6.709428538057756e-06, "loss": 0.4572, "step": 1221 }, { "epoch": 1.350773765659543, "grad_norm": 0.3800603449344635, "learning_rate": 6.703377358233489e-06, "loss": 0.4355, "step": 1222 }, { "epoch": 1.3518791451731762, "grad_norm": 0.43823617696762085, "learning_rate": 6.697323354628932e-06, "loss": 0.4044, "step": 1223 }, { "epoch": 1.352984524686809, "grad_norm": 0.42566922307014465, "learning_rate": 6.691266537280128e-06, "loss": 0.4803, "step": 1224 }, { "epoch": 1.3540899042004422, "grad_norm": 0.42597711086273193, "learning_rate": 6.6852069162277935e-06, "loss": 0.4406, "step": 1225 }, { "epoch": 1.355195283714075, "grad_norm": 0.4112999141216278, "learning_rate": 6.679144501517283e-06, "loss": 0.4459, "step": 1226 }, { "epoch": 1.3563006632277081, "grad_norm": 0.4186733365058899, "learning_rate": 6.673079303198591e-06, "loss": 0.4725, "step": 1227 }, { "epoch": 1.3574060427413412, "grad_norm": 0.4236551821231842, "learning_rate": 6.667011331326324e-06, "loss": 0.3982, "step": 1228 }, { "epoch": 1.3585114222549741, "grad_norm": 0.5090063810348511, "learning_rate": 6.660940595959683e-06, "loss": 0.5224, "step": 1229 }, { "epoch": 1.3596168017686072, "grad_norm": 0.3885136842727661, "learning_rate": 6.654867107162454e-06, "loss": 0.4325, "step": 1230 }, { "epoch": 1.3607221812822403, "grad_norm": 0.4611840546131134, "learning_rate": 6.648790875002985e-06, "loss": 0.4332, "step": 1231 }, { "epoch": 1.3618275607958732, "grad_norm": 0.4711607098579407, "learning_rate": 6.6427119095541745e-06, "loss": 0.4557, "step": 1232 }, { "epoch": 1.3629329403095063, "grad_norm": 0.42332723736763, "learning_rate": 6.6366302208934496e-06, "loss": 0.5103, "step": 1233 }, { "epoch": 1.3640383198231394, "grad_norm": 0.40063685178756714, "learning_rate": 6.6305458191027525e-06, "loss": 0.4421, "step": 1234 }, { "epoch": 1.3651436993367723, "grad_norm": 0.4578125476837158, "learning_rate": 6.6244587142685245e-06, "loss": 0.4623, "step": 1235 }, { "epoch": 1.3662490788504054, "grad_norm": 0.3799440562725067, "learning_rate": 6.618368916481686e-06, "loss": 0.3986, "step": 1236 }, { "epoch": 1.3673544583640382, "grad_norm": 0.4588302671909332, "learning_rate": 6.612276435837622e-06, "loss": 0.4735, "step": 1237 }, { "epoch": 1.3684598378776713, "grad_norm": 0.41177505254745483, "learning_rate": 6.606181282436166e-06, "loss": 0.4949, "step": 1238 }, { "epoch": 1.3695652173913042, "grad_norm": 0.3942297101020813, "learning_rate": 6.600083466381583e-06, "loss": 0.4063, "step": 1239 }, { "epoch": 1.3706705969049373, "grad_norm": 0.4159407913684845, "learning_rate": 6.593982997782549e-06, "loss": 0.4204, "step": 1240 }, { "epoch": 1.3717759764185704, "grad_norm": 0.4077970087528229, "learning_rate": 6.58787988675214e-06, "loss": 0.4527, "step": 1241 }, { "epoch": 1.3728813559322033, "grad_norm": 0.4068014621734619, "learning_rate": 6.58177414340781e-06, "loss": 0.4221, "step": 1242 }, { "epoch": 1.3739867354458364, "grad_norm": 0.38260531425476074, "learning_rate": 6.5756657778713795e-06, "loss": 0.4373, "step": 1243 }, { "epoch": 1.3750921149594695, "grad_norm": 0.4347043037414551, "learning_rate": 6.569554800269014e-06, "loss": 0.4692, "step": 1244 }, { "epoch": 1.3761974944731024, "grad_norm": 0.41565826535224915, "learning_rate": 6.563441220731213e-06, "loss": 0.4391, "step": 1245 }, { "epoch": 1.3773028739867355, "grad_norm": 0.44535741209983826, "learning_rate": 6.557325049392781e-06, "loss": 0.4585, "step": 1246 }, { "epoch": 1.3784082535003686, "grad_norm": 0.37292203307151794, "learning_rate": 6.551206296392827e-06, "loss": 0.4371, "step": 1247 }, { "epoch": 1.3795136330140014, "grad_norm": 0.44108831882476807, "learning_rate": 6.545084971874738e-06, "loss": 0.4973, "step": 1248 }, { "epoch": 1.3806190125276345, "grad_norm": 0.3842078447341919, "learning_rate": 6.53896108598616e-06, "loss": 0.4132, "step": 1249 }, { "epoch": 1.3817243920412676, "grad_norm": 0.4204309582710266, "learning_rate": 6.532834648878993e-06, "loss": 0.4788, "step": 1250 }, { "epoch": 1.3828297715549005, "grad_norm": 0.45064839720726013, "learning_rate": 6.526705670709357e-06, "loss": 0.4759, "step": 1251 }, { "epoch": 1.3839351510685336, "grad_norm": 0.39389848709106445, "learning_rate": 6.520574161637591e-06, "loss": 0.4774, "step": 1252 }, { "epoch": 1.3850405305821665, "grad_norm": 0.41006165742874146, "learning_rate": 6.514440131828224e-06, "loss": 0.4324, "step": 1253 }, { "epoch": 1.3861459100957996, "grad_norm": 0.4087454676628113, "learning_rate": 6.5083035914499736e-06, "loss": 0.4138, "step": 1254 }, { "epoch": 1.3872512896094324, "grad_norm": 0.4277544319629669, "learning_rate": 6.502164550675707e-06, "loss": 0.5057, "step": 1255 }, { "epoch": 1.3883566691230655, "grad_norm": 0.38144564628601074, "learning_rate": 6.496023019682447e-06, "loss": 0.4034, "step": 1256 }, { "epoch": 1.3894620486366986, "grad_norm": 0.4453551471233368, "learning_rate": 6.4898790086513366e-06, "loss": 0.4904, "step": 1257 }, { "epoch": 1.3905674281503315, "grad_norm": 0.4054315686225891, "learning_rate": 6.483732527767633e-06, "loss": 0.4477, "step": 1258 }, { "epoch": 1.3916728076639646, "grad_norm": 0.3850966691970825, "learning_rate": 6.477583587220691e-06, "loss": 0.4203, "step": 1259 }, { "epoch": 1.3927781871775977, "grad_norm": 0.4349023401737213, "learning_rate": 6.4714321972039395e-06, "loss": 0.419, "step": 1260 }, { "epoch": 1.3938835666912306, "grad_norm": 0.39026540517807007, "learning_rate": 6.465278367914869e-06, "loss": 0.4449, "step": 1261 }, { "epoch": 1.3949889462048637, "grad_norm": 0.4875854253768921, "learning_rate": 6.459122109555011e-06, "loss": 0.4896, "step": 1262 }, { "epoch": 1.3960943257184968, "grad_norm": 0.3945048749446869, "learning_rate": 6.452963432329928e-06, "loss": 0.4161, "step": 1263 }, { "epoch": 1.3971997052321297, "grad_norm": 0.47127753496170044, "learning_rate": 6.4468023464491906e-06, "loss": 0.5026, "step": 1264 }, { "epoch": 1.3983050847457628, "grad_norm": 0.3621992766857147, "learning_rate": 6.4406388621263624e-06, "loss": 0.408, "step": 1265 }, { "epoch": 1.3994104642593959, "grad_norm": 0.5135603547096252, "learning_rate": 6.434472989578983e-06, "loss": 0.4536, "step": 1266 }, { "epoch": 1.4005158437730287, "grad_norm": 0.39922234416007996, "learning_rate": 6.42830473902855e-06, "loss": 0.4795, "step": 1267 }, { "epoch": 1.4016212232866618, "grad_norm": 0.39839431643486023, "learning_rate": 6.422134120700506e-06, "loss": 0.4883, "step": 1268 }, { "epoch": 1.4027266028002947, "grad_norm": 0.42837029695510864, "learning_rate": 6.4159611448242144e-06, "loss": 0.4756, "step": 1269 }, { "epoch": 1.4038319823139278, "grad_norm": 0.41109856963157654, "learning_rate": 6.409785821632952e-06, "loss": 0.4774, "step": 1270 }, { "epoch": 1.4049373618275607, "grad_norm": 0.37480178475379944, "learning_rate": 6.403608161363883e-06, "loss": 0.4293, "step": 1271 }, { "epoch": 1.4060427413411938, "grad_norm": 0.4513096213340759, "learning_rate": 6.397428174258048e-06, "loss": 0.4438, "step": 1272 }, { "epoch": 1.4071481208548269, "grad_norm": 0.44606131315231323, "learning_rate": 6.391245870560343e-06, "loss": 0.5145, "step": 1273 }, { "epoch": 1.4082535003684598, "grad_norm": 0.3226590156555176, "learning_rate": 6.385061260519507e-06, "loss": 0.3781, "step": 1274 }, { "epoch": 1.4093588798820929, "grad_norm": 0.5156649351119995, "learning_rate": 6.378874354388099e-06, "loss": 0.4511, "step": 1275 }, { "epoch": 1.410464259395726, "grad_norm": 0.4259345531463623, "learning_rate": 6.3726851624224875e-06, "loss": 0.4511, "step": 1276 }, { "epoch": 1.4115696389093588, "grad_norm": 0.36743488907814026, "learning_rate": 6.3664936948828296e-06, "loss": 0.446, "step": 1277 }, { "epoch": 1.412675018422992, "grad_norm": 0.5175212025642395, "learning_rate": 6.360299962033051e-06, "loss": 0.4794, "step": 1278 }, { "epoch": 1.413780397936625, "grad_norm": 0.40029630064964294, "learning_rate": 6.35410397414084e-06, "loss": 0.4372, "step": 1279 }, { "epoch": 1.414885777450258, "grad_norm": 0.42148104310035706, "learning_rate": 6.347905741477613e-06, "loss": 0.4569, "step": 1280 }, { "epoch": 1.415991156963891, "grad_norm": 0.4845616817474365, "learning_rate": 6.341705274318522e-06, "loss": 0.499, "step": 1281 }, { "epoch": 1.4170965364775239, "grad_norm": 0.37208157777786255, "learning_rate": 6.335502582942409e-06, "loss": 0.3868, "step": 1282 }, { "epoch": 1.418201915991157, "grad_norm": 0.42013654112815857, "learning_rate": 6.3292976776318125e-06, "loss": 0.4545, "step": 1283 }, { "epoch": 1.4193072955047898, "grad_norm": 0.41186368465423584, "learning_rate": 6.323090568672935e-06, "loss": 0.4434, "step": 1284 }, { "epoch": 1.420412675018423, "grad_norm": 0.38058048486709595, "learning_rate": 6.316881266355638e-06, "loss": 0.4589, "step": 1285 }, { "epoch": 1.421518054532056, "grad_norm": 0.41723689436912537, "learning_rate": 6.310669780973414e-06, "loss": 0.4627, "step": 1286 }, { "epoch": 1.422623434045689, "grad_norm": 0.3623057007789612, "learning_rate": 6.304456122823377e-06, "loss": 0.4515, "step": 1287 }, { "epoch": 1.423728813559322, "grad_norm": 0.4193348288536072, "learning_rate": 6.298240302206242e-06, "loss": 0.4621, "step": 1288 }, { "epoch": 1.4248341930729551, "grad_norm": 0.43340617418289185, "learning_rate": 6.292022329426309e-06, "loss": 0.4151, "step": 1289 }, { "epoch": 1.425939572586588, "grad_norm": 0.41336315870285034, "learning_rate": 6.285802214791448e-06, "loss": 0.507, "step": 1290 }, { "epoch": 1.427044952100221, "grad_norm": 0.4070287048816681, "learning_rate": 6.279579968613074e-06, "loss": 0.4548, "step": 1291 }, { "epoch": 1.4281503316138542, "grad_norm": 0.4322182238101959, "learning_rate": 6.273355601206143e-06, "loss": 0.4505, "step": 1292 }, { "epoch": 1.429255711127487, "grad_norm": 0.3746210038661957, "learning_rate": 6.267129122889124e-06, "loss": 0.4013, "step": 1293 }, { "epoch": 1.4303610906411202, "grad_norm": 0.38119253516197205, "learning_rate": 6.260900543983982e-06, "loss": 0.4282, "step": 1294 }, { "epoch": 1.4314664701547533, "grad_norm": 0.4288400411605835, "learning_rate": 6.254669874816171e-06, "loss": 0.4763, "step": 1295 }, { "epoch": 1.4325718496683861, "grad_norm": 0.37199071049690247, "learning_rate": 6.2484371257146e-06, "loss": 0.4639, "step": 1296 }, { "epoch": 1.4336772291820192, "grad_norm": 0.41369983553886414, "learning_rate": 6.24220230701164e-06, "loss": 0.478, "step": 1297 }, { "epoch": 1.434782608695652, "grad_norm": 0.33851873874664307, "learning_rate": 6.235965429043082e-06, "loss": 0.3868, "step": 1298 }, { "epoch": 1.4358879882092852, "grad_norm": 0.37912717461586, "learning_rate": 6.229726502148135e-06, "loss": 0.464, "step": 1299 }, { "epoch": 1.436993367722918, "grad_norm": 0.440818727016449, "learning_rate": 6.223485536669401e-06, "loss": 0.4583, "step": 1300 }, { "epoch": 1.4380987472365512, "grad_norm": 0.39888861775398254, "learning_rate": 6.217242542952868e-06, "loss": 0.4449, "step": 1301 }, { "epoch": 1.4392041267501843, "grad_norm": 0.40044310688972473, "learning_rate": 6.210997531347879e-06, "loss": 0.4809, "step": 1302 }, { "epoch": 1.4403095062638172, "grad_norm": 0.3388465344905853, "learning_rate": 6.20475051220713e-06, "loss": 0.4107, "step": 1303 }, { "epoch": 1.4414148857774502, "grad_norm": 0.39669695496559143, "learning_rate": 6.1985014958866386e-06, "loss": 0.4778, "step": 1304 }, { "epoch": 1.4425202652910833, "grad_norm": 0.34392353892326355, "learning_rate": 6.192250492745734e-06, "loss": 0.4154, "step": 1305 }, { "epoch": 1.4436256448047162, "grad_norm": 0.4101218581199646, "learning_rate": 6.185997513147043e-06, "loss": 0.4176, "step": 1306 }, { "epoch": 1.4447310243183493, "grad_norm": 0.3739987909793854, "learning_rate": 6.179742567456464e-06, "loss": 0.4587, "step": 1307 }, { "epoch": 1.4458364038319824, "grad_norm": 0.3648110032081604, "learning_rate": 6.173485666043159e-06, "loss": 0.475, "step": 1308 }, { "epoch": 1.4469417833456153, "grad_norm": 0.44394350051879883, "learning_rate": 6.1672268192795285e-06, "loss": 0.4866, "step": 1309 }, { "epoch": 1.4480471628592484, "grad_norm": 0.4004497230052948, "learning_rate": 6.160966037541201e-06, "loss": 0.4759, "step": 1310 }, { "epoch": 1.4491525423728815, "grad_norm": 0.36648663878440857, "learning_rate": 6.154703331207008e-06, "loss": 0.4409, "step": 1311 }, { "epoch": 1.4502579218865144, "grad_norm": 0.40531179308891296, "learning_rate": 6.148438710658979e-06, "loss": 0.3934, "step": 1312 }, { "epoch": 1.4513633014001475, "grad_norm": 0.39420628547668457, "learning_rate": 6.142172186282309e-06, "loss": 0.4869, "step": 1313 }, { "epoch": 1.4524686809137803, "grad_norm": 0.4386855363845825, "learning_rate": 6.135903768465356e-06, "loss": 0.4653, "step": 1314 }, { "epoch": 1.4535740604274134, "grad_norm": 0.34116625785827637, "learning_rate": 6.129633467599611e-06, "loss": 0.386, "step": 1315 }, { "epoch": 1.4546794399410463, "grad_norm": 0.4136148989200592, "learning_rate": 6.123361294079691e-06, "loss": 0.4835, "step": 1316 }, { "epoch": 1.4557848194546794, "grad_norm": 0.40554729104042053, "learning_rate": 6.117087258303314e-06, "loss": 0.4359, "step": 1317 }, { "epoch": 1.4568901989683125, "grad_norm": 0.3758818805217743, "learning_rate": 6.110811370671286e-06, "loss": 0.4174, "step": 1318 }, { "epoch": 1.4579955784819454, "grad_norm": 0.4306005537509918, "learning_rate": 6.104533641587484e-06, "loss": 0.4916, "step": 1319 }, { "epoch": 1.4591009579955785, "grad_norm": 0.4034125506877899, "learning_rate": 6.098254081458839e-06, "loss": 0.4679, "step": 1320 }, { "epoch": 1.4602063375092116, "grad_norm": 0.4172939956188202, "learning_rate": 6.091972700695314e-06, "loss": 0.4263, "step": 1321 }, { "epoch": 1.4613117170228445, "grad_norm": 0.41148149967193604, "learning_rate": 6.085689509709893e-06, "loss": 0.4871, "step": 1322 }, { "epoch": 1.4624170965364776, "grad_norm": 0.3340839445590973, "learning_rate": 6.079404518918559e-06, "loss": 0.3883, "step": 1323 }, { "epoch": 1.4635224760501107, "grad_norm": 0.4464852213859558, "learning_rate": 6.07311773874028e-06, "loss": 0.4567, "step": 1324 }, { "epoch": 1.4646278555637435, "grad_norm": 0.4949098527431488, "learning_rate": 6.066829179596987e-06, "loss": 0.4693, "step": 1325 }, { "epoch": 1.4657332350773766, "grad_norm": 0.37640607357025146, "learning_rate": 6.060538851913568e-06, "loss": 0.4382, "step": 1326 }, { "epoch": 1.4668386145910095, "grad_norm": 0.47431182861328125, "learning_rate": 6.0542467661178325e-06, "loss": 0.4619, "step": 1327 }, { "epoch": 1.4679439941046426, "grad_norm": 0.40695661306381226, "learning_rate": 6.047952932640513e-06, "loss": 0.4604, "step": 1328 }, { "epoch": 1.4690493736182755, "grad_norm": 0.41739755868911743, "learning_rate": 6.041657361915234e-06, "loss": 0.4102, "step": 1329 }, { "epoch": 1.4701547531319086, "grad_norm": 0.430508017539978, "learning_rate": 6.035360064378504e-06, "loss": 0.5161, "step": 1330 }, { "epoch": 1.4712601326455417, "grad_norm": 0.3647215664386749, "learning_rate": 6.029061050469689e-06, "loss": 0.474, "step": 1331 }, { "epoch": 1.4723655121591745, "grad_norm": 0.39113396406173706, "learning_rate": 6.022760330631006e-06, "loss": 0.4569, "step": 1332 }, { "epoch": 1.4734708916728076, "grad_norm": 0.3872900903224945, "learning_rate": 6.016457915307494e-06, "loss": 0.4671, "step": 1333 }, { "epoch": 1.4745762711864407, "grad_norm": 0.4000808596611023, "learning_rate": 6.01015381494701e-06, "loss": 0.4605, "step": 1334 }, { "epoch": 1.4756816507000736, "grad_norm": 0.3802982270717621, "learning_rate": 6.003848040000196e-06, "loss": 0.4694, "step": 1335 }, { "epoch": 1.4767870302137067, "grad_norm": 0.3968397080898285, "learning_rate": 5.997540600920479e-06, "loss": 0.5087, "step": 1336 }, { "epoch": 1.4778924097273398, "grad_norm": 0.3828469514846802, "learning_rate": 5.991231508164037e-06, "loss": 0.423, "step": 1337 }, { "epoch": 1.4789977892409727, "grad_norm": 0.37646445631980896, "learning_rate": 5.984920772189793e-06, "loss": 0.4785, "step": 1338 }, { "epoch": 1.4801031687546058, "grad_norm": 0.3972923159599304, "learning_rate": 5.978608403459395e-06, "loss": 0.4259, "step": 1339 }, { "epoch": 1.481208548268239, "grad_norm": 0.4107617139816284, "learning_rate": 5.972294412437194e-06, "loss": 0.4459, "step": 1340 }, { "epoch": 1.4823139277818718, "grad_norm": 0.4165216088294983, "learning_rate": 5.965978809590235e-06, "loss": 0.4852, "step": 1341 }, { "epoch": 1.4834193072955049, "grad_norm": 0.41350728273391724, "learning_rate": 5.959661605388229e-06, "loss": 0.4461, "step": 1342 }, { "epoch": 1.4845246868091377, "grad_norm": 0.40955862402915955, "learning_rate": 5.953342810303551e-06, "loss": 0.4874, "step": 1343 }, { "epoch": 1.4856300663227708, "grad_norm": 0.3718957304954529, "learning_rate": 5.947022434811202e-06, "loss": 0.4532, "step": 1344 }, { "epoch": 1.4867354458364037, "grad_norm": 0.3858948051929474, "learning_rate": 5.940700489388811e-06, "loss": 0.428, "step": 1345 }, { "epoch": 1.4878408253500368, "grad_norm": 0.35791173577308655, "learning_rate": 5.934376984516608e-06, "loss": 0.4506, "step": 1346 }, { "epoch": 1.48894620486367, "grad_norm": 0.3629496097564697, "learning_rate": 5.928051930677404e-06, "loss": 0.4267, "step": 1347 }, { "epoch": 1.4900515843773028, "grad_norm": 0.42379075288772583, "learning_rate": 5.921725338356585e-06, "loss": 0.4958, "step": 1348 }, { "epoch": 1.4911569638909359, "grad_norm": 0.3714812099933624, "learning_rate": 5.915397218042081e-06, "loss": 0.4151, "step": 1349 }, { "epoch": 1.492262343404569, "grad_norm": 0.42653462290763855, "learning_rate": 5.909067580224359e-06, "loss": 0.4457, "step": 1350 }, { "epoch": 1.4933677229182019, "grad_norm": 0.41303884983062744, "learning_rate": 5.902736435396398e-06, "loss": 0.4391, "step": 1351 }, { "epoch": 1.494473102431835, "grad_norm": 0.4314262866973877, "learning_rate": 5.896403794053679e-06, "loss": 0.5166, "step": 1352 }, { "epoch": 1.495578481945468, "grad_norm": 0.4075848162174225, "learning_rate": 5.890069666694163e-06, "loss": 0.4175, "step": 1353 }, { "epoch": 1.496683861459101, "grad_norm": 0.42914697527885437, "learning_rate": 5.883734063818272e-06, "loss": 0.4572, "step": 1354 }, { "epoch": 1.497789240972734, "grad_norm": 0.4011940658092499, "learning_rate": 5.877396995928876e-06, "loss": 0.4455, "step": 1355 }, { "epoch": 1.4988946204863671, "grad_norm": 0.4104115664958954, "learning_rate": 5.871058473531273e-06, "loss": 0.4392, "step": 1356 }, { "epoch": 1.5, "grad_norm": 0.3702162206172943, "learning_rate": 5.864718507133176e-06, "loss": 0.4143, "step": 1357 }, { "epoch": 1.5011053795136329, "grad_norm": 0.39451539516448975, "learning_rate": 5.858377107244683e-06, "loss": 0.4703, "step": 1358 }, { "epoch": 1.502210759027266, "grad_norm": 0.40542301535606384, "learning_rate": 5.8520342843782785e-06, "loss": 0.4401, "step": 1359 }, { "epoch": 1.503316138540899, "grad_norm": 0.4206547141075134, "learning_rate": 5.845690049048799e-06, "loss": 0.4558, "step": 1360 }, { "epoch": 1.504421518054532, "grad_norm": 0.39446622133255005, "learning_rate": 5.839344411773425e-06, "loss": 0.4495, "step": 1361 }, { "epoch": 1.505526897568165, "grad_norm": 0.4484139084815979, "learning_rate": 5.83299738307166e-06, "loss": 0.4637, "step": 1362 }, { "epoch": 1.5066322770817981, "grad_norm": 0.3749121427536011, "learning_rate": 5.8266489734653175e-06, "loss": 0.4397, "step": 1363 }, { "epoch": 1.507737656595431, "grad_norm": 0.36431533098220825, "learning_rate": 5.820299193478496e-06, "loss": 0.4638, "step": 1364 }, { "epoch": 1.5088430361090641, "grad_norm": 0.3972304165363312, "learning_rate": 5.813948053637567e-06, "loss": 0.4561, "step": 1365 }, { "epoch": 1.5099484156226972, "grad_norm": 0.3986959457397461, "learning_rate": 5.807595564471157e-06, "loss": 0.4433, "step": 1366 }, { "epoch": 1.51105379513633, "grad_norm": 0.39401331543922424, "learning_rate": 5.801241736510128e-06, "loss": 0.4345, "step": 1367 }, { "epoch": 1.5121591746499632, "grad_norm": 0.35044094920158386, "learning_rate": 5.794886580287565e-06, "loss": 0.4478, "step": 1368 }, { "epoch": 1.5132645541635963, "grad_norm": 0.37380725145339966, "learning_rate": 5.78853010633875e-06, "loss": 0.4539, "step": 1369 }, { "epoch": 1.5143699336772292, "grad_norm": 0.45428985357284546, "learning_rate": 5.782172325201155e-06, "loss": 0.4884, "step": 1370 }, { "epoch": 1.515475313190862, "grad_norm": 0.38266661763191223, "learning_rate": 5.775813247414413e-06, "loss": 0.4509, "step": 1371 }, { "epoch": 1.5165806927044954, "grad_norm": 0.3774400055408478, "learning_rate": 5.76945288352031e-06, "loss": 0.4528, "step": 1372 }, { "epoch": 1.5176860722181282, "grad_norm": 0.35472041368484497, "learning_rate": 5.7630912440627625e-06, "loss": 0.4059, "step": 1373 }, { "epoch": 1.518791451731761, "grad_norm": 0.4012596309185028, "learning_rate": 5.756728339587806e-06, "loss": 0.4936, "step": 1374 }, { "epoch": 1.5198968312453942, "grad_norm": 0.344155490398407, "learning_rate": 5.750364180643565e-06, "loss": 0.4026, "step": 1375 }, { "epoch": 1.5210022107590273, "grad_norm": 0.3758469820022583, "learning_rate": 5.743998777780252e-06, "loss": 0.4601, "step": 1376 }, { "epoch": 1.5221075902726602, "grad_norm": 0.39537274837493896, "learning_rate": 5.7376321415501356e-06, "loss": 0.4592, "step": 1377 }, { "epoch": 1.5232129697862933, "grad_norm": 0.38407063484191895, "learning_rate": 5.731264282507531e-06, "loss": 0.4719, "step": 1378 }, { "epoch": 1.5243183492999264, "grad_norm": 0.37339651584625244, "learning_rate": 5.724895211208782e-06, "loss": 0.4825, "step": 1379 }, { "epoch": 1.5254237288135593, "grad_norm": 0.38245999813079834, "learning_rate": 5.71852493821224e-06, "loss": 0.4823, "step": 1380 }, { "epoch": 1.5265291083271924, "grad_norm": 0.37086495757102966, "learning_rate": 5.71215347407825e-06, "loss": 0.4228, "step": 1381 }, { "epoch": 1.5276344878408255, "grad_norm": 0.4456254243850708, "learning_rate": 5.7057808293691305e-06, "loss": 0.4697, "step": 1382 }, { "epoch": 1.5287398673544583, "grad_norm": 0.36977121233940125, "learning_rate": 5.699407014649159e-06, "loss": 0.4465, "step": 1383 }, { "epoch": 1.5298452468680914, "grad_norm": 0.3494488298892975, "learning_rate": 5.6930320404845475e-06, "loss": 0.4202, "step": 1384 }, { "epoch": 1.5309506263817245, "grad_norm": 0.40091851353645325, "learning_rate": 5.686655917443436e-06, "loss": 0.4808, "step": 1385 }, { "epoch": 1.5320560058953574, "grad_norm": 0.38189491629600525, "learning_rate": 5.680278656095868e-06, "loss": 0.4578, "step": 1386 }, { "epoch": 1.5331613854089903, "grad_norm": 0.4211690127849579, "learning_rate": 5.67390026701377e-06, "loss": 0.492, "step": 1387 }, { "epoch": 1.5342667649226236, "grad_norm": 0.3266015946865082, "learning_rate": 5.6675207607709426e-06, "loss": 0.4245, "step": 1388 }, { "epoch": 1.5353721444362565, "grad_norm": 0.3786960542201996, "learning_rate": 5.661140147943034e-06, "loss": 0.4524, "step": 1389 }, { "epoch": 1.5364775239498893, "grad_norm": 0.4003026485443115, "learning_rate": 5.654758439107533e-06, "loss": 0.4673, "step": 1390 }, { "epoch": 1.5375829034635224, "grad_norm": 0.36495688557624817, "learning_rate": 5.648375644843739e-06, "loss": 0.4626, "step": 1391 }, { "epoch": 1.5386882829771555, "grad_norm": 0.33502325415611267, "learning_rate": 5.641991775732756e-06, "loss": 0.4499, "step": 1392 }, { "epoch": 1.5397936624907884, "grad_norm": 0.35385942459106445, "learning_rate": 5.635606842357464e-06, "loss": 0.4262, "step": 1393 }, { "epoch": 1.5408990420044215, "grad_norm": 0.4498881697654724, "learning_rate": 5.629220855302513e-06, "loss": 0.4516, "step": 1394 }, { "epoch": 1.5420044215180546, "grad_norm": 0.4227311909198761, "learning_rate": 5.622833825154297e-06, "loss": 0.5076, "step": 1395 }, { "epoch": 1.5431098010316875, "grad_norm": 0.37824538350105286, "learning_rate": 5.6164457625009386e-06, "loss": 0.4361, "step": 1396 }, { "epoch": 1.5442151805453206, "grad_norm": 0.4467746317386627, "learning_rate": 5.610056677932274e-06, "loss": 0.4631, "step": 1397 }, { "epoch": 1.5453205600589537, "grad_norm": 0.48510223627090454, "learning_rate": 5.60366658203983e-06, "loss": 0.4525, "step": 1398 }, { "epoch": 1.5464259395725866, "grad_norm": 0.41713324189186096, "learning_rate": 5.597275485416818e-06, "loss": 0.4358, "step": 1399 }, { "epoch": 1.5475313190862194, "grad_norm": 0.36827898025512695, "learning_rate": 5.590883398658095e-06, "loss": 0.444, "step": 1400 }, { "epoch": 1.5486366985998528, "grad_norm": 0.428786963224411, "learning_rate": 5.5844903323601725e-06, "loss": 0.4792, "step": 1401 }, { "epoch": 1.5497420781134856, "grad_norm": 0.4215226173400879, "learning_rate": 5.5780962971211795e-06, "loss": 0.4281, "step": 1402 }, { "epoch": 1.5508474576271185, "grad_norm": 0.40402793884277344, "learning_rate": 5.571701303540851e-06, "loss": 0.4396, "step": 1403 }, { "epoch": 1.5519528371407516, "grad_norm": 0.38568753004074097, "learning_rate": 5.565305362220515e-06, "loss": 0.4354, "step": 1404 }, { "epoch": 1.5530582166543847, "grad_norm": 0.4287998676300049, "learning_rate": 5.558908483763064e-06, "loss": 0.4908, "step": 1405 }, { "epoch": 1.5541635961680176, "grad_norm": 0.43495601415634155, "learning_rate": 5.552510678772949e-06, "loss": 0.4215, "step": 1406 }, { "epoch": 1.5552689756816507, "grad_norm": 0.4028578996658325, "learning_rate": 5.546111957856155e-06, "loss": 0.442, "step": 1407 }, { "epoch": 1.5563743551952838, "grad_norm": 0.4350937604904175, "learning_rate": 5.539712331620186e-06, "loss": 0.4559, "step": 1408 }, { "epoch": 1.5574797347089167, "grad_norm": 0.4396096467971802, "learning_rate": 5.533311810674048e-06, "loss": 0.4265, "step": 1409 }, { "epoch": 1.5585851142225498, "grad_norm": 0.36808326840400696, "learning_rate": 5.526910405628227e-06, "loss": 0.4318, "step": 1410 }, { "epoch": 1.5596904937361828, "grad_norm": 0.3778028190135956, "learning_rate": 5.520508127094677e-06, "loss": 0.4431, "step": 1411 }, { "epoch": 1.5607958732498157, "grad_norm": 0.4178030788898468, "learning_rate": 5.514104985686802e-06, "loss": 0.4347, "step": 1412 }, { "epoch": 1.5619012527634488, "grad_norm": 0.4540087878704071, "learning_rate": 5.507700992019433e-06, "loss": 0.4707, "step": 1413 }, { "epoch": 1.563006632277082, "grad_norm": 0.37767305970191956, "learning_rate": 5.501296156708812e-06, "loss": 0.3937, "step": 1414 }, { "epoch": 1.5641120117907148, "grad_norm": 0.3992292881011963, "learning_rate": 5.4948904903725815e-06, "loss": 0.4718, "step": 1415 }, { "epoch": 1.5652173913043477, "grad_norm": 0.3768535852432251, "learning_rate": 5.488484003629759e-06, "loss": 0.4237, "step": 1416 }, { "epoch": 1.566322770817981, "grad_norm": 0.3811759948730469, "learning_rate": 5.482076707100723e-06, "loss": 0.4392, "step": 1417 }, { "epoch": 1.5674281503316139, "grad_norm": 0.404037207365036, "learning_rate": 5.475668611407191e-06, "loss": 0.466, "step": 1418 }, { "epoch": 1.5685335298452467, "grad_norm": 0.3735414147377014, "learning_rate": 5.4692597271722115e-06, "loss": 0.4336, "step": 1419 }, { "epoch": 1.5696389093588798, "grad_norm": 0.3823690712451935, "learning_rate": 5.462850065020133e-06, "loss": 0.4412, "step": 1420 }, { "epoch": 1.570744288872513, "grad_norm": 0.3968217074871063, "learning_rate": 5.4564396355766014e-06, "loss": 0.4366, "step": 1421 }, { "epoch": 1.5718496683861458, "grad_norm": 0.4286130666732788, "learning_rate": 5.4500284494685275e-06, "loss": 0.5061, "step": 1422 }, { "epoch": 1.572955047899779, "grad_norm": 0.33074694871902466, "learning_rate": 5.443616517324079e-06, "loss": 0.4181, "step": 1423 }, { "epoch": 1.574060427413412, "grad_norm": 0.43031829595565796, "learning_rate": 5.437203849772664e-06, "loss": 0.4824, "step": 1424 }, { "epoch": 1.5751658069270449, "grad_norm": 0.3560393452644348, "learning_rate": 5.430790457444903e-06, "loss": 0.4408, "step": 1425 }, { "epoch": 1.576271186440678, "grad_norm": 0.4044763743877411, "learning_rate": 5.424376350972625e-06, "loss": 0.4621, "step": 1426 }, { "epoch": 1.577376565954311, "grad_norm": 0.35748153924942017, "learning_rate": 5.417961540988837e-06, "loss": 0.4016, "step": 1427 }, { "epoch": 1.578481945467944, "grad_norm": 0.3989635705947876, "learning_rate": 5.411546038127715e-06, "loss": 0.4895, "step": 1428 }, { "epoch": 1.579587324981577, "grad_norm": 0.3714737296104431, "learning_rate": 5.405129853024583e-06, "loss": 0.4604, "step": 1429 }, { "epoch": 1.5806927044952102, "grad_norm": 0.3774634301662445, "learning_rate": 5.398712996315898e-06, "loss": 0.4352, "step": 1430 }, { "epoch": 1.581798084008843, "grad_norm": 0.34670567512512207, "learning_rate": 5.392295478639226e-06, "loss": 0.4135, "step": 1431 }, { "epoch": 1.582903463522476, "grad_norm": 0.42179471254348755, "learning_rate": 5.385877310633233e-06, "loss": 0.4747, "step": 1432 }, { "epoch": 1.5840088430361092, "grad_norm": 0.43300625681877136, "learning_rate": 5.379458502937661e-06, "loss": 0.4599, "step": 1433 }, { "epoch": 1.585114222549742, "grad_norm": 0.42134934663772583, "learning_rate": 5.373039066193312e-06, "loss": 0.4615, "step": 1434 }, { "epoch": 1.586219602063375, "grad_norm": 0.3970409035682678, "learning_rate": 5.366619011042035e-06, "loss": 0.4226, "step": 1435 }, { "epoch": 1.587324981577008, "grad_norm": 0.4283446669578552, "learning_rate": 5.360198348126696e-06, "loss": 0.4592, "step": 1436 }, { "epoch": 1.5884303610906412, "grad_norm": 0.3904299736022949, "learning_rate": 5.353777088091177e-06, "loss": 0.4447, "step": 1437 }, { "epoch": 1.589535740604274, "grad_norm": 0.4239232838153839, "learning_rate": 5.347355241580344e-06, "loss": 0.4168, "step": 1438 }, { "epoch": 1.5906411201179071, "grad_norm": 0.4332175850868225, "learning_rate": 5.340932819240041e-06, "loss": 0.4944, "step": 1439 }, { "epoch": 1.5917464996315402, "grad_norm": 0.4112946391105652, "learning_rate": 5.334509831717058e-06, "loss": 0.4742, "step": 1440 }, { "epoch": 1.5928518791451731, "grad_norm": 0.39780721068382263, "learning_rate": 5.328086289659131e-06, "loss": 0.4024, "step": 1441 }, { "epoch": 1.5939572586588062, "grad_norm": 0.4983542859554291, "learning_rate": 5.321662203714909e-06, "loss": 0.4921, "step": 1442 }, { "epoch": 1.5950626381724393, "grad_norm": 0.4096823036670685, "learning_rate": 5.315237584533945e-06, "loss": 0.4606, "step": 1443 }, { "epoch": 1.5961680176860722, "grad_norm": 0.40668633580207825, "learning_rate": 5.308812442766679e-06, "loss": 0.4127, "step": 1444 }, { "epoch": 1.597273397199705, "grad_norm": 0.44575098156929016, "learning_rate": 5.30238678906441e-06, "loss": 0.4781, "step": 1445 }, { "epoch": 1.5983787767133384, "grad_norm": 0.38975149393081665, "learning_rate": 5.295960634079292e-06, "loss": 0.4734, "step": 1446 }, { "epoch": 1.5994841562269713, "grad_norm": 0.39246827363967896, "learning_rate": 5.289533988464307e-06, "loss": 0.4383, "step": 1447 }, { "epoch": 1.6005895357406041, "grad_norm": 0.39155343174934387, "learning_rate": 5.283106862873253e-06, "loss": 0.4566, "step": 1448 }, { "epoch": 1.6016949152542372, "grad_norm": 0.3673551380634308, "learning_rate": 5.276679267960719e-06, "loss": 0.4298, "step": 1449 }, { "epoch": 1.6028002947678703, "grad_norm": 0.4181453585624695, "learning_rate": 5.270251214382078e-06, "loss": 0.4669, "step": 1450 }, { "epoch": 1.6039056742815032, "grad_norm": 0.4351784288883209, "learning_rate": 5.263822712793459e-06, "loss": 0.4422, "step": 1451 }, { "epoch": 1.6050110537951363, "grad_norm": 0.37973752617836, "learning_rate": 5.257393773851733e-06, "loss": 0.4272, "step": 1452 }, { "epoch": 1.6061164333087694, "grad_norm": 0.39072585105895996, "learning_rate": 5.250964408214501e-06, "loss": 0.4349, "step": 1453 }, { "epoch": 1.6072218128224023, "grad_norm": 0.39705321192741394, "learning_rate": 5.244534626540067e-06, "loss": 0.4124, "step": 1454 }, { "epoch": 1.6083271923360354, "grad_norm": 0.37305402755737305, "learning_rate": 5.238104439487425e-06, "loss": 0.5022, "step": 1455 }, { "epoch": 1.6094325718496685, "grad_norm": 0.33818671107292175, "learning_rate": 5.231673857716244e-06, "loss": 0.4333, "step": 1456 }, { "epoch": 1.6105379513633014, "grad_norm": 0.4028286933898926, "learning_rate": 5.2252428918868446e-06, "loss": 0.4647, "step": 1457 }, { "epoch": 1.6116433308769345, "grad_norm": 0.3777776062488556, "learning_rate": 5.218811552660184e-06, "loss": 0.4507, "step": 1458 }, { "epoch": 1.6127487103905676, "grad_norm": 0.37573808431625366, "learning_rate": 5.212379850697841e-06, "loss": 0.4315, "step": 1459 }, { "epoch": 1.6138540899042004, "grad_norm": 0.39426523447036743, "learning_rate": 5.205947796661991e-06, "loss": 0.4604, "step": 1460 }, { "epoch": 1.6149594694178333, "grad_norm": 0.34392786026000977, "learning_rate": 5.199515401215401e-06, "loss": 0.4092, "step": 1461 }, { "epoch": 1.6160648489314666, "grad_norm": 0.3689812123775482, "learning_rate": 5.193082675021393e-06, "loss": 0.4496, "step": 1462 }, { "epoch": 1.6171702284450995, "grad_norm": 0.38902321457862854, "learning_rate": 5.186649628743849e-06, "loss": 0.4531, "step": 1463 }, { "epoch": 1.6182756079587324, "grad_norm": 0.38703951239585876, "learning_rate": 5.1802162730471704e-06, "loss": 0.4305, "step": 1464 }, { "epoch": 1.6193809874723655, "grad_norm": 0.41052693128585815, "learning_rate": 5.173782618596281e-06, "loss": 0.5406, "step": 1465 }, { "epoch": 1.6204863669859986, "grad_norm": 0.39292532205581665, "learning_rate": 5.167348676056596e-06, "loss": 0.4262, "step": 1466 }, { "epoch": 1.6215917464996314, "grad_norm": 0.36877933144569397, "learning_rate": 5.160914456094005e-06, "loss": 0.4437, "step": 1467 }, { "epoch": 1.6226971260132645, "grad_norm": 0.3557402789592743, "learning_rate": 5.154479969374865e-06, "loss": 0.4014, "step": 1468 }, { "epoch": 1.6238025055268976, "grad_norm": 0.4252469539642334, "learning_rate": 5.148045226565966e-06, "loss": 0.4657, "step": 1469 }, { "epoch": 1.6249078850405305, "grad_norm": 0.35349205136299133, "learning_rate": 5.1416102383345315e-06, "loss": 0.4284, "step": 1470 }, { "epoch": 1.6260132645541636, "grad_norm": 0.4146263003349304, "learning_rate": 5.135175015348185e-06, "loss": 0.4689, "step": 1471 }, { "epoch": 1.6271186440677967, "grad_norm": 0.37610721588134766, "learning_rate": 5.1287395682749444e-06, "loss": 0.4494, "step": 1472 }, { "epoch": 1.6282240235814296, "grad_norm": 0.38195571303367615, "learning_rate": 5.122303907783193e-06, "loss": 0.4475, "step": 1473 }, { "epoch": 1.6293294030950627, "grad_norm": 0.39711394906044006, "learning_rate": 5.115868044541674e-06, "loss": 0.4505, "step": 1474 }, { "epoch": 1.6304347826086958, "grad_norm": 0.4148280620574951, "learning_rate": 5.109431989219464e-06, "loss": 0.4401, "step": 1475 }, { "epoch": 1.6315401621223287, "grad_norm": 0.37311628460884094, "learning_rate": 5.102995752485956e-06, "loss": 0.4402, "step": 1476 }, { "epoch": 1.6326455416359615, "grad_norm": 0.37405794858932495, "learning_rate": 5.0965593450108495e-06, "loss": 0.4401, "step": 1477 }, { "epoch": 1.6337509211495949, "grad_norm": 0.4271431863307953, "learning_rate": 5.090122777464121e-06, "loss": 0.5103, "step": 1478 }, { "epoch": 1.6348563006632277, "grad_norm": 0.34972086548805237, "learning_rate": 5.083686060516017e-06, "loss": 0.4098, "step": 1479 }, { "epoch": 1.6359616801768606, "grad_norm": 0.3895830810070038, "learning_rate": 5.077249204837026e-06, "loss": 0.4474, "step": 1480 }, { "epoch": 1.6370670596904937, "grad_norm": 0.40346094965934753, "learning_rate": 5.070812221097874e-06, "loss": 0.4565, "step": 1481 }, { "epoch": 1.6381724392041268, "grad_norm": 0.3526574671268463, "learning_rate": 5.064375119969491e-06, "loss": 0.421, "step": 1482 }, { "epoch": 1.6392778187177597, "grad_norm": 0.4210236370563507, "learning_rate": 5.05793791212301e-06, "loss": 0.4809, "step": 1483 }, { "epoch": 1.6403831982313928, "grad_norm": 0.37198764085769653, "learning_rate": 5.051500608229734e-06, "loss": 0.4342, "step": 1484 }, { "epoch": 1.6414885777450259, "grad_norm": 0.3595938980579376, "learning_rate": 5.045063218961128e-06, "loss": 0.4374, "step": 1485 }, { "epoch": 1.6425939572586588, "grad_norm": 0.39127427339553833, "learning_rate": 5.038625754988802e-06, "loss": 0.4656, "step": 1486 }, { "epoch": 1.6436993367722919, "grad_norm": 0.38468292355537415, "learning_rate": 5.03218822698448e-06, "loss": 0.4608, "step": 1487 }, { "epoch": 1.644804716285925, "grad_norm": 0.43970853090286255, "learning_rate": 5.025750645620004e-06, "loss": 0.4567, "step": 1488 }, { "epoch": 1.6459100957995578, "grad_norm": 0.37551215291023254, "learning_rate": 5.019313021567298e-06, "loss": 0.4639, "step": 1489 }, { "epoch": 1.6470154753131907, "grad_norm": 0.422775536775589, "learning_rate": 5.012875365498357e-06, "loss": 0.4691, "step": 1490 }, { "epoch": 1.648120854826824, "grad_norm": 0.4162147045135498, "learning_rate": 5.00643768808523e-06, "loss": 0.3969, "step": 1491 }, { "epoch": 1.649226234340457, "grad_norm": 0.4017612338066101, "learning_rate": 5e-06, "loss": 0.4802, "step": 1492 }, { "epoch": 1.6503316138540898, "grad_norm": 0.4245891273021698, "learning_rate": 4.993562311914772e-06, "loss": 0.491, "step": 1493 }, { "epoch": 1.6514369933677229, "grad_norm": 0.3370142877101898, "learning_rate": 4.9871246345016445e-06, "loss": 0.4373, "step": 1494 }, { "epoch": 1.652542372881356, "grad_norm": 0.48576411604881287, "learning_rate": 4.980686978432703e-06, "loss": 0.4416, "step": 1495 }, { "epoch": 1.6536477523949888, "grad_norm": 0.3807646334171295, "learning_rate": 4.974249354379997e-06, "loss": 0.4176, "step": 1496 }, { "epoch": 1.654753131908622, "grad_norm": 0.3929847180843353, "learning_rate": 4.967811773015521e-06, "loss": 0.4871, "step": 1497 }, { "epoch": 1.655858511422255, "grad_norm": 0.4007715880870819, "learning_rate": 4.961374245011201e-06, "loss": 0.4128, "step": 1498 }, { "epoch": 1.656963890935888, "grad_norm": 0.4174908697605133, "learning_rate": 4.954936781038874e-06, "loss": 0.4504, "step": 1499 }, { "epoch": 1.658069270449521, "grad_norm": 0.4298599660396576, "learning_rate": 4.948499391770268e-06, "loss": 0.4173, "step": 1500 }, { "epoch": 1.6591746499631541, "grad_norm": 0.42372065782546997, "learning_rate": 4.942062087876993e-06, "loss": 0.5141, "step": 1501 }, { "epoch": 1.660280029476787, "grad_norm": 0.40663769841194153, "learning_rate": 4.93562488003051e-06, "loss": 0.4564, "step": 1502 }, { "epoch": 1.66138540899042, "grad_norm": 0.43389537930488586, "learning_rate": 4.929187778902127e-06, "loss": 0.45, "step": 1503 }, { "epoch": 1.6624907885040532, "grad_norm": 0.4478735327720642, "learning_rate": 4.922750795162974e-06, "loss": 0.4731, "step": 1504 }, { "epoch": 1.663596168017686, "grad_norm": 0.41193529963493347, "learning_rate": 4.916313939483983e-06, "loss": 0.4471, "step": 1505 }, { "epoch": 1.664701547531319, "grad_norm": 0.3743836581707001, "learning_rate": 4.909877222535879e-06, "loss": 0.4622, "step": 1506 }, { "epoch": 1.6658069270449523, "grad_norm": 0.4034438133239746, "learning_rate": 4.90344065498915e-06, "loss": 0.4315, "step": 1507 }, { "epoch": 1.6669123065585851, "grad_norm": 0.43028944730758667, "learning_rate": 4.897004247514044e-06, "loss": 0.4408, "step": 1508 }, { "epoch": 1.668017686072218, "grad_norm": 0.36529287695884705, "learning_rate": 4.890568010780537e-06, "loss": 0.4734, "step": 1509 }, { "epoch": 1.669123065585851, "grad_norm": 0.34568169713020325, "learning_rate": 4.884131955458327e-06, "loss": 0.4353, "step": 1510 }, { "epoch": 1.6702284450994842, "grad_norm": 0.34875860810279846, "learning_rate": 4.877696092216809e-06, "loss": 0.438, "step": 1511 }, { "epoch": 1.671333824613117, "grad_norm": 0.3610610067844391, "learning_rate": 4.871260431725058e-06, "loss": 0.4524, "step": 1512 }, { "epoch": 1.6724392041267502, "grad_norm": 0.36430999636650085, "learning_rate": 4.864824984651817e-06, "loss": 0.4079, "step": 1513 }, { "epoch": 1.6735445836403833, "grad_norm": 0.3979116678237915, "learning_rate": 4.858389761665469e-06, "loss": 0.4722, "step": 1514 }, { "epoch": 1.6746499631540162, "grad_norm": 0.35655391216278076, "learning_rate": 4.851954773434035e-06, "loss": 0.3753, "step": 1515 }, { "epoch": 1.6757553426676493, "grad_norm": 0.34895798563957214, "learning_rate": 4.845520030625136e-06, "loss": 0.4422, "step": 1516 }, { "epoch": 1.6768607221812823, "grad_norm": 0.3864400088787079, "learning_rate": 4.8390855439059955e-06, "loss": 0.4417, "step": 1517 }, { "epoch": 1.6779661016949152, "grad_norm": 0.39543554186820984, "learning_rate": 4.832651323943406e-06, "loss": 0.4274, "step": 1518 }, { "epoch": 1.6790714812085483, "grad_norm": 0.39186757802963257, "learning_rate": 4.82621738140372e-06, "loss": 0.4346, "step": 1519 }, { "epoch": 1.6801768607221814, "grad_norm": 0.37700143456459045, "learning_rate": 4.819783726952831e-06, "loss": 0.3861, "step": 1520 }, { "epoch": 1.6812822402358143, "grad_norm": 0.36242225766181946, "learning_rate": 4.813350371256155e-06, "loss": 0.4739, "step": 1521 }, { "epoch": 1.6823876197494472, "grad_norm": 0.443261981010437, "learning_rate": 4.806917324978608e-06, "loss": 0.5049, "step": 1522 }, { "epoch": 1.6834929992630803, "grad_norm": 0.4200631082057953, "learning_rate": 4.800484598784602e-06, "loss": 0.4789, "step": 1523 }, { "epoch": 1.6845983787767134, "grad_norm": 0.3613986074924469, "learning_rate": 4.79405220333801e-06, "loss": 0.411, "step": 1524 }, { "epoch": 1.6857037582903462, "grad_norm": 0.3977981209754944, "learning_rate": 4.78762014930216e-06, "loss": 0.4755, "step": 1525 }, { "epoch": 1.6868091378039793, "grad_norm": 0.39229482412338257, "learning_rate": 4.781188447339817e-06, "loss": 0.4507, "step": 1526 }, { "epoch": 1.6879145173176124, "grad_norm": 0.41242101788520813, "learning_rate": 4.774757108113156e-06, "loss": 0.4495, "step": 1527 }, { "epoch": 1.6890198968312453, "grad_norm": 0.44897472858428955, "learning_rate": 4.768326142283757e-06, "loss": 0.5074, "step": 1528 }, { "epoch": 1.6901252763448784, "grad_norm": 0.30506932735443115, "learning_rate": 4.761895560512576e-06, "loss": 0.3858, "step": 1529 }, { "epoch": 1.6912306558585115, "grad_norm": 0.38112103939056396, "learning_rate": 4.755465373459934e-06, "loss": 0.4594, "step": 1530 }, { "epoch": 1.6923360353721444, "grad_norm": 0.41440993547439575, "learning_rate": 4.749035591785501e-06, "loss": 0.4692, "step": 1531 }, { "epoch": 1.6934414148857775, "grad_norm": 0.3912055194377899, "learning_rate": 4.742606226148268e-06, "loss": 0.4691, "step": 1532 }, { "epoch": 1.6945467943994106, "grad_norm": 0.3743450939655304, "learning_rate": 4.736177287206543e-06, "loss": 0.4597, "step": 1533 }, { "epoch": 1.6956521739130435, "grad_norm": 0.39076218008995056, "learning_rate": 4.7297487856179224e-06, "loss": 0.4505, "step": 1534 }, { "epoch": 1.6967575534266763, "grad_norm": 0.36359283328056335, "learning_rate": 4.7233207320392816e-06, "loss": 0.4401, "step": 1535 }, { "epoch": 1.6978629329403097, "grad_norm": 0.4112439453601837, "learning_rate": 4.716893137126748e-06, "loss": 0.4419, "step": 1536 }, { "epoch": 1.6989683124539425, "grad_norm": 0.35502293705940247, "learning_rate": 4.710466011535695e-06, "loss": 0.4206, "step": 1537 }, { "epoch": 1.7000736919675754, "grad_norm": 0.4005138576030731, "learning_rate": 4.704039365920709e-06, "loss": 0.4474, "step": 1538 }, { "epoch": 1.7011790714812085, "grad_norm": 0.37563979625701904, "learning_rate": 4.697613210935592e-06, "loss": 0.4249, "step": 1539 }, { "epoch": 1.7022844509948416, "grad_norm": 0.395396888256073, "learning_rate": 4.691187557233323e-06, "loss": 0.4626, "step": 1540 }, { "epoch": 1.7033898305084745, "grad_norm": 0.3850739002227783, "learning_rate": 4.684762415466056e-06, "loss": 0.4639, "step": 1541 }, { "epoch": 1.7044952100221076, "grad_norm": 0.34968331456184387, "learning_rate": 4.678337796285093e-06, "loss": 0.417, "step": 1542 }, { "epoch": 1.7056005895357407, "grad_norm": 0.35675427317619324, "learning_rate": 4.6719137103408716e-06, "loss": 0.4134, "step": 1543 }, { "epoch": 1.7067059690493736, "grad_norm": 0.3989929258823395, "learning_rate": 4.665490168282943e-06, "loss": 0.4666, "step": 1544 }, { "epoch": 1.7078113485630066, "grad_norm": 0.4070923924446106, "learning_rate": 4.659067180759962e-06, "loss": 0.458, "step": 1545 }, { "epoch": 1.7089167280766397, "grad_norm": 0.407103568315506, "learning_rate": 4.6526447584196575e-06, "loss": 0.4183, "step": 1546 }, { "epoch": 1.7100221075902726, "grad_norm": 0.4287553131580353, "learning_rate": 4.6462229119088234e-06, "loss": 0.4851, "step": 1547 }, { "epoch": 1.7111274871039057, "grad_norm": 0.41589683294296265, "learning_rate": 4.639801651873305e-06, "loss": 0.4356, "step": 1548 }, { "epoch": 1.7122328666175388, "grad_norm": 0.4039379060268402, "learning_rate": 4.633380988957968e-06, "loss": 0.4558, "step": 1549 }, { "epoch": 1.7133382461311717, "grad_norm": 0.38477519154548645, "learning_rate": 4.6269609338066875e-06, "loss": 0.4728, "step": 1550 }, { "epoch": 1.7144436256448046, "grad_norm": 0.39914339780807495, "learning_rate": 4.620541497062341e-06, "loss": 0.4139, "step": 1551 }, { "epoch": 1.715549005158438, "grad_norm": 0.4216427803039551, "learning_rate": 4.614122689366769e-06, "loss": 0.4173, "step": 1552 }, { "epoch": 1.7166543846720708, "grad_norm": 0.3945509195327759, "learning_rate": 4.6077045213607765e-06, "loss": 0.4558, "step": 1553 }, { "epoch": 1.7177597641857036, "grad_norm": 0.4193847179412842, "learning_rate": 4.601287003684104e-06, "loss": 0.4486, "step": 1554 }, { "epoch": 1.7188651436993367, "grad_norm": 0.43009454011917114, "learning_rate": 4.594870146975418e-06, "loss": 0.4108, "step": 1555 }, { "epoch": 1.7199705232129698, "grad_norm": 0.42228907346725464, "learning_rate": 4.588453961872286e-06, "loss": 0.4416, "step": 1556 }, { "epoch": 1.7210759027266027, "grad_norm": 0.35112375020980835, "learning_rate": 4.582038459011165e-06, "loss": 0.3895, "step": 1557 }, { "epoch": 1.7221812822402358, "grad_norm": 0.4157795310020447, "learning_rate": 4.575623649027376e-06, "loss": 0.4761, "step": 1558 }, { "epoch": 1.723286661753869, "grad_norm": 0.42196187376976013, "learning_rate": 4.569209542555098e-06, "loss": 0.4767, "step": 1559 }, { "epoch": 1.7243920412675018, "grad_norm": 0.38202348351478577, "learning_rate": 4.562796150227337e-06, "loss": 0.4454, "step": 1560 }, { "epoch": 1.7254974207811349, "grad_norm": 0.38140401244163513, "learning_rate": 4.556383482675922e-06, "loss": 0.4445, "step": 1561 }, { "epoch": 1.726602800294768, "grad_norm": 0.37558937072753906, "learning_rate": 4.549971550531474e-06, "loss": 0.4155, "step": 1562 }, { "epoch": 1.7277081798084009, "grad_norm": 0.3711106479167938, "learning_rate": 4.543560364423401e-06, "loss": 0.463, "step": 1563 }, { "epoch": 1.7288135593220337, "grad_norm": 0.3518333435058594, "learning_rate": 4.537149934979869e-06, "loss": 0.4422, "step": 1564 }, { "epoch": 1.729918938835667, "grad_norm": 0.3765067756175995, "learning_rate": 4.530740272827792e-06, "loss": 0.498, "step": 1565 }, { "epoch": 1.7310243183493, "grad_norm": 0.34172582626342773, "learning_rate": 4.524331388592812e-06, "loss": 0.4368, "step": 1566 }, { "epoch": 1.7321296978629328, "grad_norm": 0.320735365152359, "learning_rate": 4.51792329289928e-06, "loss": 0.4295, "step": 1567 }, { "epoch": 1.733235077376566, "grad_norm": 0.412371426820755, "learning_rate": 4.511515996370244e-06, "loss": 0.4574, "step": 1568 }, { "epoch": 1.734340456890199, "grad_norm": 0.3908901810646057, "learning_rate": 4.505109509627419e-06, "loss": 0.4851, "step": 1569 }, { "epoch": 1.7354458364038319, "grad_norm": 0.3685707151889801, "learning_rate": 4.498703843291189e-06, "loss": 0.4273, "step": 1570 }, { "epoch": 1.736551215917465, "grad_norm": 0.41782909631729126, "learning_rate": 4.492299007980569e-06, "loss": 0.5101, "step": 1571 }, { "epoch": 1.737656595431098, "grad_norm": 0.36283472180366516, "learning_rate": 4.485895014313198e-06, "loss": 0.4174, "step": 1572 }, { "epoch": 1.738761974944731, "grad_norm": 0.3736819624900818, "learning_rate": 4.479491872905323e-06, "loss": 0.5026, "step": 1573 }, { "epoch": 1.739867354458364, "grad_norm": 0.35793188214302063, "learning_rate": 4.4730895943717735e-06, "loss": 0.4356, "step": 1574 }, { "epoch": 1.7409727339719971, "grad_norm": 0.3972049355506897, "learning_rate": 4.466688189325954e-06, "loss": 0.4518, "step": 1575 }, { "epoch": 1.74207811348563, "grad_norm": 0.3445079028606415, "learning_rate": 4.460287668379815e-06, "loss": 0.4101, "step": 1576 }, { "epoch": 1.7431834929992631, "grad_norm": 0.354682981967926, "learning_rate": 4.453888042143847e-06, "loss": 0.4357, "step": 1577 }, { "epoch": 1.7442888725128962, "grad_norm": 0.3621591627597809, "learning_rate": 4.447489321227052e-06, "loss": 0.4186, "step": 1578 }, { "epoch": 1.745394252026529, "grad_norm": 0.3806709945201874, "learning_rate": 4.441091516236938e-06, "loss": 0.4301, "step": 1579 }, { "epoch": 1.746499631540162, "grad_norm": 0.3600875735282898, "learning_rate": 4.434694637779486e-06, "loss": 0.4706, "step": 1580 }, { "epoch": 1.7476050110537953, "grad_norm": 0.35431089997291565, "learning_rate": 4.428298696459149e-06, "loss": 0.4335, "step": 1581 }, { "epoch": 1.7487103905674282, "grad_norm": 0.3898012340068817, "learning_rate": 4.421903702878822e-06, "loss": 0.4637, "step": 1582 }, { "epoch": 1.749815770081061, "grad_norm": 0.3967501223087311, "learning_rate": 4.415509667639828e-06, "loss": 0.5038, "step": 1583 }, { "epoch": 1.7509211495946941, "grad_norm": 0.34739866852760315, "learning_rate": 4.409116601341908e-06, "loss": 0.4175, "step": 1584 }, { "epoch": 1.7520265291083272, "grad_norm": 0.3876234292984009, "learning_rate": 4.4027245145831856e-06, "loss": 0.4616, "step": 1585 }, { "epoch": 1.7531319086219601, "grad_norm": 0.3689795136451721, "learning_rate": 4.396333417960172e-06, "loss": 0.4272, "step": 1586 }, { "epoch": 1.7542372881355932, "grad_norm": 0.34261801838874817, "learning_rate": 4.389943322067728e-06, "loss": 0.4483, "step": 1587 }, { "epoch": 1.7553426676492263, "grad_norm": 0.35831865668296814, "learning_rate": 4.383554237499064e-06, "loss": 0.464, "step": 1588 }, { "epoch": 1.7564480471628592, "grad_norm": 0.39502084255218506, "learning_rate": 4.377166174845705e-06, "loss": 0.464, "step": 1589 }, { "epoch": 1.7575534266764923, "grad_norm": 0.3487764000892639, "learning_rate": 4.37077914469749e-06, "loss": 0.4262, "step": 1590 }, { "epoch": 1.7586588061901254, "grad_norm": 0.3592751920223236, "learning_rate": 4.364393157642537e-06, "loss": 0.4964, "step": 1591 }, { "epoch": 1.7597641857037583, "grad_norm": 0.39729583263397217, "learning_rate": 4.358008224267245e-06, "loss": 0.4257, "step": 1592 }, { "epoch": 1.7608695652173914, "grad_norm": 0.3702075481414795, "learning_rate": 4.351624355156262e-06, "loss": 0.4873, "step": 1593 }, { "epoch": 1.7619749447310245, "grad_norm": 0.3720692992210388, "learning_rate": 4.345241560892467e-06, "loss": 0.4176, "step": 1594 }, { "epoch": 1.7630803242446573, "grad_norm": 0.45165640115737915, "learning_rate": 4.338859852056967e-06, "loss": 0.4754, "step": 1595 }, { "epoch": 1.7641857037582902, "grad_norm": 0.37220561504364014, "learning_rate": 4.332479239229059e-06, "loss": 0.4422, "step": 1596 }, { "epoch": 1.7652910832719235, "grad_norm": 0.3804519772529602, "learning_rate": 4.326099732986231e-06, "loss": 0.4641, "step": 1597 }, { "epoch": 1.7663964627855564, "grad_norm": 0.43481871485710144, "learning_rate": 4.319721343904133e-06, "loss": 0.4804, "step": 1598 }, { "epoch": 1.7675018422991893, "grad_norm": 0.36303603649139404, "learning_rate": 4.3133440825565645e-06, "loss": 0.4191, "step": 1599 }, { "epoch": 1.7686072218128224, "grad_norm": 0.318490207195282, "learning_rate": 4.306967959515454e-06, "loss": 0.448, "step": 1600 }, { "epoch": 1.7697126013264555, "grad_norm": 0.39497217535972595, "learning_rate": 4.300592985350843e-06, "loss": 0.5002, "step": 1601 }, { "epoch": 1.7708179808400883, "grad_norm": 0.33648860454559326, "learning_rate": 4.29421917063087e-06, "loss": 0.417, "step": 1602 }, { "epoch": 1.7719233603537214, "grad_norm": 0.33422836661338806, "learning_rate": 4.2878465259217505e-06, "loss": 0.4325, "step": 1603 }, { "epoch": 1.7730287398673545, "grad_norm": 0.35902833938598633, "learning_rate": 4.2814750617877615e-06, "loss": 0.4616, "step": 1604 }, { "epoch": 1.7741341193809874, "grad_norm": 0.3250856399536133, "learning_rate": 4.275104788791219e-06, "loss": 0.4258, "step": 1605 }, { "epoch": 1.7752394988946205, "grad_norm": 0.3643254339694977, "learning_rate": 4.268735717492472e-06, "loss": 0.4312, "step": 1606 }, { "epoch": 1.7763448784082536, "grad_norm": 0.38532084226608276, "learning_rate": 4.262367858449867e-06, "loss": 0.4499, "step": 1607 }, { "epoch": 1.7774502579218865, "grad_norm": 0.3807108402252197, "learning_rate": 4.256001222219751e-06, "loss": 0.4657, "step": 1608 }, { "epoch": 1.7785556374355194, "grad_norm": 0.3178168833255768, "learning_rate": 4.249635819356436e-06, "loss": 0.4217, "step": 1609 }, { "epoch": 1.7796610169491527, "grad_norm": 0.3387340009212494, "learning_rate": 4.243271660412197e-06, "loss": 0.4368, "step": 1610 }, { "epoch": 1.7807663964627856, "grad_norm": 0.37469446659088135, "learning_rate": 4.236908755937238e-06, "loss": 0.507, "step": 1611 }, { "epoch": 1.7818717759764184, "grad_norm": 0.3668052554130554, "learning_rate": 4.230547116479691e-06, "loss": 0.4403, "step": 1612 }, { "epoch": 1.7829771554900515, "grad_norm": 0.35366255044937134, "learning_rate": 4.224186752585588e-06, "loss": 0.422, "step": 1613 }, { "epoch": 1.7840825350036846, "grad_norm": 0.35299596190452576, "learning_rate": 4.217827674798845e-06, "loss": 0.4722, "step": 1614 }, { "epoch": 1.7851879145173175, "grad_norm": 0.362405389547348, "learning_rate": 4.21146989366125e-06, "loss": 0.4628, "step": 1615 }, { "epoch": 1.7862932940309506, "grad_norm": 0.40858194231987, "learning_rate": 4.2051134197124354e-06, "loss": 0.4503, "step": 1616 }, { "epoch": 1.7873986735445837, "grad_norm": 0.349040150642395, "learning_rate": 4.1987582634898724e-06, "loss": 0.4261, "step": 1617 }, { "epoch": 1.7885040530582166, "grad_norm": 0.33225852251052856, "learning_rate": 4.192404435528844e-06, "loss": 0.4254, "step": 1618 }, { "epoch": 1.7896094325718497, "grad_norm": 0.35014358162879944, "learning_rate": 4.186051946362435e-06, "loss": 0.4351, "step": 1619 }, { "epoch": 1.7907148120854828, "grad_norm": 0.3435118496417999, "learning_rate": 4.179700806521506e-06, "loss": 0.4708, "step": 1620 }, { "epoch": 1.7918201915991157, "grad_norm": 0.3737446367740631, "learning_rate": 4.173351026534683e-06, "loss": 0.4767, "step": 1621 }, { "epoch": 1.7929255711127488, "grad_norm": 0.3446429967880249, "learning_rate": 4.167002616928341e-06, "loss": 0.4396, "step": 1622 }, { "epoch": 1.7940309506263818, "grad_norm": 0.33797815442085266, "learning_rate": 4.160655588226576e-06, "loss": 0.4492, "step": 1623 }, { "epoch": 1.7951363301400147, "grad_norm": 0.3803517818450928, "learning_rate": 4.154309950951203e-06, "loss": 0.4774, "step": 1624 }, { "epoch": 1.7962417096536476, "grad_norm": 0.36335763335227966, "learning_rate": 4.147965715621722e-06, "loss": 0.4465, "step": 1625 }, { "epoch": 1.797347089167281, "grad_norm": 0.41492894291877747, "learning_rate": 4.141622892755318e-06, "loss": 0.4872, "step": 1626 }, { "epoch": 1.7984524686809138, "grad_norm": 0.3798779547214508, "learning_rate": 4.135281492866826e-06, "loss": 0.4214, "step": 1627 }, { "epoch": 1.7995578481945467, "grad_norm": 0.35543960332870483, "learning_rate": 4.128941526468728e-06, "loss": 0.4343, "step": 1628 }, { "epoch": 1.8006632277081798, "grad_norm": 0.37220996618270874, "learning_rate": 4.122603004071127e-06, "loss": 0.4524, "step": 1629 }, { "epoch": 1.8017686072218129, "grad_norm": 0.4153907001018524, "learning_rate": 4.116265936181731e-06, "loss": 0.4496, "step": 1630 }, { "epoch": 1.8028739867354457, "grad_norm": 0.39634883403778076, "learning_rate": 4.109930333305839e-06, "loss": 0.4312, "step": 1631 }, { "epoch": 1.8039793662490788, "grad_norm": 0.37721240520477295, "learning_rate": 4.103596205946323e-06, "loss": 0.4657, "step": 1632 }, { "epoch": 1.805084745762712, "grad_norm": 0.421220064163208, "learning_rate": 4.097263564603604e-06, "loss": 0.4904, "step": 1633 }, { "epoch": 1.8061901252763448, "grad_norm": 0.3824896812438965, "learning_rate": 4.090932419775642e-06, "loss": 0.4262, "step": 1634 }, { "epoch": 1.807295504789978, "grad_norm": 0.4168621599674225, "learning_rate": 4.08460278195792e-06, "loss": 0.4414, "step": 1635 }, { "epoch": 1.808400884303611, "grad_norm": 0.3957691192626953, "learning_rate": 4.078274661643415e-06, "loss": 0.4284, "step": 1636 }, { "epoch": 1.8095062638172439, "grad_norm": 0.3917975425720215, "learning_rate": 4.0719480693225964e-06, "loss": 0.4166, "step": 1637 }, { "epoch": 1.810611643330877, "grad_norm": 0.4236138164997101, "learning_rate": 4.065623015483394e-06, "loss": 0.4638, "step": 1638 }, { "epoch": 1.81171702284451, "grad_norm": 0.3645436465740204, "learning_rate": 4.05929951061119e-06, "loss": 0.4338, "step": 1639 }, { "epoch": 1.812822402358143, "grad_norm": 0.33403685688972473, "learning_rate": 4.0529775651888e-06, "loss": 0.4265, "step": 1640 }, { "epoch": 1.8139277818717758, "grad_norm": 0.34999674558639526, "learning_rate": 4.046657189696451e-06, "loss": 0.4608, "step": 1641 }, { "epoch": 1.8150331613854092, "grad_norm": 0.4106908440589905, "learning_rate": 4.040338394611772e-06, "loss": 0.4591, "step": 1642 }, { "epoch": 1.816138540899042, "grad_norm": 0.37110623717308044, "learning_rate": 4.034021190409767e-06, "loss": 0.4068, "step": 1643 }, { "epoch": 1.817243920412675, "grad_norm": 0.36376965045928955, "learning_rate": 4.027705587562808e-06, "loss": 0.4431, "step": 1644 }, { "epoch": 1.818349299926308, "grad_norm": 0.3466004729270935, "learning_rate": 4.021391596540607e-06, "loss": 0.488, "step": 1645 }, { "epoch": 1.819454679439941, "grad_norm": 0.3609422743320465, "learning_rate": 4.015079227810208e-06, "loss": 0.4776, "step": 1646 }, { "epoch": 1.820560058953574, "grad_norm": 0.376562237739563, "learning_rate": 4.0087684918359646e-06, "loss": 0.4161, "step": 1647 }, { "epoch": 1.821665438467207, "grad_norm": 0.35262778401374817, "learning_rate": 4.002459399079523e-06, "loss": 0.4709, "step": 1648 }, { "epoch": 1.8227708179808402, "grad_norm": 0.35922110080718994, "learning_rate": 3.9961519599998045e-06, "loss": 0.4264, "step": 1649 }, { "epoch": 1.823876197494473, "grad_norm": 0.3386583924293518, "learning_rate": 3.9898461850529925e-06, "loss": 0.4206, "step": 1650 }, { "epoch": 1.8249815770081061, "grad_norm": 0.3740977942943573, "learning_rate": 3.983542084692508e-06, "loss": 0.4638, "step": 1651 }, { "epoch": 1.8260869565217392, "grad_norm": 0.37539365887641907, "learning_rate": 3.977239669368998e-06, "loss": 0.4535, "step": 1652 }, { "epoch": 1.8271923360353721, "grad_norm": 0.3573692739009857, "learning_rate": 3.970938949530314e-06, "loss": 0.4224, "step": 1653 }, { "epoch": 1.828297715549005, "grad_norm": 0.43764591217041016, "learning_rate": 3.964639935621498e-06, "loss": 0.4607, "step": 1654 }, { "epoch": 1.8294030950626383, "grad_norm": 0.3752228617668152, "learning_rate": 3.958342638084768e-06, "loss": 0.4296, "step": 1655 }, { "epoch": 1.8305084745762712, "grad_norm": 0.35685062408447266, "learning_rate": 3.952047067359488e-06, "loss": 0.4307, "step": 1656 }, { "epoch": 1.831613854089904, "grad_norm": 0.3759077489376068, "learning_rate": 3.945753233882168e-06, "loss": 0.4327, "step": 1657 }, { "epoch": 1.8327192336035372, "grad_norm": 0.4374396800994873, "learning_rate": 3.939461148086434e-06, "loss": 0.4613, "step": 1658 }, { "epoch": 1.8338246131171703, "grad_norm": 0.3935193419456482, "learning_rate": 3.933170820403013e-06, "loss": 0.451, "step": 1659 }, { "epoch": 1.8349299926308031, "grad_norm": 0.3295822739601135, "learning_rate": 3.926882261259723e-06, "loss": 0.417, "step": 1660 }, { "epoch": 1.8360353721444362, "grad_norm": 0.3713218867778778, "learning_rate": 3.920595481081442e-06, "loss": 0.4193, "step": 1661 }, { "epoch": 1.8371407516580693, "grad_norm": 0.44389718770980835, "learning_rate": 3.9143104902901085e-06, "loss": 0.481, "step": 1662 }, { "epoch": 1.8382461311717022, "grad_norm": 0.34496137499809265, "learning_rate": 3.908027299304687e-06, "loss": 0.3977, "step": 1663 }, { "epoch": 1.8393515106853353, "grad_norm": 0.35104823112487793, "learning_rate": 3.901745918541162e-06, "loss": 0.4707, "step": 1664 }, { "epoch": 1.8404568901989684, "grad_norm": 0.43491172790527344, "learning_rate": 3.895466358412517e-06, "loss": 0.5014, "step": 1665 }, { "epoch": 1.8415622697126013, "grad_norm": 0.3916094899177551, "learning_rate": 3.889188629328716e-06, "loss": 0.4229, "step": 1666 }, { "epoch": 1.8426676492262344, "grad_norm": 0.38548544049263, "learning_rate": 3.882912741696688e-06, "loss": 0.4336, "step": 1667 }, { "epoch": 1.8437730287398675, "grad_norm": 0.4150683879852295, "learning_rate": 3.876638705920312e-06, "loss": 0.4568, "step": 1668 }, { "epoch": 1.8448784082535004, "grad_norm": 0.31834936141967773, "learning_rate": 3.8703665324003895e-06, "loss": 0.4371, "step": 1669 }, { "epoch": 1.8459837877671332, "grad_norm": 0.3918519914150238, "learning_rate": 3.864096231534645e-06, "loss": 0.4448, "step": 1670 }, { "epoch": 1.8470891672807666, "grad_norm": 0.40643617510795593, "learning_rate": 3.857827813717692e-06, "loss": 0.4274, "step": 1671 }, { "epoch": 1.8481945467943994, "grad_norm": 0.4014897048473358, "learning_rate": 3.851561289341023e-06, "loss": 0.5035, "step": 1672 }, { "epoch": 1.8492999263080323, "grad_norm": 0.34347957372665405, "learning_rate": 3.845296668792994e-06, "loss": 0.4096, "step": 1673 }, { "epoch": 1.8504053058216654, "grad_norm": 0.44800102710723877, "learning_rate": 3.839033962458802e-06, "loss": 0.4634, "step": 1674 }, { "epoch": 1.8515106853352985, "grad_norm": 0.39145925641059875, "learning_rate": 3.832773180720475e-06, "loss": 0.4477, "step": 1675 }, { "epoch": 1.8526160648489314, "grad_norm": 0.39821621775627136, "learning_rate": 3.826514333956843e-06, "loss": 0.4628, "step": 1676 }, { "epoch": 1.8537214443625645, "grad_norm": 0.3613109886646271, "learning_rate": 3.820257432543539e-06, "loss": 0.4094, "step": 1677 }, { "epoch": 1.8548268238761976, "grad_norm": 0.3803155720233917, "learning_rate": 3.8140024868529585e-06, "loss": 0.4423, "step": 1678 }, { "epoch": 1.8559322033898304, "grad_norm": 0.396979421377182, "learning_rate": 3.8077495072542663e-06, "loss": 0.4654, "step": 1679 }, { "epoch": 1.8570375829034635, "grad_norm": 0.3578045070171356, "learning_rate": 3.8014985041133627e-06, "loss": 0.4579, "step": 1680 }, { "epoch": 1.8581429624170966, "grad_norm": 0.3342948853969574, "learning_rate": 3.7952494877928704e-06, "loss": 0.4208, "step": 1681 }, { "epoch": 1.8592483419307295, "grad_norm": 0.42938435077667236, "learning_rate": 3.789002468652121e-06, "loss": 0.4567, "step": 1682 }, { "epoch": 1.8603537214443626, "grad_norm": 0.37852928042411804, "learning_rate": 3.7827574570471337e-06, "loss": 0.4266, "step": 1683 }, { "epoch": 1.8614591009579957, "grad_norm": 0.3399139642715454, "learning_rate": 3.7765144633306006e-06, "loss": 0.4531, "step": 1684 }, { "epoch": 1.8625644804716286, "grad_norm": 0.3275691568851471, "learning_rate": 3.7702734978518674e-06, "loss": 0.4451, "step": 1685 }, { "epoch": 1.8636698599852615, "grad_norm": 0.36778801679611206, "learning_rate": 3.76403457095692e-06, "loss": 0.4456, "step": 1686 }, { "epoch": 1.8647752394988946, "grad_norm": 0.35151973366737366, "learning_rate": 3.7577976929883608e-06, "loss": 0.4395, "step": 1687 }, { "epoch": 1.8658806190125277, "grad_norm": 0.33157435059547424, "learning_rate": 3.7515628742854006e-06, "loss": 0.4513, "step": 1688 }, { "epoch": 1.8669859985261605, "grad_norm": 0.34350475668907166, "learning_rate": 3.7453301251838314e-06, "loss": 0.4469, "step": 1689 }, { "epoch": 1.8680913780397936, "grad_norm": 0.37216755747795105, "learning_rate": 3.7390994560160187e-06, "loss": 0.4677, "step": 1690 }, { "epoch": 1.8691967575534267, "grad_norm": 0.3465190529823303, "learning_rate": 3.732870877110878e-06, "loss": 0.4089, "step": 1691 }, { "epoch": 1.8703021370670596, "grad_norm": 0.37116461992263794, "learning_rate": 3.726644398793857e-06, "loss": 0.4455, "step": 1692 }, { "epoch": 1.8714075165806927, "grad_norm": 0.35547444224357605, "learning_rate": 3.720420031386927e-06, "loss": 0.4095, "step": 1693 }, { "epoch": 1.8725128960943258, "grad_norm": 0.39676833152770996, "learning_rate": 3.714197785208554e-06, "loss": 0.4817, "step": 1694 }, { "epoch": 1.8736182756079587, "grad_norm": 0.3807957172393799, "learning_rate": 3.7079776705736937e-06, "loss": 0.4362, "step": 1695 }, { "epoch": 1.8747236551215918, "grad_norm": 0.36723747849464417, "learning_rate": 3.701759697793761e-06, "loss": 0.4556, "step": 1696 }, { "epoch": 1.8758290346352249, "grad_norm": 0.38136112689971924, "learning_rate": 3.695543877176626e-06, "loss": 0.4367, "step": 1697 }, { "epoch": 1.8769344141488578, "grad_norm": 0.3909912705421448, "learning_rate": 3.689330219026588e-06, "loss": 0.4621, "step": 1698 }, { "epoch": 1.8780397936624906, "grad_norm": 0.33360040187835693, "learning_rate": 3.6831187336443626e-06, "loss": 0.4569, "step": 1699 }, { "epoch": 1.879145173176124, "grad_norm": 0.425890177488327, "learning_rate": 3.6769094313270647e-06, "loss": 0.422, "step": 1700 }, { "epoch": 1.8802505526897568, "grad_norm": 0.40646079182624817, "learning_rate": 3.6707023223681883e-06, "loss": 0.5085, "step": 1701 }, { "epoch": 1.8813559322033897, "grad_norm": 0.30128124356269836, "learning_rate": 3.6644974170575907e-06, "loss": 0.4156, "step": 1702 }, { "epoch": 1.8824613117170228, "grad_norm": 0.3616032898426056, "learning_rate": 3.6582947256814783e-06, "loss": 0.4908, "step": 1703 }, { "epoch": 1.883566691230656, "grad_norm": 0.35343214869499207, "learning_rate": 3.652094258522387e-06, "loss": 0.3909, "step": 1704 }, { "epoch": 1.8846720707442888, "grad_norm": 0.39410123229026794, "learning_rate": 3.6458960258591624e-06, "loss": 0.4748, "step": 1705 }, { "epoch": 1.8857774502579219, "grad_norm": 0.36482831835746765, "learning_rate": 3.6397000379669513e-06, "loss": 0.433, "step": 1706 }, { "epoch": 1.886882829771555, "grad_norm": 0.35653504729270935, "learning_rate": 3.6335063051171725e-06, "loss": 0.4471, "step": 1707 }, { "epoch": 1.8879882092851878, "grad_norm": 0.3423863351345062, "learning_rate": 3.627314837577514e-06, "loss": 0.4282, "step": 1708 }, { "epoch": 1.889093588798821, "grad_norm": 0.3618341386318207, "learning_rate": 3.6211256456119026e-06, "loss": 0.43, "step": 1709 }, { "epoch": 1.890198968312454, "grad_norm": 0.3813115060329437, "learning_rate": 3.6149387394804946e-06, "loss": 0.448, "step": 1710 }, { "epoch": 1.891304347826087, "grad_norm": 0.37243878841400146, "learning_rate": 3.608754129439659e-06, "loss": 0.4729, "step": 1711 }, { "epoch": 1.89240972733972, "grad_norm": 0.35786065459251404, "learning_rate": 3.6025718257419532e-06, "loss": 0.4943, "step": 1712 }, { "epoch": 1.8935151068533531, "grad_norm": 0.3115291893482208, "learning_rate": 3.5963918386361186e-06, "loss": 0.413, "step": 1713 }, { "epoch": 1.894620486366986, "grad_norm": 0.39461660385131836, "learning_rate": 3.590214178367049e-06, "loss": 0.4802, "step": 1714 }, { "epoch": 1.8957258658806189, "grad_norm": 0.35573914647102356, "learning_rate": 3.5840388551757876e-06, "loss": 0.454, "step": 1715 }, { "epoch": 1.8968312453942522, "grad_norm": 0.3421839773654938, "learning_rate": 3.5778658792994957e-06, "loss": 0.4309, "step": 1716 }, { "epoch": 1.897936624907885, "grad_norm": 0.3851657211780548, "learning_rate": 3.5716952609714517e-06, "loss": 0.4837, "step": 1717 }, { "epoch": 1.899042004421518, "grad_norm": 0.34376582503318787, "learning_rate": 3.565527010421019e-06, "loss": 0.4127, "step": 1718 }, { "epoch": 1.900147383935151, "grad_norm": 0.3781861662864685, "learning_rate": 3.5593611378736392e-06, "loss": 0.4509, "step": 1719 }, { "epoch": 1.9012527634487841, "grad_norm": 0.37228086590766907, "learning_rate": 3.5531976535508107e-06, "loss": 0.4756, "step": 1720 }, { "epoch": 1.902358142962417, "grad_norm": 0.34292104840278625, "learning_rate": 3.5470365676700715e-06, "loss": 0.436, "step": 1721 }, { "epoch": 1.90346352247605, "grad_norm": 0.38040798902511597, "learning_rate": 3.540877890444989e-06, "loss": 0.4253, "step": 1722 }, { "epoch": 1.9045689019896832, "grad_norm": 0.3866693377494812, "learning_rate": 3.534721632085132e-06, "loss": 0.4613, "step": 1723 }, { "epoch": 1.905674281503316, "grad_norm": 0.36908987164497375, "learning_rate": 3.528567802796061e-06, "loss": 0.4344, "step": 1724 }, { "epoch": 1.9067796610169492, "grad_norm": 0.35240623354911804, "learning_rate": 3.522416412779308e-06, "loss": 0.4374, "step": 1725 }, { "epoch": 1.9078850405305823, "grad_norm": 0.3577819764614105, "learning_rate": 3.5162674722323677e-06, "loss": 0.4364, "step": 1726 }, { "epoch": 1.9089904200442152, "grad_norm": 0.3440542221069336, "learning_rate": 3.5101209913486655e-06, "loss": 0.4511, "step": 1727 }, { "epoch": 1.910095799557848, "grad_norm": 0.33109793066978455, "learning_rate": 3.5039769803175545e-06, "loss": 0.4562, "step": 1728 }, { "epoch": 1.9112011790714813, "grad_norm": 0.3406193256378174, "learning_rate": 3.4978354493242937e-06, "loss": 0.4371, "step": 1729 }, { "epoch": 1.9123065585851142, "grad_norm": 0.35918039083480835, "learning_rate": 3.4916964085500277e-06, "loss": 0.4829, "step": 1730 }, { "epoch": 1.913411938098747, "grad_norm": 0.3424345850944519, "learning_rate": 3.485559868171776e-06, "loss": 0.4276, "step": 1731 }, { "epoch": 1.9145173176123802, "grad_norm": 0.31907355785369873, "learning_rate": 3.4794258383624115e-06, "loss": 0.4062, "step": 1732 }, { "epoch": 1.9156226971260133, "grad_norm": 0.3546552360057831, "learning_rate": 3.4732943292906453e-06, "loss": 0.4788, "step": 1733 }, { "epoch": 1.9167280766396462, "grad_norm": 0.35615572333335876, "learning_rate": 3.4671653511210086e-06, "loss": 0.473, "step": 1734 }, { "epoch": 1.9178334561532793, "grad_norm": 0.3804091215133667, "learning_rate": 3.4610389140138404e-06, "loss": 0.462, "step": 1735 }, { "epoch": 1.9189388356669124, "grad_norm": 0.3700900971889496, "learning_rate": 3.4549150281252635e-06, "loss": 0.4307, "step": 1736 }, { "epoch": 1.9200442151805452, "grad_norm": 0.3499837815761566, "learning_rate": 3.448793703607175e-06, "loss": 0.413, "step": 1737 }, { "epoch": 1.9211495946941783, "grad_norm": 0.35732701420783997, "learning_rate": 3.442674950607221e-06, "loss": 0.4698, "step": 1738 }, { "epoch": 1.9222549742078114, "grad_norm": 0.38744744658470154, "learning_rate": 3.4365587792687906e-06, "loss": 0.4609, "step": 1739 }, { "epoch": 1.9233603537214443, "grad_norm": 0.3874463737010956, "learning_rate": 3.430445199730987e-06, "loss": 0.4359, "step": 1740 }, { "epoch": 1.9244657332350774, "grad_norm": 0.40476468205451965, "learning_rate": 3.4243342221286226e-06, "loss": 0.4394, "step": 1741 }, { "epoch": 1.9255711127487105, "grad_norm": 0.39241066575050354, "learning_rate": 3.4182258565921933e-06, "loss": 0.4581, "step": 1742 }, { "epoch": 1.9266764922623434, "grad_norm": 0.373761922121048, "learning_rate": 3.412120113247861e-06, "loss": 0.4152, "step": 1743 }, { "epoch": 1.9277818717759763, "grad_norm": 0.37896719574928284, "learning_rate": 3.406017002217452e-06, "loss": 0.454, "step": 1744 }, { "epoch": 1.9288872512896096, "grad_norm": 0.3860524892807007, "learning_rate": 3.3999165336184183e-06, "loss": 0.4251, "step": 1745 }, { "epoch": 1.9299926308032425, "grad_norm": 0.36931565403938293, "learning_rate": 3.393818717563834e-06, "loss": 0.4737, "step": 1746 }, { "epoch": 1.9310980103168753, "grad_norm": 0.4122299551963806, "learning_rate": 3.3877235641623797e-06, "loss": 0.4874, "step": 1747 }, { "epoch": 1.9322033898305084, "grad_norm": 0.3368544280529022, "learning_rate": 3.3816310835183153e-06, "loss": 0.3903, "step": 1748 }, { "epoch": 1.9333087693441415, "grad_norm": 0.39701685309410095, "learning_rate": 3.375541285731477e-06, "loss": 0.4575, "step": 1749 }, { "epoch": 1.9344141488577744, "grad_norm": 0.4133145809173584, "learning_rate": 3.369454180897248e-06, "loss": 0.437, "step": 1750 }, { "epoch": 1.9355195283714075, "grad_norm": 0.39687222242355347, "learning_rate": 3.363369779106552e-06, "loss": 0.479, "step": 1751 }, { "epoch": 1.9366249078850406, "grad_norm": 0.34392571449279785, "learning_rate": 3.3572880904458267e-06, "loss": 0.4506, "step": 1752 }, { "epoch": 1.9377302873986735, "grad_norm": 0.3457908034324646, "learning_rate": 3.3512091249970165e-06, "loss": 0.4515, "step": 1753 }, { "epoch": 1.9388356669123066, "grad_norm": 0.33988478779792786, "learning_rate": 3.345132892837547e-06, "loss": 0.3685, "step": 1754 }, { "epoch": 1.9399410464259397, "grad_norm": 0.37284475564956665, "learning_rate": 3.3390594040403184e-06, "loss": 0.4406, "step": 1755 }, { "epoch": 1.9410464259395726, "grad_norm": 0.360158771276474, "learning_rate": 3.332988668673677e-06, "loss": 0.464, "step": 1756 }, { "epoch": 1.9421518054532056, "grad_norm": 0.38655805587768555, "learning_rate": 3.32692069680141e-06, "loss": 0.415, "step": 1757 }, { "epoch": 1.9432571849668387, "grad_norm": 0.39790865778923035, "learning_rate": 3.320855498482718e-06, "loss": 0.4921, "step": 1758 }, { "epoch": 1.9443625644804716, "grad_norm": 0.324493408203125, "learning_rate": 3.31479308377221e-06, "loss": 0.3919, "step": 1759 }, { "epoch": 1.9454679439941045, "grad_norm": 0.35228490829467773, "learning_rate": 3.308733462719873e-06, "loss": 0.4796, "step": 1760 }, { "epoch": 1.9465733235077378, "grad_norm": 0.363515704870224, "learning_rate": 3.30267664537107e-06, "loss": 0.4552, "step": 1761 }, { "epoch": 1.9476787030213707, "grad_norm": 0.3859129250049591, "learning_rate": 3.2966226417665125e-06, "loss": 0.4325, "step": 1762 }, { "epoch": 1.9487840825350036, "grad_norm": 0.3791467249393463, "learning_rate": 3.2905714619422467e-06, "loss": 0.4429, "step": 1763 }, { "epoch": 1.9498894620486367, "grad_norm": 0.33396661281585693, "learning_rate": 3.2845231159296404e-06, "loss": 0.4619, "step": 1764 }, { "epoch": 1.9509948415622698, "grad_norm": 0.39208129048347473, "learning_rate": 3.278477613755359e-06, "loss": 0.4327, "step": 1765 }, { "epoch": 1.9521002210759026, "grad_norm": 0.3913731575012207, "learning_rate": 3.2724349654413612e-06, "loss": 0.4351, "step": 1766 }, { "epoch": 1.9532056005895357, "grad_norm": 0.3284873068332672, "learning_rate": 3.2663951810048683e-06, "loss": 0.4352, "step": 1767 }, { "epoch": 1.9543109801031688, "grad_norm": 0.40079861879348755, "learning_rate": 3.2603582704583547e-06, "loss": 0.4548, "step": 1768 }, { "epoch": 1.9554163596168017, "grad_norm": 0.3621026277542114, "learning_rate": 3.2543242438095347e-06, "loss": 0.424, "step": 1769 }, { "epoch": 1.9565217391304348, "grad_norm": 0.3519558012485504, "learning_rate": 3.2482931110613358e-06, "loss": 0.4452, "step": 1770 }, { "epoch": 1.957627118644068, "grad_norm": 0.3937278389930725, "learning_rate": 3.242264882211894e-06, "loss": 0.4726, "step": 1771 }, { "epoch": 1.9587324981577008, "grad_norm": 0.41381126642227173, "learning_rate": 3.236239567254526e-06, "loss": 0.4576, "step": 1772 }, { "epoch": 1.9598378776713337, "grad_norm": 0.36334460973739624, "learning_rate": 3.230217176177723e-06, "loss": 0.4155, "step": 1773 }, { "epoch": 1.960943257184967, "grad_norm": 0.36778998374938965, "learning_rate": 3.224197718965124e-06, "loss": 0.4325, "step": 1774 }, { "epoch": 1.9620486366985999, "grad_norm": 0.40116575360298157, "learning_rate": 3.21818120559551e-06, "loss": 0.4749, "step": 1775 }, { "epoch": 1.9631540162122327, "grad_norm": 0.3267873525619507, "learning_rate": 3.2121676460427765e-06, "loss": 0.3925, "step": 1776 }, { "epoch": 1.9642593957258658, "grad_norm": 0.3840785324573517, "learning_rate": 3.206157050275927e-06, "loss": 0.518, "step": 1777 }, { "epoch": 1.965364775239499, "grad_norm": 0.3358251452445984, "learning_rate": 3.2001494282590466e-06, "loss": 0.4291, "step": 1778 }, { "epoch": 1.9664701547531318, "grad_norm": 0.325832337141037, "learning_rate": 3.1941447899512966e-06, "loss": 0.4242, "step": 1779 }, { "epoch": 1.967575534266765, "grad_norm": 0.36580514907836914, "learning_rate": 3.188143145306888e-06, "loss": 0.4539, "step": 1780 }, { "epoch": 1.968680913780398, "grad_norm": 0.39226552844047546, "learning_rate": 3.1821445042750677e-06, "loss": 0.4667, "step": 1781 }, { "epoch": 1.9697862932940309, "grad_norm": 0.33195826411247253, "learning_rate": 3.176148876800109e-06, "loss": 0.4301, "step": 1782 }, { "epoch": 1.970891672807664, "grad_norm": 0.3939443826675415, "learning_rate": 3.170156272821283e-06, "loss": 0.4428, "step": 1783 }, { "epoch": 1.971997052321297, "grad_norm": 0.36185628175735474, "learning_rate": 3.164166702272855e-06, "loss": 0.4074, "step": 1784 }, { "epoch": 1.97310243183493, "grad_norm": 0.3742704689502716, "learning_rate": 3.1581801750840556e-06, "loss": 0.4837, "step": 1785 }, { "epoch": 1.974207811348563, "grad_norm": 0.3594764769077301, "learning_rate": 3.1521967011790753e-06, "loss": 0.4408, "step": 1786 }, { "epoch": 1.9753131908621961, "grad_norm": 0.3900530934333801, "learning_rate": 3.1462162904770376e-06, "loss": 0.4506, "step": 1787 }, { "epoch": 1.976418570375829, "grad_norm": 0.3670540750026703, "learning_rate": 3.140238952891994e-06, "loss": 0.4317, "step": 1788 }, { "epoch": 1.977523949889462, "grad_norm": 0.3286137878894806, "learning_rate": 3.1342646983328995e-06, "loss": 0.4566, "step": 1789 }, { "epoch": 1.9786293294030952, "grad_norm": 0.3595986068248749, "learning_rate": 3.1282935367035935e-06, "loss": 0.4223, "step": 1790 }, { "epoch": 1.979734708916728, "grad_norm": 0.40041521191596985, "learning_rate": 3.122325477902795e-06, "loss": 0.4878, "step": 1791 }, { "epoch": 1.980840088430361, "grad_norm": 0.370739221572876, "learning_rate": 3.116360531824074e-06, "loss": 0.4429, "step": 1792 }, { "epoch": 1.981945467943994, "grad_norm": 0.35410529375076294, "learning_rate": 3.110398708355845e-06, "loss": 0.3911, "step": 1793 }, { "epoch": 1.9830508474576272, "grad_norm": 0.34605565667152405, "learning_rate": 3.1044400173813415e-06, "loss": 0.4782, "step": 1794 }, { "epoch": 1.98415622697126, "grad_norm": 0.35022327303886414, "learning_rate": 3.0984844687786076e-06, "loss": 0.4544, "step": 1795 }, { "epoch": 1.9852616064848931, "grad_norm": 0.41975727677345276, "learning_rate": 3.0925320724204743e-06, "loss": 0.4281, "step": 1796 }, { "epoch": 1.9863669859985262, "grad_norm": 0.44948700070381165, "learning_rate": 3.0865828381745515e-06, "loss": 0.4873, "step": 1797 }, { "epoch": 1.9874723655121591, "grad_norm": 0.34598836302757263, "learning_rate": 3.080636775903205e-06, "loss": 0.4579, "step": 1798 }, { "epoch": 1.9885777450257922, "grad_norm": 0.3767213225364685, "learning_rate": 3.0746938954635396e-06, "loss": 0.4633, "step": 1799 }, { "epoch": 1.9896831245394253, "grad_norm": 0.3467770516872406, "learning_rate": 3.068754206707392e-06, "loss": 0.4499, "step": 1800 }, { "epoch": 1.9907885040530582, "grad_norm": 0.3422654867172241, "learning_rate": 3.062817719481299e-06, "loss": 0.4379, "step": 1801 }, { "epoch": 1.9918938835666913, "grad_norm": 0.3750484883785248, "learning_rate": 3.056884443626499e-06, "loss": 0.4648, "step": 1802 }, { "epoch": 1.9929992630803244, "grad_norm": 0.37781617045402527, "learning_rate": 3.0509543889788997e-06, "loss": 0.4813, "step": 1803 }, { "epoch": 1.9941046425939573, "grad_norm": 0.3512095510959625, "learning_rate": 3.0450275653690743e-06, "loss": 0.4106, "step": 1804 }, { "epoch": 1.9952100221075901, "grad_norm": 0.36745697259902954, "learning_rate": 3.0391039826222347e-06, "loss": 0.4978, "step": 1805 }, { "epoch": 1.9963154016212235, "grad_norm": 0.3641338050365448, "learning_rate": 3.033183650558225e-06, "loss": 0.4485, "step": 1806 }, { "epoch": 1.9974207811348563, "grad_norm": 0.35008320212364197, "learning_rate": 3.027266578991497e-06, "loss": 0.4447, "step": 1807 }, { "epoch": 1.9985261606484892, "grad_norm": 0.3481820225715637, "learning_rate": 3.021352777731096e-06, "loss": 0.417, "step": 1808 }, { "epoch": 1.9996315401621223, "grad_norm": 0.49000436067581177, "learning_rate": 3.015442256580654e-06, "loss": 0.5076, "step": 1809 }, { "epoch": 2.0007369196757554, "grad_norm": 0.4487617015838623, "learning_rate": 3.0095350253383567e-06, "loss": 0.5865, "step": 1810 }, { "epoch": 2.0018422991893883, "grad_norm": 0.34568625688552856, "learning_rate": 3.00363109379694e-06, "loss": 0.404, "step": 1811 }, { "epoch": 2.002947678703021, "grad_norm": 0.4428139925003052, "learning_rate": 2.997730471743667e-06, "loss": 0.4363, "step": 1812 }, { "epoch": 2.0040530582166545, "grad_norm": 0.3445308804512024, "learning_rate": 2.9918331689603197e-06, "loss": 0.385, "step": 1813 }, { "epoch": 2.0051584377302873, "grad_norm": 0.36059221625328064, "learning_rate": 2.985939195223171e-06, "loss": 0.3761, "step": 1814 }, { "epoch": 2.0062638172439202, "grad_norm": 0.38642165064811707, "learning_rate": 2.9800485603029805e-06, "loss": 0.4485, "step": 1815 }, { "epoch": 2.0073691967575535, "grad_norm": 0.33731305599212646, "learning_rate": 2.9741612739649694e-06, "loss": 0.3885, "step": 1816 }, { "epoch": 2.0084745762711864, "grad_norm": 0.4023420214653015, "learning_rate": 2.9682773459688087e-06, "loss": 0.4456, "step": 1817 }, { "epoch": 2.0095799557848193, "grad_norm": 0.35344362258911133, "learning_rate": 2.9623967860686035e-06, "loss": 0.4053, "step": 1818 }, { "epoch": 2.0106853352984526, "grad_norm": 0.3647872805595398, "learning_rate": 2.9565196040128736e-06, "loss": 0.4063, "step": 1819 }, { "epoch": 2.0117907148120855, "grad_norm": 0.38329359889030457, "learning_rate": 2.9506458095445408e-06, "loss": 0.465, "step": 1820 }, { "epoch": 2.0128960943257184, "grad_norm": 0.33320972323417664, "learning_rate": 2.9447754124009087e-06, "loss": 0.4037, "step": 1821 }, { "epoch": 2.0140014738393517, "grad_norm": 0.3432464599609375, "learning_rate": 2.9389084223136523e-06, "loss": 0.4363, "step": 1822 }, { "epoch": 2.0151068533529846, "grad_norm": 0.3437812924385071, "learning_rate": 2.933044849008795e-06, "loss": 0.4291, "step": 1823 }, { "epoch": 2.0162122328666174, "grad_norm": 0.3748723566532135, "learning_rate": 2.9271847022066992e-06, "loss": 0.4085, "step": 1824 }, { "epoch": 2.0173176123802508, "grad_norm": 0.354678750038147, "learning_rate": 2.921327991622044e-06, "loss": 0.3897, "step": 1825 }, { "epoch": 2.0184229918938836, "grad_norm": 0.37946945428848267, "learning_rate": 2.915474726963815e-06, "loss": 0.4329, "step": 1826 }, { "epoch": 2.0195283714075165, "grad_norm": 0.3262304961681366, "learning_rate": 2.9096249179352833e-06, "loss": 0.3747, "step": 1827 }, { "epoch": 2.0206337509211494, "grad_norm": 0.3607347011566162, "learning_rate": 2.903778574233992e-06, "loss": 0.4248, "step": 1828 }, { "epoch": 2.0217391304347827, "grad_norm": 0.37773627042770386, "learning_rate": 2.8979357055517416e-06, "loss": 0.4065, "step": 1829 }, { "epoch": 2.0228445099484156, "grad_norm": 0.38898104429244995, "learning_rate": 2.892096321574564e-06, "loss": 0.3981, "step": 1830 }, { "epoch": 2.0239498894620485, "grad_norm": 0.33444660902023315, "learning_rate": 2.8862604319827263e-06, "loss": 0.4303, "step": 1831 }, { "epoch": 2.0250552689756818, "grad_norm": 0.36019203066825867, "learning_rate": 2.880428046450697e-06, "loss": 0.4331, "step": 1832 }, { "epoch": 2.0261606484893147, "grad_norm": 0.4235753118991852, "learning_rate": 2.874599174647131e-06, "loss": 0.4857, "step": 1833 }, { "epoch": 2.0272660280029475, "grad_norm": 0.3366357982158661, "learning_rate": 2.8687738262348645e-06, "loss": 0.3948, "step": 1834 }, { "epoch": 2.028371407516581, "grad_norm": 0.35286110639572144, "learning_rate": 2.862952010870893e-06, "loss": 0.3938, "step": 1835 }, { "epoch": 2.0294767870302137, "grad_norm": 0.4159576892852783, "learning_rate": 2.8571337382063503e-06, "loss": 0.474, "step": 1836 }, { "epoch": 2.0305821665438466, "grad_norm": 0.364764004945755, "learning_rate": 2.8513190178865004e-06, "loss": 0.4784, "step": 1837 }, { "epoch": 2.03168754605748, "grad_norm": 0.3737800717353821, "learning_rate": 2.845507859550718e-06, "loss": 0.3955, "step": 1838 }, { "epoch": 2.032792925571113, "grad_norm": 0.3593927323818207, "learning_rate": 2.839700272832476e-06, "loss": 0.4145, "step": 1839 }, { "epoch": 2.0338983050847457, "grad_norm": 0.3654785454273224, "learning_rate": 2.8338962673593196e-06, "loss": 0.4438, "step": 1840 }, { "epoch": 2.035003684598379, "grad_norm": 0.35869982838630676, "learning_rate": 2.8280958527528614e-06, "loss": 0.4122, "step": 1841 }, { "epoch": 2.036109064112012, "grad_norm": 0.373416930437088, "learning_rate": 2.822299038628762e-06, "loss": 0.4323, "step": 1842 }, { "epoch": 2.0372144436256447, "grad_norm": 0.3644184172153473, "learning_rate": 2.8165058345967134e-06, "loss": 0.3503, "step": 1843 }, { "epoch": 2.0383198231392776, "grad_norm": 0.3962327837944031, "learning_rate": 2.810716250260418e-06, "loss": 0.4326, "step": 1844 }, { "epoch": 2.039425202652911, "grad_norm": 0.37271663546562195, "learning_rate": 2.804930295217583e-06, "loss": 0.409, "step": 1845 }, { "epoch": 2.040530582166544, "grad_norm": 0.37129124999046326, "learning_rate": 2.799147979059898e-06, "loss": 0.4262, "step": 1846 }, { "epoch": 2.0416359616801767, "grad_norm": 0.3381025493144989, "learning_rate": 2.793369311373021e-06, "loss": 0.4039, "step": 1847 }, { "epoch": 2.04274134119381, "grad_norm": 0.38008350133895874, "learning_rate": 2.787594301736556e-06, "loss": 0.4255, "step": 1848 }, { "epoch": 2.043846720707443, "grad_norm": 0.36039870977401733, "learning_rate": 2.78182295972405e-06, "loss": 0.4128, "step": 1849 }, { "epoch": 2.0449521002210758, "grad_norm": 0.3875678777694702, "learning_rate": 2.7760552949029683e-06, "loss": 0.4231, "step": 1850 }, { "epoch": 2.046057479734709, "grad_norm": 0.34928736090660095, "learning_rate": 2.7702913168346767e-06, "loss": 0.4422, "step": 1851 }, { "epoch": 2.047162859248342, "grad_norm": 0.3321850001811981, "learning_rate": 2.7645310350744296e-06, "loss": 0.4028, "step": 1852 }, { "epoch": 2.048268238761975, "grad_norm": 0.3264100253582001, "learning_rate": 2.758774459171364e-06, "loss": 0.3758, "step": 1853 }, { "epoch": 2.049373618275608, "grad_norm": 0.3490244150161743, "learning_rate": 2.7530215986684584e-06, "loss": 0.42, "step": 1854 }, { "epoch": 2.050478997789241, "grad_norm": 0.3624034523963928, "learning_rate": 2.747272463102541e-06, "loss": 0.4372, "step": 1855 }, { "epoch": 2.051584377302874, "grad_norm": 0.35755500197410583, "learning_rate": 2.741527062004264e-06, "loss": 0.4042, "step": 1856 }, { "epoch": 2.052689756816507, "grad_norm": 0.3729250133037567, "learning_rate": 2.7357854048980893e-06, "loss": 0.4146, "step": 1857 }, { "epoch": 2.05379513633014, "grad_norm": 0.34821978211402893, "learning_rate": 2.7300475013022666e-06, "loss": 0.4026, "step": 1858 }, { "epoch": 2.054900515843773, "grad_norm": 0.3505648374557495, "learning_rate": 2.7243133607288296e-06, "loss": 0.3757, "step": 1859 }, { "epoch": 2.056005895357406, "grad_norm": 0.42369332909584045, "learning_rate": 2.7185829926835728e-06, "loss": 0.4373, "step": 1860 }, { "epoch": 2.057111274871039, "grad_norm": 0.3499515950679779, "learning_rate": 2.7128564066660355e-06, "loss": 0.4054, "step": 1861 }, { "epoch": 2.058216654384672, "grad_norm": 0.3536069393157959, "learning_rate": 2.7071336121694856e-06, "loss": 0.4426, "step": 1862 }, { "epoch": 2.059322033898305, "grad_norm": 0.36293312907218933, "learning_rate": 2.7014146186809077e-06, "loss": 0.401, "step": 1863 }, { "epoch": 2.0604274134119382, "grad_norm": 0.34100690484046936, "learning_rate": 2.695699435680986e-06, "loss": 0.4063, "step": 1864 }, { "epoch": 2.061532792925571, "grad_norm": 0.38878142833709717, "learning_rate": 2.6899880726440885e-06, "loss": 0.4334, "step": 1865 }, { "epoch": 2.062638172439204, "grad_norm": 0.36152786016464233, "learning_rate": 2.6842805390382453e-06, "loss": 0.3865, "step": 1866 }, { "epoch": 2.0637435519528373, "grad_norm": 0.33091670274734497, "learning_rate": 2.6785768443251437e-06, "loss": 0.3969, "step": 1867 }, { "epoch": 2.06484893146647, "grad_norm": 0.38610920310020447, "learning_rate": 2.6728769979601044e-06, "loss": 0.4645, "step": 1868 }, { "epoch": 2.065954310980103, "grad_norm": 0.31939804553985596, "learning_rate": 2.667181009392073e-06, "loss": 0.3992, "step": 1869 }, { "epoch": 2.0670596904937364, "grad_norm": 0.3500080406665802, "learning_rate": 2.661488888063591e-06, "loss": 0.4474, "step": 1870 }, { "epoch": 2.0681650700073693, "grad_norm": 0.35492923855781555, "learning_rate": 2.6558006434107974e-06, "loss": 0.3922, "step": 1871 }, { "epoch": 2.069270449521002, "grad_norm": 0.3578505218029022, "learning_rate": 2.6501162848634023e-06, "loss": 0.4258, "step": 1872 }, { "epoch": 2.070375829034635, "grad_norm": 0.33528921008110046, "learning_rate": 2.6444358218446687e-06, "loss": 0.4026, "step": 1873 }, { "epoch": 2.0714812085482683, "grad_norm": 0.333994597196579, "learning_rate": 2.6387592637714062e-06, "loss": 0.4014, "step": 1874 }, { "epoch": 2.072586588061901, "grad_norm": 0.3651977479457855, "learning_rate": 2.6330866200539564e-06, "loss": 0.4368, "step": 1875 }, { "epoch": 2.073691967575534, "grad_norm": 0.3617836534976959, "learning_rate": 2.6274179000961604e-06, "loss": 0.4241, "step": 1876 }, { "epoch": 2.0747973470891674, "grad_norm": 0.3210521340370178, "learning_rate": 2.621753113295361e-06, "loss": 0.3967, "step": 1877 }, { "epoch": 2.0759027266028003, "grad_norm": 0.3823128342628479, "learning_rate": 2.6160922690423818e-06, "loss": 0.4681, "step": 1878 }, { "epoch": 2.077008106116433, "grad_norm": 0.36844485998153687, "learning_rate": 2.6104353767215103e-06, "loss": 0.4052, "step": 1879 }, { "epoch": 2.0781134856300665, "grad_norm": 0.34701210260391235, "learning_rate": 2.6047824457104766e-06, "loss": 0.3526, "step": 1880 }, { "epoch": 2.0792188651436994, "grad_norm": 0.35781094431877136, "learning_rate": 2.599133485380451e-06, "loss": 0.4207, "step": 1881 }, { "epoch": 2.0803242446573322, "grad_norm": 0.37514442205429077, "learning_rate": 2.5934885050960183e-06, "loss": 0.5084, "step": 1882 }, { "epoch": 2.0814296241709656, "grad_norm": 0.3400008976459503, "learning_rate": 2.5878475142151695e-06, "loss": 0.4143, "step": 1883 }, { "epoch": 2.0825350036845984, "grad_norm": 0.3268665671348572, "learning_rate": 2.5822105220892733e-06, "loss": 0.3779, "step": 1884 }, { "epoch": 2.0836403831982313, "grad_norm": 0.37079328298568726, "learning_rate": 2.5765775380630773e-06, "loss": 0.4322, "step": 1885 }, { "epoch": 2.084745762711864, "grad_norm": 0.3651455342769623, "learning_rate": 2.5709485714746827e-06, "loss": 0.4298, "step": 1886 }, { "epoch": 2.0858511422254975, "grad_norm": 0.37170350551605225, "learning_rate": 2.565323631655532e-06, "loss": 0.4333, "step": 1887 }, { "epoch": 2.0869565217391304, "grad_norm": 0.32639020681381226, "learning_rate": 2.559702727930386e-06, "loss": 0.4287, "step": 1888 }, { "epoch": 2.0880619012527633, "grad_norm": 0.32918259501457214, "learning_rate": 2.554085869617322e-06, "loss": 0.4034, "step": 1889 }, { "epoch": 2.0891672807663966, "grad_norm": 0.3352969288825989, "learning_rate": 2.548473066027709e-06, "loss": 0.3929, "step": 1890 }, { "epoch": 2.0902726602800294, "grad_norm": 0.35925960540771484, "learning_rate": 2.542864326466191e-06, "loss": 0.4496, "step": 1891 }, { "epoch": 2.0913780397936623, "grad_norm": 0.32939472794532776, "learning_rate": 2.537259660230679e-06, "loss": 0.4204, "step": 1892 }, { "epoch": 2.0924834193072956, "grad_norm": 0.3406940996646881, "learning_rate": 2.531659076612329e-06, "loss": 0.4109, "step": 1893 }, { "epoch": 2.0935887988209285, "grad_norm": 0.41532135009765625, "learning_rate": 2.5260625848955324e-06, "loss": 0.4351, "step": 1894 }, { "epoch": 2.0946941783345614, "grad_norm": 0.3557802736759186, "learning_rate": 2.5204701943578904e-06, "loss": 0.3677, "step": 1895 }, { "epoch": 2.0957995578481947, "grad_norm": 0.3734782934188843, "learning_rate": 2.5148819142702095e-06, "loss": 0.407, "step": 1896 }, { "epoch": 2.0969049373618276, "grad_norm": 0.33275318145751953, "learning_rate": 2.5092977538964887e-06, "loss": 0.4067, "step": 1897 }, { "epoch": 2.0980103168754605, "grad_norm": 0.34813469648361206, "learning_rate": 2.5037177224938846e-06, "loss": 0.4446, "step": 1898 }, { "epoch": 2.099115696389094, "grad_norm": 0.3512954115867615, "learning_rate": 2.498141829312718e-06, "loss": 0.3854, "step": 1899 }, { "epoch": 2.1002210759027267, "grad_norm": 0.35331737995147705, "learning_rate": 2.492570083596447e-06, "loss": 0.4126, "step": 1900 }, { "epoch": 2.1013264554163595, "grad_norm": 0.3417462706565857, "learning_rate": 2.487002494581656e-06, "loss": 0.3954, "step": 1901 }, { "epoch": 2.1024318349299924, "grad_norm": 0.3353654146194458, "learning_rate": 2.4814390714980325e-06, "loss": 0.3869, "step": 1902 }, { "epoch": 2.1035372144436257, "grad_norm": 0.31571099162101746, "learning_rate": 2.475879823568365e-06, "loss": 0.4491, "step": 1903 }, { "epoch": 2.1046425939572586, "grad_norm": 0.36133265495300293, "learning_rate": 2.4703247600085173e-06, "loss": 0.4846, "step": 1904 }, { "epoch": 2.1057479734708915, "grad_norm": 0.34291163086891174, "learning_rate": 2.4647738900274193e-06, "loss": 0.4248, "step": 1905 }, { "epoch": 2.106853352984525, "grad_norm": 0.3293028175830841, "learning_rate": 2.459227222827043e-06, "loss": 0.3393, "step": 1906 }, { "epoch": 2.1079587324981577, "grad_norm": 0.3862082064151764, "learning_rate": 2.4536847676023995e-06, "loss": 0.4335, "step": 1907 }, { "epoch": 2.1090641120117906, "grad_norm": 0.37619277834892273, "learning_rate": 2.4481465335415162e-06, "loss": 0.4222, "step": 1908 }, { "epoch": 2.110169491525424, "grad_norm": 0.3544367253780365, "learning_rate": 2.4426125298254177e-06, "loss": 0.4636, "step": 1909 }, { "epoch": 2.1112748710390568, "grad_norm": 0.352112352848053, "learning_rate": 2.437082765628122e-06, "loss": 0.4105, "step": 1910 }, { "epoch": 2.1123802505526896, "grad_norm": 0.34878963232040405, "learning_rate": 2.431557250116617e-06, "loss": 0.4336, "step": 1911 }, { "epoch": 2.113485630066323, "grad_norm": 0.314779669046402, "learning_rate": 2.426035992450848e-06, "loss": 0.389, "step": 1912 }, { "epoch": 2.114591009579956, "grad_norm": 0.3768092095851898, "learning_rate": 2.4205190017836983e-06, "loss": 0.4813, "step": 1913 }, { "epoch": 2.1156963890935887, "grad_norm": 0.3212016224861145, "learning_rate": 2.4150062872609812e-06, "loss": 0.4039, "step": 1914 }, { "epoch": 2.116801768607222, "grad_norm": 0.32705414295196533, "learning_rate": 2.4094978580214207e-06, "loss": 0.4054, "step": 1915 }, { "epoch": 2.117907148120855, "grad_norm": 0.3437402844429016, "learning_rate": 2.4039937231966374e-06, "loss": 0.4439, "step": 1916 }, { "epoch": 2.1190125276344878, "grad_norm": 0.3402983546257019, "learning_rate": 2.398493891911127e-06, "loss": 0.4537, "step": 1917 }, { "epoch": 2.1201179071481207, "grad_norm": 0.3174438178539276, "learning_rate": 2.3929983732822607e-06, "loss": 0.3633, "step": 1918 }, { "epoch": 2.121223286661754, "grad_norm": 0.36042192578315735, "learning_rate": 2.387507176420256e-06, "loss": 0.4542, "step": 1919 }, { "epoch": 2.122328666175387, "grad_norm": 0.3303794860839844, "learning_rate": 2.3820203104281616e-06, "loss": 0.4107, "step": 1920 }, { "epoch": 2.1234340456890197, "grad_norm": 0.3525560200214386, "learning_rate": 2.3765377844018518e-06, "loss": 0.3948, "step": 1921 }, { "epoch": 2.124539425202653, "grad_norm": 0.3492959439754486, "learning_rate": 2.3710596074300045e-06, "loss": 0.4491, "step": 1922 }, { "epoch": 2.125644804716286, "grad_norm": 0.30542507767677307, "learning_rate": 2.365585788594092e-06, "loss": 0.3764, "step": 1923 }, { "epoch": 2.126750184229919, "grad_norm": 0.35655155777931213, "learning_rate": 2.360116336968353e-06, "loss": 0.4689, "step": 1924 }, { "epoch": 2.127855563743552, "grad_norm": 0.36612871289253235, "learning_rate": 2.354651261619794e-06, "loss": 0.4175, "step": 1925 }, { "epoch": 2.128960943257185, "grad_norm": 0.3415554165840149, "learning_rate": 2.3491905716081668e-06, "loss": 0.4164, "step": 1926 }, { "epoch": 2.130066322770818, "grad_norm": 0.35355016589164734, "learning_rate": 2.3437342759859472e-06, "loss": 0.4381, "step": 1927 }, { "epoch": 2.131171702284451, "grad_norm": 0.33092787861824036, "learning_rate": 2.3382823837983314e-06, "loss": 0.4187, "step": 1928 }, { "epoch": 2.132277081798084, "grad_norm": 0.34183257818222046, "learning_rate": 2.3328349040832145e-06, "loss": 0.4155, "step": 1929 }, { "epoch": 2.133382461311717, "grad_norm": 0.3600374162197113, "learning_rate": 2.327391845871179e-06, "loss": 0.4551, "step": 1930 }, { "epoch": 2.13448784082535, "grad_norm": 0.3678256571292877, "learning_rate": 2.321953218185471e-06, "loss": 0.4286, "step": 1931 }, { "epoch": 2.135593220338983, "grad_norm": 0.3661292791366577, "learning_rate": 2.316519030041998e-06, "loss": 0.4238, "step": 1932 }, { "epoch": 2.136698599852616, "grad_norm": 0.3765193521976471, "learning_rate": 2.311089290449307e-06, "loss": 0.4051, "step": 1933 }, { "epoch": 2.137803979366249, "grad_norm": 0.350040465593338, "learning_rate": 2.3056640084085707e-06, "loss": 0.4103, "step": 1934 }, { "epoch": 2.138909358879882, "grad_norm": 0.3242654800415039, "learning_rate": 2.300243192913568e-06, "loss": 0.3989, "step": 1935 }, { "epoch": 2.140014738393515, "grad_norm": 0.34167516231536865, "learning_rate": 2.2948268529506768e-06, "loss": 0.4431, "step": 1936 }, { "epoch": 2.141120117907148, "grad_norm": 0.3467220664024353, "learning_rate": 2.289414997498856e-06, "loss": 0.4503, "step": 1937 }, { "epoch": 2.1422254974207813, "grad_norm": 0.3371836841106415, "learning_rate": 2.284007635529632e-06, "loss": 0.414, "step": 1938 }, { "epoch": 2.143330876934414, "grad_norm": 0.3220456540584564, "learning_rate": 2.2786047760070736e-06, "loss": 0.3976, "step": 1939 }, { "epoch": 2.144436256448047, "grad_norm": 0.35219806432724, "learning_rate": 2.2732064278877975e-06, "loss": 0.4434, "step": 1940 }, { "epoch": 2.1455416359616803, "grad_norm": 0.32847273349761963, "learning_rate": 2.267812600120935e-06, "loss": 0.4026, "step": 1941 }, { "epoch": 2.1466470154753132, "grad_norm": 0.3447869122028351, "learning_rate": 2.2624233016481224e-06, "loss": 0.402, "step": 1942 }, { "epoch": 2.147752394988946, "grad_norm": 0.33457425236701965, "learning_rate": 2.25703854140349e-06, "loss": 0.4086, "step": 1943 }, { "epoch": 2.1488577745025794, "grad_norm": 0.350190132856369, "learning_rate": 2.2516583283136474e-06, "loss": 0.431, "step": 1944 }, { "epoch": 2.1499631540162123, "grad_norm": 0.3282012939453125, "learning_rate": 2.246282671297659e-06, "loss": 0.428, "step": 1945 }, { "epoch": 2.151068533529845, "grad_norm": 0.36859315633773804, "learning_rate": 2.2409115792670434e-06, "loss": 0.4446, "step": 1946 }, { "epoch": 2.1521739130434785, "grad_norm": 0.3447286784648895, "learning_rate": 2.235545061125748e-06, "loss": 0.3805, "step": 1947 }, { "epoch": 2.1532792925571114, "grad_norm": 0.3373052477836609, "learning_rate": 2.2301831257701405e-06, "loss": 0.4297, "step": 1948 }, { "epoch": 2.1543846720707442, "grad_norm": 0.3072667717933655, "learning_rate": 2.2248257820889863e-06, "loss": 0.3751, "step": 1949 }, { "epoch": 2.155490051584377, "grad_norm": 0.3555934727191925, "learning_rate": 2.2194730389634444e-06, "loss": 0.418, "step": 1950 }, { "epoch": 2.1565954310980104, "grad_norm": 0.36960074305534363, "learning_rate": 2.2141249052670442e-06, "loss": 0.4435, "step": 1951 }, { "epoch": 2.1577008106116433, "grad_norm": 0.3191264569759369, "learning_rate": 2.2087813898656775e-06, "loss": 0.3787, "step": 1952 }, { "epoch": 2.158806190125276, "grad_norm": 0.3658792972564697, "learning_rate": 2.2034425016175733e-06, "loss": 0.4541, "step": 1953 }, { "epoch": 2.1599115696389095, "grad_norm": 0.33981823921203613, "learning_rate": 2.1981082493732945e-06, "loss": 0.4073, "step": 1954 }, { "epoch": 2.1610169491525424, "grad_norm": 0.32842883467674255, "learning_rate": 2.1927786419757196e-06, "loss": 0.4652, "step": 1955 }, { "epoch": 2.1621223286661753, "grad_norm": 0.34609612822532654, "learning_rate": 2.187453688260027e-06, "loss": 0.3987, "step": 1956 }, { "epoch": 2.1632277081798086, "grad_norm": 0.36678487062454224, "learning_rate": 2.182133397053675e-06, "loss": 0.4459, "step": 1957 }, { "epoch": 2.1643330876934415, "grad_norm": 0.354952335357666, "learning_rate": 2.176817777176398e-06, "loss": 0.401, "step": 1958 }, { "epoch": 2.1654384672070743, "grad_norm": 0.3504767417907715, "learning_rate": 2.171506837440188e-06, "loss": 0.3985, "step": 1959 }, { "epoch": 2.1665438467207077, "grad_norm": 0.35410311818122864, "learning_rate": 2.166200586649272e-06, "loss": 0.4266, "step": 1960 }, { "epoch": 2.1676492262343405, "grad_norm": 0.36724165081977844, "learning_rate": 2.1608990336001056e-06, "loss": 0.448, "step": 1961 }, { "epoch": 2.1687546057479734, "grad_norm": 0.31017065048217773, "learning_rate": 2.1556021870813653e-06, "loss": 0.3523, "step": 1962 }, { "epoch": 2.1698599852616063, "grad_norm": 0.32812589406967163, "learning_rate": 2.1503100558739133e-06, "loss": 0.3996, "step": 1963 }, { "epoch": 2.1709653647752396, "grad_norm": 0.3158884644508362, "learning_rate": 2.1450226487508017e-06, "loss": 0.4289, "step": 1964 }, { "epoch": 2.1720707442888725, "grad_norm": 0.3489215672016144, "learning_rate": 2.1397399744772497e-06, "loss": 0.4368, "step": 1965 }, { "epoch": 2.1731761238025054, "grad_norm": 0.36491674184799194, "learning_rate": 2.134462041810632e-06, "loss": 0.3965, "step": 1966 }, { "epoch": 2.1742815033161387, "grad_norm": 0.3603987991809845, "learning_rate": 2.129188859500459e-06, "loss": 0.4308, "step": 1967 }, { "epoch": 2.1753868828297716, "grad_norm": 0.32269829511642456, "learning_rate": 2.12392043628837e-06, "loss": 0.4339, "step": 1968 }, { "epoch": 2.1764922623434044, "grad_norm": 0.3263040781021118, "learning_rate": 2.1186567809081134e-06, "loss": 0.4252, "step": 1969 }, { "epoch": 2.1775976418570377, "grad_norm": 0.3276136815547943, "learning_rate": 2.1133979020855357e-06, "loss": 0.3685, "step": 1970 }, { "epoch": 2.1787030213706706, "grad_norm": 0.3638245165348053, "learning_rate": 2.1081438085385604e-06, "loss": 0.4803, "step": 1971 }, { "epoch": 2.1798084008843035, "grad_norm": 0.30910393595695496, "learning_rate": 2.102894508977182e-06, "loss": 0.3687, "step": 1972 }, { "epoch": 2.180913780397937, "grad_norm": 0.3373703360557556, "learning_rate": 2.097650012103447e-06, "loss": 0.4535, "step": 1973 }, { "epoch": 2.1820191599115697, "grad_norm": 0.3205033838748932, "learning_rate": 2.0924103266114422e-06, "loss": 0.3984, "step": 1974 }, { "epoch": 2.1831245394252026, "grad_norm": 0.3237846791744232, "learning_rate": 2.0871754611872717e-06, "loss": 0.4228, "step": 1975 }, { "epoch": 2.1842299189388354, "grad_norm": 0.3245994746685028, "learning_rate": 2.081945424509057e-06, "loss": 0.4218, "step": 1976 }, { "epoch": 2.1853352984524688, "grad_norm": 0.3319624662399292, "learning_rate": 2.0767202252469113e-06, "loss": 0.4167, "step": 1977 }, { "epoch": 2.1864406779661016, "grad_norm": 0.3249664306640625, "learning_rate": 2.0714998720629264e-06, "loss": 0.3834, "step": 1978 }, { "epoch": 2.1875460574797345, "grad_norm": 0.35937973856925964, "learning_rate": 2.066284373611163e-06, "loss": 0.4249, "step": 1979 }, { "epoch": 2.188651436993368, "grad_norm": 0.3293467164039612, "learning_rate": 2.061073738537635e-06, "loss": 0.4119, "step": 1980 }, { "epoch": 2.1897568165070007, "grad_norm": 0.34505555033683777, "learning_rate": 2.0558679754802927e-06, "loss": 0.4437, "step": 1981 }, { "epoch": 2.1908621960206336, "grad_norm": 0.3219762444496155, "learning_rate": 2.0506670930690074e-06, "loss": 0.4392, "step": 1982 }, { "epoch": 2.191967575534267, "grad_norm": 0.31965070962905884, "learning_rate": 2.045471099925561e-06, "loss": 0.4043, "step": 1983 }, { "epoch": 2.1930729550479, "grad_norm": 0.34870442748069763, "learning_rate": 2.040280004663637e-06, "loss": 0.4385, "step": 1984 }, { "epoch": 2.1941783345615327, "grad_norm": 0.31959447264671326, "learning_rate": 2.035093815888787e-06, "loss": 0.3749, "step": 1985 }, { "epoch": 2.195283714075166, "grad_norm": 0.3346288502216339, "learning_rate": 2.0299125421984367e-06, "loss": 0.4632, "step": 1986 }, { "epoch": 2.196389093588799, "grad_norm": 0.37376829981803894, "learning_rate": 2.0247361921818638e-06, "loss": 0.4392, "step": 1987 }, { "epoch": 2.1974944731024317, "grad_norm": 0.3364272713661194, "learning_rate": 2.0195647744201826e-06, "loss": 0.3877, "step": 1988 }, { "epoch": 2.198599852616065, "grad_norm": 0.4412841200828552, "learning_rate": 2.0143982974863267e-06, "loss": 0.4531, "step": 1989 }, { "epoch": 2.199705232129698, "grad_norm": 0.3371700942516327, "learning_rate": 2.0092367699450466e-06, "loss": 0.3968, "step": 1990 }, { "epoch": 2.200810611643331, "grad_norm": 0.3318536877632141, "learning_rate": 2.0040802003528826e-06, "loss": 0.4197, "step": 1991 }, { "epoch": 2.201915991156964, "grad_norm": 0.33967965841293335, "learning_rate": 1.9989285972581595e-06, "loss": 0.4022, "step": 1992 }, { "epoch": 2.203021370670597, "grad_norm": 0.3292958736419678, "learning_rate": 1.993781969200964e-06, "loss": 0.4852, "step": 1993 }, { "epoch": 2.20412675018423, "grad_norm": 0.35078439116477966, "learning_rate": 1.9886403247131395e-06, "loss": 0.4349, "step": 1994 }, { "epoch": 2.2052321296978628, "grad_norm": 0.35522106289863586, "learning_rate": 1.983503672318269e-06, "loss": 0.4271, "step": 1995 }, { "epoch": 2.206337509211496, "grad_norm": 0.38820120692253113, "learning_rate": 1.9783720205316535e-06, "loss": 0.4312, "step": 1996 }, { "epoch": 2.207442888725129, "grad_norm": 0.3250499665737152, "learning_rate": 1.9732453778603104e-06, "loss": 0.4101, "step": 1997 }, { "epoch": 2.208548268238762, "grad_norm": 0.3426540791988373, "learning_rate": 1.9681237528029513e-06, "loss": 0.425, "step": 1998 }, { "epoch": 2.209653647752395, "grad_norm": 0.3427170217037201, "learning_rate": 1.96300715384997e-06, "loss": 0.4224, "step": 1999 }, { "epoch": 2.210759027266028, "grad_norm": 0.34691473841667175, "learning_rate": 1.957895589483426e-06, "loss": 0.4185, "step": 2000 }, { "epoch": 2.211864406779661, "grad_norm": 0.35873961448669434, "learning_rate": 1.9527890681770357e-06, "loss": 0.4547, "step": 2001 }, { "epoch": 2.212969786293294, "grad_norm": 0.3665260374546051, "learning_rate": 1.947687598396154e-06, "loss": 0.388, "step": 2002 }, { "epoch": 2.214075165806927, "grad_norm": 0.36420339345932007, "learning_rate": 1.942591188597764e-06, "loss": 0.4348, "step": 2003 }, { "epoch": 2.21518054532056, "grad_norm": 0.3523566424846649, "learning_rate": 1.9374998472304523e-06, "loss": 0.4039, "step": 2004 }, { "epoch": 2.2162859248341933, "grad_norm": 0.34955766797065735, "learning_rate": 1.932413582734416e-06, "loss": 0.4528, "step": 2005 }, { "epoch": 2.217391304347826, "grad_norm": 0.304375022649765, "learning_rate": 1.927332403541428e-06, "loss": 0.3427, "step": 2006 }, { "epoch": 2.218496683861459, "grad_norm": 0.3726739287376404, "learning_rate": 1.92225631807483e-06, "loss": 0.4572, "step": 2007 }, { "epoch": 2.219602063375092, "grad_norm": 0.33875399827957153, "learning_rate": 1.9171853347495234e-06, "loss": 0.4328, "step": 2008 }, { "epoch": 2.2207074428887252, "grad_norm": 0.379548043012619, "learning_rate": 1.91211946197195e-06, "loss": 0.4258, "step": 2009 }, { "epoch": 2.221812822402358, "grad_norm": 0.3388579785823822, "learning_rate": 1.9070587081400815e-06, "loss": 0.3893, "step": 2010 }, { "epoch": 2.222918201915991, "grad_norm": 0.35633447766304016, "learning_rate": 1.9020030816433982e-06, "loss": 0.4619, "step": 2011 }, { "epoch": 2.2240235814296243, "grad_norm": 0.3147948086261749, "learning_rate": 1.896952590862886e-06, "loss": 0.4148, "step": 2012 }, { "epoch": 2.225128960943257, "grad_norm": 0.33281680941581726, "learning_rate": 1.8919072441710169e-06, "loss": 0.4398, "step": 2013 }, { "epoch": 2.22623434045689, "grad_norm": 0.3416505753993988, "learning_rate": 1.8868670499317298e-06, "loss": 0.3987, "step": 2014 }, { "epoch": 2.2273397199705234, "grad_norm": 0.33138781785964966, "learning_rate": 1.8818320165004284e-06, "loss": 0.4122, "step": 2015 }, { "epoch": 2.2284450994841563, "grad_norm": 0.3420338034629822, "learning_rate": 1.8768021522239576e-06, "loss": 0.4689, "step": 2016 }, { "epoch": 2.229550478997789, "grad_norm": 0.3416149318218231, "learning_rate": 1.8717774654405962e-06, "loss": 0.4466, "step": 2017 }, { "epoch": 2.2306558585114225, "grad_norm": 0.3379804491996765, "learning_rate": 1.8667579644800344e-06, "loss": 0.3577, "step": 2018 }, { "epoch": 2.2317612380250553, "grad_norm": 0.3421051502227783, "learning_rate": 1.8617436576633708e-06, "loss": 0.4096, "step": 2019 }, { "epoch": 2.232866617538688, "grad_norm": 0.36087939143180847, "learning_rate": 1.856734553303091e-06, "loss": 0.4111, "step": 2020 }, { "epoch": 2.233971997052321, "grad_norm": 0.3535141348838806, "learning_rate": 1.8517306597030593e-06, "loss": 0.4189, "step": 2021 }, { "epoch": 2.2350773765659544, "grad_norm": 0.33633002638816833, "learning_rate": 1.8467319851584952e-06, "loss": 0.3987, "step": 2022 }, { "epoch": 2.2361827560795873, "grad_norm": 0.3204532265663147, "learning_rate": 1.8417385379559733e-06, "loss": 0.4179, "step": 2023 }, { "epoch": 2.23728813559322, "grad_norm": 0.34462791681289673, "learning_rate": 1.8367503263733983e-06, "loss": 0.4149, "step": 2024 }, { "epoch": 2.2383935151068535, "grad_norm": 0.32982438802719116, "learning_rate": 1.8317673586799995e-06, "loss": 0.4019, "step": 2025 }, { "epoch": 2.2394988946204863, "grad_norm": 0.3728993833065033, "learning_rate": 1.8267896431363048e-06, "loss": 0.444, "step": 2026 }, { "epoch": 2.2406042741341192, "grad_norm": 0.3290599286556244, "learning_rate": 1.8218171879941465e-06, "loss": 0.3733, "step": 2027 }, { "epoch": 2.2417096536477525, "grad_norm": 0.3578796088695526, "learning_rate": 1.8168500014966316e-06, "loss": 0.4158, "step": 2028 }, { "epoch": 2.2428150331613854, "grad_norm": 0.32496416568756104, "learning_rate": 1.8118880918781278e-06, "loss": 0.3855, "step": 2029 }, { "epoch": 2.2439204126750183, "grad_norm": 0.35857516527175903, "learning_rate": 1.8069314673642624e-06, "loss": 0.4447, "step": 2030 }, { "epoch": 2.2450257921886516, "grad_norm": 0.33323249220848083, "learning_rate": 1.8019801361718996e-06, "loss": 0.4043, "step": 2031 }, { "epoch": 2.2461311717022845, "grad_norm": 0.3403133749961853, "learning_rate": 1.7970341065091246e-06, "loss": 0.4028, "step": 2032 }, { "epoch": 2.2472365512159174, "grad_norm": 0.3676002025604248, "learning_rate": 1.7920933865752382e-06, "loss": 0.4495, "step": 2033 }, { "epoch": 2.2483419307295507, "grad_norm": 0.3205709457397461, "learning_rate": 1.7871579845607378e-06, "loss": 0.4107, "step": 2034 }, { "epoch": 2.2494473102431836, "grad_norm": 0.347301721572876, "learning_rate": 1.7822279086473065e-06, "loss": 0.41, "step": 2035 }, { "epoch": 2.2505526897568164, "grad_norm": 0.3568117618560791, "learning_rate": 1.7773031670077934e-06, "loss": 0.4277, "step": 2036 }, { "epoch": 2.2516580692704498, "grad_norm": 0.36117807030677795, "learning_rate": 1.7723837678062083e-06, "loss": 0.4342, "step": 2037 }, { "epoch": 2.2527634487840826, "grad_norm": 0.33948928117752075, "learning_rate": 1.7674697191977053e-06, "loss": 0.4131, "step": 2038 }, { "epoch": 2.2538688282977155, "grad_norm": 0.3310565650463104, "learning_rate": 1.7625610293285677e-06, "loss": 0.4064, "step": 2039 }, { "epoch": 2.2549742078113484, "grad_norm": 0.4105362594127655, "learning_rate": 1.757657706336192e-06, "loss": 0.4013, "step": 2040 }, { "epoch": 2.2560795873249817, "grad_norm": 0.34031614661216736, "learning_rate": 1.7527597583490825e-06, "loss": 0.464, "step": 2041 }, { "epoch": 2.2571849668386146, "grad_norm": 0.30544036626815796, "learning_rate": 1.7478671934868302e-06, "loss": 0.3756, "step": 2042 }, { "epoch": 2.2582903463522475, "grad_norm": 0.35222378373146057, "learning_rate": 1.7429800198601055e-06, "loss": 0.3973, "step": 2043 }, { "epoch": 2.2593957258658808, "grad_norm": 0.36738547682762146, "learning_rate": 1.7380982455706353e-06, "loss": 0.4617, "step": 2044 }, { "epoch": 2.2605011053795137, "grad_norm": 0.3436881899833679, "learning_rate": 1.7332218787112014e-06, "loss": 0.389, "step": 2045 }, { "epoch": 2.2616064848931465, "grad_norm": 0.3213620185852051, "learning_rate": 1.72835092736562e-06, "loss": 0.4014, "step": 2046 }, { "epoch": 2.26271186440678, "grad_norm": 0.31369879841804504, "learning_rate": 1.7234853996087304e-06, "loss": 0.4012, "step": 2047 }, { "epoch": 2.2638172439204127, "grad_norm": 0.3621673285961151, "learning_rate": 1.7186253035063738e-06, "loss": 0.4863, "step": 2048 }, { "epoch": 2.2649226234340456, "grad_norm": 0.31270501017570496, "learning_rate": 1.7137706471153997e-06, "loss": 0.3658, "step": 2049 }, { "epoch": 2.266028002947679, "grad_norm": 0.349442720413208, "learning_rate": 1.7089214384836322e-06, "loss": 0.4253, "step": 2050 }, { "epoch": 2.267133382461312, "grad_norm": 0.344448059797287, "learning_rate": 1.704077685649862e-06, "loss": 0.398, "step": 2051 }, { "epoch": 2.2682387619749447, "grad_norm": 0.36248019337654114, "learning_rate": 1.699239396643841e-06, "loss": 0.4309, "step": 2052 }, { "epoch": 2.2693441414885775, "grad_norm": 0.315403014421463, "learning_rate": 1.6944065794862624e-06, "loss": 0.3821, "step": 2053 }, { "epoch": 2.270449521002211, "grad_norm": 0.31365254521369934, "learning_rate": 1.6895792421887437e-06, "loss": 0.4102, "step": 2054 }, { "epoch": 2.2715549005158437, "grad_norm": 0.3616870939731598, "learning_rate": 1.6847573927538235e-06, "loss": 0.4686, "step": 2055 }, { "epoch": 2.2726602800294766, "grad_norm": 0.36700835824012756, "learning_rate": 1.6799410391749416e-06, "loss": 0.4076, "step": 2056 }, { "epoch": 2.27376565954311, "grad_norm": 0.3565669357776642, "learning_rate": 1.6751301894364274e-06, "loss": 0.426, "step": 2057 }, { "epoch": 2.274871039056743, "grad_norm": 0.31520798802375793, "learning_rate": 1.670324851513483e-06, "loss": 0.3861, "step": 2058 }, { "epoch": 2.2759764185703757, "grad_norm": 0.3399445116519928, "learning_rate": 1.6655250333721757e-06, "loss": 0.4808, "step": 2059 }, { "epoch": 2.277081798084009, "grad_norm": 0.32186129689216614, "learning_rate": 1.6607307429694237e-06, "loss": 0.3873, "step": 2060 }, { "epoch": 2.278187177597642, "grad_norm": 0.4001791179180145, "learning_rate": 1.655941988252981e-06, "loss": 0.4297, "step": 2061 }, { "epoch": 2.2792925571112748, "grad_norm": 0.3371703624725342, "learning_rate": 1.6511587771614208e-06, "loss": 0.3543, "step": 2062 }, { "epoch": 2.280397936624908, "grad_norm": 0.37130147218704224, "learning_rate": 1.6463811176241312e-06, "loss": 0.4604, "step": 2063 }, { "epoch": 2.281503316138541, "grad_norm": 0.3371886610984802, "learning_rate": 1.6416090175612958e-06, "loss": 0.4228, "step": 2064 }, { "epoch": 2.282608695652174, "grad_norm": 0.35105475783348083, "learning_rate": 1.6368424848838826e-06, "loss": 0.3854, "step": 2065 }, { "epoch": 2.2837140751658067, "grad_norm": 0.33856359124183655, "learning_rate": 1.6320815274936269e-06, "loss": 0.3878, "step": 2066 }, { "epoch": 2.28481945467944, "grad_norm": 0.3187699615955353, "learning_rate": 1.6273261532830242e-06, "loss": 0.3723, "step": 2067 }, { "epoch": 2.285924834193073, "grad_norm": 0.35689717531204224, "learning_rate": 1.622576370135317e-06, "loss": 0.4939, "step": 2068 }, { "epoch": 2.287030213706706, "grad_norm": 0.29181137681007385, "learning_rate": 1.6178321859244727e-06, "loss": 0.383, "step": 2069 }, { "epoch": 2.288135593220339, "grad_norm": 0.36193007230758667, "learning_rate": 1.613093608515181e-06, "loss": 0.4364, "step": 2070 }, { "epoch": 2.289240972733972, "grad_norm": 0.30966687202453613, "learning_rate": 1.6083606457628408e-06, "loss": 0.4144, "step": 2071 }, { "epoch": 2.290346352247605, "grad_norm": 0.3479548990726471, "learning_rate": 1.6036333055135345e-06, "loss": 0.453, "step": 2072 }, { "epoch": 2.291451731761238, "grad_norm": 0.32208192348480225, "learning_rate": 1.5989115956040307e-06, "loss": 0.3776, "step": 2073 }, { "epoch": 2.292557111274871, "grad_norm": 0.3521566390991211, "learning_rate": 1.5941955238617612e-06, "loss": 0.461, "step": 2074 }, { "epoch": 2.293662490788504, "grad_norm": 0.31235620379447937, "learning_rate": 1.5894850981048133e-06, "loss": 0.4108, "step": 2075 }, { "epoch": 2.2947678703021372, "grad_norm": 0.3165307939052582, "learning_rate": 1.5847803261419109e-06, "loss": 0.4208, "step": 2076 }, { "epoch": 2.29587324981577, "grad_norm": 0.31744468212127686, "learning_rate": 1.5800812157724084e-06, "loss": 0.3961, "step": 2077 }, { "epoch": 2.296978629329403, "grad_norm": 0.3507217764854431, "learning_rate": 1.575387774786274e-06, "loss": 0.3925, "step": 2078 }, { "epoch": 2.298084008843036, "grad_norm": 0.37014636397361755, "learning_rate": 1.5707000109640797e-06, "loss": 0.4499, "step": 2079 }, { "epoch": 2.299189388356669, "grad_norm": 0.3149360716342926, "learning_rate": 1.5660179320769792e-06, "loss": 0.413, "step": 2080 }, { "epoch": 2.300294767870302, "grad_norm": 0.3302253782749176, "learning_rate": 1.5613415458867093e-06, "loss": 0.4533, "step": 2081 }, { "epoch": 2.3014001473839354, "grad_norm": 0.32403290271759033, "learning_rate": 1.556670860145567e-06, "loss": 0.4092, "step": 2082 }, { "epoch": 2.3025055268975683, "grad_norm": 0.3313794434070587, "learning_rate": 1.5520058825964002e-06, "loss": 0.4408, "step": 2083 }, { "epoch": 2.303610906411201, "grad_norm": 0.33240807056427, "learning_rate": 1.5473466209725907e-06, "loss": 0.4154, "step": 2084 }, { "epoch": 2.304716285924834, "grad_norm": 0.34203657507896423, "learning_rate": 1.5426930829980485e-06, "loss": 0.4154, "step": 2085 }, { "epoch": 2.3058216654384673, "grad_norm": 0.33638232946395874, "learning_rate": 1.5380452763871951e-06, "loss": 0.4223, "step": 2086 }, { "epoch": 2.3069270449521, "grad_norm": 0.3018338978290558, "learning_rate": 1.533403208844947e-06, "loss": 0.4178, "step": 2087 }, { "epoch": 2.308032424465733, "grad_norm": 0.31290650367736816, "learning_rate": 1.5287668880667107e-06, "loss": 0.3829, "step": 2088 }, { "epoch": 2.3091378039793664, "grad_norm": 0.31923508644104004, "learning_rate": 1.5241363217383642e-06, "loss": 0.4336, "step": 2089 }, { "epoch": 2.3102431834929993, "grad_norm": 0.33488574624061584, "learning_rate": 1.5195115175362485e-06, "loss": 0.3903, "step": 2090 }, { "epoch": 2.311348563006632, "grad_norm": 0.3407267928123474, "learning_rate": 1.5148924831271473e-06, "loss": 0.4387, "step": 2091 }, { "epoch": 2.3124539425202655, "grad_norm": 0.3186940550804138, "learning_rate": 1.5102792261682813e-06, "loss": 0.4191, "step": 2092 }, { "epoch": 2.3135593220338984, "grad_norm": 0.3189232647418976, "learning_rate": 1.5056717543073013e-06, "loss": 0.4084, "step": 2093 }, { "epoch": 2.3146647015475312, "grad_norm": 0.3451037108898163, "learning_rate": 1.5010700751822555e-06, "loss": 0.4329, "step": 2094 }, { "epoch": 2.3157700810611646, "grad_norm": 0.34534433484077454, "learning_rate": 1.4964741964215967e-06, "loss": 0.4046, "step": 2095 }, { "epoch": 2.3168754605747974, "grad_norm": 0.32234466075897217, "learning_rate": 1.4918841256441603e-06, "loss": 0.4375, "step": 2096 }, { "epoch": 2.3179808400884303, "grad_norm": 0.30033695697784424, "learning_rate": 1.487299870459155e-06, "loss": 0.3791, "step": 2097 }, { "epoch": 2.319086219602063, "grad_norm": 0.34484803676605225, "learning_rate": 1.4827214384661447e-06, "loss": 0.4402, "step": 2098 }, { "epoch": 2.3201915991156965, "grad_norm": 0.326983243227005, "learning_rate": 1.4781488372550434e-06, "loss": 0.4334, "step": 2099 }, { "epoch": 2.3212969786293294, "grad_norm": 0.34243327379226685, "learning_rate": 1.473582074406099e-06, "loss": 0.4523, "step": 2100 }, { "epoch": 2.3224023581429623, "grad_norm": 0.329102486371994, "learning_rate": 1.4690211574898805e-06, "loss": 0.4118, "step": 2101 }, { "epoch": 2.3235077376565956, "grad_norm": 0.347956120967865, "learning_rate": 1.4644660940672628e-06, "loss": 0.3889, "step": 2102 }, { "epoch": 2.3246131171702284, "grad_norm": 0.3199414014816284, "learning_rate": 1.4599168916894208e-06, "loss": 0.408, "step": 2103 }, { "epoch": 2.3257184966838613, "grad_norm": 0.3404994010925293, "learning_rate": 1.455373557897814e-06, "loss": 0.4619, "step": 2104 }, { "epoch": 2.3268238761974946, "grad_norm": 0.33072566986083984, "learning_rate": 1.4508361002241677e-06, "loss": 0.4195, "step": 2105 }, { "epoch": 2.3279292557111275, "grad_norm": 0.31469571590423584, "learning_rate": 1.4463045261904718e-06, "loss": 0.3925, "step": 2106 }, { "epoch": 2.3290346352247604, "grad_norm": 0.3048354983329773, "learning_rate": 1.4417788433089596e-06, "loss": 0.3767, "step": 2107 }, { "epoch": 2.3301400147383937, "grad_norm": 0.3205545246601105, "learning_rate": 1.4372590590821012e-06, "loss": 0.4711, "step": 2108 }, { "epoch": 2.3312453942520266, "grad_norm": 0.31059780716896057, "learning_rate": 1.4327451810025828e-06, "loss": 0.3898, "step": 2109 }, { "epoch": 2.3323507737656595, "grad_norm": 0.3144071102142334, "learning_rate": 1.4282372165533042e-06, "loss": 0.4189, "step": 2110 }, { "epoch": 2.3334561532792923, "grad_norm": 0.3259497880935669, "learning_rate": 1.4237351732073617e-06, "loss": 0.4123, "step": 2111 }, { "epoch": 2.3345615327929257, "grad_norm": 0.34411320090293884, "learning_rate": 1.4192390584280347e-06, "loss": 0.4407, "step": 2112 }, { "epoch": 2.3356669123065585, "grad_norm": 0.3053654432296753, "learning_rate": 1.4147488796687714e-06, "loss": 0.3813, "step": 2113 }, { "epoch": 2.3367722918201914, "grad_norm": 0.3571425974369049, "learning_rate": 1.4102646443731866e-06, "loss": 0.3944, "step": 2114 }, { "epoch": 2.3378776713338247, "grad_norm": 0.31851640343666077, "learning_rate": 1.4057863599750382e-06, "loss": 0.3949, "step": 2115 }, { "epoch": 2.3389830508474576, "grad_norm": 0.2955031991004944, "learning_rate": 1.4013140338982168e-06, "loss": 0.3979, "step": 2116 }, { "epoch": 2.3400884303610905, "grad_norm": 0.330147922039032, "learning_rate": 1.3968476735567392e-06, "loss": 0.4612, "step": 2117 }, { "epoch": 2.341193809874724, "grad_norm": 0.3475954234600067, "learning_rate": 1.392387286354731e-06, "loss": 0.3863, "step": 2118 }, { "epoch": 2.3422991893883567, "grad_norm": 0.32387781143188477, "learning_rate": 1.3879328796864177e-06, "loss": 0.4321, "step": 2119 }, { "epoch": 2.3434045689019896, "grad_norm": 0.3207874298095703, "learning_rate": 1.3834844609361064e-06, "loss": 0.4041, "step": 2120 }, { "epoch": 2.344509948415623, "grad_norm": 0.309574156999588, "learning_rate": 1.3790420374781804e-06, "loss": 0.3943, "step": 2121 }, { "epoch": 2.3456153279292558, "grad_norm": 0.36055833101272583, "learning_rate": 1.3746056166770872e-06, "loss": 0.4263, "step": 2122 }, { "epoch": 2.3467207074428886, "grad_norm": 0.3681198060512543, "learning_rate": 1.3701752058873157e-06, "loss": 0.461, "step": 2123 }, { "epoch": 2.3478260869565215, "grad_norm": 0.3137659430503845, "learning_rate": 1.3657508124533992e-06, "loss": 0.3877, "step": 2124 }, { "epoch": 2.348931466470155, "grad_norm": 0.3424391448497772, "learning_rate": 1.3613324437098918e-06, "loss": 0.4472, "step": 2125 }, { "epoch": 2.3500368459837877, "grad_norm": 0.3454430103302002, "learning_rate": 1.3569201069813626e-06, "loss": 0.4186, "step": 2126 }, { "epoch": 2.351142225497421, "grad_norm": 0.357544869184494, "learning_rate": 1.352513809582377e-06, "loss": 0.4472, "step": 2127 }, { "epoch": 2.352247605011054, "grad_norm": 0.30943143367767334, "learning_rate": 1.3481135588174926e-06, "loss": 0.3873, "step": 2128 }, { "epoch": 2.3533529845246868, "grad_norm": 0.3160175681114197, "learning_rate": 1.3437193619812417e-06, "loss": 0.4019, "step": 2129 }, { "epoch": 2.3544583640383197, "grad_norm": 0.3325958251953125, "learning_rate": 1.3393312263581222e-06, "loss": 0.4412, "step": 2130 }, { "epoch": 2.355563743551953, "grad_norm": 0.33506253361701965, "learning_rate": 1.33494915922258e-06, "loss": 0.3953, "step": 2131 }, { "epoch": 2.356669123065586, "grad_norm": 0.31912481784820557, "learning_rate": 1.330573167839005e-06, "loss": 0.4192, "step": 2132 }, { "epoch": 2.3577745025792187, "grad_norm": 0.36964139342308044, "learning_rate": 1.3262032594617136e-06, "loss": 0.4634, "step": 2133 }, { "epoch": 2.358879882092852, "grad_norm": 0.32596319913864136, "learning_rate": 1.3218394413349389e-06, "loss": 0.4009, "step": 2134 }, { "epoch": 2.359985261606485, "grad_norm": 0.3360520303249359, "learning_rate": 1.317481720692813e-06, "loss": 0.4181, "step": 2135 }, { "epoch": 2.361090641120118, "grad_norm": 0.31723350286483765, "learning_rate": 1.3131301047593685e-06, "loss": 0.3995, "step": 2136 }, { "epoch": 2.362196020633751, "grad_norm": 0.3292984664440155, "learning_rate": 1.3087846007485134e-06, "loss": 0.3791, "step": 2137 }, { "epoch": 2.363301400147384, "grad_norm": 0.3441758453845978, "learning_rate": 1.3044452158640197e-06, "loss": 0.407, "step": 2138 }, { "epoch": 2.364406779661017, "grad_norm": 0.35691651701927185, "learning_rate": 1.3001119572995214e-06, "loss": 0.4086, "step": 2139 }, { "epoch": 2.36551215917465, "grad_norm": 0.31761112809181213, "learning_rate": 1.2957848322384959e-06, "loss": 0.4191, "step": 2140 }, { "epoch": 2.366617538688283, "grad_norm": 0.30717697739601135, "learning_rate": 1.2914638478542474e-06, "loss": 0.3782, "step": 2141 }, { "epoch": 2.367722918201916, "grad_norm": 0.32944056391716003, "learning_rate": 1.2871490113099066e-06, "loss": 0.4266, "step": 2142 }, { "epoch": 2.368828297715549, "grad_norm": 0.32940933108329773, "learning_rate": 1.2828403297584097e-06, "loss": 0.4389, "step": 2143 }, { "epoch": 2.369933677229182, "grad_norm": 0.28081440925598145, "learning_rate": 1.2785378103424917e-06, "loss": 0.3533, "step": 2144 }, { "epoch": 2.371039056742815, "grad_norm": 0.318877637386322, "learning_rate": 1.2742414601946673e-06, "loss": 0.4239, "step": 2145 }, { "epoch": 2.372144436256448, "grad_norm": 0.31249094009399414, "learning_rate": 1.2699512864372287e-06, "loss": 0.4102, "step": 2146 }, { "epoch": 2.373249815770081, "grad_norm": 0.34991195797920227, "learning_rate": 1.2656672961822285e-06, "loss": 0.429, "step": 2147 }, { "epoch": 2.374355195283714, "grad_norm": 0.346693217754364, "learning_rate": 1.2613894965314682e-06, "loss": 0.4245, "step": 2148 }, { "epoch": 2.375460574797347, "grad_norm": 0.3245968818664551, "learning_rate": 1.2571178945764844e-06, "loss": 0.4181, "step": 2149 }, { "epoch": 2.3765659543109803, "grad_norm": 0.3161187469959259, "learning_rate": 1.2528524973985424e-06, "loss": 0.4103, "step": 2150 }, { "epoch": 2.377671333824613, "grad_norm": 0.3409148156642914, "learning_rate": 1.2485933120686206e-06, "loss": 0.4217, "step": 2151 }, { "epoch": 2.378776713338246, "grad_norm": 0.3253411054611206, "learning_rate": 1.2443403456474017e-06, "loss": 0.4182, "step": 2152 }, { "epoch": 2.3798820928518793, "grad_norm": 0.3305019736289978, "learning_rate": 1.2400936051852535e-06, "loss": 0.433, "step": 2153 }, { "epoch": 2.3809874723655122, "grad_norm": 0.3393715023994446, "learning_rate": 1.2358530977222276e-06, "loss": 0.402, "step": 2154 }, { "epoch": 2.382092851879145, "grad_norm": 0.3145518898963928, "learning_rate": 1.2316188302880434e-06, "loss": 0.4017, "step": 2155 }, { "epoch": 2.383198231392778, "grad_norm": 0.338236927986145, "learning_rate": 1.2273908099020703e-06, "loss": 0.4226, "step": 2156 }, { "epoch": 2.3843036109064113, "grad_norm": 0.32268765568733215, "learning_rate": 1.223169043573325e-06, "loss": 0.3895, "step": 2157 }, { "epoch": 2.385408990420044, "grad_norm": 0.33133023977279663, "learning_rate": 1.218953538300462e-06, "loss": 0.3846, "step": 2158 }, { "epoch": 2.386514369933677, "grad_norm": 0.3500003516674042, "learning_rate": 1.2147443010717463e-06, "loss": 0.4081, "step": 2159 }, { "epoch": 2.3876197494473104, "grad_norm": 0.3519018590450287, "learning_rate": 1.2105413388650577e-06, "loss": 0.4061, "step": 2160 }, { "epoch": 2.3887251289609432, "grad_norm": 0.3715590834617615, "learning_rate": 1.2063446586478743e-06, "loss": 0.494, "step": 2161 }, { "epoch": 2.389830508474576, "grad_norm": 0.31260231137275696, "learning_rate": 1.2021542673772584e-06, "loss": 0.364, "step": 2162 }, { "epoch": 2.3909358879882094, "grad_norm": 0.3275202810764313, "learning_rate": 1.1979701719998454e-06, "loss": 0.3972, "step": 2163 }, { "epoch": 2.3920412675018423, "grad_norm": 0.34352126717567444, "learning_rate": 1.193792379451837e-06, "loss": 0.4543, "step": 2164 }, { "epoch": 2.393146647015475, "grad_norm": 0.3422808051109314, "learning_rate": 1.1896208966589834e-06, "loss": 0.3963, "step": 2165 }, { "epoch": 2.3942520265291085, "grad_norm": 0.333499938249588, "learning_rate": 1.1854557305365783e-06, "loss": 0.4363, "step": 2166 }, { "epoch": 2.3953574060427414, "grad_norm": 0.3251384496688843, "learning_rate": 1.1812968879894387e-06, "loss": 0.424, "step": 2167 }, { "epoch": 2.3964627855563743, "grad_norm": 0.29696038365364075, "learning_rate": 1.1771443759119028e-06, "loss": 0.3931, "step": 2168 }, { "epoch": 2.397568165070007, "grad_norm": 0.29432690143585205, "learning_rate": 1.1729982011878139e-06, "loss": 0.4117, "step": 2169 }, { "epoch": 2.3986735445836405, "grad_norm": 0.319213330745697, "learning_rate": 1.1688583706905099e-06, "loss": 0.3769, "step": 2170 }, { "epoch": 2.3997789240972733, "grad_norm": 0.3434392809867859, "learning_rate": 1.164724891282808e-06, "loss": 0.4394, "step": 2171 }, { "epoch": 2.4008843036109067, "grad_norm": 0.3440203368663788, "learning_rate": 1.1605977698170001e-06, "loss": 0.4369, "step": 2172 }, { "epoch": 2.4019896831245395, "grad_norm": 0.3346264362335205, "learning_rate": 1.1564770131348385e-06, "loss": 0.4062, "step": 2173 }, { "epoch": 2.4030950626381724, "grad_norm": 0.33624231815338135, "learning_rate": 1.1523626280675237e-06, "loss": 0.4154, "step": 2174 }, { "epoch": 2.4042004421518053, "grad_norm": 0.3257479965686798, "learning_rate": 1.148254621435691e-06, "loss": 0.4091, "step": 2175 }, { "epoch": 2.4053058216654386, "grad_norm": 0.3532037138938904, "learning_rate": 1.1441530000494055e-06, "loss": 0.3903, "step": 2176 }, { "epoch": 2.4064112011790715, "grad_norm": 0.3281491994857788, "learning_rate": 1.1400577707081467e-06, "loss": 0.4597, "step": 2177 }, { "epoch": 2.4075165806927044, "grad_norm": 0.3178335726261139, "learning_rate": 1.135968940200794e-06, "loss": 0.4137, "step": 2178 }, { "epoch": 2.4086219602063377, "grad_norm": 0.36392757296562195, "learning_rate": 1.1318865153056218e-06, "loss": 0.4189, "step": 2179 }, { "epoch": 2.4097273397199706, "grad_norm": 0.3515454828739166, "learning_rate": 1.1278105027902898e-06, "loss": 0.433, "step": 2180 }, { "epoch": 2.4108327192336034, "grad_norm": 0.30348077416419983, "learning_rate": 1.1237409094118179e-06, "loss": 0.3979, "step": 2181 }, { "epoch": 2.4119380987472363, "grad_norm": 0.3043134808540344, "learning_rate": 1.1196777419165927e-06, "loss": 0.4377, "step": 2182 }, { "epoch": 2.4130434782608696, "grad_norm": 0.31381770968437195, "learning_rate": 1.1156210070403457e-06, "loss": 0.402, "step": 2183 }, { "epoch": 2.4141488577745025, "grad_norm": 0.30549487471580505, "learning_rate": 1.1115707115081448e-06, "loss": 0.4027, "step": 2184 }, { "epoch": 2.415254237288136, "grad_norm": 0.3587329685688019, "learning_rate": 1.1075268620343809e-06, "loss": 0.4587, "step": 2185 }, { "epoch": 2.4163596168017687, "grad_norm": 0.3127484619617462, "learning_rate": 1.1034894653227618e-06, "loss": 0.3856, "step": 2186 }, { "epoch": 2.4174649963154016, "grad_norm": 0.31944820284843445, "learning_rate": 1.0994585280662978e-06, "loss": 0.4384, "step": 2187 }, { "epoch": 2.4185703758290344, "grad_norm": 0.31398552656173706, "learning_rate": 1.095434056947291e-06, "loss": 0.4047, "step": 2188 }, { "epoch": 2.4196757553426678, "grad_norm": 0.32963132858276367, "learning_rate": 1.0914160586373218e-06, "loss": 0.4132, "step": 2189 }, { "epoch": 2.4207811348563006, "grad_norm": 0.3204709589481354, "learning_rate": 1.0874045397972433e-06, "loss": 0.4056, "step": 2190 }, { "epoch": 2.4218865143699335, "grad_norm": 0.35029566287994385, "learning_rate": 1.0833995070771652e-06, "loss": 0.4486, "step": 2191 }, { "epoch": 2.422991893883567, "grad_norm": 0.29267606139183044, "learning_rate": 1.0794009671164484e-06, "loss": 0.369, "step": 2192 }, { "epoch": 2.4240972733971997, "grad_norm": 0.3196967542171478, "learning_rate": 1.0754089265436845e-06, "loss": 0.4664, "step": 2193 }, { "epoch": 2.4252026529108326, "grad_norm": 0.32648804783821106, "learning_rate": 1.0714233919766953e-06, "loss": 0.3903, "step": 2194 }, { "epoch": 2.426308032424466, "grad_norm": 0.35091081261634827, "learning_rate": 1.0674443700225173e-06, "loss": 0.4594, "step": 2195 }, { "epoch": 2.427413411938099, "grad_norm": 0.32719916105270386, "learning_rate": 1.0634718672773863e-06, "loss": 0.4017, "step": 2196 }, { "epoch": 2.4285187914517317, "grad_norm": 0.3458060920238495, "learning_rate": 1.0595058903267357e-06, "loss": 0.4378, "step": 2197 }, { "epoch": 2.429624170965365, "grad_norm": 0.3267258405685425, "learning_rate": 1.055546445745178e-06, "loss": 0.3859, "step": 2198 }, { "epoch": 2.430729550478998, "grad_norm": 0.3181951344013214, "learning_rate": 1.0515935400965e-06, "loss": 0.4122, "step": 2199 }, { "epoch": 2.4318349299926307, "grad_norm": 0.32572656869888306, "learning_rate": 1.0476471799336424e-06, "loss": 0.4007, "step": 2200 }, { "epoch": 2.4329403095062636, "grad_norm": 0.3543911278247833, "learning_rate": 1.043707371798699e-06, "loss": 0.4132, "step": 2201 }, { "epoch": 2.434045689019897, "grad_norm": 0.3109610676765442, "learning_rate": 1.0397741222229057e-06, "loss": 0.4149, "step": 2202 }, { "epoch": 2.43515106853353, "grad_norm": 0.35836902260780334, "learning_rate": 1.0358474377266187e-06, "loss": 0.4417, "step": 2203 }, { "epoch": 2.4362564480471627, "grad_norm": 0.33226215839385986, "learning_rate": 1.0319273248193145e-06, "loss": 0.3877, "step": 2204 }, { "epoch": 2.437361827560796, "grad_norm": 0.3394250273704529, "learning_rate": 1.0280137899995756e-06, "loss": 0.4279, "step": 2205 }, { "epoch": 2.438467207074429, "grad_norm": 0.3410290479660034, "learning_rate": 1.0241068397550807e-06, "loss": 0.367, "step": 2206 }, { "epoch": 2.4395725865880618, "grad_norm": 0.3758080303668976, "learning_rate": 1.0202064805625883e-06, "loss": 0.4397, "step": 2207 }, { "epoch": 2.440677966101695, "grad_norm": 0.33992353081703186, "learning_rate": 1.0163127188879352e-06, "loss": 0.4041, "step": 2208 }, { "epoch": 2.441783345615328, "grad_norm": 0.33012160658836365, "learning_rate": 1.01242556118602e-06, "loss": 0.4141, "step": 2209 }, { "epoch": 2.442888725128961, "grad_norm": 0.31934618949890137, "learning_rate": 1.008545013900794e-06, "loss": 0.3836, "step": 2210 }, { "epoch": 2.443994104642594, "grad_norm": 0.3130892217159271, "learning_rate": 1.0046710834652474e-06, "loss": 0.4049, "step": 2211 }, { "epoch": 2.445099484156227, "grad_norm": 0.3153076767921448, "learning_rate": 1.0008037763014033e-06, "loss": 0.4258, "step": 2212 }, { "epoch": 2.44620486366986, "grad_norm": 0.32796332240104675, "learning_rate": 9.969430988203065e-07, "loss": 0.4495, "step": 2213 }, { "epoch": 2.4473102431834928, "grad_norm": 0.3051679730415344, "learning_rate": 9.930890574220076e-07, "loss": 0.4209, "step": 2214 }, { "epoch": 2.448415622697126, "grad_norm": 0.299526184797287, "learning_rate": 9.892416584955595e-07, "loss": 0.3968, "step": 2215 }, { "epoch": 2.449521002210759, "grad_norm": 0.3183306157588959, "learning_rate": 9.85400908419002e-07, "loss": 0.4292, "step": 2216 }, { "epoch": 2.4506263817243923, "grad_norm": 0.3608407974243164, "learning_rate": 9.815668135593548e-07, "loss": 0.4221, "step": 2217 }, { "epoch": 2.451731761238025, "grad_norm": 0.32681697607040405, "learning_rate": 9.77739380272601e-07, "loss": 0.4307, "step": 2218 }, { "epoch": 2.452837140751658, "grad_norm": 0.29986852407455444, "learning_rate": 9.739186149036828e-07, "loss": 0.3988, "step": 2219 }, { "epoch": 2.453942520265291, "grad_norm": 0.35284990072250366, "learning_rate": 9.701045237864898e-07, "loss": 0.4421, "step": 2220 }, { "epoch": 2.4550478997789242, "grad_norm": 0.3351341187953949, "learning_rate": 9.66297113243847e-07, "loss": 0.394, "step": 2221 }, { "epoch": 2.456153279292557, "grad_norm": 0.33576667308807373, "learning_rate": 9.624963895874995e-07, "loss": 0.4301, "step": 2222 }, { "epoch": 2.45725865880619, "grad_norm": 0.3190898895263672, "learning_rate": 9.587023591181156e-07, "loss": 0.4219, "step": 2223 }, { "epoch": 2.4583640383198233, "grad_norm": 0.31513798236846924, "learning_rate": 9.549150281252633e-07, "loss": 0.4006, "step": 2224 }, { "epoch": 2.459469417833456, "grad_norm": 0.35179510712623596, "learning_rate": 9.511344028874026e-07, "loss": 0.444, "step": 2225 }, { "epoch": 2.460574797347089, "grad_norm": 0.3383944034576416, "learning_rate": 9.473604896718808e-07, "loss": 0.4391, "step": 2226 }, { "epoch": 2.461680176860722, "grad_norm": 0.3258511424064636, "learning_rate": 9.435932947349169e-07, "loss": 0.3846, "step": 2227 }, { "epoch": 2.4627855563743553, "grad_norm": 0.34794342517852783, "learning_rate": 9.398328243215937e-07, "loss": 0.4528, "step": 2228 }, { "epoch": 2.463890935887988, "grad_norm": 0.3319951295852661, "learning_rate": 9.360790846658429e-07, "loss": 0.4296, "step": 2229 }, { "epoch": 2.4649963154016215, "grad_norm": 0.3665831387042999, "learning_rate": 9.323320819904419e-07, "loss": 0.4563, "step": 2230 }, { "epoch": 2.4661016949152543, "grad_norm": 0.3212851881980896, "learning_rate": 9.285918225070001e-07, "loss": 0.4087, "step": 2231 }, { "epoch": 2.467207074428887, "grad_norm": 0.31476667523384094, "learning_rate": 9.248583124159438e-07, "loss": 0.4212, "step": 2232 }, { "epoch": 2.46831245394252, "grad_norm": 0.33086958527565, "learning_rate": 9.211315579065155e-07, "loss": 0.4433, "step": 2233 }, { "epoch": 2.4694178334561534, "grad_norm": 0.32194480299949646, "learning_rate": 9.174115651567561e-07, "loss": 0.4026, "step": 2234 }, { "epoch": 2.4705232129697863, "grad_norm": 0.31426337361335754, "learning_rate": 9.136983403334993e-07, "loss": 0.4041, "step": 2235 }, { "epoch": 2.471628592483419, "grad_norm": 0.3219498097896576, "learning_rate": 9.099918895923554e-07, "loss": 0.416, "step": 2236 }, { "epoch": 2.4727339719970525, "grad_norm": 0.3320116102695465, "learning_rate": 9.062922190777079e-07, "loss": 0.4457, "step": 2237 }, { "epoch": 2.4738393515106853, "grad_norm": 0.32528364658355713, "learning_rate": 9.025993349226997e-07, "loss": 0.3992, "step": 2238 }, { "epoch": 2.4749447310243182, "grad_norm": 0.30041274428367615, "learning_rate": 8.989132432492253e-07, "loss": 0.4012, "step": 2239 }, { "epoch": 2.4760501105379515, "grad_norm": 0.34784868359565735, "learning_rate": 8.952339501679142e-07, "loss": 0.4662, "step": 2240 }, { "epoch": 2.4771554900515844, "grad_norm": 0.32953599095344543, "learning_rate": 8.915614617781298e-07, "loss": 0.4152, "step": 2241 }, { "epoch": 2.4782608695652173, "grad_norm": 0.30824118852615356, "learning_rate": 8.878957841679542e-07, "loss": 0.3648, "step": 2242 }, { "epoch": 2.4793662490788506, "grad_norm": 0.3159421980381012, "learning_rate": 8.842369234141796e-07, "loss": 0.3677, "step": 2243 }, { "epoch": 2.4804716285924835, "grad_norm": 0.3338443636894226, "learning_rate": 8.805848855822918e-07, "loss": 0.3772, "step": 2244 }, { "epoch": 2.4815770081061164, "grad_norm": 0.35238105058670044, "learning_rate": 8.769396767264743e-07, "loss": 0.4294, "step": 2245 }, { "epoch": 2.4826823876197492, "grad_norm": 0.321770578622818, "learning_rate": 8.733013028895864e-07, "loss": 0.4358, "step": 2246 }, { "epoch": 2.4837877671333826, "grad_norm": 0.30832579731941223, "learning_rate": 8.696697701031543e-07, "loss": 0.4046, "step": 2247 }, { "epoch": 2.4848931466470154, "grad_norm": 0.28866735100746155, "learning_rate": 8.660450843873647e-07, "loss": 0.3962, "step": 2248 }, { "epoch": 2.4859985261606483, "grad_norm": 0.34075507521629333, "learning_rate": 8.624272517510574e-07, "loss": 0.4719, "step": 2249 }, { "epoch": 2.4871039056742816, "grad_norm": 0.3305790424346924, "learning_rate": 8.588162781917042e-07, "loss": 0.4004, "step": 2250 }, { "epoch": 2.4882092851879145, "grad_norm": 0.29533684253692627, "learning_rate": 8.552121696954119e-07, "loss": 0.3693, "step": 2251 }, { "epoch": 2.4893146647015474, "grad_norm": 0.3299785256385803, "learning_rate": 8.516149322369055e-07, "loss": 0.4337, "step": 2252 }, { "epoch": 2.4904200442151807, "grad_norm": 0.33997663855552673, "learning_rate": 8.480245717795199e-07, "loss": 0.4125, "step": 2253 }, { "epoch": 2.4915254237288136, "grad_norm": 0.31506386399269104, "learning_rate": 8.444410942751863e-07, "loss": 0.4029, "step": 2254 }, { "epoch": 2.4926308032424465, "grad_norm": 0.3549812436103821, "learning_rate": 8.408645056644299e-07, "loss": 0.4441, "step": 2255 }, { "epoch": 2.4937361827560798, "grad_norm": 0.3108268082141876, "learning_rate": 8.372948118763536e-07, "loss": 0.404, "step": 2256 }, { "epoch": 2.4948415622697127, "grad_norm": 0.3422608971595764, "learning_rate": 8.337320188286318e-07, "loss": 0.4263, "step": 2257 }, { "epoch": 2.4959469417833455, "grad_norm": 0.3232058882713318, "learning_rate": 8.301761324274965e-07, "loss": 0.4158, "step": 2258 }, { "epoch": 2.4970523212969784, "grad_norm": 0.33123281598091125, "learning_rate": 8.266271585677327e-07, "loss": 0.3924, "step": 2259 }, { "epoch": 2.4981577008106117, "grad_norm": 0.33436378836631775, "learning_rate": 8.230851031326653e-07, "loss": 0.4107, "step": 2260 }, { "epoch": 2.4992630803242446, "grad_norm": 0.3710539937019348, "learning_rate": 8.195499719941513e-07, "loss": 0.447, "step": 2261 }, { "epoch": 2.500368459837878, "grad_norm": 0.3305223882198334, "learning_rate": 8.160217710125661e-07, "loss": 0.425, "step": 2262 }, { "epoch": 2.501473839351511, "grad_norm": 0.3192126154899597, "learning_rate": 8.125005060367985e-07, "loss": 0.4234, "step": 2263 }, { "epoch": 2.5025792188651437, "grad_norm": 0.3540102541446686, "learning_rate": 8.089861829042406e-07, "loss": 0.4379, "step": 2264 }, { "epoch": 2.5036845983787765, "grad_norm": 0.3117208778858185, "learning_rate": 8.054788074407727e-07, "loss": 0.403, "step": 2265 }, { "epoch": 2.50478997789241, "grad_norm": 0.3366854786872864, "learning_rate": 8.019783854607593e-07, "loss": 0.3762, "step": 2266 }, { "epoch": 2.5058953574060427, "grad_norm": 0.3513021171092987, "learning_rate": 7.984849227670421e-07, "loss": 0.4425, "step": 2267 }, { "epoch": 2.5070007369196756, "grad_norm": 0.32283011078834534, "learning_rate": 7.949984251509185e-07, "loss": 0.4065, "step": 2268 }, { "epoch": 2.508106116433309, "grad_norm": 0.36328673362731934, "learning_rate": 7.915188983921451e-07, "loss": 0.431, "step": 2269 }, { "epoch": 2.509211495946942, "grad_norm": 0.33109983801841736, "learning_rate": 7.880463482589196e-07, "loss": 0.4057, "step": 2270 }, { "epoch": 2.5103168754605747, "grad_norm": 0.3191819489002228, "learning_rate": 7.845807805078764e-07, "loss": 0.39, "step": 2271 }, { "epoch": 2.5114222549742076, "grad_norm": 0.2999105453491211, "learning_rate": 7.811222008840719e-07, "loss": 0.3671, "step": 2272 }, { "epoch": 2.512527634487841, "grad_norm": 0.41040411591529846, "learning_rate": 7.776706151209807e-07, "loss": 0.432, "step": 2273 }, { "epoch": 2.5136330140014738, "grad_norm": 0.3313489854335785, "learning_rate": 7.742260289404819e-07, "loss": 0.4585, "step": 2274 }, { "epoch": 2.514738393515107, "grad_norm": 0.30589577555656433, "learning_rate": 7.707884480528526e-07, "loss": 0.397, "step": 2275 }, { "epoch": 2.51584377302874, "grad_norm": 0.3239889442920685, "learning_rate": 7.673578781567537e-07, "loss": 0.4093, "step": 2276 }, { "epoch": 2.516949152542373, "grad_norm": 0.3065659999847412, "learning_rate": 7.639343249392256e-07, "loss": 0.4135, "step": 2277 }, { "epoch": 2.5180545320560057, "grad_norm": 0.34021010994911194, "learning_rate": 7.605177940756774e-07, "loss": 0.4753, "step": 2278 }, { "epoch": 2.519159911569639, "grad_norm": 0.29342517256736755, "learning_rate": 7.571082912298777e-07, "loss": 0.3598, "step": 2279 }, { "epoch": 2.520265291083272, "grad_norm": 0.34394371509552, "learning_rate": 7.537058220539395e-07, "loss": 0.4701, "step": 2280 }, { "epoch": 2.521370670596905, "grad_norm": 0.33225148916244507, "learning_rate": 7.503103921883209e-07, "loss": 0.4045, "step": 2281 }, { "epoch": 2.522476050110538, "grad_norm": 0.3474089801311493, "learning_rate": 7.469220072618094e-07, "loss": 0.4062, "step": 2282 }, { "epoch": 2.523581429624171, "grad_norm": 0.33313125371932983, "learning_rate": 7.435406728915112e-07, "loss": 0.4162, "step": 2283 }, { "epoch": 2.524686809137804, "grad_norm": 0.30952155590057373, "learning_rate": 7.401663946828469e-07, "loss": 0.4216, "step": 2284 }, { "epoch": 2.5257921886514367, "grad_norm": 0.3137383460998535, "learning_rate": 7.367991782295392e-07, "loss": 0.443, "step": 2285 }, { "epoch": 2.52689756816507, "grad_norm": 0.3169736862182617, "learning_rate": 7.334390291136051e-07, "loss": 0.4213, "step": 2286 }, { "epoch": 2.528002947678703, "grad_norm": 0.3019518554210663, "learning_rate": 7.300859529053422e-07, "loss": 0.3982, "step": 2287 }, { "epoch": 2.5291083271923362, "grad_norm": 0.3113415837287903, "learning_rate": 7.267399551633253e-07, "loss": 0.4083, "step": 2288 }, { "epoch": 2.530213706705969, "grad_norm": 0.33189207315444946, "learning_rate": 7.234010414343989e-07, "loss": 0.4395, "step": 2289 }, { "epoch": 2.531319086219602, "grad_norm": 0.3208407163619995, "learning_rate": 7.200692172536555e-07, "loss": 0.4399, "step": 2290 }, { "epoch": 2.532424465733235, "grad_norm": 0.3195371925830841, "learning_rate": 7.167444881444413e-07, "loss": 0.4234, "step": 2291 }, { "epoch": 2.533529845246868, "grad_norm": 0.328323632478714, "learning_rate": 7.13426859618338e-07, "loss": 0.4128, "step": 2292 }, { "epoch": 2.534635224760501, "grad_norm": 0.313218891620636, "learning_rate": 7.101163371751585e-07, "loss": 0.4404, "step": 2293 }, { "epoch": 2.5357406042741344, "grad_norm": 0.32555344700813293, "learning_rate": 7.068129263029316e-07, "loss": 0.3885, "step": 2294 }, { "epoch": 2.5368459837877673, "grad_norm": 0.3751481771469116, "learning_rate": 7.035166324779002e-07, "loss": 0.4132, "step": 2295 }, { "epoch": 2.5379513633014, "grad_norm": 0.31991690397262573, "learning_rate": 7.002274611645083e-07, "loss": 0.3983, "step": 2296 }, { "epoch": 2.539056742815033, "grad_norm": 0.3132786750793457, "learning_rate": 6.969454178153923e-07, "loss": 0.3866, "step": 2297 }, { "epoch": 2.5401621223286663, "grad_norm": 0.3120950758457184, "learning_rate": 6.936705078713713e-07, "loss": 0.3886, "step": 2298 }, { "epoch": 2.541267501842299, "grad_norm": 0.33536574244499207, "learning_rate": 6.904027367614397e-07, "loss": 0.4458, "step": 2299 }, { "epoch": 2.542372881355932, "grad_norm": 0.30776211619377136, "learning_rate": 6.871421099027586e-07, "loss": 0.3972, "step": 2300 }, { "epoch": 2.5434782608695654, "grad_norm": 0.3205487132072449, "learning_rate": 6.838886327006428e-07, "loss": 0.397, "step": 2301 }, { "epoch": 2.5445836403831983, "grad_norm": 0.35418498516082764, "learning_rate": 6.806423105485576e-07, "loss": 0.4442, "step": 2302 }, { "epoch": 2.545689019896831, "grad_norm": 0.32221585512161255, "learning_rate": 6.774031488281057e-07, "loss": 0.3991, "step": 2303 }, { "epoch": 2.546794399410464, "grad_norm": 0.3256107568740845, "learning_rate": 6.741711529090212e-07, "loss": 0.4126, "step": 2304 }, { "epoch": 2.5478997789240974, "grad_norm": 0.33321723341941833, "learning_rate": 6.709463281491551e-07, "loss": 0.4277, "step": 2305 }, { "epoch": 2.5490051584377302, "grad_norm": 0.34105005860328674, "learning_rate": 6.677286798944743e-07, "loss": 0.4047, "step": 2306 }, { "epoch": 2.5501105379513636, "grad_norm": 0.31856659054756165, "learning_rate": 6.645182134790467e-07, "loss": 0.4194, "step": 2307 }, { "epoch": 2.5512159174649964, "grad_norm": 0.3133260905742645, "learning_rate": 6.61314934225037e-07, "loss": 0.3962, "step": 2308 }, { "epoch": 2.5523212969786293, "grad_norm": 0.33417364954948425, "learning_rate": 6.581188474426898e-07, "loss": 0.4991, "step": 2309 }, { "epoch": 2.553426676492262, "grad_norm": 0.2836672365665436, "learning_rate": 6.549299584303343e-07, "loss": 0.3763, "step": 2310 }, { "epoch": 2.5545320560058955, "grad_norm": 0.3217768967151642, "learning_rate": 6.517482724743623e-07, "loss": 0.4496, "step": 2311 }, { "epoch": 2.5556374355195284, "grad_norm": 0.3016747236251831, "learning_rate": 6.485737948492237e-07, "loss": 0.4157, "step": 2312 }, { "epoch": 2.5567428150331613, "grad_norm": 0.3315136134624481, "learning_rate": 6.454065308174229e-07, "loss": 0.4365, "step": 2313 }, { "epoch": 2.5578481945467946, "grad_norm": 0.31676042079925537, "learning_rate": 6.422464856295035e-07, "loss": 0.3746, "step": 2314 }, { "epoch": 2.5589535740604274, "grad_norm": 0.32818907499313354, "learning_rate": 6.390936645240431e-07, "loss": 0.4473, "step": 2315 }, { "epoch": 2.5600589535740603, "grad_norm": 0.35828259587287903, "learning_rate": 6.359480727276407e-07, "loss": 0.4898, "step": 2316 }, { "epoch": 2.561164333087693, "grad_norm": 0.3296453058719635, "learning_rate": 6.328097154549146e-07, "loss": 0.3739, "step": 2317 }, { "epoch": 2.5622697126013265, "grad_norm": 0.3114607632160187, "learning_rate": 6.296785979084891e-07, "loss": 0.4264, "step": 2318 }, { "epoch": 2.5633750921149594, "grad_norm": 0.31511789560317993, "learning_rate": 6.265547252789844e-07, "loss": 0.4443, "step": 2319 }, { "epoch": 2.5644804716285927, "grad_norm": 0.30085381865501404, "learning_rate": 6.234381027450132e-07, "loss": 0.4098, "step": 2320 }, { "epoch": 2.5655858511422256, "grad_norm": 0.3536820113658905, "learning_rate": 6.203287354731686e-07, "loss": 0.4371, "step": 2321 }, { "epoch": 2.5666912306558585, "grad_norm": 0.32621386647224426, "learning_rate": 6.172266286180162e-07, "loss": 0.376, "step": 2322 }, { "epoch": 2.5677966101694913, "grad_norm": 0.29918092489242554, "learning_rate": 6.141317873220848e-07, "loss": 0.4185, "step": 2323 }, { "epoch": 2.5689019896831247, "grad_norm": 0.31577619910240173, "learning_rate": 6.110442167158592e-07, "loss": 0.4214, "step": 2324 }, { "epoch": 2.5700073691967575, "grad_norm": 0.3169631361961365, "learning_rate": 6.079639219177714e-07, "loss": 0.4206, "step": 2325 }, { "epoch": 2.5711127487103904, "grad_norm": 0.30377864837646484, "learning_rate": 6.048909080341936e-07, "loss": 0.3794, "step": 2326 }, { "epoch": 2.5722181282240237, "grad_norm": 0.4561157822608948, "learning_rate": 6.018251801594232e-07, "loss": 0.4349, "step": 2327 }, { "epoch": 2.5733235077376566, "grad_norm": 0.3357909917831421, "learning_rate": 5.987667433756844e-07, "loss": 0.4203, "step": 2328 }, { "epoch": 2.5744288872512895, "grad_norm": 0.33869779109954834, "learning_rate": 5.957156027531113e-07, "loss": 0.4186, "step": 2329 }, { "epoch": 2.5755342667649224, "grad_norm": 0.34390777349472046, "learning_rate": 5.926717633497453e-07, "loss": 0.431, "step": 2330 }, { "epoch": 2.5766396462785557, "grad_norm": 0.307102233171463, "learning_rate": 5.896352302115199e-07, "loss": 0.3858, "step": 2331 }, { "epoch": 2.5777450257921886, "grad_norm": 0.3352380692958832, "learning_rate": 5.866060083722624e-07, "loss": 0.4389, "step": 2332 }, { "epoch": 2.578850405305822, "grad_norm": 0.3054060935974121, "learning_rate": 5.83584102853677e-07, "loss": 0.3541, "step": 2333 }, { "epoch": 2.5799557848194548, "grad_norm": 0.35297706723213196, "learning_rate": 5.805695186653365e-07, "loss": 0.4554, "step": 2334 }, { "epoch": 2.5810611643330876, "grad_norm": 0.30546414852142334, "learning_rate": 5.775622608046804e-07, "loss": 0.4303, "step": 2335 }, { "epoch": 2.5821665438467205, "grad_norm": 0.31113576889038086, "learning_rate": 5.745623342570039e-07, "loss": 0.4256, "step": 2336 }, { "epoch": 2.583271923360354, "grad_norm": 0.3118310868740082, "learning_rate": 5.715697439954432e-07, "loss": 0.4417, "step": 2337 }, { "epoch": 2.5843773028739867, "grad_norm": 0.3075689971446991, "learning_rate": 5.685844949809777e-07, "loss": 0.3864, "step": 2338 }, { "epoch": 2.58548268238762, "grad_norm": 0.3089504837989807, "learning_rate": 5.656065921624159e-07, "loss": 0.4164, "step": 2339 }, { "epoch": 2.586588061901253, "grad_norm": 0.2922597825527191, "learning_rate": 5.626360404763875e-07, "loss": 0.3933, "step": 2340 }, { "epoch": 2.5876934414148858, "grad_norm": 0.32994937896728516, "learning_rate": 5.596728448473349e-07, "loss": 0.4168, "step": 2341 }, { "epoch": 2.5887988209285187, "grad_norm": 0.3318001627922058, "learning_rate": 5.567170101875074e-07, "loss": 0.4008, "step": 2342 }, { "epoch": 2.589904200442152, "grad_norm": 0.3039640486240387, "learning_rate": 5.537685413969507e-07, "loss": 0.4278, "step": 2343 }, { "epoch": 2.591009579955785, "grad_norm": 0.3205920457839966, "learning_rate": 5.508274433635019e-07, "loss": 0.4479, "step": 2344 }, { "epoch": 2.5921149594694177, "grad_norm": 0.3043215572834015, "learning_rate": 5.478937209627755e-07, "loss": 0.4084, "step": 2345 }, { "epoch": 2.593220338983051, "grad_norm": 0.32119688391685486, "learning_rate": 5.449673790581611e-07, "loss": 0.424, "step": 2346 }, { "epoch": 2.594325718496684, "grad_norm": 0.31154701113700867, "learning_rate": 5.420484225008138e-07, "loss": 0.4116, "step": 2347 }, { "epoch": 2.595431098010317, "grad_norm": 0.3122122585773468, "learning_rate": 5.391368561296456e-07, "loss": 0.3781, "step": 2348 }, { "epoch": 2.5965364775239497, "grad_norm": 0.33806830644607544, "learning_rate": 5.362326847713151e-07, "loss": 0.4334, "step": 2349 }, { "epoch": 2.597641857037583, "grad_norm": 0.32300376892089844, "learning_rate": 5.333359132402238e-07, "loss": 0.4634, "step": 2350 }, { "epoch": 2.598747236551216, "grad_norm": 0.3209781348705292, "learning_rate": 5.304465463385067e-07, "loss": 0.4098, "step": 2351 }, { "epoch": 2.599852616064849, "grad_norm": 0.31290918588638306, "learning_rate": 5.275645888560233e-07, "loss": 0.3853, "step": 2352 }, { "epoch": 2.600957995578482, "grad_norm": 0.3108130395412445, "learning_rate": 5.246900455703457e-07, "loss": 0.4184, "step": 2353 }, { "epoch": 2.602063375092115, "grad_norm": 0.3249339759349823, "learning_rate": 5.218229212467635e-07, "loss": 0.4273, "step": 2354 }, { "epoch": 2.603168754605748, "grad_norm": 0.31844401359558105, "learning_rate": 5.189632206382617e-07, "loss": 0.3763, "step": 2355 }, { "epoch": 2.604274134119381, "grad_norm": 0.3337879776954651, "learning_rate": 5.161109484855182e-07, "loss": 0.4212, "step": 2356 }, { "epoch": 2.605379513633014, "grad_norm": 0.3074633777141571, "learning_rate": 5.132661095168994e-07, "loss": 0.3999, "step": 2357 }, { "epoch": 2.606484893146647, "grad_norm": 0.3207220137119293, "learning_rate": 5.104287084484489e-07, "loss": 0.4097, "step": 2358 }, { "epoch": 2.60759027266028, "grad_norm": 0.35439935326576233, "learning_rate": 5.075987499838763e-07, "loss": 0.428, "step": 2359 }, { "epoch": 2.608695652173913, "grad_norm": 0.32097578048706055, "learning_rate": 5.047762388145582e-07, "loss": 0.443, "step": 2360 }, { "epoch": 2.609801031687546, "grad_norm": 0.3193216323852539, "learning_rate": 5.019611796195222e-07, "loss": 0.4447, "step": 2361 }, { "epoch": 2.610906411201179, "grad_norm": 0.3180513381958008, "learning_rate": 4.991535770654449e-07, "loss": 0.4152, "step": 2362 }, { "epoch": 2.612011790714812, "grad_norm": 0.2998959720134735, "learning_rate": 4.963534358066379e-07, "loss": 0.3997, "step": 2363 }, { "epoch": 2.613117170228445, "grad_norm": 0.310007780790329, "learning_rate": 4.935607604850473e-07, "loss": 0.3867, "step": 2364 }, { "epoch": 2.6142225497420783, "grad_norm": 0.32866135239601135, "learning_rate": 4.907755557302407e-07, "loss": 0.4055, "step": 2365 }, { "epoch": 2.6153279292557112, "grad_norm": 0.3479047417640686, "learning_rate": 4.879978261594037e-07, "loss": 0.4758, "step": 2366 }, { "epoch": 2.616433308769344, "grad_norm": 0.32651323080062866, "learning_rate": 4.852275763773251e-07, "loss": 0.3938, "step": 2367 }, { "epoch": 2.617538688282977, "grad_norm": 0.3210565745830536, "learning_rate": 4.824648109763991e-07, "loss": 0.4035, "step": 2368 }, { "epoch": 2.6186440677966103, "grad_norm": 0.3162902593612671, "learning_rate": 4.797095345366092e-07, "loss": 0.4356, "step": 2369 }, { "epoch": 2.619749447310243, "grad_norm": 0.3402162194252014, "learning_rate": 4.769617516255276e-07, "loss": 0.4474, "step": 2370 }, { "epoch": 2.620854826823876, "grad_norm": 0.30301085114479065, "learning_rate": 4.7422146679829916e-07, "loss": 0.3826, "step": 2371 }, { "epoch": 2.6219602063375094, "grad_norm": 0.3120967447757721, "learning_rate": 4.71488684597643e-07, "loss": 0.4494, "step": 2372 }, { "epoch": 2.6230655858511422, "grad_norm": 0.3060323894023895, "learning_rate": 4.687634095538396e-07, "loss": 0.4201, "step": 2373 }, { "epoch": 2.624170965364775, "grad_norm": 0.31675857305526733, "learning_rate": 4.660456461847224e-07, "loss": 0.3807, "step": 2374 }, { "epoch": 2.625276344878408, "grad_norm": 0.362946093082428, "learning_rate": 4.6333539899567405e-07, "loss": 0.4733, "step": 2375 }, { "epoch": 2.6263817243920413, "grad_norm": 0.31218603253364563, "learning_rate": 4.606326724796195e-07, "loss": 0.3908, "step": 2376 }, { "epoch": 2.627487103905674, "grad_norm": 0.30670008063316345, "learning_rate": 4.57937471117012e-07, "loss": 0.3965, "step": 2377 }, { "epoch": 2.6285924834193075, "grad_norm": 0.378301203250885, "learning_rate": 4.5524979937583203e-07, "loss": 0.46, "step": 2378 }, { "epoch": 2.6296978629329404, "grad_norm": 0.3456161320209503, "learning_rate": 4.525696617115777e-07, "loss": 0.4258, "step": 2379 }, { "epoch": 2.6308032424465733, "grad_norm": 0.330697238445282, "learning_rate": 4.498970625672588e-07, "loss": 0.3858, "step": 2380 }, { "epoch": 2.631908621960206, "grad_norm": 0.31536200642585754, "learning_rate": 4.472320063733843e-07, "loss": 0.4097, "step": 2381 }, { "epoch": 2.6330140014738395, "grad_norm": 0.3253920376300812, "learning_rate": 4.445744975479627e-07, "loss": 0.4442, "step": 2382 }, { "epoch": 2.6341193809874723, "grad_norm": 0.28210219740867615, "learning_rate": 4.419245404964889e-07, "loss": 0.3851, "step": 2383 }, { "epoch": 2.6352247605011057, "grad_norm": 0.33079418540000916, "learning_rate": 4.392821396119407e-07, "loss": 0.4793, "step": 2384 }, { "epoch": 2.6363301400147385, "grad_norm": 0.33646833896636963, "learning_rate": 4.3664729927476556e-07, "loss": 0.413, "step": 2385 }, { "epoch": 2.6374355195283714, "grad_norm": 0.3039708137512207, "learning_rate": 4.3402002385288133e-07, "loss": 0.4202, "step": 2386 }, { "epoch": 2.6385408990420043, "grad_norm": 0.29992517828941345, "learning_rate": 4.3140031770166457e-07, "loss": 0.388, "step": 2387 }, { "epoch": 2.639646278555637, "grad_norm": 0.35522499680519104, "learning_rate": 4.2878818516394307e-07, "loss": 0.4708, "step": 2388 }, { "epoch": 2.6407516580692705, "grad_norm": 0.34144851565361023, "learning_rate": 4.2618363056998844e-07, "loss": 0.4475, "step": 2389 }, { "epoch": 2.6418570375829034, "grad_norm": 0.30601924657821655, "learning_rate": 4.235866582375114e-07, "loss": 0.3633, "step": 2390 }, { "epoch": 2.6429624170965367, "grad_norm": 0.2974865138530731, "learning_rate": 4.209972724716532e-07, "loss": 0.3777, "step": 2391 }, { "epoch": 2.6440677966101696, "grad_norm": 0.3649008274078369, "learning_rate": 4.184154775649768e-07, "loss": 0.4729, "step": 2392 }, { "epoch": 2.6451731761238024, "grad_norm": 0.33369746804237366, "learning_rate": 4.1584127779746354e-07, "loss": 0.393, "step": 2393 }, { "epoch": 2.6462785556374353, "grad_norm": 0.327798455953598, "learning_rate": 4.132746774365021e-07, "loss": 0.4696, "step": 2394 }, { "epoch": 2.6473839351510686, "grad_norm": 0.3007277846336365, "learning_rate": 4.107156807368856e-07, "loss": 0.4078, "step": 2395 }, { "epoch": 2.6484893146647015, "grad_norm": 0.3404757082462311, "learning_rate": 4.0816429194079857e-07, "loss": 0.4272, "step": 2396 }, { "epoch": 2.649594694178335, "grad_norm": 0.3481962978839874, "learning_rate": 4.0562051527781534e-07, "loss": 0.4081, "step": 2397 }, { "epoch": 2.6507000736919677, "grad_norm": 0.3025266230106354, "learning_rate": 4.030843549648944e-07, "loss": 0.3762, "step": 2398 }, { "epoch": 2.6518054532056006, "grad_norm": 0.3138599395751953, "learning_rate": 4.0055581520636157e-07, "loss": 0.4104, "step": 2399 }, { "epoch": 2.6529108327192334, "grad_norm": 0.34392252564430237, "learning_rate": 3.9803490019391545e-07, "loss": 0.3942, "step": 2400 }, { "epoch": 2.6540162122328668, "grad_norm": 0.3289662003517151, "learning_rate": 3.955216141066115e-07, "loss": 0.4277, "step": 2401 }, { "epoch": 2.6551215917464996, "grad_norm": 0.31511351466178894, "learning_rate": 3.930159611108603e-07, "loss": 0.3992, "step": 2402 }, { "epoch": 2.6562269712601325, "grad_norm": 0.31702136993408203, "learning_rate": 3.905179453604163e-07, "loss": 0.3962, "step": 2403 }, { "epoch": 2.657332350773766, "grad_norm": 0.2997090816497803, "learning_rate": 3.880275709963749e-07, "loss": 0.3981, "step": 2404 }, { "epoch": 2.6584377302873987, "grad_norm": 0.34999629855155945, "learning_rate": 3.855448421471641e-07, "loss": 0.4711, "step": 2405 }, { "epoch": 2.6595431098010316, "grad_norm": 0.3091926574707031, "learning_rate": 3.8306976292853794e-07, "loss": 0.4027, "step": 2406 }, { "epoch": 2.6606484893146645, "grad_norm": 0.3116767406463623, "learning_rate": 3.8060233744356634e-07, "loss": 0.3822, "step": 2407 }, { "epoch": 2.661753868828298, "grad_norm": 0.33717626333236694, "learning_rate": 3.7814256978263465e-07, "loss": 0.418, "step": 2408 }, { "epoch": 2.6628592483419307, "grad_norm": 0.30904343724250793, "learning_rate": 3.756904640234327e-07, "loss": 0.3971, "step": 2409 }, { "epoch": 2.663964627855564, "grad_norm": 0.30150285363197327, "learning_rate": 3.7324602423094603e-07, "loss": 0.3969, "step": 2410 }, { "epoch": 2.665070007369197, "grad_norm": 0.33401158452033997, "learning_rate": 3.708092544574554e-07, "loss": 0.4403, "step": 2411 }, { "epoch": 2.6661753868828297, "grad_norm": 0.3189488351345062, "learning_rate": 3.683801587425251e-07, "loss": 0.3913, "step": 2412 }, { "epoch": 2.6672807663964626, "grad_norm": 0.31875526905059814, "learning_rate": 3.6595874111299834e-07, "loss": 0.445, "step": 2413 }, { "epoch": 2.668386145910096, "grad_norm": 0.339716374874115, "learning_rate": 3.635450055829881e-07, "loss": 0.4204, "step": 2414 }, { "epoch": 2.669491525423729, "grad_norm": 0.32124167680740356, "learning_rate": 3.6113895615387416e-07, "loss": 0.4328, "step": 2415 }, { "epoch": 2.6705969049373617, "grad_norm": 0.3280034363269806, "learning_rate": 3.587405968142943e-07, "loss": 0.3893, "step": 2416 }, { "epoch": 2.671702284450995, "grad_norm": 0.3128242790699005, "learning_rate": 3.563499315401386e-07, "loss": 0.4127, "step": 2417 }, { "epoch": 2.672807663964628, "grad_norm": 0.34175074100494385, "learning_rate": 3.539669642945387e-07, "loss": 0.4564, "step": 2418 }, { "epoch": 2.6739130434782608, "grad_norm": 0.33708012104034424, "learning_rate": 3.515916990278706e-07, "loss": 0.4166, "step": 2419 }, { "epoch": 2.6750184229918936, "grad_norm": 0.322527140378952, "learning_rate": 3.4922413967773905e-07, "loss": 0.4112, "step": 2420 }, { "epoch": 2.676123802505527, "grad_norm": 0.3031521737575531, "learning_rate": 3.468642901689728e-07, "loss": 0.4552, "step": 2421 }, { "epoch": 2.67722918201916, "grad_norm": 0.2951265275478363, "learning_rate": 3.4451215441362264e-07, "loss": 0.3791, "step": 2422 }, { "epoch": 2.678334561532793, "grad_norm": 0.32490766048431396, "learning_rate": 3.421677363109499e-07, "loss": 0.3906, "step": 2423 }, { "epoch": 2.679439941046426, "grad_norm": 0.3146548271179199, "learning_rate": 3.3983103974742415e-07, "loss": 0.4374, "step": 2424 }, { "epoch": 2.680545320560059, "grad_norm": 0.30426427721977234, "learning_rate": 3.375020685967112e-07, "loss": 0.4283, "step": 2425 }, { "epoch": 2.6816507000736918, "grad_norm": 0.3029578924179077, "learning_rate": 3.351808267196721e-07, "loss": 0.4062, "step": 2426 }, { "epoch": 2.682756079587325, "grad_norm": 0.33619850873947144, "learning_rate": 3.328673179643555e-07, "loss": 0.4275, "step": 2427 }, { "epoch": 2.683861459100958, "grad_norm": 0.30904456973075867, "learning_rate": 3.3056154616598777e-07, "loss": 0.4302, "step": 2428 }, { "epoch": 2.684966838614591, "grad_norm": 0.32123053073883057, "learning_rate": 3.2826351514697207e-07, "loss": 0.3994, "step": 2429 }, { "epoch": 2.686072218128224, "grad_norm": 0.3152608275413513, "learning_rate": 3.2597322871687766e-07, "loss": 0.3859, "step": 2430 }, { "epoch": 2.687177597641857, "grad_norm": 0.35852691531181335, "learning_rate": 3.2369069067243576e-07, "loss": 0.444, "step": 2431 }, { "epoch": 2.68828297715549, "grad_norm": 0.3038095235824585, "learning_rate": 3.214159047975324e-07, "loss": 0.4215, "step": 2432 }, { "epoch": 2.689388356669123, "grad_norm": 0.33931416273117065, "learning_rate": 3.191488748632016e-07, "loss": 0.4684, "step": 2433 }, { "epoch": 2.690493736182756, "grad_norm": 0.3196990191936493, "learning_rate": 3.1688960462762263e-07, "loss": 0.439, "step": 2434 }, { "epoch": 2.691599115696389, "grad_norm": 0.3219772279262543, "learning_rate": 3.1463809783610855e-07, "loss": 0.4173, "step": 2435 }, { "epoch": 2.6927044952100223, "grad_norm": 0.31706351041793823, "learning_rate": 3.1239435822110253e-07, "loss": 0.4192, "step": 2436 }, { "epoch": 2.693809874723655, "grad_norm": 0.31864187121391296, "learning_rate": 3.101583895021731e-07, "loss": 0.4317, "step": 2437 }, { "epoch": 2.694915254237288, "grad_norm": 0.3042279779911041, "learning_rate": 3.079301953860059e-07, "loss": 0.4318, "step": 2438 }, { "epoch": 2.696020633750921, "grad_norm": 0.3046506941318512, "learning_rate": 3.057097795663988e-07, "loss": 0.3797, "step": 2439 }, { "epoch": 2.6971260132645543, "grad_norm": 0.3208453059196472, "learning_rate": 3.034971457242525e-07, "loss": 0.4191, "step": 2440 }, { "epoch": 2.698231392778187, "grad_norm": 0.3044637441635132, "learning_rate": 3.012922975275712e-07, "loss": 0.4295, "step": 2441 }, { "epoch": 2.6993367722918205, "grad_norm": 0.3187144994735718, "learning_rate": 2.990952386314505e-07, "loss": 0.4105, "step": 2442 }, { "epoch": 2.7004421518054533, "grad_norm": 0.31005996465682983, "learning_rate": 2.969059726780721e-07, "loss": 0.3906, "step": 2443 }, { "epoch": 2.701547531319086, "grad_norm": 0.32591116428375244, "learning_rate": 2.947245032967e-07, "loss": 0.4313, "step": 2444 }, { "epoch": 2.702652910832719, "grad_norm": 0.33381712436676025, "learning_rate": 2.9255083410367426e-07, "loss": 0.4375, "step": 2445 }, { "epoch": 2.7037582903463524, "grad_norm": 0.30554893612861633, "learning_rate": 2.9038496870240187e-07, "loss": 0.424, "step": 2446 }, { "epoch": 2.7048636698599853, "grad_norm": 0.3161725103855133, "learning_rate": 2.8822691068335515e-07, "loss": 0.4094, "step": 2447 }, { "epoch": 2.705969049373618, "grad_norm": 0.342393696308136, "learning_rate": 2.860766636240636e-07, "loss": 0.4333, "step": 2448 }, { "epoch": 2.7070744288872515, "grad_norm": 0.3374008536338806, "learning_rate": 2.839342310891069e-07, "loss": 0.4016, "step": 2449 }, { "epoch": 2.7081798084008843, "grad_norm": 0.34246793389320374, "learning_rate": 2.817996166301107e-07, "loss": 0.4178, "step": 2450 }, { "epoch": 2.7092851879145172, "grad_norm": 0.33946776390075684, "learning_rate": 2.796728237857399e-07, "loss": 0.3811, "step": 2451 }, { "epoch": 2.71039056742815, "grad_norm": 0.31593406200408936, "learning_rate": 2.7755385608169374e-07, "loss": 0.4336, "step": 2452 }, { "epoch": 2.7114959469417834, "grad_norm": 0.3200725018978119, "learning_rate": 2.754427170306995e-07, "loss": 0.4442, "step": 2453 }, { "epoch": 2.7126013264554163, "grad_norm": 0.30526360869407654, "learning_rate": 2.733394101325054e-07, "loss": 0.4253, "step": 2454 }, { "epoch": 2.7137067059690496, "grad_norm": 0.33645331859588623, "learning_rate": 2.7124393887387635e-07, "loss": 0.4243, "step": 2455 }, { "epoch": 2.7148120854826825, "grad_norm": 0.3296240270137787, "learning_rate": 2.691563067285874e-07, "loss": 0.4004, "step": 2456 }, { "epoch": 2.7159174649963154, "grad_norm": 0.2926667630672455, "learning_rate": 2.6707651715742075e-07, "loss": 0.3652, "step": 2457 }, { "epoch": 2.7170228445099482, "grad_norm": 0.33938029408454895, "learning_rate": 2.65004573608153e-07, "loss": 0.428, "step": 2458 }, { "epoch": 2.7181282240235816, "grad_norm": 0.31595200300216675, "learning_rate": 2.629404795155571e-07, "loss": 0.4119, "step": 2459 }, { "epoch": 2.7192336035372144, "grad_norm": 0.32670292258262634, "learning_rate": 2.608842383013943e-07, "loss": 0.4261, "step": 2460 }, { "epoch": 2.7203389830508473, "grad_norm": 0.3276672065258026, "learning_rate": 2.5883585337440455e-07, "loss": 0.4323, "step": 2461 }, { "epoch": 2.7214443625644806, "grad_norm": 0.32694870233535767, "learning_rate": 2.567953281303059e-07, "loss": 0.4265, "step": 2462 }, { "epoch": 2.7225497420781135, "grad_norm": 0.2960048317909241, "learning_rate": 2.5476266595178823e-07, "loss": 0.4181, "step": 2463 }, { "epoch": 2.7236551215917464, "grad_norm": 0.34383127093315125, "learning_rate": 2.527378702085037e-07, "loss": 0.4036, "step": 2464 }, { "epoch": 2.7247605011053793, "grad_norm": 0.3284832537174225, "learning_rate": 2.507209442570652e-07, "loss": 0.4238, "step": 2465 }, { "epoch": 2.7258658806190126, "grad_norm": 0.32598477602005005, "learning_rate": 2.4871189144104025e-07, "loss": 0.454, "step": 2466 }, { "epoch": 2.7269712601326455, "grad_norm": 0.3173064589500427, "learning_rate": 2.4671071509094367e-07, "loss": 0.3645, "step": 2467 }, { "epoch": 2.728076639646279, "grad_norm": 0.3124944865703583, "learning_rate": 2.447174185242324e-07, "loss": 0.4366, "step": 2468 }, { "epoch": 2.7291820191599117, "grad_norm": 0.3024873733520508, "learning_rate": 2.4273200504530136e-07, "loss": 0.4096, "step": 2469 }, { "epoch": 2.7302873986735445, "grad_norm": 0.31801459193229675, "learning_rate": 2.407544779454779e-07, "loss": 0.3938, "step": 2470 }, { "epoch": 2.7313927781871774, "grad_norm": 0.30136021971702576, "learning_rate": 2.3878484050301576e-07, "loss": 0.3868, "step": 2471 }, { "epoch": 2.7324981577008107, "grad_norm": 0.3088766634464264, "learning_rate": 2.368230959830875e-07, "loss": 0.4632, "step": 2472 }, { "epoch": 2.7336035372144436, "grad_norm": 0.2929951846599579, "learning_rate": 2.3486924763778286e-07, "loss": 0.3839, "step": 2473 }, { "epoch": 2.7347089167280765, "grad_norm": 0.3250470459461212, "learning_rate": 2.32923298706102e-07, "loss": 0.4614, "step": 2474 }, { "epoch": 2.73581429624171, "grad_norm": 0.3318677842617035, "learning_rate": 2.3098525241394888e-07, "loss": 0.4311, "step": 2475 }, { "epoch": 2.7369196757553427, "grad_norm": 0.29899752140045166, "learning_rate": 2.2905511197412634e-07, "loss": 0.439, "step": 2476 }, { "epoch": 2.7380250552689755, "grad_norm": 0.29636746644973755, "learning_rate": 2.2713288058633321e-07, "loss": 0.3922, "step": 2477 }, { "epoch": 2.7391304347826084, "grad_norm": 0.2960865795612335, "learning_rate": 2.2521856143715492e-07, "loss": 0.382, "step": 2478 }, { "epoch": 2.7402358142962417, "grad_norm": 0.33268216252326965, "learning_rate": 2.2331215770006299e-07, "loss": 0.4653, "step": 2479 }, { "epoch": 2.7413411938098746, "grad_norm": 0.30552753806114197, "learning_rate": 2.2141367253540325e-07, "loss": 0.3972, "step": 2480 }, { "epoch": 2.742446573323508, "grad_norm": 0.352172315120697, "learning_rate": 2.1952310909039766e-07, "loss": 0.4464, "step": 2481 }, { "epoch": 2.743551952837141, "grad_norm": 0.3132232129573822, "learning_rate": 2.1764047049913528e-07, "loss": 0.4315, "step": 2482 }, { "epoch": 2.7446573323507737, "grad_norm": 0.31299328804016113, "learning_rate": 2.1576575988256688e-07, "loss": 0.3713, "step": 2483 }, { "epoch": 2.7457627118644066, "grad_norm": 0.33321136236190796, "learning_rate": 2.1389898034850086e-07, "loss": 0.4323, "step": 2484 }, { "epoch": 2.74686809137804, "grad_norm": 0.3321469724178314, "learning_rate": 2.120401349915996e-07, "loss": 0.4562, "step": 2485 }, { "epoch": 2.7479734708916728, "grad_norm": 0.3229837119579315, "learning_rate": 2.1018922689336973e-07, "loss": 0.4023, "step": 2486 }, { "epoch": 2.749078850405306, "grad_norm": 0.3356492221355438, "learning_rate": 2.0834625912216133e-07, "loss": 0.3917, "step": 2487 }, { "epoch": 2.750184229918939, "grad_norm": 0.32265686988830566, "learning_rate": 2.0651123473316103e-07, "loss": 0.4376, "step": 2488 }, { "epoch": 2.751289609432572, "grad_norm": 0.3083325922489166, "learning_rate": 2.0468415676838882e-07, "loss": 0.396, "step": 2489 }, { "epoch": 2.7523949889462047, "grad_norm": 0.32437875866889954, "learning_rate": 2.0286502825668852e-07, "loss": 0.4679, "step": 2490 }, { "epoch": 2.753500368459838, "grad_norm": 0.30181628465652466, "learning_rate": 2.0105385221372787e-07, "loss": 0.3997, "step": 2491 }, { "epoch": 2.754605747973471, "grad_norm": 0.3384130895137787, "learning_rate": 1.992506316419912e-07, "loss": 0.4608, "step": 2492 }, { "epoch": 2.755711127487104, "grad_norm": 0.3237173557281494, "learning_rate": 1.974553695307746e-07, "loss": 0.4285, "step": 2493 }, { "epoch": 2.756816507000737, "grad_norm": 0.3220784366130829, "learning_rate": 1.9566806885617906e-07, "loss": 0.3871, "step": 2494 }, { "epoch": 2.75792188651437, "grad_norm": 0.3524687886238098, "learning_rate": 1.9388873258111064e-07, "loss": 0.4634, "step": 2495 }, { "epoch": 2.759027266028003, "grad_norm": 0.3238230049610138, "learning_rate": 1.921173636552698e-07, "loss": 0.4072, "step": 2496 }, { "epoch": 2.7601326455416357, "grad_norm": 0.3045985996723175, "learning_rate": 1.9035396501515148e-07, "loss": 0.3657, "step": 2497 }, { "epoch": 2.761238025055269, "grad_norm": 0.30765384435653687, "learning_rate": 1.8859853958403507e-07, "loss": 0.4082, "step": 2498 }, { "epoch": 2.762343404568902, "grad_norm": 0.31616610288619995, "learning_rate": 1.8685109027198389e-07, "loss": 0.3891, "step": 2499 }, { "epoch": 2.7634487840825352, "grad_norm": 0.3398989737033844, "learning_rate": 1.8511161997584015e-07, "loss": 0.4337, "step": 2500 }, { "epoch": 2.764554163596168, "grad_norm": 0.3086039125919342, "learning_rate": 1.8338013157921552e-07, "loss": 0.3836, "step": 2501 }, { "epoch": 2.765659543109801, "grad_norm": 0.33187663555145264, "learning_rate": 1.8165662795249172e-07, "loss": 0.4312, "step": 2502 }, { "epoch": 2.766764922623434, "grad_norm": 0.313838928937912, "learning_rate": 1.7994111195281438e-07, "loss": 0.4197, "step": 2503 }, { "epoch": 2.767870302137067, "grad_norm": 0.31728529930114746, "learning_rate": 1.7823358642408694e-07, "loss": 0.4101, "step": 2504 }, { "epoch": 2.7689756816507, "grad_norm": 0.34992900490760803, "learning_rate": 1.7653405419696456e-07, "loss": 0.4615, "step": 2505 }, { "epoch": 2.770081061164333, "grad_norm": 0.3347248435020447, "learning_rate": 1.748425180888541e-07, "loss": 0.3824, "step": 2506 }, { "epoch": 2.7711864406779663, "grad_norm": 0.3793220520019531, "learning_rate": 1.7315898090390748e-07, "loss": 0.4024, "step": 2507 }, { "epoch": 2.772291820191599, "grad_norm": 0.30920934677124023, "learning_rate": 1.7148344543301277e-07, "loss": 0.3999, "step": 2508 }, { "epoch": 2.773397199705232, "grad_norm": 0.32772096991539, "learning_rate": 1.6981591445379641e-07, "loss": 0.4172, "step": 2509 }, { "epoch": 2.774502579218865, "grad_norm": 0.32845714688301086, "learning_rate": 1.681563907306133e-07, "loss": 0.4481, "step": 2510 }, { "epoch": 2.775607958732498, "grad_norm": 0.32308119535446167, "learning_rate": 1.6650487701454665e-07, "loss": 0.4208, "step": 2511 }, { "epoch": 2.776713338246131, "grad_norm": 0.2887328565120697, "learning_rate": 1.6486137604339813e-07, "loss": 0.3762, "step": 2512 }, { "epoch": 2.7778187177597644, "grad_norm": 0.32525667548179626, "learning_rate": 1.632258905416889e-07, "loss": 0.4456, "step": 2513 }, { "epoch": 2.7789240972733973, "grad_norm": 0.31139981746673584, "learning_rate": 1.6159842322065022e-07, "loss": 0.4168, "step": 2514 }, { "epoch": 2.78002947678703, "grad_norm": 0.3126009702682495, "learning_rate": 1.5997897677822282e-07, "loss": 0.4352, "step": 2515 }, { "epoch": 2.781134856300663, "grad_norm": 0.2998673915863037, "learning_rate": 1.5836755389905035e-07, "loss": 0.383, "step": 2516 }, { "epoch": 2.7822402358142964, "grad_norm": 0.32473263144493103, "learning_rate": 1.5676415725447424e-07, "loss": 0.4186, "step": 2517 }, { "epoch": 2.7833456153279292, "grad_norm": 0.3185179531574249, "learning_rate": 1.5516878950253333e-07, "loss": 0.4128, "step": 2518 }, { "epoch": 2.784450994841562, "grad_norm": 0.36243706941604614, "learning_rate": 1.535814532879526e-07, "loss": 0.4627, "step": 2519 }, { "epoch": 2.7855563743551954, "grad_norm": 0.29021963477134705, "learning_rate": 1.5200215124214491e-07, "loss": 0.3657, "step": 2520 }, { "epoch": 2.7866617538688283, "grad_norm": 0.3366038203239441, "learning_rate": 1.5043088598320544e-07, "loss": 0.4607, "step": 2521 }, { "epoch": 2.787767133382461, "grad_norm": 0.3503974676132202, "learning_rate": 1.4886766011590449e-07, "loss": 0.4151, "step": 2522 }, { "epoch": 2.788872512896094, "grad_norm": 0.3165070712566376, "learning_rate": 1.4731247623168577e-07, "loss": 0.4045, "step": 2523 }, { "epoch": 2.7899778924097274, "grad_norm": 0.3169047236442566, "learning_rate": 1.4576533690866035e-07, "loss": 0.4334, "step": 2524 }, { "epoch": 2.7910832719233603, "grad_norm": 0.31384870409965515, "learning_rate": 1.442262447116055e-07, "loss": 0.4122, "step": 2525 }, { "epoch": 2.7921886514369936, "grad_norm": 0.30722713470458984, "learning_rate": 1.4269520219195753e-07, "loss": 0.3691, "step": 2526 }, { "epoch": 2.7932940309506264, "grad_norm": 0.32159650325775146, "learning_rate": 1.4117221188780616e-07, "loss": 0.4122, "step": 2527 }, { "epoch": 2.7943994104642593, "grad_norm": 0.34562867879867554, "learning_rate": 1.396572763238957e-07, "loss": 0.4398, "step": 2528 }, { "epoch": 2.795504789977892, "grad_norm": 0.29789167642593384, "learning_rate": 1.3815039801161723e-07, "loss": 0.4032, "step": 2529 }, { "epoch": 2.7966101694915255, "grad_norm": 0.30080607533454895, "learning_rate": 1.3665157944900198e-07, "loss": 0.4254, "step": 2530 }, { "epoch": 2.7977155490051584, "grad_norm": 0.2947518825531006, "learning_rate": 1.35160823120723e-07, "loss": 0.4172, "step": 2531 }, { "epoch": 2.7988209285187917, "grad_norm": 0.3348318636417389, "learning_rate": 1.3367813149808728e-07, "loss": 0.4291, "step": 2532 }, { "epoch": 2.7999263080324246, "grad_norm": 0.31446197628974915, "learning_rate": 1.3220350703903318e-07, "loss": 0.4187, "step": 2533 }, { "epoch": 2.8010316875460575, "grad_norm": 0.29712116718292236, "learning_rate": 1.3073695218812356e-07, "loss": 0.3951, "step": 2534 }, { "epoch": 2.8021370670596903, "grad_norm": 0.32829946279525757, "learning_rate": 1.2927846937654641e-07, "loss": 0.4714, "step": 2535 }, { "epoch": 2.8032424465733237, "grad_norm": 0.28297463059425354, "learning_rate": 1.278280610221072e-07, "loss": 0.3635, "step": 2536 }, { "epoch": 2.8043478260869565, "grad_norm": 0.32341304421424866, "learning_rate": 1.2638572952922478e-07, "loss": 0.4501, "step": 2537 }, { "epoch": 2.8054532056005894, "grad_norm": 0.32119497656822205, "learning_rate": 1.2495147728893043e-07, "loss": 0.413, "step": 2538 }, { "epoch": 2.8065585851142227, "grad_norm": 0.31692424416542053, "learning_rate": 1.2352530667886164e-07, "loss": 0.3946, "step": 2539 }, { "epoch": 2.8076639646278556, "grad_norm": 0.3133922219276428, "learning_rate": 1.2210722006325782e-07, "loss": 0.4298, "step": 2540 }, { "epoch": 2.8087693441414885, "grad_norm": 0.3382653594017029, "learning_rate": 1.206972197929568e-07, "loss": 0.4234, "step": 2541 }, { "epoch": 2.8098747236551214, "grad_norm": 0.3231433033943176, "learning_rate": 1.192953082053927e-07, "loss": 0.3702, "step": 2542 }, { "epoch": 2.8109801031687547, "grad_norm": 0.3071088194847107, "learning_rate": 1.1790148762458931e-07, "loss": 0.4359, "step": 2543 }, { "epoch": 2.8120854826823876, "grad_norm": 0.2816835641860962, "learning_rate": 1.1651576036115942e-07, "loss": 0.4075, "step": 2544 }, { "epoch": 2.813190862196021, "grad_norm": 0.33552783727645874, "learning_rate": 1.1513812871229547e-07, "loss": 0.4448, "step": 2545 }, { "epoch": 2.8142962417096538, "grad_norm": 0.30780816078186035, "learning_rate": 1.1376859496177228e-07, "loss": 0.4076, "step": 2546 }, { "epoch": 2.8154016212232866, "grad_norm": 0.3009544610977173, "learning_rate": 1.1240716137994045e-07, "loss": 0.418, "step": 2547 }, { "epoch": 2.8165070007369195, "grad_norm": 0.3060729205608368, "learning_rate": 1.110538302237213e-07, "loss": 0.4063, "step": 2548 }, { "epoch": 2.817612380250553, "grad_norm": 0.316890686750412, "learning_rate": 1.0970860373660352e-07, "loss": 0.4309, "step": 2549 }, { "epoch": 2.8187177597641857, "grad_norm": 0.30793261528015137, "learning_rate": 1.0837148414864329e-07, "loss": 0.4196, "step": 2550 }, { "epoch": 2.8198231392778186, "grad_norm": 0.31760865449905396, "learning_rate": 1.0704247367645526e-07, "loss": 0.4294, "step": 2551 }, { "epoch": 2.820928518791452, "grad_norm": 0.29133930802345276, "learning_rate": 1.0572157452321097e-07, "loss": 0.3938, "step": 2552 }, { "epoch": 2.8220338983050848, "grad_norm": 0.32197219133377075, "learning_rate": 1.0440878887863604e-07, "loss": 0.3985, "step": 2553 }, { "epoch": 2.8231392778187177, "grad_norm": 0.3215283155441284, "learning_rate": 1.0310411891900629e-07, "loss": 0.4311, "step": 2554 }, { "epoch": 2.8242446573323505, "grad_norm": 0.33433040976524353, "learning_rate": 1.0180756680714277e-07, "loss": 0.4555, "step": 2555 }, { "epoch": 2.825350036845984, "grad_norm": 0.30185604095458984, "learning_rate": 1.0051913469241003e-07, "loss": 0.4065, "step": 2556 }, { "epoch": 2.8264554163596167, "grad_norm": 0.2899397313594818, "learning_rate": 9.923882471071123e-08, "loss": 0.4104, "step": 2557 }, { "epoch": 2.82756079587325, "grad_norm": 0.3248623013496399, "learning_rate": 9.796663898448578e-08, "loss": 0.4446, "step": 2558 }, { "epoch": 2.828666175386883, "grad_norm": 0.31715866923332214, "learning_rate": 9.670257962270279e-08, "loss": 0.4051, "step": 2559 }, { "epoch": 2.829771554900516, "grad_norm": 0.3104207217693329, "learning_rate": 9.544664872086329e-08, "loss": 0.4288, "step": 2560 }, { "epoch": 2.8308769344141487, "grad_norm": 0.34563595056533813, "learning_rate": 9.419884836099014e-08, "loss": 0.4581, "step": 2561 }, { "epoch": 2.831982313927782, "grad_norm": 0.3218534588813782, "learning_rate": 9.295918061163034e-08, "loss": 0.369, "step": 2562 }, { "epoch": 2.833087693441415, "grad_norm": 0.36176514625549316, "learning_rate": 9.172764752784613e-08, "loss": 0.4435, "step": 2563 }, { "epoch": 2.8341930729550477, "grad_norm": 0.3149697482585907, "learning_rate": 9.050425115121775e-08, "loss": 0.4005, "step": 2564 }, { "epoch": 2.835298452468681, "grad_norm": 0.2926124632358551, "learning_rate": 8.928899350983456e-08, "loss": 0.351, "step": 2565 }, { "epoch": 2.836403831982314, "grad_norm": 0.3368280529975891, "learning_rate": 8.808187661829504e-08, "loss": 0.4414, "step": 2566 }, { "epoch": 2.837509211495947, "grad_norm": 0.3031342327594757, "learning_rate": 8.688290247770071e-08, "loss": 0.4121, "step": 2567 }, { "epoch": 2.8386145910095797, "grad_norm": 0.30731186270713806, "learning_rate": 8.569207307565664e-08, "loss": 0.4281, "step": 2568 }, { "epoch": 2.839719970523213, "grad_norm": 0.3078130781650543, "learning_rate": 8.450939038626427e-08, "loss": 0.4548, "step": 2569 }, { "epoch": 2.840825350036846, "grad_norm": 0.3113615810871124, "learning_rate": 8.333485637012029e-08, "loss": 0.4349, "step": 2570 }, { "epoch": 2.841930729550479, "grad_norm": 0.33813953399658203, "learning_rate": 8.216847297431218e-08, "loss": 0.4386, "step": 2571 }, { "epoch": 2.843036109064112, "grad_norm": 0.3199760317802429, "learning_rate": 8.101024213241826e-08, "loss": 0.4334, "step": 2572 }, { "epoch": 2.844141488577745, "grad_norm": 0.3021185100078583, "learning_rate": 7.986016576449874e-08, "loss": 0.3863, "step": 2573 }, { "epoch": 2.845246868091378, "grad_norm": 0.33228638768196106, "learning_rate": 7.871824577709797e-08, "loss": 0.3663, "step": 2574 }, { "epoch": 2.846352247605011, "grad_norm": 0.31640806794166565, "learning_rate": 7.758448406323727e-08, "loss": 0.4547, "step": 2575 }, { "epoch": 2.847457627118644, "grad_norm": 0.3231935203075409, "learning_rate": 7.645888250241485e-08, "loss": 0.4252, "step": 2576 }, { "epoch": 2.8485630066322774, "grad_norm": 0.31141477823257446, "learning_rate": 7.534144296060142e-08, "loss": 0.4163, "step": 2577 }, { "epoch": 2.8496683861459102, "grad_norm": 0.3135634660720825, "learning_rate": 7.423216729023574e-08, "loss": 0.4427, "step": 2578 }, { "epoch": 2.850773765659543, "grad_norm": 0.3040454089641571, "learning_rate": 7.313105733022407e-08, "loss": 0.4383, "step": 2579 }, { "epoch": 2.851879145173176, "grad_norm": 0.31591570377349854, "learning_rate": 7.203811490593626e-08, "loss": 0.4148, "step": 2580 }, { "epoch": 2.8529845246868093, "grad_norm": 0.3239080607891083, "learning_rate": 7.095334182920077e-08, "loss": 0.4343, "step": 2581 }, { "epoch": 2.854089904200442, "grad_norm": 0.2770429849624634, "learning_rate": 6.987673989830523e-08, "loss": 0.3218, "step": 2582 }, { "epoch": 2.855195283714075, "grad_norm": 0.3544721007347107, "learning_rate": 6.880831089798978e-08, "loss": 0.4788, "step": 2583 }, { "epoch": 2.8563006632277084, "grad_norm": 0.3322450518608093, "learning_rate": 6.774805659944761e-08, "loss": 0.4454, "step": 2584 }, { "epoch": 2.8574060427413412, "grad_norm": 0.31595906615257263, "learning_rate": 6.669597876031775e-08, "loss": 0.4113, "step": 2585 }, { "epoch": 2.858511422254974, "grad_norm": 0.30101466178894043, "learning_rate": 6.565207912468785e-08, "loss": 0.4038, "step": 2586 }, { "epoch": 2.859616801768607, "grad_norm": 0.30933207273483276, "learning_rate": 6.461635942308641e-08, "loss": 0.3917, "step": 2587 }, { "epoch": 2.8607221812822403, "grad_norm": 0.3089444935321808, "learning_rate": 6.358882137248001e-08, "loss": 0.379, "step": 2588 }, { "epoch": 2.861827560795873, "grad_norm": 0.32714906334877014, "learning_rate": 6.25694666762755e-08, "loss": 0.4794, "step": 2589 }, { "epoch": 2.8629329403095065, "grad_norm": 0.30374154448509216, "learning_rate": 6.15582970243117e-08, "loss": 0.388, "step": 2590 }, { "epoch": 2.8640383198231394, "grad_norm": 0.32041075825691223, "learning_rate": 6.05553140928583e-08, "loss": 0.4066, "step": 2591 }, { "epoch": 2.8651436993367723, "grad_norm": 0.3196339011192322, "learning_rate": 5.9560519544614725e-08, "loss": 0.4173, "step": 2592 }, { "epoch": 2.866249078850405, "grad_norm": 0.31186214089393616, "learning_rate": 5.857391502870458e-08, "loss": 0.3997, "step": 2593 }, { "epoch": 2.8673544583640385, "grad_norm": 0.3238653838634491, "learning_rate": 5.759550218067622e-08, "loss": 0.4346, "step": 2594 }, { "epoch": 2.8684598378776713, "grad_norm": 0.30448201298713684, "learning_rate": 5.662528262249667e-08, "loss": 0.432, "step": 2595 }, { "epoch": 2.869565217391304, "grad_norm": 0.30383431911468506, "learning_rate": 5.566325796255101e-08, "loss": 0.3986, "step": 2596 }, { "epoch": 2.8706705969049375, "grad_norm": 0.3009508550167084, "learning_rate": 5.47094297956402e-08, "loss": 0.402, "step": 2597 }, { "epoch": 2.8717759764185704, "grad_norm": 0.31604325771331787, "learning_rate": 5.3763799702975516e-08, "loss": 0.4296, "step": 2598 }, { "epoch": 2.8728813559322033, "grad_norm": 0.7109905481338501, "learning_rate": 5.28263692521791e-08, "loss": 0.415, "step": 2599 }, { "epoch": 2.873986735445836, "grad_norm": 0.34884485602378845, "learning_rate": 5.1897139997280074e-08, "loss": 0.4033, "step": 2600 }, { "epoch": 2.8750921149594695, "grad_norm": 0.3229202330112457, "learning_rate": 5.097611347871123e-08, "loss": 0.3951, "step": 2601 }, { "epoch": 2.8761974944731024, "grad_norm": 0.33840277791023254, "learning_rate": 5.0063291223308993e-08, "loss": 0.4271, "step": 2602 }, { "epoch": 2.8773028739867357, "grad_norm": 0.34103095531463623, "learning_rate": 4.9158674744306246e-08, "loss": 0.4429, "step": 2603 }, { "epoch": 2.8784082535003686, "grad_norm": 0.3251691162586212, "learning_rate": 4.826226554133562e-08, "loss": 0.4071, "step": 2604 }, { "epoch": 2.8795136330140014, "grad_norm": 0.31900516152381897, "learning_rate": 4.737406510042286e-08, "loss": 0.3964, "step": 2605 }, { "epoch": 2.8806190125276343, "grad_norm": 0.31169071793556213, "learning_rate": 4.649407489398461e-08, "loss": 0.4127, "step": 2606 }, { "epoch": 2.8817243920412676, "grad_norm": 0.3445770740509033, "learning_rate": 4.5622296380828936e-08, "loss": 0.4602, "step": 2607 }, { "epoch": 2.8828297715549005, "grad_norm": 0.30880963802337646, "learning_rate": 4.4758731006149804e-08, "loss": 0.4124, "step": 2608 }, { "epoch": 2.8839351510685334, "grad_norm": 0.33754661679267883, "learning_rate": 4.390338020152596e-08, "loss": 0.4427, "step": 2609 }, { "epoch": 2.8850405305821667, "grad_norm": 0.31627652049064636, "learning_rate": 4.305624538491815e-08, "loss": 0.3831, "step": 2610 }, { "epoch": 2.8861459100957996, "grad_norm": 0.32945653796195984, "learning_rate": 4.221732796066746e-08, "loss": 0.4728, "step": 2611 }, { "epoch": 2.8872512896094324, "grad_norm": 0.30028197169303894, "learning_rate": 4.138662931949255e-08, "loss": 0.3901, "step": 2612 }, { "epoch": 2.8883566691230653, "grad_norm": 0.3145802617073059, "learning_rate": 4.056415083848742e-08, "loss": 0.4227, "step": 2613 }, { "epoch": 2.8894620486366986, "grad_norm": 0.3299754559993744, "learning_rate": 3.974989388111861e-08, "loss": 0.4197, "step": 2614 }, { "epoch": 2.8905674281503315, "grad_norm": 0.2982333302497864, "learning_rate": 3.894385979722359e-08, "loss": 0.3538, "step": 2615 }, { "epoch": 2.891672807663965, "grad_norm": 0.3011651933193207, "learning_rate": 3.814604992300908e-08, "loss": 0.4506, "step": 2616 }, { "epoch": 2.8927781871775977, "grad_norm": 0.2936476767063141, "learning_rate": 3.7356465581047105e-08, "loss": 0.4157, "step": 2617 }, { "epoch": 2.8938835666912306, "grad_norm": 0.30775657296180725, "learning_rate": 3.657510808027343e-08, "loss": 0.4654, "step": 2618 }, { "epoch": 2.8949889462048635, "grad_norm": 0.3073059916496277, "learning_rate": 3.580197871598745e-08, "loss": 0.3864, "step": 2619 }, { "epoch": 2.896094325718497, "grad_norm": 0.3336029350757599, "learning_rate": 3.503707876984674e-08, "loss": 0.3718, "step": 2620 }, { "epoch": 2.8971997052321297, "grad_norm": 0.2962646186351776, "learning_rate": 3.4280409509867e-08, "loss": 0.405, "step": 2621 }, { "epoch": 2.898305084745763, "grad_norm": 0.3204326033592224, "learning_rate": 3.353197219041981e-08, "loss": 0.4191, "step": 2622 }, { "epoch": 2.899410464259396, "grad_norm": 0.3339061737060547, "learning_rate": 3.279176805222994e-08, "loss": 0.4224, "step": 2623 }, { "epoch": 2.9005158437730287, "grad_norm": 0.3077104985713959, "learning_rate": 3.205979832237416e-08, "loss": 0.4085, "step": 2624 }, { "epoch": 2.9016212232866616, "grad_norm": 0.32161781191825867, "learning_rate": 3.133606421427682e-08, "loss": 0.4343, "step": 2625 }, { "epoch": 2.902726602800295, "grad_norm": 0.2887473404407501, "learning_rate": 3.062056692771154e-08, "loss": 0.4146, "step": 2626 }, { "epoch": 2.903831982313928, "grad_norm": 0.31666043400764465, "learning_rate": 2.9913307648797293e-08, "loss": 0.4303, "step": 2627 }, { "epoch": 2.9049373618275607, "grad_norm": 0.31727319955825806, "learning_rate": 2.9214287549995114e-08, "loss": 0.4206, "step": 2628 }, { "epoch": 2.906042741341194, "grad_norm": 0.33489421010017395, "learning_rate": 2.8523507790108594e-08, "loss": 0.4635, "step": 2629 }, { "epoch": 2.907148120854827, "grad_norm": 0.3052200675010681, "learning_rate": 2.7840969514279503e-08, "loss": 0.372, "step": 2630 }, { "epoch": 2.9082535003684598, "grad_norm": 0.3225933611392975, "learning_rate": 2.7166673853989966e-08, "loss": 0.3911, "step": 2631 }, { "epoch": 2.9093588798820926, "grad_norm": 0.30671951174736023, "learning_rate": 2.6500621927054716e-08, "loss": 0.4493, "step": 2632 }, { "epoch": 2.910464259395726, "grad_norm": 0.318754643201828, "learning_rate": 2.5842814837624964e-08, "loss": 0.3966, "step": 2633 }, { "epoch": 2.911569638909359, "grad_norm": 0.3007314205169678, "learning_rate": 2.519325367618175e-08, "loss": 0.4124, "step": 2634 }, { "epoch": 2.912675018422992, "grad_norm": 0.3081384003162384, "learning_rate": 2.4551939519538713e-08, "loss": 0.4192, "step": 2635 }, { "epoch": 2.913780397936625, "grad_norm": 0.33986949920654297, "learning_rate": 2.3918873430835986e-08, "loss": 0.4527, "step": 2636 }, { "epoch": 2.914885777450258, "grad_norm": 0.3175216019153595, "learning_rate": 2.3294056459541302e-08, "loss": 0.3738, "step": 2637 }, { "epoch": 2.9159911569638908, "grad_norm": 0.3064916431903839, "learning_rate": 2.2677489641448335e-08, "loss": 0.3749, "step": 2638 }, { "epoch": 2.917096536477524, "grad_norm": 0.3000575006008148, "learning_rate": 2.2069173998672256e-08, "loss": 0.39, "step": 2639 }, { "epoch": 2.918201915991157, "grad_norm": 0.3113916516304016, "learning_rate": 2.1469110539650283e-08, "loss": 0.422, "step": 2640 }, { "epoch": 2.91930729550479, "grad_norm": 0.3075152039527893, "learning_rate": 2.0877300259141143e-08, "loss": 0.4494, "step": 2641 }, { "epoch": 2.920412675018423, "grad_norm": 0.30388009548187256, "learning_rate": 2.0293744138219495e-08, "loss": 0.4212, "step": 2642 }, { "epoch": 2.921518054532056, "grad_norm": 0.3475492596626282, "learning_rate": 1.971844314427762e-08, "loss": 0.4357, "step": 2643 }, { "epoch": 2.922623434045689, "grad_norm": 0.32731565833091736, "learning_rate": 1.9151398231024297e-08, "loss": 0.4057, "step": 2644 }, { "epoch": 2.923728813559322, "grad_norm": 0.3045297861099243, "learning_rate": 1.85926103384787e-08, "loss": 0.4354, "step": 2645 }, { "epoch": 2.924834193072955, "grad_norm": 0.30787357687950134, "learning_rate": 1.8042080392974837e-08, "loss": 0.3689, "step": 2646 }, { "epoch": 2.925939572586588, "grad_norm": 0.3507116734981537, "learning_rate": 1.7499809307154892e-08, "loss": 0.4465, "step": 2647 }, { "epoch": 2.9270449521002213, "grad_norm": 0.2966473400592804, "learning_rate": 1.6965797979971442e-08, "loss": 0.3747, "step": 2648 }, { "epoch": 2.928150331613854, "grad_norm": 0.330069899559021, "learning_rate": 1.6440047296683582e-08, "loss": 0.4315, "step": 2649 }, { "epoch": 2.929255711127487, "grad_norm": 0.3098735511302948, "learning_rate": 1.5922558128856903e-08, "loss": 0.4132, "step": 2650 }, { "epoch": 2.93036109064112, "grad_norm": 0.3052027225494385, "learning_rate": 1.541333133436018e-08, "loss": 0.41, "step": 2651 }, { "epoch": 2.9314664701547533, "grad_norm": 0.33234983682632446, "learning_rate": 1.4912367757366485e-08, "loss": 0.4398, "step": 2652 }, { "epoch": 2.932571849668386, "grad_norm": 0.3147795796394348, "learning_rate": 1.4419668228350947e-08, "loss": 0.398, "step": 2653 }, { "epoch": 2.933677229182019, "grad_norm": 0.3135552406311035, "learning_rate": 1.3935233564086326e-08, "loss": 0.4241, "step": 2654 }, { "epoch": 2.9347826086956523, "grad_norm": 0.3299594819545746, "learning_rate": 1.3459064567647452e-08, "loss": 0.4631, "step": 2655 }, { "epoch": 2.935887988209285, "grad_norm": 0.3201850354671478, "learning_rate": 1.2991162028405113e-08, "loss": 0.4108, "step": 2656 }, { "epoch": 2.936993367722918, "grad_norm": 0.31018540263175964, "learning_rate": 1.2531526722026067e-08, "loss": 0.4052, "step": 2657 }, { "epoch": 2.938098747236551, "grad_norm": 0.31812700629234314, "learning_rate": 1.2080159410471914e-08, "loss": 0.4221, "step": 2658 }, { "epoch": 2.9392041267501843, "grad_norm": 0.32685720920562744, "learning_rate": 1.1637060842000224e-08, "loss": 0.3997, "step": 2659 }, { "epoch": 2.940309506263817, "grad_norm": 0.31468573212623596, "learning_rate": 1.1202231751157866e-08, "loss": 0.4226, "step": 2660 }, { "epoch": 2.9414148857774505, "grad_norm": 0.29703202843666077, "learning_rate": 1.0775672858785446e-08, "loss": 0.3951, "step": 2661 }, { "epoch": 2.9425202652910833, "grad_norm": 0.33094730973243713, "learning_rate": 1.0357384872011767e-08, "loss": 0.4194, "step": 2662 }, { "epoch": 2.9436256448047162, "grad_norm": 0.31584885716438293, "learning_rate": 9.947368484256036e-09, "loss": 0.4432, "step": 2663 }, { "epoch": 2.944731024318349, "grad_norm": 0.3220323324203491, "learning_rate": 9.545624375223439e-09, "loss": 0.3988, "step": 2664 }, { "epoch": 2.9458364038319824, "grad_norm": 0.29242783784866333, "learning_rate": 9.152153210907899e-09, "loss": 0.3644, "step": 2665 }, { "epoch": 2.9469417833456153, "grad_norm": 0.3167654871940613, "learning_rate": 8.766955643587094e-09, "loss": 0.4533, "step": 2666 }, { "epoch": 2.9480471628592486, "grad_norm": 0.33437883853912354, "learning_rate": 8.390032311824115e-09, "loss": 0.4462, "step": 2667 }, { "epoch": 2.9491525423728815, "grad_norm": 0.3151646554470062, "learning_rate": 8.021383840465247e-09, "loss": 0.4166, "step": 2668 }, { "epoch": 2.9502579218865144, "grad_norm": 0.298500120639801, "learning_rate": 7.66101084063775e-09, "loss": 0.3877, "step": 2669 }, { "epoch": 2.9513633014001472, "grad_norm": 0.344725638628006, "learning_rate": 7.308913909752635e-09, "loss": 0.4007, "step": 2670 }, { "epoch": 2.9524686809137806, "grad_norm": 0.3228418827056885, "learning_rate": 6.965093631499665e-09, "loss": 0.4309, "step": 2671 }, { "epoch": 2.9535740604274134, "grad_norm": 0.3088592290878296, "learning_rate": 6.629550575847355e-09, "loss": 0.3832, "step": 2672 }, { "epoch": 2.9546794399410463, "grad_norm": 0.30611205101013184, "learning_rate": 6.302285299044642e-09, "loss": 0.4432, "step": 2673 }, { "epoch": 2.9557848194546796, "grad_norm": 0.30386608839035034, "learning_rate": 5.983298343615884e-09, "loss": 0.4056, "step": 2674 }, { "epoch": 2.9568901989683125, "grad_norm": 0.3116095960140228, "learning_rate": 5.67259023836364e-09, "loss": 0.4218, "step": 2675 }, { "epoch": 2.9579955784819454, "grad_norm": 0.3286760747432709, "learning_rate": 5.3701614983647786e-09, "loss": 0.4298, "step": 2676 }, { "epoch": 2.9591009579955783, "grad_norm": 0.31414246559143066, "learning_rate": 5.0760126249715935e-09, "loss": 0.4025, "step": 2677 }, { "epoch": 2.9602063375092116, "grad_norm": 0.31442028284072876, "learning_rate": 4.7901441058118006e-09, "loss": 0.4192, "step": 2678 }, { "epoch": 2.9613117170228445, "grad_norm": 0.32651287317276, "learning_rate": 4.512556414783542e-09, "loss": 0.4252, "step": 2679 }, { "epoch": 2.962417096536478, "grad_norm": 0.30619320273399353, "learning_rate": 4.243250012059275e-09, "loss": 0.4076, "step": 2680 }, { "epoch": 2.9635224760501107, "grad_norm": 0.3239995837211609, "learning_rate": 3.9822253440835456e-09, "loss": 0.4148, "step": 2681 }, { "epoch": 2.9646278555637435, "grad_norm": 0.32809630036354065, "learning_rate": 3.729482843569665e-09, "loss": 0.4077, "step": 2682 }, { "epoch": 2.9657332350773764, "grad_norm": 0.32798078656196594, "learning_rate": 3.48502292950359e-09, "loss": 0.4358, "step": 2683 }, { "epoch": 2.9668386145910097, "grad_norm": 0.3327580690383911, "learning_rate": 3.2488460071389285e-09, "loss": 0.4182, "step": 2684 }, { "epoch": 2.9679439941046426, "grad_norm": 0.3043082356452942, "learning_rate": 3.020952467999716e-09, "loss": 0.4252, "step": 2685 }, { "epoch": 2.9690493736182755, "grad_norm": 0.2868470251560211, "learning_rate": 2.801342689875974e-09, "loss": 0.3805, "step": 2686 }, { "epoch": 2.970154753131909, "grad_norm": 0.3061074912548065, "learning_rate": 2.5900170368281517e-09, "loss": 0.4624, "step": 2687 }, { "epoch": 2.9712601326455417, "grad_norm": 0.289028525352478, "learning_rate": 2.3869758591810177e-09, "loss": 0.4043, "step": 2688 }, { "epoch": 2.9723655121591745, "grad_norm": 0.3057435154914856, "learning_rate": 2.1922194935275474e-09, "loss": 0.4095, "step": 2689 }, { "epoch": 2.9734708916728074, "grad_norm": 0.32886236906051636, "learning_rate": 2.005748262725593e-09, "loss": 0.45, "step": 2690 }, { "epoch": 2.9745762711864407, "grad_norm": 0.31724363565444946, "learning_rate": 1.8275624758984366e-09, "loss": 0.3767, "step": 2691 }, { "epoch": 2.9756816507000736, "grad_norm": 0.32762855291366577, "learning_rate": 1.657662428434792e-09, "loss": 0.4474, "step": 2692 }, { "epoch": 2.976787030213707, "grad_norm": 0.29773688316345215, "learning_rate": 1.4960484019860277e-09, "loss": 0.3883, "step": 2693 }, { "epoch": 2.97789240972734, "grad_norm": 0.3220219612121582, "learning_rate": 1.342720664469499e-09, "loss": 0.3988, "step": 2694 }, { "epoch": 2.9789977892409727, "grad_norm": 0.329543799161911, "learning_rate": 1.197679470064106e-09, "loss": 0.4409, "step": 2695 }, { "epoch": 2.9801031687546056, "grad_norm": 0.30829766392707825, "learning_rate": 1.0609250592130693e-09, "loss": 0.406, "step": 2696 }, { "epoch": 2.981208548268239, "grad_norm": 0.3204265832901001, "learning_rate": 9.324576586211553e-10, "loss": 0.3855, "step": 2697 }, { "epoch": 2.9823139277818718, "grad_norm": 0.3281674385070801, "learning_rate": 8.122774812552303e-10, "loss": 0.4498, "step": 2698 }, { "epoch": 2.9834193072955046, "grad_norm": 0.29491978883743286, "learning_rate": 7.003847263453712e-10, "loss": 0.4118, "step": 2699 }, { "epoch": 2.984524686809138, "grad_norm": 0.3161303699016571, "learning_rate": 5.967795793820896e-10, "loss": 0.4481, "step": 2700 }, { "epoch": 2.985630066322771, "grad_norm": 0.3154834806919098, "learning_rate": 5.014622121163326e-10, "loss": 0.3961, "step": 2701 }, { "epoch": 2.9867354458364037, "grad_norm": 0.31995198130607605, "learning_rate": 4.1443278256170227e-10, "loss": 0.3809, "step": 2702 }, { "epoch": 2.9878408253500366, "grad_norm": 0.32098567485809326, "learning_rate": 3.3569143499112553e-10, "loss": 0.3873, "step": 2703 }, { "epoch": 2.98894620486367, "grad_norm": 0.33732590079307556, "learning_rate": 2.6523829993907456e-10, "loss": 0.4143, "step": 2704 }, { "epoch": 2.990051584377303, "grad_norm": 0.35376566648483276, "learning_rate": 2.0307349419823597e-10, "loss": 0.4289, "step": 2705 }, { "epoch": 2.991156963890936, "grad_norm": 0.3069959878921509, "learning_rate": 1.4919712082339665e-10, "loss": 0.4137, "step": 2706 }, { "epoch": 2.992262343404569, "grad_norm": 0.33258068561553955, "learning_rate": 1.0360926912866831e-10, "loss": 0.3755, "step": 2707 }, { "epoch": 2.993367722918202, "grad_norm": 0.3136303424835205, "learning_rate": 6.63100146863771e-11, "loss": 0.4216, "step": 2708 }, { "epoch": 2.9944731024318347, "grad_norm": 0.33485713601112366, "learning_rate": 3.7299419330394384e-11, "loss": 0.4485, "step": 2709 }, { "epoch": 2.995578481945468, "grad_norm": 0.2978363037109375, "learning_rate": 1.6577531152806027e-11, "loss": 0.3803, "step": 2710 }, { "epoch": 2.996683861459101, "grad_norm": 0.3043343424797058, "learning_rate": 4.144384506132859e-12, "loss": 0.4242, "step": 2711 }, { "epoch": 2.9977892409727342, "grad_norm": 0.30916348099708557, "learning_rate": 0.0, "loss": 0.3874, "step": 2712 }, { "epoch": 2.9977892409727342, "step": 2712, "total_flos": 3294970468040704.0, "train_loss": 0.4674285581351909, "train_runtime": 55101.4915, "train_samples_per_second": 4.728, "train_steps_per_second": 0.049 } ], "logging_steps": 1.0, "max_steps": 2712, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3294970468040704.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }