{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9814814814814814, "eval_steps": 68, "global_step": 540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003703703703703704, "grad_norm": 0.40625306963920593, "learning_rate": 1.0000000000000002e-06, "loss": 1.3427, "step": 1 }, { "epoch": 0.003703703703703704, "eval_loss": 1.3691776990890503, "eval_runtime": 80.493, "eval_samples_per_second": 3.615, "eval_steps_per_second": 0.46, "step": 1 }, { "epoch": 0.007407407407407408, "grad_norm": 0.6216382384300232, "learning_rate": 2.0000000000000003e-06, "loss": 1.3914, "step": 2 }, { "epoch": 0.011111111111111112, "grad_norm": 0.4203539788722992, "learning_rate": 3e-06, "loss": 1.3421, "step": 3 }, { "epoch": 0.014814814814814815, "grad_norm": 0.48187777400016785, "learning_rate": 4.000000000000001e-06, "loss": 1.3913, "step": 4 }, { "epoch": 0.018518518518518517, "grad_norm": 0.4104997515678406, "learning_rate": 5e-06, "loss": 1.3264, "step": 5 }, { "epoch": 0.022222222222222223, "grad_norm": 0.5217423439025879, "learning_rate": 6e-06, "loss": 1.3418, "step": 6 }, { "epoch": 0.025925925925925925, "grad_norm": 0.40824779868125916, "learning_rate": 7e-06, "loss": 1.3761, "step": 7 }, { "epoch": 0.02962962962962963, "grad_norm": 0.41881611943244934, "learning_rate": 8.000000000000001e-06, "loss": 1.3631, "step": 8 }, { "epoch": 0.03333333333333333, "grad_norm": 0.43708905577659607, "learning_rate": 9e-06, "loss": 1.3911, "step": 9 }, { "epoch": 0.037037037037037035, "grad_norm": 0.48373478651046753, "learning_rate": 1e-05, "loss": 1.3813, "step": 10 }, { "epoch": 0.040740740740740744, "grad_norm": 0.428241491317749, "learning_rate": 9.999912161129377e-06, "loss": 1.3825, "step": 11 }, { "epoch": 0.044444444444444446, "grad_norm": 0.4543517827987671, "learning_rate": 9.999648647603774e-06, "loss": 1.3413, "step": 12 }, { "epoch": 0.04814814814814815, "grad_norm": 0.48931288719177246, "learning_rate": 9.999209468681885e-06, "loss": 1.4078, "step": 13 }, { "epoch": 0.05185185185185185, "grad_norm": 0.47361329197883606, "learning_rate": 9.998594639794502e-06, "loss": 1.3926, "step": 14 }, { "epoch": 0.05555555555555555, "grad_norm": 0.46920689940452576, "learning_rate": 9.997804182543973e-06, "loss": 1.3043, "step": 15 }, { "epoch": 0.05925925925925926, "grad_norm": 0.44550788402557373, "learning_rate": 9.996838124703448e-06, "loss": 1.3535, "step": 16 }, { "epoch": 0.06296296296296296, "grad_norm": 0.4951707720756531, "learning_rate": 9.995696500215899e-06, "loss": 1.3355, "step": 17 }, { "epoch": 0.06666666666666667, "grad_norm": 0.5006001591682434, "learning_rate": 9.994379349192927e-06, "loss": 1.3064, "step": 18 }, { "epoch": 0.07037037037037037, "grad_norm": 0.45947596430778503, "learning_rate": 9.992886717913358e-06, "loss": 1.394, "step": 19 }, { "epoch": 0.07407407407407407, "grad_norm": 0.49364641308784485, "learning_rate": 9.991218658821609e-06, "loss": 1.3043, "step": 20 }, { "epoch": 0.07777777777777778, "grad_norm": 0.47694772481918335, "learning_rate": 9.989375230525849e-06, "loss": 1.3287, "step": 21 }, { "epoch": 0.08148148148148149, "grad_norm": 0.5253634452819824, "learning_rate": 9.987356497795944e-06, "loss": 1.3046, "step": 22 }, { "epoch": 0.08518518518518518, "grad_norm": 0.5501742362976074, "learning_rate": 9.985162531561174e-06, "loss": 1.3499, "step": 23 }, { "epoch": 0.08888888888888889, "grad_norm": 0.5258708000183105, "learning_rate": 9.982793408907747e-06, "loss": 1.2779, "step": 24 }, { "epoch": 0.09259259259259259, "grad_norm": 0.4966470003128052, "learning_rate": 9.980249213076085e-06, "loss": 1.2702, "step": 25 }, { "epoch": 0.0962962962962963, "grad_norm": 0.4991610050201416, "learning_rate": 9.977530033457906e-06, "loss": 1.3286, "step": 26 }, { "epoch": 0.1, "grad_norm": 0.5212219953536987, "learning_rate": 9.97463596559307e-06, "loss": 1.2978, "step": 27 }, { "epoch": 0.1037037037037037, "grad_norm": 0.4977610409259796, "learning_rate": 9.971567111166246e-06, "loss": 1.3247, "step": 28 }, { "epoch": 0.10740740740740741, "grad_norm": 0.5000190734863281, "learning_rate": 9.968323578003312e-06, "loss": 1.3017, "step": 29 }, { "epoch": 0.1111111111111111, "grad_norm": 0.476797878742218, "learning_rate": 9.964905480067585e-06, "loss": 1.2287, "step": 30 }, { "epoch": 0.11481481481481481, "grad_norm": 0.5062195062637329, "learning_rate": 9.961312937455812e-06, "loss": 1.2521, "step": 31 }, { "epoch": 0.11851851851851852, "grad_norm": 0.5346536636352539, "learning_rate": 9.957546076393944e-06, "loss": 1.2907, "step": 32 }, { "epoch": 0.12222222222222222, "grad_norm": 0.5018014311790466, "learning_rate": 9.95360502923271e-06, "loss": 1.273, "step": 33 }, { "epoch": 0.1259259259259259, "grad_norm": 0.4412826895713806, "learning_rate": 9.949489934442966e-06, "loss": 1.202, "step": 34 }, { "epoch": 0.12962962962962962, "grad_norm": 0.47726863622665405, "learning_rate": 9.945200936610821e-06, "loss": 1.1432, "step": 35 }, { "epoch": 0.13333333333333333, "grad_norm": 0.4887215197086334, "learning_rate": 9.940738186432565e-06, "loss": 1.1524, "step": 36 }, { "epoch": 0.13703703703703704, "grad_norm": 0.4492252469062805, "learning_rate": 9.936101840709373e-06, "loss": 1.1903, "step": 37 }, { "epoch": 0.14074074074074075, "grad_norm": 0.43920594453811646, "learning_rate": 9.931292062341793e-06, "loss": 1.1942, "step": 38 }, { "epoch": 0.14444444444444443, "grad_norm": 0.44488102197647095, "learning_rate": 9.926309020324025e-06, "loss": 1.1919, "step": 39 }, { "epoch": 0.14814814814814814, "grad_norm": 0.5044857263565063, "learning_rate": 9.921152889737985e-06, "loss": 1.1351, "step": 40 }, { "epoch": 0.15185185185185185, "grad_norm": 0.45221227407455444, "learning_rate": 9.915823851747143e-06, "loss": 1.1624, "step": 41 }, { "epoch": 0.15555555555555556, "grad_norm": 0.5037719011306763, "learning_rate": 9.910322093590177e-06, "loss": 1.1718, "step": 42 }, { "epoch": 0.15925925925925927, "grad_norm": 0.44602254033088684, "learning_rate": 9.90464780857437e-06, "loss": 1.1546, "step": 43 }, { "epoch": 0.16296296296296298, "grad_norm": 0.44312745332717896, "learning_rate": 9.898801196068839e-06, "loss": 1.2048, "step": 44 }, { "epoch": 0.16666666666666666, "grad_norm": 0.5689204931259155, "learning_rate": 9.892782461497521e-06, "loss": 1.2042, "step": 45 }, { "epoch": 0.17037037037037037, "grad_norm": 0.47574153542518616, "learning_rate": 9.886591816331953e-06, "loss": 1.072, "step": 46 }, { "epoch": 0.17407407407407408, "grad_norm": 0.5947781801223755, "learning_rate": 9.880229478083849e-06, "loss": 1.1788, "step": 47 }, { "epoch": 0.17777777777777778, "grad_norm": 0.45822006464004517, "learning_rate": 9.87369567029745e-06, "loss": 1.1901, "step": 48 }, { "epoch": 0.1814814814814815, "grad_norm": 0.4415622055530548, "learning_rate": 9.866990622541677e-06, "loss": 1.1071, "step": 49 }, { "epoch": 0.18518518518518517, "grad_norm": 0.49463754892349243, "learning_rate": 9.860114570402055e-06, "loss": 1.1492, "step": 50 }, { "epoch": 0.18888888888888888, "grad_norm": 0.5251724720001221, "learning_rate": 9.853067755472447e-06, "loss": 1.102, "step": 51 }, { "epoch": 0.1925925925925926, "grad_norm": 0.4823416471481323, "learning_rate": 9.845850425346563e-06, "loss": 1.1561, "step": 52 }, { "epoch": 0.1962962962962963, "grad_norm": 0.5142261385917664, "learning_rate": 9.838462833609249e-06, "loss": 1.2041, "step": 53 }, { "epoch": 0.2, "grad_norm": 0.5137107372283936, "learning_rate": 9.830905239827592e-06, "loss": 1.0813, "step": 54 }, { "epoch": 0.2037037037037037, "grad_norm": 0.41644176840782166, "learning_rate": 9.823177909541795e-06, "loss": 1.0974, "step": 55 }, { "epoch": 0.2074074074074074, "grad_norm": 0.40043726563453674, "learning_rate": 9.815281114255841e-06, "loss": 1.1076, "step": 56 }, { "epoch": 0.2111111111111111, "grad_norm": 0.43805867433547974, "learning_rate": 9.807215131427966e-06, "loss": 1.0959, "step": 57 }, { "epoch": 0.21481481481481482, "grad_norm": 0.5732157230377197, "learning_rate": 9.798980244460892e-06, "loss": 1.0742, "step": 58 }, { "epoch": 0.21851851851851853, "grad_norm": 0.44811880588531494, "learning_rate": 9.790576742691895e-06, "loss": 1.0058, "step": 59 }, { "epoch": 0.2222222222222222, "grad_norm": 0.44900447130203247, "learning_rate": 9.782004921382612e-06, "loss": 1.0982, "step": 60 }, { "epoch": 0.22592592592592592, "grad_norm": 0.521683394908905, "learning_rate": 9.773265081708687e-06, "loss": 1.1294, "step": 61 }, { "epoch": 0.22962962962962963, "grad_norm": 0.48734819889068604, "learning_rate": 9.764357530749178e-06, "loss": 1.0575, "step": 62 }, { "epoch": 0.23333333333333334, "grad_norm": 0.47888699173927307, "learning_rate": 9.755282581475769e-06, "loss": 1.0333, "step": 63 }, { "epoch": 0.23703703703703705, "grad_norm": 0.45292389392852783, "learning_rate": 9.74604055274178e-06, "loss": 1.0786, "step": 64 }, { "epoch": 0.24074074074074073, "grad_norm": 0.46524283289909363, "learning_rate": 9.736631769270958e-06, "loss": 1.0708, "step": 65 }, { "epoch": 0.24444444444444444, "grad_norm": 0.4456775486469269, "learning_rate": 9.727056561646067e-06, "loss": 1.0512, "step": 66 }, { "epoch": 0.24814814814814815, "grad_norm": 0.461698055267334, "learning_rate": 9.717315266297277e-06, "loss": 1.1535, "step": 67 }, { "epoch": 0.2518518518518518, "grad_norm": 0.5552849173545837, "learning_rate": 9.707408225490343e-06, "loss": 1.1064, "step": 68 }, { "epoch": 0.2518518518518518, "eval_loss": 1.065529227256775, "eval_runtime": 80.9702, "eval_samples_per_second": 3.594, "eval_steps_per_second": 0.457, "step": 68 }, { "epoch": 0.25555555555555554, "grad_norm": 0.541875422000885, "learning_rate": 9.697335787314573e-06, "loss": 1.0527, "step": 69 }, { "epoch": 0.25925925925925924, "grad_norm": 0.4617699384689331, "learning_rate": 9.687098305670606e-06, "loss": 1.0456, "step": 70 }, { "epoch": 0.26296296296296295, "grad_norm": 0.4448198080062866, "learning_rate": 9.676696140257969e-06, "loss": 1.0364, "step": 71 }, { "epoch": 0.26666666666666666, "grad_norm": 0.3684210479259491, "learning_rate": 9.66612965656245e-06, "loss": 1.1162, "step": 72 }, { "epoch": 0.27037037037037037, "grad_norm": 0.6021161079406738, "learning_rate": 9.655399225843244e-06, "loss": 1.0799, "step": 73 }, { "epoch": 0.2740740740740741, "grad_norm": 0.575809895992279, "learning_rate": 9.644505225119922e-06, "loss": 1.0222, "step": 74 }, { "epoch": 0.2777777777777778, "grad_norm": 0.5453614592552185, "learning_rate": 9.633448037159167e-06, "loss": 1.0339, "step": 75 }, { "epoch": 0.2814814814814815, "grad_norm": 0.5681980848312378, "learning_rate": 9.622228050461345e-06, "loss": 1.0622, "step": 76 }, { "epoch": 0.2851851851851852, "grad_norm": 0.4109339714050293, "learning_rate": 9.610845659246833e-06, "loss": 1.0395, "step": 77 }, { "epoch": 0.28888888888888886, "grad_norm": 0.4249359667301178, "learning_rate": 9.599301263442194e-06, "loss": 1.0346, "step": 78 }, { "epoch": 0.29259259259259257, "grad_norm": 0.5109196901321411, "learning_rate": 9.587595268666099e-06, "loss": 1.0834, "step": 79 }, { "epoch": 0.2962962962962963, "grad_norm": 0.512137770652771, "learning_rate": 9.575728086215093e-06, "loss": 1.0438, "step": 80 }, { "epoch": 0.3, "grad_norm": 0.5844932198524475, "learning_rate": 9.56370013304914e-06, "loss": 0.9966, "step": 81 }, { "epoch": 0.3037037037037037, "grad_norm": 0.4886794984340668, "learning_rate": 9.551511831776966e-06, "loss": 1.0461, "step": 82 }, { "epoch": 0.3074074074074074, "grad_norm": 0.4917876124382019, "learning_rate": 9.53916361064122e-06, "loss": 1.0121, "step": 83 }, { "epoch": 0.3111111111111111, "grad_norm": 0.48174771666526794, "learning_rate": 9.526655903503423e-06, "loss": 1.0579, "step": 84 }, { "epoch": 0.3148148148148148, "grad_norm": 0.5147380232810974, "learning_rate": 9.513989149828718e-06, "loss": 1.0065, "step": 85 }, { "epoch": 0.31851851851851853, "grad_norm": 0.4484403431415558, "learning_rate": 9.501163794670445e-06, "loss": 1.0089, "step": 86 }, { "epoch": 0.32222222222222224, "grad_norm": 0.45849668979644775, "learning_rate": 9.488180288654485e-06, "loss": 1.0262, "step": 87 }, { "epoch": 0.32592592592592595, "grad_norm": 0.571622908115387, "learning_rate": 9.475039087963443e-06, "loss": 1.0129, "step": 88 }, { "epoch": 0.3296296296296296, "grad_norm": 0.5279180407524109, "learning_rate": 9.461740654320608e-06, "loss": 1.03, "step": 89 }, { "epoch": 0.3333333333333333, "grad_norm": 0.47328171133995056, "learning_rate": 9.448285454973739e-06, "loss": 0.9805, "step": 90 }, { "epoch": 0.337037037037037, "grad_norm": 0.4972725212574005, "learning_rate": 9.434673962678638e-06, "loss": 0.976, "step": 91 }, { "epoch": 0.34074074074074073, "grad_norm": 0.5977814793586731, "learning_rate": 9.420906655682553e-06, "loss": 0.989, "step": 92 }, { "epoch": 0.34444444444444444, "grad_norm": 0.5420663356781006, "learning_rate": 9.40698401770736e-06, "loss": 1.0225, "step": 93 }, { "epoch": 0.34814814814814815, "grad_norm": 0.410198450088501, "learning_rate": 9.392906537932582e-06, "loss": 1.0393, "step": 94 }, { "epoch": 0.35185185185185186, "grad_norm": 0.5001354217529297, "learning_rate": 9.378674710978185e-06, "loss": 0.9712, "step": 95 }, { "epoch": 0.35555555555555557, "grad_norm": 0.5929519534111023, "learning_rate": 9.364289036887214e-06, "loss": 1.0759, "step": 96 }, { "epoch": 0.3592592592592593, "grad_norm": 0.5323709845542908, "learning_rate": 9.349750021108212e-06, "loss": 1.0619, "step": 97 }, { "epoch": 0.362962962962963, "grad_norm": 0.5360124707221985, "learning_rate": 9.335058174477472e-06, "loss": 0.9957, "step": 98 }, { "epoch": 0.36666666666666664, "grad_norm": 0.5704509019851685, "learning_rate": 9.320214013201079e-06, "loss": 1.0591, "step": 99 }, { "epoch": 0.37037037037037035, "grad_norm": 0.4351862967014313, "learning_rate": 9.305218058836778e-06, "loss": 1.014, "step": 100 }, { "epoch": 0.37407407407407406, "grad_norm": 0.48397883772850037, "learning_rate": 9.290070838275649e-06, "loss": 1.0094, "step": 101 }, { "epoch": 0.37777777777777777, "grad_norm": 0.5487049221992493, "learning_rate": 9.274772883723587e-06, "loss": 0.9604, "step": 102 }, { "epoch": 0.3814814814814815, "grad_norm": 0.4735201895236969, "learning_rate": 9.259324732682615e-06, "loss": 0.9577, "step": 103 }, { "epoch": 0.3851851851851852, "grad_norm": 0.5162625312805176, "learning_rate": 9.24372692793199e-06, "loss": 1.0095, "step": 104 }, { "epoch": 0.3888888888888889, "grad_norm": 0.4944085478782654, "learning_rate": 9.22798001750913e-06, "loss": 1.0086, "step": 105 }, { "epoch": 0.3925925925925926, "grad_norm": 0.5985198616981506, "learning_rate": 9.21208455469037e-06, "loss": 0.9878, "step": 106 }, { "epoch": 0.3962962962962963, "grad_norm": 0.6551868915557861, "learning_rate": 9.196041097971509e-06, "loss": 1.0079, "step": 107 }, { "epoch": 0.4, "grad_norm": 0.4953964352607727, "learning_rate": 9.179850211048193e-06, "loss": 1.0403, "step": 108 }, { "epoch": 0.40370370370370373, "grad_norm": 0.46935591101646423, "learning_rate": 9.163512462796113e-06, "loss": 1.0443, "step": 109 }, { "epoch": 0.4074074074074074, "grad_norm": 0.48214173316955566, "learning_rate": 9.14702842725101e-06, "loss": 0.9952, "step": 110 }, { "epoch": 0.4111111111111111, "grad_norm": 0.5411708354949951, "learning_rate": 9.13039868358851e-06, "loss": 1.0634, "step": 111 }, { "epoch": 0.4148148148148148, "grad_norm": 0.68564373254776, "learning_rate": 9.113623816103775e-06, "loss": 0.9307, "step": 112 }, { "epoch": 0.4185185185185185, "grad_norm": 0.536626398563385, "learning_rate": 9.09670441419097e-06, "loss": 1.0535, "step": 113 }, { "epoch": 0.4222222222222222, "grad_norm": 0.485929012298584, "learning_rate": 9.079641072322555e-06, "loss": 1.0176, "step": 114 }, { "epoch": 0.42592592592592593, "grad_norm": 0.5539782047271729, "learning_rate": 9.062434390028407e-06, "loss": 0.9906, "step": 115 }, { "epoch": 0.42962962962962964, "grad_norm": 0.49939635396003723, "learning_rate": 9.045084971874738e-06, "loss": 0.9586, "step": 116 }, { "epoch": 0.43333333333333335, "grad_norm": 0.48620209097862244, "learning_rate": 9.027593427442867e-06, "loss": 1.0209, "step": 117 }, { "epoch": 0.43703703703703706, "grad_norm": 0.4806266725063324, "learning_rate": 9.009960371307798e-06, "loss": 1.0185, "step": 118 }, { "epoch": 0.44074074074074077, "grad_norm": 0.6763521432876587, "learning_rate": 8.992186423016626e-06, "loss": 1.0247, "step": 119 }, { "epoch": 0.4444444444444444, "grad_norm": 0.5310172438621521, "learning_rate": 8.974272207066767e-06, "loss": 1.006, "step": 120 }, { "epoch": 0.44814814814814813, "grad_norm": 0.5065312385559082, "learning_rate": 8.956218352884022e-06, "loss": 0.9535, "step": 121 }, { "epoch": 0.45185185185185184, "grad_norm": 0.5911722183227539, "learning_rate": 8.938025494800454e-06, "loss": 0.9698, "step": 122 }, { "epoch": 0.45555555555555555, "grad_norm": 0.60561203956604, "learning_rate": 8.919694272032108e-06, "loss": 1.0081, "step": 123 }, { "epoch": 0.45925925925925926, "grad_norm": 0.5998137593269348, "learning_rate": 8.901225328656543e-06, "loss": 1.0332, "step": 124 }, { "epoch": 0.46296296296296297, "grad_norm": 0.6571759581565857, "learning_rate": 8.882619313590212e-06, "loss": 1.0501, "step": 125 }, { "epoch": 0.4666666666666667, "grad_norm": 0.5181518793106079, "learning_rate": 8.863876880565656e-06, "loss": 0.9653, "step": 126 }, { "epoch": 0.4703703703703704, "grad_norm": 0.5412523746490479, "learning_rate": 8.844998688108535e-06, "loss": 0.999, "step": 127 }, { "epoch": 0.4740740740740741, "grad_norm": 0.5652058124542236, "learning_rate": 8.825985399514488e-06, "loss": 0.9647, "step": 128 }, { "epoch": 0.4777777777777778, "grad_norm": 0.52536940574646, "learning_rate": 8.806837682825835e-06, "loss": 0.9694, "step": 129 }, { "epoch": 0.48148148148148145, "grad_norm": 0.6217904686927795, "learning_rate": 8.787556210808101e-06, "loss": 1.0241, "step": 130 }, { "epoch": 0.48518518518518516, "grad_norm": 0.43509605526924133, "learning_rate": 8.768141660926375e-06, "loss": 0.9598, "step": 131 }, { "epoch": 0.4888888888888889, "grad_norm": 0.5001434087753296, "learning_rate": 8.748594715321512e-06, "loss": 0.9697, "step": 132 }, { "epoch": 0.4925925925925926, "grad_norm": 0.6269538402557373, "learning_rate": 8.728916060786162e-06, "loss": 1.0074, "step": 133 }, { "epoch": 0.4962962962962963, "grad_norm": 0.6777300834655762, "learning_rate": 8.70910638874064e-06, "loss": 0.9968, "step": 134 }, { "epoch": 0.5, "grad_norm": 0.5371289849281311, "learning_rate": 8.689166395208638e-06, "loss": 0.9684, "step": 135 }, { "epoch": 0.5037037037037037, "grad_norm": 0.6136884093284607, "learning_rate": 8.669096780792754e-06, "loss": 1.0297, "step": 136 }, { "epoch": 0.5037037037037037, "eval_loss": 0.9753141403198242, "eval_runtime": 81.1717, "eval_samples_per_second": 3.585, "eval_steps_per_second": 0.456, "step": 136 }, { "epoch": 0.5074074074074074, "grad_norm": 0.5171265602111816, "learning_rate": 8.6488982506499e-06, "loss": 0.962, "step": 137 }, { "epoch": 0.5111111111111111, "grad_norm": 0.6454190611839294, "learning_rate": 8.628571514466502e-06, "loss": 0.9555, "step": 138 }, { "epoch": 0.5148148148148148, "grad_norm": 0.5578838586807251, "learning_rate": 8.608117286433583e-06, "loss": 0.9079, "step": 139 }, { "epoch": 0.5185185185185185, "grad_norm": 0.5714731216430664, "learning_rate": 8.587536285221656e-06, "loss": 0.9894, "step": 140 }, { "epoch": 0.5222222222222223, "grad_norm": 0.5244677066802979, "learning_rate": 8.566829233955484e-06, "loss": 0.9735, "step": 141 }, { "epoch": 0.5259259259259259, "grad_norm": 0.4161701798439026, "learning_rate": 8.545996860188668e-06, "loss": 0.9945, "step": 142 }, { "epoch": 0.5296296296296297, "grad_norm": 0.6657142639160156, "learning_rate": 8.525039895878078e-06, "loss": 0.982, "step": 143 }, { "epoch": 0.5333333333333333, "grad_norm": 0.7206271886825562, "learning_rate": 8.503959077358143e-06, "loss": 0.9977, "step": 144 }, { "epoch": 0.5370370370370371, "grad_norm": 0.7977305054664612, "learning_rate": 8.482755145314987e-06, "loss": 0.9605, "step": 145 }, { "epoch": 0.5407407407407407, "grad_norm": 0.8049225211143494, "learning_rate": 8.46142884476038e-06, "loss": 0.999, "step": 146 }, { "epoch": 0.5444444444444444, "grad_norm": 0.49984222650527954, "learning_rate": 8.439980925005587e-06, "loss": 0.9595, "step": 147 }, { "epoch": 0.5481481481481482, "grad_norm": 0.48655927181243896, "learning_rate": 8.418412139635026e-06, "loss": 0.9481, "step": 148 }, { "epoch": 0.5518518518518518, "grad_norm": 0.5527738332748413, "learning_rate": 8.396723246479798e-06, "loss": 0.9665, "step": 149 }, { "epoch": 0.5555555555555556, "grad_norm": 0.6328939199447632, "learning_rate": 8.374915007591053e-06, "loss": 1.0021, "step": 150 }, { "epoch": 0.5592592592592592, "grad_norm": 0.6932883262634277, "learning_rate": 8.352988189213223e-06, "loss": 0.9991, "step": 151 }, { "epoch": 0.562962962962963, "grad_norm": 0.5916227698326111, "learning_rate": 8.330943561757092e-06, "loss": 0.9661, "step": 152 }, { "epoch": 0.5666666666666667, "grad_norm": 0.471822589635849, "learning_rate": 8.308781899772731e-06, "loss": 0.9396, "step": 153 }, { "epoch": 0.5703703703703704, "grad_norm": 0.5403897166252136, "learning_rate": 8.286503981922284e-06, "loss": 0.9444, "step": 154 }, { "epoch": 0.5740740740740741, "grad_norm": 0.5560125708580017, "learning_rate": 8.264110590952609e-06, "loss": 0.9487, "step": 155 }, { "epoch": 0.5777777777777777, "grad_norm": 0.6282420754432678, "learning_rate": 8.241602513667775e-06, "loss": 1.0124, "step": 156 }, { "epoch": 0.5814814814814815, "grad_norm": 0.4911057949066162, "learning_rate": 8.218980540901417e-06, "loss": 0.971, "step": 157 }, { "epoch": 0.5851851851851851, "grad_norm": 0.6368396878242493, "learning_rate": 8.19624546748895e-06, "loss": 1.0181, "step": 158 }, { "epoch": 0.5888888888888889, "grad_norm": 0.6642744541168213, "learning_rate": 8.173398092239647e-06, "loss": 1.0051, "step": 159 }, { "epoch": 0.5925925925925926, "grad_norm": 0.554905116558075, "learning_rate": 8.150439217908557e-06, "loss": 0.9329, "step": 160 }, { "epoch": 0.5962962962962963, "grad_norm": 0.5215203762054443, "learning_rate": 8.12736965116832e-06, "loss": 0.9506, "step": 161 }, { "epoch": 0.6, "grad_norm": 0.4904837906360626, "learning_rate": 8.104190202580811e-06, "loss": 0.9864, "step": 162 }, { "epoch": 0.6037037037037037, "grad_norm": 0.570766806602478, "learning_rate": 8.080901686568664e-06, "loss": 0.9379, "step": 163 }, { "epoch": 0.6074074074074074, "grad_norm": 0.687227725982666, "learning_rate": 8.057504921386661e-06, "loss": 0.9714, "step": 164 }, { "epoch": 0.6111111111111112, "grad_norm": 0.6017288565635681, "learning_rate": 8.034000729092967e-06, "loss": 0.9709, "step": 165 }, { "epoch": 0.6148148148148148, "grad_norm": 0.6062106490135193, "learning_rate": 8.010389935520269e-06, "loss": 1.0362, "step": 166 }, { "epoch": 0.6185185185185185, "grad_norm": 0.5548331141471863, "learning_rate": 7.986673370246743e-06, "loss": 0.9581, "step": 167 }, { "epoch": 0.6222222222222222, "grad_norm": 0.5252346396446228, "learning_rate": 7.962851866566912e-06, "loss": 0.9669, "step": 168 }, { "epoch": 0.6259259259259259, "grad_norm": 0.7005597352981567, "learning_rate": 7.938926261462366e-06, "loss": 0.987, "step": 169 }, { "epoch": 0.6296296296296297, "grad_norm": 0.5916934609413147, "learning_rate": 7.914897395572362e-06, "loss": 0.9433, "step": 170 }, { "epoch": 0.6333333333333333, "grad_norm": 0.6202555298805237, "learning_rate": 7.890766113164272e-06, "loss": 0.9833, "step": 171 }, { "epoch": 0.6370370370370371, "grad_norm": 0.5578716397285461, "learning_rate": 7.866533262103937e-06, "loss": 0.9479, "step": 172 }, { "epoch": 0.6407407407407407, "grad_norm": 0.6666351556777954, "learning_rate": 7.842199693825863e-06, "loss": 0.9383, "step": 173 }, { "epoch": 0.6444444444444445, "grad_norm": 0.5507566332817078, "learning_rate": 7.817766263303312e-06, "loss": 0.9767, "step": 174 }, { "epoch": 0.6481481481481481, "grad_norm": 0.6183774471282959, "learning_rate": 7.793233829018263e-06, "loss": 0.9078, "step": 175 }, { "epoch": 0.6518518518518519, "grad_norm": 0.499009370803833, "learning_rate": 7.768603252931243e-06, "loss": 0.9563, "step": 176 }, { "epoch": 0.6555555555555556, "grad_norm": 0.629336416721344, "learning_rate": 7.743875400451047e-06, "loss": 0.911, "step": 177 }, { "epoch": 0.6592592592592592, "grad_norm": 0.5423790812492371, "learning_rate": 7.719051140404327e-06, "loss": 0.9434, "step": 178 }, { "epoch": 0.662962962962963, "grad_norm": 0.6060659289360046, "learning_rate": 7.69413134500507e-06, "loss": 0.95, "step": 179 }, { "epoch": 0.6666666666666666, "grad_norm": 0.5223778486251831, "learning_rate": 7.669116889823955e-06, "loss": 0.9474, "step": 180 }, { "epoch": 0.6703703703703704, "grad_norm": 0.6271294355392456, "learning_rate": 7.644008653757571e-06, "loss": 0.9652, "step": 181 }, { "epoch": 0.674074074074074, "grad_norm": 0.5973348617553711, "learning_rate": 7.6188075189975644e-06, "loss": 0.9333, "step": 182 }, { "epoch": 0.6777777777777778, "grad_norm": 0.5119736790657043, "learning_rate": 7.593514370999617e-06, "loss": 0.9253, "step": 183 }, { "epoch": 0.6814814814814815, "grad_norm": 0.6887508630752563, "learning_rate": 7.568130098452352e-06, "loss": 0.9344, "step": 184 }, { "epoch": 0.6851851851851852, "grad_norm": 0.5387381911277771, "learning_rate": 7.542655593246103e-06, "loss": 0.9645, "step": 185 }, { "epoch": 0.6888888888888889, "grad_norm": 0.5810338854789734, "learning_rate": 7.517091750441576e-06, "loss": 0.9406, "step": 186 }, { "epoch": 0.6925925925925925, "grad_norm": 0.6561952829360962, "learning_rate": 7.491439468238404e-06, "loss": 0.9363, "step": 187 }, { "epoch": 0.6962962962962963, "grad_norm": 0.7444878220558167, "learning_rate": 7.465699647943586e-06, "loss": 0.945, "step": 188 }, { "epoch": 0.7, "grad_norm": 0.6265509724617004, "learning_rate": 7.43987319393982e-06, "loss": 0.9576, "step": 189 }, { "epoch": 0.7037037037037037, "grad_norm": 0.6139175295829773, "learning_rate": 7.413961013653725e-06, "loss": 0.9697, "step": 190 }, { "epoch": 0.7074074074074074, "grad_norm": 0.5767727494239807, "learning_rate": 7.387964017523964e-06, "loss": 0.9721, "step": 191 }, { "epoch": 0.7111111111111111, "grad_norm": 0.757271945476532, "learning_rate": 7.361883118969248e-06, "loss": 1.0013, "step": 192 }, { "epoch": 0.7148148148148148, "grad_norm": 0.6246291995048523, "learning_rate": 7.335719234356245e-06, "loss": 0.9418, "step": 193 }, { "epoch": 0.7185185185185186, "grad_norm": 0.4833630621433258, "learning_rate": 7.309473282967387e-06, "loss": 0.9435, "step": 194 }, { "epoch": 0.7222222222222222, "grad_norm": 0.5289487242698669, "learning_rate": 7.283146186968566e-06, "loss": 0.9617, "step": 195 }, { "epoch": 0.725925925925926, "grad_norm": 0.6008256673812866, "learning_rate": 7.256738871376733e-06, "loss": 0.8983, "step": 196 }, { "epoch": 0.7296296296296296, "grad_norm": 0.5227617621421814, "learning_rate": 7.230252264027398e-06, "loss": 0.8768, "step": 197 }, { "epoch": 0.7333333333333333, "grad_norm": 0.6785119771957397, "learning_rate": 7.203687295542032e-06, "loss": 0.9393, "step": 198 }, { "epoch": 0.737037037037037, "grad_norm": 0.6053286790847778, "learning_rate": 7.1770448992953676e-06, "loss": 0.9125, "step": 199 }, { "epoch": 0.7407407407407407, "grad_norm": 0.7238445281982422, "learning_rate": 7.1503260113826035e-06, "loss": 0.9305, "step": 200 }, { "epoch": 0.7444444444444445, "grad_norm": 0.6719542741775513, "learning_rate": 7.123531570586515e-06, "loss": 0.9643, "step": 201 }, { "epoch": 0.7481481481481481, "grad_norm": 0.5546441674232483, "learning_rate": 7.09666251834447e-06, "loss": 0.9663, "step": 202 }, { "epoch": 0.7518518518518519, "grad_norm": 0.5350282192230225, "learning_rate": 7.069719798715347e-06, "loss": 0.9041, "step": 203 }, { "epoch": 0.7555555555555555, "grad_norm": 0.5801582932472229, "learning_rate": 7.042704358346375e-06, "loss": 0.9444, "step": 204 }, { "epoch": 0.7555555555555555, "eval_loss": 0.9426867961883545, "eval_runtime": 81.1055, "eval_samples_per_second": 3.588, "eval_steps_per_second": 0.456, "step": 204 }, { "epoch": 0.7592592592592593, "grad_norm": 0.7228114008903503, "learning_rate": 7.015617146439863e-06, "loss": 0.931, "step": 205 }, { "epoch": 0.762962962962963, "grad_norm": 0.5295515656471252, "learning_rate": 6.988459114719849e-06, "loss": 0.9457, "step": 206 }, { "epoch": 0.7666666666666667, "grad_norm": 0.5533620119094849, "learning_rate": 6.9612312173986675e-06, "loss": 0.9407, "step": 207 }, { "epoch": 0.7703703703703704, "grad_norm": 0.6508337259292603, "learning_rate": 6.933934411143419e-06, "loss": 0.9176, "step": 208 }, { "epoch": 0.774074074074074, "grad_norm": 0.644389808177948, "learning_rate": 6.906569655042357e-06, "loss": 0.9796, "step": 209 }, { "epoch": 0.7777777777777778, "grad_norm": 0.5943438410758972, "learning_rate": 6.879137910571191e-06, "loss": 0.9508, "step": 210 }, { "epoch": 0.7814814814814814, "grad_norm": 0.5512163639068604, "learning_rate": 6.8516401415593005e-06, "loss": 0.9066, "step": 211 }, { "epoch": 0.7851851851851852, "grad_norm": 0.5512770414352417, "learning_rate": 6.824077314155877e-06, "loss": 0.9169, "step": 212 }, { "epoch": 0.7888888888888889, "grad_norm": 0.7245272397994995, "learning_rate": 6.7964503967959705e-06, "loss": 0.9563, "step": 213 }, { "epoch": 0.7925925925925926, "grad_norm": 0.704143762588501, "learning_rate": 6.768760360166471e-06, "loss": 0.9662, "step": 214 }, { "epoch": 0.7962962962962963, "grad_norm": 0.5439050197601318, "learning_rate": 6.741008177171995e-06, "loss": 0.9609, "step": 215 }, { "epoch": 0.8, "grad_norm": 0.6104442477226257, "learning_rate": 6.713194822900707e-06, "loss": 0.9313, "step": 216 }, { "epoch": 0.8037037037037037, "grad_norm": 0.7294436693191528, "learning_rate": 6.6853212745900585e-06, "loss": 0.933, "step": 217 }, { "epoch": 0.8074074074074075, "grad_norm": 0.5400619506835938, "learning_rate": 6.657388511592453e-06, "loss": 0.9367, "step": 218 }, { "epoch": 0.8111111111111111, "grad_norm": 0.8623405694961548, "learning_rate": 6.62939751534083e-06, "loss": 0.9719, "step": 219 }, { "epoch": 0.8148148148148148, "grad_norm": 0.6787410378456116, "learning_rate": 6.601349269314188e-06, "loss": 0.9882, "step": 220 }, { "epoch": 0.8185185185185185, "grad_norm": 0.6689869165420532, "learning_rate": 6.573244759003033e-06, "loss": 0.9445, "step": 221 }, { "epoch": 0.8222222222222222, "grad_norm": 0.7502297759056091, "learning_rate": 6.545084971874738e-06, "loss": 0.9276, "step": 222 }, { "epoch": 0.825925925925926, "grad_norm": 2.460090160369873, "learning_rate": 6.516870897338864e-06, "loss": 0.9684, "step": 223 }, { "epoch": 0.8296296296296296, "grad_norm": 0.8110550045967102, "learning_rate": 6.488603526712391e-06, "loss": 0.9212, "step": 224 }, { "epoch": 0.8333333333333334, "grad_norm": 0.5253615975379944, "learning_rate": 6.46028385318488e-06, "loss": 0.9385, "step": 225 }, { "epoch": 0.837037037037037, "grad_norm": 0.5551905632019043, "learning_rate": 6.431912871783587e-06, "loss": 0.9331, "step": 226 }, { "epoch": 0.8407407407407408, "grad_norm": 0.6484084129333496, "learning_rate": 6.4034915793385e-06, "loss": 0.9936, "step": 227 }, { "epoch": 0.8444444444444444, "grad_norm": 0.6521108746528625, "learning_rate": 6.3750209744473105e-06, "loss": 0.974, "step": 228 }, { "epoch": 0.8481481481481481, "grad_norm": 0.7478381395339966, "learning_rate": 6.346502057440327e-06, "loss": 0.9569, "step": 229 }, { "epoch": 0.8518518518518519, "grad_norm": 0.6053647398948669, "learning_rate": 6.3179358303453386e-06, "loss": 0.8928, "step": 230 }, { "epoch": 0.8555555555555555, "grad_norm": 0.7461119890213013, "learning_rate": 6.289323296852393e-06, "loss": 0.9121, "step": 231 }, { "epoch": 0.8592592592592593, "grad_norm": 0.6154372692108154, "learning_rate": 6.260665462278544e-06, "loss": 0.952, "step": 232 }, { "epoch": 0.8629629629629629, "grad_norm": 0.710970938205719, "learning_rate": 6.231963333532516e-06, "loss": 0.9365, "step": 233 }, { "epoch": 0.8666666666666667, "grad_norm": 0.6357712149620056, "learning_rate": 6.203217919079343e-06, "loss": 0.8836, "step": 234 }, { "epoch": 0.8703703703703703, "grad_norm": 0.6976805329322815, "learning_rate": 6.17443022890492e-06, "loss": 0.9757, "step": 235 }, { "epoch": 0.8740740740740741, "grad_norm": 0.6872934699058533, "learning_rate": 6.145601274480521e-06, "loss": 0.9814, "step": 236 }, { "epoch": 0.8777777777777778, "grad_norm": 0.7947030663490295, "learning_rate": 6.116732068727271e-06, "loss": 0.9016, "step": 237 }, { "epoch": 0.8814814814814815, "grad_norm": 0.63334721326828, "learning_rate": 6.08782362598054e-06, "loss": 0.9679, "step": 238 }, { "epoch": 0.8851851851851852, "grad_norm": 0.5451921820640564, "learning_rate": 6.058876961954308e-06, "loss": 0.9511, "step": 239 }, { "epoch": 0.8888888888888888, "grad_norm": 0.5797951221466064, "learning_rate": 6.029893093705492e-06, "loss": 0.9553, "step": 240 }, { "epoch": 0.8925925925925926, "grad_norm": 0.5836870074272156, "learning_rate": 6.0008730395981905e-06, "loss": 0.9562, "step": 241 }, { "epoch": 0.8962962962962963, "grad_norm": 0.6153254508972168, "learning_rate": 5.971817819267914e-06, "loss": 0.9199, "step": 242 }, { "epoch": 0.9, "grad_norm": 0.6756653785705566, "learning_rate": 5.9427284535857585e-06, "loss": 0.9599, "step": 243 }, { "epoch": 0.9037037037037037, "grad_norm": 0.6547468304634094, "learning_rate": 5.9136059646225375e-06, "loss": 0.9485, "step": 244 }, { "epoch": 0.9074074074074074, "grad_norm": 0.7384520769119263, "learning_rate": 5.884451375612865e-06, "loss": 0.927, "step": 245 }, { "epoch": 0.9111111111111111, "grad_norm": 0.6480386853218079, "learning_rate": 5.855265710919211e-06, "loss": 1.0039, "step": 246 }, { "epoch": 0.9148148148148149, "grad_norm": 0.5494263768196106, "learning_rate": 5.826049995995905e-06, "loss": 0.9706, "step": 247 }, { "epoch": 0.9185185185185185, "grad_norm": 0.5438244342803955, "learning_rate": 5.796805257353109e-06, "loss": 0.963, "step": 248 }, { "epoch": 0.9222222222222223, "grad_norm": 0.6168299317359924, "learning_rate": 5.767532522520746e-06, "loss": 0.9594, "step": 249 }, { "epoch": 0.9259259259259259, "grad_norm": 0.6753399968147278, "learning_rate": 5.738232820012407e-06, "loss": 0.9181, "step": 250 }, { "epoch": 0.9296296296296296, "grad_norm": 0.5123042464256287, "learning_rate": 5.7089071792892e-06, "loss": 0.9216, "step": 251 }, { "epoch": 0.9333333333333333, "grad_norm": 0.7598656415939331, "learning_rate": 5.679556630723592e-06, "loss": 0.9725, "step": 252 }, { "epoch": 0.937037037037037, "grad_norm": 0.6306942701339722, "learning_rate": 5.6501822055631976e-06, "loss": 0.9041, "step": 253 }, { "epoch": 0.9407407407407408, "grad_norm": 0.7515453696250916, "learning_rate": 5.620784935894548e-06, "loss": 0.9192, "step": 254 }, { "epoch": 0.9444444444444444, "grad_norm": 0.6113058924674988, "learning_rate": 5.591365854606829e-06, "loss": 0.949, "step": 255 }, { "epoch": 0.9481481481481482, "grad_norm": 0.6589618921279907, "learning_rate": 5.561925995355595e-06, "loss": 0.9384, "step": 256 }, { "epoch": 0.9518518518518518, "grad_norm": 0.7518366575241089, "learning_rate": 5.532466392526439e-06, "loss": 0.8959, "step": 257 }, { "epoch": 0.9555555555555556, "grad_norm": 0.5112090110778809, "learning_rate": 5.5029880811986546e-06, "loss": 0.9214, "step": 258 }, { "epoch": 0.9592592592592593, "grad_norm": 0.6436278820037842, "learning_rate": 5.4734920971088766e-06, "loss": 0.9165, "step": 259 }, { "epoch": 0.9629629629629629, "grad_norm": 0.685821533203125, "learning_rate": 5.443979476614674e-06, "loss": 0.9114, "step": 260 }, { "epoch": 0.9666666666666667, "grad_norm": 0.5555897951126099, "learning_rate": 5.4144512566581495e-06, "loss": 0.9791, "step": 261 }, { "epoch": 0.9703703703703703, "grad_norm": 0.6167283058166504, "learning_rate": 5.384908474729501e-06, "loss": 0.9029, "step": 262 }, { "epoch": 0.9740740740740741, "grad_norm": 0.6644378304481506, "learning_rate": 5.3553521688305655e-06, "loss": 0.9659, "step": 263 }, { "epoch": 0.9777777777777777, "grad_norm": 0.6106395721435547, "learning_rate": 5.325783377438357e-06, "loss": 0.9161, "step": 264 }, { "epoch": 0.9814814814814815, "grad_norm": 0.6115413904190063, "learning_rate": 5.296203139468572e-06, "loss": 0.8719, "step": 265 }, { "epoch": 0.9851851851851852, "grad_norm": 0.8100462555885315, "learning_rate": 5.266612494239088e-06, "loss": 0.9013, "step": 266 }, { "epoch": 0.9888888888888889, "grad_norm": 0.7386695742607117, "learning_rate": 5.23701248143345e-06, "loss": 0.9151, "step": 267 }, { "epoch": 0.9925925925925926, "grad_norm": 0.5981118679046631, "learning_rate": 5.207404141064334e-06, "loss": 0.9077, "step": 268 }, { "epoch": 0.9962962962962963, "grad_norm": 0.5839532613754272, "learning_rate": 5.177788513437013e-06, "loss": 0.9564, "step": 269 }, { "epoch": 1.0, "grad_norm": 0.4894520938396454, "learning_rate": 5.148166639112799e-06, "loss": 0.9273, "step": 270 }, { "epoch": 1.0037037037037038, "grad_norm": 0.6211138963699341, "learning_rate": 5.118539558872489e-06, "loss": 0.9478, "step": 271 }, { "epoch": 1.0074074074074073, "grad_norm": 0.7439696192741394, "learning_rate": 5.088908313679788e-06, "loss": 0.9341, "step": 272 }, { "epoch": 1.0074074074074073, "eval_loss": 0.9261184930801392, "eval_runtime": 80.9898, "eval_samples_per_second": 3.593, "eval_steps_per_second": 0.457, "step": 272 }, { "epoch": 1.011111111111111, "grad_norm": 0.6589562296867371, "learning_rate": 5.059273944644742e-06, "loss": 0.9316, "step": 273 }, { "epoch": 1.0148148148148148, "grad_norm": 0.5672058463096619, "learning_rate": 5.029637492987153e-06, "loss": 0.9235, "step": 274 }, { "epoch": 1.0185185185185186, "grad_norm": 0.6068680882453918, "learning_rate": 5e-06, "loss": 0.9136, "step": 275 }, { "epoch": 1.0037037037037038, "grad_norm": 0.7259117960929871, "learning_rate": 4.970362507012848e-06, "loss": 0.8627, "step": 276 }, { "epoch": 1.0074074074074073, "grad_norm": 0.665239691734314, "learning_rate": 4.940726055355259e-06, "loss": 0.9385, "step": 277 }, { "epoch": 1.011111111111111, "grad_norm": 0.71152263879776, "learning_rate": 4.911091686320213e-06, "loss": 0.9532, "step": 278 }, { "epoch": 1.0148148148148148, "grad_norm": 0.7714909911155701, "learning_rate": 4.881460441127513e-06, "loss": 0.8689, "step": 279 }, { "epoch": 1.0185185185185186, "grad_norm": 0.6783362030982971, "learning_rate": 4.8518333608872015e-06, "loss": 0.948, "step": 280 }, { "epoch": 1.0222222222222221, "grad_norm": 0.5598512291908264, "learning_rate": 4.822211486562989e-06, "loss": 0.953, "step": 281 }, { "epoch": 1.025925925925926, "grad_norm": 0.7532334327697754, "learning_rate": 4.792595858935668e-06, "loss": 0.9703, "step": 282 }, { "epoch": 1.0296296296296297, "grad_norm": 0.7283293604850769, "learning_rate": 4.7629875185665505e-06, "loss": 0.9526, "step": 283 }, { "epoch": 1.0333333333333334, "grad_norm": 0.6575984358787537, "learning_rate": 4.733387505760913e-06, "loss": 0.9042, "step": 284 }, { "epoch": 1.037037037037037, "grad_norm": 0.5753719210624695, "learning_rate": 4.703796860531429e-06, "loss": 0.9009, "step": 285 }, { "epoch": 1.0407407407407407, "grad_norm": 0.7370662689208984, "learning_rate": 4.674216622561645e-06, "loss": 0.8645, "step": 286 }, { "epoch": 1.0444444444444445, "grad_norm": 0.602418839931488, "learning_rate": 4.644647831169435e-06, "loss": 0.9141, "step": 287 }, { "epoch": 1.048148148148148, "grad_norm": 0.7609613537788391, "learning_rate": 4.6150915252705005e-06, "loss": 0.8668, "step": 288 }, { "epoch": 1.0518518518518518, "grad_norm": 0.8010672330856323, "learning_rate": 4.585548743341851e-06, "loss": 0.9242, "step": 289 }, { "epoch": 1.0555555555555556, "grad_norm": 0.6908420324325562, "learning_rate": 4.556020523385326e-06, "loss": 0.9566, "step": 290 }, { "epoch": 1.0592592592592593, "grad_norm": 0.7219347357749939, "learning_rate": 4.526507902891124e-06, "loss": 0.8987, "step": 291 }, { "epoch": 1.0629629629629629, "grad_norm": 0.5726153254508972, "learning_rate": 4.497011918801347e-06, "loss": 0.9259, "step": 292 }, { "epoch": 1.0666666666666667, "grad_norm": 0.7002944350242615, "learning_rate": 4.467533607473563e-06, "loss": 0.9171, "step": 293 }, { "epoch": 1.0703703703703704, "grad_norm": 0.7401637434959412, "learning_rate": 4.438074004644407e-06, "loss": 0.9147, "step": 294 }, { "epoch": 1.074074074074074, "grad_norm": 0.7317702770233154, "learning_rate": 4.408634145393172e-06, "loss": 0.8777, "step": 295 }, { "epoch": 1.0777777777777777, "grad_norm": 0.586495041847229, "learning_rate": 4.379215064105454e-06, "loss": 0.8734, "step": 296 }, { "epoch": 1.0814814814814815, "grad_norm": 0.7603331804275513, "learning_rate": 4.349817794436805e-06, "loss": 0.9757, "step": 297 }, { "epoch": 1.0851851851851853, "grad_norm": 0.7039903402328491, "learning_rate": 4.32044336927641e-06, "loss": 0.9117, "step": 298 }, { "epoch": 1.0888888888888888, "grad_norm": 0.7265645265579224, "learning_rate": 4.2910928207108005e-06, "loss": 0.9547, "step": 299 }, { "epoch": 1.0925925925925926, "grad_norm": 0.5854629278182983, "learning_rate": 4.261767179987595e-06, "loss": 0.9309, "step": 300 }, { "epoch": 1.0962962962962963, "grad_norm": 0.7084276676177979, "learning_rate": 4.232467477479255e-06, "loss": 0.9414, "step": 301 }, { "epoch": 1.1, "grad_norm": 0.7032147645950317, "learning_rate": 4.203194742646893e-06, "loss": 0.846, "step": 302 }, { "epoch": 1.1037037037037036, "grad_norm": 0.7182865142822266, "learning_rate": 4.173950004004097e-06, "loss": 0.9737, "step": 303 }, { "epoch": 1.1074074074074074, "grad_norm": 0.6024776697158813, "learning_rate": 4.1447342890807905e-06, "loss": 0.8605, "step": 304 }, { "epoch": 1.1111111111111112, "grad_norm": 0.717693567276001, "learning_rate": 4.115548624387136e-06, "loss": 0.8731, "step": 305 }, { "epoch": 1.1148148148148147, "grad_norm": 0.8089867830276489, "learning_rate": 4.086394035377463e-06, "loss": 0.9019, "step": 306 }, { "epoch": 1.1185185185185185, "grad_norm": 0.5785974860191345, "learning_rate": 4.057271546414242e-06, "loss": 0.9574, "step": 307 }, { "epoch": 1.1222222222222222, "grad_norm": 0.7001700401306152, "learning_rate": 4.028182180732088e-06, "loss": 0.8993, "step": 308 }, { "epoch": 1.125925925925926, "grad_norm": 0.7361912131309509, "learning_rate": 3.99912696040181e-06, "loss": 0.9711, "step": 309 }, { "epoch": 1.1296296296296295, "grad_norm": 0.7708266973495483, "learning_rate": 3.970106906294509e-06, "loss": 0.9195, "step": 310 }, { "epoch": 1.1333333333333333, "grad_norm": 0.5702573657035828, "learning_rate": 3.9411230380456925e-06, "loss": 0.9393, "step": 311 }, { "epoch": 1.137037037037037, "grad_norm": 0.6527413725852966, "learning_rate": 3.912176374019462e-06, "loss": 0.9125, "step": 312 }, { "epoch": 1.1407407407407408, "grad_norm": 0.6216891407966614, "learning_rate": 3.88326793127273e-06, "loss": 0.8595, "step": 313 }, { "epoch": 1.1444444444444444, "grad_norm": 0.7108457684516907, "learning_rate": 3.85439872551948e-06, "loss": 0.945, "step": 314 }, { "epoch": 1.1481481481481481, "grad_norm": 0.564195990562439, "learning_rate": 3.825569771095082e-06, "loss": 0.9172, "step": 315 }, { "epoch": 1.151851851851852, "grad_norm": 0.7456059455871582, "learning_rate": 3.796782080920659e-06, "loss": 0.9229, "step": 316 }, { "epoch": 1.1555555555555554, "grad_norm": 0.6403030157089233, "learning_rate": 3.768036666467486e-06, "loss": 1.0, "step": 317 }, { "epoch": 1.1592592592592592, "grad_norm": 0.6477362513542175, "learning_rate": 3.7393345377214584e-06, "loss": 0.9649, "step": 318 }, { "epoch": 1.162962962962963, "grad_norm": 0.7265921831130981, "learning_rate": 3.7106767031476075e-06, "loss": 0.9558, "step": 319 }, { "epoch": 1.1666666666666667, "grad_norm": 0.6614460349082947, "learning_rate": 3.682064169654663e-06, "loss": 0.9338, "step": 320 }, { "epoch": 1.1703703703703703, "grad_norm": 0.8571596145629883, "learning_rate": 3.6534979425596747e-06, "loss": 0.8639, "step": 321 }, { "epoch": 1.174074074074074, "grad_norm": 0.7662659883499146, "learning_rate": 3.6249790255526916e-06, "loss": 0.9099, "step": 322 }, { "epoch": 1.1777777777777778, "grad_norm": 0.6332697868347168, "learning_rate": 3.5965084206615012e-06, "loss": 0.966, "step": 323 }, { "epoch": 1.1814814814814816, "grad_norm": 0.5719053149223328, "learning_rate": 3.568087128216414e-06, "loss": 0.9005, "step": 324 }, { "epoch": 1.1851851851851851, "grad_norm": 0.7472560405731201, "learning_rate": 3.539716146815122e-06, "loss": 0.8842, "step": 325 }, { "epoch": 1.1888888888888889, "grad_norm": 0.661870002746582, "learning_rate": 3.511396473287611e-06, "loss": 0.9547, "step": 326 }, { "epoch": 1.1925925925925926, "grad_norm": 0.8332524299621582, "learning_rate": 3.483129102661137e-06, "loss": 1.0097, "step": 327 }, { "epoch": 1.1962962962962962, "grad_norm": 0.7124307155609131, "learning_rate": 3.4549150281252635e-06, "loss": 0.9479, "step": 328 }, { "epoch": 1.2, "grad_norm": 0.6653727889060974, "learning_rate": 3.4267552409969694e-06, "loss": 0.9566, "step": 329 }, { "epoch": 1.2037037037037037, "grad_norm": 0.7246274948120117, "learning_rate": 3.398650730685813e-06, "loss": 0.8731, "step": 330 }, { "epoch": 1.2074074074074075, "grad_norm": 0.7679101824760437, "learning_rate": 3.3706024846591717e-06, "loss": 0.8851, "step": 331 }, { "epoch": 1.211111111111111, "grad_norm": 0.6830713152885437, "learning_rate": 3.3426114884075488e-06, "loss": 0.9429, "step": 332 }, { "epoch": 1.2148148148148148, "grad_norm": 0.686126172542572, "learning_rate": 3.3146787254099424e-06, "loss": 0.9363, "step": 333 }, { "epoch": 1.2185185185185186, "grad_norm": 1.0472172498703003, "learning_rate": 3.2868051770992935e-06, "loss": 0.8628, "step": 334 }, { "epoch": 1.2222222222222223, "grad_norm": 0.6732811331748962, "learning_rate": 3.258991822828007e-06, "loss": 0.9343, "step": 335 }, { "epoch": 1.2259259259259259, "grad_norm": 0.6006411910057068, "learning_rate": 3.2312396398335312e-06, "loss": 0.8932, "step": 336 }, { "epoch": 1.2296296296296296, "grad_norm": 0.6865448355674744, "learning_rate": 3.2035496032040303e-06, "loss": 0.9113, "step": 337 }, { "epoch": 1.2333333333333334, "grad_norm": 0.7750067114830017, "learning_rate": 3.175922685844125e-06, "loss": 0.8964, "step": 338 }, { "epoch": 1.237037037037037, "grad_norm": 0.6137946248054504, "learning_rate": 3.1483598584407003e-06, "loss": 0.9198, "step": 339 }, { "epoch": 1.2407407407407407, "grad_norm": 0.5940172672271729, "learning_rate": 3.1208620894288105e-06, "loss": 0.8925, "step": 340 }, { "epoch": 1.2407407407407407, "eval_loss": 0.9176353812217712, "eval_runtime": 80.9941, "eval_samples_per_second": 3.593, "eval_steps_per_second": 0.457, "step": 340 }, { "epoch": 1.2444444444444445, "grad_norm": 0.8746694326400757, "learning_rate": 3.093430344957643e-06, "loss": 0.9542, "step": 341 }, { "epoch": 1.2481481481481482, "grad_norm": 0.7152467370033264, "learning_rate": 3.0660655888565827e-06, "loss": 0.9122, "step": 342 }, { "epoch": 1.2518518518518518, "grad_norm": 0.665104866027832, "learning_rate": 3.038768782601335e-06, "loss": 0.8695, "step": 343 }, { "epoch": 1.2555555555555555, "grad_norm": 0.6397359371185303, "learning_rate": 3.0115408852801535e-06, "loss": 0.9026, "step": 344 }, { "epoch": 1.2592592592592593, "grad_norm": 0.6641426682472229, "learning_rate": 2.98438285356014e-06, "loss": 0.9131, "step": 345 }, { "epoch": 1.262962962962963, "grad_norm": 0.7378568053245544, "learning_rate": 2.9572956416536267e-06, "loss": 0.9778, "step": 346 }, { "epoch": 1.2666666666666666, "grad_norm": 0.7851204872131348, "learning_rate": 2.930280201284654e-06, "loss": 0.9295, "step": 347 }, { "epoch": 1.2703703703703704, "grad_norm": 0.7360734939575195, "learning_rate": 2.9033374816555338e-06, "loss": 0.8333, "step": 348 }, { "epoch": 1.2740740740740741, "grad_norm": 0.5486617088317871, "learning_rate": 2.8764684294134872e-06, "loss": 0.8636, "step": 349 }, { "epoch": 1.2777777777777777, "grad_norm": 0.6200026273727417, "learning_rate": 2.8496739886173994e-06, "loss": 0.9163, "step": 350 }, { "epoch": 1.2814814814814814, "grad_norm": 0.7656910419464111, "learning_rate": 2.822955100704634e-06, "loss": 0.8811, "step": 351 }, { "epoch": 1.2851851851851852, "grad_norm": 0.8108608722686768, "learning_rate": 2.7963127044579697e-06, "loss": 0.9206, "step": 352 }, { "epoch": 1.2888888888888888, "grad_norm": 0.7808861136436462, "learning_rate": 2.769747735972605e-06, "loss": 0.9116, "step": 353 }, { "epoch": 1.2925925925925925, "grad_norm": 0.6127861142158508, "learning_rate": 2.743261128623269e-06, "loss": 0.8986, "step": 354 }, { "epoch": 1.2962962962962963, "grad_norm": 0.8103310465812683, "learning_rate": 2.716853813031435e-06, "loss": 0.8832, "step": 355 }, { "epoch": 1.3, "grad_norm": 0.658495306968689, "learning_rate": 2.6905267170326143e-06, "loss": 0.9457, "step": 356 }, { "epoch": 1.3037037037037038, "grad_norm": 0.6721301078796387, "learning_rate": 2.6642807656437565e-06, "loss": 0.9182, "step": 357 }, { "epoch": 1.3074074074074074, "grad_norm": 0.6494591236114502, "learning_rate": 2.6381168810307536e-06, "loss": 0.9245, "step": 358 }, { "epoch": 1.3111111111111111, "grad_norm": 0.6653662919998169, "learning_rate": 2.612035982476039e-06, "loss": 0.9654, "step": 359 }, { "epoch": 1.3148148148148149, "grad_norm": 0.6556596159934998, "learning_rate": 2.5860389863462765e-06, "loss": 0.9552, "step": 360 }, { "epoch": 1.3185185185185184, "grad_norm": 0.7767282724380493, "learning_rate": 2.5601268060601816e-06, "loss": 0.901, "step": 361 }, { "epoch": 1.3222222222222222, "grad_norm": 0.6174845099449158, "learning_rate": 2.534300352056416e-06, "loss": 0.8979, "step": 362 }, { "epoch": 1.325925925925926, "grad_norm": 0.5829298496246338, "learning_rate": 2.508560531761597e-06, "loss": 0.9774, "step": 363 }, { "epoch": 1.3296296296296295, "grad_norm": 0.6260789632797241, "learning_rate": 2.4829082495584244e-06, "loss": 0.9693, "step": 364 }, { "epoch": 1.3333333333333333, "grad_norm": 0.6920310854911804, "learning_rate": 2.457344406753899e-06, "loss": 0.9345, "step": 365 }, { "epoch": 1.337037037037037, "grad_norm": 0.6177324652671814, "learning_rate": 2.4318699015476495e-06, "loss": 0.9274, "step": 366 }, { "epoch": 1.3407407407407408, "grad_norm": 0.6547250151634216, "learning_rate": 2.4064856290003863e-06, "loss": 0.9309, "step": 367 }, { "epoch": 1.3444444444444446, "grad_norm": 0.7775738835334778, "learning_rate": 2.3811924810024385e-06, "loss": 0.9607, "step": 368 }, { "epoch": 1.348148148148148, "grad_norm": 0.8030884861946106, "learning_rate": 2.35599134624243e-06, "loss": 0.9343, "step": 369 }, { "epoch": 1.3518518518518519, "grad_norm": 0.5836490988731384, "learning_rate": 2.330883110176049e-06, "loss": 0.9088, "step": 370 }, { "epoch": 1.3555555555555556, "grad_norm": 0.7231821417808533, "learning_rate": 2.3058686549949306e-06, "loss": 0.8505, "step": 371 }, { "epoch": 1.3592592592592592, "grad_norm": 0.7363606095314026, "learning_rate": 2.2809488595956746e-06, "loss": 0.9975, "step": 372 }, { "epoch": 1.362962962962963, "grad_norm": 0.7326072454452515, "learning_rate": 2.256124599548957e-06, "loss": 0.9272, "step": 373 }, { "epoch": 1.3666666666666667, "grad_norm": 0.6802873015403748, "learning_rate": 2.2313967470687593e-06, "loss": 0.9038, "step": 374 }, { "epoch": 1.3703703703703702, "grad_norm": 0.6956616640090942, "learning_rate": 2.2067661709817384e-06, "loss": 0.9062, "step": 375 }, { "epoch": 1.374074074074074, "grad_norm": 0.5167267322540283, "learning_rate": 2.18223373669669e-06, "loss": 0.8963, "step": 376 }, { "epoch": 1.3777777777777778, "grad_norm": 0.5965335965156555, "learning_rate": 2.157800306174139e-06, "loss": 0.9253, "step": 377 }, { "epoch": 1.3814814814814815, "grad_norm": 0.6725478768348694, "learning_rate": 2.1334667378960642e-06, "loss": 0.9349, "step": 378 }, { "epoch": 1.3851851851851853, "grad_norm": 0.6209405064582825, "learning_rate": 2.1092338868357305e-06, "loss": 0.9129, "step": 379 }, { "epoch": 1.3888888888888888, "grad_norm": 0.7127699255943298, "learning_rate": 2.0851026044276405e-06, "loss": 0.9502, "step": 380 }, { "epoch": 1.3925925925925926, "grad_norm": 0.7374861836433411, "learning_rate": 2.061073738537635e-06, "loss": 0.913, "step": 381 }, { "epoch": 1.3962962962962964, "grad_norm": 0.6551845669746399, "learning_rate": 2.0371481334330913e-06, "loss": 0.9493, "step": 382 }, { "epoch": 1.4, "grad_norm": 0.7371388077735901, "learning_rate": 2.013326629753259e-06, "loss": 0.9285, "step": 383 }, { "epoch": 1.4037037037037037, "grad_norm": 0.7790321111679077, "learning_rate": 1.9896100644797316e-06, "loss": 0.8598, "step": 384 }, { "epoch": 1.4074074074074074, "grad_norm": 0.6032355427742004, "learning_rate": 1.9659992709070346e-06, "loss": 0.9298, "step": 385 }, { "epoch": 1.411111111111111, "grad_norm": 0.7076795101165771, "learning_rate": 1.9424950786133414e-06, "loss": 0.92, "step": 386 }, { "epoch": 1.4148148148148147, "grad_norm": 0.8255873322486877, "learning_rate": 1.919098313431335e-06, "loss": 0.8778, "step": 387 }, { "epoch": 1.4185185185185185, "grad_norm": 0.7901713848114014, "learning_rate": 1.8958097974191909e-06, "loss": 0.8844, "step": 388 }, { "epoch": 1.4222222222222223, "grad_norm": 0.7087342739105225, "learning_rate": 1.8726303488316822e-06, "loss": 0.9575, "step": 389 }, { "epoch": 1.425925925925926, "grad_norm": 0.7450980544090271, "learning_rate": 1.8495607820914451e-06, "loss": 0.811, "step": 390 }, { "epoch": 1.4296296296296296, "grad_norm": 0.5872119665145874, "learning_rate": 1.826601907760357e-06, "loss": 0.9075, "step": 391 }, { "epoch": 1.4333333333333333, "grad_norm": 0.7893672585487366, "learning_rate": 1.8037545325110506e-06, "loss": 0.9549, "step": 392 }, { "epoch": 1.4370370370370371, "grad_norm": 0.7330056428909302, "learning_rate": 1.781019459098584e-06, "loss": 0.9366, "step": 393 }, { "epoch": 1.4407407407407407, "grad_norm": 0.6859740614891052, "learning_rate": 1.7583974863322272e-06, "loss": 0.9284, "step": 394 }, { "epoch": 1.4444444444444444, "grad_norm": 0.7870423793792725, "learning_rate": 1.7358894090473928e-06, "loss": 0.8698, "step": 395 }, { "epoch": 1.4481481481481482, "grad_norm": 0.6686491370201111, "learning_rate": 1.7134960180777171e-06, "loss": 0.9149, "step": 396 }, { "epoch": 1.4518518518518517, "grad_norm": 0.7235841751098633, "learning_rate": 1.6912181002272714e-06, "loss": 0.9068, "step": 397 }, { "epoch": 1.4555555555555555, "grad_norm": 0.6145541667938232, "learning_rate": 1.6690564382429104e-06, "loss": 0.8985, "step": 398 }, { "epoch": 1.4592592592592593, "grad_norm": 0.5945561528205872, "learning_rate": 1.6470118107867777e-06, "loss": 0.9318, "step": 399 }, { "epoch": 1.462962962962963, "grad_norm": 0.6769958734512329, "learning_rate": 1.6250849924089485e-06, "loss": 0.9207, "step": 400 }, { "epoch": 1.4666666666666668, "grad_norm": 0.6360178589820862, "learning_rate": 1.6032767535202042e-06, "loss": 0.9344, "step": 401 }, { "epoch": 1.4703703703703703, "grad_norm": 0.6406002640724182, "learning_rate": 1.581587860364977e-06, "loss": 0.8903, "step": 402 }, { "epoch": 1.474074074074074, "grad_norm": 0.7576456069946289, "learning_rate": 1.560019074994416e-06, "loss": 0.8949, "step": 403 }, { "epoch": 1.4777777777777779, "grad_norm": 0.8080588579177856, "learning_rate": 1.5385711552396227e-06, "loss": 0.9252, "step": 404 }, { "epoch": 1.4814814814814814, "grad_norm": 0.6083511114120483, "learning_rate": 1.5172448546850166e-06, "loss": 0.9096, "step": 405 }, { "epoch": 1.4851851851851852, "grad_norm": 0.7563885450363159, "learning_rate": 1.4960409226418576e-06, "loss": 0.966, "step": 406 }, { "epoch": 1.488888888888889, "grad_norm": 0.8337453603744507, "learning_rate": 1.4749601041219246e-06, "loss": 0.933, "step": 407 }, { "epoch": 1.4925925925925925, "grad_norm": 0.5826141238212585, "learning_rate": 1.4540031398113335e-06, "loss": 0.896, "step": 408 }, { "epoch": 1.4925925925925925, "eval_loss": 0.9128310084342957, "eval_runtime": 80.6954, "eval_samples_per_second": 3.606, "eval_steps_per_second": 0.459, "step": 408 }, { "epoch": 1.4962962962962962, "grad_norm": 0.64705890417099, "learning_rate": 1.4331707660445155e-06, "loss": 0.8723, "step": 409 }, { "epoch": 1.5, "grad_norm": 0.6053957939147949, "learning_rate": 1.4124637147783431e-06, "loss": 0.8731, "step": 410 }, { "epoch": 1.5037037037037035, "grad_norm": 0.8161659836769104, "learning_rate": 1.3918827135664186e-06, "loss": 0.9542, "step": 411 }, { "epoch": 1.5074074074074075, "grad_norm": 0.8037695288658142, "learning_rate": 1.371428485533498e-06, "loss": 0.9263, "step": 412 }, { "epoch": 1.511111111111111, "grad_norm": 0.6966097950935364, "learning_rate": 1.3511017493501005e-06, "loss": 0.9611, "step": 413 }, { "epoch": 1.5148148148148148, "grad_norm": 0.7274028062820435, "learning_rate": 1.3309032192072463e-06, "loss": 0.8968, "step": 414 }, { "epoch": 1.5185185185185186, "grad_norm": 0.657966136932373, "learning_rate": 1.3108336047913633e-06, "loss": 0.9025, "step": 415 }, { "epoch": 1.5222222222222221, "grad_norm": 0.7330816388130188, "learning_rate": 1.29089361125936e-06, "loss": 0.9422, "step": 416 }, { "epoch": 1.525925925925926, "grad_norm": 0.6839099526405334, "learning_rate": 1.2710839392138386e-06, "loss": 0.9604, "step": 417 }, { "epoch": 1.5296296296296297, "grad_norm": 0.7069361805915833, "learning_rate": 1.251405284678488e-06, "loss": 0.9125, "step": 418 }, { "epoch": 1.5333333333333332, "grad_norm": 0.6964563131332397, "learning_rate": 1.2318583390736256e-06, "loss": 0.899, "step": 419 }, { "epoch": 1.5370370370370372, "grad_norm": 0.6390750408172607, "learning_rate": 1.2124437891918995e-06, "loss": 0.8699, "step": 420 }, { "epoch": 1.5407407407407407, "grad_norm": 0.7302431464195251, "learning_rate": 1.1931623171741653e-06, "loss": 0.9124, "step": 421 }, { "epoch": 1.5444444444444443, "grad_norm": 0.6087197065353394, "learning_rate": 1.1740146004855141e-06, "loss": 0.8754, "step": 422 }, { "epoch": 1.5481481481481483, "grad_norm": 0.768294632434845, "learning_rate": 1.1550013118914665e-06, "loss": 0.9578, "step": 423 }, { "epoch": 1.5518518518518518, "grad_norm": 0.6789171099662781, "learning_rate": 1.1361231194343436e-06, "loss": 0.9235, "step": 424 }, { "epoch": 1.5555555555555556, "grad_norm": 0.9581536054611206, "learning_rate": 1.1173806864097885e-06, "loss": 0.8839, "step": 425 }, { "epoch": 1.5592592592592593, "grad_norm": 0.5669332146644592, "learning_rate": 1.0987746713434578e-06, "loss": 0.9126, "step": 426 }, { "epoch": 1.5629629629629629, "grad_norm": 0.8494063019752502, "learning_rate": 1.080305727967893e-06, "loss": 0.8771, "step": 427 }, { "epoch": 1.5666666666666667, "grad_norm": 0.7187017202377319, "learning_rate": 1.0619745051995473e-06, "loss": 0.9504, "step": 428 }, { "epoch": 1.5703703703703704, "grad_norm": 0.5970302820205688, "learning_rate": 1.043781647115979e-06, "loss": 0.8693, "step": 429 }, { "epoch": 1.574074074074074, "grad_norm": 0.6068917512893677, "learning_rate": 1.0257277929332332e-06, "loss": 0.9556, "step": 430 }, { "epoch": 1.5777777777777777, "grad_norm": 0.5248574018478394, "learning_rate": 1.0078135769833758e-06, "loss": 0.9034, "step": 431 }, { "epoch": 1.5814814814814815, "grad_norm": 0.593900740146637, "learning_rate": 9.900396286922025e-07, "loss": 0.9028, "step": 432 }, { "epoch": 1.585185185185185, "grad_norm": 0.6070235371589661, "learning_rate": 9.72406572557133e-07, "loss": 0.8641, "step": 433 }, { "epoch": 1.588888888888889, "grad_norm": 0.6419976353645325, "learning_rate": 9.549150281252633e-07, "loss": 0.9095, "step": 434 }, { "epoch": 1.5925925925925926, "grad_norm": 0.8620632290840149, "learning_rate": 9.375656099715935e-07, "loss": 0.974, "step": 435 }, { "epoch": 1.5962962962962963, "grad_norm": 0.6101662516593933, "learning_rate": 9.203589276774438e-07, "loss": 0.8868, "step": 436 }, { "epoch": 1.6, "grad_norm": 0.6099417209625244, "learning_rate": 9.032955858090319e-07, "loss": 0.8978, "step": 437 }, { "epoch": 1.6037037037037036, "grad_norm": 0.8809055685997009, "learning_rate": 8.86376183896226e-07, "loss": 0.8905, "step": 438 }, { "epoch": 1.6074074074074074, "grad_norm": 0.5985243320465088, "learning_rate": 8.696013164114902e-07, "loss": 0.8914, "step": 439 }, { "epoch": 1.6111111111111112, "grad_norm": 0.6197245121002197, "learning_rate": 8.529715727489912e-07, "loss": 0.8962, "step": 440 }, { "epoch": 1.6148148148148147, "grad_norm": 0.6981231570243835, "learning_rate": 8.364875372038878e-07, "loss": 0.9588, "step": 441 }, { "epoch": 1.6185185185185185, "grad_norm": 0.9516739249229431, "learning_rate": 8.201497889518073e-07, "loss": 0.894, "step": 442 }, { "epoch": 1.6222222222222222, "grad_norm": 0.7582102417945862, "learning_rate": 8.039589020284926e-07, "loss": 0.8848, "step": 443 }, { "epoch": 1.6259259259259258, "grad_norm": 0.7806898355484009, "learning_rate": 7.879154453096305e-07, "loss": 0.9589, "step": 444 }, { "epoch": 1.6296296296296298, "grad_norm": 0.7488991022109985, "learning_rate": 7.720199824908692e-07, "loss": 0.876, "step": 445 }, { "epoch": 1.6333333333333333, "grad_norm": 0.7136239409446716, "learning_rate": 7.562730720680111e-07, "loss": 0.9316, "step": 446 }, { "epoch": 1.637037037037037, "grad_norm": 0.7333770990371704, "learning_rate": 7.406752673173851e-07, "loss": 0.8628, "step": 447 }, { "epoch": 1.6407407407407408, "grad_norm": 0.8281632661819458, "learning_rate": 7.25227116276413e-07, "loss": 0.9111, "step": 448 }, { "epoch": 1.6444444444444444, "grad_norm": 0.742244303226471, "learning_rate": 7.099291617243526e-07, "loss": 1.0076, "step": 449 }, { "epoch": 1.6481481481481481, "grad_norm": 0.9548508524894714, "learning_rate": 6.947819411632223e-07, "loss": 0.8449, "step": 450 }, { "epoch": 1.651851851851852, "grad_norm": 0.6826335787773132, "learning_rate": 6.797859867989226e-07, "loss": 0.9281, "step": 451 }, { "epoch": 1.6555555555555554, "grad_norm": 0.7542606592178345, "learning_rate": 6.649418255225298e-07, "loss": 0.9343, "step": 452 }, { "epoch": 1.6592592592592592, "grad_norm": 0.6692271828651428, "learning_rate": 6.502499788917893e-07, "loss": 0.9217, "step": 453 }, { "epoch": 1.662962962962963, "grad_norm": 0.6740605235099792, "learning_rate": 6.357109631127889e-07, "loss": 0.9304, "step": 454 }, { "epoch": 1.6666666666666665, "grad_norm": 0.7319871187210083, "learning_rate": 6.213252890218163e-07, "loss": 0.937, "step": 455 }, { "epoch": 1.6703703703703705, "grad_norm": 0.5654203295707703, "learning_rate": 6.07093462067419e-07, "loss": 0.8914, "step": 456 }, { "epoch": 1.674074074074074, "grad_norm": 0.5713678002357483, "learning_rate": 5.930159822926407e-07, "loss": 0.9257, "step": 457 }, { "epoch": 1.6777777777777778, "grad_norm": 0.838940441608429, "learning_rate": 5.79093344317449e-07, "loss": 0.9104, "step": 458 }, { "epoch": 1.6814814814814816, "grad_norm": 0.6449588537216187, "learning_rate": 5.653260373213632e-07, "loss": 0.8805, "step": 459 }, { "epoch": 1.6851851851851851, "grad_norm": 0.7371458411216736, "learning_rate": 5.517145450262639e-07, "loss": 0.8835, "step": 460 }, { "epoch": 1.6888888888888889, "grad_norm": 0.679885983467102, "learning_rate": 5.382593456793933e-07, "loss": 0.9306, "step": 461 }, { "epoch": 1.6925925925925926, "grad_norm": 0.5046345591545105, "learning_rate": 5.249609120365579e-07, "loss": 0.8928, "step": 462 }, { "epoch": 1.6962962962962962, "grad_norm": 0.8400017023086548, "learning_rate": 5.118197113455164e-07, "loss": 0.9142, "step": 463 }, { "epoch": 1.7, "grad_norm": 0.6406440734863281, "learning_rate": 4.988362053295564e-07, "loss": 0.8868, "step": 464 }, { "epoch": 1.7037037037037037, "grad_norm": 0.633682906627655, "learning_rate": 4.860108501712824e-07, "loss": 0.913, "step": 465 }, { "epoch": 1.7074074074074073, "grad_norm": 0.5694250464439392, "learning_rate": 4.733440964965791e-07, "loss": 0.9226, "step": 466 }, { "epoch": 1.7111111111111112, "grad_norm": 0.7359141707420349, "learning_rate": 4.6083638935878025e-07, "loss": 0.9148, "step": 467 }, { "epoch": 1.7148148148148148, "grad_norm": 0.669040322303772, "learning_rate": 4.484881682230341e-07, "loss": 0.8438, "step": 468 }, { "epoch": 1.7185185185185186, "grad_norm": 0.6235198974609375, "learning_rate": 4.3629986695086166e-07, "loss": 0.9097, "step": 469 }, { "epoch": 1.7222222222222223, "grad_norm": 0.617144763469696, "learning_rate": 4.242719137849077e-07, "loss": 0.8961, "step": 470 }, { "epoch": 1.7259259259259259, "grad_norm": 0.6972277164459229, "learning_rate": 4.124047313339025e-07, "loss": 0.8856, "step": 471 }, { "epoch": 1.7296296296296296, "grad_norm": 0.7551361322402954, "learning_rate": 4.00698736557808e-07, "loss": 0.9143, "step": 472 }, { "epoch": 1.7333333333333334, "grad_norm": 0.7269377708435059, "learning_rate": 3.891543407531673e-07, "loss": 0.9145, "step": 473 }, { "epoch": 1.737037037037037, "grad_norm": 0.7635181546211243, "learning_rate": 3.777719495386567e-07, "loss": 0.8855, "step": 474 }, { "epoch": 1.7407407407407407, "grad_norm": 0.745471715927124, "learning_rate": 3.665519628408332e-07, "loss": 0.9049, "step": 475 }, { "epoch": 1.7444444444444445, "grad_norm": 0.983215868473053, "learning_rate": 3.5549477488007853e-07, "loss": 0.8904, "step": 476 }, { "epoch": 1.7444444444444445, "eval_loss": 0.910611093044281, "eval_runtime": 80.7891, "eval_samples_per_second": 3.602, "eval_steps_per_second": 0.458, "step": 476 }, { "epoch": 1.748148148148148, "grad_norm": 0.645391047000885, "learning_rate": 3.4460077415675473e-07, "loss": 0.9156, "step": 477 }, { "epoch": 1.751851851851852, "grad_norm": 0.7084014415740967, "learning_rate": 3.3387034343755063e-07, "loss": 0.9417, "step": 478 }, { "epoch": 1.7555555555555555, "grad_norm": 0.6383021473884583, "learning_rate": 3.2330385974203184e-07, "loss": 0.9339, "step": 479 }, { "epoch": 1.7592592592592593, "grad_norm": 0.6533625721931458, "learning_rate": 3.1290169432939556e-07, "loss": 0.9548, "step": 480 }, { "epoch": 1.762962962962963, "grad_norm": 0.5707213878631592, "learning_rate": 3.0266421268542734e-07, "loss": 0.9544, "step": 481 }, { "epoch": 1.7666666666666666, "grad_norm": 0.6648502945899963, "learning_rate": 2.925917745096568e-07, "loss": 0.8525, "step": 482 }, { "epoch": 1.7703703703703704, "grad_norm": 0.6798570156097412, "learning_rate": 2.826847337027222e-07, "loss": 0.9217, "step": 483 }, { "epoch": 1.7740740740740741, "grad_norm": 0.709642231464386, "learning_rate": 2.7294343835393366e-07, "loss": 0.8996, "step": 484 }, { "epoch": 1.7777777777777777, "grad_norm": 0.7643037438392639, "learning_rate": 2.6336823072904305e-07, "loss": 0.8625, "step": 485 }, { "epoch": 1.7814814814814814, "grad_norm": 0.7004448771476746, "learning_rate": 2.539594472582213e-07, "loss": 0.9118, "step": 486 }, { "epoch": 1.7851851851851852, "grad_norm": 0.6062957048416138, "learning_rate": 2.447174185242324e-07, "loss": 0.8849, "step": 487 }, { "epoch": 1.7888888888888888, "grad_norm": 0.6031033396720886, "learning_rate": 2.3564246925082358e-07, "loss": 0.924, "step": 488 }, { "epoch": 1.7925925925925927, "grad_norm": 0.6818556189537048, "learning_rate": 2.2673491829131365e-07, "loss": 0.9206, "step": 489 }, { "epoch": 1.7962962962962963, "grad_norm": 0.7448561191558838, "learning_rate": 2.179950786173879e-07, "loss": 0.8549, "step": 490 }, { "epoch": 1.8, "grad_norm": 0.603404700756073, "learning_rate": 2.0942325730810565e-07, "loss": 0.919, "step": 491 }, { "epoch": 1.8037037037037038, "grad_norm": 0.7165398001670837, "learning_rate": 2.01019755539108e-07, "loss": 0.9021, "step": 492 }, { "epoch": 1.8074074074074074, "grad_norm": 0.7593845725059509, "learning_rate": 1.9278486857203683e-07, "loss": 0.9153, "step": 493 }, { "epoch": 1.8111111111111111, "grad_norm": 0.6313470602035522, "learning_rate": 1.8471888574415953e-07, "loss": 0.9106, "step": 494 }, { "epoch": 1.8148148148148149, "grad_norm": 0.6173641085624695, "learning_rate": 1.7682209045820687e-07, "loss": 0.8888, "step": 495 }, { "epoch": 1.8185185185185184, "grad_norm": 0.76194167137146, "learning_rate": 1.690947601724091e-07, "loss": 0.9064, "step": 496 }, { "epoch": 1.8222222222222222, "grad_norm": 0.675755500793457, "learning_rate": 1.6153716639075223e-07, "loss": 0.9266, "step": 497 }, { "epoch": 1.825925925925926, "grad_norm": 0.7498816847801208, "learning_rate": 1.5414957465343883e-07, "loss": 0.8579, "step": 498 }, { "epoch": 1.8296296296296295, "grad_norm": 0.656910240650177, "learning_rate": 1.4693224452755284e-07, "loss": 0.821, "step": 499 }, { "epoch": 1.8333333333333335, "grad_norm": 0.6735762357711792, "learning_rate": 1.3988542959794627e-07, "loss": 0.8731, "step": 500 }, { "epoch": 1.837037037037037, "grad_norm": 0.6537667512893677, "learning_rate": 1.330093774583252e-07, "loss": 0.9249, "step": 501 }, { "epoch": 1.8407407407407408, "grad_norm": 0.6112355589866638, "learning_rate": 1.2630432970255014e-07, "loss": 0.8936, "step": 502 }, { "epoch": 1.8444444444444446, "grad_norm": 0.7084822058677673, "learning_rate": 1.1977052191615158e-07, "loss": 0.9221, "step": 503 }, { "epoch": 1.848148148148148, "grad_norm": 0.652979850769043, "learning_rate": 1.1340818366804728e-07, "loss": 0.9073, "step": 504 }, { "epoch": 1.8518518518518519, "grad_norm": 0.6980672478675842, "learning_rate": 1.0721753850247984e-07, "loss": 0.9294, "step": 505 }, { "epoch": 1.8555555555555556, "grad_norm": 0.7224528789520264, "learning_rate": 1.0119880393116177e-07, "loss": 0.8842, "step": 506 }, { "epoch": 1.8592592592592592, "grad_norm": 0.6340327262878418, "learning_rate": 9.535219142563168e-08, "loss": 0.9598, "step": 507 }, { "epoch": 1.862962962962963, "grad_norm": 0.686582624912262, "learning_rate": 8.967790640982466e-08, "loss": 0.9344, "step": 508 }, { "epoch": 1.8666666666666667, "grad_norm": 0.5959629416465759, "learning_rate": 8.417614825285636e-08, "loss": 0.9026, "step": 509 }, { "epoch": 1.8703703703703702, "grad_norm": 0.6575542092323303, "learning_rate": 7.884711026201586e-08, "loss": 0.9288, "step": 510 }, { "epoch": 1.8740740740740742, "grad_norm": 0.5704898238182068, "learning_rate": 7.369097967597493e-08, "loss": 0.8636, "step": 511 }, { "epoch": 1.8777777777777778, "grad_norm": 0.6155747771263123, "learning_rate": 6.870793765820783e-08, "loss": 0.8362, "step": 512 }, { "epoch": 1.8814814814814815, "grad_norm": 0.6208741664886475, "learning_rate": 6.389815929062848e-08, "loss": 0.9179, "step": 513 }, { "epoch": 1.8851851851851853, "grad_norm": 0.7014544010162354, "learning_rate": 5.92618135674361e-08, "loss": 0.9333, "step": 514 }, { "epoch": 1.8888888888888888, "grad_norm": 0.814078152179718, "learning_rate": 5.479906338917984e-08, "loss": 0.9186, "step": 515 }, { "epoch": 1.8925925925925926, "grad_norm": 0.7297834753990173, "learning_rate": 5.0510065557034526e-08, "loss": 0.8992, "step": 516 }, { "epoch": 1.8962962962962964, "grad_norm": 0.6444849371910095, "learning_rate": 4.639497076728949e-08, "loss": 0.94, "step": 517 }, { "epoch": 1.9, "grad_norm": 0.5319604873657227, "learning_rate": 4.245392360605727e-08, "loss": 0.9396, "step": 518 }, { "epoch": 1.9037037037037037, "grad_norm": 0.9374611973762512, "learning_rate": 3.86870625441893e-08, "loss": 0.9718, "step": 519 }, { "epoch": 1.9074074074074074, "grad_norm": 0.5568841695785522, "learning_rate": 3.5094519932415417e-08, "loss": 0.89, "step": 520 }, { "epoch": 1.911111111111111, "grad_norm": 0.9112274646759033, "learning_rate": 3.167642199668863e-08, "loss": 0.925, "step": 521 }, { "epoch": 1.914814814814815, "grad_norm": 0.655830979347229, "learning_rate": 2.843288883375539e-08, "loss": 0.9135, "step": 522 }, { "epoch": 1.9185185185185185, "grad_norm": 0.5499829649925232, "learning_rate": 2.5364034406930026e-08, "loss": 0.8902, "step": 523 }, { "epoch": 1.9222222222222223, "grad_norm": 0.9093420505523682, "learning_rate": 2.2469966542096323e-08, "loss": 0.971, "step": 524 }, { "epoch": 1.925925925925926, "grad_norm": 0.8075233101844788, "learning_rate": 1.975078692391552e-08, "loss": 0.9288, "step": 525 }, { "epoch": 1.9296296296296296, "grad_norm": 0.6721240282058716, "learning_rate": 1.7206591092253642e-08, "loss": 0.8983, "step": 526 }, { "epoch": 1.9333333333333333, "grad_norm": 0.6682837605476379, "learning_rate": 1.4837468438826385e-08, "loss": 0.9423, "step": 527 }, { "epoch": 1.9370370370370371, "grad_norm": 0.653581440448761, "learning_rate": 1.264350220405719e-08, "loss": 0.9542, "step": 528 }, { "epoch": 1.9407407407407407, "grad_norm": 0.5496450066566467, "learning_rate": 1.0624769474152363e-08, "loss": 0.9059, "step": 529 }, { "epoch": 1.9444444444444444, "grad_norm": 0.7037910223007202, "learning_rate": 8.781341178393244e-09, "loss": 0.8928, "step": 530 }, { "epoch": 1.9481481481481482, "grad_norm": 0.6391336917877197, "learning_rate": 7.1132820866431915e-09, "loss": 0.8936, "step": 531 }, { "epoch": 1.9518518518518517, "grad_norm": 0.7979388236999512, "learning_rate": 5.620650807073857e-09, "loss": 0.8871, "step": 532 }, { "epoch": 1.9555555555555557, "grad_norm": 0.6291653513908386, "learning_rate": 4.303499784102383e-09, "loss": 0.8815, "step": 533 }, { "epoch": 1.9592592592592593, "grad_norm": 0.7071843147277832, "learning_rate": 3.1618752965534295e-09, "loss": 0.8984, "step": 534 }, { "epoch": 1.9629629629629628, "grad_norm": 0.5879070162773132, "learning_rate": 2.19581745602826e-09, "loss": 0.849, "step": 535 }, { "epoch": 1.9666666666666668, "grad_norm": 0.743624746799469, "learning_rate": 1.4053602054991954e-09, "loss": 0.879, "step": 536 }, { "epoch": 1.9703703703703703, "grad_norm": 0.5870293974876404, "learning_rate": 7.905313181150176e-10, "loss": 0.9257, "step": 537 }, { "epoch": 1.974074074074074, "grad_norm": 0.7187138199806213, "learning_rate": 3.513523962256349e-10, "loss": 0.9768, "step": 538 }, { "epoch": 1.9777777777777779, "grad_norm": 0.6711537837982178, "learning_rate": 8.783887062324692e-11, "loss": 0.9182, "step": 539 }, { "epoch": 1.9814814814814814, "grad_norm": 0.66741943359375, "learning_rate": 0.0, "loss": 0.8763, "step": 540 } ], "logging_steps": 1, "max_steps": 540, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 135, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.9871043243514266e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }