{ "best_metric": 0.7647964954376221, "best_model_checkpoint": "outputs/checkpoint-350", "epoch": 0.21895527056615577, "eval_steps": 25, "global_step": 350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006255864873318737, "grad_norm": 3.302262783050537, "learning_rate": 4e-05, "loss": 1.7639, "step": 1 }, { "epoch": 0.0012511729746637473, "grad_norm": 3.7728819847106934, "learning_rate": 8e-05, "loss": 2.3471, "step": 2 }, { "epoch": 0.001876759461995621, "grad_norm": 3.575211763381958, "learning_rate": 0.00012, "loss": 1.274, "step": 3 }, { "epoch": 0.0025023459493274947, "grad_norm": 4.3921918869018555, "learning_rate": 0.00016, "loss": 1.8361, "step": 4 }, { "epoch": 0.0031279324366593683, "grad_norm": 3.215696096420288, "learning_rate": 0.0002, "loss": 2.8766, "step": 5 }, { "epoch": 0.003753518923991242, "grad_norm": 4.060017108917236, "learning_rate": 0.0001996638655462185, "loss": 1.4329, "step": 6 }, { "epoch": 0.004379105411323116, "grad_norm": 2.7935523986816406, "learning_rate": 0.00019932773109243698, "loss": 1.2844, "step": 7 }, { "epoch": 0.005004691898654989, "grad_norm": 2.312218189239502, "learning_rate": 0.00019899159663865548, "loss": 1.8112, "step": 8 }, { "epoch": 0.005630278385986863, "grad_norm": 3.5389914512634277, "learning_rate": 0.00019865546218487395, "loss": 2.0504, "step": 9 }, { "epoch": 0.006255864873318737, "grad_norm": 2.913029432296753, "learning_rate": 0.00019831932773109245, "loss": 1.9101, "step": 10 }, { "epoch": 0.00688145136065061, "grad_norm": 3.6916606426239014, "learning_rate": 0.00019798319327731095, "loss": 1.9899, "step": 11 }, { "epoch": 0.007507037847982484, "grad_norm": 3.002810478210449, "learning_rate": 0.00019764705882352942, "loss": 1.3224, "step": 12 }, { "epoch": 0.008132624335314358, "grad_norm": 1.657835602760315, "learning_rate": 0.00019731092436974792, "loss": 1.2208, "step": 13 }, { "epoch": 0.008758210822646231, "grad_norm": 2.414161443710327, "learning_rate": 0.00019697478991596642, "loss": 1.5375, "step": 14 }, { "epoch": 0.009383797309978105, "grad_norm": 1.9695100784301758, "learning_rate": 0.00019663865546218486, "loss": 0.9218, "step": 15 }, { "epoch": 0.010009383797309979, "grad_norm": 3.9755845069885254, "learning_rate": 0.00019630252100840336, "loss": 1.3608, "step": 16 }, { "epoch": 0.010634970284641852, "grad_norm": 6.843455791473389, "learning_rate": 0.00019596638655462186, "loss": 1.2168, "step": 17 }, { "epoch": 0.011260556771973726, "grad_norm": 3.8736443519592285, "learning_rate": 0.00019563025210084033, "loss": 0.7392, "step": 18 }, { "epoch": 0.0118861432593056, "grad_norm": 1.7369539737701416, "learning_rate": 0.00019529411764705883, "loss": 1.0495, "step": 19 }, { "epoch": 0.012511729746637473, "grad_norm": 1.1708225011825562, "learning_rate": 0.0001949579831932773, "loss": 1.2266, "step": 20 }, { "epoch": 0.013137316233969347, "grad_norm": 1.4693603515625, "learning_rate": 0.0001946218487394958, "loss": 1.1364, "step": 21 }, { "epoch": 0.01376290272130122, "grad_norm": 0.8484959602355957, "learning_rate": 0.0001942857142857143, "loss": 0.6253, "step": 22 }, { "epoch": 0.014388489208633094, "grad_norm": 2.7237887382507324, "learning_rate": 0.00019394957983193278, "loss": 1.2932, "step": 23 }, { "epoch": 0.015014075695964968, "grad_norm": 1.1654947996139526, "learning_rate": 0.00019361344537815127, "loss": 0.5659, "step": 24 }, { "epoch": 0.01563966218329684, "grad_norm": 1.7193485498428345, "learning_rate": 0.00019327731092436975, "loss": 1.3627, "step": 25 }, { "epoch": 0.01563966218329684, "eval_loss": 1.0974555015563965, "eval_runtime": 46.8133, "eval_samples_per_second": 5.469, "eval_steps_per_second": 2.734, "step": 25 }, { "epoch": 0.016265248670628715, "grad_norm": 2.883988380432129, "learning_rate": 0.00019294117647058825, "loss": 0.6257, "step": 26 }, { "epoch": 0.01689083515796059, "grad_norm": 1.4707483053207397, "learning_rate": 0.00019260504201680674, "loss": 0.879, "step": 27 }, { "epoch": 0.017516421645292463, "grad_norm": 1.3346422910690308, "learning_rate": 0.00019226890756302522, "loss": 1.0058, "step": 28 }, { "epoch": 0.018142008132624336, "grad_norm": 0.5815519094467163, "learning_rate": 0.00019193277310924372, "loss": 0.3475, "step": 29 }, { "epoch": 0.01876759461995621, "grad_norm": 0.8800593018531799, "learning_rate": 0.00019159663865546221, "loss": 0.5426, "step": 30 }, { "epoch": 0.019393181107288084, "grad_norm": 8.196944236755371, "learning_rate": 0.0001912605042016807, "loss": 1.0088, "step": 31 }, { "epoch": 0.020018767594619957, "grad_norm": 3.264193296432495, "learning_rate": 0.00019092436974789919, "loss": 0.9319, "step": 32 }, { "epoch": 0.02064435408195183, "grad_norm": 1.1047834157943726, "learning_rate": 0.00019058823529411766, "loss": 0.9262, "step": 33 }, { "epoch": 0.021269940569283705, "grad_norm": 1.982783555984497, "learning_rate": 0.00019025210084033613, "loss": 1.2904, "step": 34 }, { "epoch": 0.021895527056615578, "grad_norm": 2.6765289306640625, "learning_rate": 0.00018991596638655463, "loss": 1.0785, "step": 35 }, { "epoch": 0.022521113543947452, "grad_norm": 4.674818992614746, "learning_rate": 0.0001895798319327731, "loss": 0.9822, "step": 36 }, { "epoch": 0.023146700031279326, "grad_norm": 1.6232353448867798, "learning_rate": 0.0001892436974789916, "loss": 0.6441, "step": 37 }, { "epoch": 0.0237722865186112, "grad_norm": 2.623237371444702, "learning_rate": 0.0001889075630252101, "loss": 0.8874, "step": 38 }, { "epoch": 0.024397873005943073, "grad_norm": 1.4366761445999146, "learning_rate": 0.00018857142857142857, "loss": 0.4596, "step": 39 }, { "epoch": 0.025023459493274947, "grad_norm": 1.8809682130813599, "learning_rate": 0.00018823529411764707, "loss": 0.887, "step": 40 }, { "epoch": 0.02564904598060682, "grad_norm": 1.081438660621643, "learning_rate": 0.00018789915966386554, "loss": 0.4735, "step": 41 }, { "epoch": 0.026274632467938694, "grad_norm": 2.1302649974823, "learning_rate": 0.00018756302521008404, "loss": 0.795, "step": 42 }, { "epoch": 0.026900218955270568, "grad_norm": 2.005425453186035, "learning_rate": 0.00018722689075630254, "loss": 0.8891, "step": 43 }, { "epoch": 0.02752580544260244, "grad_norm": 1.7256505489349365, "learning_rate": 0.000186890756302521, "loss": 0.5993, "step": 44 }, { "epoch": 0.028151391929934315, "grad_norm": 0.927653968334198, "learning_rate": 0.0001865546218487395, "loss": 0.6185, "step": 45 }, { "epoch": 0.02877697841726619, "grad_norm": 1.5710850954055786, "learning_rate": 0.000186218487394958, "loss": 0.5258, "step": 46 }, { "epoch": 0.029402564904598062, "grad_norm": 1.8794296979904175, "learning_rate": 0.00018588235294117648, "loss": 0.8168, "step": 47 }, { "epoch": 0.030028151391929936, "grad_norm": 0.9695333242416382, "learning_rate": 0.00018554621848739498, "loss": 0.5458, "step": 48 }, { "epoch": 0.03065373787926181, "grad_norm": 3.7846665382385254, "learning_rate": 0.00018521008403361345, "loss": 0.943, "step": 49 }, { "epoch": 0.03127932436659368, "grad_norm": 1.9213052988052368, "learning_rate": 0.00018487394957983195, "loss": 0.5069, "step": 50 }, { "epoch": 0.03127932436659368, "eval_loss": 0.9765783548355103, "eval_runtime": 43.502, "eval_samples_per_second": 5.885, "eval_steps_per_second": 2.942, "step": 50 }, { "epoch": 0.03190491085392556, "grad_norm": 2.0580382347106934, "learning_rate": 0.00018453781512605045, "loss": 0.9423, "step": 51 }, { "epoch": 0.03253049734125743, "grad_norm": 2.063591957092285, "learning_rate": 0.0001842016806722689, "loss": 0.7054, "step": 52 }, { "epoch": 0.033156083828589304, "grad_norm": 1.2656595706939697, "learning_rate": 0.0001838655462184874, "loss": 0.401, "step": 53 }, { "epoch": 0.03378167031592118, "grad_norm": 1.2392399311065674, "learning_rate": 0.0001835294117647059, "loss": 0.6077, "step": 54 }, { "epoch": 0.03440725680325305, "grad_norm": 0.99504154920578, "learning_rate": 0.00018319327731092437, "loss": 0.6313, "step": 55 }, { "epoch": 0.035032843290584925, "grad_norm": 2.0478012561798096, "learning_rate": 0.00018285714285714286, "loss": 1.2652, "step": 56 }, { "epoch": 0.0356584297779168, "grad_norm": 0.9636131525039673, "learning_rate": 0.00018252100840336134, "loss": 0.7561, "step": 57 }, { "epoch": 0.03628401626524867, "grad_norm": 0.874576210975647, "learning_rate": 0.00018218487394957984, "loss": 0.7461, "step": 58 }, { "epoch": 0.036909602752580546, "grad_norm": 1.3745896816253662, "learning_rate": 0.00018184873949579833, "loss": 1.2856, "step": 59 }, { "epoch": 0.03753518923991242, "grad_norm": 2.4839162826538086, "learning_rate": 0.0001815126050420168, "loss": 1.0574, "step": 60 }, { "epoch": 0.038160775727244294, "grad_norm": 1.2671383619308472, "learning_rate": 0.0001811764705882353, "loss": 0.6177, "step": 61 }, { "epoch": 0.03878636221457617, "grad_norm": 1.1862553358078003, "learning_rate": 0.0001808403361344538, "loss": 1.1169, "step": 62 }, { "epoch": 0.03941194870190804, "grad_norm": 1.1347297430038452, "learning_rate": 0.00018050420168067228, "loss": 1.3303, "step": 63 }, { "epoch": 0.040037535189239915, "grad_norm": 2.1583523750305176, "learning_rate": 0.00018016806722689078, "loss": 0.7941, "step": 64 }, { "epoch": 0.04066312167657179, "grad_norm": 1.2432655096054077, "learning_rate": 0.00017983193277310925, "loss": 0.7848, "step": 65 }, { "epoch": 0.04128870816390366, "grad_norm": 1.3345468044281006, "learning_rate": 0.00017949579831932775, "loss": 0.8953, "step": 66 }, { "epoch": 0.041914294651235535, "grad_norm": 0.6861767768859863, "learning_rate": 0.00017915966386554625, "loss": 0.4162, "step": 67 }, { "epoch": 0.04253988113856741, "grad_norm": 0.85309898853302, "learning_rate": 0.00017882352941176472, "loss": 0.6606, "step": 68 }, { "epoch": 0.04316546762589928, "grad_norm": 1.0247780084609985, "learning_rate": 0.00017848739495798322, "loss": 0.5271, "step": 69 }, { "epoch": 0.043791054113231156, "grad_norm": 1.3019441366195679, "learning_rate": 0.0001781512605042017, "loss": 0.5605, "step": 70 }, { "epoch": 0.04441664060056303, "grad_norm": 1.1024900674819946, "learning_rate": 0.00017781512605042016, "loss": 0.9303, "step": 71 }, { "epoch": 0.045042227087894904, "grad_norm": 1.079655408859253, "learning_rate": 0.00017747899159663866, "loss": 1.0138, "step": 72 }, { "epoch": 0.04566781357522678, "grad_norm": 1.1078468561172485, "learning_rate": 0.00017714285714285713, "loss": 0.9861, "step": 73 }, { "epoch": 0.04629340006255865, "grad_norm": 1.8648931980133057, "learning_rate": 0.00017680672268907563, "loss": 0.6756, "step": 74 }, { "epoch": 0.046918986549890525, "grad_norm": 0.8588104248046875, "learning_rate": 0.00017647058823529413, "loss": 0.4867, "step": 75 }, { "epoch": 0.046918986549890525, "eval_loss": 0.9139823913574219, "eval_runtime": 43.5635, "eval_samples_per_second": 5.876, "eval_steps_per_second": 2.938, "step": 75 }, { "epoch": 0.0475445730372224, "grad_norm": 1.6970480680465698, "learning_rate": 0.0001761344537815126, "loss": 0.5523, "step": 76 }, { "epoch": 0.04817015952455427, "grad_norm": 0.8562026023864746, "learning_rate": 0.0001757983193277311, "loss": 0.4084, "step": 77 }, { "epoch": 0.048795746011886146, "grad_norm": 0.9487925171852112, "learning_rate": 0.0001754621848739496, "loss": 0.6204, "step": 78 }, { "epoch": 0.04942133249921802, "grad_norm": 11.929024696350098, "learning_rate": 0.00017512605042016807, "loss": 1.1662, "step": 79 }, { "epoch": 0.05004691898654989, "grad_norm": 1.3468140363693237, "learning_rate": 0.00017478991596638657, "loss": 0.8037, "step": 80 }, { "epoch": 0.05067250547388177, "grad_norm": 0.7379503846168518, "learning_rate": 0.00017445378151260504, "loss": 0.6564, "step": 81 }, { "epoch": 0.05129809196121364, "grad_norm": 1.0315027236938477, "learning_rate": 0.00017411764705882354, "loss": 0.6377, "step": 82 }, { "epoch": 0.051923678448545514, "grad_norm": 0.5900093913078308, "learning_rate": 0.00017378151260504204, "loss": 0.5122, "step": 83 }, { "epoch": 0.05254926493587739, "grad_norm": 1.5138239860534668, "learning_rate": 0.0001734453781512605, "loss": 0.4769, "step": 84 }, { "epoch": 0.05317485142320926, "grad_norm": 1.016790747642517, "learning_rate": 0.000173109243697479, "loss": 0.6654, "step": 85 }, { "epoch": 0.053800437910541135, "grad_norm": 1.1964718103408813, "learning_rate": 0.00017277310924369748, "loss": 0.6334, "step": 86 }, { "epoch": 0.05442602439787301, "grad_norm": 1.102842092514038, "learning_rate": 0.00017243697478991598, "loss": 0.832, "step": 87 }, { "epoch": 0.05505161088520488, "grad_norm": 6.609305381774902, "learning_rate": 0.00017210084033613448, "loss": 0.6112, "step": 88 }, { "epoch": 0.055677197372536756, "grad_norm": 2.6627745628356934, "learning_rate": 0.00017176470588235293, "loss": 1.032, "step": 89 }, { "epoch": 0.05630278385986863, "grad_norm": 2.114955425262451, "learning_rate": 0.00017142857142857143, "loss": 0.6116, "step": 90 }, { "epoch": 0.0569283703472005, "grad_norm": 1.7707552909851074, "learning_rate": 0.00017109243697478992, "loss": 0.4766, "step": 91 }, { "epoch": 0.05755395683453238, "grad_norm": 0.9983264803886414, "learning_rate": 0.0001707563025210084, "loss": 0.5397, "step": 92 }, { "epoch": 0.05817954332186425, "grad_norm": 8.190524101257324, "learning_rate": 0.0001704201680672269, "loss": 0.9531, "step": 93 }, { "epoch": 0.058805129809196124, "grad_norm": 1.9920661449432373, "learning_rate": 0.0001700840336134454, "loss": 1.3801, "step": 94 }, { "epoch": 0.059430716296528, "grad_norm": 0.8791856169700623, "learning_rate": 0.00016974789915966387, "loss": 0.6218, "step": 95 }, { "epoch": 0.06005630278385987, "grad_norm": 1.0745537281036377, "learning_rate": 0.00016941176470588237, "loss": 0.5578, "step": 96 }, { "epoch": 0.060681889271191745, "grad_norm": 1.4266705513000488, "learning_rate": 0.00016907563025210084, "loss": 1.5821, "step": 97 }, { "epoch": 0.06130747575852362, "grad_norm": 1.1001832485198975, "learning_rate": 0.00016873949579831934, "loss": 0.5972, "step": 98 }, { "epoch": 0.06193306224585549, "grad_norm": 1.3168463706970215, "learning_rate": 0.00016840336134453784, "loss": 0.5794, "step": 99 }, { "epoch": 0.06255864873318737, "grad_norm": 1.0342196226119995, "learning_rate": 0.0001680672268907563, "loss": 0.6827, "step": 100 }, { "epoch": 0.06255864873318737, "eval_loss": 0.8885337114334106, "eval_runtime": 43.4886, "eval_samples_per_second": 5.887, "eval_steps_per_second": 2.943, "step": 100 }, { "epoch": 0.06318423522051923, "grad_norm": 2.2497031688690186, "learning_rate": 0.0001677310924369748, "loss": 0.6468, "step": 101 }, { "epoch": 0.06380982170785111, "grad_norm": 0.8061516284942627, "learning_rate": 0.00016739495798319328, "loss": 0.5388, "step": 102 }, { "epoch": 0.06443540819518298, "grad_norm": 0.6954531669616699, "learning_rate": 0.00016705882352941178, "loss": 0.3191, "step": 103 }, { "epoch": 0.06506099468251486, "grad_norm": 1.3721911907196045, "learning_rate": 0.00016672268907563028, "loss": 0.9, "step": 104 }, { "epoch": 0.06568658116984673, "grad_norm": 1.084492564201355, "learning_rate": 0.00016638655462184875, "loss": 0.6144, "step": 105 }, { "epoch": 0.06631216765717861, "grad_norm": 3.317697525024414, "learning_rate": 0.00016605042016806725, "loss": 0.634, "step": 106 }, { "epoch": 0.06693775414451048, "grad_norm": 2.5598530769348145, "learning_rate": 0.00016571428571428575, "loss": 0.8931, "step": 107 }, { "epoch": 0.06756334063184236, "grad_norm": 3.6414177417755127, "learning_rate": 0.0001653781512605042, "loss": 0.7226, "step": 108 }, { "epoch": 0.06818892711917422, "grad_norm": 2.2443768978118896, "learning_rate": 0.0001650420168067227, "loss": 0.8862, "step": 109 }, { "epoch": 0.0688145136065061, "grad_norm": 0.6285691857337952, "learning_rate": 0.0001647058823529412, "loss": 0.3766, "step": 110 }, { "epoch": 0.06944010009383797, "grad_norm": 0.6171959042549133, "learning_rate": 0.00016436974789915966, "loss": 0.2821, "step": 111 }, { "epoch": 0.07006568658116985, "grad_norm": 1.0057804584503174, "learning_rate": 0.00016403361344537816, "loss": 0.6293, "step": 112 }, { "epoch": 0.07069127306850172, "grad_norm": 1.3190034627914429, "learning_rate": 0.00016369747899159663, "loss": 0.5547, "step": 113 }, { "epoch": 0.0713168595558336, "grad_norm": 0.518517017364502, "learning_rate": 0.00016336134453781513, "loss": 0.1951, "step": 114 }, { "epoch": 0.07194244604316546, "grad_norm": 0.848175048828125, "learning_rate": 0.00016302521008403363, "loss": 0.5091, "step": 115 }, { "epoch": 0.07256803253049735, "grad_norm": 0.7387409806251526, "learning_rate": 0.0001626890756302521, "loss": 0.3872, "step": 116 }, { "epoch": 0.07319361901782921, "grad_norm": 2.828091859817505, "learning_rate": 0.0001623529411764706, "loss": 1.2046, "step": 117 }, { "epoch": 0.07381920550516109, "grad_norm": 1.7653822898864746, "learning_rate": 0.00016201680672268907, "loss": 1.8133, "step": 118 }, { "epoch": 0.07444479199249296, "grad_norm": 3.5097360610961914, "learning_rate": 0.00016168067226890757, "loss": 0.6837, "step": 119 }, { "epoch": 0.07507037847982484, "grad_norm": 1.3884797096252441, "learning_rate": 0.00016134453781512607, "loss": 0.8846, "step": 120 }, { "epoch": 0.0756959649671567, "grad_norm": 22.705190658569336, "learning_rate": 0.00016100840336134454, "loss": 0.7281, "step": 121 }, { "epoch": 0.07632155145448859, "grad_norm": 3.1223599910736084, "learning_rate": 0.00016067226890756304, "loss": 0.6254, "step": 122 }, { "epoch": 0.07694713794182045, "grad_norm": 0.530583381652832, "learning_rate": 0.00016033613445378154, "loss": 0.3292, "step": 123 }, { "epoch": 0.07757272442915233, "grad_norm": 1.4720183610916138, "learning_rate": 0.00016, "loss": 0.8192, "step": 124 }, { "epoch": 0.0781983109164842, "grad_norm": 0.6448870301246643, "learning_rate": 0.0001596638655462185, "loss": 0.2431, "step": 125 }, { "epoch": 0.0781983109164842, "eval_loss": 0.890012800693512, "eval_runtime": 43.5059, "eval_samples_per_second": 5.884, "eval_steps_per_second": 2.942, "step": 125 }, { "epoch": 0.07882389740381608, "grad_norm": 1.803906798362732, "learning_rate": 0.00015932773109243698, "loss": 0.8937, "step": 126 }, { "epoch": 0.07944948389114795, "grad_norm": 2.2447054386138916, "learning_rate": 0.00015899159663865546, "loss": 0.6993, "step": 127 }, { "epoch": 0.08007507037847983, "grad_norm": 0.6667381525039673, "learning_rate": 0.00015865546218487396, "loss": 0.4266, "step": 128 }, { "epoch": 0.0807006568658117, "grad_norm": 1.1449408531188965, "learning_rate": 0.00015831932773109243, "loss": 0.5557, "step": 129 }, { "epoch": 0.08132624335314358, "grad_norm": 1.399849534034729, "learning_rate": 0.00015798319327731093, "loss": 0.6761, "step": 130 }, { "epoch": 0.08195182984047544, "grad_norm": 0.745627760887146, "learning_rate": 0.00015764705882352943, "loss": 0.5323, "step": 131 }, { "epoch": 0.08257741632780732, "grad_norm": 1.162428379058838, "learning_rate": 0.0001573109243697479, "loss": 0.8231, "step": 132 }, { "epoch": 0.08320300281513919, "grad_norm": 1.0329734086990356, "learning_rate": 0.0001569747899159664, "loss": 0.6179, "step": 133 }, { "epoch": 0.08382858930247107, "grad_norm": 0.5739912986755371, "learning_rate": 0.00015663865546218487, "loss": 0.2515, "step": 134 }, { "epoch": 0.08445417578980294, "grad_norm": 1.2065409421920776, "learning_rate": 0.00015630252100840337, "loss": 0.6161, "step": 135 }, { "epoch": 0.08507976227713482, "grad_norm": 1.1025582551956177, "learning_rate": 0.00015596638655462187, "loss": 0.5926, "step": 136 }, { "epoch": 0.08570534876446669, "grad_norm": 0.78680020570755, "learning_rate": 0.00015563025210084034, "loss": 0.9987, "step": 137 }, { "epoch": 0.08633093525179857, "grad_norm": 0.6232782006263733, "learning_rate": 0.00015529411764705884, "loss": 0.4952, "step": 138 }, { "epoch": 0.08695652173913043, "grad_norm": 3.347989559173584, "learning_rate": 0.00015495798319327734, "loss": 1.0787, "step": 139 }, { "epoch": 0.08758210822646231, "grad_norm": 0.9020625352859497, "learning_rate": 0.0001546218487394958, "loss": 0.354, "step": 140 }, { "epoch": 0.08820769471379418, "grad_norm": 1.8955539464950562, "learning_rate": 0.0001542857142857143, "loss": 0.5515, "step": 141 }, { "epoch": 0.08883328120112606, "grad_norm": 5.194116115570068, "learning_rate": 0.00015394957983193278, "loss": 0.6843, "step": 142 }, { "epoch": 0.08945886768845793, "grad_norm": 1.4467953443527222, "learning_rate": 0.00015361344537815128, "loss": 0.4236, "step": 143 }, { "epoch": 0.09008445417578981, "grad_norm": 0.523921012878418, "learning_rate": 0.00015327731092436978, "loss": 0.2165, "step": 144 }, { "epoch": 0.09071004066312167, "grad_norm": 1.653648018836975, "learning_rate": 0.00015294117647058822, "loss": 1.0643, "step": 145 }, { "epoch": 0.09133562715045355, "grad_norm": 0.6991509199142456, "learning_rate": 0.00015260504201680672, "loss": 0.4398, "step": 146 }, { "epoch": 0.09196121363778542, "grad_norm": 1.3986660242080688, "learning_rate": 0.00015226890756302522, "loss": 0.8488, "step": 147 }, { "epoch": 0.0925868001251173, "grad_norm": 1.2424954175949097, "learning_rate": 0.0001519327731092437, "loss": 0.9516, "step": 148 }, { "epoch": 0.09321238661244917, "grad_norm": 0.8900560140609741, "learning_rate": 0.0001515966386554622, "loss": 0.767, "step": 149 }, { "epoch": 0.09383797309978105, "grad_norm": 40.042503356933594, "learning_rate": 0.00015126050420168066, "loss": 0.9691, "step": 150 }, { "epoch": 0.09383797309978105, "eval_loss": 0.8660734295845032, "eval_runtime": 43.5102, "eval_samples_per_second": 5.884, "eval_steps_per_second": 2.942, "step": 150 }, { "epoch": 0.09446355958711292, "grad_norm": 2.816359519958496, "learning_rate": 0.00015092436974789916, "loss": 1.4959, "step": 151 }, { "epoch": 0.0950891460744448, "grad_norm": 1.9332157373428345, "learning_rate": 0.00015058823529411766, "loss": 0.6786, "step": 152 }, { "epoch": 0.09571473256177666, "grad_norm": 1.2608965635299683, "learning_rate": 0.00015025210084033613, "loss": 1.1282, "step": 153 }, { "epoch": 0.09634031904910854, "grad_norm": 1.0167793035507202, "learning_rate": 0.00014991596638655463, "loss": 0.4932, "step": 154 }, { "epoch": 0.09696590553644041, "grad_norm": 1.6121408939361572, "learning_rate": 0.00014957983193277313, "loss": 0.7193, "step": 155 }, { "epoch": 0.09759149202377229, "grad_norm": 2.4104394912719727, "learning_rate": 0.0001492436974789916, "loss": 0.4472, "step": 156 }, { "epoch": 0.09821707851110416, "grad_norm": 1.1095707416534424, "learning_rate": 0.0001489075630252101, "loss": 0.7595, "step": 157 }, { "epoch": 0.09884266499843604, "grad_norm": 1.686458945274353, "learning_rate": 0.00014857142857142857, "loss": 0.5686, "step": 158 }, { "epoch": 0.0994682514857679, "grad_norm": 3.2238378524780273, "learning_rate": 0.00014823529411764707, "loss": 0.4236, "step": 159 }, { "epoch": 0.10009383797309979, "grad_norm": 1.800552248954773, "learning_rate": 0.00014789915966386557, "loss": 0.9519, "step": 160 }, { "epoch": 0.10071942446043165, "grad_norm": 0.6441445350646973, "learning_rate": 0.00014756302521008404, "loss": 0.4119, "step": 161 }, { "epoch": 0.10134501094776353, "grad_norm": 0.5892903804779053, "learning_rate": 0.00014722689075630254, "loss": 0.2956, "step": 162 }, { "epoch": 0.1019705974350954, "grad_norm": 0.8733301758766174, "learning_rate": 0.00014689075630252101, "loss": 0.5749, "step": 163 }, { "epoch": 0.10259618392242728, "grad_norm": 1.0460662841796875, "learning_rate": 0.0001465546218487395, "loss": 0.8167, "step": 164 }, { "epoch": 0.10322177040975915, "grad_norm": 0.8178017735481262, "learning_rate": 0.00014621848739495799, "loss": 0.9027, "step": 165 }, { "epoch": 0.10384735689709103, "grad_norm": 0.5698068737983704, "learning_rate": 0.00014588235294117646, "loss": 0.1829, "step": 166 }, { "epoch": 0.1044729433844229, "grad_norm": 1.0011018514633179, "learning_rate": 0.00014554621848739496, "loss": 0.8985, "step": 167 }, { "epoch": 0.10509852987175478, "grad_norm": 1.189772367477417, "learning_rate": 0.00014521008403361346, "loss": 0.5547, "step": 168 }, { "epoch": 0.10572411635908664, "grad_norm": 0.7990069389343262, "learning_rate": 0.00014487394957983193, "loss": 0.6222, "step": 169 }, { "epoch": 0.10634970284641852, "grad_norm": 0.6419771313667297, "learning_rate": 0.00014453781512605043, "loss": 0.3225, "step": 170 }, { "epoch": 0.10697528933375039, "grad_norm": 0.8978354930877686, "learning_rate": 0.00014420168067226893, "loss": 0.4567, "step": 171 }, { "epoch": 0.10760087582108227, "grad_norm": 0.7193794250488281, "learning_rate": 0.0001438655462184874, "loss": 0.4793, "step": 172 }, { "epoch": 0.10822646230841414, "grad_norm": 0.9533759355545044, "learning_rate": 0.0001435294117647059, "loss": 1.4397, "step": 173 }, { "epoch": 0.10885204879574602, "grad_norm": 0.48348739743232727, "learning_rate": 0.00014319327731092437, "loss": 0.3398, "step": 174 }, { "epoch": 0.10947763528307788, "grad_norm": 0.7699019312858582, "learning_rate": 0.00014285714285714287, "loss": 0.7491, "step": 175 }, { "epoch": 0.10947763528307788, "eval_loss": 0.8425782322883606, "eval_runtime": 43.5013, "eval_samples_per_second": 5.885, "eval_steps_per_second": 2.942, "step": 175 }, { "epoch": 0.11010322177040976, "grad_norm": 0.9201186895370483, "learning_rate": 0.00014252100840336137, "loss": 0.6919, "step": 176 }, { "epoch": 0.11072880825774163, "grad_norm": 0.8190593123435974, "learning_rate": 0.00014218487394957984, "loss": 0.6262, "step": 177 }, { "epoch": 0.11135439474507351, "grad_norm": 0.9715782403945923, "learning_rate": 0.00014184873949579834, "loss": 0.8364, "step": 178 }, { "epoch": 0.11197998123240538, "grad_norm": 0.6699782609939575, "learning_rate": 0.0001415126050420168, "loss": 0.4898, "step": 179 }, { "epoch": 0.11260556771973726, "grad_norm": 1.8386518955230713, "learning_rate": 0.0001411764705882353, "loss": 0.7812, "step": 180 }, { "epoch": 0.11323115420706913, "grad_norm": 0.7240263819694519, "learning_rate": 0.0001408403361344538, "loss": 0.5508, "step": 181 }, { "epoch": 0.113856740694401, "grad_norm": 0.6068630814552307, "learning_rate": 0.00014050420168067225, "loss": 0.5151, "step": 182 }, { "epoch": 0.11448232718173287, "grad_norm": 1.6705517768859863, "learning_rate": 0.00014016806722689075, "loss": 1.2281, "step": 183 }, { "epoch": 0.11510791366906475, "grad_norm": 1.6179956197738647, "learning_rate": 0.00013983193277310925, "loss": 0.7365, "step": 184 }, { "epoch": 0.11573350015639662, "grad_norm": 1.5741758346557617, "learning_rate": 0.00013949579831932772, "loss": 1.0039, "step": 185 }, { "epoch": 0.1163590866437285, "grad_norm": 0.9270511865615845, "learning_rate": 0.00013915966386554622, "loss": 0.5768, "step": 186 }, { "epoch": 0.11698467313106037, "grad_norm": 1.3651914596557617, "learning_rate": 0.00013882352941176472, "loss": 0.7715, "step": 187 }, { "epoch": 0.11761025961839225, "grad_norm": 1.4330601692199707, "learning_rate": 0.0001384873949579832, "loss": 0.4462, "step": 188 }, { "epoch": 0.11823584610572412, "grad_norm": 0.9181672930717468, "learning_rate": 0.0001381512605042017, "loss": 0.3901, "step": 189 }, { "epoch": 0.118861432593056, "grad_norm": 0.5304622650146484, "learning_rate": 0.00013781512605042016, "loss": 0.1718, "step": 190 }, { "epoch": 0.11948701908038786, "grad_norm": 0.7475191354751587, "learning_rate": 0.00013747899159663866, "loss": 0.3602, "step": 191 }, { "epoch": 0.12011260556771974, "grad_norm": 1.2558002471923828, "learning_rate": 0.00013714285714285716, "loss": 0.8558, "step": 192 }, { "epoch": 0.12073819205505161, "grad_norm": 0.9859037399291992, "learning_rate": 0.00013680672268907563, "loss": 0.7155, "step": 193 }, { "epoch": 0.12136377854238349, "grad_norm": 0.6028466820716858, "learning_rate": 0.00013647058823529413, "loss": 0.9596, "step": 194 }, { "epoch": 0.12198936502971536, "grad_norm": 0.5713469386100769, "learning_rate": 0.0001361344537815126, "loss": 0.3442, "step": 195 }, { "epoch": 0.12261495151704724, "grad_norm": 1.0781211853027344, "learning_rate": 0.0001357983193277311, "loss": 0.5569, "step": 196 }, { "epoch": 0.1232405380043791, "grad_norm": 0.7850176095962524, "learning_rate": 0.0001354621848739496, "loss": 0.5853, "step": 197 }, { "epoch": 0.12386612449171099, "grad_norm": 0.8100555539131165, "learning_rate": 0.00013512605042016807, "loss": 0.8285, "step": 198 }, { "epoch": 0.12449171097904285, "grad_norm": 1.106834888458252, "learning_rate": 0.00013478991596638657, "loss": 0.9521, "step": 199 }, { "epoch": 0.12511729746637473, "grad_norm": 1.4412230253219604, "learning_rate": 0.00013445378151260507, "loss": 0.6478, "step": 200 }, { "epoch": 0.12511729746637473, "eval_loss": 0.8300326466560364, "eval_runtime": 43.5102, "eval_samples_per_second": 5.884, "eval_steps_per_second": 2.942, "step": 200 }, { "epoch": 0.1257428839537066, "grad_norm": 1.7852795124053955, "learning_rate": 0.00013411764705882352, "loss": 0.5687, "step": 201 }, { "epoch": 0.12636847044103847, "grad_norm": 2.423583745956421, "learning_rate": 0.00013378151260504202, "loss": 0.9082, "step": 202 }, { "epoch": 0.12699405692837035, "grad_norm": 1.538001298904419, "learning_rate": 0.00013344537815126052, "loss": 0.7143, "step": 203 }, { "epoch": 0.12761964341570223, "grad_norm": 1.7380592823028564, "learning_rate": 0.000133109243697479, "loss": 0.8296, "step": 204 }, { "epoch": 0.1282452299030341, "grad_norm": 0.8279218673706055, "learning_rate": 0.0001327731092436975, "loss": 0.6719, "step": 205 }, { "epoch": 0.12887081639036596, "grad_norm": 0.7059926986694336, "learning_rate": 0.00013243697478991596, "loss": 0.4785, "step": 206 }, { "epoch": 0.12949640287769784, "grad_norm": 0.6946935653686523, "learning_rate": 0.00013210084033613446, "loss": 0.4578, "step": 207 }, { "epoch": 0.13012198936502972, "grad_norm": 0.9800712466239929, "learning_rate": 0.00013176470588235296, "loss": 1.4369, "step": 208 }, { "epoch": 0.1307475758523616, "grad_norm": 0.708831787109375, "learning_rate": 0.00013142857142857143, "loss": 0.5071, "step": 209 }, { "epoch": 0.13137316233969346, "grad_norm": 1.0098780393600464, "learning_rate": 0.00013109243697478993, "loss": 0.9155, "step": 210 }, { "epoch": 0.13199874882702534, "grad_norm": 1.1598243713378906, "learning_rate": 0.0001307563025210084, "loss": 0.3757, "step": 211 }, { "epoch": 0.13262433531435722, "grad_norm": 0.7583935260772705, "learning_rate": 0.0001304201680672269, "loss": 0.3365, "step": 212 }, { "epoch": 0.1332499218016891, "grad_norm": 1.0866564512252808, "learning_rate": 0.0001300840336134454, "loss": 0.6398, "step": 213 }, { "epoch": 0.13387550828902095, "grad_norm": 1.4322006702423096, "learning_rate": 0.00012974789915966387, "loss": 0.6427, "step": 214 }, { "epoch": 0.13450109477635283, "grad_norm": 1.600325345993042, "learning_rate": 0.00012941176470588237, "loss": 0.6884, "step": 215 }, { "epoch": 0.1351266812636847, "grad_norm": 1.0634167194366455, "learning_rate": 0.00012907563025210087, "loss": 1.0343, "step": 216 }, { "epoch": 0.13575226775101656, "grad_norm": 0.9889366626739502, "learning_rate": 0.00012873949579831934, "loss": 0.717, "step": 217 }, { "epoch": 0.13637785423834844, "grad_norm": 2.0635392665863037, "learning_rate": 0.00012840336134453784, "loss": 0.5965, "step": 218 }, { "epoch": 0.13700344072568033, "grad_norm": 0.8937773704528809, "learning_rate": 0.0001280672268907563, "loss": 0.7281, "step": 219 }, { "epoch": 0.1376290272130122, "grad_norm": 0.9768427014350891, "learning_rate": 0.00012773109243697478, "loss": 0.5687, "step": 220 }, { "epoch": 0.13825461370034406, "grad_norm": 1.3913767337799072, "learning_rate": 0.00012739495798319328, "loss": 0.3984, "step": 221 }, { "epoch": 0.13888020018767594, "grad_norm": 1.4933342933654785, "learning_rate": 0.00012705882352941175, "loss": 1.2441, "step": 222 }, { "epoch": 0.13950578667500782, "grad_norm": 1.0846196413040161, "learning_rate": 0.00012672268907563025, "loss": 0.9013, "step": 223 }, { "epoch": 0.1401313731623397, "grad_norm": 0.7788563370704651, "learning_rate": 0.00012638655462184875, "loss": 0.4674, "step": 224 }, { "epoch": 0.14075695964967155, "grad_norm": 0.7341142296791077, "learning_rate": 0.00012605042016806722, "loss": 1.3271, "step": 225 }, { "epoch": 0.14075695964967155, "eval_loss": 0.8179877996444702, "eval_runtime": 43.5514, "eval_samples_per_second": 5.878, "eval_steps_per_second": 2.939, "step": 225 }, { "epoch": 0.14138254613700343, "grad_norm": 6.473598480224609, "learning_rate": 0.00012571428571428572, "loss": 0.6219, "step": 226 }, { "epoch": 0.14200813262433531, "grad_norm": 0.9846400022506714, "learning_rate": 0.0001253781512605042, "loss": 0.4407, "step": 227 }, { "epoch": 0.1426337191116672, "grad_norm": 0.7880604267120361, "learning_rate": 0.0001250420168067227, "loss": 0.3927, "step": 228 }, { "epoch": 0.14325930559899905, "grad_norm": 1.5999399423599243, "learning_rate": 0.0001247058823529412, "loss": 0.6917, "step": 229 }, { "epoch": 0.14388489208633093, "grad_norm": 0.8072729110717773, "learning_rate": 0.00012436974789915966, "loss": 0.4909, "step": 230 }, { "epoch": 0.1445104785736628, "grad_norm": 2.2560601234436035, "learning_rate": 0.00012403361344537816, "loss": 0.3355, "step": 231 }, { "epoch": 0.1451360650609947, "grad_norm": 0.9964832663536072, "learning_rate": 0.00012369747899159666, "loss": 0.4436, "step": 232 }, { "epoch": 0.14576165154832654, "grad_norm": 1.1081007719039917, "learning_rate": 0.00012336134453781513, "loss": 0.6582, "step": 233 }, { "epoch": 0.14638723803565842, "grad_norm": 0.9722908735275269, "learning_rate": 0.00012302521008403363, "loss": 0.7412, "step": 234 }, { "epoch": 0.1470128245229903, "grad_norm": 0.7456592917442322, "learning_rate": 0.0001226890756302521, "loss": 0.4303, "step": 235 }, { "epoch": 0.14763841101032218, "grad_norm": 1.0428457260131836, "learning_rate": 0.0001223529411764706, "loss": 1.0538, "step": 236 }, { "epoch": 0.14826399749765404, "grad_norm": 0.9209719896316528, "learning_rate": 0.00012201680672268909, "loss": 0.5864, "step": 237 }, { "epoch": 0.14888958398498592, "grad_norm": 0.990292489528656, "learning_rate": 0.00012168067226890756, "loss": 0.5929, "step": 238 }, { "epoch": 0.1495151704723178, "grad_norm": 0.6086494326591492, "learning_rate": 0.00012134453781512605, "loss": 0.4436, "step": 239 }, { "epoch": 0.15014075695964968, "grad_norm": 1.429149866104126, "learning_rate": 0.00012100840336134453, "loss": 0.246, "step": 240 }, { "epoch": 0.15076634344698153, "grad_norm": 1.8170491456985474, "learning_rate": 0.00012067226890756302, "loss": 0.6574, "step": 241 }, { "epoch": 0.1513919299343134, "grad_norm": 1.1577768325805664, "learning_rate": 0.00012033613445378152, "loss": 0.5706, "step": 242 }, { "epoch": 0.1520175164216453, "grad_norm": 0.7442137598991394, "learning_rate": 0.00012, "loss": 0.2772, "step": 243 }, { "epoch": 0.15264310290897717, "grad_norm": 1.1375997066497803, "learning_rate": 0.00011966386554621849, "loss": 0.397, "step": 244 }, { "epoch": 0.15326868939630903, "grad_norm": 0.8451513648033142, "learning_rate": 0.00011932773109243697, "loss": 0.5425, "step": 245 }, { "epoch": 0.1538942758836409, "grad_norm": 0.7176560163497925, "learning_rate": 0.00011899159663865547, "loss": 0.4398, "step": 246 }, { "epoch": 0.1545198623709728, "grad_norm": 1.049872875213623, "learning_rate": 0.00011865546218487396, "loss": 0.6479, "step": 247 }, { "epoch": 0.15514544885830467, "grad_norm": 0.6093642115592957, "learning_rate": 0.00011831932773109244, "loss": 0.6125, "step": 248 }, { "epoch": 0.15577103534563652, "grad_norm": 0.9963379502296448, "learning_rate": 0.00011798319327731093, "loss": 0.3768, "step": 249 }, { "epoch": 0.1563966218329684, "grad_norm": 3.4668896198272705, "learning_rate": 0.00011764705882352942, "loss": 0.3744, "step": 250 }, { "epoch": 0.1563966218329684, "eval_loss": 0.8456696271896362, "eval_runtime": 43.5223, "eval_samples_per_second": 5.882, "eval_steps_per_second": 2.941, "step": 250 }, { "epoch": 0.15702220832030028, "grad_norm": 0.6826130747795105, "learning_rate": 0.00011731092436974791, "loss": 0.4877, "step": 251 }, { "epoch": 0.15764779480763216, "grad_norm": 1.8045300245285034, "learning_rate": 0.0001169747899159664, "loss": 0.9699, "step": 252 }, { "epoch": 0.15827338129496402, "grad_norm": 0.7311923503875732, "learning_rate": 0.00011663865546218489, "loss": 0.4648, "step": 253 }, { "epoch": 0.1588989677822959, "grad_norm": 1.7481943368911743, "learning_rate": 0.00011630252100840337, "loss": 0.8871, "step": 254 }, { "epoch": 0.15952455426962778, "grad_norm": 2.6331326961517334, "learning_rate": 0.00011596638655462187, "loss": 0.8109, "step": 255 }, { "epoch": 0.16015014075695966, "grad_norm": 0.899364709854126, "learning_rate": 0.00011563025210084036, "loss": 0.5021, "step": 256 }, { "epoch": 0.1607757272442915, "grad_norm": 0.922218918800354, "learning_rate": 0.00011529411764705881, "loss": 0.5741, "step": 257 }, { "epoch": 0.1614013137316234, "grad_norm": 5.335756301879883, "learning_rate": 0.00011495798319327731, "loss": 0.842, "step": 258 }, { "epoch": 0.16202690021895527, "grad_norm": 0.8632665872573853, "learning_rate": 0.0001146218487394958, "loss": 0.4208, "step": 259 }, { "epoch": 0.16265248670628715, "grad_norm": 4.576591968536377, "learning_rate": 0.00011428571428571428, "loss": 0.8813, "step": 260 }, { "epoch": 0.163278073193619, "grad_norm": 0.907714307308197, "learning_rate": 0.00011394957983193277, "loss": 0.7204, "step": 261 }, { "epoch": 0.16390365968095089, "grad_norm": 0.8328534960746765, "learning_rate": 0.00011361344537815127, "loss": 0.7552, "step": 262 }, { "epoch": 0.16452924616828277, "grad_norm": 1.0882028341293335, "learning_rate": 0.00011327731092436975, "loss": 0.9079, "step": 263 }, { "epoch": 0.16515483265561465, "grad_norm": 1.0093358755111694, "learning_rate": 0.00011294117647058824, "loss": 0.6284, "step": 264 }, { "epoch": 0.1657804191429465, "grad_norm": 0.853907585144043, "learning_rate": 0.00011260504201680672, "loss": 0.508, "step": 265 }, { "epoch": 0.16640600563027838, "grad_norm": 1.0016460418701172, "learning_rate": 0.00011226890756302521, "loss": 0.597, "step": 266 }, { "epoch": 0.16703159211761026, "grad_norm": 1.0138968229293823, "learning_rate": 0.00011193277310924371, "loss": 0.9238, "step": 267 }, { "epoch": 0.16765717860494214, "grad_norm": 1.1728049516677856, "learning_rate": 0.0001115966386554622, "loss": 0.9152, "step": 268 }, { "epoch": 0.168282765092274, "grad_norm": 1.2228264808654785, "learning_rate": 0.00011126050420168068, "loss": 0.7483, "step": 269 }, { "epoch": 0.16890835157960588, "grad_norm": 0.6260212659835815, "learning_rate": 0.00011092436974789917, "loss": 0.5566, "step": 270 }, { "epoch": 0.16953393806693776, "grad_norm": 0.7589625716209412, "learning_rate": 0.00011058823529411766, "loss": 0.6242, "step": 271 }, { "epoch": 0.17015952455426964, "grad_norm": 1.1016935110092163, "learning_rate": 0.00011025210084033615, "loss": 0.4419, "step": 272 }, { "epoch": 0.1707851110416015, "grad_norm": 0.8092851042747498, "learning_rate": 0.00010991596638655464, "loss": 0.5168, "step": 273 }, { "epoch": 0.17141069752893337, "grad_norm": 1.012885332107544, "learning_rate": 0.00010957983193277312, "loss": 0.4334, "step": 274 }, { "epoch": 0.17203628401626525, "grad_norm": 2.6073336601257324, "learning_rate": 0.00010924369747899159, "loss": 0.5262, "step": 275 }, { "epoch": 0.17203628401626525, "eval_loss": 0.8115787506103516, "eval_runtime": 43.4931, "eval_samples_per_second": 5.886, "eval_steps_per_second": 2.943, "step": 275 }, { "epoch": 0.17266187050359713, "grad_norm": 5.577237606048584, "learning_rate": 0.00010890756302521008, "loss": 1.0595, "step": 276 }, { "epoch": 0.17328745699092898, "grad_norm": 1.1434190273284912, "learning_rate": 0.00010857142857142856, "loss": 0.4401, "step": 277 }, { "epoch": 0.17391304347826086, "grad_norm": 0.951992928981781, "learning_rate": 0.00010823529411764706, "loss": 0.4393, "step": 278 }, { "epoch": 0.17453862996559275, "grad_norm": 0.6695138216018677, "learning_rate": 0.00010789915966386555, "loss": 0.314, "step": 279 }, { "epoch": 0.17516421645292463, "grad_norm": 0.40990278124809265, "learning_rate": 0.00010756302521008403, "loss": 0.192, "step": 280 }, { "epoch": 0.17578980294025648, "grad_norm": 0.9555610418319702, "learning_rate": 0.00010722689075630252, "loss": 0.3646, "step": 281 }, { "epoch": 0.17641538942758836, "grad_norm": 0.7370548844337463, "learning_rate": 0.000106890756302521, "loss": 0.8997, "step": 282 }, { "epoch": 0.17704097591492024, "grad_norm": 1.0178982019424438, "learning_rate": 0.0001065546218487395, "loss": 0.986, "step": 283 }, { "epoch": 0.17766656240225212, "grad_norm": 0.41388389468193054, "learning_rate": 0.00010621848739495799, "loss": 0.2069, "step": 284 }, { "epoch": 0.17829214888958397, "grad_norm": 0.7140624523162842, "learning_rate": 0.00010588235294117647, "loss": 0.4852, "step": 285 }, { "epoch": 0.17891773537691585, "grad_norm": 0.7758356332778931, "learning_rate": 0.00010554621848739496, "loss": 0.3943, "step": 286 }, { "epoch": 0.17954332186424773, "grad_norm": 1.4193260669708252, "learning_rate": 0.00010521008403361346, "loss": 0.6412, "step": 287 }, { "epoch": 0.18016890835157962, "grad_norm": 0.7264838814735413, "learning_rate": 0.00010487394957983194, "loss": 0.7834, "step": 288 }, { "epoch": 0.18079449483891147, "grad_norm": 2.4300973415374756, "learning_rate": 0.00010453781512605043, "loss": 0.7462, "step": 289 }, { "epoch": 0.18142008132624335, "grad_norm": 1.033916711807251, "learning_rate": 0.00010420168067226892, "loss": 0.5241, "step": 290 }, { "epoch": 0.18204566781357523, "grad_norm": 0.5583767294883728, "learning_rate": 0.00010386554621848741, "loss": 0.7815, "step": 291 }, { "epoch": 0.1826712543009071, "grad_norm": 0.7440481781959534, "learning_rate": 0.0001035294117647059, "loss": 0.4674, "step": 292 }, { "epoch": 0.18329684078823896, "grad_norm": 4.230656147003174, "learning_rate": 0.00010319327731092439, "loss": 0.5219, "step": 293 }, { "epoch": 0.18392242727557084, "grad_norm": 0.6165269017219543, "learning_rate": 0.00010285714285714286, "loss": 0.3274, "step": 294 }, { "epoch": 0.18454801376290272, "grad_norm": 0.5844498872756958, "learning_rate": 0.00010252100840336134, "loss": 0.3719, "step": 295 }, { "epoch": 0.1851736002502346, "grad_norm": 0.9936206936836243, "learning_rate": 0.00010218487394957983, "loss": 1.0453, "step": 296 }, { "epoch": 0.18579918673756646, "grad_norm": 1.749831199645996, "learning_rate": 0.00010184873949579831, "loss": 0.6634, "step": 297 }, { "epoch": 0.18642477322489834, "grad_norm": 0.4740132689476013, "learning_rate": 0.0001015126050420168, "loss": 0.2901, "step": 298 }, { "epoch": 0.18705035971223022, "grad_norm": 0.664300262928009, "learning_rate": 0.0001011764705882353, "loss": 0.5869, "step": 299 }, { "epoch": 0.1876759461995621, "grad_norm": 0.7400941252708435, "learning_rate": 0.00010084033613445378, "loss": 0.7881, "step": 300 }, { "epoch": 0.1876759461995621, "eval_loss": 0.7877693772315979, "eval_runtime": 43.5162, "eval_samples_per_second": 5.883, "eval_steps_per_second": 2.941, "step": 300 }, { "epoch": 0.18830153268689395, "grad_norm": 0.6142858862876892, "learning_rate": 0.00010050420168067227, "loss": 0.3808, "step": 301 }, { "epoch": 0.18892711917422583, "grad_norm": 1.991969347000122, "learning_rate": 0.00010016806722689076, "loss": 0.7035, "step": 302 }, { "epoch": 0.1895527056615577, "grad_norm": 0.6220730543136597, "learning_rate": 9.983193277310925e-05, "loss": 0.2548, "step": 303 }, { "epoch": 0.1901782921488896, "grad_norm": 0.6476833820343018, "learning_rate": 9.949579831932774e-05, "loss": 0.3569, "step": 304 }, { "epoch": 0.19080387863622145, "grad_norm": 0.7133951783180237, "learning_rate": 9.915966386554623e-05, "loss": 0.4744, "step": 305 }, { "epoch": 0.19142946512355333, "grad_norm": 0.6500736474990845, "learning_rate": 9.882352941176471e-05, "loss": 0.4653, "step": 306 }, { "epoch": 0.1920550516108852, "grad_norm": 1.1231927871704102, "learning_rate": 9.848739495798321e-05, "loss": 0.818, "step": 307 }, { "epoch": 0.1926806380982171, "grad_norm": 0.8654798865318298, "learning_rate": 9.815126050420168e-05, "loss": 0.7065, "step": 308 }, { "epoch": 0.19330622458554894, "grad_norm": 0.45660969614982605, "learning_rate": 9.781512605042017e-05, "loss": 0.2412, "step": 309 }, { "epoch": 0.19393181107288082, "grad_norm": 0.9538519978523254, "learning_rate": 9.747899159663865e-05, "loss": 1.3428, "step": 310 }, { "epoch": 0.1945573975602127, "grad_norm": 0.596633791923523, "learning_rate": 9.714285714285715e-05, "loss": 0.5119, "step": 311 }, { "epoch": 0.19518298404754458, "grad_norm": 0.5247074365615845, "learning_rate": 9.680672268907564e-05, "loss": 0.6413, "step": 312 }, { "epoch": 0.19580857053487644, "grad_norm": 0.7713050246238708, "learning_rate": 9.647058823529412e-05, "loss": 0.49, "step": 313 }, { "epoch": 0.19643415702220832, "grad_norm": 0.6971513628959656, "learning_rate": 9.613445378151261e-05, "loss": 0.6505, "step": 314 }, { "epoch": 0.1970597435095402, "grad_norm": 0.5454917550086975, "learning_rate": 9.579831932773111e-05, "loss": 0.7018, "step": 315 }, { "epoch": 0.19768532999687208, "grad_norm": 0.8349499702453613, "learning_rate": 9.546218487394959e-05, "loss": 0.3179, "step": 316 }, { "epoch": 0.19831091648420393, "grad_norm": 0.5682560801506042, "learning_rate": 9.512605042016806e-05, "loss": 0.4003, "step": 317 }, { "epoch": 0.1989365029715358, "grad_norm": 0.5094739198684692, "learning_rate": 9.478991596638655e-05, "loss": 0.313, "step": 318 }, { "epoch": 0.1995620894588677, "grad_norm": 1.7074236869812012, "learning_rate": 9.445378151260505e-05, "loss": 0.9912, "step": 319 }, { "epoch": 0.20018767594619957, "grad_norm": 1.1477283239364624, "learning_rate": 9.411764705882353e-05, "loss": 0.851, "step": 320 }, { "epoch": 0.20081326243353143, "grad_norm": 0.6616579294204712, "learning_rate": 9.378151260504202e-05, "loss": 0.4844, "step": 321 }, { "epoch": 0.2014388489208633, "grad_norm": 1.0401920080184937, "learning_rate": 9.34453781512605e-05, "loss": 0.5421, "step": 322 }, { "epoch": 0.2020644354081952, "grad_norm": 0.729664146900177, "learning_rate": 9.3109243697479e-05, "loss": 0.6632, "step": 323 }, { "epoch": 0.20269002189552707, "grad_norm": 0.6752575635910034, "learning_rate": 9.277310924369749e-05, "loss": 0.4352, "step": 324 }, { "epoch": 0.20331560838285892, "grad_norm": 0.7963948249816895, "learning_rate": 9.243697478991598e-05, "loss": 0.7614, "step": 325 }, { "epoch": 0.20331560838285892, "eval_loss": 0.771190881729126, "eval_runtime": 43.551, "eval_samples_per_second": 5.878, "eval_steps_per_second": 2.939, "step": 325 }, { "epoch": 0.2039411948701908, "grad_norm": 0.7778791189193726, "learning_rate": 9.210084033613445e-05, "loss": 0.7251, "step": 326 }, { "epoch": 0.20456678135752268, "grad_norm": 3.0929737091064453, "learning_rate": 9.176470588235295e-05, "loss": 0.5375, "step": 327 }, { "epoch": 0.20519236784485456, "grad_norm": 0.6188391447067261, "learning_rate": 9.142857142857143e-05, "loss": 0.4007, "step": 328 }, { "epoch": 0.20581795433218641, "grad_norm": 0.9423925876617432, "learning_rate": 9.109243697478992e-05, "loss": 0.5059, "step": 329 }, { "epoch": 0.2064435408195183, "grad_norm": 0.506572425365448, "learning_rate": 9.07563025210084e-05, "loss": 0.2794, "step": 330 }, { "epoch": 0.20706912730685018, "grad_norm": 1.7139545679092407, "learning_rate": 9.04201680672269e-05, "loss": 0.5984, "step": 331 }, { "epoch": 0.20769471379418206, "grad_norm": 0.5540574789047241, "learning_rate": 9.008403361344539e-05, "loss": 0.323, "step": 332 }, { "epoch": 0.2083203002815139, "grad_norm": 0.6909454464912415, "learning_rate": 8.974789915966387e-05, "loss": 0.5399, "step": 333 }, { "epoch": 0.2089458867688458, "grad_norm": 0.7409022450447083, "learning_rate": 8.941176470588236e-05, "loss": 0.4251, "step": 334 }, { "epoch": 0.20957147325617767, "grad_norm": 0.6636312007904053, "learning_rate": 8.907563025210084e-05, "loss": 0.4021, "step": 335 }, { "epoch": 0.21019705974350955, "grad_norm": 0.5426271557807922, "learning_rate": 8.873949579831933e-05, "loss": 0.2095, "step": 336 }, { "epoch": 0.2108226462308414, "grad_norm": 0.8870647549629211, "learning_rate": 8.840336134453782e-05, "loss": 0.5773, "step": 337 }, { "epoch": 0.21144823271817328, "grad_norm": 0.5508524179458618, "learning_rate": 8.80672268907563e-05, "loss": 0.6744, "step": 338 }, { "epoch": 0.21207381920550517, "grad_norm": 1.6577738523483276, "learning_rate": 8.77310924369748e-05, "loss": 1.1134, "step": 339 }, { "epoch": 0.21269940569283705, "grad_norm": 3.218395233154297, "learning_rate": 8.739495798319329e-05, "loss": 0.5932, "step": 340 }, { "epoch": 0.2133249921801689, "grad_norm": 0.5119672417640686, "learning_rate": 8.705882352941177e-05, "loss": 0.1831, "step": 341 }, { "epoch": 0.21395057866750078, "grad_norm": 0.4874535799026489, "learning_rate": 8.672268907563026e-05, "loss": 0.485, "step": 342 }, { "epoch": 0.21457616515483266, "grad_norm": 0.6597093939781189, "learning_rate": 8.638655462184874e-05, "loss": 0.3588, "step": 343 }, { "epoch": 0.21520175164216454, "grad_norm": 1.1764620542526245, "learning_rate": 8.605042016806724e-05, "loss": 1.8765, "step": 344 }, { "epoch": 0.2158273381294964, "grad_norm": 0.6894935369491577, "learning_rate": 8.571428571428571e-05, "loss": 0.5355, "step": 345 }, { "epoch": 0.21645292461682827, "grad_norm": 0.5896294116973877, "learning_rate": 8.53781512605042e-05, "loss": 0.494, "step": 346 }, { "epoch": 0.21707851110416015, "grad_norm": 0.6212694048881531, "learning_rate": 8.50420168067227e-05, "loss": 0.5721, "step": 347 }, { "epoch": 0.21770409759149204, "grad_norm": 0.5058571100234985, "learning_rate": 8.470588235294118e-05, "loss": 0.5051, "step": 348 }, { "epoch": 0.2183296840788239, "grad_norm": 0.5089401006698608, "learning_rate": 8.436974789915967e-05, "loss": 0.3794, "step": 349 }, { "epoch": 0.21895527056615577, "grad_norm": 6.416032314300537, "learning_rate": 8.403361344537815e-05, "loss": 0.5026, "step": 350 }, { "epoch": 0.21895527056615577, "eval_loss": 0.7647964954376221, "eval_runtime": 43.4854, "eval_samples_per_second": 5.887, "eval_steps_per_second": 2.944, "step": 350 } ], "logging_steps": 1, "max_steps": 600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.5558583235916595e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }