diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21616 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.995172609630441, + "eval_steps": 500, + "global_step": 3081, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.000973393900064893, + "grad_norm": 3.8125, + "learning_rate": 5.000000000000001e-07, + "loss": 1.9271, + "step": 1 + }, + { + "epoch": 0.001946787800129786, + "grad_norm": 3.8125, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.947, + "step": 2 + }, + { + "epoch": 0.0029201817001946787, + "grad_norm": 3.5625, + "learning_rate": 1.5e-06, + "loss": 1.8864, + "step": 3 + }, + { + "epoch": 0.003893575600259572, + "grad_norm": 3.546875, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.9048, + "step": 4 + }, + { + "epoch": 0.004866969500324465, + "grad_norm": 3.40625, + "learning_rate": 2.5e-06, + "loss": 1.9077, + "step": 5 + }, + { + "epoch": 0.005840363400389357, + "grad_norm": 3.0, + "learning_rate": 3e-06, + "loss": 1.9014, + "step": 6 + }, + { + "epoch": 0.00681375730045425, + "grad_norm": 2.78125, + "learning_rate": 3.5e-06, + "loss": 1.912, + "step": 7 + }, + { + "epoch": 0.007787151200519144, + "grad_norm": 2.34375, + "learning_rate": 4.000000000000001e-06, + "loss": 1.8813, + "step": 8 + }, + { + "epoch": 0.008760545100584036, + "grad_norm": 2.109375, + "learning_rate": 4.5e-06, + "loss": 1.8683, + "step": 9 + }, + { + "epoch": 0.00973393900064893, + "grad_norm": 1.8125, + "learning_rate": 5e-06, + "loss": 1.9061, + "step": 10 + }, + { + "epoch": 0.010707332900713823, + "grad_norm": 1.765625, + "learning_rate": 4.999998691872464e-06, + "loss": 1.8596, + "step": 11 + }, + { + "epoch": 0.011680726800778715, + "grad_norm": 1.7734375, + "learning_rate": 4.9999947674912255e-06, + "loss": 1.9086, + "step": 12 + }, + { + "epoch": 0.012654120700843608, + "grad_norm": 1.71875, + "learning_rate": 4.99998822686039e-06, + "loss": 1.8465, + "step": 13 + }, + { + "epoch": 0.0136275146009085, + "grad_norm": 1.609375, + "learning_rate": 4.999979069986803e-06, + "loss": 1.835, + "step": 14 + }, + { + "epoch": 0.014600908500973394, + "grad_norm": 1.5390625, + "learning_rate": 4.999967296880048e-06, + "loss": 1.8464, + "step": 15 + }, + { + "epoch": 0.015574302401038288, + "grad_norm": 1.375, + "learning_rate": 4.999952907552444e-06, + "loss": 1.8715, + "step": 16 + }, + { + "epoch": 0.01654769630110318, + "grad_norm": 1.265625, + "learning_rate": 4.999935902019051e-06, + "loss": 1.8614, + "step": 17 + }, + { + "epoch": 0.01752109020116807, + "grad_norm": 1.15625, + "learning_rate": 4.999916280297664e-06, + "loss": 1.8847, + "step": 18 + }, + { + "epoch": 0.018494484101232965, + "grad_norm": 1.0625, + "learning_rate": 4.999894042408818e-06, + "loss": 1.8204, + "step": 19 + }, + { + "epoch": 0.01946787800129786, + "grad_norm": 1.046875, + "learning_rate": 4.999869188375784e-06, + "loss": 1.8313, + "step": 20 + }, + { + "epoch": 0.020441271901362752, + "grad_norm": 1.015625, + "learning_rate": 4.999841718224574e-06, + "loss": 1.8221, + "step": 21 + }, + { + "epoch": 0.021414665801427646, + "grad_norm": 0.984375, + "learning_rate": 4.999811631983934e-06, + "loss": 1.8274, + "step": 22 + }, + { + "epoch": 0.022388059701492536, + "grad_norm": 0.95703125, + "learning_rate": 4.999778929685348e-06, + "loss": 1.8124, + "step": 23 + }, + { + "epoch": 0.02336145360155743, + "grad_norm": 0.94921875, + "learning_rate": 4.999743611363042e-06, + "loss": 1.8222, + "step": 24 + }, + { + "epoch": 0.024334847501622323, + "grad_norm": 0.91015625, + "learning_rate": 4.9997056770539745e-06, + "loss": 1.8039, + "step": 25 + }, + { + "epoch": 0.025308241401687217, + "grad_norm": 0.90625, + "learning_rate": 4.9996651267978446e-06, + "loss": 1.7916, + "step": 26 + }, + { + "epoch": 0.02628163530175211, + "grad_norm": 0.8671875, + "learning_rate": 4.999621960637089e-06, + "loss": 1.7906, + "step": 27 + }, + { + "epoch": 0.027255029201817, + "grad_norm": 0.8359375, + "learning_rate": 4.99957617861688e-06, + "loss": 1.8106, + "step": 28 + }, + { + "epoch": 0.028228423101881894, + "grad_norm": 0.828125, + "learning_rate": 4.999527780785127e-06, + "loss": 1.7894, + "step": 29 + }, + { + "epoch": 0.029201817001946788, + "grad_norm": 0.8046875, + "learning_rate": 4.9994767671924826e-06, + "loss": 1.7524, + "step": 30 + }, + { + "epoch": 0.03017521090201168, + "grad_norm": 0.80078125, + "learning_rate": 4.999423137892329e-06, + "loss": 1.809, + "step": 31 + }, + { + "epoch": 0.031148604802076575, + "grad_norm": 0.78125, + "learning_rate": 4.9993668929407916e-06, + "loss": 1.7914, + "step": 32 + }, + { + "epoch": 0.03212199870214147, + "grad_norm": 0.75390625, + "learning_rate": 4.999308032396729e-06, + "loss": 1.7832, + "step": 33 + }, + { + "epoch": 0.03309539260220636, + "grad_norm": 0.75390625, + "learning_rate": 4.999246556321741e-06, + "loss": 1.7584, + "step": 34 + }, + { + "epoch": 0.03406878650227125, + "grad_norm": 0.74609375, + "learning_rate": 4.999182464780161e-06, + "loss": 1.7985, + "step": 35 + }, + { + "epoch": 0.03504218040233614, + "grad_norm": 0.7421875, + "learning_rate": 4.99911575783906e-06, + "loss": 1.7827, + "step": 36 + }, + { + "epoch": 0.036015574302401036, + "grad_norm": 0.71875, + "learning_rate": 4.99904643556825e-06, + "loss": 1.7835, + "step": 37 + }, + { + "epoch": 0.03698896820246593, + "grad_norm": 0.70703125, + "learning_rate": 4.998974498040273e-06, + "loss": 1.7315, + "step": 38 + }, + { + "epoch": 0.037962362102530824, + "grad_norm": 0.71484375, + "learning_rate": 4.998899945330416e-06, + "loss": 1.7871, + "step": 39 + }, + { + "epoch": 0.03893575600259572, + "grad_norm": 0.69921875, + "learning_rate": 4.9988227775166954e-06, + "loss": 1.7366, + "step": 40 + }, + { + "epoch": 0.03990914990266061, + "grad_norm": 0.6953125, + "learning_rate": 4.9987429946798684e-06, + "loss": 1.7826, + "step": 41 + }, + { + "epoch": 0.040882543802725504, + "grad_norm": 0.6796875, + "learning_rate": 4.998660596903428e-06, + "loss": 1.772, + "step": 42 + }, + { + "epoch": 0.0418559377027904, + "grad_norm": 0.67578125, + "learning_rate": 4.998575584273604e-06, + "loss": 1.7348, + "step": 43 + }, + { + "epoch": 0.04282933160285529, + "grad_norm": 0.66015625, + "learning_rate": 4.998487956879361e-06, + "loss": 1.755, + "step": 44 + }, + { + "epoch": 0.04380272550292018, + "grad_norm": 0.65625, + "learning_rate": 4.998397714812403e-06, + "loss": 1.7603, + "step": 45 + }, + { + "epoch": 0.04477611940298507, + "grad_norm": 0.65625, + "learning_rate": 4.9983048581671676e-06, + "loss": 1.7151, + "step": 46 + }, + { + "epoch": 0.045749513303049966, + "grad_norm": 0.64453125, + "learning_rate": 4.998209387040829e-06, + "loss": 1.7456, + "step": 47 + }, + { + "epoch": 0.04672290720311486, + "grad_norm": 0.64453125, + "learning_rate": 4.998111301533299e-06, + "loss": 1.7458, + "step": 48 + }, + { + "epoch": 0.04769630110317975, + "grad_norm": 0.6484375, + "learning_rate": 4.998010601747223e-06, + "loss": 1.7416, + "step": 49 + }, + { + "epoch": 0.04866969500324465, + "grad_norm": 0.6328125, + "learning_rate": 4.997907287787985e-06, + "loss": 1.7257, + "step": 50 + }, + { + "epoch": 0.04964308890330954, + "grad_norm": 0.62890625, + "learning_rate": 4.997801359763702e-06, + "loss": 1.742, + "step": 51 + }, + { + "epoch": 0.050616482803374434, + "grad_norm": 0.63671875, + "learning_rate": 4.997692817785229e-06, + "loss": 1.7521, + "step": 52 + }, + { + "epoch": 0.05158987670343933, + "grad_norm": 0.62109375, + "learning_rate": 4.997581661966154e-06, + "loss": 1.7133, + "step": 53 + }, + { + "epoch": 0.05256327060350422, + "grad_norm": 0.61328125, + "learning_rate": 4.997467892422804e-06, + "loss": 1.7177, + "step": 54 + }, + { + "epoch": 0.05353666450356911, + "grad_norm": 0.61328125, + "learning_rate": 4.997351509274236e-06, + "loss": 1.7096, + "step": 55 + }, + { + "epoch": 0.054510058403634, + "grad_norm": 0.609375, + "learning_rate": 4.997232512642248e-06, + "loss": 1.7169, + "step": 56 + }, + { + "epoch": 0.055483452303698895, + "grad_norm": 0.609375, + "learning_rate": 4.997110902651368e-06, + "loss": 1.7097, + "step": 57 + }, + { + "epoch": 0.05645684620376379, + "grad_norm": 0.61328125, + "learning_rate": 4.996986679428863e-06, + "loss": 1.696, + "step": 58 + }, + { + "epoch": 0.05743024010382868, + "grad_norm": 0.60546875, + "learning_rate": 4.996859843104732e-06, + "loss": 1.7385, + "step": 59 + }, + { + "epoch": 0.058403634003893576, + "grad_norm": 0.6015625, + "learning_rate": 4.9967303938117095e-06, + "loss": 1.6892, + "step": 60 + }, + { + "epoch": 0.05937702790395847, + "grad_norm": 0.59375, + "learning_rate": 4.9965983316852655e-06, + "loss": 1.7318, + "step": 61 + }, + { + "epoch": 0.06035042180402336, + "grad_norm": 0.58984375, + "learning_rate": 4.996463656863601e-06, + "loss": 1.7081, + "step": 62 + }, + { + "epoch": 0.06132381570408826, + "grad_norm": 0.5859375, + "learning_rate": 4.996326369487654e-06, + "loss": 1.7108, + "step": 63 + }, + { + "epoch": 0.06229720960415315, + "grad_norm": 0.59375, + "learning_rate": 4.996186469701098e-06, + "loss": 1.7007, + "step": 64 + }, + { + "epoch": 0.06327060350421804, + "grad_norm": 0.5859375, + "learning_rate": 4.996043957650337e-06, + "loss": 1.7047, + "step": 65 + }, + { + "epoch": 0.06424399740428294, + "grad_norm": 0.58984375, + "learning_rate": 4.99589883348451e-06, + "loss": 1.6933, + "step": 66 + }, + { + "epoch": 0.06521739130434782, + "grad_norm": 0.578125, + "learning_rate": 4.99575109735549e-06, + "loss": 1.7098, + "step": 67 + }, + { + "epoch": 0.06619078520441272, + "grad_norm": 0.58203125, + "learning_rate": 4.995600749417883e-06, + "loss": 1.7075, + "step": 68 + }, + { + "epoch": 0.06716417910447761, + "grad_norm": 0.5859375, + "learning_rate": 4.9954477898290285e-06, + "loss": 1.7302, + "step": 69 + }, + { + "epoch": 0.0681375730045425, + "grad_norm": 0.57421875, + "learning_rate": 4.9952922187490005e-06, + "loss": 1.6886, + "step": 70 + }, + { + "epoch": 0.0691109669046074, + "grad_norm": 0.56640625, + "learning_rate": 4.995134036340602e-06, + "loss": 1.6862, + "step": 71 + }, + { + "epoch": 0.07008436080467229, + "grad_norm": 0.5703125, + "learning_rate": 4.994973242769372e-06, + "loss": 1.698, + "step": 72 + }, + { + "epoch": 0.07105775470473719, + "grad_norm": 0.56640625, + "learning_rate": 4.994809838203582e-06, + "loss": 1.6917, + "step": 73 + }, + { + "epoch": 0.07203114860480207, + "grad_norm": 0.5703125, + "learning_rate": 4.9946438228142345e-06, + "loss": 1.6791, + "step": 74 + }, + { + "epoch": 0.07300454250486697, + "grad_norm": 0.5703125, + "learning_rate": 4.994475196775066e-06, + "loss": 1.6923, + "step": 75 + }, + { + "epoch": 0.07397793640493186, + "grad_norm": 0.5625, + "learning_rate": 4.9943039602625435e-06, + "loss": 1.6791, + "step": 76 + }, + { + "epoch": 0.07495133030499676, + "grad_norm": 0.59765625, + "learning_rate": 4.994130113455865e-06, + "loss": 1.7237, + "step": 77 + }, + { + "epoch": 0.07592472420506165, + "grad_norm": 0.5703125, + "learning_rate": 4.9939536565369625e-06, + "loss": 1.6223, + "step": 78 + }, + { + "epoch": 0.07689811810512655, + "grad_norm": 0.55078125, + "learning_rate": 4.9937745896905e-06, + "loss": 1.6775, + "step": 79 + }, + { + "epoch": 0.07787151200519143, + "grad_norm": 0.55859375, + "learning_rate": 4.993592913103868e-06, + "loss": 1.6915, + "step": 80 + }, + { + "epoch": 0.07884490590525632, + "grad_norm": 0.5625, + "learning_rate": 4.993408626967196e-06, + "loss": 1.6782, + "step": 81 + }, + { + "epoch": 0.07981829980532122, + "grad_norm": 0.54296875, + "learning_rate": 4.993221731473335e-06, + "loss": 1.6581, + "step": 82 + }, + { + "epoch": 0.08079169370538611, + "grad_norm": 0.546875, + "learning_rate": 4.993032226817874e-06, + "loss": 1.6621, + "step": 83 + }, + { + "epoch": 0.08176508760545101, + "grad_norm": 0.5625, + "learning_rate": 4.992840113199131e-06, + "loss": 1.667, + "step": 84 + }, + { + "epoch": 0.0827384815055159, + "grad_norm": 0.55078125, + "learning_rate": 4.992645390818151e-06, + "loss": 1.6677, + "step": 85 + }, + { + "epoch": 0.0837118754055808, + "grad_norm": 0.55859375, + "learning_rate": 4.992448059878713e-06, + "loss": 1.6475, + "step": 86 + }, + { + "epoch": 0.08468526930564568, + "grad_norm": 0.546875, + "learning_rate": 4.992248120587323e-06, + "loss": 1.6859, + "step": 87 + }, + { + "epoch": 0.08565866320571058, + "grad_norm": 0.55078125, + "learning_rate": 4.992045573153218e-06, + "loss": 1.6847, + "step": 88 + }, + { + "epoch": 0.08663205710577547, + "grad_norm": 0.54296875, + "learning_rate": 4.9918404177883655e-06, + "loss": 1.6697, + "step": 89 + }, + { + "epoch": 0.08760545100584036, + "grad_norm": 0.546875, + "learning_rate": 4.99163265470746e-06, + "loss": 1.6756, + "step": 90 + }, + { + "epoch": 0.08857884490590526, + "grad_norm": 0.5390625, + "learning_rate": 4.991422284127927e-06, + "loss": 1.6589, + "step": 91 + }, + { + "epoch": 0.08955223880597014, + "grad_norm": 0.54296875, + "learning_rate": 4.991209306269918e-06, + "loss": 1.6587, + "step": 92 + }, + { + "epoch": 0.09052563270603504, + "grad_norm": 0.55078125, + "learning_rate": 4.990993721356317e-06, + "loss": 1.6607, + "step": 93 + }, + { + "epoch": 0.09149902660609993, + "grad_norm": 0.546875, + "learning_rate": 4.990775529612732e-06, + "loss": 1.6582, + "step": 94 + }, + { + "epoch": 0.09247242050616483, + "grad_norm": 0.5390625, + "learning_rate": 4.990554731267502e-06, + "loss": 1.669, + "step": 95 + }, + { + "epoch": 0.09344581440622972, + "grad_norm": 0.53515625, + "learning_rate": 4.990331326551693e-06, + "loss": 1.6586, + "step": 96 + }, + { + "epoch": 0.09441920830629462, + "grad_norm": 0.53515625, + "learning_rate": 4.9901053156990984e-06, + "loss": 1.6385, + "step": 97 + }, + { + "epoch": 0.0953926022063595, + "grad_norm": 0.51953125, + "learning_rate": 4.9898766989462385e-06, + "loss": 1.6393, + "step": 98 + }, + { + "epoch": 0.0963659961064244, + "grad_norm": 0.546875, + "learning_rate": 4.989645476532362e-06, + "loss": 1.6478, + "step": 99 + }, + { + "epoch": 0.0973393900064893, + "grad_norm": 0.5234375, + "learning_rate": 4.9894116486994425e-06, + "loss": 1.648, + "step": 100 + }, + { + "epoch": 0.09831278390655418, + "grad_norm": 0.53515625, + "learning_rate": 4.9891752156921835e-06, + "loss": 1.6419, + "step": 101 + }, + { + "epoch": 0.09928617780661908, + "grad_norm": 0.52734375, + "learning_rate": 4.988936177758011e-06, + "loss": 1.6285, + "step": 102 + }, + { + "epoch": 0.10025957170668397, + "grad_norm": 0.5390625, + "learning_rate": 4.9886945351470775e-06, + "loss": 1.6654, + "step": 103 + }, + { + "epoch": 0.10123296560674887, + "grad_norm": 0.5234375, + "learning_rate": 4.988450288112265e-06, + "loss": 1.6355, + "step": 104 + }, + { + "epoch": 0.10220635950681375, + "grad_norm": 0.53125, + "learning_rate": 4.988203436909177e-06, + "loss": 1.6376, + "step": 105 + }, + { + "epoch": 0.10317975340687865, + "grad_norm": 0.5390625, + "learning_rate": 4.987953981796144e-06, + "loss": 1.6474, + "step": 106 + }, + { + "epoch": 0.10415314730694354, + "grad_norm": 0.51953125, + "learning_rate": 4.987701923034222e-06, + "loss": 1.641, + "step": 107 + }, + { + "epoch": 0.10512654120700844, + "grad_norm": 0.5234375, + "learning_rate": 4.98744726088719e-06, + "loss": 1.6212, + "step": 108 + }, + { + "epoch": 0.10609993510707333, + "grad_norm": 0.5234375, + "learning_rate": 4.987189995621553e-06, + "loss": 1.6218, + "step": 109 + }, + { + "epoch": 0.10707332900713822, + "grad_norm": 0.51953125, + "learning_rate": 4.9869301275065394e-06, + "loss": 1.6329, + "step": 110 + }, + { + "epoch": 0.10804672290720312, + "grad_norm": 0.5234375, + "learning_rate": 4.986667656814102e-06, + "loss": 1.6328, + "step": 111 + }, + { + "epoch": 0.109020116807268, + "grad_norm": 0.515625, + "learning_rate": 4.9864025838189165e-06, + "loss": 1.6055, + "step": 112 + }, + { + "epoch": 0.1099935107073329, + "grad_norm": 0.51953125, + "learning_rate": 4.986134908798383e-06, + "loss": 1.6214, + "step": 113 + }, + { + "epoch": 0.11096690460739779, + "grad_norm": 0.51953125, + "learning_rate": 4.985864632032623e-06, + "loss": 1.6265, + "step": 114 + }, + { + "epoch": 0.11194029850746269, + "grad_norm": 0.51953125, + "learning_rate": 4.985591753804483e-06, + "loss": 1.6191, + "step": 115 + }, + { + "epoch": 0.11291369240752758, + "grad_norm": 0.5078125, + "learning_rate": 4.985316274399529e-06, + "loss": 1.615, + "step": 116 + }, + { + "epoch": 0.11388708630759248, + "grad_norm": 0.51171875, + "learning_rate": 4.985038194106052e-06, + "loss": 1.6472, + "step": 117 + }, + { + "epoch": 0.11486048020765736, + "grad_norm": 0.51171875, + "learning_rate": 4.984757513215063e-06, + "loss": 1.6116, + "step": 118 + }, + { + "epoch": 0.11583387410772227, + "grad_norm": 0.5078125, + "learning_rate": 4.984474232020296e-06, + "loss": 1.5963, + "step": 119 + }, + { + "epoch": 0.11680726800778715, + "grad_norm": 0.515625, + "learning_rate": 4.984188350818204e-06, + "loss": 1.6276, + "step": 120 + }, + { + "epoch": 0.11778066190785204, + "grad_norm": 0.5078125, + "learning_rate": 4.983899869907963e-06, + "loss": 1.6063, + "step": 121 + }, + { + "epoch": 0.11875405580791694, + "grad_norm": 0.5078125, + "learning_rate": 4.983608789591468e-06, + "loss": 1.6289, + "step": 122 + }, + { + "epoch": 0.11972744970798183, + "grad_norm": 0.515625, + "learning_rate": 4.983315110173337e-06, + "loss": 1.587, + "step": 123 + }, + { + "epoch": 0.12070084360804673, + "grad_norm": 0.5, + "learning_rate": 4.9830188319609045e-06, + "loss": 1.629, + "step": 124 + }, + { + "epoch": 0.12167423750811161, + "grad_norm": 0.494140625, + "learning_rate": 4.982719955264227e-06, + "loss": 1.6104, + "step": 125 + }, + { + "epoch": 0.12264763140817651, + "grad_norm": 0.49609375, + "learning_rate": 4.9824184803960794e-06, + "loss": 1.571, + "step": 126 + }, + { + "epoch": 0.1236210253082414, + "grad_norm": 0.52734375, + "learning_rate": 4.982114407671955e-06, + "loss": 1.6102, + "step": 127 + }, + { + "epoch": 0.1245944192083063, + "grad_norm": 0.51171875, + "learning_rate": 4.981807737410068e-06, + "loss": 1.6285, + "step": 128 + }, + { + "epoch": 0.1255678131083712, + "grad_norm": 0.51171875, + "learning_rate": 4.981498469931348e-06, + "loss": 1.6085, + "step": 129 + }, + { + "epoch": 0.12654120700843607, + "grad_norm": 0.494140625, + "learning_rate": 4.981186605559445e-06, + "loss": 1.6036, + "step": 130 + }, + { + "epoch": 0.12751460090850097, + "grad_norm": 0.494140625, + "learning_rate": 4.980872144620726e-06, + "loss": 1.603, + "step": 131 + }, + { + "epoch": 0.12848799480856588, + "grad_norm": 0.5, + "learning_rate": 4.9805550874442735e-06, + "loss": 1.6262, + "step": 132 + }, + { + "epoch": 0.12946138870863075, + "grad_norm": 0.51171875, + "learning_rate": 4.9802354343618895e-06, + "loss": 1.6447, + "step": 133 + }, + { + "epoch": 0.13043478260869565, + "grad_norm": 0.5078125, + "learning_rate": 4.979913185708093e-06, + "loss": 1.5935, + "step": 134 + }, + { + "epoch": 0.13140817650876055, + "grad_norm": 0.498046875, + "learning_rate": 4.979588341820114e-06, + "loss": 1.6112, + "step": 135 + }, + { + "epoch": 0.13238157040882545, + "grad_norm": 0.5078125, + "learning_rate": 4.979260903037906e-06, + "loss": 1.6473, + "step": 136 + }, + { + "epoch": 0.13335496430889032, + "grad_norm": 0.50390625, + "learning_rate": 4.9789308697041325e-06, + "loss": 1.633, + "step": 137 + }, + { + "epoch": 0.13432835820895522, + "grad_norm": 0.5, + "learning_rate": 4.9785982421641756e-06, + "loss": 1.607, + "step": 138 + }, + { + "epoch": 0.13530175210902012, + "grad_norm": 0.498046875, + "learning_rate": 4.978263020766129e-06, + "loss": 1.616, + "step": 139 + }, + { + "epoch": 0.136275146009085, + "grad_norm": 0.50390625, + "learning_rate": 4.977925205860803e-06, + "loss": 1.598, + "step": 140 + }, + { + "epoch": 0.1372485399091499, + "grad_norm": 0.5078125, + "learning_rate": 4.977584797801722e-06, + "loss": 1.6183, + "step": 141 + }, + { + "epoch": 0.1382219338092148, + "grad_norm": 0.50390625, + "learning_rate": 4.977241796945123e-06, + "loss": 1.5891, + "step": 142 + }, + { + "epoch": 0.1391953277092797, + "grad_norm": 0.50390625, + "learning_rate": 4.976896203649958e-06, + "loss": 1.6197, + "step": 143 + }, + { + "epoch": 0.14016872160934457, + "grad_norm": 0.498046875, + "learning_rate": 4.976548018277891e-06, + "loss": 1.607, + "step": 144 + }, + { + "epoch": 0.14114211550940947, + "grad_norm": 1.9140625, + "learning_rate": 4.976197241193298e-06, + "loss": 1.6206, + "step": 145 + }, + { + "epoch": 0.14211550940947437, + "grad_norm": 0.484375, + "learning_rate": 4.975843872763269e-06, + "loss": 1.5968, + "step": 146 + }, + { + "epoch": 0.14308890330953927, + "grad_norm": 0.48046875, + "learning_rate": 4.975487913357603e-06, + "loss": 1.5844, + "step": 147 + }, + { + "epoch": 0.14406229720960415, + "grad_norm": 0.51171875, + "learning_rate": 4.975129363348814e-06, + "loss": 1.5843, + "step": 148 + }, + { + "epoch": 0.14503569110966905, + "grad_norm": 0.53515625, + "learning_rate": 4.974768223112125e-06, + "loss": 1.6291, + "step": 149 + }, + { + "epoch": 0.14600908500973395, + "grad_norm": 0.4921875, + "learning_rate": 4.974404493025469e-06, + "loss": 1.5926, + "step": 150 + }, + { + "epoch": 0.14698247890979882, + "grad_norm": 0.498046875, + "learning_rate": 4.974038173469491e-06, + "loss": 1.5854, + "step": 151 + }, + { + "epoch": 0.14795587280986372, + "grad_norm": 0.5, + "learning_rate": 4.973669264827545e-06, + "loss": 1.6067, + "step": 152 + }, + { + "epoch": 0.14892926670992862, + "grad_norm": 0.486328125, + "learning_rate": 4.973297767485695e-06, + "loss": 1.5998, + "step": 153 + }, + { + "epoch": 0.14990266060999352, + "grad_norm": 0.490234375, + "learning_rate": 4.972923681832714e-06, + "loss": 1.5724, + "step": 154 + }, + { + "epoch": 0.1508760545100584, + "grad_norm": 0.490234375, + "learning_rate": 4.972547008260083e-06, + "loss": 1.6057, + "step": 155 + }, + { + "epoch": 0.1518494484101233, + "grad_norm": 0.4921875, + "learning_rate": 4.97216774716199e-06, + "loss": 1.5807, + "step": 156 + }, + { + "epoch": 0.1528228423101882, + "grad_norm": 0.48828125, + "learning_rate": 4.971785898935335e-06, + "loss": 1.5693, + "step": 157 + }, + { + "epoch": 0.1537962362102531, + "grad_norm": 0.49609375, + "learning_rate": 4.971401463979722e-06, + "loss": 1.5948, + "step": 158 + }, + { + "epoch": 0.15476963011031797, + "grad_norm": 0.4921875, + "learning_rate": 4.971014442697463e-06, + "loss": 1.5749, + "step": 159 + }, + { + "epoch": 0.15574302401038287, + "grad_norm": 0.48046875, + "learning_rate": 4.970624835493576e-06, + "loss": 1.5615, + "step": 160 + }, + { + "epoch": 0.15671641791044777, + "grad_norm": 0.48046875, + "learning_rate": 4.970232642775786e-06, + "loss": 1.5725, + "step": 161 + }, + { + "epoch": 0.15768981181051264, + "grad_norm": 0.490234375, + "learning_rate": 4.969837864954524e-06, + "loss": 1.5923, + "step": 162 + }, + { + "epoch": 0.15866320571057754, + "grad_norm": 0.48046875, + "learning_rate": 4.969440502442926e-06, + "loss": 1.5681, + "step": 163 + }, + { + "epoch": 0.15963659961064244, + "grad_norm": 0.486328125, + "learning_rate": 4.969040555656831e-06, + "loss": 1.5895, + "step": 164 + }, + { + "epoch": 0.16060999351070734, + "grad_norm": 0.486328125, + "learning_rate": 4.968638025014786e-06, + "loss": 1.5866, + "step": 165 + }, + { + "epoch": 0.16158338741077222, + "grad_norm": 0.490234375, + "learning_rate": 4.96823291093804e-06, + "loss": 1.566, + "step": 166 + }, + { + "epoch": 0.16255678131083712, + "grad_norm": 0.478515625, + "learning_rate": 4.967825213850545e-06, + "loss": 1.5402, + "step": 167 + }, + { + "epoch": 0.16353017521090202, + "grad_norm": 0.5703125, + "learning_rate": 4.9674149341789554e-06, + "loss": 1.5918, + "step": 168 + }, + { + "epoch": 0.16450356911096692, + "grad_norm": 0.486328125, + "learning_rate": 4.9670020723526325e-06, + "loss": 1.5902, + "step": 169 + }, + { + "epoch": 0.1654769630110318, + "grad_norm": 0.48828125, + "learning_rate": 4.9665866288036354e-06, + "loss": 1.5843, + "step": 170 + }, + { + "epoch": 0.1664503569110967, + "grad_norm": 0.484375, + "learning_rate": 4.966168603966727e-06, + "loss": 1.5759, + "step": 171 + }, + { + "epoch": 0.1674237508111616, + "grad_norm": 0.48046875, + "learning_rate": 4.965747998279371e-06, + "loss": 1.5591, + "step": 172 + }, + { + "epoch": 0.16839714471122647, + "grad_norm": 0.4921875, + "learning_rate": 4.965324812181733e-06, + "loss": 1.5646, + "step": 173 + }, + { + "epoch": 0.16937053861129137, + "grad_norm": 0.4765625, + "learning_rate": 4.964899046116677e-06, + "loss": 1.5582, + "step": 174 + }, + { + "epoch": 0.17034393251135627, + "grad_norm": 0.484375, + "learning_rate": 4.964470700529769e-06, + "loss": 1.579, + "step": 175 + }, + { + "epoch": 0.17131732641142117, + "grad_norm": 0.4921875, + "learning_rate": 4.964039775869271e-06, + "loss": 1.5866, + "step": 176 + }, + { + "epoch": 0.17229072031148604, + "grad_norm": 0.470703125, + "learning_rate": 4.96360627258615e-06, + "loss": 1.5627, + "step": 177 + }, + { + "epoch": 0.17326411421155094, + "grad_norm": 0.48046875, + "learning_rate": 4.963170191134067e-06, + "loss": 1.5562, + "step": 178 + }, + { + "epoch": 0.17423750811161584, + "grad_norm": 0.474609375, + "learning_rate": 4.96273153196938e-06, + "loss": 1.5689, + "step": 179 + }, + { + "epoch": 0.1752109020116807, + "grad_norm": 0.47265625, + "learning_rate": 4.96229029555115e-06, + "loss": 1.5693, + "step": 180 + }, + { + "epoch": 0.17618429591174561, + "grad_norm": 0.47265625, + "learning_rate": 4.9618464823411285e-06, + "loss": 1.576, + "step": 181 + }, + { + "epoch": 0.17715768981181051, + "grad_norm": 0.47265625, + "learning_rate": 4.9614000928037694e-06, + "loss": 1.539, + "step": 182 + }, + { + "epoch": 0.17813108371187542, + "grad_norm": 0.484375, + "learning_rate": 4.96095112740622e-06, + "loss": 1.58, + "step": 183 + }, + { + "epoch": 0.1791044776119403, + "grad_norm": 0.4765625, + "learning_rate": 4.960499586618322e-06, + "loss": 1.5663, + "step": 184 + }, + { + "epoch": 0.1800778715120052, + "grad_norm": 0.47265625, + "learning_rate": 4.960045470912615e-06, + "loss": 1.5625, + "step": 185 + }, + { + "epoch": 0.1810512654120701, + "grad_norm": 0.47265625, + "learning_rate": 4.959588780764333e-06, + "loss": 1.5449, + "step": 186 + }, + { + "epoch": 0.182024659312135, + "grad_norm": 0.46484375, + "learning_rate": 4.9591295166514e-06, + "loss": 1.5538, + "step": 187 + }, + { + "epoch": 0.18299805321219986, + "grad_norm": 0.4765625, + "learning_rate": 4.9586676790544395e-06, + "loss": 1.5541, + "step": 188 + }, + { + "epoch": 0.18397144711226476, + "grad_norm": 0.4765625, + "learning_rate": 4.958203268456765e-06, + "loss": 1.5581, + "step": 189 + }, + { + "epoch": 0.18494484101232966, + "grad_norm": 0.47265625, + "learning_rate": 4.957736285344383e-06, + "loss": 1.5518, + "step": 190 + }, + { + "epoch": 0.18591823491239454, + "grad_norm": 0.470703125, + "learning_rate": 4.957266730205991e-06, + "loss": 1.5802, + "step": 191 + }, + { + "epoch": 0.18689162881245944, + "grad_norm": 0.486328125, + "learning_rate": 4.956794603532981e-06, + "loss": 1.5601, + "step": 192 + }, + { + "epoch": 0.18786502271252434, + "grad_norm": 0.45703125, + "learning_rate": 4.956319905819433e-06, + "loss": 1.5332, + "step": 193 + }, + { + "epoch": 0.18883841661258924, + "grad_norm": 0.4765625, + "learning_rate": 4.955842637562121e-06, + "loss": 1.5651, + "step": 194 + }, + { + "epoch": 0.1898118105126541, + "grad_norm": 0.47265625, + "learning_rate": 4.955362799260507e-06, + "loss": 1.564, + "step": 195 + }, + { + "epoch": 0.190785204412719, + "grad_norm": 0.46484375, + "learning_rate": 4.954880391416742e-06, + "loss": 1.5524, + "step": 196 + }, + { + "epoch": 0.1917585983127839, + "grad_norm": 0.470703125, + "learning_rate": 4.954395414535666e-06, + "loss": 1.5427, + "step": 197 + }, + { + "epoch": 0.1927319922128488, + "grad_norm": 0.47265625, + "learning_rate": 4.95390786912481e-06, + "loss": 1.5694, + "step": 198 + }, + { + "epoch": 0.19370538611291369, + "grad_norm": 0.470703125, + "learning_rate": 4.95341775569439e-06, + "loss": 1.5655, + "step": 199 + }, + { + "epoch": 0.1946787800129786, + "grad_norm": 0.45703125, + "learning_rate": 4.952925074757311e-06, + "loss": 1.5213, + "step": 200 + }, + { + "epoch": 0.1956521739130435, + "grad_norm": 0.46875, + "learning_rate": 4.9524298268291646e-06, + "loss": 1.552, + "step": 201 + }, + { + "epoch": 0.19662556781310836, + "grad_norm": 0.478515625, + "learning_rate": 4.951932012428229e-06, + "loss": 1.5534, + "step": 202 + }, + { + "epoch": 0.19759896171317326, + "grad_norm": 0.470703125, + "learning_rate": 4.951431632075468e-06, + "loss": 1.5501, + "step": 203 + }, + { + "epoch": 0.19857235561323816, + "grad_norm": 0.458984375, + "learning_rate": 4.950928686294531e-06, + "loss": 1.5389, + "step": 204 + }, + { + "epoch": 0.19954574951330306, + "grad_norm": 0.4609375, + "learning_rate": 4.950423175611751e-06, + "loss": 1.561, + "step": 205 + }, + { + "epoch": 0.20051914341336793, + "grad_norm": 0.46875, + "learning_rate": 4.949915100556146e-06, + "loss": 1.5662, + "step": 206 + }, + { + "epoch": 0.20149253731343283, + "grad_norm": 0.46875, + "learning_rate": 4.949404461659417e-06, + "loss": 1.5414, + "step": 207 + }, + { + "epoch": 0.20246593121349774, + "grad_norm": 0.46484375, + "learning_rate": 4.948891259455951e-06, + "loss": 1.5569, + "step": 208 + }, + { + "epoch": 0.2034393251135626, + "grad_norm": 0.458984375, + "learning_rate": 4.948375494482813e-06, + "loss": 1.5561, + "step": 209 + }, + { + "epoch": 0.2044127190136275, + "grad_norm": 0.46875, + "learning_rate": 4.947857167279753e-06, + "loss": 1.5661, + "step": 210 + }, + { + "epoch": 0.2053861129136924, + "grad_norm": 0.470703125, + "learning_rate": 4.947336278389201e-06, + "loss": 1.5579, + "step": 211 + }, + { + "epoch": 0.2063595068137573, + "grad_norm": 0.470703125, + "learning_rate": 4.946812828356268e-06, + "loss": 1.5325, + "step": 212 + }, + { + "epoch": 0.20733290071382218, + "grad_norm": 0.458984375, + "learning_rate": 4.946286817728746e-06, + "loss": 1.5208, + "step": 213 + }, + { + "epoch": 0.20830629461388708, + "grad_norm": 0.462890625, + "learning_rate": 4.945758247057107e-06, + "loss": 1.5509, + "step": 214 + }, + { + "epoch": 0.20927968851395198, + "grad_norm": 0.453125, + "learning_rate": 4.9452271168945e-06, + "loss": 1.5414, + "step": 215 + }, + { + "epoch": 0.21025308241401688, + "grad_norm": 0.45703125, + "learning_rate": 4.944693427796754e-06, + "loss": 1.5746, + "step": 216 + }, + { + "epoch": 0.21122647631408176, + "grad_norm": 0.47265625, + "learning_rate": 4.944157180322377e-06, + "loss": 1.5315, + "step": 217 + }, + { + "epoch": 0.21219987021414666, + "grad_norm": 0.462890625, + "learning_rate": 4.9436183750325505e-06, + "loss": 1.5468, + "step": 218 + }, + { + "epoch": 0.21317326411421156, + "grad_norm": 0.46875, + "learning_rate": 4.943077012491138e-06, + "loss": 1.5213, + "step": 219 + }, + { + "epoch": 0.21414665801427643, + "grad_norm": 0.46484375, + "learning_rate": 4.942533093264675e-06, + "loss": 1.547, + "step": 220 + }, + { + "epoch": 0.21512005191434133, + "grad_norm": 0.4765625, + "learning_rate": 4.941986617922374e-06, + "loss": 1.5467, + "step": 221 + }, + { + "epoch": 0.21609344581440623, + "grad_norm": 0.46484375, + "learning_rate": 4.941437587036123e-06, + "loss": 1.5447, + "step": 222 + }, + { + "epoch": 0.21706683971447113, + "grad_norm": 0.462890625, + "learning_rate": 4.940886001180485e-06, + "loss": 1.5026, + "step": 223 + }, + { + "epoch": 0.218040233614536, + "grad_norm": 0.4609375, + "learning_rate": 4.940331860932694e-06, + "loss": 1.5049, + "step": 224 + }, + { + "epoch": 0.2190136275146009, + "grad_norm": 0.466796875, + "learning_rate": 4.9397751668726595e-06, + "loss": 1.5445, + "step": 225 + }, + { + "epoch": 0.2199870214146658, + "grad_norm": 0.45703125, + "learning_rate": 4.939215919582963e-06, + "loss": 1.5168, + "step": 226 + }, + { + "epoch": 0.2209604153147307, + "grad_norm": 0.470703125, + "learning_rate": 4.938654119648858e-06, + "loss": 1.5318, + "step": 227 + }, + { + "epoch": 0.22193380921479558, + "grad_norm": 0.45703125, + "learning_rate": 4.938089767658269e-06, + "loss": 1.4995, + "step": 228 + }, + { + "epoch": 0.22290720311486048, + "grad_norm": 0.4609375, + "learning_rate": 4.937522864201792e-06, + "loss": 1.513, + "step": 229 + }, + { + "epoch": 0.22388059701492538, + "grad_norm": 0.462890625, + "learning_rate": 4.936953409872692e-06, + "loss": 1.549, + "step": 230 + }, + { + "epoch": 0.22485399091499025, + "grad_norm": 0.4609375, + "learning_rate": 4.936381405266904e-06, + "loss": 1.5253, + "step": 231 + }, + { + "epoch": 0.22582738481505515, + "grad_norm": 0.4609375, + "learning_rate": 4.935806850983034e-06, + "loss": 1.536, + "step": 232 + }, + { + "epoch": 0.22680077871512005, + "grad_norm": 0.458984375, + "learning_rate": 4.935229747622352e-06, + "loss": 1.5603, + "step": 233 + }, + { + "epoch": 0.22777417261518496, + "grad_norm": 0.482421875, + "learning_rate": 4.934650095788798e-06, + "loss": 1.538, + "step": 234 + }, + { + "epoch": 0.22874756651524983, + "grad_norm": 0.466796875, + "learning_rate": 4.934067896088979e-06, + "loss": 1.5108, + "step": 235 + }, + { + "epoch": 0.22972096041531473, + "grad_norm": 0.462890625, + "learning_rate": 4.9334831491321685e-06, + "loss": 1.52, + "step": 236 + }, + { + "epoch": 0.23069435431537963, + "grad_norm": 0.45703125, + "learning_rate": 4.932895855530307e-06, + "loss": 1.5124, + "step": 237 + }, + { + "epoch": 0.23166774821544453, + "grad_norm": 0.484375, + "learning_rate": 4.932306015897995e-06, + "loss": 1.5195, + "step": 238 + }, + { + "epoch": 0.2326411421155094, + "grad_norm": 0.466796875, + "learning_rate": 4.9317136308525025e-06, + "loss": 1.5301, + "step": 239 + }, + { + "epoch": 0.2336145360155743, + "grad_norm": 0.462890625, + "learning_rate": 4.931118701013763e-06, + "loss": 1.518, + "step": 240 + }, + { + "epoch": 0.2345879299156392, + "grad_norm": 0.46484375, + "learning_rate": 4.93052122700437e-06, + "loss": 1.5004, + "step": 241 + }, + { + "epoch": 0.23556132381570408, + "grad_norm": 0.462890625, + "learning_rate": 4.9299212094495816e-06, + "loss": 1.5155, + "step": 242 + }, + { + "epoch": 0.23653471771576898, + "grad_norm": 0.466796875, + "learning_rate": 4.929318648977318e-06, + "loss": 1.5499, + "step": 243 + }, + { + "epoch": 0.23750811161583388, + "grad_norm": 0.455078125, + "learning_rate": 4.92871354621816e-06, + "loss": 1.5221, + "step": 244 + }, + { + "epoch": 0.23848150551589878, + "grad_norm": 0.451171875, + "learning_rate": 4.9281059018053475e-06, + "loss": 1.535, + "step": 245 + }, + { + "epoch": 0.23945489941596365, + "grad_norm": 0.453125, + "learning_rate": 4.927495716374783e-06, + "loss": 1.512, + "step": 246 + }, + { + "epoch": 0.24042829331602855, + "grad_norm": 0.4609375, + "learning_rate": 4.9268829905650274e-06, + "loss": 1.5274, + "step": 247 + }, + { + "epoch": 0.24140168721609345, + "grad_norm": 0.451171875, + "learning_rate": 4.926267725017297e-06, + "loss": 1.5027, + "step": 248 + }, + { + "epoch": 0.24237508111615833, + "grad_norm": 0.462890625, + "learning_rate": 4.925649920375471e-06, + "loss": 1.5368, + "step": 249 + }, + { + "epoch": 0.24334847501622323, + "grad_norm": 0.455078125, + "learning_rate": 4.9250295772860815e-06, + "loss": 1.522, + "step": 250 + }, + { + "epoch": 0.24432186891628813, + "grad_norm": 0.45703125, + "learning_rate": 4.924406696398319e-06, + "loss": 1.5478, + "step": 251 + }, + { + "epoch": 0.24529526281635303, + "grad_norm": 0.453125, + "learning_rate": 4.9237812783640304e-06, + "loss": 1.5049, + "step": 252 + }, + { + "epoch": 0.2462686567164179, + "grad_norm": 0.458984375, + "learning_rate": 4.923153323837717e-06, + "loss": 1.5182, + "step": 253 + }, + { + "epoch": 0.2472420506164828, + "grad_norm": 0.458984375, + "learning_rate": 4.922522833476533e-06, + "loss": 1.5137, + "step": 254 + }, + { + "epoch": 0.2482154445165477, + "grad_norm": 0.45703125, + "learning_rate": 4.92188980794029e-06, + "loss": 1.5455, + "step": 255 + }, + { + "epoch": 0.2491888384166126, + "grad_norm": 0.443359375, + "learning_rate": 4.921254247891449e-06, + "loss": 1.501, + "step": 256 + }, + { + "epoch": 0.2501622323166775, + "grad_norm": 0.451171875, + "learning_rate": 4.920616153995126e-06, + "loss": 1.5195, + "step": 257 + }, + { + "epoch": 0.2511356262167424, + "grad_norm": 0.46484375, + "learning_rate": 4.9199755269190865e-06, + "loss": 1.5014, + "step": 258 + }, + { + "epoch": 0.25210902011680725, + "grad_norm": 0.447265625, + "learning_rate": 4.919332367333748e-06, + "loss": 1.4972, + "step": 259 + }, + { + "epoch": 0.25308241401687215, + "grad_norm": 0.447265625, + "learning_rate": 4.918686675912178e-06, + "loss": 1.5035, + "step": 260 + }, + { + "epoch": 0.25405580791693705, + "grad_norm": 0.453125, + "learning_rate": 4.918038453330095e-06, + "loss": 1.5082, + "step": 261 + }, + { + "epoch": 0.25502920181700195, + "grad_norm": 0.44921875, + "learning_rate": 4.917387700265866e-06, + "loss": 1.518, + "step": 262 + }, + { + "epoch": 0.25600259571706685, + "grad_norm": 0.4453125, + "learning_rate": 4.916734417400503e-06, + "loss": 1.4958, + "step": 263 + }, + { + "epoch": 0.25697598961713175, + "grad_norm": 0.4609375, + "learning_rate": 4.91607860541767e-06, + "loss": 1.5237, + "step": 264 + }, + { + "epoch": 0.25794938351719665, + "grad_norm": 0.453125, + "learning_rate": 4.915420265003674e-06, + "loss": 1.4898, + "step": 265 + }, + { + "epoch": 0.2589227774172615, + "grad_norm": 0.44921875, + "learning_rate": 4.9147593968474705e-06, + "loss": 1.4946, + "step": 266 + }, + { + "epoch": 0.2598961713173264, + "grad_norm": 0.4609375, + "learning_rate": 4.914096001640659e-06, + "loss": 1.4958, + "step": 267 + }, + { + "epoch": 0.2608695652173913, + "grad_norm": 0.453125, + "learning_rate": 4.913430080077483e-06, + "loss": 1.5221, + "step": 268 + }, + { + "epoch": 0.2618429591174562, + "grad_norm": 0.447265625, + "learning_rate": 4.912761632854834e-06, + "loss": 1.4937, + "step": 269 + }, + { + "epoch": 0.2628163530175211, + "grad_norm": 0.453125, + "learning_rate": 4.91209066067224e-06, + "loss": 1.5216, + "step": 270 + }, + { + "epoch": 0.263789746917586, + "grad_norm": 0.462890625, + "learning_rate": 4.911417164231875e-06, + "loss": 1.5069, + "step": 271 + }, + { + "epoch": 0.2647631408176509, + "grad_norm": 0.466796875, + "learning_rate": 4.910741144238556e-06, + "loss": 1.5145, + "step": 272 + }, + { + "epoch": 0.26573653471771574, + "grad_norm": 0.45703125, + "learning_rate": 4.910062601399739e-06, + "loss": 1.5049, + "step": 273 + }, + { + "epoch": 0.26670992861778064, + "grad_norm": 0.46484375, + "learning_rate": 4.9093815364255204e-06, + "loss": 1.5153, + "step": 274 + }, + { + "epoch": 0.26768332251784555, + "grad_norm": 0.4609375, + "learning_rate": 4.9086979500286345e-06, + "loss": 1.5106, + "step": 275 + }, + { + "epoch": 0.26865671641791045, + "grad_norm": 0.478515625, + "learning_rate": 4.908011842924458e-06, + "loss": 1.5176, + "step": 276 + }, + { + "epoch": 0.26963011031797535, + "grad_norm": 0.45703125, + "learning_rate": 4.9073232158310025e-06, + "loss": 1.4883, + "step": 277 + }, + { + "epoch": 0.27060350421804025, + "grad_norm": 0.466796875, + "learning_rate": 4.906632069468917e-06, + "loss": 1.5318, + "step": 278 + }, + { + "epoch": 0.27157689811810515, + "grad_norm": 0.48046875, + "learning_rate": 4.905938404561489e-06, + "loss": 1.518, + "step": 279 + }, + { + "epoch": 0.27255029201817, + "grad_norm": 0.458984375, + "learning_rate": 4.905242221834638e-06, + "loss": 1.4833, + "step": 280 + }, + { + "epoch": 0.2735236859182349, + "grad_norm": 0.453125, + "learning_rate": 4.904543522016923e-06, + "loss": 1.522, + "step": 281 + }, + { + "epoch": 0.2744970798182998, + "grad_norm": 0.447265625, + "learning_rate": 4.903842305839534e-06, + "loss": 1.4828, + "step": 282 + }, + { + "epoch": 0.2754704737183647, + "grad_norm": 0.46875, + "learning_rate": 4.903138574036295e-06, + "loss": 1.5441, + "step": 283 + }, + { + "epoch": 0.2764438676184296, + "grad_norm": 0.47265625, + "learning_rate": 4.902432327343662e-06, + "loss": 1.4891, + "step": 284 + }, + { + "epoch": 0.2774172615184945, + "grad_norm": 0.470703125, + "learning_rate": 4.901723566500725e-06, + "loss": 1.4813, + "step": 285 + }, + { + "epoch": 0.2783906554185594, + "grad_norm": 0.453125, + "learning_rate": 4.901012292249203e-06, + "loss": 1.4753, + "step": 286 + }, + { + "epoch": 0.2793640493186243, + "grad_norm": 0.4453125, + "learning_rate": 4.900298505333446e-06, + "loss": 1.4873, + "step": 287 + }, + { + "epoch": 0.28033744321868914, + "grad_norm": 0.455078125, + "learning_rate": 4.899582206500433e-06, + "loss": 1.4863, + "step": 288 + }, + { + "epoch": 0.28131083711875404, + "grad_norm": 0.46875, + "learning_rate": 4.898863396499772e-06, + "loss": 1.5016, + "step": 289 + }, + { + "epoch": 0.28228423101881894, + "grad_norm": 0.46875, + "learning_rate": 4.898142076083701e-06, + "loss": 1.4733, + "step": 290 + }, + { + "epoch": 0.28325762491888384, + "grad_norm": 0.4609375, + "learning_rate": 4.8974182460070814e-06, + "loss": 1.5243, + "step": 291 + }, + { + "epoch": 0.28423101881894874, + "grad_norm": 0.453125, + "learning_rate": 4.896691907027404e-06, + "loss": 1.489, + "step": 292 + }, + { + "epoch": 0.28520441271901364, + "grad_norm": 0.443359375, + "learning_rate": 4.895963059904782e-06, + "loss": 1.4813, + "step": 293 + }, + { + "epoch": 0.28617780661907855, + "grad_norm": 0.458984375, + "learning_rate": 4.895231705401958e-06, + "loss": 1.4781, + "step": 294 + }, + { + "epoch": 0.2871512005191434, + "grad_norm": 0.474609375, + "learning_rate": 4.8944978442842944e-06, + "loss": 1.4688, + "step": 295 + }, + { + "epoch": 0.2881245944192083, + "grad_norm": 0.46484375, + "learning_rate": 4.893761477319779e-06, + "loss": 1.4892, + "step": 296 + }, + { + "epoch": 0.2890979883192732, + "grad_norm": 0.44921875, + "learning_rate": 4.8930226052790204e-06, + "loss": 1.5027, + "step": 297 + }, + { + "epoch": 0.2900713822193381, + "grad_norm": 0.45703125, + "learning_rate": 4.892281228935252e-06, + "loss": 1.502, + "step": 298 + }, + { + "epoch": 0.291044776119403, + "grad_norm": 0.44140625, + "learning_rate": 4.891537349064322e-06, + "loss": 1.4795, + "step": 299 + }, + { + "epoch": 0.2920181700194679, + "grad_norm": 0.45703125, + "learning_rate": 4.890790966444705e-06, + "loss": 1.4875, + "step": 300 + }, + { + "epoch": 0.2929915639195328, + "grad_norm": 0.455078125, + "learning_rate": 4.8900420818574915e-06, + "loss": 1.4629, + "step": 301 + }, + { + "epoch": 0.29396495781959764, + "grad_norm": 0.4375, + "learning_rate": 4.889290696086391e-06, + "loss": 1.4935, + "step": 302 + }, + { + "epoch": 0.29493835171966254, + "grad_norm": 0.44921875, + "learning_rate": 4.888536809917728e-06, + "loss": 1.47, + "step": 303 + }, + { + "epoch": 0.29591174561972744, + "grad_norm": 0.44140625, + "learning_rate": 4.887780424140448e-06, + "loss": 1.4882, + "step": 304 + }, + { + "epoch": 0.29688513951979234, + "grad_norm": 0.435546875, + "learning_rate": 4.88702153954611e-06, + "loss": 1.4968, + "step": 305 + }, + { + "epoch": 0.29785853341985724, + "grad_norm": 0.455078125, + "learning_rate": 4.8862601569288885e-06, + "loss": 1.4711, + "step": 306 + }, + { + "epoch": 0.29883192731992214, + "grad_norm": 0.44921875, + "learning_rate": 4.885496277085571e-06, + "loss": 1.4824, + "step": 307 + }, + { + "epoch": 0.29980532121998704, + "grad_norm": 0.439453125, + "learning_rate": 4.884729900815559e-06, + "loss": 1.4758, + "step": 308 + }, + { + "epoch": 0.3007787151200519, + "grad_norm": 0.45703125, + "learning_rate": 4.8839610289208695e-06, + "loss": 1.4932, + "step": 309 + }, + { + "epoch": 0.3017521090201168, + "grad_norm": 0.451171875, + "learning_rate": 4.8831896622061256e-06, + "loss": 1.4829, + "step": 310 + }, + { + "epoch": 0.3027255029201817, + "grad_norm": 0.45703125, + "learning_rate": 4.882415801478565e-06, + "loss": 1.4996, + "step": 311 + }, + { + "epoch": 0.3036988968202466, + "grad_norm": 0.46875, + "learning_rate": 4.881639447548034e-06, + "loss": 1.4822, + "step": 312 + }, + { + "epoch": 0.3046722907203115, + "grad_norm": 0.455078125, + "learning_rate": 4.88086060122699e-06, + "loss": 1.485, + "step": 313 + }, + { + "epoch": 0.3056456846203764, + "grad_norm": 0.44921875, + "learning_rate": 4.880079263330497e-06, + "loss": 1.4654, + "step": 314 + }, + { + "epoch": 0.3066190785204413, + "grad_norm": 0.44140625, + "learning_rate": 4.8792954346762256e-06, + "loss": 1.4699, + "step": 315 + }, + { + "epoch": 0.3075924724205062, + "grad_norm": 0.443359375, + "learning_rate": 4.878509116084455e-06, + "loss": 1.4921, + "step": 316 + }, + { + "epoch": 0.30856586632057104, + "grad_norm": 0.447265625, + "learning_rate": 4.8777203083780675e-06, + "loss": 1.4881, + "step": 317 + }, + { + "epoch": 0.30953926022063594, + "grad_norm": 0.453125, + "learning_rate": 4.876929012382555e-06, + "loss": 1.4741, + "step": 318 + }, + { + "epoch": 0.31051265412070084, + "grad_norm": 0.44921875, + "learning_rate": 4.876135228926008e-06, + "loss": 1.475, + "step": 319 + }, + { + "epoch": 0.31148604802076574, + "grad_norm": 0.443359375, + "learning_rate": 4.875338958839123e-06, + "loss": 1.5087, + "step": 320 + }, + { + "epoch": 0.31245944192083064, + "grad_norm": 0.45703125, + "learning_rate": 4.8745402029551995e-06, + "loss": 1.4926, + "step": 321 + }, + { + "epoch": 0.31343283582089554, + "grad_norm": 0.47265625, + "learning_rate": 4.873738962110135e-06, + "loss": 1.4801, + "step": 322 + }, + { + "epoch": 0.31440622972096044, + "grad_norm": 0.4765625, + "learning_rate": 4.872935237142431e-06, + "loss": 1.4819, + "step": 323 + }, + { + "epoch": 0.3153796236210253, + "grad_norm": 0.4453125, + "learning_rate": 4.872129028893186e-06, + "loss": 1.4921, + "step": 324 + }, + { + "epoch": 0.3163530175210902, + "grad_norm": 0.4453125, + "learning_rate": 4.871320338206101e-06, + "loss": 1.4897, + "step": 325 + }, + { + "epoch": 0.3173264114211551, + "grad_norm": 0.451171875, + "learning_rate": 4.870509165927471e-06, + "loss": 1.4856, + "step": 326 + }, + { + "epoch": 0.31829980532122, + "grad_norm": 0.45703125, + "learning_rate": 4.86969551290619e-06, + "loss": 1.4761, + "step": 327 + }, + { + "epoch": 0.3192731992212849, + "grad_norm": 0.45703125, + "learning_rate": 4.868879379993746e-06, + "loss": 1.5015, + "step": 328 + }, + { + "epoch": 0.3202465931213498, + "grad_norm": 0.447265625, + "learning_rate": 4.868060768044225e-06, + "loss": 1.4904, + "step": 329 + }, + { + "epoch": 0.3212199870214147, + "grad_norm": 0.482421875, + "learning_rate": 4.867239677914306e-06, + "loss": 1.4805, + "step": 330 + }, + { + "epoch": 0.32219338092147953, + "grad_norm": 0.53125, + "learning_rate": 4.866416110463261e-06, + "loss": 1.4899, + "step": 331 + }, + { + "epoch": 0.32316677482154443, + "grad_norm": 0.4765625, + "learning_rate": 4.8655900665529565e-06, + "loss": 1.4891, + "step": 332 + }, + { + "epoch": 0.32414016872160933, + "grad_norm": 0.43359375, + "learning_rate": 4.864761547047847e-06, + "loss": 1.4681, + "step": 333 + }, + { + "epoch": 0.32511356262167423, + "grad_norm": 0.482421875, + "learning_rate": 4.863930552814981e-06, + "loss": 1.493, + "step": 334 + }, + { + "epoch": 0.32608695652173914, + "grad_norm": 0.494140625, + "learning_rate": 4.863097084723996e-06, + "loss": 1.4591, + "step": 335 + }, + { + "epoch": 0.32706035042180404, + "grad_norm": 0.4765625, + "learning_rate": 4.862261143647117e-06, + "loss": 1.4805, + "step": 336 + }, + { + "epoch": 0.32803374432186894, + "grad_norm": 0.44921875, + "learning_rate": 4.861422730459159e-06, + "loss": 1.4848, + "step": 337 + }, + { + "epoch": 0.32900713822193384, + "grad_norm": 0.4609375, + "learning_rate": 4.860581846037522e-06, + "loss": 1.4645, + "step": 338 + }, + { + "epoch": 0.3299805321219987, + "grad_norm": 0.466796875, + "learning_rate": 4.859738491262195e-06, + "loss": 1.4922, + "step": 339 + }, + { + "epoch": 0.3309539260220636, + "grad_norm": 0.48046875, + "learning_rate": 4.858892667015749e-06, + "loss": 1.4678, + "step": 340 + }, + { + "epoch": 0.3319273199221285, + "grad_norm": 0.470703125, + "learning_rate": 4.8580443741833404e-06, + "loss": 1.4982, + "step": 341 + }, + { + "epoch": 0.3329007138221934, + "grad_norm": 0.44921875, + "learning_rate": 4.857193613652711e-06, + "loss": 1.4497, + "step": 342 + }, + { + "epoch": 0.3338741077222583, + "grad_norm": 0.4609375, + "learning_rate": 4.8563403863141825e-06, + "loss": 1.4563, + "step": 343 + }, + { + "epoch": 0.3348475016223232, + "grad_norm": 0.458984375, + "learning_rate": 4.855484693060658e-06, + "loss": 1.4787, + "step": 344 + }, + { + "epoch": 0.3358208955223881, + "grad_norm": 0.455078125, + "learning_rate": 4.854626534787625e-06, + "loss": 1.493, + "step": 345 + }, + { + "epoch": 0.33679428942245293, + "grad_norm": 0.4453125, + "learning_rate": 4.853765912393146e-06, + "loss": 1.4569, + "step": 346 + }, + { + "epoch": 0.33776768332251783, + "grad_norm": 0.443359375, + "learning_rate": 4.852902826777863e-06, + "loss": 1.4463, + "step": 347 + }, + { + "epoch": 0.33874107722258273, + "grad_norm": 0.458984375, + "learning_rate": 4.852037278845e-06, + "loss": 1.4696, + "step": 348 + }, + { + "epoch": 0.33971447112264763, + "grad_norm": 0.44140625, + "learning_rate": 4.851169269500351e-06, + "loss": 1.4719, + "step": 349 + }, + { + "epoch": 0.34068786502271253, + "grad_norm": 0.447265625, + "learning_rate": 4.850298799652293e-06, + "loss": 1.4561, + "step": 350 + }, + { + "epoch": 0.34166125892277743, + "grad_norm": 0.458984375, + "learning_rate": 4.8494258702117715e-06, + "loss": 1.4814, + "step": 351 + }, + { + "epoch": 0.34263465282284233, + "grad_norm": 0.458984375, + "learning_rate": 4.8485504820923115e-06, + "loss": 1.4758, + "step": 352 + }, + { + "epoch": 0.3436080467229072, + "grad_norm": 0.4609375, + "learning_rate": 4.847672636210005e-06, + "loss": 1.4657, + "step": 353 + }, + { + "epoch": 0.3445814406229721, + "grad_norm": 0.44921875, + "learning_rate": 4.8467923334835245e-06, + "loss": 1.4647, + "step": 354 + }, + { + "epoch": 0.345554834523037, + "grad_norm": 0.455078125, + "learning_rate": 4.8459095748341045e-06, + "loss": 1.4916, + "step": 355 + }, + { + "epoch": 0.3465282284231019, + "grad_norm": 0.462890625, + "learning_rate": 4.845024361185555e-06, + "loss": 1.4676, + "step": 356 + }, + { + "epoch": 0.3475016223231668, + "grad_norm": 0.5390625, + "learning_rate": 4.8441366934642545e-06, + "loss": 1.4645, + "step": 357 + }, + { + "epoch": 0.3484750162232317, + "grad_norm": 0.439453125, + "learning_rate": 4.8432465725991475e-06, + "loss": 1.4519, + "step": 358 + }, + { + "epoch": 0.3494484101232966, + "grad_norm": 0.443359375, + "learning_rate": 4.842353999521749e-06, + "loss": 1.4563, + "step": 359 + }, + { + "epoch": 0.3504218040233614, + "grad_norm": 0.453125, + "learning_rate": 4.841458975166137e-06, + "loss": 1.4553, + "step": 360 + }, + { + "epoch": 0.35139519792342633, + "grad_norm": 0.44921875, + "learning_rate": 4.840561500468958e-06, + "loss": 1.4512, + "step": 361 + }, + { + "epoch": 0.35236859182349123, + "grad_norm": 0.4375, + "learning_rate": 4.839661576369419e-06, + "loss": 1.4644, + "step": 362 + }, + { + "epoch": 0.35334198572355613, + "grad_norm": 0.443359375, + "learning_rate": 4.838759203809295e-06, + "loss": 1.466, + "step": 363 + }, + { + "epoch": 0.35431537962362103, + "grad_norm": 0.46875, + "learning_rate": 4.837854383732918e-06, + "loss": 1.4882, + "step": 364 + }, + { + "epoch": 0.35528877352368593, + "grad_norm": 0.458984375, + "learning_rate": 4.836947117087186e-06, + "loss": 1.4511, + "step": 365 + }, + { + "epoch": 0.35626216742375083, + "grad_norm": 0.515625, + "learning_rate": 4.836037404821554e-06, + "loss": 1.4536, + "step": 366 + }, + { + "epoch": 0.35723556132381573, + "grad_norm": 0.453125, + "learning_rate": 4.83512524788804e-06, + "loss": 1.4477, + "step": 367 + }, + { + "epoch": 0.3582089552238806, + "grad_norm": 0.462890625, + "learning_rate": 4.834210647241215e-06, + "loss": 1.4668, + "step": 368 + }, + { + "epoch": 0.3591823491239455, + "grad_norm": 0.4921875, + "learning_rate": 4.8332936038382125e-06, + "loss": 1.5002, + "step": 369 + }, + { + "epoch": 0.3601557430240104, + "grad_norm": 0.46875, + "learning_rate": 4.8323741186387205e-06, + "loss": 1.4618, + "step": 370 + }, + { + "epoch": 0.3611291369240753, + "grad_norm": 0.44921875, + "learning_rate": 4.831452192604981e-06, + "loss": 1.4647, + "step": 371 + }, + { + "epoch": 0.3621025308241402, + "grad_norm": 0.451171875, + "learning_rate": 4.830527826701791e-06, + "loss": 1.4363, + "step": 372 + }, + { + "epoch": 0.3630759247242051, + "grad_norm": 0.47265625, + "learning_rate": 4.829601021896503e-06, + "loss": 1.4425, + "step": 373 + }, + { + "epoch": 0.36404931862427, + "grad_norm": 0.455078125, + "learning_rate": 4.828671779159019e-06, + "loss": 1.4738, + "step": 374 + }, + { + "epoch": 0.3650227125243348, + "grad_norm": 0.451171875, + "learning_rate": 4.827740099461793e-06, + "loss": 1.4419, + "step": 375 + }, + { + "epoch": 0.3659961064243997, + "grad_norm": 0.466796875, + "learning_rate": 4.826805983779831e-06, + "loss": 1.4411, + "step": 376 + }, + { + "epoch": 0.3669695003244646, + "grad_norm": 0.443359375, + "learning_rate": 4.825869433090686e-06, + "loss": 1.442, + "step": 377 + }, + { + "epoch": 0.3679428942245295, + "grad_norm": 0.4453125, + "learning_rate": 4.824930448374462e-06, + "loss": 1.4477, + "step": 378 + }, + { + "epoch": 0.3689162881245944, + "grad_norm": 0.44921875, + "learning_rate": 4.823989030613805e-06, + "loss": 1.4556, + "step": 379 + }, + { + "epoch": 0.3698896820246593, + "grad_norm": 0.46875, + "learning_rate": 4.823045180793914e-06, + "loss": 1.4586, + "step": 380 + }, + { + "epoch": 0.37086307592472423, + "grad_norm": 0.4453125, + "learning_rate": 4.822098899902527e-06, + "loss": 1.463, + "step": 381 + }, + { + "epoch": 0.3718364698247891, + "grad_norm": 0.4453125, + "learning_rate": 4.82115018892993e-06, + "loss": 1.4751, + "step": 382 + }, + { + "epoch": 0.372809863724854, + "grad_norm": 0.439453125, + "learning_rate": 4.8201990488689524e-06, + "loss": 1.4391, + "step": 383 + }, + { + "epoch": 0.3737832576249189, + "grad_norm": 0.455078125, + "learning_rate": 4.819245480714962e-06, + "loss": 1.4352, + "step": 384 + }, + { + "epoch": 0.3747566515249838, + "grad_norm": 0.4453125, + "learning_rate": 4.8182894854658715e-06, + "loss": 1.4346, + "step": 385 + }, + { + "epoch": 0.3757300454250487, + "grad_norm": 0.453125, + "learning_rate": 4.81733106412213e-06, + "loss": 1.4661, + "step": 386 + }, + { + "epoch": 0.3767034393251136, + "grad_norm": 0.43359375, + "learning_rate": 4.816370217686729e-06, + "loss": 1.4604, + "step": 387 + }, + { + "epoch": 0.3776768332251785, + "grad_norm": 0.419921875, + "learning_rate": 4.8154069471651956e-06, + "loss": 1.443, + "step": 388 + }, + { + "epoch": 0.3786502271252433, + "grad_norm": 0.4453125, + "learning_rate": 4.814441253565594e-06, + "loss": 1.4584, + "step": 389 + }, + { + "epoch": 0.3796236210253082, + "grad_norm": 0.431640625, + "learning_rate": 4.813473137898526e-06, + "loss": 1.4461, + "step": 390 + }, + { + "epoch": 0.3805970149253731, + "grad_norm": 0.443359375, + "learning_rate": 4.8125026011771244e-06, + "loss": 1.4514, + "step": 391 + }, + { + "epoch": 0.381570408825438, + "grad_norm": 0.435546875, + "learning_rate": 4.81152964441706e-06, + "loss": 1.4393, + "step": 392 + }, + { + "epoch": 0.3825438027255029, + "grad_norm": 0.435546875, + "learning_rate": 4.810554268636532e-06, + "loss": 1.4668, + "step": 393 + }, + { + "epoch": 0.3835171966255678, + "grad_norm": 0.421875, + "learning_rate": 4.8095764748562754e-06, + "loss": 1.4354, + "step": 394 + }, + { + "epoch": 0.3844905905256327, + "grad_norm": 0.43359375, + "learning_rate": 4.808596264099552e-06, + "loss": 1.4275, + "step": 395 + }, + { + "epoch": 0.3854639844256976, + "grad_norm": 0.4453125, + "learning_rate": 4.807613637392153e-06, + "loss": 1.4668, + "step": 396 + }, + { + "epoch": 0.38643737832576247, + "grad_norm": 0.447265625, + "learning_rate": 4.806628595762403e-06, + "loss": 1.4209, + "step": 397 + }, + { + "epoch": 0.38741077222582737, + "grad_norm": 0.435546875, + "learning_rate": 4.805641140241146e-06, + "loss": 1.4311, + "step": 398 + }, + { + "epoch": 0.38838416612589227, + "grad_norm": 0.4453125, + "learning_rate": 4.8046512718617585e-06, + "loss": 1.4645, + "step": 399 + }, + { + "epoch": 0.3893575600259572, + "grad_norm": 0.447265625, + "learning_rate": 4.80365899166014e-06, + "loss": 1.4332, + "step": 400 + }, + { + "epoch": 0.3903309539260221, + "grad_norm": 0.44140625, + "learning_rate": 4.802664300674712e-06, + "loss": 1.4599, + "step": 401 + }, + { + "epoch": 0.391304347826087, + "grad_norm": 0.451171875, + "learning_rate": 4.801667199946422e-06, + "loss": 1.45, + "step": 402 + }, + { + "epoch": 0.3922777417261519, + "grad_norm": 0.447265625, + "learning_rate": 4.800667690518737e-06, + "loss": 1.4397, + "step": 403 + }, + { + "epoch": 0.3932511356262167, + "grad_norm": 0.453125, + "learning_rate": 4.799665773437648e-06, + "loss": 1.4442, + "step": 404 + }, + { + "epoch": 0.3942245295262816, + "grad_norm": 0.439453125, + "learning_rate": 4.79866144975166e-06, + "loss": 1.4539, + "step": 405 + }, + { + "epoch": 0.3951979234263465, + "grad_norm": 0.470703125, + "learning_rate": 4.797654720511802e-06, + "loss": 1.4537, + "step": 406 + }, + { + "epoch": 0.3961713173264114, + "grad_norm": 0.458984375, + "learning_rate": 4.7966455867716165e-06, + "loss": 1.4368, + "step": 407 + }, + { + "epoch": 0.3971447112264763, + "grad_norm": 0.474609375, + "learning_rate": 4.795634049587165e-06, + "loss": 1.4435, + "step": 408 + }, + { + "epoch": 0.3981181051265412, + "grad_norm": 0.43359375, + "learning_rate": 4.794620110017025e-06, + "loss": 1.4547, + "step": 409 + }, + { + "epoch": 0.3990914990266061, + "grad_norm": 0.474609375, + "learning_rate": 4.793603769122283e-06, + "loss": 1.4532, + "step": 410 + }, + { + "epoch": 0.40006489292667097, + "grad_norm": 0.4453125, + "learning_rate": 4.792585027966544e-06, + "loss": 1.4506, + "step": 411 + }, + { + "epoch": 0.40103828682673587, + "grad_norm": 0.486328125, + "learning_rate": 4.791563887615921e-06, + "loss": 1.4642, + "step": 412 + }, + { + "epoch": 0.40201168072680077, + "grad_norm": 0.5078125, + "learning_rate": 4.790540349139041e-06, + "loss": 1.4293, + "step": 413 + }, + { + "epoch": 0.40298507462686567, + "grad_norm": 0.4375, + "learning_rate": 4.789514413607039e-06, + "loss": 1.4423, + "step": 414 + }, + { + "epoch": 0.40395846852693057, + "grad_norm": 0.4453125, + "learning_rate": 4.7884860820935574e-06, + "loss": 1.4504, + "step": 415 + }, + { + "epoch": 0.40493186242699547, + "grad_norm": 0.447265625, + "learning_rate": 4.787455355674748e-06, + "loss": 1.4428, + "step": 416 + }, + { + "epoch": 0.40590525632706037, + "grad_norm": 0.4765625, + "learning_rate": 4.786422235429269e-06, + "loss": 1.4195, + "step": 417 + }, + { + "epoch": 0.4068786502271252, + "grad_norm": 0.44921875, + "learning_rate": 4.785386722438281e-06, + "loss": 1.4364, + "step": 418 + }, + { + "epoch": 0.4078520441271901, + "grad_norm": 0.4375, + "learning_rate": 4.7843488177854516e-06, + "loss": 1.4277, + "step": 419 + }, + { + "epoch": 0.408825438027255, + "grad_norm": 0.44140625, + "learning_rate": 4.783308522556949e-06, + "loss": 1.452, + "step": 420 + }, + { + "epoch": 0.4097988319273199, + "grad_norm": 0.451171875, + "learning_rate": 4.782265837841446e-06, + "loss": 1.4367, + "step": 421 + }, + { + "epoch": 0.4107722258273848, + "grad_norm": 0.443359375, + "learning_rate": 4.781220764730113e-06, + "loss": 1.4328, + "step": 422 + }, + { + "epoch": 0.4117456197274497, + "grad_norm": 0.447265625, + "learning_rate": 4.780173304316622e-06, + "loss": 1.439, + "step": 423 + }, + { + "epoch": 0.4127190136275146, + "grad_norm": 0.447265625, + "learning_rate": 4.779123457697142e-06, + "loss": 1.4275, + "step": 424 + }, + { + "epoch": 0.4136924075275795, + "grad_norm": 0.470703125, + "learning_rate": 4.77807122597034e-06, + "loss": 1.3996, + "step": 425 + }, + { + "epoch": 0.41466580142764436, + "grad_norm": 0.462890625, + "learning_rate": 4.777016610237377e-06, + "loss": 1.4428, + "step": 426 + }, + { + "epoch": 0.41563919532770927, + "grad_norm": 0.49609375, + "learning_rate": 4.7759596116019124e-06, + "loss": 1.4351, + "step": 427 + }, + { + "epoch": 0.41661258922777417, + "grad_norm": 0.462890625, + "learning_rate": 4.774900231170096e-06, + "loss": 1.4532, + "step": 428 + }, + { + "epoch": 0.41758598312783907, + "grad_norm": 0.4453125, + "learning_rate": 4.773838470050574e-06, + "loss": 1.4287, + "step": 429 + }, + { + "epoch": 0.41855937702790397, + "grad_norm": 0.44921875, + "learning_rate": 4.772774329354479e-06, + "loss": 1.4431, + "step": 430 + }, + { + "epoch": 0.41953277092796887, + "grad_norm": 0.4609375, + "learning_rate": 4.771707810195437e-06, + "loss": 1.436, + "step": 431 + }, + { + "epoch": 0.42050616482803377, + "grad_norm": 0.490234375, + "learning_rate": 4.770638913689563e-06, + "loss": 1.4336, + "step": 432 + }, + { + "epoch": 0.4214795587280986, + "grad_norm": 0.4765625, + "learning_rate": 4.7695676409554595e-06, + "loss": 1.436, + "step": 433 + }, + { + "epoch": 0.4224529526281635, + "grad_norm": 0.458984375, + "learning_rate": 4.768493993114215e-06, + "loss": 1.4055, + "step": 434 + }, + { + "epoch": 0.4234263465282284, + "grad_norm": 0.4453125, + "learning_rate": 4.767417971289403e-06, + "loss": 1.4437, + "step": 435 + }, + { + "epoch": 0.4243997404282933, + "grad_norm": 0.4375, + "learning_rate": 4.7663395766070854e-06, + "loss": 1.4219, + "step": 436 + }, + { + "epoch": 0.4253731343283582, + "grad_norm": 0.447265625, + "learning_rate": 4.765258810195802e-06, + "loss": 1.4476, + "step": 437 + }, + { + "epoch": 0.4263465282284231, + "grad_norm": 0.453125, + "learning_rate": 4.764175673186579e-06, + "loss": 1.4434, + "step": 438 + }, + { + "epoch": 0.427319922128488, + "grad_norm": 0.431640625, + "learning_rate": 4.763090166712919e-06, + "loss": 1.4167, + "step": 439 + }, + { + "epoch": 0.42829331602855286, + "grad_norm": 0.4453125, + "learning_rate": 4.76200229191081e-06, + "loss": 1.4278, + "step": 440 + }, + { + "epoch": 0.42926670992861776, + "grad_norm": 0.443359375, + "learning_rate": 4.760912049918711e-06, + "loss": 1.4133, + "step": 441 + }, + { + "epoch": 0.43024010382868266, + "grad_norm": 0.44921875, + "learning_rate": 4.759819441877567e-06, + "loss": 1.4294, + "step": 442 + }, + { + "epoch": 0.43121349772874756, + "grad_norm": 0.443359375, + "learning_rate": 4.758724468930791e-06, + "loss": 1.4389, + "step": 443 + }, + { + "epoch": 0.43218689162881246, + "grad_norm": 0.427734375, + "learning_rate": 4.757627132224276e-06, + "loss": 1.4482, + "step": 444 + }, + { + "epoch": 0.43316028552887736, + "grad_norm": 0.42578125, + "learning_rate": 4.756527432906387e-06, + "loss": 1.4239, + "step": 445 + }, + { + "epoch": 0.43413367942894227, + "grad_norm": 0.462890625, + "learning_rate": 4.755425372127961e-06, + "loss": 1.4439, + "step": 446 + }, + { + "epoch": 0.43510707332900717, + "grad_norm": 0.451171875, + "learning_rate": 4.754320951042307e-06, + "loss": 1.4092, + "step": 447 + }, + { + "epoch": 0.436080467229072, + "grad_norm": 0.439453125, + "learning_rate": 4.753214170805205e-06, + "loss": 1.4312, + "step": 448 + }, + { + "epoch": 0.4370538611291369, + "grad_norm": 0.43359375, + "learning_rate": 4.752105032574902e-06, + "loss": 1.4296, + "step": 449 + }, + { + "epoch": 0.4380272550292018, + "grad_norm": 0.435546875, + "learning_rate": 4.7509935375121115e-06, + "loss": 1.4436, + "step": 450 + }, + { + "epoch": 0.4390006489292667, + "grad_norm": 0.46484375, + "learning_rate": 4.749879686780019e-06, + "loss": 1.4108, + "step": 451 + }, + { + "epoch": 0.4399740428293316, + "grad_norm": 0.466796875, + "learning_rate": 4.74876348154427e-06, + "loss": 1.4423, + "step": 452 + }, + { + "epoch": 0.4409474367293965, + "grad_norm": 0.4296875, + "learning_rate": 4.747644922972973e-06, + "loss": 1.4195, + "step": 453 + }, + { + "epoch": 0.4419208306294614, + "grad_norm": 0.43359375, + "learning_rate": 4.746524012236706e-06, + "loss": 1.4384, + "step": 454 + }, + { + "epoch": 0.44289422452952626, + "grad_norm": 0.43359375, + "learning_rate": 4.745400750508502e-06, + "loss": 1.4084, + "step": 455 + }, + { + "epoch": 0.44386761842959116, + "grad_norm": 0.4375, + "learning_rate": 4.7442751389638564e-06, + "loss": 1.4368, + "step": 456 + }, + { + "epoch": 0.44484101232965606, + "grad_norm": 0.447265625, + "learning_rate": 4.743147178780725e-06, + "loss": 1.4065, + "step": 457 + }, + { + "epoch": 0.44581440622972096, + "grad_norm": 0.4375, + "learning_rate": 4.7420168711395194e-06, + "loss": 1.4054, + "step": 458 + }, + { + "epoch": 0.44678780012978586, + "grad_norm": 0.421875, + "learning_rate": 4.7408842172231115e-06, + "loss": 1.3868, + "step": 459 + }, + { + "epoch": 0.44776119402985076, + "grad_norm": 0.43359375, + "learning_rate": 4.739749218216823e-06, + "loss": 1.4552, + "step": 460 + }, + { + "epoch": 0.44873458792991566, + "grad_norm": 0.451171875, + "learning_rate": 4.7386118753084325e-06, + "loss": 1.4349, + "step": 461 + }, + { + "epoch": 0.4497079818299805, + "grad_norm": 0.458984375, + "learning_rate": 4.737472189688173e-06, + "loss": 1.4277, + "step": 462 + }, + { + "epoch": 0.4506813757300454, + "grad_norm": 0.43359375, + "learning_rate": 4.736330162548729e-06, + "loss": 1.4151, + "step": 463 + }, + { + "epoch": 0.4516547696301103, + "grad_norm": 0.439453125, + "learning_rate": 4.735185795085231e-06, + "loss": 1.4178, + "step": 464 + }, + { + "epoch": 0.4526281635301752, + "grad_norm": 0.427734375, + "learning_rate": 4.734039088495265e-06, + "loss": 1.4401, + "step": 465 + }, + { + "epoch": 0.4536015574302401, + "grad_norm": 0.435546875, + "learning_rate": 4.73289004397886e-06, + "loss": 1.416, + "step": 466 + }, + { + "epoch": 0.454574951330305, + "grad_norm": 0.44140625, + "learning_rate": 4.731738662738494e-06, + "loss": 1.4224, + "step": 467 + }, + { + "epoch": 0.4555483452303699, + "grad_norm": 0.435546875, + "learning_rate": 4.73058494597909e-06, + "loss": 1.4286, + "step": 468 + }, + { + "epoch": 0.45652173913043476, + "grad_norm": 0.435546875, + "learning_rate": 4.729428894908013e-06, + "loss": 1.4023, + "step": 469 + }, + { + "epoch": 0.45749513303049966, + "grad_norm": 0.44921875, + "learning_rate": 4.728270510735076e-06, + "loss": 1.4043, + "step": 470 + }, + { + "epoch": 0.45846852693056456, + "grad_norm": 0.4296875, + "learning_rate": 4.727109794672528e-06, + "loss": 1.4357, + "step": 471 + }, + { + "epoch": 0.45944192083062946, + "grad_norm": 0.435546875, + "learning_rate": 4.725946747935062e-06, + "loss": 1.4129, + "step": 472 + }, + { + "epoch": 0.46041531473069436, + "grad_norm": 0.4609375, + "learning_rate": 4.724781371739807e-06, + "loss": 1.4292, + "step": 473 + }, + { + "epoch": 0.46138870863075926, + "grad_norm": 0.453125, + "learning_rate": 4.723613667306333e-06, + "loss": 1.4386, + "step": 474 + }, + { + "epoch": 0.46236210253082416, + "grad_norm": 0.4453125, + "learning_rate": 4.722443635856646e-06, + "loss": 1.4274, + "step": 475 + }, + { + "epoch": 0.46333549643088906, + "grad_norm": 0.421875, + "learning_rate": 4.721271278615185e-06, + "loss": 1.4315, + "step": 476 + }, + { + "epoch": 0.4643088903309539, + "grad_norm": 0.451171875, + "learning_rate": 4.720096596808824e-06, + "loss": 1.4202, + "step": 477 + }, + { + "epoch": 0.4652822842310188, + "grad_norm": 0.431640625, + "learning_rate": 4.718919591666871e-06, + "loss": 1.42, + "step": 478 + }, + { + "epoch": 0.4662556781310837, + "grad_norm": 0.44140625, + "learning_rate": 4.717740264421063e-06, + "loss": 1.4341, + "step": 479 + }, + { + "epoch": 0.4672290720311486, + "grad_norm": 0.44140625, + "learning_rate": 4.716558616305568e-06, + "loss": 1.4016, + "step": 480 + }, + { + "epoch": 0.4682024659312135, + "grad_norm": 0.455078125, + "learning_rate": 4.715374648556985e-06, + "loss": 1.4382, + "step": 481 + }, + { + "epoch": 0.4691758598312784, + "grad_norm": 0.431640625, + "learning_rate": 4.714188362414337e-06, + "loss": 1.4271, + "step": 482 + }, + { + "epoch": 0.4701492537313433, + "grad_norm": 0.43359375, + "learning_rate": 4.712999759119076e-06, + "loss": 1.43, + "step": 483 + }, + { + "epoch": 0.47112264763140815, + "grad_norm": 0.443359375, + "learning_rate": 4.7118088399150776e-06, + "loss": 1.4367, + "step": 484 + }, + { + "epoch": 0.47209604153147305, + "grad_norm": 0.44921875, + "learning_rate": 4.71061560604864e-06, + "loss": 1.4129, + "step": 485 + }, + { + "epoch": 0.47306943543153795, + "grad_norm": 0.44921875, + "learning_rate": 4.709420058768487e-06, + "loss": 1.3979, + "step": 486 + }, + { + "epoch": 0.47404282933160286, + "grad_norm": 0.44921875, + "learning_rate": 4.708222199325759e-06, + "loss": 1.418, + "step": 487 + }, + { + "epoch": 0.47501622323166776, + "grad_norm": 0.44140625, + "learning_rate": 4.70702202897402e-06, + "loss": 1.4164, + "step": 488 + }, + { + "epoch": 0.47598961713173266, + "grad_norm": 0.451171875, + "learning_rate": 4.70581954896925e-06, + "loss": 1.4043, + "step": 489 + }, + { + "epoch": 0.47696301103179756, + "grad_norm": 0.4375, + "learning_rate": 4.704614760569846e-06, + "loss": 1.4203, + "step": 490 + }, + { + "epoch": 0.4779364049318624, + "grad_norm": 0.435546875, + "learning_rate": 4.703407665036622e-06, + "loss": 1.4177, + "step": 491 + }, + { + "epoch": 0.4789097988319273, + "grad_norm": 0.462890625, + "learning_rate": 4.702198263632808e-06, + "loss": 1.4099, + "step": 492 + }, + { + "epoch": 0.4798831927319922, + "grad_norm": 0.447265625, + "learning_rate": 4.700986557624041e-06, + "loss": 1.4183, + "step": 493 + }, + { + "epoch": 0.4808565866320571, + "grad_norm": 0.42578125, + "learning_rate": 4.699772548278378e-06, + "loss": 1.4027, + "step": 494 + }, + { + "epoch": 0.481829980532122, + "grad_norm": 0.42578125, + "learning_rate": 4.6985562368662795e-06, + "loss": 1.3994, + "step": 495 + }, + { + "epoch": 0.4828033744321869, + "grad_norm": 0.4375, + "learning_rate": 4.697337624660619e-06, + "loss": 1.4274, + "step": 496 + }, + { + "epoch": 0.4837767683322518, + "grad_norm": 0.4375, + "learning_rate": 4.696116712936676e-06, + "loss": 1.4226, + "step": 497 + }, + { + "epoch": 0.48475016223231665, + "grad_norm": 0.4453125, + "learning_rate": 4.694893502972137e-06, + "loss": 1.43, + "step": 498 + }, + { + "epoch": 0.48572355613238155, + "grad_norm": 0.4296875, + "learning_rate": 4.693667996047094e-06, + "loss": 1.4295, + "step": 499 + }, + { + "epoch": 0.48669695003244645, + "grad_norm": 0.431640625, + "learning_rate": 4.692440193444043e-06, + "loss": 1.4063, + "step": 500 + }, + { + "epoch": 0.48767034393251135, + "grad_norm": 0.41796875, + "learning_rate": 4.6912100964478825e-06, + "loss": 1.4152, + "step": 501 + }, + { + "epoch": 0.48864373783257625, + "grad_norm": 0.44140625, + "learning_rate": 4.689977706345909e-06, + "loss": 1.4157, + "step": 502 + }, + { + "epoch": 0.48961713173264115, + "grad_norm": 0.451171875, + "learning_rate": 4.6887430244278235e-06, + "loss": 1.4212, + "step": 503 + }, + { + "epoch": 0.49059052563270605, + "grad_norm": 0.443359375, + "learning_rate": 4.6875060519857215e-06, + "loss": 1.4256, + "step": 504 + }, + { + "epoch": 0.49156391953277095, + "grad_norm": 0.431640625, + "learning_rate": 4.686266790314099e-06, + "loss": 1.4218, + "step": 505 + }, + { + "epoch": 0.4925373134328358, + "grad_norm": 0.431640625, + "learning_rate": 4.685025240709845e-06, + "loss": 1.4024, + "step": 506 + }, + { + "epoch": 0.4935107073329007, + "grad_norm": 0.453125, + "learning_rate": 4.683781404472243e-06, + "loss": 1.4169, + "step": 507 + }, + { + "epoch": 0.4944841012329656, + "grad_norm": 0.435546875, + "learning_rate": 4.6825352829029705e-06, + "loss": 1.4094, + "step": 508 + }, + { + "epoch": 0.4954574951330305, + "grad_norm": 0.431640625, + "learning_rate": 4.6812868773060975e-06, + "loss": 1.4059, + "step": 509 + }, + { + "epoch": 0.4964308890330954, + "grad_norm": 0.4296875, + "learning_rate": 4.6800361889880805e-06, + "loss": 1.4261, + "step": 510 + }, + { + "epoch": 0.4974042829331603, + "grad_norm": 0.455078125, + "learning_rate": 4.67878321925777e-06, + "loss": 1.4211, + "step": 511 + }, + { + "epoch": 0.4983776768332252, + "grad_norm": 0.439453125, + "learning_rate": 4.6775279694264e-06, + "loss": 1.3971, + "step": 512 + }, + { + "epoch": 0.49935107073329005, + "grad_norm": 0.439453125, + "learning_rate": 4.6762704408075925e-06, + "loss": 1.3995, + "step": 513 + }, + { + "epoch": 0.500324464633355, + "grad_norm": 0.44921875, + "learning_rate": 4.675010634717353e-06, + "loss": 1.4011, + "step": 514 + }, + { + "epoch": 0.5012978585334199, + "grad_norm": 0.44140625, + "learning_rate": 4.673748552474071e-06, + "loss": 1.4245, + "step": 515 + }, + { + "epoch": 0.5022712524334848, + "grad_norm": 0.451171875, + "learning_rate": 4.672484195398519e-06, + "loss": 1.4161, + "step": 516 + }, + { + "epoch": 0.5032446463335496, + "grad_norm": 0.427734375, + "learning_rate": 4.671217564813849e-06, + "loss": 1.4103, + "step": 517 + }, + { + "epoch": 0.5042180402336145, + "grad_norm": 0.4296875, + "learning_rate": 4.669948662045593e-06, + "loss": 1.407, + "step": 518 + }, + { + "epoch": 0.5051914341336794, + "grad_norm": 0.4453125, + "learning_rate": 4.668677488421659e-06, + "loss": 1.4174, + "step": 519 + }, + { + "epoch": 0.5061648280337443, + "grad_norm": 0.4296875, + "learning_rate": 4.667404045272334e-06, + "loss": 1.4044, + "step": 520 + }, + { + "epoch": 0.5071382219338092, + "grad_norm": 0.435546875, + "learning_rate": 4.666128333930278e-06, + "loss": 1.3874, + "step": 521 + }, + { + "epoch": 0.5081116158338741, + "grad_norm": 0.44140625, + "learning_rate": 4.664850355730526e-06, + "loss": 1.408, + "step": 522 + }, + { + "epoch": 0.509085009733939, + "grad_norm": 0.427734375, + "learning_rate": 4.663570112010485e-06, + "loss": 1.4181, + "step": 523 + }, + { + "epoch": 0.5100584036340039, + "grad_norm": 0.443359375, + "learning_rate": 4.662287604109932e-06, + "loss": 1.4218, + "step": 524 + }, + { + "epoch": 0.5110317975340688, + "grad_norm": 0.435546875, + "learning_rate": 4.661002833371014e-06, + "loss": 1.4302, + "step": 525 + }, + { + "epoch": 0.5120051914341337, + "grad_norm": 0.44140625, + "learning_rate": 4.659715801138247e-06, + "loss": 1.4138, + "step": 526 + }, + { + "epoch": 0.5129785853341986, + "grad_norm": 0.46875, + "learning_rate": 4.658426508758512e-06, + "loss": 1.3998, + "step": 527 + }, + { + "epoch": 0.5139519792342635, + "grad_norm": 0.45703125, + "learning_rate": 4.657134957581057e-06, + "loss": 1.3993, + "step": 528 + }, + { + "epoch": 0.5149253731343284, + "grad_norm": 0.4453125, + "learning_rate": 4.655841148957493e-06, + "loss": 1.4133, + "step": 529 + }, + { + "epoch": 0.5158987670343933, + "grad_norm": 0.458984375, + "learning_rate": 4.654545084241792e-06, + "loss": 1.4066, + "step": 530 + }, + { + "epoch": 0.5168721609344581, + "grad_norm": 0.462890625, + "learning_rate": 4.653246764790289e-06, + "loss": 1.3966, + "step": 531 + }, + { + "epoch": 0.517845554834523, + "grad_norm": 0.423828125, + "learning_rate": 4.6519461919616795e-06, + "loss": 1.3933, + "step": 532 + }, + { + "epoch": 0.5188189487345879, + "grad_norm": 0.423828125, + "learning_rate": 4.650643367117013e-06, + "loss": 1.4092, + "step": 533 + }, + { + "epoch": 0.5197923426346528, + "grad_norm": 0.427734375, + "learning_rate": 4.649338291619699e-06, + "loss": 1.3936, + "step": 534 + }, + { + "epoch": 0.5207657365347177, + "grad_norm": 0.44140625, + "learning_rate": 4.648030966835502e-06, + "loss": 1.3975, + "step": 535 + }, + { + "epoch": 0.5217391304347826, + "grad_norm": 0.4453125, + "learning_rate": 4.646721394132541e-06, + "loss": 1.4235, + "step": 536 + }, + { + "epoch": 0.5227125243348475, + "grad_norm": 0.4296875, + "learning_rate": 4.645409574881287e-06, + "loss": 1.3871, + "step": 537 + }, + { + "epoch": 0.5236859182349124, + "grad_norm": 0.431640625, + "learning_rate": 4.6440955104545595e-06, + "loss": 1.4192, + "step": 538 + }, + { + "epoch": 0.5246593121349773, + "grad_norm": 0.4375, + "learning_rate": 4.6427792022275296e-06, + "loss": 1.4187, + "step": 539 + }, + { + "epoch": 0.5256327060350422, + "grad_norm": 0.443359375, + "learning_rate": 4.641460651577717e-06, + "loss": 1.3986, + "step": 540 + }, + { + "epoch": 0.5266060999351071, + "grad_norm": 0.435546875, + "learning_rate": 4.640139859884989e-06, + "loss": 1.371, + "step": 541 + }, + { + "epoch": 0.527579493835172, + "grad_norm": 0.421875, + "learning_rate": 4.638816828531555e-06, + "loss": 1.4026, + "step": 542 + }, + { + "epoch": 0.5285528877352369, + "grad_norm": 0.447265625, + "learning_rate": 4.637491558901971e-06, + "loss": 1.4008, + "step": 543 + }, + { + "epoch": 0.5295262816353018, + "grad_norm": 0.423828125, + "learning_rate": 4.636164052383134e-06, + "loss": 1.3944, + "step": 544 + }, + { + "epoch": 0.5304996755353667, + "grad_norm": 0.44140625, + "learning_rate": 4.634834310364282e-06, + "loss": 1.421, + "step": 545 + }, + { + "epoch": 0.5314730694354315, + "grad_norm": 0.44140625, + "learning_rate": 4.633502334236993e-06, + "loss": 1.4209, + "step": 546 + }, + { + "epoch": 0.5324464633354964, + "grad_norm": 0.4140625, + "learning_rate": 4.632168125395183e-06, + "loss": 1.3769, + "step": 547 + }, + { + "epoch": 0.5334198572355613, + "grad_norm": 0.419921875, + "learning_rate": 4.6308316852351036e-06, + "loss": 1.4062, + "step": 548 + }, + { + "epoch": 0.5343932511356262, + "grad_norm": 0.43359375, + "learning_rate": 4.629493015155343e-06, + "loss": 1.4029, + "step": 549 + }, + { + "epoch": 0.5353666450356911, + "grad_norm": 0.435546875, + "learning_rate": 4.628152116556821e-06, + "loss": 1.4101, + "step": 550 + }, + { + "epoch": 0.536340038935756, + "grad_norm": 0.421875, + "learning_rate": 4.626808990842793e-06, + "loss": 1.4199, + "step": 551 + }, + { + "epoch": 0.5373134328358209, + "grad_norm": 0.431640625, + "learning_rate": 4.625463639418839e-06, + "loss": 1.4257, + "step": 552 + }, + { + "epoch": 0.5382868267358858, + "grad_norm": 0.419921875, + "learning_rate": 4.624116063692875e-06, + "loss": 1.3929, + "step": 553 + }, + { + "epoch": 0.5392602206359507, + "grad_norm": 0.431640625, + "learning_rate": 4.622766265075141e-06, + "loss": 1.4009, + "step": 554 + }, + { + "epoch": 0.5402336145360156, + "grad_norm": 0.427734375, + "learning_rate": 4.621414244978204e-06, + "loss": 1.3887, + "step": 555 + }, + { + "epoch": 0.5412070084360805, + "grad_norm": 0.43359375, + "learning_rate": 4.620060004816957e-06, + "loss": 1.4077, + "step": 556 + }, + { + "epoch": 0.5421804023361454, + "grad_norm": 0.43359375, + "learning_rate": 4.618703546008611e-06, + "loss": 1.4113, + "step": 557 + }, + { + "epoch": 0.5431537962362103, + "grad_norm": 0.44140625, + "learning_rate": 4.617344869972707e-06, + "loss": 1.4042, + "step": 558 + }, + { + "epoch": 0.5441271901362752, + "grad_norm": 0.439453125, + "learning_rate": 4.615983978131102e-06, + "loss": 1.4071, + "step": 559 + }, + { + "epoch": 0.54510058403634, + "grad_norm": 0.4296875, + "learning_rate": 4.614620871907969e-06, + "loss": 1.3912, + "step": 560 + }, + { + "epoch": 0.5460739779364049, + "grad_norm": 0.455078125, + "learning_rate": 4.613255552729805e-06, + "loss": 1.4088, + "step": 561 + }, + { + "epoch": 0.5470473718364698, + "grad_norm": 0.439453125, + "learning_rate": 4.611888022025417e-06, + "loss": 1.3976, + "step": 562 + }, + { + "epoch": 0.5480207657365347, + "grad_norm": 0.443359375, + "learning_rate": 4.610518281225929e-06, + "loss": 1.4268, + "step": 563 + }, + { + "epoch": 0.5489941596365996, + "grad_norm": 0.4375, + "learning_rate": 4.609146331764778e-06, + "loss": 1.3967, + "step": 564 + }, + { + "epoch": 0.5499675535366645, + "grad_norm": 0.435546875, + "learning_rate": 4.607772175077712e-06, + "loss": 1.4042, + "step": 565 + }, + { + "epoch": 0.5509409474367294, + "grad_norm": 0.482421875, + "learning_rate": 4.606395812602788e-06, + "loss": 1.4014, + "step": 566 + }, + { + "epoch": 0.5519143413367943, + "grad_norm": 0.482421875, + "learning_rate": 4.605017245780372e-06, + "loss": 1.417, + "step": 567 + }, + { + "epoch": 0.5528877352368592, + "grad_norm": 0.45703125, + "learning_rate": 4.603636476053139e-06, + "loss": 1.387, + "step": 568 + }, + { + "epoch": 0.5538611291369241, + "grad_norm": 0.443359375, + "learning_rate": 4.602253504866066e-06, + "loss": 1.4108, + "step": 569 + }, + { + "epoch": 0.554834523036989, + "grad_norm": 0.4453125, + "learning_rate": 4.600868333666434e-06, + "loss": 1.4, + "step": 570 + }, + { + "epoch": 0.5558079169370539, + "grad_norm": 0.462890625, + "learning_rate": 4.5994809639038285e-06, + "loss": 1.398, + "step": 571 + }, + { + "epoch": 0.5567813108371188, + "grad_norm": 0.4375, + "learning_rate": 4.598091397030136e-06, + "loss": 1.3986, + "step": 572 + }, + { + "epoch": 0.5577547047371837, + "grad_norm": 0.453125, + "learning_rate": 4.596699634499538e-06, + "loss": 1.3918, + "step": 573 + }, + { + "epoch": 0.5587280986372486, + "grad_norm": 0.439453125, + "learning_rate": 4.59530567776852e-06, + "loss": 1.3921, + "step": 574 + }, + { + "epoch": 0.5597014925373134, + "grad_norm": 0.453125, + "learning_rate": 4.593909528295859e-06, + "loss": 1.3768, + "step": 575 + }, + { + "epoch": 0.5606748864373783, + "grad_norm": 0.462890625, + "learning_rate": 4.5925111875426285e-06, + "loss": 1.4077, + "step": 576 + }, + { + "epoch": 0.5616482803374432, + "grad_norm": 0.462890625, + "learning_rate": 4.591110656972195e-06, + "loss": 1.411, + "step": 577 + }, + { + "epoch": 0.5626216742375081, + "grad_norm": 0.4296875, + "learning_rate": 4.589707938050216e-06, + "loss": 1.3958, + "step": 578 + }, + { + "epoch": 0.563595068137573, + "grad_norm": 0.431640625, + "learning_rate": 4.588303032244641e-06, + "loss": 1.4149, + "step": 579 + }, + { + "epoch": 0.5645684620376379, + "grad_norm": 0.427734375, + "learning_rate": 4.586895941025705e-06, + "loss": 1.3885, + "step": 580 + }, + { + "epoch": 0.5655418559377028, + "grad_norm": 0.4453125, + "learning_rate": 4.585486665865933e-06, + "loss": 1.3821, + "step": 581 + }, + { + "epoch": 0.5665152498377677, + "grad_norm": 0.4375, + "learning_rate": 4.584075208240135e-06, + "loss": 1.3888, + "step": 582 + }, + { + "epoch": 0.5674886437378326, + "grad_norm": 0.42578125, + "learning_rate": 4.5826615696254026e-06, + "loss": 1.374, + "step": 583 + }, + { + "epoch": 0.5684620376378975, + "grad_norm": 0.443359375, + "learning_rate": 4.581245751501113e-06, + "loss": 1.3988, + "step": 584 + }, + { + "epoch": 0.5694354315379624, + "grad_norm": 0.435546875, + "learning_rate": 4.579827755348921e-06, + "loss": 1.3786, + "step": 585 + }, + { + "epoch": 0.5704088254380273, + "grad_norm": 0.435546875, + "learning_rate": 4.578407582652764e-06, + "loss": 1.3826, + "step": 586 + }, + { + "epoch": 0.5713822193380922, + "grad_norm": 0.44140625, + "learning_rate": 4.576985234898855e-06, + "loss": 1.3793, + "step": 587 + }, + { + "epoch": 0.5723556132381571, + "grad_norm": 0.439453125, + "learning_rate": 4.575560713575684e-06, + "loss": 1.4009, + "step": 588 + }, + { + "epoch": 0.5733290071382219, + "grad_norm": 0.466796875, + "learning_rate": 4.5741340201740146e-06, + "loss": 1.4198, + "step": 589 + }, + { + "epoch": 0.5743024010382868, + "grad_norm": 0.4609375, + "learning_rate": 4.572705156186886e-06, + "loss": 1.4197, + "step": 590 + }, + { + "epoch": 0.5752757949383517, + "grad_norm": 0.447265625, + "learning_rate": 4.571274123109606e-06, + "loss": 1.3729, + "step": 591 + }, + { + "epoch": 0.5762491888384166, + "grad_norm": 0.421875, + "learning_rate": 4.569840922439753e-06, + "loss": 1.3909, + "step": 592 + }, + { + "epoch": 0.5772225827384815, + "grad_norm": 0.43359375, + "learning_rate": 4.568405555677177e-06, + "loss": 1.3993, + "step": 593 + }, + { + "epoch": 0.5781959766385464, + "grad_norm": 0.474609375, + "learning_rate": 4.566968024323989e-06, + "loss": 1.405, + "step": 594 + }, + { + "epoch": 0.5791693705386113, + "grad_norm": 0.439453125, + "learning_rate": 4.565528329884571e-06, + "loss": 1.4054, + "step": 595 + }, + { + "epoch": 0.5801427644386762, + "grad_norm": 0.4453125, + "learning_rate": 4.564086473865565e-06, + "loss": 1.3597, + "step": 596 + }, + { + "epoch": 0.5811161583387411, + "grad_norm": 0.4375, + "learning_rate": 4.562642457775876e-06, + "loss": 1.3909, + "step": 597 + }, + { + "epoch": 0.582089552238806, + "grad_norm": 0.453125, + "learning_rate": 4.56119628312667e-06, + "loss": 1.4039, + "step": 598 + }, + { + "epoch": 0.5830629461388709, + "grad_norm": 0.427734375, + "learning_rate": 4.559747951431372e-06, + "loss": 1.395, + "step": 599 + }, + { + "epoch": 0.5840363400389358, + "grad_norm": 0.44140625, + "learning_rate": 4.558297464205666e-06, + "loss": 1.3963, + "step": 600 + }, + { + "epoch": 0.5850097339390007, + "grad_norm": 0.44140625, + "learning_rate": 4.556844822967486e-06, + "loss": 1.4153, + "step": 601 + }, + { + "epoch": 0.5859831278390656, + "grad_norm": 0.453125, + "learning_rate": 4.555390029237026e-06, + "loss": 1.3682, + "step": 602 + }, + { + "epoch": 0.5869565217391305, + "grad_norm": 0.44140625, + "learning_rate": 4.55393308453673e-06, + "loss": 1.4046, + "step": 603 + }, + { + "epoch": 0.5879299156391953, + "grad_norm": 0.439453125, + "learning_rate": 4.552473990391294e-06, + "loss": 1.3929, + "step": 604 + }, + { + "epoch": 0.5889033095392602, + "grad_norm": 0.4140625, + "learning_rate": 4.551012748327663e-06, + "loss": 1.3667, + "step": 605 + }, + { + "epoch": 0.5898767034393251, + "grad_norm": 0.44921875, + "learning_rate": 4.549549359875031e-06, + "loss": 1.3875, + "step": 606 + }, + { + "epoch": 0.59085009733939, + "grad_norm": 0.427734375, + "learning_rate": 4.548083826564834e-06, + "loss": 1.3889, + "step": 607 + }, + { + "epoch": 0.5918234912394549, + "grad_norm": 0.431640625, + "learning_rate": 4.546616149930758e-06, + "loss": 1.3711, + "step": 608 + }, + { + "epoch": 0.5927968851395198, + "grad_norm": 0.431640625, + "learning_rate": 4.54514633150873e-06, + "loss": 1.3763, + "step": 609 + }, + { + "epoch": 0.5937702790395847, + "grad_norm": 0.421875, + "learning_rate": 4.543674372836915e-06, + "loss": 1.3942, + "step": 610 + }, + { + "epoch": 0.5947436729396496, + "grad_norm": 0.42578125, + "learning_rate": 4.542200275455724e-06, + "loss": 1.3893, + "step": 611 + }, + { + "epoch": 0.5957170668397145, + "grad_norm": 0.423828125, + "learning_rate": 4.5407240409078e-06, + "loss": 1.3826, + "step": 612 + }, + { + "epoch": 0.5966904607397794, + "grad_norm": 0.431640625, + "learning_rate": 4.539245670738029e-06, + "loss": 1.3885, + "step": 613 + }, + { + "epoch": 0.5976638546398443, + "grad_norm": 0.431640625, + "learning_rate": 4.537765166493524e-06, + "loss": 1.4051, + "step": 614 + }, + { + "epoch": 0.5986372485399092, + "grad_norm": 0.44140625, + "learning_rate": 4.5362825297236394e-06, + "loss": 1.3893, + "step": 615 + }, + { + "epoch": 0.5996106424399741, + "grad_norm": 0.4375, + "learning_rate": 4.534797761979955e-06, + "loss": 1.3847, + "step": 616 + }, + { + "epoch": 0.600584036340039, + "grad_norm": 0.423828125, + "learning_rate": 4.533310864816286e-06, + "loss": 1.3936, + "step": 617 + }, + { + "epoch": 0.6015574302401038, + "grad_norm": 0.42578125, + "learning_rate": 4.531821839788671e-06, + "loss": 1.383, + "step": 618 + }, + { + "epoch": 0.6025308241401687, + "grad_norm": 0.427734375, + "learning_rate": 4.5303306884553785e-06, + "loss": 1.3955, + "step": 619 + }, + { + "epoch": 0.6035042180402336, + "grad_norm": 0.43359375, + "learning_rate": 4.528837412376902e-06, + "loss": 1.3963, + "step": 620 + }, + { + "epoch": 0.6044776119402985, + "grad_norm": 0.431640625, + "learning_rate": 4.527342013115956e-06, + "loss": 1.3767, + "step": 621 + }, + { + "epoch": 0.6054510058403634, + "grad_norm": 0.4296875, + "learning_rate": 4.525844492237481e-06, + "loss": 1.3808, + "step": 622 + }, + { + "epoch": 0.6064243997404283, + "grad_norm": 0.435546875, + "learning_rate": 4.524344851308635e-06, + "loss": 1.3882, + "step": 623 + }, + { + "epoch": 0.6073977936404932, + "grad_norm": 0.423828125, + "learning_rate": 4.522843091898795e-06, + "loss": 1.3824, + "step": 624 + }, + { + "epoch": 0.6083711875405581, + "grad_norm": 0.42578125, + "learning_rate": 4.521339215579555e-06, + "loss": 1.3764, + "step": 625 + }, + { + "epoch": 0.609344581440623, + "grad_norm": 0.439453125, + "learning_rate": 4.519833223924725e-06, + "loss": 1.3805, + "step": 626 + }, + { + "epoch": 0.6103179753406879, + "grad_norm": 0.439453125, + "learning_rate": 4.518325118510328e-06, + "loss": 1.3689, + "step": 627 + }, + { + "epoch": 0.6112913692407528, + "grad_norm": 0.4375, + "learning_rate": 4.516814900914601e-06, + "loss": 1.386, + "step": 628 + }, + { + "epoch": 0.6122647631408177, + "grad_norm": 0.431640625, + "learning_rate": 4.515302572717987e-06, + "loss": 1.4075, + "step": 629 + }, + { + "epoch": 0.6132381570408826, + "grad_norm": 0.421875, + "learning_rate": 4.513788135503142e-06, + "loss": 1.3808, + "step": 630 + }, + { + "epoch": 0.6142115509409475, + "grad_norm": 0.43359375, + "learning_rate": 4.512271590854929e-06, + "loss": 1.3767, + "step": 631 + }, + { + "epoch": 0.6151849448410124, + "grad_norm": 0.431640625, + "learning_rate": 4.5107529403604126e-06, + "loss": 1.3818, + "step": 632 + }, + { + "epoch": 0.6161583387410772, + "grad_norm": 0.42578125, + "learning_rate": 4.509232185608864e-06, + "loss": 1.3886, + "step": 633 + }, + { + "epoch": 0.6171317326411421, + "grad_norm": 0.423828125, + "learning_rate": 4.507709328191758e-06, + "loss": 1.3751, + "step": 634 + }, + { + "epoch": 0.618105126541207, + "grad_norm": 0.435546875, + "learning_rate": 4.506184369702766e-06, + "loss": 1.3608, + "step": 635 + }, + { + "epoch": 0.6190785204412719, + "grad_norm": 0.4296875, + "learning_rate": 4.5046573117377616e-06, + "loss": 1.3771, + "step": 636 + }, + { + "epoch": 0.6200519143413368, + "grad_norm": 0.4375, + "learning_rate": 4.503128155894812e-06, + "loss": 1.3854, + "step": 637 + }, + { + "epoch": 0.6210253082414017, + "grad_norm": 0.439453125, + "learning_rate": 4.501596903774184e-06, + "loss": 1.3916, + "step": 638 + }, + { + "epoch": 0.6219987021414666, + "grad_norm": 0.470703125, + "learning_rate": 4.5000635569783365e-06, + "loss": 1.3768, + "step": 639 + }, + { + "epoch": 0.6229720960415315, + "grad_norm": 0.419921875, + "learning_rate": 4.498528117111918e-06, + "loss": 1.3629, + "step": 640 + }, + { + "epoch": 0.6239454899415964, + "grad_norm": 0.41796875, + "learning_rate": 4.49699058578177e-06, + "loss": 1.3698, + "step": 641 + }, + { + "epoch": 0.6249188838416613, + "grad_norm": 0.4375, + "learning_rate": 4.495450964596923e-06, + "loss": 1.3571, + "step": 642 + }, + { + "epoch": 0.6258922777417262, + "grad_norm": 0.4453125, + "learning_rate": 4.493909255168592e-06, + "loss": 1.3787, + "step": 643 + }, + { + "epoch": 0.6268656716417911, + "grad_norm": 0.44140625, + "learning_rate": 4.492365459110182e-06, + "loss": 1.3611, + "step": 644 + }, + { + "epoch": 0.627839065541856, + "grad_norm": 0.42578125, + "learning_rate": 4.490819578037275e-06, + "loss": 1.3921, + "step": 645 + }, + { + "epoch": 0.6288124594419209, + "grad_norm": 0.41796875, + "learning_rate": 4.4892716135676415e-06, + "loss": 1.4044, + "step": 646 + }, + { + "epoch": 0.6297858533419857, + "grad_norm": 0.431640625, + "learning_rate": 4.487721567321229e-06, + "loss": 1.3714, + "step": 647 + }, + { + "epoch": 0.6307592472420506, + "grad_norm": 0.431640625, + "learning_rate": 4.486169440920163e-06, + "loss": 1.3805, + "step": 648 + }, + { + "epoch": 0.6317326411421155, + "grad_norm": 0.443359375, + "learning_rate": 4.484615235988747e-06, + "loss": 1.3888, + "step": 649 + }, + { + "epoch": 0.6327060350421804, + "grad_norm": 0.412109375, + "learning_rate": 4.4830589541534615e-06, + "loss": 1.3453, + "step": 650 + }, + { + "epoch": 0.6336794289422453, + "grad_norm": 0.42578125, + "learning_rate": 4.481500597042956e-06, + "loss": 1.4119, + "step": 651 + }, + { + "epoch": 0.6346528228423102, + "grad_norm": 0.443359375, + "learning_rate": 4.479940166288056e-06, + "loss": 1.3965, + "step": 652 + }, + { + "epoch": 0.6356262167423751, + "grad_norm": 0.439453125, + "learning_rate": 4.4783776635217555e-06, + "loss": 1.4012, + "step": 653 + }, + { + "epoch": 0.63659961064244, + "grad_norm": 0.427734375, + "learning_rate": 4.476813090379216e-06, + "loss": 1.3946, + "step": 654 + }, + { + "epoch": 0.6375730045425049, + "grad_norm": 0.427734375, + "learning_rate": 4.475246448497766e-06, + "loss": 1.3893, + "step": 655 + }, + { + "epoch": 0.6385463984425698, + "grad_norm": 0.44921875, + "learning_rate": 4.473677739516901e-06, + "loss": 1.3874, + "step": 656 + }, + { + "epoch": 0.6395197923426347, + "grad_norm": 0.4296875, + "learning_rate": 4.472106965078277e-06, + "loss": 1.3767, + "step": 657 + }, + { + "epoch": 0.6404931862426996, + "grad_norm": 0.427734375, + "learning_rate": 4.470534126825714e-06, + "loss": 1.3864, + "step": 658 + }, + { + "epoch": 0.6414665801427645, + "grad_norm": 0.439453125, + "learning_rate": 4.468959226405188e-06, + "loss": 1.3849, + "step": 659 + }, + { + "epoch": 0.6424399740428294, + "grad_norm": 0.423828125, + "learning_rate": 4.467382265464838e-06, + "loss": 1.3789, + "step": 660 + }, + { + "epoch": 0.6434133679428943, + "grad_norm": 0.41796875, + "learning_rate": 4.465803245654955e-06, + "loss": 1.3473, + "step": 661 + }, + { + "epoch": 0.6443867618429591, + "grad_norm": 0.416015625, + "learning_rate": 4.464222168627987e-06, + "loss": 1.3614, + "step": 662 + }, + { + "epoch": 0.645360155743024, + "grad_norm": 0.4296875, + "learning_rate": 4.462639036038536e-06, + "loss": 1.368, + "step": 663 + }, + { + "epoch": 0.6463335496430889, + "grad_norm": 0.451171875, + "learning_rate": 4.461053849543351e-06, + "loss": 1.3965, + "step": 664 + }, + { + "epoch": 0.6473069435431538, + "grad_norm": 0.43359375, + "learning_rate": 4.459466610801333e-06, + "loss": 1.3626, + "step": 665 + }, + { + "epoch": 0.6482803374432187, + "grad_norm": 0.421875, + "learning_rate": 4.457877321473532e-06, + "loss": 1.3897, + "step": 666 + }, + { + "epoch": 0.6492537313432836, + "grad_norm": 0.431640625, + "learning_rate": 4.456285983223143e-06, + "loss": 1.3792, + "step": 667 + }, + { + "epoch": 0.6502271252433485, + "grad_norm": 0.431640625, + "learning_rate": 4.454692597715502e-06, + "loss": 1.3975, + "step": 668 + }, + { + "epoch": 0.6512005191434134, + "grad_norm": 0.443359375, + "learning_rate": 4.4530971666180925e-06, + "loss": 1.3743, + "step": 669 + }, + { + "epoch": 0.6521739130434783, + "grad_norm": 0.4296875, + "learning_rate": 4.451499691600536e-06, + "loss": 1.3572, + "step": 670 + }, + { + "epoch": 0.6531473069435432, + "grad_norm": 0.431640625, + "learning_rate": 4.449900174334592e-06, + "loss": 1.3806, + "step": 671 + }, + { + "epoch": 0.6541207008436081, + "grad_norm": 0.4375, + "learning_rate": 4.44829861649416e-06, + "loss": 1.3852, + "step": 672 + }, + { + "epoch": 0.655094094743673, + "grad_norm": 0.412109375, + "learning_rate": 4.446695019755274e-06, + "loss": 1.3605, + "step": 673 + }, + { + "epoch": 0.6560674886437379, + "grad_norm": 0.419921875, + "learning_rate": 4.445089385796099e-06, + "loss": 1.3732, + "step": 674 + }, + { + "epoch": 0.6570408825438028, + "grad_norm": 0.42578125, + "learning_rate": 4.443481716296936e-06, + "loss": 1.3804, + "step": 675 + }, + { + "epoch": 0.6580142764438677, + "grad_norm": 0.423828125, + "learning_rate": 4.4418720129402145e-06, + "loss": 1.3737, + "step": 676 + }, + { + "epoch": 0.6589876703439325, + "grad_norm": 0.43359375, + "learning_rate": 4.440260277410491e-06, + "loss": 1.3599, + "step": 677 + }, + { + "epoch": 0.6599610642439974, + "grad_norm": 0.435546875, + "learning_rate": 4.438646511394451e-06, + "loss": 1.3439, + "step": 678 + }, + { + "epoch": 0.6609344581440623, + "grad_norm": 0.423828125, + "learning_rate": 4.437030716580904e-06, + "loss": 1.3733, + "step": 679 + }, + { + "epoch": 0.6619078520441272, + "grad_norm": 0.41796875, + "learning_rate": 4.435412894660782e-06, + "loss": 1.365, + "step": 680 + }, + { + "epoch": 0.6628812459441921, + "grad_norm": 0.40625, + "learning_rate": 4.433793047327138e-06, + "loss": 1.3667, + "step": 681 + }, + { + "epoch": 0.663854639844257, + "grad_norm": 0.44140625, + "learning_rate": 4.432171176275149e-06, + "loss": 1.3674, + "step": 682 + }, + { + "epoch": 0.6648280337443219, + "grad_norm": 0.443359375, + "learning_rate": 4.430547283202103e-06, + "loss": 1.3543, + "step": 683 + }, + { + "epoch": 0.6658014276443868, + "grad_norm": 0.423828125, + "learning_rate": 4.428921369807407e-06, + "loss": 1.3757, + "step": 684 + }, + { + "epoch": 0.6667748215444517, + "grad_norm": 0.427734375, + "learning_rate": 4.427293437792585e-06, + "loss": 1.371, + "step": 685 + }, + { + "epoch": 0.6677482154445166, + "grad_norm": 0.421875, + "learning_rate": 4.42566348886127e-06, + "loss": 1.3876, + "step": 686 + }, + { + "epoch": 0.6687216093445815, + "grad_norm": 0.41796875, + "learning_rate": 4.424031524719208e-06, + "loss": 1.3651, + "step": 687 + }, + { + "epoch": 0.6696950032446464, + "grad_norm": 0.42578125, + "learning_rate": 4.422397547074252e-06, + "loss": 1.3677, + "step": 688 + }, + { + "epoch": 0.6706683971447113, + "grad_norm": 0.4296875, + "learning_rate": 4.420761557636362e-06, + "loss": 1.3676, + "step": 689 + }, + { + "epoch": 0.6716417910447762, + "grad_norm": 0.443359375, + "learning_rate": 4.419123558117605e-06, + "loss": 1.3757, + "step": 690 + }, + { + "epoch": 0.672615184944841, + "grad_norm": 0.41796875, + "learning_rate": 4.417483550232151e-06, + "loss": 1.3686, + "step": 691 + }, + { + "epoch": 0.6735885788449059, + "grad_norm": 0.42578125, + "learning_rate": 4.415841535696271e-06, + "loss": 1.3566, + "step": 692 + }, + { + "epoch": 0.6745619727449708, + "grad_norm": 0.412109375, + "learning_rate": 4.414197516228338e-06, + "loss": 1.3666, + "step": 693 + }, + { + "epoch": 0.6755353666450357, + "grad_norm": 0.4140625, + "learning_rate": 4.41255149354882e-06, + "loss": 1.3482, + "step": 694 + }, + { + "epoch": 0.6765087605451006, + "grad_norm": 0.423828125, + "learning_rate": 4.410903469380284e-06, + "loss": 1.3735, + "step": 695 + }, + { + "epoch": 0.6774821544451655, + "grad_norm": 0.416015625, + "learning_rate": 4.40925344544739e-06, + "loss": 1.3773, + "step": 696 + }, + { + "epoch": 0.6784555483452304, + "grad_norm": 0.419921875, + "learning_rate": 4.407601423476893e-06, + "loss": 1.3859, + "step": 697 + }, + { + "epoch": 0.6794289422452953, + "grad_norm": 0.41796875, + "learning_rate": 4.405947405197635e-06, + "loss": 1.3765, + "step": 698 + }, + { + "epoch": 0.6804023361453602, + "grad_norm": 0.4375, + "learning_rate": 4.404291392340551e-06, + "loss": 1.3512, + "step": 699 + }, + { + "epoch": 0.6813757300454251, + "grad_norm": 0.41796875, + "learning_rate": 4.402633386638662e-06, + "loss": 1.3713, + "step": 700 + }, + { + "epoch": 0.68234912394549, + "grad_norm": 0.431640625, + "learning_rate": 4.400973389827072e-06, + "loss": 1.3983, + "step": 701 + }, + { + "epoch": 0.6833225178455549, + "grad_norm": 0.42578125, + "learning_rate": 4.399311403642975e-06, + "loss": 1.3858, + "step": 702 + }, + { + "epoch": 0.6842959117456198, + "grad_norm": 0.453125, + "learning_rate": 4.3976474298256395e-06, + "loss": 1.3683, + "step": 703 + }, + { + "epoch": 0.6852693056456847, + "grad_norm": 0.447265625, + "learning_rate": 4.395981470116419e-06, + "loss": 1.3935, + "step": 704 + }, + { + "epoch": 0.6862426995457496, + "grad_norm": 0.423828125, + "learning_rate": 4.394313526258743e-06, + "loss": 1.373, + "step": 705 + }, + { + "epoch": 0.6872160934458144, + "grad_norm": 0.435546875, + "learning_rate": 4.3926435999981194e-06, + "loss": 1.3831, + "step": 706 + }, + { + "epoch": 0.6881894873458793, + "grad_norm": 0.435546875, + "learning_rate": 4.390971693082128e-06, + "loss": 1.3843, + "step": 707 + }, + { + "epoch": 0.6891628812459442, + "grad_norm": 0.443359375, + "learning_rate": 4.3892978072604235e-06, + "loss": 1.3679, + "step": 708 + }, + { + "epoch": 0.6901362751460091, + "grad_norm": 0.423828125, + "learning_rate": 4.38762194428473e-06, + "loss": 1.3644, + "step": 709 + }, + { + "epoch": 0.691109669046074, + "grad_norm": 0.41796875, + "learning_rate": 4.3859441059088435e-06, + "loss": 1.3766, + "step": 710 + }, + { + "epoch": 0.6920830629461389, + "grad_norm": 0.453125, + "learning_rate": 4.384264293888624e-06, + "loss": 1.3654, + "step": 711 + }, + { + "epoch": 0.6930564568462038, + "grad_norm": 0.427734375, + "learning_rate": 4.382582509981996e-06, + "loss": 1.3694, + "step": 712 + }, + { + "epoch": 0.6940298507462687, + "grad_norm": 0.42578125, + "learning_rate": 4.3808987559489536e-06, + "loss": 1.3698, + "step": 713 + }, + { + "epoch": 0.6950032446463336, + "grad_norm": 0.44140625, + "learning_rate": 4.379213033551547e-06, + "loss": 1.3681, + "step": 714 + }, + { + "epoch": 0.6959766385463985, + "grad_norm": 0.427734375, + "learning_rate": 4.377525344553888e-06, + "loss": 1.3757, + "step": 715 + }, + { + "epoch": 0.6969500324464634, + "grad_norm": 0.41796875, + "learning_rate": 4.375835690722147e-06, + "loss": 1.3604, + "step": 716 + }, + { + "epoch": 0.6979234263465283, + "grad_norm": 0.44140625, + "learning_rate": 4.374144073824549e-06, + "loss": 1.3715, + "step": 717 + }, + { + "epoch": 0.6988968202465932, + "grad_norm": 0.447265625, + "learning_rate": 4.372450495631376e-06, + "loss": 1.3486, + "step": 718 + }, + { + "epoch": 0.6998702141466581, + "grad_norm": 0.455078125, + "learning_rate": 4.3707549579149605e-06, + "loss": 1.3803, + "step": 719 + }, + { + "epoch": 0.7008436080467229, + "grad_norm": 0.427734375, + "learning_rate": 4.369057462449686e-06, + "loss": 1.3779, + "step": 720 + }, + { + "epoch": 0.7018170019467878, + "grad_norm": 0.4375, + "learning_rate": 4.367358011011985e-06, + "loss": 1.375, + "step": 721 + }, + { + "epoch": 0.7027903958468527, + "grad_norm": 0.42578125, + "learning_rate": 4.365656605380338e-06, + "loss": 1.351, + "step": 722 + }, + { + "epoch": 0.7037637897469176, + "grad_norm": 0.44921875, + "learning_rate": 4.363953247335267e-06, + "loss": 1.3634, + "step": 723 + }, + { + "epoch": 0.7047371836469825, + "grad_norm": 0.447265625, + "learning_rate": 4.362247938659342e-06, + "loss": 1.3717, + "step": 724 + }, + { + "epoch": 0.7057105775470474, + "grad_norm": 0.43359375, + "learning_rate": 4.36054068113717e-06, + "loss": 1.3568, + "step": 725 + }, + { + "epoch": 0.7066839714471123, + "grad_norm": 0.44921875, + "learning_rate": 4.358831476555401e-06, + "loss": 1.3884, + "step": 726 + }, + { + "epoch": 0.7076573653471772, + "grad_norm": 0.4453125, + "learning_rate": 4.357120326702721e-06, + "loss": 1.3781, + "step": 727 + }, + { + "epoch": 0.7086307592472421, + "grad_norm": 0.435546875, + "learning_rate": 4.35540723336985e-06, + "loss": 1.3788, + "step": 728 + }, + { + "epoch": 0.709604153147307, + "grad_norm": 0.43359375, + "learning_rate": 4.353692198349547e-06, + "loss": 1.3691, + "step": 729 + }, + { + "epoch": 0.7105775470473719, + "grad_norm": 0.431640625, + "learning_rate": 4.351975223436597e-06, + "loss": 1.3503, + "step": 730 + }, + { + "epoch": 0.7115509409474368, + "grad_norm": 0.443359375, + "learning_rate": 4.3502563104278175e-06, + "loss": 1.3927, + "step": 731 + }, + { + "epoch": 0.7125243348475017, + "grad_norm": 0.431640625, + "learning_rate": 4.3485354611220555e-06, + "loss": 1.366, + "step": 732 + }, + { + "epoch": 0.7134977287475666, + "grad_norm": 0.46484375, + "learning_rate": 4.346812677320183e-06, + "loss": 1.3639, + "step": 733 + }, + { + "epoch": 0.7144711226476315, + "grad_norm": 0.455078125, + "learning_rate": 4.345087960825098e-06, + "loss": 1.3415, + "step": 734 + }, + { + "epoch": 0.7154445165476963, + "grad_norm": 0.453125, + "learning_rate": 4.343361313441717e-06, + "loss": 1.3652, + "step": 735 + }, + { + "epoch": 0.7164179104477612, + "grad_norm": 0.4375, + "learning_rate": 4.3416327369769824e-06, + "loss": 1.3618, + "step": 736 + }, + { + "epoch": 0.717391304347826, + "grad_norm": 0.421875, + "learning_rate": 4.339902233239853e-06, + "loss": 1.366, + "step": 737 + }, + { + "epoch": 0.718364698247891, + "grad_norm": 0.435546875, + "learning_rate": 4.338169804041303e-06, + "loss": 1.3752, + "step": 738 + }, + { + "epoch": 0.7193380921479559, + "grad_norm": 0.431640625, + "learning_rate": 4.336435451194324e-06, + "loss": 1.337, + "step": 739 + }, + { + "epoch": 0.7203114860480208, + "grad_norm": 0.439453125, + "learning_rate": 4.334699176513919e-06, + "loss": 1.3579, + "step": 740 + }, + { + "epoch": 0.7212848799480857, + "grad_norm": 0.455078125, + "learning_rate": 4.3329609818171035e-06, + "loss": 1.3837, + "step": 741 + }, + { + "epoch": 0.7222582738481506, + "grad_norm": 0.419921875, + "learning_rate": 4.3312208689229026e-06, + "loss": 1.3677, + "step": 742 + }, + { + "epoch": 0.7232316677482155, + "grad_norm": 0.423828125, + "learning_rate": 4.3294788396523465e-06, + "loss": 1.3692, + "step": 743 + }, + { + "epoch": 0.7242050616482804, + "grad_norm": 0.4296875, + "learning_rate": 4.327734895828473e-06, + "loss": 1.3583, + "step": 744 + }, + { + "epoch": 0.7251784555483453, + "grad_norm": 0.42578125, + "learning_rate": 4.325989039276323e-06, + "loss": 1.353, + "step": 745 + }, + { + "epoch": 0.7261518494484102, + "grad_norm": 0.421875, + "learning_rate": 4.324241271822939e-06, + "loss": 1.3476, + "step": 746 + }, + { + "epoch": 0.7271252433484751, + "grad_norm": 0.45703125, + "learning_rate": 4.322491595297363e-06, + "loss": 1.3385, + "step": 747 + }, + { + "epoch": 0.72809863724854, + "grad_norm": 0.4296875, + "learning_rate": 4.320740011530634e-06, + "loss": 1.3517, + "step": 748 + }, + { + "epoch": 0.7290720311486047, + "grad_norm": 0.435546875, + "learning_rate": 4.31898652235579e-06, + "loss": 1.3732, + "step": 749 + }, + { + "epoch": 0.7300454250486696, + "grad_norm": 0.41796875, + "learning_rate": 4.317231129607859e-06, + "loss": 1.3647, + "step": 750 + }, + { + "epoch": 0.7310188189487346, + "grad_norm": 0.43359375, + "learning_rate": 4.3154738351238655e-06, + "loss": 1.3595, + "step": 751 + }, + { + "epoch": 0.7319922128487995, + "grad_norm": 0.423828125, + "learning_rate": 4.3137146407428196e-06, + "loss": 1.3575, + "step": 752 + }, + { + "epoch": 0.7329656067488644, + "grad_norm": 0.431640625, + "learning_rate": 4.311953548305722e-06, + "loss": 1.3698, + "step": 753 + }, + { + "epoch": 0.7339390006489293, + "grad_norm": 0.42578125, + "learning_rate": 4.31019055965556e-06, + "loss": 1.3651, + "step": 754 + }, + { + "epoch": 0.7349123945489942, + "grad_norm": 0.4375, + "learning_rate": 4.3084256766373056e-06, + "loss": 1.3609, + "step": 755 + }, + { + "epoch": 0.735885788449059, + "grad_norm": 0.431640625, + "learning_rate": 4.306658901097911e-06, + "loss": 1.3398, + "step": 756 + }, + { + "epoch": 0.736859182349124, + "grad_norm": 0.419921875, + "learning_rate": 4.3048902348863116e-06, + "loss": 1.3856, + "step": 757 + }, + { + "epoch": 0.7378325762491889, + "grad_norm": 0.421875, + "learning_rate": 4.303119679853419e-06, + "loss": 1.3818, + "step": 758 + }, + { + "epoch": 0.7388059701492538, + "grad_norm": 0.427734375, + "learning_rate": 4.3013472378521236e-06, + "loss": 1.3709, + "step": 759 + }, + { + "epoch": 0.7397793640493187, + "grad_norm": 0.4296875, + "learning_rate": 4.299572910737289e-06, + "loss": 1.3466, + "step": 760 + }, + { + "epoch": 0.7407527579493836, + "grad_norm": 0.427734375, + "learning_rate": 4.297796700365752e-06, + "loss": 1.3511, + "step": 761 + }, + { + "epoch": 0.7417261518494485, + "grad_norm": 0.408203125, + "learning_rate": 4.296018608596321e-06, + "loss": 1.3448, + "step": 762 + }, + { + "epoch": 0.7426995457495134, + "grad_norm": 0.41796875, + "learning_rate": 4.294238637289772e-06, + "loss": 1.3608, + "step": 763 + }, + { + "epoch": 0.7436729396495781, + "grad_norm": 0.43359375, + "learning_rate": 4.29245678830885e-06, + "loss": 1.3773, + "step": 764 + }, + { + "epoch": 0.744646333549643, + "grad_norm": 0.423828125, + "learning_rate": 4.290673063518261e-06, + "loss": 1.3641, + "step": 765 + }, + { + "epoch": 0.745619727449708, + "grad_norm": 0.42578125, + "learning_rate": 4.288887464784679e-06, + "loss": 1.3438, + "step": 766 + }, + { + "epoch": 0.7465931213497728, + "grad_norm": 0.4296875, + "learning_rate": 4.287099993976735e-06, + "loss": 1.3731, + "step": 767 + }, + { + "epoch": 0.7475665152498377, + "grad_norm": 0.42578125, + "learning_rate": 4.2853106529650225e-06, + "loss": 1.3327, + "step": 768 + }, + { + "epoch": 0.7485399091499026, + "grad_norm": 0.421875, + "learning_rate": 4.28351944362209e-06, + "loss": 1.3578, + "step": 769 + }, + { + "epoch": 0.7495133030499675, + "grad_norm": 0.42578125, + "learning_rate": 4.28172636782244e-06, + "loss": 1.3547, + "step": 770 + }, + { + "epoch": 0.7504866969500325, + "grad_norm": 0.435546875, + "learning_rate": 4.279931427442532e-06, + "loss": 1.3565, + "step": 771 + }, + { + "epoch": 0.7514600908500974, + "grad_norm": 0.419921875, + "learning_rate": 4.278134624360773e-06, + "loss": 1.357, + "step": 772 + }, + { + "epoch": 0.7524334847501623, + "grad_norm": 0.423828125, + "learning_rate": 4.276335960457522e-06, + "loss": 1.3664, + "step": 773 + }, + { + "epoch": 0.7534068786502272, + "grad_norm": 0.44140625, + "learning_rate": 4.2745354376150865e-06, + "loss": 1.3623, + "step": 774 + }, + { + "epoch": 0.754380272550292, + "grad_norm": 0.431640625, + "learning_rate": 4.272733057717714e-06, + "loss": 1.3567, + "step": 775 + }, + { + "epoch": 0.755353666450357, + "grad_norm": 0.423828125, + "learning_rate": 4.2709288226516e-06, + "loss": 1.3633, + "step": 776 + }, + { + "epoch": 0.7563270603504219, + "grad_norm": 0.421875, + "learning_rate": 4.269122734304881e-06, + "loss": 1.361, + "step": 777 + }, + { + "epoch": 0.7573004542504866, + "grad_norm": 0.44140625, + "learning_rate": 4.2673147945676305e-06, + "loss": 1.3854, + "step": 778 + }, + { + "epoch": 0.7582738481505515, + "grad_norm": 0.427734375, + "learning_rate": 4.265505005331863e-06, + "loss": 1.3854, + "step": 779 + }, + { + "epoch": 0.7592472420506164, + "grad_norm": 0.4140625, + "learning_rate": 4.263693368491524e-06, + "loss": 1.3353, + "step": 780 + }, + { + "epoch": 0.7602206359506813, + "grad_norm": 0.470703125, + "learning_rate": 4.261879885942497e-06, + "loss": 1.3398, + "step": 781 + }, + { + "epoch": 0.7611940298507462, + "grad_norm": 0.47265625, + "learning_rate": 4.260064559582596e-06, + "loss": 1.3785, + "step": 782 + }, + { + "epoch": 0.7621674237508111, + "grad_norm": 0.435546875, + "learning_rate": 4.258247391311562e-06, + "loss": 1.3685, + "step": 783 + }, + { + "epoch": 0.763140817650876, + "grad_norm": 0.431640625, + "learning_rate": 4.256428383031065e-06, + "loss": 1.3556, + "step": 784 + }, + { + "epoch": 0.764114211550941, + "grad_norm": 0.427734375, + "learning_rate": 4.254607536644702e-06, + "loss": 1.3519, + "step": 785 + }, + { + "epoch": 0.7650876054510058, + "grad_norm": 0.4765625, + "learning_rate": 4.252784854057993e-06, + "loss": 1.3622, + "step": 786 + }, + { + "epoch": 0.7660609993510707, + "grad_norm": 0.447265625, + "learning_rate": 4.2509603371783776e-06, + "loss": 1.3613, + "step": 787 + }, + { + "epoch": 0.7670343932511356, + "grad_norm": 0.443359375, + "learning_rate": 4.249133987915217e-06, + "loss": 1.3614, + "step": 788 + }, + { + "epoch": 0.7680077871512005, + "grad_norm": 0.4453125, + "learning_rate": 4.247305808179789e-06, + "loss": 1.3472, + "step": 789 + }, + { + "epoch": 0.7689811810512654, + "grad_norm": 0.451171875, + "learning_rate": 4.245475799885288e-06, + "loss": 1.342, + "step": 790 + }, + { + "epoch": 0.7699545749513304, + "grad_norm": 0.44921875, + "learning_rate": 4.243643964946821e-06, + "loss": 1.3809, + "step": 791 + }, + { + "epoch": 0.7709279688513953, + "grad_norm": 0.443359375, + "learning_rate": 4.241810305281407e-06, + "loss": 1.3536, + "step": 792 + }, + { + "epoch": 0.77190136275146, + "grad_norm": 0.447265625, + "learning_rate": 4.239974822807976e-06, + "loss": 1.3412, + "step": 793 + }, + { + "epoch": 0.7728747566515249, + "grad_norm": 0.458984375, + "learning_rate": 4.238137519447362e-06, + "loss": 1.3441, + "step": 794 + }, + { + "epoch": 0.7738481505515898, + "grad_norm": 0.451171875, + "learning_rate": 4.236298397122307e-06, + "loss": 1.3554, + "step": 795 + }, + { + "epoch": 0.7748215444516547, + "grad_norm": 0.44921875, + "learning_rate": 4.234457457757457e-06, + "loss": 1.3543, + "step": 796 + }, + { + "epoch": 0.7757949383517196, + "grad_norm": 0.44921875, + "learning_rate": 4.232614703279359e-06, + "loss": 1.3508, + "step": 797 + }, + { + "epoch": 0.7767683322517845, + "grad_norm": 0.439453125, + "learning_rate": 4.230770135616459e-06, + "loss": 1.3559, + "step": 798 + }, + { + "epoch": 0.7777417261518494, + "grad_norm": 0.4453125, + "learning_rate": 4.2289237566991e-06, + "loss": 1.3658, + "step": 799 + }, + { + "epoch": 0.7787151200519143, + "grad_norm": 0.431640625, + "learning_rate": 4.227075568459522e-06, + "loss": 1.367, + "step": 800 + }, + { + "epoch": 0.7796885139519792, + "grad_norm": 0.43359375, + "learning_rate": 4.225225572831858e-06, + "loss": 1.3504, + "step": 801 + }, + { + "epoch": 0.7806619078520441, + "grad_norm": 0.439453125, + "learning_rate": 4.223373771752131e-06, + "loss": 1.3512, + "step": 802 + }, + { + "epoch": 0.781635301752109, + "grad_norm": 0.44921875, + "learning_rate": 4.221520167158257e-06, + "loss": 1.3597, + "step": 803 + }, + { + "epoch": 0.782608695652174, + "grad_norm": 0.435546875, + "learning_rate": 4.219664760990035e-06, + "loss": 1.3292, + "step": 804 + }, + { + "epoch": 0.7835820895522388, + "grad_norm": 0.41796875, + "learning_rate": 4.217807555189151e-06, + "loss": 1.3392, + "step": 805 + }, + { + "epoch": 0.7845554834523037, + "grad_norm": 0.41796875, + "learning_rate": 4.215948551699175e-06, + "loss": 1.3377, + "step": 806 + }, + { + "epoch": 0.7855288773523685, + "grad_norm": 0.4296875, + "learning_rate": 4.21408775246556e-06, + "loss": 1.3513, + "step": 807 + }, + { + "epoch": 0.7865022712524334, + "grad_norm": 0.453125, + "learning_rate": 4.212225159435633e-06, + "loss": 1.3877, + "step": 808 + }, + { + "epoch": 0.7874756651524983, + "grad_norm": 0.439453125, + "learning_rate": 4.210360774558604e-06, + "loss": 1.3519, + "step": 809 + }, + { + "epoch": 0.7884490590525632, + "grad_norm": 0.431640625, + "learning_rate": 4.208494599785553e-06, + "loss": 1.3441, + "step": 810 + }, + { + "epoch": 0.7894224529526281, + "grad_norm": 0.421875, + "learning_rate": 4.206626637069438e-06, + "loss": 1.3656, + "step": 811 + }, + { + "epoch": 0.790395846852693, + "grad_norm": 0.4140625, + "learning_rate": 4.204756888365085e-06, + "loss": 1.3504, + "step": 812 + }, + { + "epoch": 0.7913692407527579, + "grad_norm": 0.431640625, + "learning_rate": 4.202885355629189e-06, + "loss": 1.3429, + "step": 813 + }, + { + "epoch": 0.7923426346528228, + "grad_norm": 0.435546875, + "learning_rate": 4.201012040820314e-06, + "loss": 1.3432, + "step": 814 + }, + { + "epoch": 0.7933160285528877, + "grad_norm": 0.4375, + "learning_rate": 4.199136945898887e-06, + "loss": 1.3612, + "step": 815 + }, + { + "epoch": 0.7942894224529526, + "grad_norm": 0.431640625, + "learning_rate": 4.197260072827199e-06, + "loss": 1.3556, + "step": 816 + }, + { + "epoch": 0.7952628163530175, + "grad_norm": 0.4296875, + "learning_rate": 4.1953814235694015e-06, + "loss": 1.3423, + "step": 817 + }, + { + "epoch": 0.7962362102530824, + "grad_norm": 0.41796875, + "learning_rate": 4.193501000091504e-06, + "loss": 1.37, + "step": 818 + }, + { + "epoch": 0.7972096041531473, + "grad_norm": 0.421875, + "learning_rate": 4.1916188043613754e-06, + "loss": 1.3564, + "step": 819 + }, + { + "epoch": 0.7981829980532122, + "grad_norm": 0.431640625, + "learning_rate": 4.189734838348736e-06, + "loss": 1.3681, + "step": 820 + }, + { + "epoch": 0.7991563919532771, + "grad_norm": 0.421875, + "learning_rate": 4.187849104025159e-06, + "loss": 1.38, + "step": 821 + }, + { + "epoch": 0.8001297858533419, + "grad_norm": 0.421875, + "learning_rate": 4.18596160336407e-06, + "loss": 1.3596, + "step": 822 + }, + { + "epoch": 0.8011031797534068, + "grad_norm": 0.4140625, + "learning_rate": 4.184072338340743e-06, + "loss": 1.3484, + "step": 823 + }, + { + "epoch": 0.8020765736534717, + "grad_norm": 0.4296875, + "learning_rate": 4.1821813109322975e-06, + "loss": 1.3658, + "step": 824 + }, + { + "epoch": 0.8030499675535366, + "grad_norm": 0.4140625, + "learning_rate": 4.180288523117697e-06, + "loss": 1.3523, + "step": 825 + }, + { + "epoch": 0.8040233614536015, + "grad_norm": 0.416015625, + "learning_rate": 4.178393976877749e-06, + "loss": 1.3549, + "step": 826 + }, + { + "epoch": 0.8049967553536664, + "grad_norm": 0.40625, + "learning_rate": 4.176497674195098e-06, + "loss": 1.3393, + "step": 827 + }, + { + "epoch": 0.8059701492537313, + "grad_norm": 0.4296875, + "learning_rate": 4.17459961705423e-06, + "loss": 1.351, + "step": 828 + }, + { + "epoch": 0.8069435431537962, + "grad_norm": 0.439453125, + "learning_rate": 4.172699807441465e-06, + "loss": 1.3442, + "step": 829 + }, + { + "epoch": 0.8079169370538611, + "grad_norm": 0.42578125, + "learning_rate": 4.1707982473449584e-06, + "loss": 1.3427, + "step": 830 + }, + { + "epoch": 0.808890330953926, + "grad_norm": 0.416015625, + "learning_rate": 4.168894938754696e-06, + "loss": 1.3437, + "step": 831 + }, + { + "epoch": 0.8098637248539909, + "grad_norm": 0.419921875, + "learning_rate": 4.166989883662495e-06, + "loss": 1.3536, + "step": 832 + }, + { + "epoch": 0.8108371187540558, + "grad_norm": 0.408203125, + "learning_rate": 4.165083084061997e-06, + "loss": 1.3342, + "step": 833 + }, + { + "epoch": 0.8118105126541207, + "grad_norm": 0.45703125, + "learning_rate": 4.1631745419486744e-06, + "loss": 1.3379, + "step": 834 + }, + { + "epoch": 0.8127839065541856, + "grad_norm": 0.43359375, + "learning_rate": 4.16126425931982e-06, + "loss": 1.3536, + "step": 835 + }, + { + "epoch": 0.8137573004542504, + "grad_norm": 0.4453125, + "learning_rate": 4.1593522381745465e-06, + "loss": 1.3368, + "step": 836 + }, + { + "epoch": 0.8147306943543153, + "grad_norm": 0.4375, + "learning_rate": 4.15743848051379e-06, + "loss": 1.3444, + "step": 837 + }, + { + "epoch": 0.8157040882543802, + "grad_norm": 0.419921875, + "learning_rate": 4.155522988340301e-06, + "loss": 1.3621, + "step": 838 + }, + { + "epoch": 0.8166774821544451, + "grad_norm": 0.42578125, + "learning_rate": 4.153605763658645e-06, + "loss": 1.3682, + "step": 839 + }, + { + "epoch": 0.81765087605451, + "grad_norm": 0.42578125, + "learning_rate": 4.151686808475204e-06, + "loss": 1.3607, + "step": 840 + }, + { + "epoch": 0.8186242699545749, + "grad_norm": 0.427734375, + "learning_rate": 4.149766124798166e-06, + "loss": 1.3299, + "step": 841 + }, + { + "epoch": 0.8195976638546398, + "grad_norm": 0.431640625, + "learning_rate": 4.1478437146375315e-06, + "loss": 1.3649, + "step": 842 + }, + { + "epoch": 0.8205710577547047, + "grad_norm": 0.447265625, + "learning_rate": 4.145919580005107e-06, + "loss": 1.372, + "step": 843 + }, + { + "epoch": 0.8215444516547696, + "grad_norm": 0.423828125, + "learning_rate": 4.143993722914502e-06, + "loss": 1.3601, + "step": 844 + }, + { + "epoch": 0.8225178455548345, + "grad_norm": 0.41796875, + "learning_rate": 4.142066145381131e-06, + "loss": 1.3618, + "step": 845 + }, + { + "epoch": 0.8234912394548994, + "grad_norm": 0.4296875, + "learning_rate": 4.1401368494222075e-06, + "loss": 1.3613, + "step": 846 + }, + { + "epoch": 0.8244646333549643, + "grad_norm": 0.423828125, + "learning_rate": 4.138205837056743e-06, + "loss": 1.3517, + "step": 847 + }, + { + "epoch": 0.8254380272550292, + "grad_norm": 0.42578125, + "learning_rate": 4.136273110305547e-06, + "loss": 1.3322, + "step": 848 + }, + { + "epoch": 0.8264114211550941, + "grad_norm": 0.4375, + "learning_rate": 4.134338671191221e-06, + "loss": 1.359, + "step": 849 + }, + { + "epoch": 0.827384815055159, + "grad_norm": 0.412109375, + "learning_rate": 4.132402521738159e-06, + "loss": 1.3477, + "step": 850 + }, + { + "epoch": 0.8283582089552238, + "grad_norm": 0.4296875, + "learning_rate": 4.130464663972548e-06, + "loss": 1.3535, + "step": 851 + }, + { + "epoch": 0.8293316028552887, + "grad_norm": 0.41796875, + "learning_rate": 4.128525099922357e-06, + "loss": 1.3613, + "step": 852 + }, + { + "epoch": 0.8303049967553536, + "grad_norm": 0.435546875, + "learning_rate": 4.1265838316173455e-06, + "loss": 1.3557, + "step": 853 + }, + { + "epoch": 0.8312783906554185, + "grad_norm": 0.43359375, + "learning_rate": 4.124640861089055e-06, + "loss": 1.336, + "step": 854 + }, + { + "epoch": 0.8322517845554834, + "grad_norm": 0.421875, + "learning_rate": 4.122696190370805e-06, + "loss": 1.3498, + "step": 855 + }, + { + "epoch": 0.8332251784555483, + "grad_norm": 0.443359375, + "learning_rate": 4.1207498214977e-06, + "loss": 1.3572, + "step": 856 + }, + { + "epoch": 0.8341985723556132, + "grad_norm": 0.427734375, + "learning_rate": 4.11880175650662e-06, + "loss": 1.3538, + "step": 857 + }, + { + "epoch": 0.8351719662556781, + "grad_norm": 0.427734375, + "learning_rate": 4.1168519974362175e-06, + "loss": 1.3554, + "step": 858 + }, + { + "epoch": 0.836145360155743, + "grad_norm": 0.43359375, + "learning_rate": 4.1149005463269186e-06, + "loss": 1.324, + "step": 859 + }, + { + "epoch": 0.8371187540558079, + "grad_norm": 0.43359375, + "learning_rate": 4.112947405220921e-06, + "loss": 1.3603, + "step": 860 + }, + { + "epoch": 0.8380921479558728, + "grad_norm": 0.421875, + "learning_rate": 4.110992576162193e-06, + "loss": 1.3518, + "step": 861 + }, + { + "epoch": 0.8390655418559377, + "grad_norm": 0.41796875, + "learning_rate": 4.1090360611964644e-06, + "loss": 1.3701, + "step": 862 + }, + { + "epoch": 0.8400389357560026, + "grad_norm": 0.419921875, + "learning_rate": 4.107077862371235e-06, + "loss": 1.3422, + "step": 863 + }, + { + "epoch": 0.8410123296560675, + "grad_norm": 0.419921875, + "learning_rate": 4.1051179817357605e-06, + "loss": 1.3492, + "step": 864 + }, + { + "epoch": 0.8419857235561323, + "grad_norm": 0.427734375, + "learning_rate": 4.103156421341062e-06, + "loss": 1.3381, + "step": 865 + }, + { + "epoch": 0.8429591174561972, + "grad_norm": 0.4140625, + "learning_rate": 4.101193183239916e-06, + "loss": 1.3399, + "step": 866 + }, + { + "epoch": 0.8439325113562621, + "grad_norm": 0.4140625, + "learning_rate": 4.0992282694868555e-06, + "loss": 1.3459, + "step": 867 + }, + { + "epoch": 0.844905905256327, + "grad_norm": 0.41015625, + "learning_rate": 4.097261682138166e-06, + "loss": 1.331, + "step": 868 + }, + { + "epoch": 0.8458792991563919, + "grad_norm": 0.416015625, + "learning_rate": 4.095293423251886e-06, + "loss": 1.3371, + "step": 869 + }, + { + "epoch": 0.8468526930564568, + "grad_norm": 0.4140625, + "learning_rate": 4.0933234948878025e-06, + "loss": 1.3529, + "step": 870 + }, + { + "epoch": 0.8478260869565217, + "grad_norm": 0.419921875, + "learning_rate": 4.091351899107448e-06, + "loss": 1.3533, + "step": 871 + }, + { + "epoch": 0.8487994808565866, + "grad_norm": 0.416015625, + "learning_rate": 4.089378637974103e-06, + "loss": 1.3438, + "step": 872 + }, + { + "epoch": 0.8497728747566515, + "grad_norm": 0.4140625, + "learning_rate": 4.087403713552789e-06, + "loss": 1.3272, + "step": 873 + }, + { + "epoch": 0.8507462686567164, + "grad_norm": 0.4296875, + "learning_rate": 4.085427127910268e-06, + "loss": 1.3609, + "step": 874 + }, + { + "epoch": 0.8517196625567813, + "grad_norm": 0.42578125, + "learning_rate": 4.083448883115041e-06, + "loss": 1.371, + "step": 875 + }, + { + "epoch": 0.8526930564568462, + "grad_norm": 0.419921875, + "learning_rate": 4.081468981237345e-06, + "loss": 1.3452, + "step": 876 + }, + { + "epoch": 0.8536664503569111, + "grad_norm": 0.439453125, + "learning_rate": 4.079487424349152e-06, + "loss": 1.34, + "step": 877 + }, + { + "epoch": 0.854639844256976, + "grad_norm": 0.419921875, + "learning_rate": 4.077504214524166e-06, + "loss": 1.3307, + "step": 878 + }, + { + "epoch": 0.8556132381570409, + "grad_norm": 0.44921875, + "learning_rate": 4.075519353837818e-06, + "loss": 1.3515, + "step": 879 + }, + { + "epoch": 0.8565866320571057, + "grad_norm": 0.419921875, + "learning_rate": 4.0735328443672694e-06, + "loss": 1.3461, + "step": 880 + }, + { + "epoch": 0.8575600259571706, + "grad_norm": 0.43359375, + "learning_rate": 4.071544688191407e-06, + "loss": 1.3386, + "step": 881 + }, + { + "epoch": 0.8585334198572355, + "grad_norm": 0.455078125, + "learning_rate": 4.06955488739084e-06, + "loss": 1.3717, + "step": 882 + }, + { + "epoch": 0.8595068137573004, + "grad_norm": 0.4609375, + "learning_rate": 4.0675634440478995e-06, + "loss": 1.3352, + "step": 883 + }, + { + "epoch": 0.8604802076573653, + "grad_norm": 0.42578125, + "learning_rate": 4.065570360246634e-06, + "loss": 1.3568, + "step": 884 + }, + { + "epoch": 0.8614536015574302, + "grad_norm": 0.416015625, + "learning_rate": 4.0635756380728096e-06, + "loss": 1.3352, + "step": 885 + }, + { + "epoch": 0.8624269954574951, + "grad_norm": 0.42578125, + "learning_rate": 4.061579279613909e-06, + "loss": 1.3333, + "step": 886 + }, + { + "epoch": 0.86340038935756, + "grad_norm": 0.431640625, + "learning_rate": 4.0595812869591235e-06, + "loss": 1.3464, + "step": 887 + }, + { + "epoch": 0.8643737832576249, + "grad_norm": 0.447265625, + "learning_rate": 4.057581662199357e-06, + "loss": 1.3583, + "step": 888 + }, + { + "epoch": 0.8653471771576898, + "grad_norm": 0.44140625, + "learning_rate": 4.055580407427222e-06, + "loss": 1.363, + "step": 889 + }, + { + "epoch": 0.8663205710577547, + "grad_norm": 0.439453125, + "learning_rate": 4.053577524737034e-06, + "loss": 1.3543, + "step": 890 + }, + { + "epoch": 0.8672939649578196, + "grad_norm": 0.419921875, + "learning_rate": 4.051573016224813e-06, + "loss": 1.3434, + "step": 891 + }, + { + "epoch": 0.8682673588578845, + "grad_norm": 0.42578125, + "learning_rate": 4.0495668839882846e-06, + "loss": 1.3433, + "step": 892 + }, + { + "epoch": 0.8692407527579494, + "grad_norm": 0.435546875, + "learning_rate": 4.047559130126868e-06, + "loss": 1.3248, + "step": 893 + }, + { + "epoch": 0.8702141466580143, + "grad_norm": 0.447265625, + "learning_rate": 4.045549756741682e-06, + "loss": 1.3511, + "step": 894 + }, + { + "epoch": 0.8711875405580791, + "grad_norm": 0.44140625, + "learning_rate": 4.043538765935539e-06, + "loss": 1.3574, + "step": 895 + }, + { + "epoch": 0.872160934458144, + "grad_norm": 0.42578125, + "learning_rate": 4.0415261598129465e-06, + "loss": 1.3671, + "step": 896 + }, + { + "epoch": 0.8731343283582089, + "grad_norm": 0.423828125, + "learning_rate": 4.0395119404801e-06, + "loss": 1.3383, + "step": 897 + }, + { + "epoch": 0.8741077222582738, + "grad_norm": 0.42578125, + "learning_rate": 4.037496110044885e-06, + "loss": 1.3695, + "step": 898 + }, + { + "epoch": 0.8750811161583387, + "grad_norm": 0.44140625, + "learning_rate": 4.03547867061687e-06, + "loss": 1.3367, + "step": 899 + }, + { + "epoch": 0.8760545100584036, + "grad_norm": 0.419921875, + "learning_rate": 4.0334596243073125e-06, + "loss": 1.3357, + "step": 900 + }, + { + "epoch": 0.8770279039584685, + "grad_norm": 0.4140625, + "learning_rate": 4.031438973229147e-06, + "loss": 1.3463, + "step": 901 + }, + { + "epoch": 0.8780012978585334, + "grad_norm": 0.419921875, + "learning_rate": 4.029416719496988e-06, + "loss": 1.3597, + "step": 902 + }, + { + "epoch": 0.8789746917585983, + "grad_norm": 0.42578125, + "learning_rate": 4.027392865227131e-06, + "loss": 1.3586, + "step": 903 + }, + { + "epoch": 0.8799480856586632, + "grad_norm": 0.412109375, + "learning_rate": 4.025367412537539e-06, + "loss": 1.3201, + "step": 904 + }, + { + "epoch": 0.8809214795587281, + "grad_norm": 0.416015625, + "learning_rate": 4.023340363547858e-06, + "loss": 1.3621, + "step": 905 + }, + { + "epoch": 0.881894873458793, + "grad_norm": 0.419921875, + "learning_rate": 4.021311720379394e-06, + "loss": 1.372, + "step": 906 + }, + { + "epoch": 0.8828682673588579, + "grad_norm": 0.421875, + "learning_rate": 4.0192814851551284e-06, + "loss": 1.3457, + "step": 907 + }, + { + "epoch": 0.8838416612589228, + "grad_norm": 0.419921875, + "learning_rate": 4.017249659999707e-06, + "loss": 1.3675, + "step": 908 + }, + { + "epoch": 0.8848150551589876, + "grad_norm": 0.421875, + "learning_rate": 4.015216247039438e-06, + "loss": 1.3445, + "step": 909 + }, + { + "epoch": 0.8857884490590525, + "grad_norm": 0.4140625, + "learning_rate": 4.013181248402293e-06, + "loss": 1.3588, + "step": 910 + }, + { + "epoch": 0.8867618429591174, + "grad_norm": 0.41015625, + "learning_rate": 4.0111446662179e-06, + "loss": 1.3382, + "step": 911 + }, + { + "epoch": 0.8877352368591823, + "grad_norm": 0.41796875, + "learning_rate": 4.009106502617548e-06, + "loss": 1.3339, + "step": 912 + }, + { + "epoch": 0.8887086307592472, + "grad_norm": 0.4140625, + "learning_rate": 4.0070667597341806e-06, + "loss": 1.3351, + "step": 913 + }, + { + "epoch": 0.8896820246593121, + "grad_norm": 0.423828125, + "learning_rate": 4.005025439702391e-06, + "loss": 1.3449, + "step": 914 + }, + { + "epoch": 0.890655418559377, + "grad_norm": 0.435546875, + "learning_rate": 4.002982544658426e-06, + "loss": 1.3419, + "step": 915 + }, + { + "epoch": 0.8916288124594419, + "grad_norm": 0.423828125, + "learning_rate": 4.000938076740177e-06, + "loss": 1.3651, + "step": 916 + }, + { + "epoch": 0.8926022063595068, + "grad_norm": 0.4140625, + "learning_rate": 3.998892038087187e-06, + "loss": 1.3451, + "step": 917 + }, + { + "epoch": 0.8935756002595717, + "grad_norm": 0.4140625, + "learning_rate": 3.996844430840637e-06, + "loss": 1.3337, + "step": 918 + }, + { + "epoch": 0.8945489941596366, + "grad_norm": 0.41796875, + "learning_rate": 3.994795257143355e-06, + "loss": 1.3516, + "step": 919 + }, + { + "epoch": 0.8955223880597015, + "grad_norm": 0.416015625, + "learning_rate": 3.992744519139803e-06, + "loss": 1.3304, + "step": 920 + }, + { + "epoch": 0.8964957819597664, + "grad_norm": 0.419921875, + "learning_rate": 3.990692218976082e-06, + "loss": 1.3498, + "step": 921 + }, + { + "epoch": 0.8974691758598313, + "grad_norm": 0.416015625, + "learning_rate": 3.988638358799931e-06, + "loss": 1.3217, + "step": 922 + }, + { + "epoch": 0.8984425697598962, + "grad_norm": 0.421875, + "learning_rate": 3.986582940760717e-06, + "loss": 1.3579, + "step": 923 + }, + { + "epoch": 0.899415963659961, + "grad_norm": 0.42578125, + "learning_rate": 3.98452596700944e-06, + "loss": 1.3152, + "step": 924 + }, + { + "epoch": 0.9003893575600259, + "grad_norm": 0.431640625, + "learning_rate": 3.982467439698725e-06, + "loss": 1.3425, + "step": 925 + }, + { + "epoch": 0.9013627514600908, + "grad_norm": 0.412109375, + "learning_rate": 3.980407360982828e-06, + "loss": 1.3399, + "step": 926 + }, + { + "epoch": 0.9023361453601557, + "grad_norm": 0.41015625, + "learning_rate": 3.978345733017624e-06, + "loss": 1.3438, + "step": 927 + }, + { + "epoch": 0.9033095392602206, + "grad_norm": 0.421875, + "learning_rate": 3.976282557960611e-06, + "loss": 1.3449, + "step": 928 + }, + { + "epoch": 0.9042829331602855, + "grad_norm": 0.42578125, + "learning_rate": 3.974217837970906e-06, + "loss": 1.3385, + "step": 929 + }, + { + "epoch": 0.9052563270603504, + "grad_norm": 0.435546875, + "learning_rate": 3.9721515752092434e-06, + "loss": 1.3309, + "step": 930 + }, + { + "epoch": 0.9062297209604153, + "grad_norm": 0.421875, + "learning_rate": 3.970083771837969e-06, + "loss": 1.3521, + "step": 931 + }, + { + "epoch": 0.9072031148604802, + "grad_norm": 0.408203125, + "learning_rate": 3.968014430021046e-06, + "loss": 1.3293, + "step": 932 + }, + { + "epoch": 0.9081765087605451, + "grad_norm": 0.419921875, + "learning_rate": 3.965943551924043e-06, + "loss": 1.3597, + "step": 933 + }, + { + "epoch": 0.90914990266061, + "grad_norm": 0.43359375, + "learning_rate": 3.963871139714139e-06, + "loss": 1.3457, + "step": 934 + }, + { + "epoch": 0.9101232965606749, + "grad_norm": 0.431640625, + "learning_rate": 3.961797195560118e-06, + "loss": 1.3425, + "step": 935 + }, + { + "epoch": 0.9110966904607398, + "grad_norm": 0.41796875, + "learning_rate": 3.959721721632366e-06, + "loss": 1.3502, + "step": 936 + }, + { + "epoch": 0.9120700843608047, + "grad_norm": 0.3984375, + "learning_rate": 3.95764472010287e-06, + "loss": 1.3603, + "step": 937 + }, + { + "epoch": 0.9130434782608695, + "grad_norm": 0.41796875, + "learning_rate": 3.955566193145218e-06, + "loss": 1.3416, + "step": 938 + }, + { + "epoch": 0.9140168721609344, + "grad_norm": 0.416015625, + "learning_rate": 3.953486142934591e-06, + "loss": 1.3304, + "step": 939 + }, + { + "epoch": 0.9149902660609993, + "grad_norm": 0.421875, + "learning_rate": 3.9514045716477665e-06, + "loss": 1.3487, + "step": 940 + }, + { + "epoch": 0.9159636599610642, + "grad_norm": 0.4140625, + "learning_rate": 3.949321481463114e-06, + "loss": 1.3587, + "step": 941 + }, + { + "epoch": 0.9169370538611291, + "grad_norm": 0.408203125, + "learning_rate": 3.947236874560591e-06, + "loss": 1.3468, + "step": 942 + }, + { + "epoch": 0.917910447761194, + "grad_norm": 0.40625, + "learning_rate": 3.945150753121742e-06, + "loss": 1.3397, + "step": 943 + }, + { + "epoch": 0.9188838416612589, + "grad_norm": 0.412109375, + "learning_rate": 3.9430631193296976e-06, + "loss": 1.3269, + "step": 944 + }, + { + "epoch": 0.9198572355613238, + "grad_norm": 0.412109375, + "learning_rate": 3.940973975369171e-06, + "loss": 1.3579, + "step": 945 + }, + { + "epoch": 0.9208306294613887, + "grad_norm": 0.421875, + "learning_rate": 3.938883323426456e-06, + "loss": 1.3617, + "step": 946 + }, + { + "epoch": 0.9218040233614536, + "grad_norm": 0.431640625, + "learning_rate": 3.936791165689424e-06, + "loss": 1.3583, + "step": 947 + }, + { + "epoch": 0.9227774172615185, + "grad_norm": 0.4375, + "learning_rate": 3.9346975043475216e-06, + "loss": 1.3346, + "step": 948 + }, + { + "epoch": 0.9237508111615834, + "grad_norm": 0.42578125, + "learning_rate": 3.9326023415917704e-06, + "loss": 1.334, + "step": 949 + }, + { + "epoch": 0.9247242050616483, + "grad_norm": 0.419921875, + "learning_rate": 3.930505679614762e-06, + "loss": 1.3414, + "step": 950 + }, + { + "epoch": 0.9256975989617132, + "grad_norm": 0.408203125, + "learning_rate": 3.928407520610658e-06, + "loss": 1.3375, + "step": 951 + }, + { + "epoch": 0.9266709928617781, + "grad_norm": 0.439453125, + "learning_rate": 3.926307866775186e-06, + "loss": 1.3527, + "step": 952 + }, + { + "epoch": 0.9276443867618429, + "grad_norm": 0.439453125, + "learning_rate": 3.924206720305638e-06, + "loss": 1.3498, + "step": 953 + }, + { + "epoch": 0.9286177806619078, + "grad_norm": 0.423828125, + "learning_rate": 3.922104083400866e-06, + "loss": 1.3638, + "step": 954 + }, + { + "epoch": 0.9295911745619727, + "grad_norm": 0.400390625, + "learning_rate": 3.919999958261287e-06, + "loss": 1.334, + "step": 955 + }, + { + "epoch": 0.9305645684620376, + "grad_norm": 0.44921875, + "learning_rate": 3.91789434708887e-06, + "loss": 1.3619, + "step": 956 + }, + { + "epoch": 0.9315379623621025, + "grad_norm": 0.45703125, + "learning_rate": 3.915787252087143e-06, + "loss": 1.3265, + "step": 957 + }, + { + "epoch": 0.9325113562621674, + "grad_norm": 0.431640625, + "learning_rate": 3.913678675461184e-06, + "loss": 1.345, + "step": 958 + }, + { + "epoch": 0.9334847501622323, + "grad_norm": 0.3984375, + "learning_rate": 3.9115686194176225e-06, + "loss": 1.3243, + "step": 959 + }, + { + "epoch": 0.9344581440622972, + "grad_norm": 0.408203125, + "learning_rate": 3.909457086164638e-06, + "loss": 1.33, + "step": 960 + }, + { + "epoch": 0.9354315379623621, + "grad_norm": 0.435546875, + "learning_rate": 3.907344077911952e-06, + "loss": 1.3443, + "step": 961 + }, + { + "epoch": 0.936404931862427, + "grad_norm": 0.439453125, + "learning_rate": 3.905229596870833e-06, + "loss": 1.3452, + "step": 962 + }, + { + "epoch": 0.9373783257624919, + "grad_norm": 0.4140625, + "learning_rate": 3.9031136452540915e-06, + "loss": 1.3114, + "step": 963 + }, + { + "epoch": 0.9383517196625568, + "grad_norm": 0.4140625, + "learning_rate": 3.900996225276073e-06, + "loss": 1.3469, + "step": 964 + }, + { + "epoch": 0.9393251135626217, + "grad_norm": 0.412109375, + "learning_rate": 3.8988773391526626e-06, + "loss": 1.3281, + "step": 965 + }, + { + "epoch": 0.9402985074626866, + "grad_norm": 0.4375, + "learning_rate": 3.896756989101278e-06, + "loss": 1.3396, + "step": 966 + }, + { + "epoch": 0.9412719013627514, + "grad_norm": 0.44921875, + "learning_rate": 3.894635177340871e-06, + "loss": 1.3279, + "step": 967 + }, + { + "epoch": 0.9422452952628163, + "grad_norm": 0.439453125, + "learning_rate": 3.892511906091921e-06, + "loss": 1.3368, + "step": 968 + }, + { + "epoch": 0.9432186891628812, + "grad_norm": 0.431640625, + "learning_rate": 3.890387177576437e-06, + "loss": 1.3565, + "step": 969 + }, + { + "epoch": 0.9441920830629461, + "grad_norm": 0.466796875, + "learning_rate": 3.88826099401795e-06, + "loss": 1.3467, + "step": 970 + }, + { + "epoch": 0.945165476963011, + "grad_norm": 0.427734375, + "learning_rate": 3.886133357641516e-06, + "loss": 1.351, + "step": 971 + }, + { + "epoch": 0.9461388708630759, + "grad_norm": 0.4296875, + "learning_rate": 3.884004270673711e-06, + "loss": 1.312, + "step": 972 + }, + { + "epoch": 0.9471122647631408, + "grad_norm": 0.4375, + "learning_rate": 3.88187373534263e-06, + "loss": 1.334, + "step": 973 + }, + { + "epoch": 0.9480856586632057, + "grad_norm": 0.47265625, + "learning_rate": 3.879741753877881e-06, + "loss": 1.3522, + "step": 974 + }, + { + "epoch": 0.9490590525632706, + "grad_norm": 0.44140625, + "learning_rate": 3.877608328510587e-06, + "loss": 1.3301, + "step": 975 + }, + { + "epoch": 0.9500324464633355, + "grad_norm": 0.419921875, + "learning_rate": 3.875473461473383e-06, + "loss": 1.3216, + "step": 976 + }, + { + "epoch": 0.9510058403634004, + "grad_norm": 0.4375, + "learning_rate": 3.873337155000409e-06, + "loss": 1.3288, + "step": 977 + }, + { + "epoch": 0.9519792342634653, + "grad_norm": 0.44140625, + "learning_rate": 3.871199411327318e-06, + "loss": 1.3373, + "step": 978 + }, + { + "epoch": 0.9529526281635302, + "grad_norm": 0.455078125, + "learning_rate": 3.86906023269126e-06, + "loss": 1.3399, + "step": 979 + }, + { + "epoch": 0.9539260220635951, + "grad_norm": 0.44140625, + "learning_rate": 3.866919621330892e-06, + "loss": 1.329, + "step": 980 + }, + { + "epoch": 0.95489941596366, + "grad_norm": 0.466796875, + "learning_rate": 3.864777579486366e-06, + "loss": 1.3293, + "step": 981 + }, + { + "epoch": 0.9558728098637248, + "grad_norm": 0.443359375, + "learning_rate": 3.8626341093993346e-06, + "loss": 1.3483, + "step": 982 + }, + { + "epoch": 0.9568462037637897, + "grad_norm": 0.435546875, + "learning_rate": 3.860489213312943e-06, + "loss": 1.3413, + "step": 983 + }, + { + "epoch": 0.9578195976638546, + "grad_norm": 0.458984375, + "learning_rate": 3.85834289347183e-06, + "loss": 1.3715, + "step": 984 + }, + { + "epoch": 0.9587929915639195, + "grad_norm": 0.4453125, + "learning_rate": 3.856195152122123e-06, + "loss": 1.3392, + "step": 985 + }, + { + "epoch": 0.9597663854639844, + "grad_norm": 0.423828125, + "learning_rate": 3.854045991511438e-06, + "loss": 1.3204, + "step": 986 + }, + { + "epoch": 0.9607397793640493, + "grad_norm": 0.419921875, + "learning_rate": 3.851895413888875e-06, + "loss": 1.3382, + "step": 987 + }, + { + "epoch": 0.9617131732641142, + "grad_norm": 0.423828125, + "learning_rate": 3.849743421505019e-06, + "loss": 1.347, + "step": 988 + }, + { + "epoch": 0.9626865671641791, + "grad_norm": 0.43359375, + "learning_rate": 3.847590016611934e-06, + "loss": 1.3517, + "step": 989 + }, + { + "epoch": 0.963659961064244, + "grad_norm": 0.4296875, + "learning_rate": 3.8454352014631624e-06, + "loss": 1.3339, + "step": 990 + }, + { + "epoch": 0.9646333549643089, + "grad_norm": 0.427734375, + "learning_rate": 3.843278978313724e-06, + "loss": 1.3211, + "step": 991 + }, + { + "epoch": 0.9656067488643738, + "grad_norm": 0.4140625, + "learning_rate": 3.841121349420109e-06, + "loss": 1.3338, + "step": 992 + }, + { + "epoch": 0.9665801427644387, + "grad_norm": 0.439453125, + "learning_rate": 3.83896231704028e-06, + "loss": 1.3416, + "step": 993 + }, + { + "epoch": 0.9675535366645036, + "grad_norm": 0.455078125, + "learning_rate": 3.8368018834336694e-06, + "loss": 1.3275, + "step": 994 + }, + { + "epoch": 0.9685269305645685, + "grad_norm": 0.4453125, + "learning_rate": 3.834640050861177e-06, + "loss": 1.3369, + "step": 995 + }, + { + "epoch": 0.9695003244646333, + "grad_norm": 0.412109375, + "learning_rate": 3.832476821585164e-06, + "loss": 1.333, + "step": 996 + }, + { + "epoch": 0.9704737183646982, + "grad_norm": 0.41015625, + "learning_rate": 3.830312197869453e-06, + "loss": 1.3361, + "step": 997 + }, + { + "epoch": 0.9714471122647631, + "grad_norm": 0.41796875, + "learning_rate": 3.828146181979327e-06, + "loss": 1.3312, + "step": 998 + }, + { + "epoch": 0.972420506164828, + "grad_norm": 0.423828125, + "learning_rate": 3.825978776181528e-06, + "loss": 1.3344, + "step": 999 + }, + { + "epoch": 0.9733939000648929, + "grad_norm": 0.41796875, + "learning_rate": 3.8238099827442494e-06, + "loss": 1.3319, + "step": 1000 + }, + { + "epoch": 0.9743672939649578, + "grad_norm": 0.42578125, + "learning_rate": 3.821639803937138e-06, + "loss": 1.3529, + "step": 1001 + }, + { + "epoch": 0.9753406878650227, + "grad_norm": 0.41796875, + "learning_rate": 3.819468242031291e-06, + "loss": 1.3247, + "step": 1002 + }, + { + "epoch": 0.9763140817650876, + "grad_norm": 0.4140625, + "learning_rate": 3.8172952992992515e-06, + "loss": 1.3332, + "step": 1003 + }, + { + "epoch": 0.9772874756651525, + "grad_norm": 0.4140625, + "learning_rate": 3.815120978015008e-06, + "loss": 1.3292, + "step": 1004 + }, + { + "epoch": 0.9782608695652174, + "grad_norm": 0.408203125, + "learning_rate": 3.8129452804539934e-06, + "loss": 1.3356, + "step": 1005 + }, + { + "epoch": 0.9792342634652823, + "grad_norm": 0.4296875, + "learning_rate": 3.8107682088930797e-06, + "loss": 1.3534, + "step": 1006 + }, + { + "epoch": 0.9802076573653472, + "grad_norm": 0.416015625, + "learning_rate": 3.808589765610575e-06, + "loss": 1.3265, + "step": 1007 + }, + { + "epoch": 0.9811810512654121, + "grad_norm": 0.41015625, + "learning_rate": 3.806409952886226e-06, + "loss": 1.2973, + "step": 1008 + }, + { + "epoch": 0.982154445165477, + "grad_norm": 0.4140625, + "learning_rate": 3.8042287730012117e-06, + "loss": 1.3629, + "step": 1009 + }, + { + "epoch": 0.9831278390655419, + "grad_norm": 0.416015625, + "learning_rate": 3.8020462282381397e-06, + "loss": 1.3478, + "step": 1010 + }, + { + "epoch": 0.9841012329656067, + "grad_norm": 0.41015625, + "learning_rate": 3.799862320881048e-06, + "loss": 1.3204, + "step": 1011 + }, + { + "epoch": 0.9850746268656716, + "grad_norm": 0.419921875, + "learning_rate": 3.7976770532154006e-06, + "loss": 1.3189, + "step": 1012 + }, + { + "epoch": 0.9860480207657365, + "grad_norm": 0.419921875, + "learning_rate": 3.7954904275280844e-06, + "loss": 1.333, + "step": 1013 + }, + { + "epoch": 0.9870214146658014, + "grad_norm": 0.419921875, + "learning_rate": 3.7933024461074075e-06, + "loss": 1.3276, + "step": 1014 + }, + { + "epoch": 0.9879948085658663, + "grad_norm": 0.40625, + "learning_rate": 3.7911131112430966e-06, + "loss": 1.3269, + "step": 1015 + }, + { + "epoch": 0.9889682024659312, + "grad_norm": 0.400390625, + "learning_rate": 3.7889224252262956e-06, + "loss": 1.3074, + "step": 1016 + }, + { + "epoch": 0.9899415963659961, + "grad_norm": 0.412109375, + "learning_rate": 3.786730390349561e-06, + "loss": 1.3545, + "step": 1017 + }, + { + "epoch": 0.990914990266061, + "grad_norm": 0.431640625, + "learning_rate": 3.7845370089068626e-06, + "loss": 1.318, + "step": 1018 + }, + { + "epoch": 0.9918883841661259, + "grad_norm": 0.423828125, + "learning_rate": 3.7823422831935796e-06, + "loss": 1.3276, + "step": 1019 + }, + { + "epoch": 0.9928617780661908, + "grad_norm": 0.408203125, + "learning_rate": 3.780146215506494e-06, + "loss": 1.3135, + "step": 1020 + }, + { + "epoch": 0.9938351719662557, + "grad_norm": 0.408203125, + "learning_rate": 3.777948808143797e-06, + "loss": 1.3509, + "step": 1021 + }, + { + "epoch": 0.9948085658663206, + "grad_norm": 0.43359375, + "learning_rate": 3.7757500634050797e-06, + "loss": 1.3448, + "step": 1022 + }, + { + "epoch": 0.9957819597663855, + "grad_norm": 0.41796875, + "learning_rate": 3.7735499835913324e-06, + "loss": 1.3266, + "step": 1023 + }, + { + "epoch": 0.9967553536664504, + "grad_norm": 0.421875, + "learning_rate": 3.7713485710049445e-06, + "loss": 1.3315, + "step": 1024 + }, + { + "epoch": 0.9977287475665152, + "grad_norm": 0.44921875, + "learning_rate": 3.769145827949697e-06, + "loss": 1.3565, + "step": 1025 + }, + { + "epoch": 0.9987021414665801, + "grad_norm": 0.44140625, + "learning_rate": 3.766941756730766e-06, + "loss": 1.3316, + "step": 1026 + }, + { + "epoch": 0.999675535366645, + "grad_norm": 0.416015625, + "learning_rate": 3.764736359654716e-06, + "loss": 1.3254, + "step": 1027 + }, + { + "epoch": 1.00064892926671, + "grad_norm": 0.412109375, + "learning_rate": 3.7625296390294996e-06, + "loss": 1.3144, + "step": 1028 + }, + { + "epoch": 1.001622323166775, + "grad_norm": 0.41796875, + "learning_rate": 3.7603215971644545e-06, + "loss": 1.3256, + "step": 1029 + }, + { + "epoch": 1.001622323166775, + "eval_loss": 1.3576956987380981, + "eval_runtime": 1522.156, + "eval_samples_per_second": 27.431, + "eval_steps_per_second": 3.429, + "step": 1029 + }, + { + "epoch": 1.0005680204487362, + "grad_norm": 0.421875, + "learning_rate": 3.7581122363703016e-06, + "loss": 1.3287, + "step": 1030 + }, + { + "epoch": 1.0015417697894267, + "grad_norm": 0.423828125, + "learning_rate": 3.7559015589591397e-06, + "loss": 1.3425, + "step": 1031 + }, + { + "epoch": 1.0025155191301172, + "grad_norm": 0.419921875, + "learning_rate": 3.753689567244449e-06, + "loss": 1.3431, + "step": 1032 + }, + { + "epoch": 1.0034892684708079, + "grad_norm": 0.41015625, + "learning_rate": 3.751476263541083e-06, + "loss": 1.3238, + "step": 1033 + }, + { + "epoch": 1.0044630178114984, + "grad_norm": 0.41015625, + "learning_rate": 3.7492616501652674e-06, + "loss": 1.3389, + "step": 1034 + }, + { + "epoch": 1.0054367671521889, + "grad_norm": 0.412109375, + "learning_rate": 3.7470457294346012e-06, + "loss": 1.3284, + "step": 1035 + }, + { + "epoch": 1.0064105164928794, + "grad_norm": 0.416015625, + "learning_rate": 3.744828503668049e-06, + "loss": 1.3488, + "step": 1036 + }, + { + "epoch": 1.00738426583357, + "grad_norm": 0.412109375, + "learning_rate": 3.7426099751859413e-06, + "loss": 1.3519, + "step": 1037 + }, + { + "epoch": 1.0083580151742606, + "grad_norm": 0.421875, + "learning_rate": 3.7403901463099745e-06, + "loss": 1.3461, + "step": 1038 + }, + { + "epoch": 1.009331764514951, + "grad_norm": 0.4140625, + "learning_rate": 3.7381690193632027e-06, + "loss": 1.3512, + "step": 1039 + }, + { + "epoch": 1.0103055138556416, + "grad_norm": 0.421875, + "learning_rate": 3.7359465966700405e-06, + "loss": 1.3379, + "step": 1040 + }, + { + "epoch": 1.011279263196332, + "grad_norm": 0.3984375, + "learning_rate": 3.7337228805562566e-06, + "loss": 1.3377, + "step": 1041 + }, + { + "epoch": 1.0122530125370228, + "grad_norm": 0.423828125, + "learning_rate": 3.7314978733489754e-06, + "loss": 1.3243, + "step": 1042 + }, + { + "epoch": 1.0132267618777133, + "grad_norm": 0.416015625, + "learning_rate": 3.7292715773766715e-06, + "loss": 1.3242, + "step": 1043 + }, + { + "epoch": 1.0142005112184038, + "grad_norm": 0.42578125, + "learning_rate": 3.7270439949691677e-06, + "loss": 1.3469, + "step": 1044 + }, + { + "epoch": 1.0151742605590943, + "grad_norm": 0.419921875, + "learning_rate": 3.7248151284576347e-06, + "loss": 1.3463, + "step": 1045 + }, + { + "epoch": 1.016148009899785, + "grad_norm": 0.421875, + "learning_rate": 3.7225849801745835e-06, + "loss": 1.3616, + "step": 1046 + }, + { + "epoch": 1.0171217592404755, + "grad_norm": 0.41015625, + "learning_rate": 3.7203535524538704e-06, + "loss": 1.317, + "step": 1047 + }, + { + "epoch": 1.018095508581166, + "grad_norm": 0.3984375, + "learning_rate": 3.7181208476306892e-06, + "loss": 1.3045, + "step": 1048 + }, + { + "epoch": 1.0190692579218565, + "grad_norm": 0.41015625, + "learning_rate": 3.71588686804157e-06, + "loss": 1.3566, + "step": 1049 + }, + { + "epoch": 1.0200430072625473, + "grad_norm": 0.4296875, + "learning_rate": 3.713651616024376e-06, + "loss": 1.3379, + "step": 1050 + }, + { + "epoch": 1.0210167566032378, + "grad_norm": 0.427734375, + "learning_rate": 3.711415093918303e-06, + "loss": 1.3284, + "step": 1051 + }, + { + "epoch": 1.0219905059439283, + "grad_norm": 0.431640625, + "learning_rate": 3.709177304063877e-06, + "loss": 1.3292, + "step": 1052 + }, + { + "epoch": 1.0229642552846188, + "grad_norm": 0.412109375, + "learning_rate": 3.7069382488029494e-06, + "loss": 1.3282, + "step": 1053 + }, + { + "epoch": 1.0239380046253093, + "grad_norm": 0.412109375, + "learning_rate": 3.7046979304786958e-06, + "loss": 1.3306, + "step": 1054 + }, + { + "epoch": 1.024911753966, + "grad_norm": 0.421875, + "learning_rate": 3.7024563514356132e-06, + "loss": 1.3347, + "step": 1055 + }, + { + "epoch": 1.0258855033066905, + "grad_norm": 0.4296875, + "learning_rate": 3.7002135140195193e-06, + "loss": 1.3151, + "step": 1056 + }, + { + "epoch": 1.026859252647381, + "grad_norm": 0.40625, + "learning_rate": 3.6979694205775486e-06, + "loss": 1.3277, + "step": 1057 + }, + { + "epoch": 1.0278330019880715, + "grad_norm": 0.412109375, + "learning_rate": 3.695724073458149e-06, + "loss": 1.3234, + "step": 1058 + }, + { + "epoch": 1.0288067513287622, + "grad_norm": 0.42578125, + "learning_rate": 3.6934774750110796e-06, + "loss": 1.3497, + "step": 1059 + }, + { + "epoch": 1.0297805006694527, + "grad_norm": 0.41015625, + "learning_rate": 3.691229627587412e-06, + "loss": 1.3346, + "step": 1060 + }, + { + "epoch": 1.0307542500101432, + "grad_norm": 0.435546875, + "learning_rate": 3.6889805335395216e-06, + "loss": 1.3188, + "step": 1061 + }, + { + "epoch": 1.0317279993508337, + "grad_norm": 0.412109375, + "learning_rate": 3.686730195221091e-06, + "loss": 1.3469, + "step": 1062 + }, + { + "epoch": 1.0327017486915244, + "grad_norm": 0.408203125, + "learning_rate": 3.6844786149871044e-06, + "loss": 1.347, + "step": 1063 + }, + { + "epoch": 1.033675498032215, + "grad_norm": 0.41796875, + "learning_rate": 3.6822257951938432e-06, + "loss": 1.3236, + "step": 1064 + }, + { + "epoch": 1.0346492473729054, + "grad_norm": 0.408203125, + "learning_rate": 3.6799717381988885e-06, + "loss": 1.3313, + "step": 1065 + }, + { + "epoch": 1.035622996713596, + "grad_norm": 0.4140625, + "learning_rate": 3.677716446361116e-06, + "loss": 1.3147, + "step": 1066 + }, + { + "epoch": 1.0365967460542864, + "grad_norm": 0.42578125, + "learning_rate": 3.6754599220406935e-06, + "loss": 1.3147, + "step": 1067 + }, + { + "epoch": 1.0375704953949771, + "grad_norm": 0.421875, + "learning_rate": 3.673202167599078e-06, + "loss": 1.3392, + "step": 1068 + }, + { + "epoch": 1.0385442447356676, + "grad_norm": 0.408203125, + "learning_rate": 3.670943185399013e-06, + "loss": 1.3457, + "step": 1069 + }, + { + "epoch": 1.0395179940763581, + "grad_norm": 0.42578125, + "learning_rate": 3.668682977804529e-06, + "loss": 1.3177, + "step": 1070 + }, + { + "epoch": 1.0404917434170486, + "grad_norm": 0.416015625, + "learning_rate": 3.666421547180938e-06, + "loss": 1.3379, + "step": 1071 + }, + { + "epoch": 1.0414654927577394, + "grad_norm": 0.40234375, + "learning_rate": 3.6641588958948315e-06, + "loss": 1.3175, + "step": 1072 + }, + { + "epoch": 1.0424392420984299, + "grad_norm": 0.41796875, + "learning_rate": 3.6618950263140785e-06, + "loss": 1.3534, + "step": 1073 + }, + { + "epoch": 1.0434129914391204, + "grad_norm": 0.416015625, + "learning_rate": 3.6596299408078228e-06, + "loss": 1.349, + "step": 1074 + }, + { + "epoch": 1.0443867407798109, + "grad_norm": 0.41015625, + "learning_rate": 3.657363641746482e-06, + "loss": 1.3124, + "step": 1075 + }, + { + "epoch": 1.0453604901205016, + "grad_norm": 0.421875, + "learning_rate": 3.6550961315017412e-06, + "loss": 1.3346, + "step": 1076 + }, + { + "epoch": 1.046334239461192, + "grad_norm": 0.43359375, + "learning_rate": 3.652827412446556e-06, + "loss": 1.3266, + "step": 1077 + }, + { + "epoch": 1.0473079888018826, + "grad_norm": 0.423828125, + "learning_rate": 3.650557486955145e-06, + "loss": 1.3395, + "step": 1078 + }, + { + "epoch": 1.048281738142573, + "grad_norm": 0.4140625, + "learning_rate": 3.6482863574029893e-06, + "loss": 1.3218, + "step": 1079 + }, + { + "epoch": 1.0492554874832636, + "grad_norm": 0.41796875, + "learning_rate": 3.646014026166831e-06, + "loss": 1.3231, + "step": 1080 + }, + { + "epoch": 1.0502292368239543, + "grad_norm": 0.42578125, + "learning_rate": 3.6437404956246686e-06, + "loss": 1.3384, + "step": 1081 + }, + { + "epoch": 1.0512029861646448, + "grad_norm": 0.416015625, + "learning_rate": 3.6414657681557576e-06, + "loss": 1.3292, + "step": 1082 + }, + { + "epoch": 1.0521767355053353, + "grad_norm": 0.421875, + "learning_rate": 3.6391898461406045e-06, + "loss": 1.3263, + "step": 1083 + }, + { + "epoch": 1.0531504848460258, + "grad_norm": 0.4140625, + "learning_rate": 3.636912731960966e-06, + "loss": 1.3308, + "step": 1084 + }, + { + "epoch": 1.0541242341867165, + "grad_norm": 0.408203125, + "learning_rate": 3.634634427999847e-06, + "loss": 1.3353, + "step": 1085 + }, + { + "epoch": 1.055097983527407, + "grad_norm": 0.416015625, + "learning_rate": 3.632354936641497e-06, + "loss": 1.3165, + "step": 1086 + }, + { + "epoch": 1.0560717328680975, + "grad_norm": 0.42578125, + "learning_rate": 3.630074260271409e-06, + "loss": 1.3247, + "step": 1087 + }, + { + "epoch": 1.057045482208788, + "grad_norm": 0.439453125, + "learning_rate": 3.6277924012763145e-06, + "loss": 1.346, + "step": 1088 + }, + { + "epoch": 1.0580192315494787, + "grad_norm": 0.4296875, + "learning_rate": 3.6255093620441835e-06, + "loss": 1.3533, + "step": 1089 + }, + { + "epoch": 1.0589929808901692, + "grad_norm": 0.419921875, + "learning_rate": 3.6232251449642225e-06, + "loss": 1.3351, + "step": 1090 + }, + { + "epoch": 1.0599667302308597, + "grad_norm": 0.421875, + "learning_rate": 3.620939752426868e-06, + "loss": 1.3338, + "step": 1091 + }, + { + "epoch": 1.0609404795715502, + "grad_norm": 0.42578125, + "learning_rate": 3.6186531868237882e-06, + "loss": 1.3226, + "step": 1092 + }, + { + "epoch": 1.0619142289122407, + "grad_norm": 0.41015625, + "learning_rate": 3.6163654505478796e-06, + "loss": 1.3064, + "step": 1093 + }, + { + "epoch": 1.0628879782529315, + "grad_norm": 0.4296875, + "learning_rate": 3.6140765459932615e-06, + "loss": 1.3237, + "step": 1094 + }, + { + "epoch": 1.063861727593622, + "grad_norm": 0.447265625, + "learning_rate": 3.611786475555278e-06, + "loss": 1.3342, + "step": 1095 + }, + { + "epoch": 1.0648354769343125, + "grad_norm": 0.4296875, + "learning_rate": 3.6094952416304923e-06, + "loss": 1.3248, + "step": 1096 + }, + { + "epoch": 1.065809226275003, + "grad_norm": 0.421875, + "learning_rate": 3.607202846616685e-06, + "loss": 1.3255, + "step": 1097 + }, + { + "epoch": 1.0667829756156937, + "grad_norm": 0.431640625, + "learning_rate": 3.6049092929128527e-06, + "loss": 1.322, + "step": 1098 + }, + { + "epoch": 1.0677567249563842, + "grad_norm": 0.416015625, + "learning_rate": 3.6026145829192033e-06, + "loss": 1.3385, + "step": 1099 + }, + { + "epoch": 1.0687304742970747, + "grad_norm": 0.4140625, + "learning_rate": 3.600318719037156e-06, + "loss": 1.3149, + "step": 1100 + }, + { + "epoch": 1.0697042236377652, + "grad_norm": 0.3984375, + "learning_rate": 3.598021703669337e-06, + "loss": 1.3329, + "step": 1101 + }, + { + "epoch": 1.070677972978456, + "grad_norm": 0.4140625, + "learning_rate": 3.5957235392195777e-06, + "loss": 1.3234, + "step": 1102 + }, + { + "epoch": 1.0716517223191464, + "grad_norm": 0.421875, + "learning_rate": 3.593424228092911e-06, + "loss": 1.3348, + "step": 1103 + }, + { + "epoch": 1.072625471659837, + "grad_norm": 0.431640625, + "learning_rate": 3.5911237726955717e-06, + "loss": 1.3505, + "step": 1104 + }, + { + "epoch": 1.0735992210005274, + "grad_norm": 0.421875, + "learning_rate": 3.58882217543499e-06, + "loss": 1.3292, + "step": 1105 + }, + { + "epoch": 1.074572970341218, + "grad_norm": 0.435546875, + "learning_rate": 3.5865194387197932e-06, + "loss": 1.321, + "step": 1106 + }, + { + "epoch": 1.0755467196819086, + "grad_norm": 0.40625, + "learning_rate": 3.5842155649597993e-06, + "loss": 1.2894, + "step": 1107 + }, + { + "epoch": 1.0765204690225991, + "grad_norm": 0.421875, + "learning_rate": 3.581910556566017e-06, + "loss": 1.3462, + "step": 1108 + }, + { + "epoch": 1.0774942183632896, + "grad_norm": 0.40625, + "learning_rate": 3.579604415950642e-06, + "loss": 1.3344, + "step": 1109 + }, + { + "epoch": 1.0784679677039801, + "grad_norm": 0.40625, + "learning_rate": 3.5772971455270554e-06, + "loss": 1.3305, + "step": 1110 + }, + { + "epoch": 1.0794417170446708, + "grad_norm": 0.43359375, + "learning_rate": 3.5749887477098213e-06, + "loss": 1.3326, + "step": 1111 + }, + { + "epoch": 1.0804154663853613, + "grad_norm": 0.421875, + "learning_rate": 3.5726792249146814e-06, + "loss": 1.331, + "step": 1112 + }, + { + "epoch": 1.0813892157260518, + "grad_norm": 0.48828125, + "learning_rate": 3.5703685795585558e-06, + "loss": 1.3332, + "step": 1113 + }, + { + "epoch": 1.0823629650667423, + "grad_norm": 0.4375, + "learning_rate": 3.56805681405954e-06, + "loss": 1.3129, + "step": 1114 + }, + { + "epoch": 1.0833367144074328, + "grad_norm": 0.421875, + "learning_rate": 3.565743930836902e-06, + "loss": 1.323, + "step": 1115 + }, + { + "epoch": 1.0843104637481236, + "grad_norm": 0.41796875, + "learning_rate": 3.5634299323110777e-06, + "loss": 1.3322, + "step": 1116 + }, + { + "epoch": 1.085284213088814, + "grad_norm": 0.40625, + "learning_rate": 3.5611148209036716e-06, + "loss": 1.3294, + "step": 1117 + }, + { + "epoch": 1.0862579624295046, + "grad_norm": 0.421875, + "learning_rate": 3.5587985990374535e-06, + "loss": 1.3162, + "step": 1118 + }, + { + "epoch": 1.087231711770195, + "grad_norm": 0.4375, + "learning_rate": 3.5564812691363527e-06, + "loss": 1.3204, + "step": 1119 + }, + { + "epoch": 1.0882054611108858, + "grad_norm": 0.423828125, + "learning_rate": 3.55416283362546e-06, + "loss": 1.3289, + "step": 1120 + }, + { + "epoch": 1.0891792104515763, + "grad_norm": 0.42578125, + "learning_rate": 3.551843294931024e-06, + "loss": 1.3371, + "step": 1121 + }, + { + "epoch": 1.0901529597922668, + "grad_norm": 0.416015625, + "learning_rate": 3.5495226554804453e-06, + "loss": 1.3267, + "step": 1122 + }, + { + "epoch": 1.0911267091329573, + "grad_norm": 0.423828125, + "learning_rate": 3.547200917702279e-06, + "loss": 1.3404, + "step": 1123 + }, + { + "epoch": 1.092100458473648, + "grad_norm": 0.412109375, + "learning_rate": 3.5448780840262274e-06, + "loss": 1.3423, + "step": 1124 + }, + { + "epoch": 1.0930742078143385, + "grad_norm": 0.423828125, + "learning_rate": 3.5425541568831406e-06, + "loss": 1.3314, + "step": 1125 + }, + { + "epoch": 1.094047957155029, + "grad_norm": 0.41015625, + "learning_rate": 3.5402291387050135e-06, + "loss": 1.3427, + "step": 1126 + }, + { + "epoch": 1.0950217064957195, + "grad_norm": 0.408203125, + "learning_rate": 3.537903031924983e-06, + "loss": 1.3438, + "step": 1127 + }, + { + "epoch": 1.0959954558364102, + "grad_norm": 0.412109375, + "learning_rate": 3.535575838977323e-06, + "loss": 1.3431, + "step": 1128 + }, + { + "epoch": 1.0969692051771007, + "grad_norm": 0.416015625, + "learning_rate": 3.5332475622974477e-06, + "loss": 1.3289, + "step": 1129 + }, + { + "epoch": 1.0979429545177912, + "grad_norm": 0.42578125, + "learning_rate": 3.5309182043219016e-06, + "loss": 1.3165, + "step": 1130 + }, + { + "epoch": 1.0989167038584817, + "grad_norm": 0.4140625, + "learning_rate": 3.528587767488363e-06, + "loss": 1.3249, + "step": 1131 + }, + { + "epoch": 1.0998904531991722, + "grad_norm": 0.408203125, + "learning_rate": 3.526256254235638e-06, + "loss": 1.3293, + "step": 1132 + }, + { + "epoch": 1.100864202539863, + "grad_norm": 0.4140625, + "learning_rate": 3.523923667003662e-06, + "loss": 1.3224, + "step": 1133 + }, + { + "epoch": 1.1018379518805534, + "grad_norm": 0.421875, + "learning_rate": 3.521590008233491e-06, + "loss": 1.3258, + "step": 1134 + }, + { + "epoch": 1.102811701221244, + "grad_norm": 0.416015625, + "learning_rate": 3.5192552803673043e-06, + "loss": 1.3404, + "step": 1135 + }, + { + "epoch": 1.1037854505619344, + "grad_norm": 0.400390625, + "learning_rate": 3.516919485848398e-06, + "loss": 1.322, + "step": 1136 + }, + { + "epoch": 1.1047591999026252, + "grad_norm": 0.4140625, + "learning_rate": 3.5145826271211862e-06, + "loss": 1.3295, + "step": 1137 + }, + { + "epoch": 1.1057329492433157, + "grad_norm": 0.421875, + "learning_rate": 3.5122447066311973e-06, + "loss": 1.3475, + "step": 1138 + }, + { + "epoch": 1.1067066985840062, + "grad_norm": 0.4140625, + "learning_rate": 3.5099057268250692e-06, + "loss": 1.3253, + "step": 1139 + }, + { + "epoch": 1.1076804479246967, + "grad_norm": 0.419921875, + "learning_rate": 3.5075656901505495e-06, + "loss": 1.32, + "step": 1140 + }, + { + "epoch": 1.1086541972653872, + "grad_norm": 0.419921875, + "learning_rate": 3.5052245990564902e-06, + "loss": 1.3076, + "step": 1141 + }, + { + "epoch": 1.1096279466060779, + "grad_norm": 0.4140625, + "learning_rate": 3.5028824559928488e-06, + "loss": 1.3443, + "step": 1142 + }, + { + "epoch": 1.1106016959467684, + "grad_norm": 0.41015625, + "learning_rate": 3.5005392634106827e-06, + "loss": 1.3314, + "step": 1143 + }, + { + "epoch": 1.1115754452874589, + "grad_norm": 0.412109375, + "learning_rate": 3.4981950237621476e-06, + "loss": 1.3521, + "step": 1144 + }, + { + "epoch": 1.1125491946281494, + "grad_norm": 0.400390625, + "learning_rate": 3.4958497395004946e-06, + "loss": 1.3114, + "step": 1145 + }, + { + "epoch": 1.11352294396884, + "grad_norm": 0.408203125, + "learning_rate": 3.4935034130800693e-06, + "loss": 1.3359, + "step": 1146 + }, + { + "epoch": 1.1144966933095306, + "grad_norm": 0.41796875, + "learning_rate": 3.491156046956306e-06, + "loss": 1.3377, + "step": 1147 + }, + { + "epoch": 1.115470442650221, + "grad_norm": 0.42578125, + "learning_rate": 3.4888076435857286e-06, + "loss": 1.3571, + "step": 1148 + }, + { + "epoch": 1.1164441919909116, + "grad_norm": 0.412109375, + "learning_rate": 3.4864582054259474e-06, + "loss": 1.3289, + "step": 1149 + }, + { + "epoch": 1.1174179413316023, + "grad_norm": 0.4140625, + "learning_rate": 3.4841077349356518e-06, + "loss": 1.3388, + "step": 1150 + }, + { + "epoch": 1.1183916906722928, + "grad_norm": 0.419921875, + "learning_rate": 3.4817562345746146e-06, + "loss": 1.318, + "step": 1151 + }, + { + "epoch": 1.1193654400129833, + "grad_norm": 0.416015625, + "learning_rate": 3.4794037068036867e-06, + "loss": 1.3307, + "step": 1152 + }, + { + "epoch": 1.1203391893536738, + "grad_norm": 0.412109375, + "learning_rate": 3.4770501540847935e-06, + "loss": 1.3155, + "step": 1153 + }, + { + "epoch": 1.1213129386943643, + "grad_norm": 0.416015625, + "learning_rate": 3.4746955788809306e-06, + "loss": 1.3097, + "step": 1154 + }, + { + "epoch": 1.122286688035055, + "grad_norm": 0.3984375, + "learning_rate": 3.472339983656167e-06, + "loss": 1.3396, + "step": 1155 + }, + { + "epoch": 1.1232604373757455, + "grad_norm": 0.416015625, + "learning_rate": 3.469983370875638e-06, + "loss": 1.3431, + "step": 1156 + }, + { + "epoch": 1.124234186716436, + "grad_norm": 0.43359375, + "learning_rate": 3.4676257430055438e-06, + "loss": 1.3206, + "step": 1157 + }, + { + "epoch": 1.1252079360571265, + "grad_norm": 0.416015625, + "learning_rate": 3.4652671025131457e-06, + "loss": 1.3194, + "step": 1158 + }, + { + "epoch": 1.1261816853978173, + "grad_norm": 0.40234375, + "learning_rate": 3.4629074518667666e-06, + "loss": 1.3156, + "step": 1159 + }, + { + "epoch": 1.1271554347385078, + "grad_norm": 0.408203125, + "learning_rate": 3.4605467935357856e-06, + "loss": 1.3325, + "step": 1160 + }, + { + "epoch": 1.1281291840791983, + "grad_norm": 0.4140625, + "learning_rate": 3.4581851299906367e-06, + "loss": 1.326, + "step": 1161 + }, + { + "epoch": 1.1291029334198888, + "grad_norm": 0.421875, + "learning_rate": 3.4558224637028055e-06, + "loss": 1.3283, + "step": 1162 + }, + { + "epoch": 1.1300766827605795, + "grad_norm": 0.40625, + "learning_rate": 3.4534587971448265e-06, + "loss": 1.3237, + "step": 1163 + }, + { + "epoch": 1.13105043210127, + "grad_norm": 0.41796875, + "learning_rate": 3.4510941327902815e-06, + "loss": 1.327, + "step": 1164 + }, + { + "epoch": 1.1320241814419605, + "grad_norm": 0.412109375, + "learning_rate": 3.448728473113797e-06, + "loss": 1.3273, + "step": 1165 + }, + { + "epoch": 1.132997930782651, + "grad_norm": 0.41796875, + "learning_rate": 3.446361820591041e-06, + "loss": 1.3341, + "step": 1166 + }, + { + "epoch": 1.1339716801233415, + "grad_norm": 0.43359375, + "learning_rate": 3.4439941776987195e-06, + "loss": 1.3253, + "step": 1167 + }, + { + "epoch": 1.1349454294640322, + "grad_norm": 0.416015625, + "learning_rate": 3.4416255469145765e-06, + "loss": 1.3398, + "step": 1168 + }, + { + "epoch": 1.1359191788047227, + "grad_norm": 0.41796875, + "learning_rate": 3.4392559307173876e-06, + "loss": 1.3284, + "step": 1169 + }, + { + "epoch": 1.1368929281454132, + "grad_norm": 0.421875, + "learning_rate": 3.4368853315869616e-06, + "loss": 1.3433, + "step": 1170 + }, + { + "epoch": 1.1378666774861037, + "grad_norm": 0.431640625, + "learning_rate": 3.4345137520041354e-06, + "loss": 1.3196, + "step": 1171 + }, + { + "epoch": 1.1388404268267944, + "grad_norm": 0.427734375, + "learning_rate": 3.432141194450772e-06, + "loss": 1.3346, + "step": 1172 + }, + { + "epoch": 1.139814176167485, + "grad_norm": 0.419921875, + "learning_rate": 3.4297676614097573e-06, + "loss": 1.3232, + "step": 1173 + }, + { + "epoch": 1.1407879255081754, + "grad_norm": 0.41796875, + "learning_rate": 3.4273931553649986e-06, + "loss": 1.3498, + "step": 1174 + }, + { + "epoch": 1.141761674848866, + "grad_norm": 0.412109375, + "learning_rate": 3.4250176788014217e-06, + "loss": 1.349, + "step": 1175 + }, + { + "epoch": 1.1427354241895564, + "grad_norm": 0.423828125, + "learning_rate": 3.4226412342049662e-06, + "loss": 1.3685, + "step": 1176 + }, + { + "epoch": 1.1437091735302471, + "grad_norm": 0.462890625, + "learning_rate": 3.420263824062588e-06, + "loss": 1.3228, + "step": 1177 + }, + { + "epoch": 1.1446829228709376, + "grad_norm": 0.41796875, + "learning_rate": 3.4178854508622506e-06, + "loss": 1.3361, + "step": 1178 + }, + { + "epoch": 1.1456566722116281, + "grad_norm": 0.408203125, + "learning_rate": 3.415506117092927e-06, + "loss": 1.3417, + "step": 1179 + }, + { + "epoch": 1.1466304215523186, + "grad_norm": 0.4140625, + "learning_rate": 3.4131258252445947e-06, + "loss": 1.3278, + "step": 1180 + }, + { + "epoch": 1.1476041708930094, + "grad_norm": 0.412109375, + "learning_rate": 3.4107445778082334e-06, + "loss": 1.3197, + "step": 1181 + }, + { + "epoch": 1.1485779202336999, + "grad_norm": 0.41796875, + "learning_rate": 3.4083623772758236e-06, + "loss": 1.3394, + "step": 1182 + }, + { + "epoch": 1.1495516695743904, + "grad_norm": 0.4140625, + "learning_rate": 3.4059792261403422e-06, + "loss": 1.3305, + "step": 1183 + }, + { + "epoch": 1.1505254189150809, + "grad_norm": 0.4140625, + "learning_rate": 3.4035951268957647e-06, + "loss": 1.3281, + "step": 1184 + }, + { + "epoch": 1.1514991682557716, + "grad_norm": 0.41796875, + "learning_rate": 3.401210082037052e-06, + "loss": 1.3526, + "step": 1185 + }, + { + "epoch": 1.152472917596462, + "grad_norm": 0.40625, + "learning_rate": 3.3988240940601604e-06, + "loss": 1.3178, + "step": 1186 + }, + { + "epoch": 1.1534466669371526, + "grad_norm": 0.408203125, + "learning_rate": 3.3964371654620305e-06, + "loss": 1.3464, + "step": 1187 + }, + { + "epoch": 1.154420416277843, + "grad_norm": 0.408203125, + "learning_rate": 3.3940492987405888e-06, + "loss": 1.3251, + "step": 1188 + }, + { + "epoch": 1.1553941656185338, + "grad_norm": 0.416015625, + "learning_rate": 3.391660496394742e-06, + "loss": 1.3338, + "step": 1189 + }, + { + "epoch": 1.1563679149592243, + "grad_norm": 0.3984375, + "learning_rate": 3.389270760924377e-06, + "loss": 1.3217, + "step": 1190 + }, + { + "epoch": 1.1573416642999148, + "grad_norm": 0.416015625, + "learning_rate": 3.386880094830356e-06, + "loss": 1.3445, + "step": 1191 + }, + { + "epoch": 1.1583154136406053, + "grad_norm": 0.40625, + "learning_rate": 3.3844885006145162e-06, + "loss": 1.3222, + "step": 1192 + }, + { + "epoch": 1.1592891629812958, + "grad_norm": 0.412109375, + "learning_rate": 3.382095980779666e-06, + "loss": 1.3295, + "step": 1193 + }, + { + "epoch": 1.1602629123219865, + "grad_norm": 0.423828125, + "learning_rate": 3.3797025378295826e-06, + "loss": 1.3269, + "step": 1194 + }, + { + "epoch": 1.161236661662677, + "grad_norm": 0.4375, + "learning_rate": 3.3773081742690097e-06, + "loss": 1.3061, + "step": 1195 + }, + { + "epoch": 1.1622104110033675, + "grad_norm": 0.408203125, + "learning_rate": 3.374912892603651e-06, + "loss": 1.3263, + "step": 1196 + }, + { + "epoch": 1.163184160344058, + "grad_norm": 0.41015625, + "learning_rate": 3.372516695340176e-06, + "loss": 1.3231, + "step": 1197 + }, + { + "epoch": 1.1641579096847487, + "grad_norm": 0.4140625, + "learning_rate": 3.370119584986209e-06, + "loss": 1.3239, + "step": 1198 + }, + { + "epoch": 1.1651316590254392, + "grad_norm": 0.41015625, + "learning_rate": 3.36772156405033e-06, + "loss": 1.3341, + "step": 1199 + }, + { + "epoch": 1.1661054083661297, + "grad_norm": 0.416015625, + "learning_rate": 3.365322635042075e-06, + "loss": 1.338, + "step": 1200 + }, + { + "epoch": 1.1670791577068202, + "grad_norm": 0.423828125, + "learning_rate": 3.362922800471927e-06, + "loss": 1.3264, + "step": 1201 + }, + { + "epoch": 1.1680529070475107, + "grad_norm": 0.427734375, + "learning_rate": 3.360522062851317e-06, + "loss": 1.3335, + "step": 1202 + }, + { + "epoch": 1.1690266563882015, + "grad_norm": 0.408203125, + "learning_rate": 3.3581204246926223e-06, + "loss": 1.3147, + "step": 1203 + }, + { + "epoch": 1.170000405728892, + "grad_norm": 0.4140625, + "learning_rate": 3.3557178885091625e-06, + "loss": 1.3319, + "step": 1204 + }, + { + "epoch": 1.1709741550695825, + "grad_norm": 0.4296875, + "learning_rate": 3.3533144568151956e-06, + "loss": 1.3282, + "step": 1205 + }, + { + "epoch": 1.171947904410273, + "grad_norm": 0.4140625, + "learning_rate": 3.35091013212592e-06, + "loss": 1.325, + "step": 1206 + }, + { + "epoch": 1.1729216537509637, + "grad_norm": 0.427734375, + "learning_rate": 3.348504916957463e-06, + "loss": 1.3234, + "step": 1207 + }, + { + "epoch": 1.1738954030916542, + "grad_norm": 0.421875, + "learning_rate": 3.3460988138268895e-06, + "loss": 1.3274, + "step": 1208 + }, + { + "epoch": 1.1748691524323447, + "grad_norm": 0.431640625, + "learning_rate": 3.343691825252191e-06, + "loss": 1.3238, + "step": 1209 + }, + { + "epoch": 1.1758429017730352, + "grad_norm": 0.431640625, + "learning_rate": 3.3412839537522858e-06, + "loss": 1.3196, + "step": 1210 + }, + { + "epoch": 1.1768166511137257, + "grad_norm": 0.4140625, + "learning_rate": 3.3388752018470156e-06, + "loss": 1.3278, + "step": 1211 + }, + { + "epoch": 1.1777904004544164, + "grad_norm": 0.42578125, + "learning_rate": 3.3364655720571453e-06, + "loss": 1.3312, + "step": 1212 + }, + { + "epoch": 1.178764149795107, + "grad_norm": 0.42578125, + "learning_rate": 3.334055066904357e-06, + "loss": 1.3238, + "step": 1213 + }, + { + "epoch": 1.1797378991357974, + "grad_norm": 0.427734375, + "learning_rate": 3.331643688911248e-06, + "loss": 1.3206, + "step": 1214 + }, + { + "epoch": 1.1807116484764881, + "grad_norm": 0.412109375, + "learning_rate": 3.329231440601332e-06, + "loss": 1.311, + "step": 1215 + }, + { + "epoch": 1.1816853978171786, + "grad_norm": 0.412109375, + "learning_rate": 3.3268183244990308e-06, + "loss": 1.3446, + "step": 1216 + }, + { + "epoch": 1.1826591471578691, + "grad_norm": 0.41015625, + "learning_rate": 3.324404343129676e-06, + "loss": 1.3251, + "step": 1217 + }, + { + "epoch": 1.1836328964985596, + "grad_norm": 0.421875, + "learning_rate": 3.3219894990195036e-06, + "loss": 1.3382, + "step": 1218 + }, + { + "epoch": 1.1846066458392501, + "grad_norm": 0.4296875, + "learning_rate": 3.3195737946956525e-06, + "loss": 1.3156, + "step": 1219 + }, + { + "epoch": 1.1855803951799408, + "grad_norm": 0.427734375, + "learning_rate": 3.3171572326861624e-06, + "loss": 1.3352, + "step": 1220 + }, + { + "epoch": 1.1865541445206313, + "grad_norm": 0.419921875, + "learning_rate": 3.3147398155199706e-06, + "loss": 1.2905, + "step": 1221 + }, + { + "epoch": 1.1875278938613218, + "grad_norm": 0.408203125, + "learning_rate": 3.3123215457269086e-06, + "loss": 1.3017, + "step": 1222 + }, + { + "epoch": 1.1885016432020123, + "grad_norm": 0.4375, + "learning_rate": 3.3099024258377017e-06, + "loss": 1.3375, + "step": 1223 + }, + { + "epoch": 1.189475392542703, + "grad_norm": 0.412109375, + "learning_rate": 3.307482458383962e-06, + "loss": 1.3315, + "step": 1224 + }, + { + "epoch": 1.1904491418833936, + "grad_norm": 0.3984375, + "learning_rate": 3.305061645898192e-06, + "loss": 1.3343, + "step": 1225 + }, + { + "epoch": 1.191422891224084, + "grad_norm": 0.427734375, + "learning_rate": 3.3026399909137757e-06, + "loss": 1.3336, + "step": 1226 + }, + { + "epoch": 1.1923966405647746, + "grad_norm": 0.431640625, + "learning_rate": 3.300217495964981e-06, + "loss": 1.3156, + "step": 1227 + }, + { + "epoch": 1.193370389905465, + "grad_norm": 0.43359375, + "learning_rate": 3.2977941635869537e-06, + "loss": 1.3331, + "step": 1228 + }, + { + "epoch": 1.1943441392461558, + "grad_norm": 0.443359375, + "learning_rate": 3.295369996315715e-06, + "loss": 1.3356, + "step": 1229 + }, + { + "epoch": 1.1953178885868463, + "grad_norm": 0.41796875, + "learning_rate": 3.292944996688161e-06, + "loss": 1.3177, + "step": 1230 + }, + { + "epoch": 1.1962916379275368, + "grad_norm": 0.416015625, + "learning_rate": 3.29051916724206e-06, + "loss": 1.3256, + "step": 1231 + }, + { + "epoch": 1.1972653872682273, + "grad_norm": 0.4140625, + "learning_rate": 3.2880925105160464e-06, + "loss": 1.3083, + "step": 1232 + }, + { + "epoch": 1.198239136608918, + "grad_norm": 0.4140625, + "learning_rate": 3.2856650290496216e-06, + "loss": 1.3297, + "step": 1233 + }, + { + "epoch": 1.1992128859496085, + "grad_norm": 0.41015625, + "learning_rate": 3.2832367253831506e-06, + "loss": 1.3139, + "step": 1234 + }, + { + "epoch": 1.200186635290299, + "grad_norm": 0.421875, + "learning_rate": 3.280807602057857e-06, + "loss": 1.3298, + "step": 1235 + }, + { + "epoch": 1.2011603846309895, + "grad_norm": 0.4296875, + "learning_rate": 3.278377661615823e-06, + "loss": 1.3233, + "step": 1236 + }, + { + "epoch": 1.20213413397168, + "grad_norm": 0.41796875, + "learning_rate": 3.2759469065999877e-06, + "loss": 1.3144, + "step": 1237 + }, + { + "epoch": 1.2031078833123707, + "grad_norm": 0.421875, + "learning_rate": 3.2735153395541403e-06, + "loss": 1.3305, + "step": 1238 + }, + { + "epoch": 1.2040816326530612, + "grad_norm": 0.404296875, + "learning_rate": 3.2710829630229208e-06, + "loss": 1.3223, + "step": 1239 + }, + { + "epoch": 1.2050553819937517, + "grad_norm": 0.408203125, + "learning_rate": 3.268649779551816e-06, + "loss": 1.3294, + "step": 1240 + }, + { + "epoch": 1.2060291313344425, + "grad_norm": 0.41015625, + "learning_rate": 3.266215791687157e-06, + "loss": 1.3111, + "step": 1241 + }, + { + "epoch": 1.207002880675133, + "grad_norm": 0.43359375, + "learning_rate": 3.263781001976118e-06, + "loss": 1.3286, + "step": 1242 + }, + { + "epoch": 1.2079766300158234, + "grad_norm": 0.423828125, + "learning_rate": 3.261345412966711e-06, + "loss": 1.3251, + "step": 1243 + }, + { + "epoch": 1.208950379356514, + "grad_norm": 0.439453125, + "learning_rate": 3.2589090272077845e-06, + "loss": 1.3358, + "step": 1244 + }, + { + "epoch": 1.2099241286972044, + "grad_norm": 0.416015625, + "learning_rate": 3.2564718472490215e-06, + "loss": 1.3325, + "step": 1245 + }, + { + "epoch": 1.2108978780378952, + "grad_norm": 0.40625, + "learning_rate": 3.2540338756409353e-06, + "loss": 1.3274, + "step": 1246 + }, + { + "epoch": 1.2118716273785857, + "grad_norm": 0.423828125, + "learning_rate": 3.2515951149348683e-06, + "loss": 1.3338, + "step": 1247 + }, + { + "epoch": 1.2128453767192762, + "grad_norm": 0.423828125, + "learning_rate": 3.2491555676829894e-06, + "loss": 1.3228, + "step": 1248 + }, + { + "epoch": 1.2138191260599667, + "grad_norm": 0.4296875, + "learning_rate": 3.2467152364382883e-06, + "loss": 1.328, + "step": 1249 + }, + { + "epoch": 1.2147928754006574, + "grad_norm": 0.41015625, + "learning_rate": 3.244274123754578e-06, + "loss": 1.3115, + "step": 1250 + }, + { + "epoch": 1.215766624741348, + "grad_norm": 0.40625, + "learning_rate": 3.2418322321864866e-06, + "loss": 1.3481, + "step": 1251 + }, + { + "epoch": 1.2167403740820384, + "grad_norm": 0.423828125, + "learning_rate": 3.2393895642894596e-06, + "loss": 1.3045, + "step": 1252 + }, + { + "epoch": 1.217714123422729, + "grad_norm": 0.4140625, + "learning_rate": 3.236946122619753e-06, + "loss": 1.304, + "step": 1253 + }, + { + "epoch": 1.2186878727634194, + "grad_norm": 0.421875, + "learning_rate": 3.2345019097344336e-06, + "loss": 1.3004, + "step": 1254 + }, + { + "epoch": 1.21966162210411, + "grad_norm": 0.427734375, + "learning_rate": 3.232056928191376e-06, + "loss": 1.3315, + "step": 1255 + }, + { + "epoch": 1.2206353714448006, + "grad_norm": 0.4140625, + "learning_rate": 3.229611180549258e-06, + "loss": 1.303, + "step": 1256 + }, + { + "epoch": 1.221609120785491, + "grad_norm": 0.42578125, + "learning_rate": 3.2271646693675596e-06, + "loss": 1.3242, + "step": 1257 + }, + { + "epoch": 1.2225828701261816, + "grad_norm": 0.40234375, + "learning_rate": 3.2247173972065593e-06, + "loss": 1.2989, + "step": 1258 + }, + { + "epoch": 1.2235566194668723, + "grad_norm": 0.42578125, + "learning_rate": 3.2222693666273318e-06, + "loss": 1.336, + "step": 1259 + }, + { + "epoch": 1.2245303688075628, + "grad_norm": 0.3984375, + "learning_rate": 3.219820580191747e-06, + "loss": 1.336, + "step": 1260 + }, + { + "epoch": 1.2255041181482533, + "grad_norm": 0.423828125, + "learning_rate": 3.2173710404624646e-06, + "loss": 1.3578, + "step": 1261 + }, + { + "epoch": 1.2264778674889438, + "grad_norm": 0.42578125, + "learning_rate": 3.2149207500029337e-06, + "loss": 1.3048, + "step": 1262 + }, + { + "epoch": 1.2274516168296343, + "grad_norm": 0.400390625, + "learning_rate": 3.212469711377387e-06, + "loss": 1.3316, + "step": 1263 + }, + { + "epoch": 1.228425366170325, + "grad_norm": 0.400390625, + "learning_rate": 3.210017927150842e-06, + "loss": 1.3255, + "step": 1264 + }, + { + "epoch": 1.2293991155110155, + "grad_norm": 0.421875, + "learning_rate": 3.2075653998890953e-06, + "loss": 1.3153, + "step": 1265 + }, + { + "epoch": 1.230372864851706, + "grad_norm": 0.419921875, + "learning_rate": 3.205112132158722e-06, + "loss": 1.3091, + "step": 1266 + }, + { + "epoch": 1.2313466141923965, + "grad_norm": 0.4296875, + "learning_rate": 3.202658126527073e-06, + "loss": 1.3172, + "step": 1267 + }, + { + "epoch": 1.2323203635330873, + "grad_norm": 0.419921875, + "learning_rate": 3.2002033855622683e-06, + "loss": 1.3196, + "step": 1268 + }, + { + "epoch": 1.2332941128737778, + "grad_norm": 0.427734375, + "learning_rate": 3.1977479118331994e-06, + "loss": 1.3345, + "step": 1269 + }, + { + "epoch": 1.2342678622144683, + "grad_norm": 0.42578125, + "learning_rate": 3.195291707909526e-06, + "loss": 1.3345, + "step": 1270 + }, + { + "epoch": 1.2352416115551588, + "grad_norm": 0.40625, + "learning_rate": 3.192834776361669e-06, + "loss": 1.3376, + "step": 1271 + }, + { + "epoch": 1.2362153608958493, + "grad_norm": 0.419921875, + "learning_rate": 3.190377119760813e-06, + "loss": 1.332, + "step": 1272 + }, + { + "epoch": 1.23718911023654, + "grad_norm": 0.42578125, + "learning_rate": 3.1879187406789014e-06, + "loss": 1.3407, + "step": 1273 + }, + { + "epoch": 1.2381628595772305, + "grad_norm": 0.41796875, + "learning_rate": 3.1854596416886313e-06, + "loss": 1.3402, + "step": 1274 + }, + { + "epoch": 1.239136608917921, + "grad_norm": 0.42578125, + "learning_rate": 3.1829998253634554e-06, + "loss": 1.3287, + "step": 1275 + }, + { + "epoch": 1.2401103582586117, + "grad_norm": 0.396484375, + "learning_rate": 3.180539294277577e-06, + "loss": 1.3168, + "step": 1276 + }, + { + "epoch": 1.2410841075993022, + "grad_norm": 0.40625, + "learning_rate": 3.178078051005946e-06, + "loss": 1.3151, + "step": 1277 + }, + { + "epoch": 1.2420578569399927, + "grad_norm": 0.416015625, + "learning_rate": 3.1756160981242596e-06, + "loss": 1.3262, + "step": 1278 + }, + { + "epoch": 1.2430316062806832, + "grad_norm": 0.41015625, + "learning_rate": 3.1731534382089552e-06, + "loss": 1.3139, + "step": 1279 + }, + { + "epoch": 1.2440053556213737, + "grad_norm": 0.443359375, + "learning_rate": 3.170690073837212e-06, + "loss": 1.3259, + "step": 1280 + }, + { + "epoch": 1.2449791049620644, + "grad_norm": 0.419921875, + "learning_rate": 3.168226007586946e-06, + "loss": 1.3313, + "step": 1281 + }, + { + "epoch": 1.245952854302755, + "grad_norm": 0.41796875, + "learning_rate": 3.1657612420368074e-06, + "loss": 1.3243, + "step": 1282 + }, + { + "epoch": 1.2469266036434454, + "grad_norm": 0.412109375, + "learning_rate": 3.163295779766178e-06, + "loss": 1.3218, + "step": 1283 + }, + { + "epoch": 1.247900352984136, + "grad_norm": 0.416015625, + "learning_rate": 3.16082962335517e-06, + "loss": 1.3141, + "step": 1284 + }, + { + "epoch": 1.2488741023248267, + "grad_norm": 0.412109375, + "learning_rate": 3.15836277538462e-06, + "loss": 1.3284, + "step": 1285 + }, + { + "epoch": 1.2498478516655172, + "grad_norm": 0.423828125, + "learning_rate": 3.15589523843609e-06, + "loss": 1.3417, + "step": 1286 + }, + { + "epoch": 1.2508216010062077, + "grad_norm": 0.416015625, + "learning_rate": 3.1534270150918616e-06, + "loss": 1.3158, + "step": 1287 + }, + { + "epoch": 1.2517953503468982, + "grad_norm": 0.419921875, + "learning_rate": 3.1509581079349373e-06, + "loss": 1.3368, + "step": 1288 + }, + { + "epoch": 1.2527690996875886, + "grad_norm": 0.427734375, + "learning_rate": 3.1484885195490323e-06, + "loss": 1.3687, + "step": 1289 + }, + { + "epoch": 1.2537428490282794, + "grad_norm": 0.416015625, + "learning_rate": 3.146018252518575e-06, + "loss": 1.3235, + "step": 1290 + }, + { + "epoch": 1.2547165983689699, + "grad_norm": 0.4140625, + "learning_rate": 3.1435473094287063e-06, + "loss": 1.3594, + "step": 1291 + }, + { + "epoch": 1.2556903477096604, + "grad_norm": 0.4296875, + "learning_rate": 3.1410756928652723e-06, + "loss": 1.3163, + "step": 1292 + }, + { + "epoch": 1.256664097050351, + "grad_norm": 0.41015625, + "learning_rate": 3.1386034054148258e-06, + "loss": 1.324, + "step": 1293 + }, + { + "epoch": 1.2576378463910416, + "grad_norm": 0.419921875, + "learning_rate": 3.1361304496646195e-06, + "loss": 1.3288, + "step": 1294 + }, + { + "epoch": 1.258611595731732, + "grad_norm": 0.416015625, + "learning_rate": 3.133656828202607e-06, + "loss": 1.3164, + "step": 1295 + }, + { + "epoch": 1.2595853450724226, + "grad_norm": 0.419921875, + "learning_rate": 3.131182543617439e-06, + "loss": 1.3116, + "step": 1296 + }, + { + "epoch": 1.260559094413113, + "grad_norm": 0.40625, + "learning_rate": 3.1287075984984573e-06, + "loss": 1.3235, + "step": 1297 + }, + { + "epoch": 1.2615328437538036, + "grad_norm": 0.412109375, + "learning_rate": 3.126231995435699e-06, + "loss": 1.3236, + "step": 1298 + }, + { + "epoch": 1.2625065930944943, + "grad_norm": 0.427734375, + "learning_rate": 3.123755737019886e-06, + "loss": 1.3198, + "step": 1299 + }, + { + "epoch": 1.2634803424351848, + "grad_norm": 0.416015625, + "learning_rate": 3.1212788258424304e-06, + "loss": 1.3232, + "step": 1300 + }, + { + "epoch": 1.2644540917758753, + "grad_norm": 0.41796875, + "learning_rate": 3.118801264495422e-06, + "loss": 1.3241, + "step": 1301 + }, + { + "epoch": 1.265427841116566, + "grad_norm": 0.423828125, + "learning_rate": 3.116323055571635e-06, + "loss": 1.3101, + "step": 1302 + }, + { + "epoch": 1.2664015904572565, + "grad_norm": 0.416015625, + "learning_rate": 3.1138442016645197e-06, + "loss": 1.3365, + "step": 1303 + }, + { + "epoch": 1.267375339797947, + "grad_norm": 0.43359375, + "learning_rate": 3.1113647053682024e-06, + "loss": 1.3186, + "step": 1304 + }, + { + "epoch": 1.2683490891386375, + "grad_norm": 0.419921875, + "learning_rate": 3.1088845692774798e-06, + "loss": 1.3203, + "step": 1305 + }, + { + "epoch": 1.269322838479328, + "grad_norm": 0.421875, + "learning_rate": 3.106403795987821e-06, + "loss": 1.3171, + "step": 1306 + }, + { + "epoch": 1.2702965878200185, + "grad_norm": 0.45703125, + "learning_rate": 3.1039223880953593e-06, + "loss": 1.3087, + "step": 1307 + }, + { + "epoch": 1.2712703371607093, + "grad_norm": 0.412109375, + "learning_rate": 3.1014403481968936e-06, + "loss": 1.3124, + "step": 1308 + }, + { + "epoch": 1.2722440865013998, + "grad_norm": 0.423828125, + "learning_rate": 3.098957678889882e-06, + "loss": 1.3306, + "step": 1309 + }, + { + "epoch": 1.2732178358420903, + "grad_norm": 0.439453125, + "learning_rate": 3.0964743827724453e-06, + "loss": 1.3571, + "step": 1310 + }, + { + "epoch": 1.274191585182781, + "grad_norm": 0.4296875, + "learning_rate": 3.093990462443357e-06, + "loss": 1.3433, + "step": 1311 + }, + { + "epoch": 1.2751653345234715, + "grad_norm": 0.408203125, + "learning_rate": 3.0915059205020443e-06, + "loss": 1.3189, + "step": 1312 + }, + { + "epoch": 1.276139083864162, + "grad_norm": 0.41015625, + "learning_rate": 3.0890207595485856e-06, + "loss": 1.3024, + "step": 1313 + }, + { + "epoch": 1.2771128332048525, + "grad_norm": 0.416015625, + "learning_rate": 3.086534982183707e-06, + "loss": 1.3039, + "step": 1314 + }, + { + "epoch": 1.278086582545543, + "grad_norm": 0.416015625, + "learning_rate": 3.0840485910087797e-06, + "loss": 1.3412, + "step": 1315 + }, + { + "epoch": 1.2790603318862337, + "grad_norm": 0.423828125, + "learning_rate": 3.081561588625817e-06, + "loss": 1.3334, + "step": 1316 + }, + { + "epoch": 1.2800340812269242, + "grad_norm": 0.41015625, + "learning_rate": 3.079073977637472e-06, + "loss": 1.33, + "step": 1317 + }, + { + "epoch": 1.2810078305676147, + "grad_norm": 0.416015625, + "learning_rate": 3.0765857606470353e-06, + "loss": 1.329, + "step": 1318 + }, + { + "epoch": 1.2819815799083052, + "grad_norm": 0.4375, + "learning_rate": 3.0740969402584287e-06, + "loss": 1.3007, + "step": 1319 + }, + { + "epoch": 1.282955329248996, + "grad_norm": 0.41796875, + "learning_rate": 3.0716075190762103e-06, + "loss": 1.3283, + "step": 1320 + }, + { + "epoch": 1.2839290785896864, + "grad_norm": 0.41015625, + "learning_rate": 3.0691174997055632e-06, + "loss": 1.3142, + "step": 1321 + }, + { + "epoch": 1.284902827930377, + "grad_norm": 0.431640625, + "learning_rate": 3.0666268847522984e-06, + "loss": 1.3155, + "step": 1322 + }, + { + "epoch": 1.2858765772710674, + "grad_norm": 0.416015625, + "learning_rate": 3.064135676822849e-06, + "loss": 1.3017, + "step": 1323 + }, + { + "epoch": 1.286850326611758, + "grad_norm": 0.40625, + "learning_rate": 3.061643878524269e-06, + "loss": 1.3107, + "step": 1324 + }, + { + "epoch": 1.2878240759524486, + "grad_norm": 0.4140625, + "learning_rate": 3.059151492464231e-06, + "loss": 1.3271, + "step": 1325 + }, + { + "epoch": 1.2887978252931391, + "grad_norm": 0.435546875, + "learning_rate": 3.0566585212510213e-06, + "loss": 1.32, + "step": 1326 + }, + { + "epoch": 1.2897715746338296, + "grad_norm": 0.41796875, + "learning_rate": 3.05416496749354e-06, + "loss": 1.3155, + "step": 1327 + }, + { + "epoch": 1.2907453239745204, + "grad_norm": 0.431640625, + "learning_rate": 3.051670833801296e-06, + "loss": 1.3437, + "step": 1328 + }, + { + "epoch": 1.2917190733152109, + "grad_norm": 0.412109375, + "learning_rate": 3.0491761227844054e-06, + "loss": 1.313, + "step": 1329 + }, + { + "epoch": 1.2926928226559014, + "grad_norm": 0.42578125, + "learning_rate": 3.0466808370535878e-06, + "loss": 1.3255, + "step": 1330 + }, + { + "epoch": 1.2936665719965919, + "grad_norm": 0.4296875, + "learning_rate": 3.044184979220165e-06, + "loss": 1.3475, + "step": 1331 + }, + { + "epoch": 1.2946403213372824, + "grad_norm": 0.421875, + "learning_rate": 3.041688551896057e-06, + "loss": 1.3305, + "step": 1332 + }, + { + "epoch": 1.2956140706779729, + "grad_norm": 0.408203125, + "learning_rate": 3.039191557693782e-06, + "loss": 1.2907, + "step": 1333 + }, + { + "epoch": 1.2965878200186636, + "grad_norm": 0.404296875, + "learning_rate": 3.0366939992264467e-06, + "loss": 1.3218, + "step": 1334 + }, + { + "epoch": 1.297561569359354, + "grad_norm": 0.435546875, + "learning_rate": 3.0341958791077524e-06, + "loss": 1.3171, + "step": 1335 + }, + { + "epoch": 1.2985353187000446, + "grad_norm": 0.439453125, + "learning_rate": 3.031697199951987e-06, + "loss": 1.3191, + "step": 1336 + }, + { + "epoch": 1.2995090680407353, + "grad_norm": 0.43359375, + "learning_rate": 3.0291979643740235e-06, + "loss": 1.3216, + "step": 1337 + }, + { + "epoch": 1.3004828173814258, + "grad_norm": 0.43359375, + "learning_rate": 3.026698174989316e-06, + "loss": 1.3069, + "step": 1338 + }, + { + "epoch": 1.3014565667221163, + "grad_norm": 0.419921875, + "learning_rate": 3.0241978344139e-06, + "loss": 1.3342, + "step": 1339 + }, + { + "epoch": 1.3024303160628068, + "grad_norm": 0.41015625, + "learning_rate": 3.021696945264387e-06, + "loss": 1.3129, + "step": 1340 + }, + { + "epoch": 1.3034040654034973, + "grad_norm": 0.423828125, + "learning_rate": 3.019195510157962e-06, + "loss": 1.3072, + "step": 1341 + }, + { + "epoch": 1.304377814744188, + "grad_norm": 0.423828125, + "learning_rate": 3.0166935317123824e-06, + "loss": 1.3239, + "step": 1342 + }, + { + "epoch": 1.3053515640848785, + "grad_norm": 0.423828125, + "learning_rate": 3.0141910125459744e-06, + "loss": 1.3281, + "step": 1343 + }, + { + "epoch": 1.306325313425569, + "grad_norm": 0.421875, + "learning_rate": 3.011687955277628e-06, + "loss": 1.3332, + "step": 1344 + }, + { + "epoch": 1.3072990627662595, + "grad_norm": 0.41796875, + "learning_rate": 3.0091843625267975e-06, + "loss": 1.3286, + "step": 1345 + }, + { + "epoch": 1.3082728121069502, + "grad_norm": 0.41015625, + "learning_rate": 3.0066802369134994e-06, + "loss": 1.3131, + "step": 1346 + }, + { + "epoch": 1.3092465614476407, + "grad_norm": 0.431640625, + "learning_rate": 3.0041755810583057e-06, + "loss": 1.3039, + "step": 1347 + }, + { + "epoch": 1.3102203107883312, + "grad_norm": 0.443359375, + "learning_rate": 3.0016703975823424e-06, + "loss": 1.3248, + "step": 1348 + }, + { + "epoch": 1.3111940601290217, + "grad_norm": 0.41796875, + "learning_rate": 2.9991646891072913e-06, + "loss": 1.2892, + "step": 1349 + }, + { + "epoch": 1.3121678094697122, + "grad_norm": 0.41015625, + "learning_rate": 2.99665845825538e-06, + "loss": 1.3322, + "step": 1350 + }, + { + "epoch": 1.313141558810403, + "grad_norm": 0.41796875, + "learning_rate": 2.994151707649384e-06, + "loss": 1.3231, + "step": 1351 + }, + { + "epoch": 1.3141153081510935, + "grad_norm": 0.41015625, + "learning_rate": 2.9916444399126245e-06, + "loss": 1.3194, + "step": 1352 + }, + { + "epoch": 1.315089057491784, + "grad_norm": 0.431640625, + "learning_rate": 2.98913665766896e-06, + "loss": 1.3136, + "step": 1353 + }, + { + "epoch": 1.3160628068324747, + "grad_norm": 0.41796875, + "learning_rate": 2.986628363542792e-06, + "loss": 1.3007, + "step": 1354 + }, + { + "epoch": 1.3170365561731652, + "grad_norm": 0.423828125, + "learning_rate": 2.9841195601590526e-06, + "loss": 1.3282, + "step": 1355 + }, + { + "epoch": 1.3180103055138557, + "grad_norm": 0.427734375, + "learning_rate": 2.981610250143213e-06, + "loss": 1.3295, + "step": 1356 + }, + { + "epoch": 1.3189840548545462, + "grad_norm": 0.41796875, + "learning_rate": 2.9791004361212687e-06, + "loss": 1.3109, + "step": 1357 + }, + { + "epoch": 1.3199578041952367, + "grad_norm": 0.431640625, + "learning_rate": 2.976590120719746e-06, + "loss": 1.3077, + "step": 1358 + }, + { + "epoch": 1.3209315535359272, + "grad_norm": 0.421875, + "learning_rate": 2.9740793065656953e-06, + "loss": 1.3384, + "step": 1359 + }, + { + "epoch": 1.321905302876618, + "grad_norm": 0.41796875, + "learning_rate": 2.9715679962866885e-06, + "loss": 1.2938, + "step": 1360 + }, + { + "epoch": 1.3228790522173084, + "grad_norm": 0.42578125, + "learning_rate": 2.9690561925108168e-06, + "loss": 1.3282, + "step": 1361 + }, + { + "epoch": 1.323852801557999, + "grad_norm": 0.431640625, + "learning_rate": 2.966543897866688e-06, + "loss": 1.3424, + "step": 1362 + }, + { + "epoch": 1.3248265508986896, + "grad_norm": 0.4296875, + "learning_rate": 2.9640311149834234e-06, + "loss": 1.3368, + "step": 1363 + }, + { + "epoch": 1.3258003002393801, + "grad_norm": 0.42578125, + "learning_rate": 2.9615178464906558e-06, + "loss": 1.318, + "step": 1364 + }, + { + "epoch": 1.3267740495800706, + "grad_norm": 0.43359375, + "learning_rate": 2.959004095018525e-06, + "loss": 1.3023, + "step": 1365 + }, + { + "epoch": 1.3277477989207611, + "grad_norm": 0.447265625, + "learning_rate": 2.9564898631976786e-06, + "loss": 1.306, + "step": 1366 + }, + { + "epoch": 1.3287215482614516, + "grad_norm": 0.421875, + "learning_rate": 2.953975153659263e-06, + "loss": 1.3101, + "step": 1367 + }, + { + "epoch": 1.3296952976021421, + "grad_norm": 0.421875, + "learning_rate": 2.9514599690349287e-06, + "loss": 1.3358, + "step": 1368 + }, + { + "epoch": 1.3306690469428328, + "grad_norm": 0.416015625, + "learning_rate": 2.9489443119568204e-06, + "loss": 1.3106, + "step": 1369 + }, + { + "epoch": 1.3316427962835233, + "grad_norm": 0.404296875, + "learning_rate": 2.9464281850575795e-06, + "loss": 1.3143, + "step": 1370 + }, + { + "epoch": 1.3326165456242138, + "grad_norm": 0.408203125, + "learning_rate": 2.9439115909703366e-06, + "loss": 1.3247, + "step": 1371 + }, + { + "epoch": 1.3335902949649046, + "grad_norm": 0.43359375, + "learning_rate": 2.9413945323287136e-06, + "loss": 1.3274, + "step": 1372 + }, + { + "epoch": 1.334564044305595, + "grad_norm": 0.447265625, + "learning_rate": 2.938877011766817e-06, + "loss": 1.3221, + "step": 1373 + }, + { + "epoch": 1.3355377936462856, + "grad_norm": 0.435546875, + "learning_rate": 2.936359031919237e-06, + "loss": 1.3119, + "step": 1374 + }, + { + "epoch": 1.336511542986976, + "grad_norm": 0.42578125, + "learning_rate": 2.9338405954210446e-06, + "loss": 1.3163, + "step": 1375 + }, + { + "epoch": 1.3374852923276666, + "grad_norm": 0.41796875, + "learning_rate": 2.9313217049077887e-06, + "loss": 1.3262, + "step": 1376 + }, + { + "epoch": 1.3384590416683573, + "grad_norm": 0.408203125, + "learning_rate": 2.928802363015494e-06, + "loss": 1.3322, + "step": 1377 + }, + { + "epoch": 1.3394327910090478, + "grad_norm": 0.4140625, + "learning_rate": 2.9262825723806563e-06, + "loss": 1.3207, + "step": 1378 + }, + { + "epoch": 1.3404065403497383, + "grad_norm": 0.43359375, + "learning_rate": 2.9237623356402423e-06, + "loss": 1.33, + "step": 1379 + }, + { + "epoch": 1.3413802896904288, + "grad_norm": 0.44140625, + "learning_rate": 2.9212416554316837e-06, + "loss": 1.3286, + "step": 1380 + }, + { + "epoch": 1.3423540390311195, + "grad_norm": 0.412109375, + "learning_rate": 2.9187205343928783e-06, + "loss": 1.3208, + "step": 1381 + }, + { + "epoch": 1.34332778837181, + "grad_norm": 0.44140625, + "learning_rate": 2.916198975162184e-06, + "loss": 1.3396, + "step": 1382 + }, + { + "epoch": 1.3443015377125005, + "grad_norm": 0.412109375, + "learning_rate": 2.913676980378418e-06, + "loss": 1.3267, + "step": 1383 + }, + { + "epoch": 1.345275287053191, + "grad_norm": 0.4140625, + "learning_rate": 2.9111545526808528e-06, + "loss": 1.3116, + "step": 1384 + }, + { + "epoch": 1.3462490363938815, + "grad_norm": 0.4296875, + "learning_rate": 2.9086316947092135e-06, + "loss": 1.3048, + "step": 1385 + }, + { + "epoch": 1.3472227857345722, + "grad_norm": 0.447265625, + "learning_rate": 2.9061084091036767e-06, + "loss": 1.3175, + "step": 1386 + }, + { + "epoch": 1.3481965350752627, + "grad_norm": 0.4453125, + "learning_rate": 2.9035846985048666e-06, + "loss": 1.3457, + "step": 1387 + }, + { + "epoch": 1.3491702844159532, + "grad_norm": 0.423828125, + "learning_rate": 2.90106056555385e-06, + "loss": 1.3214, + "step": 1388 + }, + { + "epoch": 1.350144033756644, + "grad_norm": 0.41796875, + "learning_rate": 2.8985360128921388e-06, + "loss": 1.316, + "step": 1389 + }, + { + "epoch": 1.3511177830973344, + "grad_norm": 0.4140625, + "learning_rate": 2.8960110431616806e-06, + "loss": 1.3004, + "step": 1390 + }, + { + "epoch": 1.352091532438025, + "grad_norm": 0.427734375, + "learning_rate": 2.8934856590048625e-06, + "loss": 1.3256, + "step": 1391 + }, + { + "epoch": 1.3530652817787154, + "grad_norm": 0.4609375, + "learning_rate": 2.8909598630645042e-06, + "loss": 1.3187, + "step": 1392 + }, + { + "epoch": 1.354039031119406, + "grad_norm": 0.423828125, + "learning_rate": 2.8884336579838556e-06, + "loss": 1.3232, + "step": 1393 + }, + { + "epoch": 1.3550127804600964, + "grad_norm": 0.4375, + "learning_rate": 2.8859070464065963e-06, + "loss": 1.3082, + "step": 1394 + }, + { + "epoch": 1.3559865298007872, + "grad_norm": 0.45703125, + "learning_rate": 2.88338003097683e-06, + "loss": 1.3194, + "step": 1395 + }, + { + "epoch": 1.3569602791414777, + "grad_norm": 0.4375, + "learning_rate": 2.880852614339083e-06, + "loss": 1.3119, + "step": 1396 + }, + { + "epoch": 1.3579340284821682, + "grad_norm": 0.404296875, + "learning_rate": 2.878324799138303e-06, + "loss": 1.2957, + "step": 1397 + }, + { + "epoch": 1.3589077778228589, + "grad_norm": 0.408203125, + "learning_rate": 2.8757965880198523e-06, + "loss": 1.3294, + "step": 1398 + }, + { + "epoch": 1.3598815271635494, + "grad_norm": 0.4296875, + "learning_rate": 2.8732679836295103e-06, + "loss": 1.3227, + "step": 1399 + }, + { + "epoch": 1.3608552765042399, + "grad_norm": 0.4375, + "learning_rate": 2.870738988613467e-06, + "loss": 1.3249, + "step": 1400 + }, + { + "epoch": 1.3618290258449304, + "grad_norm": 0.423828125, + "learning_rate": 2.868209605618319e-06, + "loss": 1.3452, + "step": 1401 + }, + { + "epoch": 1.3628027751856209, + "grad_norm": 0.4140625, + "learning_rate": 2.8656798372910714e-06, + "loss": 1.3414, + "step": 1402 + }, + { + "epoch": 1.3637765245263116, + "grad_norm": 0.427734375, + "learning_rate": 2.863149686279133e-06, + "loss": 1.3234, + "step": 1403 + }, + { + "epoch": 1.364750273867002, + "grad_norm": 0.43359375, + "learning_rate": 2.8606191552303103e-06, + "loss": 1.3342, + "step": 1404 + }, + { + "epoch": 1.3657240232076926, + "grad_norm": 0.41015625, + "learning_rate": 2.858088246792811e-06, + "loss": 1.3379, + "step": 1405 + }, + { + "epoch": 1.366697772548383, + "grad_norm": 0.4296875, + "learning_rate": 2.855556963615234e-06, + "loss": 1.3388, + "step": 1406 + }, + { + "epoch": 1.3676715218890738, + "grad_norm": 0.439453125, + "learning_rate": 2.853025308346574e-06, + "loss": 1.331, + "step": 1407 + }, + { + "epoch": 1.3686452712297643, + "grad_norm": 0.462890625, + "learning_rate": 2.850493283636212e-06, + "loss": 1.3115, + "step": 1408 + }, + { + "epoch": 1.3696190205704548, + "grad_norm": 0.40625, + "learning_rate": 2.847960892133918e-06, + "loss": 1.3168, + "step": 1409 + }, + { + "epoch": 1.3705927699111453, + "grad_norm": 0.419921875, + "learning_rate": 2.845428136489844e-06, + "loss": 1.319, + "step": 1410 + }, + { + "epoch": 1.3715665192518358, + "grad_norm": 0.419921875, + "learning_rate": 2.842895019354525e-06, + "loss": 1.347, + "step": 1411 + }, + { + "epoch": 1.3725402685925265, + "grad_norm": 0.416015625, + "learning_rate": 2.8403615433788722e-06, + "loss": 1.3205, + "step": 1412 + }, + { + "epoch": 1.373514017933217, + "grad_norm": 0.427734375, + "learning_rate": 2.837827711214173e-06, + "loss": 1.306, + "step": 1413 + }, + { + "epoch": 1.3744877672739075, + "grad_norm": 0.412109375, + "learning_rate": 2.8352935255120893e-06, + "loss": 1.3281, + "step": 1414 + }, + { + "epoch": 1.3754615166145983, + "grad_norm": 0.416015625, + "learning_rate": 2.8327589889246513e-06, + "loss": 1.3142, + "step": 1415 + }, + { + "epoch": 1.3764352659552888, + "grad_norm": 0.41796875, + "learning_rate": 2.8302241041042564e-06, + "loss": 1.3356, + "step": 1416 + }, + { + "epoch": 1.3774090152959793, + "grad_norm": 0.419921875, + "learning_rate": 2.8276888737036657e-06, + "loss": 1.3273, + "step": 1417 + }, + { + "epoch": 1.3783827646366698, + "grad_norm": 0.41015625, + "learning_rate": 2.8251533003760044e-06, + "loss": 1.3069, + "step": 1418 + }, + { + "epoch": 1.3793565139773603, + "grad_norm": 0.419921875, + "learning_rate": 2.822617386774754e-06, + "loss": 1.3231, + "step": 1419 + }, + { + "epoch": 1.3803302633180508, + "grad_norm": 0.43359375, + "learning_rate": 2.8200811355537543e-06, + "loss": 1.3291, + "step": 1420 + }, + { + "epoch": 1.3813040126587415, + "grad_norm": 0.408203125, + "learning_rate": 2.817544549367197e-06, + "loss": 1.3474, + "step": 1421 + }, + { + "epoch": 1.382277761999432, + "grad_norm": 0.408203125, + "learning_rate": 2.8150076308696247e-06, + "loss": 1.3157, + "step": 1422 + }, + { + "epoch": 1.3832515113401225, + "grad_norm": 0.412109375, + "learning_rate": 2.812470382715927e-06, + "loss": 1.3027, + "step": 1423 + }, + { + "epoch": 1.3842252606808132, + "grad_norm": 0.41015625, + "learning_rate": 2.8099328075613403e-06, + "loss": 1.3373, + "step": 1424 + }, + { + "epoch": 1.3851990100215037, + "grad_norm": 0.419921875, + "learning_rate": 2.807394908061441e-06, + "loss": 1.3396, + "step": 1425 + }, + { + "epoch": 1.3861727593621942, + "grad_norm": 0.4140625, + "learning_rate": 2.8048566868721473e-06, + "loss": 1.3278, + "step": 1426 + }, + { + "epoch": 1.3871465087028847, + "grad_norm": 0.416015625, + "learning_rate": 2.802318146649713e-06, + "loss": 1.2906, + "step": 1427 + }, + { + "epoch": 1.3881202580435752, + "grad_norm": 0.416015625, + "learning_rate": 2.7997792900507236e-06, + "loss": 1.3173, + "step": 1428 + }, + { + "epoch": 1.389094007384266, + "grad_norm": 0.419921875, + "learning_rate": 2.7972401197321e-06, + "loss": 1.3236, + "step": 1429 + }, + { + "epoch": 1.3900677567249564, + "grad_norm": 0.42578125, + "learning_rate": 2.7947006383510868e-06, + "loss": 1.3112, + "step": 1430 + }, + { + "epoch": 1.391041506065647, + "grad_norm": 0.41015625, + "learning_rate": 2.7921608485652585e-06, + "loss": 1.3085, + "step": 1431 + }, + { + "epoch": 1.3920152554063374, + "grad_norm": 0.40234375, + "learning_rate": 2.7896207530325093e-06, + "loss": 1.3015, + "step": 1432 + }, + { + "epoch": 1.3929890047470281, + "grad_norm": 0.412109375, + "learning_rate": 2.7870803544110546e-06, + "loss": 1.3219, + "step": 1433 + }, + { + "epoch": 1.3939627540877186, + "grad_norm": 0.4296875, + "learning_rate": 2.7845396553594267e-06, + "loss": 1.3122, + "step": 1434 + }, + { + "epoch": 1.3949365034284091, + "grad_norm": 0.42578125, + "learning_rate": 2.7819986585364715e-06, + "loss": 1.2978, + "step": 1435 + }, + { + "epoch": 1.3959102527690996, + "grad_norm": 0.47265625, + "learning_rate": 2.7794573666013485e-06, + "loss": 1.3255, + "step": 1436 + }, + { + "epoch": 1.3968840021097901, + "grad_norm": 0.4140625, + "learning_rate": 2.7769157822135246e-06, + "loss": 1.3289, + "step": 1437 + }, + { + "epoch": 1.3978577514504809, + "grad_norm": 0.40625, + "learning_rate": 2.7743739080327724e-06, + "loss": 1.2938, + "step": 1438 + }, + { + "epoch": 1.3988315007911714, + "grad_norm": 0.421875, + "learning_rate": 2.771831746719169e-06, + "loss": 1.3264, + "step": 1439 + }, + { + "epoch": 1.3998052501318619, + "grad_norm": 0.404296875, + "learning_rate": 2.769289300933091e-06, + "loss": 1.3336, + "step": 1440 + }, + { + "epoch": 1.4007789994725526, + "grad_norm": 0.404296875, + "learning_rate": 2.7667465733352133e-06, + "loss": 1.317, + "step": 1441 + }, + { + "epoch": 1.401752748813243, + "grad_norm": 0.408203125, + "learning_rate": 2.7642035665865065e-06, + "loss": 1.3164, + "step": 1442 + }, + { + "epoch": 1.4027264981539336, + "grad_norm": 0.42578125, + "learning_rate": 2.76166028334823e-06, + "loss": 1.3167, + "step": 1443 + }, + { + "epoch": 1.403700247494624, + "grad_norm": 0.427734375, + "learning_rate": 2.7591167262819356e-06, + "loss": 1.3123, + "step": 1444 + }, + { + "epoch": 1.4046739968353146, + "grad_norm": 0.404296875, + "learning_rate": 2.7565728980494622e-06, + "loss": 1.3292, + "step": 1445 + }, + { + "epoch": 1.405647746176005, + "grad_norm": 0.42578125, + "learning_rate": 2.754028801312931e-06, + "loss": 1.325, + "step": 1446 + }, + { + "epoch": 1.4066214955166958, + "grad_norm": 0.416015625, + "learning_rate": 2.7514844387347434e-06, + "loss": 1.3307, + "step": 1447 + }, + { + "epoch": 1.4075952448573863, + "grad_norm": 0.400390625, + "learning_rate": 2.7489398129775803e-06, + "loss": 1.3054, + "step": 1448 + }, + { + "epoch": 1.4085689941980768, + "grad_norm": 0.404296875, + "learning_rate": 2.7463949267043978e-06, + "loss": 1.3235, + "step": 1449 + }, + { + "epoch": 1.4095427435387675, + "grad_norm": 0.40625, + "learning_rate": 2.7438497825784242e-06, + "loss": 1.3009, + "step": 1450 + }, + { + "epoch": 1.410516492879458, + "grad_norm": 0.408203125, + "learning_rate": 2.74130438326316e-06, + "loss": 1.3134, + "step": 1451 + }, + { + "epoch": 1.4114902422201485, + "grad_norm": 0.41015625, + "learning_rate": 2.7387587314223673e-06, + "loss": 1.3411, + "step": 1452 + }, + { + "epoch": 1.412463991560839, + "grad_norm": 0.408203125, + "learning_rate": 2.7362128297200784e-06, + "loss": 1.3121, + "step": 1453 + }, + { + "epoch": 1.4134377409015295, + "grad_norm": 0.3984375, + "learning_rate": 2.7336666808205844e-06, + "loss": 1.3225, + "step": 1454 + }, + { + "epoch": 1.41441149024222, + "grad_norm": 0.404296875, + "learning_rate": 2.7311202873884345e-06, + "loss": 1.31, + "step": 1455 + }, + { + "epoch": 1.4153852395829107, + "grad_norm": 0.41015625, + "learning_rate": 2.7285736520884355e-06, + "loss": 1.3111, + "step": 1456 + }, + { + "epoch": 1.4163589889236012, + "grad_norm": 0.41015625, + "learning_rate": 2.726026777585645e-06, + "loss": 1.3287, + "step": 1457 + }, + { + "epoch": 1.4173327382642917, + "grad_norm": 0.4140625, + "learning_rate": 2.723479666545373e-06, + "loss": 1.3063, + "step": 1458 + }, + { + "epoch": 1.4183064876049825, + "grad_norm": 0.40625, + "learning_rate": 2.7209323216331774e-06, + "loss": 1.3414, + "step": 1459 + }, + { + "epoch": 1.419280236945673, + "grad_norm": 0.4296875, + "learning_rate": 2.7183847455148594e-06, + "loss": 1.3401, + "step": 1460 + }, + { + "epoch": 1.4202539862863635, + "grad_norm": 0.40625, + "learning_rate": 2.715836940856461e-06, + "loss": 1.3144, + "step": 1461 + }, + { + "epoch": 1.421227735627054, + "grad_norm": 0.400390625, + "learning_rate": 2.7132889103242664e-06, + "loss": 1.322, + "step": 1462 + }, + { + "epoch": 1.4222014849677445, + "grad_norm": 0.4140625, + "learning_rate": 2.7107406565847943e-06, + "loss": 1.3164, + "step": 1463 + }, + { + "epoch": 1.4231752343084352, + "grad_norm": 0.419921875, + "learning_rate": 2.708192182304798e-06, + "loss": 1.333, + "step": 1464 + }, + { + "epoch": 1.4241489836491257, + "grad_norm": 0.412109375, + "learning_rate": 2.70564349015126e-06, + "loss": 1.3385, + "step": 1465 + }, + { + "epoch": 1.4251227329898162, + "grad_norm": 0.400390625, + "learning_rate": 2.7030945827913935e-06, + "loss": 1.312, + "step": 1466 + }, + { + "epoch": 1.4260964823305067, + "grad_norm": 0.3984375, + "learning_rate": 2.700545462892633e-06, + "loss": 1.3395, + "step": 1467 + }, + { + "epoch": 1.4270702316711974, + "grad_norm": 0.4140625, + "learning_rate": 2.6979961331226386e-06, + "loss": 1.3315, + "step": 1468 + }, + { + "epoch": 1.428043981011888, + "grad_norm": 0.404296875, + "learning_rate": 2.6954465961492892e-06, + "loss": 1.306, + "step": 1469 + }, + { + "epoch": 1.4290177303525784, + "grad_norm": 0.408203125, + "learning_rate": 2.692896854640681e-06, + "loss": 1.3272, + "step": 1470 + }, + { + "epoch": 1.429991479693269, + "grad_norm": 0.416015625, + "learning_rate": 2.6903469112651233e-06, + "loss": 1.3382, + "step": 1471 + }, + { + "epoch": 1.4309652290339594, + "grad_norm": 0.416015625, + "learning_rate": 2.687796768691136e-06, + "loss": 1.3116, + "step": 1472 + }, + { + "epoch": 1.4319389783746501, + "grad_norm": 0.412109375, + "learning_rate": 2.685246429587449e-06, + "loss": 1.3214, + "step": 1473 + }, + { + "epoch": 1.4329127277153406, + "grad_norm": 0.40234375, + "learning_rate": 2.6826958966229988e-06, + "loss": 1.3184, + "step": 1474 + }, + { + "epoch": 1.4338864770560311, + "grad_norm": 0.40625, + "learning_rate": 2.680145172466921e-06, + "loss": 1.3083, + "step": 1475 + }, + { + "epoch": 1.4348602263967218, + "grad_norm": 0.392578125, + "learning_rate": 2.677594259788555e-06, + "loss": 1.3113, + "step": 1476 + }, + { + "epoch": 1.4358339757374123, + "grad_norm": 0.41796875, + "learning_rate": 2.6750431612574366e-06, + "loss": 1.335, + "step": 1477 + }, + { + "epoch": 1.4368077250781028, + "grad_norm": 0.423828125, + "learning_rate": 2.6724918795432942e-06, + "loss": 1.3067, + "step": 1478 + }, + { + "epoch": 1.4377814744187933, + "grad_norm": 0.3984375, + "learning_rate": 2.6699404173160496e-06, + "loss": 1.3199, + "step": 1479 + }, + { + "epoch": 1.4387552237594838, + "grad_norm": 0.40234375, + "learning_rate": 2.667388777245814e-06, + "loss": 1.2809, + "step": 1480 + }, + { + "epoch": 1.4397289731001743, + "grad_norm": 0.4140625, + "learning_rate": 2.664836962002883e-06, + "loss": 1.3153, + "step": 1481 + }, + { + "epoch": 1.440702722440865, + "grad_norm": 0.40234375, + "learning_rate": 2.662284974257738e-06, + "loss": 1.3068, + "step": 1482 + }, + { + "epoch": 1.4416764717815556, + "grad_norm": 0.40234375, + "learning_rate": 2.659732816681037e-06, + "loss": 1.3182, + "step": 1483 + }, + { + "epoch": 1.442650221122246, + "grad_norm": 0.41796875, + "learning_rate": 2.6571804919436195e-06, + "loss": 1.3285, + "step": 1484 + }, + { + "epoch": 1.4436239704629368, + "grad_norm": 0.4140625, + "learning_rate": 2.6546280027164985e-06, + "loss": 1.3073, + "step": 1485 + }, + { + "epoch": 1.4445977198036273, + "grad_norm": 0.400390625, + "learning_rate": 2.652075351670858e-06, + "loss": 1.3153, + "step": 1486 + }, + { + "epoch": 1.4455714691443178, + "grad_norm": 0.404296875, + "learning_rate": 2.6495225414780545e-06, + "loss": 1.3037, + "step": 1487 + }, + { + "epoch": 1.4465452184850083, + "grad_norm": 0.40625, + "learning_rate": 2.646969574809608e-06, + "loss": 1.326, + "step": 1488 + }, + { + "epoch": 1.4475189678256988, + "grad_norm": 0.40625, + "learning_rate": 2.644416454337203e-06, + "loss": 1.3135, + "step": 1489 + }, + { + "epoch": 1.4484927171663895, + "grad_norm": 0.41796875, + "learning_rate": 2.6418631827326857e-06, + "loss": 1.3028, + "step": 1490 + }, + { + "epoch": 1.44946646650708, + "grad_norm": 0.4296875, + "learning_rate": 2.639309762668059e-06, + "loss": 1.3188, + "step": 1491 + }, + { + "epoch": 1.4504402158477705, + "grad_norm": 0.4140625, + "learning_rate": 2.636756196815484e-06, + "loss": 1.3191, + "step": 1492 + }, + { + "epoch": 1.451413965188461, + "grad_norm": 0.404296875, + "learning_rate": 2.634202487847271e-06, + "loss": 1.3261, + "step": 1493 + }, + { + "epoch": 1.4523877145291517, + "grad_norm": 0.40234375, + "learning_rate": 2.631648638435883e-06, + "loss": 1.3176, + "step": 1494 + }, + { + "epoch": 1.4533614638698422, + "grad_norm": 0.4140625, + "learning_rate": 2.629094651253927e-06, + "loss": 1.3236, + "step": 1495 + }, + { + "epoch": 1.4543352132105327, + "grad_norm": 0.400390625, + "learning_rate": 2.6265405289741567e-06, + "loss": 1.3197, + "step": 1496 + }, + { + "epoch": 1.4553089625512232, + "grad_norm": 0.404296875, + "learning_rate": 2.6239862742694668e-06, + "loss": 1.3219, + "step": 1497 + }, + { + "epoch": 1.4562827118919137, + "grad_norm": 0.41796875, + "learning_rate": 2.621431889812888e-06, + "loss": 1.3157, + "step": 1498 + }, + { + "epoch": 1.4572564612326044, + "grad_norm": 0.416015625, + "learning_rate": 2.6188773782775917e-06, + "loss": 1.318, + "step": 1499 + }, + { + "epoch": 1.458230210573295, + "grad_norm": 0.40625, + "learning_rate": 2.616322742336877e-06, + "loss": 1.3216, + "step": 1500 + }, + { + "epoch": 1.4592039599139854, + "grad_norm": 0.416015625, + "learning_rate": 2.6137679846641765e-06, + "loss": 1.3314, + "step": 1501 + }, + { + "epoch": 1.4601777092546762, + "grad_norm": 0.412109375, + "learning_rate": 2.6112131079330494e-06, + "loss": 1.3215, + "step": 1502 + }, + { + "epoch": 1.4611514585953667, + "grad_norm": 0.39453125, + "learning_rate": 2.6086581148171784e-06, + "loss": 1.3238, + "step": 1503 + }, + { + "epoch": 1.4621252079360572, + "grad_norm": 0.427734375, + "learning_rate": 2.606103007990371e-06, + "loss": 1.3354, + "step": 1504 + }, + { + "epoch": 1.4630989572767477, + "grad_norm": 0.41015625, + "learning_rate": 2.6035477901265506e-06, + "loss": 1.325, + "step": 1505 + }, + { + "epoch": 1.4640727066174382, + "grad_norm": 0.4140625, + "learning_rate": 2.6009924638997566e-06, + "loss": 1.3116, + "step": 1506 + }, + { + "epoch": 1.4650464559581287, + "grad_norm": 0.41015625, + "learning_rate": 2.5984370319841444e-06, + "loss": 1.3227, + "step": 1507 + }, + { + "epoch": 1.4660202052988194, + "grad_norm": 0.4140625, + "learning_rate": 2.595881497053979e-06, + "loss": 1.3134, + "step": 1508 + }, + { + "epoch": 1.4669939546395099, + "grad_norm": 0.40234375, + "learning_rate": 2.593325861783632e-06, + "loss": 1.3098, + "step": 1509 + }, + { + "epoch": 1.4679677039802004, + "grad_norm": 0.40234375, + "learning_rate": 2.590770128847582e-06, + "loss": 1.3104, + "step": 1510 + }, + { + "epoch": 1.468941453320891, + "grad_norm": 0.412109375, + "learning_rate": 2.5882143009204074e-06, + "loss": 1.3373, + "step": 1511 + }, + { + "epoch": 1.4699152026615816, + "grad_norm": 0.404296875, + "learning_rate": 2.585658380676788e-06, + "loss": 1.3148, + "step": 1512 + }, + { + "epoch": 1.470888952002272, + "grad_norm": 0.408203125, + "learning_rate": 2.5831023707915e-06, + "loss": 1.299, + "step": 1513 + }, + { + "epoch": 1.4718627013429626, + "grad_norm": 0.41015625, + "learning_rate": 2.5805462739394118e-06, + "loss": 1.3312, + "step": 1514 + }, + { + "epoch": 1.472836450683653, + "grad_norm": 0.412109375, + "learning_rate": 2.5779900927954844e-06, + "loss": 1.2993, + "step": 1515 + }, + { + "epoch": 1.4738102000243436, + "grad_norm": 0.41796875, + "learning_rate": 2.575433830034767e-06, + "loss": 1.3354, + "step": 1516 + }, + { + "epoch": 1.4747839493650343, + "grad_norm": 0.408203125, + "learning_rate": 2.572877488332393e-06, + "loss": 1.3343, + "step": 1517 + }, + { + "epoch": 1.4757576987057248, + "grad_norm": 0.3984375, + "learning_rate": 2.57032107036358e-06, + "loss": 1.317, + "step": 1518 + }, + { + "epoch": 1.4767314480464153, + "grad_norm": 0.40625, + "learning_rate": 2.5677645788036245e-06, + "loss": 1.3155, + "step": 1519 + }, + { + "epoch": 1.477705197387106, + "grad_norm": 0.41796875, + "learning_rate": 2.5652080163278986e-06, + "loss": 1.3329, + "step": 1520 + }, + { + "epoch": 1.4786789467277965, + "grad_norm": 0.427734375, + "learning_rate": 2.5626513856118527e-06, + "loss": 1.3123, + "step": 1521 + }, + { + "epoch": 1.479652696068487, + "grad_norm": 0.40234375, + "learning_rate": 2.5600946893310043e-06, + "loss": 1.329, + "step": 1522 + }, + { + "epoch": 1.4806264454091775, + "grad_norm": 0.40234375, + "learning_rate": 2.557537930160941e-06, + "loss": 1.3235, + "step": 1523 + }, + { + "epoch": 1.481600194749868, + "grad_norm": 0.40625, + "learning_rate": 2.5549811107773175e-06, + "loss": 1.314, + "step": 1524 + }, + { + "epoch": 1.4825739440905588, + "grad_norm": 0.400390625, + "learning_rate": 2.5524242338558498e-06, + "loss": 1.3204, + "step": 1525 + }, + { + "epoch": 1.4835476934312493, + "grad_norm": 0.404296875, + "learning_rate": 2.5498673020723147e-06, + "loss": 1.31, + "step": 1526 + }, + { + "epoch": 1.4845214427719398, + "grad_norm": 0.435546875, + "learning_rate": 2.547310318102548e-06, + "loss": 1.3296, + "step": 1527 + }, + { + "epoch": 1.4854951921126303, + "grad_norm": 0.416015625, + "learning_rate": 2.5447532846224364e-06, + "loss": 1.3187, + "step": 1528 + }, + { + "epoch": 1.486468941453321, + "grad_norm": 0.41796875, + "learning_rate": 2.5421962043079217e-06, + "loss": 1.3136, + "step": 1529 + }, + { + "epoch": 1.4874426907940115, + "grad_norm": 0.416015625, + "learning_rate": 2.539639079834994e-06, + "loss": 1.3299, + "step": 1530 + }, + { + "epoch": 1.488416440134702, + "grad_norm": 0.41015625, + "learning_rate": 2.5370819138796884e-06, + "loss": 1.3279, + "step": 1531 + }, + { + "epoch": 1.4893901894753925, + "grad_norm": 0.4140625, + "learning_rate": 2.534524709118085e-06, + "loss": 1.3395, + "step": 1532 + }, + { + "epoch": 1.490363938816083, + "grad_norm": 0.4375, + "learning_rate": 2.5319674682263033e-06, + "loss": 1.3349, + "step": 1533 + }, + { + "epoch": 1.4913376881567737, + "grad_norm": 0.423828125, + "learning_rate": 2.5294101938805017e-06, + "loss": 1.3386, + "step": 1534 + }, + { + "epoch": 1.4923114374974642, + "grad_norm": 0.42578125, + "learning_rate": 2.5268528887568723e-06, + "loss": 1.3233, + "step": 1535 + }, + { + "epoch": 1.4932851868381547, + "grad_norm": 0.427734375, + "learning_rate": 2.5242955555316396e-06, + "loss": 1.3073, + "step": 1536 + }, + { + "epoch": 1.4942589361788454, + "grad_norm": 0.408203125, + "learning_rate": 2.52173819688106e-06, + "loss": 1.3273, + "step": 1537 + }, + { + "epoch": 1.495232685519536, + "grad_norm": 0.40234375, + "learning_rate": 2.5191808154814124e-06, + "loss": 1.3105, + "step": 1538 + }, + { + "epoch": 1.4962064348602264, + "grad_norm": 0.423828125, + "learning_rate": 2.516623414009003e-06, + "loss": 1.309, + "step": 1539 + }, + { + "epoch": 1.497180184200917, + "grad_norm": 0.40234375, + "learning_rate": 2.514065995140156e-06, + "loss": 1.316, + "step": 1540 + }, + { + "epoch": 1.4981539335416074, + "grad_norm": 0.419921875, + "learning_rate": 2.5115085615512173e-06, + "loss": 1.3237, + "step": 1541 + }, + { + "epoch": 1.499127682882298, + "grad_norm": 0.404296875, + "learning_rate": 2.5089511159185465e-06, + "loss": 1.309, + "step": 1542 + }, + { + "epoch": 1.5001014322229886, + "grad_norm": 0.404296875, + "learning_rate": 2.5063936609185133e-06, + "loss": 1.3211, + "step": 1543 + }, + { + "epoch": 1.5010751815636791, + "grad_norm": 0.4140625, + "learning_rate": 2.503836199227502e-06, + "loss": 1.3328, + "step": 1544 + }, + { + "epoch": 1.5020489309043699, + "grad_norm": 0.419921875, + "learning_rate": 2.5012787335218997e-06, + "loss": 1.3355, + "step": 1545 + }, + { + "epoch": 1.5030226802450604, + "grad_norm": 0.412109375, + "learning_rate": 2.498721266478101e-06, + "loss": 1.3007, + "step": 1546 + }, + { + "epoch": 1.5039964295857509, + "grad_norm": 0.41015625, + "learning_rate": 2.496163800772499e-06, + "loss": 1.308, + "step": 1547 + }, + { + "epoch": 1.5049701789264414, + "grad_norm": 0.40625, + "learning_rate": 2.493606339081487e-06, + "loss": 1.326, + "step": 1548 + }, + { + "epoch": 1.5059439282671319, + "grad_norm": 0.423828125, + "learning_rate": 2.4910488840814543e-06, + "loss": 1.3158, + "step": 1549 + }, + { + "epoch": 1.5069176776078224, + "grad_norm": 0.41015625, + "learning_rate": 2.4884914384487822e-06, + "loss": 1.3075, + "step": 1550 + }, + { + "epoch": 1.5078914269485129, + "grad_norm": 0.408203125, + "learning_rate": 2.4859340048598438e-06, + "loss": 1.3076, + "step": 1551 + }, + { + "epoch": 1.5088651762892036, + "grad_norm": 0.44140625, + "learning_rate": 2.483376585990998e-06, + "loss": 1.317, + "step": 1552 + }, + { + "epoch": 1.509838925629894, + "grad_norm": 0.396484375, + "learning_rate": 2.4808191845185885e-06, + "loss": 1.3139, + "step": 1553 + }, + { + "epoch": 1.5108126749705848, + "grad_norm": 0.408203125, + "learning_rate": 2.4782618031189405e-06, + "loss": 1.3104, + "step": 1554 + }, + { + "epoch": 1.5117864243112753, + "grad_norm": 0.404296875, + "learning_rate": 2.475704444468361e-06, + "loss": 1.3422, + "step": 1555 + }, + { + "epoch": 1.5127601736519658, + "grad_norm": 0.4140625, + "learning_rate": 2.473147111243129e-06, + "loss": 1.3094, + "step": 1556 + }, + { + "epoch": 1.5137339229926563, + "grad_norm": 0.431640625, + "learning_rate": 2.470589806119499e-06, + "loss": 1.3046, + "step": 1557 + }, + { + "epoch": 1.5147076723333468, + "grad_norm": 0.416015625, + "learning_rate": 2.468032531773697e-06, + "loss": 1.3303, + "step": 1558 + }, + { + "epoch": 1.5156814216740373, + "grad_norm": 0.41796875, + "learning_rate": 2.4654752908819153e-06, + "loss": 1.319, + "step": 1559 + }, + { + "epoch": 1.5166551710147278, + "grad_norm": 0.408203125, + "learning_rate": 2.4629180861203116e-06, + "loss": 1.3279, + "step": 1560 + }, + { + "epoch": 1.5176289203554185, + "grad_norm": 0.3984375, + "learning_rate": 2.4603609201650066e-06, + "loss": 1.3235, + "step": 1561 + }, + { + "epoch": 1.518602669696109, + "grad_norm": 0.3984375, + "learning_rate": 2.4578037956920783e-06, + "loss": 1.3155, + "step": 1562 + }, + { + "epoch": 1.5195764190367997, + "grad_norm": 0.419921875, + "learning_rate": 2.455246715377564e-06, + "loss": 1.3035, + "step": 1563 + }, + { + "epoch": 1.5205501683774902, + "grad_norm": 0.421875, + "learning_rate": 2.4526896818974534e-06, + "loss": 1.312, + "step": 1564 + }, + { + "epoch": 1.5215239177181807, + "grad_norm": 0.435546875, + "learning_rate": 2.4501326979276857e-06, + "loss": 1.3306, + "step": 1565 + }, + { + "epoch": 1.5224976670588712, + "grad_norm": 0.419921875, + "learning_rate": 2.447575766144151e-06, + "loss": 1.3141, + "step": 1566 + }, + { + "epoch": 1.5234714163995617, + "grad_norm": 0.41796875, + "learning_rate": 2.4450188892226834e-06, + "loss": 1.3198, + "step": 1567 + }, + { + "epoch": 1.5244451657402522, + "grad_norm": 0.412109375, + "learning_rate": 2.4424620698390597e-06, + "loss": 1.3094, + "step": 1568 + }, + { + "epoch": 1.525418915080943, + "grad_norm": 0.400390625, + "learning_rate": 2.4399053106689965e-06, + "loss": 1.3242, + "step": 1569 + }, + { + "epoch": 1.5263926644216335, + "grad_norm": 0.40625, + "learning_rate": 2.4373486143881477e-06, + "loss": 1.3158, + "step": 1570 + }, + { + "epoch": 1.527366413762324, + "grad_norm": 0.42578125, + "learning_rate": 2.4347919836721014e-06, + "loss": 1.3152, + "step": 1571 + }, + { + "epoch": 1.5283401631030147, + "grad_norm": 0.419921875, + "learning_rate": 2.432235421196376e-06, + "loss": 1.3307, + "step": 1572 + }, + { + "epoch": 1.5293139124437052, + "grad_norm": 0.416015625, + "learning_rate": 2.4296789296364202e-06, + "loss": 1.3083, + "step": 1573 + }, + { + "epoch": 1.5302876617843957, + "grad_norm": 0.4140625, + "learning_rate": 2.427122511667608e-06, + "loss": 1.3176, + "step": 1574 + }, + { + "epoch": 1.5312614111250862, + "grad_norm": 0.40234375, + "learning_rate": 2.4245661699652343e-06, + "loss": 1.3168, + "step": 1575 + }, + { + "epoch": 1.5322351604657767, + "grad_norm": 0.40234375, + "learning_rate": 2.4220099072045164e-06, + "loss": 1.327, + "step": 1576 + }, + { + "epoch": 1.5332089098064672, + "grad_norm": 0.408203125, + "learning_rate": 2.4194537260605895e-06, + "loss": 1.3198, + "step": 1577 + }, + { + "epoch": 1.534182659147158, + "grad_norm": 0.40625, + "learning_rate": 2.4168976292085013e-06, + "loss": 1.3137, + "step": 1578 + }, + { + "epoch": 1.5351564084878484, + "grad_norm": 0.412109375, + "learning_rate": 2.4143416193232123e-06, + "loss": 1.3177, + "step": 1579 + }, + { + "epoch": 1.5361301578285391, + "grad_norm": 0.40234375, + "learning_rate": 2.411785699079593e-06, + "loss": 1.3057, + "step": 1580 + }, + { + "epoch": 1.5371039071692296, + "grad_norm": 0.42578125, + "learning_rate": 2.4092298711524183e-06, + "loss": 1.3536, + "step": 1581 + }, + { + "epoch": 1.5380776565099201, + "grad_norm": 0.40234375, + "learning_rate": 2.406674138216368e-06, + "loss": 1.303, + "step": 1582 + }, + { + "epoch": 1.5390514058506106, + "grad_norm": 0.40234375, + "learning_rate": 2.4041185029460214e-06, + "loss": 1.3182, + "step": 1583 + }, + { + "epoch": 1.5400251551913011, + "grad_norm": 0.3984375, + "learning_rate": 2.4015629680158555e-06, + "loss": 1.3098, + "step": 1584 + }, + { + "epoch": 1.5409989045319916, + "grad_norm": 0.40234375, + "learning_rate": 2.3990075361002447e-06, + "loss": 1.3131, + "step": 1585 + }, + { + "epoch": 1.5419726538726821, + "grad_norm": 0.40625, + "learning_rate": 2.396452209873451e-06, + "loss": 1.3019, + "step": 1586 + }, + { + "epoch": 1.5429464032133728, + "grad_norm": 0.416015625, + "learning_rate": 2.39389699200963e-06, + "loss": 1.3238, + "step": 1587 + }, + { + "epoch": 1.5439201525540633, + "grad_norm": 0.404296875, + "learning_rate": 2.391341885182822e-06, + "loss": 1.3219, + "step": 1588 + }, + { + "epoch": 1.544893901894754, + "grad_norm": 0.40625, + "learning_rate": 2.3887868920669514e-06, + "loss": 1.3393, + "step": 1589 + }, + { + "epoch": 1.5458676512354446, + "grad_norm": 0.404296875, + "learning_rate": 2.3862320153358243e-06, + "loss": 1.3254, + "step": 1590 + }, + { + "epoch": 1.546841400576135, + "grad_norm": 0.419921875, + "learning_rate": 2.3836772576631236e-06, + "loss": 1.3146, + "step": 1591 + }, + { + "epoch": 1.5478151499168256, + "grad_norm": 0.41796875, + "learning_rate": 2.3811226217224087e-06, + "loss": 1.3222, + "step": 1592 + }, + { + "epoch": 1.548788899257516, + "grad_norm": 0.416015625, + "learning_rate": 2.378568110187112e-06, + "loss": 1.3232, + "step": 1593 + }, + { + "epoch": 1.5497626485982066, + "grad_norm": 0.41796875, + "learning_rate": 2.3760137257305336e-06, + "loss": 1.3076, + "step": 1594 + }, + { + "epoch": 1.5507363979388973, + "grad_norm": 0.404296875, + "learning_rate": 2.373459471025843e-06, + "loss": 1.3447, + "step": 1595 + }, + { + "epoch": 1.5517101472795878, + "grad_norm": 0.412109375, + "learning_rate": 2.3709053487460738e-06, + "loss": 1.3181, + "step": 1596 + }, + { + "epoch": 1.5526838966202783, + "grad_norm": 0.41796875, + "learning_rate": 2.368351361564118e-06, + "loss": 1.3174, + "step": 1597 + }, + { + "epoch": 1.553657645960969, + "grad_norm": 0.41796875, + "learning_rate": 2.3657975121527295e-06, + "loss": 1.3236, + "step": 1598 + }, + { + "epoch": 1.5546313953016595, + "grad_norm": 0.41015625, + "learning_rate": 2.3632438031845167e-06, + "loss": 1.3348, + "step": 1599 + }, + { + "epoch": 1.55560514464235, + "grad_norm": 0.408203125, + "learning_rate": 2.3606902373319414e-06, + "loss": 1.3159, + "step": 1600 + }, + { + "epoch": 1.5565788939830405, + "grad_norm": 0.40625, + "learning_rate": 2.358136817267315e-06, + "loss": 1.2833, + "step": 1601 + }, + { + "epoch": 1.557552643323731, + "grad_norm": 0.412109375, + "learning_rate": 2.3555835456627976e-06, + "loss": 1.3318, + "step": 1602 + }, + { + "epoch": 1.5585263926644215, + "grad_norm": 0.408203125, + "learning_rate": 2.3530304251903926e-06, + "loss": 1.2971, + "step": 1603 + }, + { + "epoch": 1.5595001420051122, + "grad_norm": 0.408203125, + "learning_rate": 2.350477458521946e-06, + "loss": 1.3297, + "step": 1604 + }, + { + "epoch": 1.5604738913458027, + "grad_norm": 0.40625, + "learning_rate": 2.347924648329142e-06, + "loss": 1.3151, + "step": 1605 + }, + { + "epoch": 1.5614476406864934, + "grad_norm": 0.421875, + "learning_rate": 2.345371997283502e-06, + "loss": 1.3315, + "step": 1606 + }, + { + "epoch": 1.562421390027184, + "grad_norm": 0.40234375, + "learning_rate": 2.342819508056382e-06, + "loss": 1.3173, + "step": 1607 + }, + { + "epoch": 1.5633951393678744, + "grad_norm": 0.416015625, + "learning_rate": 2.3402671833189646e-06, + "loss": 1.3461, + "step": 1608 + }, + { + "epoch": 1.564368888708565, + "grad_norm": 0.3984375, + "learning_rate": 2.3377150257422637e-06, + "loss": 1.312, + "step": 1609 + }, + { + "epoch": 1.5653426380492554, + "grad_norm": 0.40625, + "learning_rate": 2.3351630379971176e-06, + "loss": 1.3165, + "step": 1610 + }, + { + "epoch": 1.566316387389946, + "grad_norm": 0.400390625, + "learning_rate": 2.332611222754187e-06, + "loss": 1.3003, + "step": 1611 + }, + { + "epoch": 1.5672901367306364, + "grad_norm": 0.392578125, + "learning_rate": 2.330059582683951e-06, + "loss": 1.315, + "step": 1612 + }, + { + "epoch": 1.5682638860713272, + "grad_norm": 0.416015625, + "learning_rate": 2.3275081204567066e-06, + "loss": 1.3279, + "step": 1613 + }, + { + "epoch": 1.5692376354120177, + "grad_norm": 0.4140625, + "learning_rate": 2.3249568387425642e-06, + "loss": 1.306, + "step": 1614 + }, + { + "epoch": 1.5702113847527084, + "grad_norm": 0.40234375, + "learning_rate": 2.322405740211445e-06, + "loss": 1.3177, + "step": 1615 + }, + { + "epoch": 1.5711851340933989, + "grad_norm": 0.400390625, + "learning_rate": 2.319854827533079e-06, + "loss": 1.339, + "step": 1616 + }, + { + "epoch": 1.5721588834340894, + "grad_norm": 0.408203125, + "learning_rate": 2.3173041033770016e-06, + "loss": 1.3263, + "step": 1617 + }, + { + "epoch": 1.5731326327747799, + "grad_norm": 0.40234375, + "learning_rate": 2.314753570412552e-06, + "loss": 1.3069, + "step": 1618 + }, + { + "epoch": 1.5741063821154704, + "grad_norm": 0.396484375, + "learning_rate": 2.312203231308865e-06, + "loss": 1.3169, + "step": 1619 + }, + { + "epoch": 1.5750801314561609, + "grad_norm": 0.40625, + "learning_rate": 2.309653088734878e-06, + "loss": 1.2937, + "step": 1620 + }, + { + "epoch": 1.5760538807968514, + "grad_norm": 0.39453125, + "learning_rate": 2.30710314535932e-06, + "loss": 1.3169, + "step": 1621 + }, + { + "epoch": 1.577027630137542, + "grad_norm": 0.40234375, + "learning_rate": 2.304553403850711e-06, + "loss": 1.2974, + "step": 1622 + }, + { + "epoch": 1.5780013794782326, + "grad_norm": 0.41015625, + "learning_rate": 2.302003866877362e-06, + "loss": 1.3381, + "step": 1623 + }, + { + "epoch": 1.5789751288189233, + "grad_norm": 0.4140625, + "learning_rate": 2.2994545371073677e-06, + "loss": 1.3296, + "step": 1624 + }, + { + "epoch": 1.5799488781596138, + "grad_norm": 0.396484375, + "learning_rate": 2.2969054172086073e-06, + "loss": 1.3238, + "step": 1625 + }, + { + "epoch": 1.5809226275003043, + "grad_norm": 0.412109375, + "learning_rate": 2.29435650984874e-06, + "loss": 1.318, + "step": 1626 + }, + { + "epoch": 1.5818963768409948, + "grad_norm": 0.435546875, + "learning_rate": 2.291807817695202e-06, + "loss": 1.3192, + "step": 1627 + }, + { + "epoch": 1.5828701261816853, + "grad_norm": 0.404296875, + "learning_rate": 2.289259343415206e-06, + "loss": 1.3055, + "step": 1628 + }, + { + "epoch": 1.5838438755223758, + "grad_norm": 0.3984375, + "learning_rate": 2.286711089675735e-06, + "loss": 1.3111, + "step": 1629 + }, + { + "epoch": 1.5848176248630665, + "grad_norm": 0.421875, + "learning_rate": 2.2841630591435403e-06, + "loss": 1.3275, + "step": 1630 + }, + { + "epoch": 1.585791374203757, + "grad_norm": 0.40625, + "learning_rate": 2.2816152544851423e-06, + "loss": 1.3026, + "step": 1631 + }, + { + "epoch": 1.5867651235444478, + "grad_norm": 0.40234375, + "learning_rate": 2.2790676783668235e-06, + "loss": 1.3393, + "step": 1632 + }, + { + "epoch": 1.5877388728851383, + "grad_norm": 0.40234375, + "learning_rate": 2.2765203334546272e-06, + "loss": 1.3204, + "step": 1633 + }, + { + "epoch": 1.5887126222258288, + "grad_norm": 0.41796875, + "learning_rate": 2.2739732224143556e-06, + "loss": 1.3085, + "step": 1634 + }, + { + "epoch": 1.5896863715665193, + "grad_norm": 0.419921875, + "learning_rate": 2.2714263479115658e-06, + "loss": 1.3125, + "step": 1635 + }, + { + "epoch": 1.5906601209072098, + "grad_norm": 0.3984375, + "learning_rate": 2.268879712611566e-06, + "loss": 1.3001, + "step": 1636 + }, + { + "epoch": 1.5916338702479003, + "grad_norm": 0.40625, + "learning_rate": 2.2663333191794165e-06, + "loss": 1.3272, + "step": 1637 + }, + { + "epoch": 1.5926076195885908, + "grad_norm": 0.40234375, + "learning_rate": 2.263787170279922e-06, + "loss": 1.3116, + "step": 1638 + }, + { + "epoch": 1.5935813689292815, + "grad_norm": 0.404296875, + "learning_rate": 2.2612412685776335e-06, + "loss": 1.308, + "step": 1639 + }, + { + "epoch": 1.594555118269972, + "grad_norm": 0.404296875, + "learning_rate": 2.258695616736842e-06, + "loss": 1.3227, + "step": 1640 + }, + { + "epoch": 1.5955288676106627, + "grad_norm": 0.421875, + "learning_rate": 2.256150217421576e-06, + "loss": 1.3295, + "step": 1641 + }, + { + "epoch": 1.5965026169513532, + "grad_norm": 0.41015625, + "learning_rate": 2.2536050732956035e-06, + "loss": 1.3188, + "step": 1642 + }, + { + "epoch": 1.5974763662920437, + "grad_norm": 0.40234375, + "learning_rate": 2.2510601870224206e-06, + "loss": 1.3081, + "step": 1643 + }, + { + "epoch": 1.5984501156327342, + "grad_norm": 0.40234375, + "learning_rate": 2.2485155612652574e-06, + "loss": 1.3342, + "step": 1644 + }, + { + "epoch": 1.5994238649734247, + "grad_norm": 0.421875, + "learning_rate": 2.2459711986870704e-06, + "loss": 1.3203, + "step": 1645 + }, + { + "epoch": 1.6003976143141152, + "grad_norm": 0.404296875, + "learning_rate": 2.243427101950538e-06, + "loss": 1.3271, + "step": 1646 + }, + { + "epoch": 1.6013713636548057, + "grad_norm": 0.404296875, + "learning_rate": 2.2408832737180648e-06, + "loss": 1.3244, + "step": 1647 + }, + { + "epoch": 1.6023451129954964, + "grad_norm": 0.42578125, + "learning_rate": 2.238339716651771e-06, + "loss": 1.3197, + "step": 1648 + }, + { + "epoch": 1.603318862336187, + "grad_norm": 0.39453125, + "learning_rate": 2.2357964334134947e-06, + "loss": 1.3157, + "step": 1649 + }, + { + "epoch": 1.6042926116768776, + "grad_norm": 0.412109375, + "learning_rate": 2.2332534266647867e-06, + "loss": 1.3465, + "step": 1650 + }, + { + "epoch": 1.6052663610175681, + "grad_norm": 0.40625, + "learning_rate": 2.2307106990669093e-06, + "loss": 1.33, + "step": 1651 + }, + { + "epoch": 1.6062401103582586, + "grad_norm": 0.408203125, + "learning_rate": 2.2281682532808317e-06, + "loss": 1.3204, + "step": 1652 + }, + { + "epoch": 1.6072138596989491, + "grad_norm": 0.41796875, + "learning_rate": 2.225626091967228e-06, + "loss": 1.3131, + "step": 1653 + }, + { + "epoch": 1.6081876090396396, + "grad_norm": 0.4296875, + "learning_rate": 2.2230842177864766e-06, + "loss": 1.3227, + "step": 1654 + }, + { + "epoch": 1.6091613583803301, + "grad_norm": 0.400390625, + "learning_rate": 2.220542633398652e-06, + "loss": 1.3009, + "step": 1655 + }, + { + "epoch": 1.6101351077210209, + "grad_norm": 0.41015625, + "learning_rate": 2.218001341463529e-06, + "loss": 1.3153, + "step": 1656 + }, + { + "epoch": 1.6111088570617114, + "grad_norm": 0.4140625, + "learning_rate": 2.215460344640574e-06, + "loss": 1.3216, + "step": 1657 + }, + { + "epoch": 1.6120826064024019, + "grad_norm": 0.408203125, + "learning_rate": 2.212919645588946e-06, + "loss": 1.3238, + "step": 1658 + }, + { + "epoch": 1.6130563557430926, + "grad_norm": 0.41796875, + "learning_rate": 2.210379246967491e-06, + "loss": 1.3253, + "step": 1659 + }, + { + "epoch": 1.614030105083783, + "grad_norm": 0.416015625, + "learning_rate": 2.207839151434742e-06, + "loss": 1.3132, + "step": 1660 + }, + { + "epoch": 1.6150038544244736, + "grad_norm": 0.40234375, + "learning_rate": 2.205299361648913e-06, + "loss": 1.3274, + "step": 1661 + }, + { + "epoch": 1.615977603765164, + "grad_norm": 0.408203125, + "learning_rate": 2.2027598802679013e-06, + "loss": 1.3096, + "step": 1662 + }, + { + "epoch": 1.6169513531058546, + "grad_norm": 0.41015625, + "learning_rate": 2.2002207099492776e-06, + "loss": 1.319, + "step": 1663 + }, + { + "epoch": 1.617925102446545, + "grad_norm": 0.404296875, + "learning_rate": 2.1976818533502886e-06, + "loss": 1.3269, + "step": 1664 + }, + { + "epoch": 1.6188988517872358, + "grad_norm": 0.41796875, + "learning_rate": 2.1951433131278535e-06, + "loss": 1.3117, + "step": 1665 + }, + { + "epoch": 1.6198726011279263, + "grad_norm": 0.396484375, + "learning_rate": 2.1926050919385594e-06, + "loss": 1.3188, + "step": 1666 + }, + { + "epoch": 1.620846350468617, + "grad_norm": 0.408203125, + "learning_rate": 2.1900671924386606e-06, + "loss": 1.3052, + "step": 1667 + }, + { + "epoch": 1.6218200998093075, + "grad_norm": 0.4140625, + "learning_rate": 2.1875296172840737e-06, + "loss": 1.3015, + "step": 1668 + }, + { + "epoch": 1.622793849149998, + "grad_norm": 0.416015625, + "learning_rate": 2.1849923691303757e-06, + "loss": 1.3102, + "step": 1669 + }, + { + "epoch": 1.6237675984906885, + "grad_norm": 0.408203125, + "learning_rate": 2.1824554506328033e-06, + "loss": 1.3063, + "step": 1670 + }, + { + "epoch": 1.624741347831379, + "grad_norm": 0.400390625, + "learning_rate": 2.1799188644462457e-06, + "loss": 1.3198, + "step": 1671 + }, + { + "epoch": 1.6257150971720695, + "grad_norm": 0.421875, + "learning_rate": 2.1773826132252456e-06, + "loss": 1.3444, + "step": 1672 + }, + { + "epoch": 1.62668884651276, + "grad_norm": 0.40234375, + "learning_rate": 2.174846699623997e-06, + "loss": 1.315, + "step": 1673 + }, + { + "epoch": 1.6276625958534507, + "grad_norm": 0.419921875, + "learning_rate": 2.172311126296335e-06, + "loss": 1.3202, + "step": 1674 + }, + { + "epoch": 1.6286363451941412, + "grad_norm": 0.4140625, + "learning_rate": 2.169775895895745e-06, + "loss": 1.3349, + "step": 1675 + }, + { + "epoch": 1.629610094534832, + "grad_norm": 0.4140625, + "learning_rate": 2.1672410110753495e-06, + "loss": 1.3099, + "step": 1676 + }, + { + "epoch": 1.6305838438755225, + "grad_norm": 0.41796875, + "learning_rate": 2.164706474487911e-06, + "loss": 1.3121, + "step": 1677 + }, + { + "epoch": 1.631557593216213, + "grad_norm": 0.40234375, + "learning_rate": 2.1621722887858273e-06, + "loss": 1.3133, + "step": 1678 + }, + { + "epoch": 1.6325313425569035, + "grad_norm": 0.4140625, + "learning_rate": 2.1596384566211286e-06, + "loss": 1.3271, + "step": 1679 + }, + { + "epoch": 1.633505091897594, + "grad_norm": 0.41796875, + "learning_rate": 2.157104980645476e-06, + "loss": 1.3363, + "step": 1680 + }, + { + "epoch": 1.6344788412382845, + "grad_norm": 0.40234375, + "learning_rate": 2.1545718635101563e-06, + "loss": 1.3166, + "step": 1681 + }, + { + "epoch": 1.635452590578975, + "grad_norm": 0.41015625, + "learning_rate": 2.1520391078660823e-06, + "loss": 1.2876, + "step": 1682 + }, + { + "epoch": 1.6364263399196657, + "grad_norm": 0.41015625, + "learning_rate": 2.149506716363788e-06, + "loss": 1.3082, + "step": 1683 + }, + { + "epoch": 1.6374000892603562, + "grad_norm": 0.41796875, + "learning_rate": 2.146974691653427e-06, + "loss": 1.318, + "step": 1684 + }, + { + "epoch": 1.638373838601047, + "grad_norm": 0.408203125, + "learning_rate": 2.144443036384767e-06, + "loss": 1.317, + "step": 1685 + }, + { + "epoch": 1.6393475879417374, + "grad_norm": 0.421875, + "learning_rate": 2.1419117532071906e-06, + "loss": 1.3125, + "step": 1686 + }, + { + "epoch": 1.640321337282428, + "grad_norm": 0.416015625, + "learning_rate": 2.1393808447696906e-06, + "loss": 1.3225, + "step": 1687 + }, + { + "epoch": 1.6412950866231184, + "grad_norm": 0.40625, + "learning_rate": 2.136850313720868e-06, + "loss": 1.3023, + "step": 1688 + }, + { + "epoch": 1.642268835963809, + "grad_norm": 0.40625, + "learning_rate": 2.134320162708929e-06, + "loss": 1.3211, + "step": 1689 + }, + { + "epoch": 1.6432425853044994, + "grad_norm": 0.42578125, + "learning_rate": 2.131790394381682e-06, + "loss": 1.3269, + "step": 1690 + }, + { + "epoch": 1.6442163346451901, + "grad_norm": 0.419921875, + "learning_rate": 2.129261011386534e-06, + "loss": 1.3202, + "step": 1691 + }, + { + "epoch": 1.6451900839858806, + "grad_norm": 0.412109375, + "learning_rate": 2.1267320163704897e-06, + "loss": 1.3397, + "step": 1692 + }, + { + "epoch": 1.6461638333265713, + "grad_norm": 0.416015625, + "learning_rate": 2.1242034119801477e-06, + "loss": 1.3212, + "step": 1693 + }, + { + "epoch": 1.6471375826672618, + "grad_norm": 0.41015625, + "learning_rate": 2.1216752008616974e-06, + "loss": 1.3154, + "step": 1694 + }, + { + "epoch": 1.6481113320079523, + "grad_norm": 0.408203125, + "learning_rate": 2.1191473856609182e-06, + "loss": 1.2978, + "step": 1695 + }, + { + "epoch": 1.6490850813486428, + "grad_norm": 0.40234375, + "learning_rate": 2.1166199690231715e-06, + "loss": 1.3023, + "step": 1696 + }, + { + "epoch": 1.6500588306893333, + "grad_norm": 0.41015625, + "learning_rate": 2.114092953593405e-06, + "loss": 1.3054, + "step": 1697 + }, + { + "epoch": 1.6510325800300238, + "grad_norm": 0.40234375, + "learning_rate": 2.111566342016145e-06, + "loss": 1.3176, + "step": 1698 + }, + { + "epoch": 1.6520063293707143, + "grad_norm": 0.3984375, + "learning_rate": 2.1090401369354966e-06, + "loss": 1.3254, + "step": 1699 + }, + { + "epoch": 1.652980078711405, + "grad_norm": 0.41796875, + "learning_rate": 2.1065143409951384e-06, + "loss": 1.3353, + "step": 1700 + }, + { + "epoch": 1.6539538280520956, + "grad_norm": 0.4140625, + "learning_rate": 2.1039889568383202e-06, + "loss": 1.3259, + "step": 1701 + }, + { + "epoch": 1.6549275773927863, + "grad_norm": 0.416015625, + "learning_rate": 2.1014639871078625e-06, + "loss": 1.3283, + "step": 1702 + }, + { + "epoch": 1.6559013267334768, + "grad_norm": 0.419921875, + "learning_rate": 2.09893943444615e-06, + "loss": 1.3329, + "step": 1703 + }, + { + "epoch": 1.6568750760741673, + "grad_norm": 0.400390625, + "learning_rate": 2.096415301495134e-06, + "loss": 1.2812, + "step": 1704 + }, + { + "epoch": 1.6578488254148578, + "grad_norm": 0.400390625, + "learning_rate": 2.0938915908963232e-06, + "loss": 1.3271, + "step": 1705 + }, + { + "epoch": 1.6588225747555483, + "grad_norm": 0.40234375, + "learning_rate": 2.091368305290788e-06, + "loss": 1.3063, + "step": 1706 + }, + { + "epoch": 1.6597963240962388, + "grad_norm": 0.40234375, + "learning_rate": 2.088845447319149e-06, + "loss": 1.3102, + "step": 1707 + }, + { + "epoch": 1.6607700734369293, + "grad_norm": 0.4296875, + "learning_rate": 2.0863230196215833e-06, + "loss": 1.3243, + "step": 1708 + }, + { + "epoch": 1.66174382277762, + "grad_norm": 0.408203125, + "learning_rate": 2.083801024837817e-06, + "loss": 1.3017, + "step": 1709 + }, + { + "epoch": 1.6627175721183105, + "grad_norm": 0.412109375, + "learning_rate": 2.081279465607123e-06, + "loss": 1.3047, + "step": 1710 + }, + { + "epoch": 1.6636913214590012, + "grad_norm": 0.431640625, + "learning_rate": 2.078758344568317e-06, + "loss": 1.2974, + "step": 1711 + }, + { + "epoch": 1.6646650707996917, + "grad_norm": 0.404296875, + "learning_rate": 2.0762376643597586e-06, + "loss": 1.3087, + "step": 1712 + }, + { + "epoch": 1.6656388201403822, + "grad_norm": 0.400390625, + "learning_rate": 2.073717427619344e-06, + "loss": 1.3216, + "step": 1713 + }, + { + "epoch": 1.6666125694810727, + "grad_norm": 0.40625, + "learning_rate": 2.0711976369845065e-06, + "loss": 1.341, + "step": 1714 + }, + { + "epoch": 1.6675863188217632, + "grad_norm": 0.40625, + "learning_rate": 2.0686782950922113e-06, + "loss": 1.3201, + "step": 1715 + }, + { + "epoch": 1.6685600681624537, + "grad_norm": 0.412109375, + "learning_rate": 2.066159404578956e-06, + "loss": 1.3223, + "step": 1716 + }, + { + "epoch": 1.6695338175031444, + "grad_norm": 0.400390625, + "learning_rate": 2.0636409680807646e-06, + "loss": 1.3329, + "step": 1717 + }, + { + "epoch": 1.670507566843835, + "grad_norm": 0.404296875, + "learning_rate": 2.0611229882331844e-06, + "loss": 1.3013, + "step": 1718 + }, + { + "epoch": 1.6714813161845254, + "grad_norm": 0.4140625, + "learning_rate": 2.0586054676712873e-06, + "loss": 1.3448, + "step": 1719 + }, + { + "epoch": 1.6724550655252162, + "grad_norm": 0.396484375, + "learning_rate": 2.0560884090296642e-06, + "loss": 1.3304, + "step": 1720 + }, + { + "epoch": 1.6734288148659067, + "grad_norm": 0.40625, + "learning_rate": 2.0535718149424213e-06, + "loss": 1.3149, + "step": 1721 + }, + { + "epoch": 1.6744025642065972, + "grad_norm": 0.40234375, + "learning_rate": 2.05105568804318e-06, + "loss": 1.3121, + "step": 1722 + }, + { + "epoch": 1.6753763135472877, + "grad_norm": 0.40234375, + "learning_rate": 2.048540030965072e-06, + "loss": 1.3351, + "step": 1723 + }, + { + "epoch": 1.6763500628879782, + "grad_norm": 0.412109375, + "learning_rate": 2.0460248463407377e-06, + "loss": 1.3046, + "step": 1724 + }, + { + "epoch": 1.6773238122286687, + "grad_norm": 0.40234375, + "learning_rate": 2.0435101368023222e-06, + "loss": 1.3182, + "step": 1725 + }, + { + "epoch": 1.6782975615693594, + "grad_norm": 0.396484375, + "learning_rate": 2.0409959049814753e-06, + "loss": 1.334, + "step": 1726 + }, + { + "epoch": 1.6792713109100499, + "grad_norm": 0.416015625, + "learning_rate": 2.038482153509345e-06, + "loss": 1.327, + "step": 1727 + }, + { + "epoch": 1.6802450602507406, + "grad_norm": 0.419921875, + "learning_rate": 2.0359688850165775e-06, + "loss": 1.324, + "step": 1728 + }, + { + "epoch": 1.681218809591431, + "grad_norm": 0.40625, + "learning_rate": 2.033456102133313e-06, + "loss": 1.3314, + "step": 1729 + }, + { + "epoch": 1.6821925589321216, + "grad_norm": 0.400390625, + "learning_rate": 2.030943807489184e-06, + "loss": 1.316, + "step": 1730 + }, + { + "epoch": 1.683166308272812, + "grad_norm": 0.404296875, + "learning_rate": 2.0284320037133124e-06, + "loss": 1.3223, + "step": 1731 + }, + { + "epoch": 1.6841400576135026, + "grad_norm": 0.40625, + "learning_rate": 2.025920693434305e-06, + "loss": 1.3131, + "step": 1732 + }, + { + "epoch": 1.685113806954193, + "grad_norm": 0.40234375, + "learning_rate": 2.0234098792802543e-06, + "loss": 1.3433, + "step": 1733 + }, + { + "epoch": 1.6860875562948836, + "grad_norm": 0.412109375, + "learning_rate": 2.0208995638787317e-06, + "loss": 1.3107, + "step": 1734 + }, + { + "epoch": 1.6870613056355743, + "grad_norm": 0.408203125, + "learning_rate": 2.0183897498567874e-06, + "loss": 1.3225, + "step": 1735 + }, + { + "epoch": 1.6880350549762648, + "grad_norm": 0.416015625, + "learning_rate": 2.0158804398409478e-06, + "loss": 1.3166, + "step": 1736 + }, + { + "epoch": 1.6890088043169555, + "grad_norm": 0.404296875, + "learning_rate": 2.0133716364572094e-06, + "loss": 1.3202, + "step": 1737 + }, + { + "epoch": 1.689982553657646, + "grad_norm": 0.408203125, + "learning_rate": 2.0108633423310407e-06, + "loss": 1.3136, + "step": 1738 + }, + { + "epoch": 1.6909563029983365, + "grad_norm": 0.40625, + "learning_rate": 2.008355560087377e-06, + "loss": 1.3043, + "step": 1739 + }, + { + "epoch": 1.691930052339027, + "grad_norm": 0.41015625, + "learning_rate": 2.0058482923506168e-06, + "loss": 1.3084, + "step": 1740 + }, + { + "epoch": 1.6929038016797175, + "grad_norm": 0.400390625, + "learning_rate": 2.0033415417446213e-06, + "loss": 1.2842, + "step": 1741 + }, + { + "epoch": 1.693877551020408, + "grad_norm": 0.40625, + "learning_rate": 2.0008353108927096e-06, + "loss": 1.3056, + "step": 1742 + }, + { + "epoch": 1.6948513003610988, + "grad_norm": 0.41015625, + "learning_rate": 1.998329602417658e-06, + "loss": 1.3268, + "step": 1743 + }, + { + "epoch": 1.6958250497017893, + "grad_norm": 0.41015625, + "learning_rate": 1.9958244189416955e-06, + "loss": 1.3373, + "step": 1744 + }, + { + "epoch": 1.6967987990424798, + "grad_norm": 0.408203125, + "learning_rate": 1.9933197630865014e-06, + "loss": 1.3278, + "step": 1745 + }, + { + "epoch": 1.6977725483831705, + "grad_norm": 0.3984375, + "learning_rate": 1.990815637473203e-06, + "loss": 1.3251, + "step": 1746 + }, + { + "epoch": 1.698746297723861, + "grad_norm": 0.400390625, + "learning_rate": 1.988312044722373e-06, + "loss": 1.3134, + "step": 1747 + }, + { + "epoch": 1.6997200470645515, + "grad_norm": 0.4140625, + "learning_rate": 1.9858089874540264e-06, + "loss": 1.3539, + "step": 1748 + }, + { + "epoch": 1.700693796405242, + "grad_norm": 0.408203125, + "learning_rate": 1.9833064682876175e-06, + "loss": 1.3348, + "step": 1749 + }, + { + "epoch": 1.7016675457459325, + "grad_norm": 0.416015625, + "learning_rate": 1.9808044898420387e-06, + "loss": 1.3189, + "step": 1750 + }, + { + "epoch": 1.702641295086623, + "grad_norm": 0.408203125, + "learning_rate": 1.9783030547356134e-06, + "loss": 1.3063, + "step": 1751 + }, + { + "epoch": 1.7036150444273137, + "grad_norm": 0.4140625, + "learning_rate": 1.9758021655861005e-06, + "loss": 1.3115, + "step": 1752 + }, + { + "epoch": 1.7045887937680042, + "grad_norm": 0.4140625, + "learning_rate": 1.973301825010685e-06, + "loss": 1.3155, + "step": 1753 + }, + { + "epoch": 1.705562543108695, + "grad_norm": 0.404296875, + "learning_rate": 1.970802035625978e-06, + "loss": 1.301, + "step": 1754 + }, + { + "epoch": 1.7065362924493854, + "grad_norm": 0.392578125, + "learning_rate": 1.9683028000480135e-06, + "loss": 1.3158, + "step": 1755 + }, + { + "epoch": 1.707510041790076, + "grad_norm": 0.40234375, + "learning_rate": 1.965804120892248e-06, + "loss": 1.3212, + "step": 1756 + }, + { + "epoch": 1.7084837911307664, + "grad_norm": 0.408203125, + "learning_rate": 1.963306000773554e-06, + "loss": 1.3197, + "step": 1757 + }, + { + "epoch": 1.709457540471457, + "grad_norm": 0.40234375, + "learning_rate": 1.960808442306219e-06, + "loss": 1.336, + "step": 1758 + }, + { + "epoch": 1.7104312898121474, + "grad_norm": 0.400390625, + "learning_rate": 1.9583114481039428e-06, + "loss": 1.3246, + "step": 1759 + }, + { + "epoch": 1.711405039152838, + "grad_norm": 0.41015625, + "learning_rate": 1.955815020779835e-06, + "loss": 1.3058, + "step": 1760 + }, + { + "epoch": 1.7123787884935286, + "grad_norm": 0.41796875, + "learning_rate": 1.953319162946413e-06, + "loss": 1.3121, + "step": 1761 + }, + { + "epoch": 1.7133525378342191, + "grad_norm": 0.41015625, + "learning_rate": 1.950823877215596e-06, + "loss": 1.3095, + "step": 1762 + }, + { + "epoch": 1.7143262871749099, + "grad_norm": 0.400390625, + "learning_rate": 1.948329166198705e-06, + "loss": 1.3092, + "step": 1763 + }, + { + "epoch": 1.7153000365156004, + "grad_norm": 0.404296875, + "learning_rate": 1.9458350325064606e-06, + "loss": 1.3149, + "step": 1764 + }, + { + "epoch": 1.7162737858562909, + "grad_norm": 0.41015625, + "learning_rate": 1.943341478748979e-06, + "loss": 1.3227, + "step": 1765 + }, + { + "epoch": 1.7172475351969814, + "grad_norm": 0.443359375, + "learning_rate": 1.9408485075357695e-06, + "loss": 1.2979, + "step": 1766 + }, + { + "epoch": 1.7182212845376719, + "grad_norm": 0.404296875, + "learning_rate": 1.9383561214757317e-06, + "loss": 1.3171, + "step": 1767 + }, + { + "epoch": 1.7191950338783624, + "grad_norm": 0.4140625, + "learning_rate": 1.9358643231771517e-06, + "loss": 1.3133, + "step": 1768 + }, + { + "epoch": 1.7201687832190529, + "grad_norm": 0.40234375, + "learning_rate": 1.933373115247702e-06, + "loss": 1.309, + "step": 1769 + }, + { + "epoch": 1.7211425325597436, + "grad_norm": 0.404296875, + "learning_rate": 1.930882500294437e-06, + "loss": 1.3193, + "step": 1770 + }, + { + "epoch": 1.722116281900434, + "grad_norm": 0.421875, + "learning_rate": 1.92839248092379e-06, + "loss": 1.3358, + "step": 1771 + }, + { + "epoch": 1.7230900312411248, + "grad_norm": 0.40625, + "learning_rate": 1.9259030597415725e-06, + "loss": 1.308, + "step": 1772 + }, + { + "epoch": 1.7240637805818153, + "grad_norm": 0.408203125, + "learning_rate": 1.9234142393529664e-06, + "loss": 1.3055, + "step": 1773 + }, + { + "epoch": 1.7250375299225058, + "grad_norm": 0.390625, + "learning_rate": 1.920926022362529e-06, + "loss": 1.2888, + "step": 1774 + }, + { + "epoch": 1.7260112792631963, + "grad_norm": 0.40234375, + "learning_rate": 1.918438411374184e-06, + "loss": 1.3207, + "step": 1775 + }, + { + "epoch": 1.7269850286038868, + "grad_norm": 0.400390625, + "learning_rate": 1.915951408991221e-06, + "loss": 1.3243, + "step": 1776 + }, + { + "epoch": 1.7279587779445773, + "grad_norm": 0.404296875, + "learning_rate": 1.9134650178162937e-06, + "loss": 1.3249, + "step": 1777 + }, + { + "epoch": 1.728932527285268, + "grad_norm": 0.41015625, + "learning_rate": 1.910979240451415e-06, + "loss": 1.3181, + "step": 1778 + }, + { + "epoch": 1.7299062766259585, + "grad_norm": 0.408203125, + "learning_rate": 1.908494079497956e-06, + "loss": 1.3125, + "step": 1779 + }, + { + "epoch": 1.7308800259666492, + "grad_norm": 0.39453125, + "learning_rate": 1.9060095375566434e-06, + "loss": 1.313, + "step": 1780 + }, + { + "epoch": 1.7318537753073397, + "grad_norm": 0.404296875, + "learning_rate": 1.903525617227555e-06, + "loss": 1.3111, + "step": 1781 + }, + { + "epoch": 1.7328275246480302, + "grad_norm": 0.404296875, + "learning_rate": 1.9010423211101179e-06, + "loss": 1.291, + "step": 1782 + }, + { + "epoch": 1.7338012739887207, + "grad_norm": 0.40625, + "learning_rate": 1.8985596518031069e-06, + "loss": 1.3198, + "step": 1783 + }, + { + "epoch": 1.7347750233294112, + "grad_norm": 0.404296875, + "learning_rate": 1.8960776119046417e-06, + "loss": 1.3269, + "step": 1784 + }, + { + "epoch": 1.7357487726701017, + "grad_norm": 0.40234375, + "learning_rate": 1.8935962040121797e-06, + "loss": 1.3087, + "step": 1785 + }, + { + "epoch": 1.7367225220107922, + "grad_norm": 0.404296875, + "learning_rate": 1.8911154307225204e-06, + "loss": 1.3191, + "step": 1786 + }, + { + "epoch": 1.737696271351483, + "grad_norm": 0.396484375, + "learning_rate": 1.8886352946317984e-06, + "loss": 1.3191, + "step": 1787 + }, + { + "epoch": 1.7386700206921735, + "grad_norm": 0.40234375, + "learning_rate": 1.8861557983354812e-06, + "loss": 1.3295, + "step": 1788 + }, + { + "epoch": 1.7396437700328642, + "grad_norm": 0.40234375, + "learning_rate": 1.8836769444283658e-06, + "loss": 1.3094, + "step": 1789 + }, + { + "epoch": 1.7406175193735547, + "grad_norm": 0.41796875, + "learning_rate": 1.8811987355045789e-06, + "loss": 1.3324, + "step": 1790 + }, + { + "epoch": 1.7415912687142452, + "grad_norm": 0.408203125, + "learning_rate": 1.8787211741575706e-06, + "loss": 1.3338, + "step": 1791 + }, + { + "epoch": 1.7425650180549357, + "grad_norm": 0.4140625, + "learning_rate": 1.8762442629801139e-06, + "loss": 1.3038, + "step": 1792 + }, + { + "epoch": 1.7435387673956262, + "grad_norm": 0.40234375, + "learning_rate": 1.8737680045643013e-06, + "loss": 1.3108, + "step": 1793 + }, + { + "epoch": 1.7445125167363167, + "grad_norm": 0.419921875, + "learning_rate": 1.8712924015015429e-06, + "loss": 1.3124, + "step": 1794 + }, + { + "epoch": 1.7454862660770072, + "grad_norm": 0.400390625, + "learning_rate": 1.8688174563825629e-06, + "loss": 1.343, + "step": 1795 + }, + { + "epoch": 1.746460015417698, + "grad_norm": 0.41015625, + "learning_rate": 1.8663431717973939e-06, + "loss": 1.3321, + "step": 1796 + }, + { + "epoch": 1.7474337647583884, + "grad_norm": 0.396484375, + "learning_rate": 1.8638695503353816e-06, + "loss": 1.3017, + "step": 1797 + }, + { + "epoch": 1.7484075140990791, + "grad_norm": 0.400390625, + "learning_rate": 1.8613965945851753e-06, + "loss": 1.3163, + "step": 1798 + }, + { + "epoch": 1.7493812634397696, + "grad_norm": 0.400390625, + "learning_rate": 1.8589243071347279e-06, + "loss": 1.3126, + "step": 1799 + }, + { + "epoch": 1.7503550127804601, + "grad_norm": 0.408203125, + "learning_rate": 1.8564526905712943e-06, + "loss": 1.3137, + "step": 1800 + }, + { + "epoch": 1.7513287621211506, + "grad_norm": 0.40234375, + "learning_rate": 1.8539817474814257e-06, + "loss": 1.3132, + "step": 1801 + }, + { + "epoch": 1.7523025114618411, + "grad_norm": 0.396484375, + "learning_rate": 1.8515114804509687e-06, + "loss": 1.3407, + "step": 1802 + }, + { + "epoch": 1.7532762608025316, + "grad_norm": 0.408203125, + "learning_rate": 1.8490418920650633e-06, + "loss": 1.3373, + "step": 1803 + }, + { + "epoch": 1.7542500101432223, + "grad_norm": 0.396484375, + "learning_rate": 1.8465729849081382e-06, + "loss": 1.3004, + "step": 1804 + }, + { + "epoch": 1.7552237594839128, + "grad_norm": 0.419921875, + "learning_rate": 1.8441047615639103e-06, + "loss": 1.3301, + "step": 1805 + }, + { + "epoch": 1.7561975088246033, + "grad_norm": 0.4140625, + "learning_rate": 1.8416372246153813e-06, + "loss": 1.2893, + "step": 1806 + }, + { + "epoch": 1.757171258165294, + "grad_norm": 0.4140625, + "learning_rate": 1.8391703766448314e-06, + "loss": 1.3386, + "step": 1807 + }, + { + "epoch": 1.7581450075059846, + "grad_norm": 0.400390625, + "learning_rate": 1.8367042202338228e-06, + "loss": 1.3159, + "step": 1808 + }, + { + "epoch": 1.759118756846675, + "grad_norm": 0.404296875, + "learning_rate": 1.8342387579631938e-06, + "loss": 1.3156, + "step": 1809 + }, + { + "epoch": 1.7600925061873656, + "grad_norm": 0.39453125, + "learning_rate": 1.8317739924130548e-06, + "loss": 1.3203, + "step": 1810 + }, + { + "epoch": 1.761066255528056, + "grad_norm": 0.404296875, + "learning_rate": 1.8293099261627887e-06, + "loss": 1.2946, + "step": 1811 + }, + { + "epoch": 1.7620400048687466, + "grad_norm": 0.392578125, + "learning_rate": 1.8268465617910456e-06, + "loss": 1.2961, + "step": 1812 + }, + { + "epoch": 1.7630137542094373, + "grad_norm": 0.3984375, + "learning_rate": 1.8243839018757412e-06, + "loss": 1.3179, + "step": 1813 + }, + { + "epoch": 1.7639875035501278, + "grad_norm": 0.408203125, + "learning_rate": 1.8219219489940542e-06, + "loss": 1.3049, + "step": 1814 + }, + { + "epoch": 1.7649612528908185, + "grad_norm": 0.40625, + "learning_rate": 1.8194607057224234e-06, + "loss": 1.3293, + "step": 1815 + }, + { + "epoch": 1.765935002231509, + "grad_norm": 0.40625, + "learning_rate": 1.8170001746365445e-06, + "loss": 1.3112, + "step": 1816 + }, + { + "epoch": 1.7669087515721995, + "grad_norm": 0.396484375, + "learning_rate": 1.81454035831137e-06, + "loss": 1.3313, + "step": 1817 + }, + { + "epoch": 1.76788250091289, + "grad_norm": 0.40625, + "learning_rate": 1.8120812593210999e-06, + "loss": 1.3131, + "step": 1818 + }, + { + "epoch": 1.7688562502535805, + "grad_norm": 0.40234375, + "learning_rate": 1.8096228802391875e-06, + "loss": 1.2929, + "step": 1819 + }, + { + "epoch": 1.769829999594271, + "grad_norm": 0.400390625, + "learning_rate": 1.8071652236383316e-06, + "loss": 1.3389, + "step": 1820 + }, + { + "epoch": 1.7708037489349615, + "grad_norm": 0.412109375, + "learning_rate": 1.8047082920904748e-06, + "loss": 1.3255, + "step": 1821 + }, + { + "epoch": 1.7717774982756522, + "grad_norm": 0.458984375, + "learning_rate": 1.802252088166801e-06, + "loss": 1.3184, + "step": 1822 + }, + { + "epoch": 1.7727512476163427, + "grad_norm": 0.4140625, + "learning_rate": 1.7997966144377328e-06, + "loss": 1.3293, + "step": 1823 + }, + { + "epoch": 1.7737249969570335, + "grad_norm": 0.412109375, + "learning_rate": 1.7973418734729278e-06, + "loss": 1.3226, + "step": 1824 + }, + { + "epoch": 1.774698746297724, + "grad_norm": 0.408203125, + "learning_rate": 1.7948878678412779e-06, + "loss": 1.3325, + "step": 1825 + }, + { + "epoch": 1.7756724956384144, + "grad_norm": 0.408203125, + "learning_rate": 1.792434600110905e-06, + "loss": 1.2998, + "step": 1826 + }, + { + "epoch": 1.776646244979105, + "grad_norm": 0.404296875, + "learning_rate": 1.789982072849159e-06, + "loss": 1.3398, + "step": 1827 + }, + { + "epoch": 1.7776199943197954, + "grad_norm": 0.40234375, + "learning_rate": 1.7875302886226143e-06, + "loss": 1.3189, + "step": 1828 + }, + { + "epoch": 1.778593743660486, + "grad_norm": 0.400390625, + "learning_rate": 1.7850792499970673e-06, + "loss": 1.3301, + "step": 1829 + }, + { + "epoch": 1.7795674930011764, + "grad_norm": 0.39453125, + "learning_rate": 1.7826289595375356e-06, + "loss": 1.3093, + "step": 1830 + }, + { + "epoch": 1.7805412423418672, + "grad_norm": 0.3984375, + "learning_rate": 1.7801794198082534e-06, + "loss": 1.3059, + "step": 1831 + }, + { + "epoch": 1.7815149916825577, + "grad_norm": 0.404296875, + "learning_rate": 1.7777306333726689e-06, + "loss": 1.3308, + "step": 1832 + }, + { + "epoch": 1.7824887410232484, + "grad_norm": 0.3984375, + "learning_rate": 1.7752826027934418e-06, + "loss": 1.3162, + "step": 1833 + }, + { + "epoch": 1.783462490363939, + "grad_norm": 0.400390625, + "learning_rate": 1.7728353306324408e-06, + "loss": 1.2821, + "step": 1834 + }, + { + "epoch": 1.7844362397046294, + "grad_norm": 0.408203125, + "learning_rate": 1.7703888194507425e-06, + "loss": 1.3234, + "step": 1835 + }, + { + "epoch": 1.78540998904532, + "grad_norm": 0.400390625, + "learning_rate": 1.7679430718086244e-06, + "loss": 1.3237, + "step": 1836 + }, + { + "epoch": 1.7863837383860104, + "grad_norm": 0.39453125, + "learning_rate": 1.7654980902655666e-06, + "loss": 1.3349, + "step": 1837 + }, + { + "epoch": 1.7873574877267009, + "grad_norm": 0.396484375, + "learning_rate": 1.7630538773802477e-06, + "loss": 1.3151, + "step": 1838 + }, + { + "epoch": 1.7883312370673916, + "grad_norm": 0.400390625, + "learning_rate": 1.7606104357105418e-06, + "loss": 1.2914, + "step": 1839 + }, + { + "epoch": 1.789304986408082, + "grad_norm": 0.40625, + "learning_rate": 1.7581677678135146e-06, + "loss": 1.3065, + "step": 1840 + }, + { + "epoch": 1.7902787357487728, + "grad_norm": 0.40234375, + "learning_rate": 1.7557258762454232e-06, + "loss": 1.3159, + "step": 1841 + }, + { + "epoch": 1.7912524850894633, + "grad_norm": 0.40625, + "learning_rate": 1.7532847635617123e-06, + "loss": 1.3185, + "step": 1842 + }, + { + "epoch": 1.7922262344301538, + "grad_norm": 0.404296875, + "learning_rate": 1.7508444323170115e-06, + "loss": 1.3384, + "step": 1843 + }, + { + "epoch": 1.7931999837708443, + "grad_norm": 0.39453125, + "learning_rate": 1.7484048850651325e-06, + "loss": 1.3023, + "step": 1844 + }, + { + "epoch": 1.7941737331115348, + "grad_norm": 0.404296875, + "learning_rate": 1.7459661243590656e-06, + "loss": 1.3117, + "step": 1845 + }, + { + "epoch": 1.7951474824522253, + "grad_norm": 0.41015625, + "learning_rate": 1.7435281527509796e-06, + "loss": 1.3083, + "step": 1846 + }, + { + "epoch": 1.7961212317929158, + "grad_norm": 0.40234375, + "learning_rate": 1.741090972792216e-06, + "loss": 1.3406, + "step": 1847 + }, + { + "epoch": 1.7970949811336066, + "grad_norm": 0.396484375, + "learning_rate": 1.7386545870332893e-06, + "loss": 1.3224, + "step": 1848 + }, + { + "epoch": 1.798068730474297, + "grad_norm": 0.419921875, + "learning_rate": 1.736218998023882e-06, + "loss": 1.3155, + "step": 1849 + }, + { + "epoch": 1.7990424798149878, + "grad_norm": 0.416015625, + "learning_rate": 1.7337842083128435e-06, + "loss": 1.3394, + "step": 1850 + }, + { + "epoch": 1.8000162291556783, + "grad_norm": 0.4140625, + "learning_rate": 1.7313502204481847e-06, + "loss": 1.2991, + "step": 1851 + }, + { + "epoch": 1.8009899784963688, + "grad_norm": 0.3984375, + "learning_rate": 1.7289170369770797e-06, + "loss": 1.3287, + "step": 1852 + }, + { + "epoch": 1.8019637278370593, + "grad_norm": 0.40234375, + "learning_rate": 1.7264846604458607e-06, + "loss": 1.2975, + "step": 1853 + }, + { + "epoch": 1.8029374771777498, + "grad_norm": 0.404296875, + "learning_rate": 1.7240530934000134e-06, + "loss": 1.2853, + "step": 1854 + }, + { + "epoch": 1.8039112265184403, + "grad_norm": 0.40234375, + "learning_rate": 1.7216223383841774e-06, + "loss": 1.3112, + "step": 1855 + }, + { + "epoch": 1.8048849758591308, + "grad_norm": 0.40625, + "learning_rate": 1.719192397942144e-06, + "loss": 1.3069, + "step": 1856 + }, + { + "epoch": 1.8058587251998215, + "grad_norm": 0.40234375, + "learning_rate": 1.7167632746168503e-06, + "loss": 1.2939, + "step": 1857 + }, + { + "epoch": 1.806832474540512, + "grad_norm": 0.404296875, + "learning_rate": 1.7143349709503786e-06, + "loss": 1.3234, + "step": 1858 + }, + { + "epoch": 1.8078062238812027, + "grad_norm": 0.3984375, + "learning_rate": 1.7119074894839538e-06, + "loss": 1.33, + "step": 1859 + }, + { + "epoch": 1.8087799732218932, + "grad_norm": 0.396484375, + "learning_rate": 1.7094808327579401e-06, + "loss": 1.3097, + "step": 1860 + }, + { + "epoch": 1.8097537225625837, + "grad_norm": 0.412109375, + "learning_rate": 1.7070550033118393e-06, + "loss": 1.3193, + "step": 1861 + }, + { + "epoch": 1.8107274719032742, + "grad_norm": 0.404296875, + "learning_rate": 1.7046300036842864e-06, + "loss": 1.3302, + "step": 1862 + }, + { + "epoch": 1.8117012212439647, + "grad_norm": 0.408203125, + "learning_rate": 1.7022058364130478e-06, + "loss": 1.3024, + "step": 1863 + }, + { + "epoch": 1.8126749705846552, + "grad_norm": 0.400390625, + "learning_rate": 1.6997825040350196e-06, + "loss": 1.299, + "step": 1864 + }, + { + "epoch": 1.813648719925346, + "grad_norm": 0.396484375, + "learning_rate": 1.6973600090862247e-06, + "loss": 1.3096, + "step": 1865 + }, + { + "epoch": 1.8146224692660364, + "grad_norm": 0.416015625, + "learning_rate": 1.6949383541018088e-06, + "loss": 1.3065, + "step": 1866 + }, + { + "epoch": 1.815596218606727, + "grad_norm": 0.404296875, + "learning_rate": 1.6925175416160387e-06, + "loss": 1.3274, + "step": 1867 + }, + { + "epoch": 1.8165699679474177, + "grad_norm": 0.400390625, + "learning_rate": 1.6900975741622994e-06, + "loss": 1.3031, + "step": 1868 + }, + { + "epoch": 1.8175437172881082, + "grad_norm": 0.40234375, + "learning_rate": 1.6876784542730918e-06, + "loss": 1.3244, + "step": 1869 + }, + { + "epoch": 1.8185174666287987, + "grad_norm": 0.39453125, + "learning_rate": 1.6852601844800298e-06, + "loss": 1.3006, + "step": 1870 + }, + { + "epoch": 1.8194912159694892, + "grad_norm": 0.40234375, + "learning_rate": 1.6828427673138378e-06, + "loss": 1.3155, + "step": 1871 + }, + { + "epoch": 1.8204649653101797, + "grad_norm": 0.40234375, + "learning_rate": 1.6804262053043488e-06, + "loss": 1.3076, + "step": 1872 + }, + { + "epoch": 1.8214387146508701, + "grad_norm": 0.400390625, + "learning_rate": 1.6780105009804976e-06, + "loss": 1.3134, + "step": 1873 + }, + { + "epoch": 1.8224124639915609, + "grad_norm": 0.40625, + "learning_rate": 1.6755956568703247e-06, + "loss": 1.3144, + "step": 1874 + }, + { + "epoch": 1.8233862133322514, + "grad_norm": 0.390625, + "learning_rate": 1.6731816755009696e-06, + "loss": 1.3084, + "step": 1875 + }, + { + "epoch": 1.824359962672942, + "grad_norm": 0.3984375, + "learning_rate": 1.6707685593986687e-06, + "loss": 1.2964, + "step": 1876 + }, + { + "epoch": 1.8253337120136326, + "grad_norm": 0.4140625, + "learning_rate": 1.6683563110887523e-06, + "loss": 1.308, + "step": 1877 + }, + { + "epoch": 1.826307461354323, + "grad_norm": 0.396484375, + "learning_rate": 1.665944933095644e-06, + "loss": 1.3091, + "step": 1878 + }, + { + "epoch": 1.8272812106950136, + "grad_norm": 0.39453125, + "learning_rate": 1.6635344279428553e-06, + "loss": 1.3191, + "step": 1879 + }, + { + "epoch": 1.828254960035704, + "grad_norm": 0.4140625, + "learning_rate": 1.6611247981529846e-06, + "loss": 1.3227, + "step": 1880 + }, + { + "epoch": 1.8292287093763946, + "grad_norm": 0.404296875, + "learning_rate": 1.6587160462477149e-06, + "loss": 1.3134, + "step": 1881 + }, + { + "epoch": 1.830202458717085, + "grad_norm": 0.41015625, + "learning_rate": 1.6563081747478093e-06, + "loss": 1.3145, + "step": 1882 + }, + { + "epoch": 1.8311762080577758, + "grad_norm": 0.404296875, + "learning_rate": 1.6539011861731115e-06, + "loss": 1.3177, + "step": 1883 + }, + { + "epoch": 1.8321499573984663, + "grad_norm": 0.400390625, + "learning_rate": 1.651495083042538e-06, + "loss": 1.3174, + "step": 1884 + }, + { + "epoch": 1.833123706739157, + "grad_norm": 0.3984375, + "learning_rate": 1.6490898678740819e-06, + "loss": 1.3085, + "step": 1885 + }, + { + "epoch": 1.8340974560798475, + "grad_norm": 0.400390625, + "learning_rate": 1.646685543184805e-06, + "loss": 1.3278, + "step": 1886 + }, + { + "epoch": 1.835071205420538, + "grad_norm": 0.400390625, + "learning_rate": 1.6442821114908385e-06, + "loss": 1.33, + "step": 1887 + }, + { + "epoch": 1.8360449547612285, + "grad_norm": 0.39453125, + "learning_rate": 1.6418795753073785e-06, + "loss": 1.3125, + "step": 1888 + }, + { + "epoch": 1.837018704101919, + "grad_norm": 0.40234375, + "learning_rate": 1.6394779371486838e-06, + "loss": 1.3102, + "step": 1889 + }, + { + "epoch": 1.8379924534426095, + "grad_norm": 0.3984375, + "learning_rate": 1.6370771995280737e-06, + "loss": 1.3222, + "step": 1890 + }, + { + "epoch": 1.8389662027833003, + "grad_norm": 0.396484375, + "learning_rate": 1.634677364957925e-06, + "loss": 1.3038, + "step": 1891 + }, + { + "epoch": 1.8399399521239908, + "grad_norm": 0.40234375, + "learning_rate": 1.6322784359496697e-06, + "loss": 1.335, + "step": 1892 + }, + { + "epoch": 1.8409137014646813, + "grad_norm": 0.408203125, + "learning_rate": 1.6298804150137914e-06, + "loss": 1.3319, + "step": 1893 + }, + { + "epoch": 1.841887450805372, + "grad_norm": 0.40625, + "learning_rate": 1.6274833046598254e-06, + "loss": 1.3062, + "step": 1894 + }, + { + "epoch": 1.8428612001460625, + "grad_norm": 0.40234375, + "learning_rate": 1.6250871073963498e-06, + "loss": 1.314, + "step": 1895 + }, + { + "epoch": 1.843834949486753, + "grad_norm": 0.416015625, + "learning_rate": 1.6226918257309916e-06, + "loss": 1.3173, + "step": 1896 + }, + { + "epoch": 1.8448086988274435, + "grad_norm": 0.4140625, + "learning_rate": 1.6202974621704176e-06, + "loss": 1.3172, + "step": 1897 + }, + { + "epoch": 1.845782448168134, + "grad_norm": 0.396484375, + "learning_rate": 1.6179040192203343e-06, + "loss": 1.3011, + "step": 1898 + }, + { + "epoch": 1.8467561975088245, + "grad_norm": 0.400390625, + "learning_rate": 1.6155114993854846e-06, + "loss": 1.3096, + "step": 1899 + }, + { + "epoch": 1.8477299468495152, + "grad_norm": 0.3984375, + "learning_rate": 1.613119905169645e-06, + "loss": 1.3185, + "step": 1900 + }, + { + "epoch": 1.8487036961902057, + "grad_norm": 0.408203125, + "learning_rate": 1.6107292390756241e-06, + "loss": 1.2993, + "step": 1901 + }, + { + "epoch": 1.8496774455308964, + "grad_norm": 0.390625, + "learning_rate": 1.6083395036052586e-06, + "loss": 1.3028, + "step": 1902 + }, + { + "epoch": 1.850651194871587, + "grad_norm": 0.400390625, + "learning_rate": 1.6059507012594116e-06, + "loss": 1.29, + "step": 1903 + }, + { + "epoch": 1.8516249442122774, + "grad_norm": 0.40625, + "learning_rate": 1.6035628345379695e-06, + "loss": 1.3278, + "step": 1904 + }, + { + "epoch": 1.852598693552968, + "grad_norm": 0.408203125, + "learning_rate": 1.601175905939841e-06, + "loss": 1.3225, + "step": 1905 + }, + { + "epoch": 1.8535724428936584, + "grad_norm": 0.400390625, + "learning_rate": 1.5987899179629492e-06, + "loss": 1.3327, + "step": 1906 + }, + { + "epoch": 1.854546192234349, + "grad_norm": 0.40234375, + "learning_rate": 1.596404873104237e-06, + "loss": 1.3084, + "step": 1907 + }, + { + "epoch": 1.8555199415750394, + "grad_norm": 0.43359375, + "learning_rate": 1.594020773859658e-06, + "loss": 1.3321, + "step": 1908 + }, + { + "epoch": 1.8564936909157301, + "grad_norm": 0.404296875, + "learning_rate": 1.5916376227241776e-06, + "loss": 1.3147, + "step": 1909 + }, + { + "epoch": 1.8574674402564206, + "grad_norm": 0.412109375, + "learning_rate": 1.5892554221917677e-06, + "loss": 1.3217, + "step": 1910 + }, + { + "epoch": 1.8584411895971114, + "grad_norm": 0.396484375, + "learning_rate": 1.5868741747554061e-06, + "loss": 1.3225, + "step": 1911 + }, + { + "epoch": 1.8594149389378019, + "grad_norm": 0.41796875, + "learning_rate": 1.5844938829070733e-06, + "loss": 1.3308, + "step": 1912 + }, + { + "epoch": 1.8603886882784924, + "grad_norm": 0.3984375, + "learning_rate": 1.5821145491377494e-06, + "loss": 1.3021, + "step": 1913 + }, + { + "epoch": 1.8613624376191829, + "grad_norm": 0.404296875, + "learning_rate": 1.579736175937412e-06, + "loss": 1.3113, + "step": 1914 + }, + { + "epoch": 1.8623361869598734, + "grad_norm": 0.400390625, + "learning_rate": 1.5773587657950338e-06, + "loss": 1.3239, + "step": 1915 + }, + { + "epoch": 1.8633099363005639, + "grad_norm": 0.400390625, + "learning_rate": 1.5749823211985798e-06, + "loss": 1.323, + "step": 1916 + }, + { + "epoch": 1.8642836856412544, + "grad_norm": 0.3984375, + "learning_rate": 1.5726068446350024e-06, + "loss": 1.3104, + "step": 1917 + }, + { + "epoch": 1.865257434981945, + "grad_norm": 0.40625, + "learning_rate": 1.5702323385902435e-06, + "loss": 1.3393, + "step": 1918 + }, + { + "epoch": 1.8662311843226356, + "grad_norm": 0.408203125, + "learning_rate": 1.5678588055492289e-06, + "loss": 1.3089, + "step": 1919 + }, + { + "epoch": 1.8672049336633263, + "grad_norm": 0.396484375, + "learning_rate": 1.5654862479958652e-06, + "loss": 1.306, + "step": 1920 + }, + { + "epoch": 1.8681786830040168, + "grad_norm": 0.396484375, + "learning_rate": 1.5631146684130389e-06, + "loss": 1.3229, + "step": 1921 + }, + { + "epoch": 1.8691524323447073, + "grad_norm": 0.4140625, + "learning_rate": 1.5607440692826132e-06, + "loss": 1.3009, + "step": 1922 + }, + { + "epoch": 1.8701261816853978, + "grad_norm": 0.4140625, + "learning_rate": 1.5583744530854243e-06, + "loss": 1.322, + "step": 1923 + }, + { + "epoch": 1.8710999310260883, + "grad_norm": 0.41015625, + "learning_rate": 1.5560058223012805e-06, + "loss": 1.3319, + "step": 1924 + }, + { + "epoch": 1.8720736803667788, + "grad_norm": 0.40234375, + "learning_rate": 1.553638179408959e-06, + "loss": 1.323, + "step": 1925 + }, + { + "epoch": 1.8730474297074695, + "grad_norm": 0.392578125, + "learning_rate": 1.5512715268862033e-06, + "loss": 1.3077, + "step": 1926 + }, + { + "epoch": 1.87402117904816, + "grad_norm": 0.408203125, + "learning_rate": 1.5489058672097195e-06, + "loss": 1.325, + "step": 1927 + }, + { + "epoch": 1.8749949283888507, + "grad_norm": 0.3984375, + "learning_rate": 1.546541202855175e-06, + "loss": 1.3195, + "step": 1928 + }, + { + "epoch": 1.8759686777295412, + "grad_norm": 0.40625, + "learning_rate": 1.5441775362971955e-06, + "loss": 1.3225, + "step": 1929 + }, + { + "epoch": 1.8769424270702317, + "grad_norm": 0.408203125, + "learning_rate": 1.541814870009364e-06, + "loss": 1.3365, + "step": 1930 + }, + { + "epoch": 1.8779161764109222, + "grad_norm": 0.3984375, + "learning_rate": 1.5394532064642148e-06, + "loss": 1.321, + "step": 1931 + }, + { + "epoch": 1.8788899257516127, + "grad_norm": 0.400390625, + "learning_rate": 1.5370925481332338e-06, + "loss": 1.3168, + "step": 1932 + }, + { + "epoch": 1.8798636750923032, + "grad_norm": 0.400390625, + "learning_rate": 1.534732897486855e-06, + "loss": 1.3231, + "step": 1933 + }, + { + "epoch": 1.8808374244329937, + "grad_norm": 0.4140625, + "learning_rate": 1.5323742569944573e-06, + "loss": 1.3295, + "step": 1934 + }, + { + "epoch": 1.8818111737736845, + "grad_norm": 0.404296875, + "learning_rate": 1.530016629124363e-06, + "loss": 1.294, + "step": 1935 + }, + { + "epoch": 1.882784923114375, + "grad_norm": 0.408203125, + "learning_rate": 1.5276600163438338e-06, + "loss": 1.2921, + "step": 1936 + }, + { + "epoch": 1.8837586724550657, + "grad_norm": 0.388671875, + "learning_rate": 1.5253044211190705e-06, + "loss": 1.2998, + "step": 1937 + }, + { + "epoch": 1.8847324217957562, + "grad_norm": 0.46875, + "learning_rate": 1.522949845915208e-06, + "loss": 1.3007, + "step": 1938 + }, + { + "epoch": 1.8857061711364467, + "grad_norm": 0.3984375, + "learning_rate": 1.5205962931963135e-06, + "loss": 1.329, + "step": 1939 + }, + { + "epoch": 1.8866799204771372, + "grad_norm": 0.40234375, + "learning_rate": 1.5182437654253856e-06, + "loss": 1.2925, + "step": 1940 + }, + { + "epoch": 1.8876536698178277, + "grad_norm": 0.4140625, + "learning_rate": 1.515892265064349e-06, + "loss": 1.2956, + "step": 1941 + }, + { + "epoch": 1.8886274191585182, + "grad_norm": 0.400390625, + "learning_rate": 1.5135417945740533e-06, + "loss": 1.3301, + "step": 1942 + }, + { + "epoch": 1.8896011684992087, + "grad_norm": 0.42578125, + "learning_rate": 1.5111923564142716e-06, + "loss": 1.3004, + "step": 1943 + }, + { + "epoch": 1.8905749178398994, + "grad_norm": 0.421875, + "learning_rate": 1.5088439530436943e-06, + "loss": 1.3135, + "step": 1944 + }, + { + "epoch": 1.89154866718059, + "grad_norm": 0.404296875, + "learning_rate": 1.5064965869199316e-06, + "loss": 1.3066, + "step": 1945 + }, + { + "epoch": 1.8925224165212806, + "grad_norm": 0.39453125, + "learning_rate": 1.5041502604995056e-06, + "loss": 1.3019, + "step": 1946 + }, + { + "epoch": 1.8934961658619711, + "grad_norm": 0.392578125, + "learning_rate": 1.5018049762378528e-06, + "loss": 1.3055, + "step": 1947 + }, + { + "epoch": 1.8944699152026616, + "grad_norm": 0.3984375, + "learning_rate": 1.4994607365893173e-06, + "loss": 1.3339, + "step": 1948 + }, + { + "epoch": 1.8954436645433521, + "grad_norm": 0.3984375, + "learning_rate": 1.4971175440071516e-06, + "loss": 1.3305, + "step": 1949 + }, + { + "epoch": 1.8964174138840426, + "grad_norm": 0.40234375, + "learning_rate": 1.49477540094351e-06, + "loss": 1.3034, + "step": 1950 + }, + { + "epoch": 1.8973911632247331, + "grad_norm": 0.40625, + "learning_rate": 1.492434309849451e-06, + "loss": 1.3158, + "step": 1951 + }, + { + "epoch": 1.8983649125654238, + "grad_norm": 0.3984375, + "learning_rate": 1.4900942731749314e-06, + "loss": 1.3214, + "step": 1952 + }, + { + "epoch": 1.8993386619061143, + "grad_norm": 0.4140625, + "learning_rate": 1.4877552933688033e-06, + "loss": 1.3366, + "step": 1953 + }, + { + "epoch": 1.9003124112468048, + "grad_norm": 0.423828125, + "learning_rate": 1.4854173728788144e-06, + "loss": 1.3046, + "step": 1954 + }, + { + "epoch": 1.9012861605874956, + "grad_norm": 0.40625, + "learning_rate": 1.483080514151603e-06, + "loss": 1.3136, + "step": 1955 + }, + { + "epoch": 1.902259909928186, + "grad_norm": 0.40234375, + "learning_rate": 1.4807447196326967e-06, + "loss": 1.3015, + "step": 1956 + }, + { + "epoch": 1.9032336592688766, + "grad_norm": 0.40625, + "learning_rate": 1.4784099917665094e-06, + "loss": 1.3246, + "step": 1957 + }, + { + "epoch": 1.904207408609567, + "grad_norm": 0.388671875, + "learning_rate": 1.4760763329963378e-06, + "loss": 1.3252, + "step": 1958 + }, + { + "epoch": 1.9051811579502576, + "grad_norm": 0.408203125, + "learning_rate": 1.4737437457643616e-06, + "loss": 1.3049, + "step": 1959 + }, + { + "epoch": 1.906154907290948, + "grad_norm": 0.39453125, + "learning_rate": 1.471412232511638e-06, + "loss": 1.3029, + "step": 1960 + }, + { + "epoch": 1.9071286566316388, + "grad_norm": 0.39453125, + "learning_rate": 1.4690817956781e-06, + "loss": 1.3111, + "step": 1961 + }, + { + "epoch": 1.9081024059723293, + "grad_norm": 0.41796875, + "learning_rate": 1.4667524377025535e-06, + "loss": 1.3004, + "step": 1962 + }, + { + "epoch": 1.90907615531302, + "grad_norm": 0.408203125, + "learning_rate": 1.4644241610226776e-06, + "loss": 1.3212, + "step": 1963 + }, + { + "epoch": 1.9100499046537105, + "grad_norm": 0.3984375, + "learning_rate": 1.462096968075018e-06, + "loss": 1.3024, + "step": 1964 + }, + { + "epoch": 1.911023653994401, + "grad_norm": 0.408203125, + "learning_rate": 1.459770861294987e-06, + "loss": 1.2928, + "step": 1965 + }, + { + "epoch": 1.9119974033350915, + "grad_norm": 0.400390625, + "learning_rate": 1.45744584311686e-06, + "loss": 1.3056, + "step": 1966 + }, + { + "epoch": 1.912971152675782, + "grad_norm": 0.396484375, + "learning_rate": 1.4551219159737728e-06, + "loss": 1.3234, + "step": 1967 + }, + { + "epoch": 1.9139449020164725, + "grad_norm": 0.408203125, + "learning_rate": 1.4527990822977216e-06, + "loss": 1.3294, + "step": 1968 + }, + { + "epoch": 1.914918651357163, + "grad_norm": 0.3984375, + "learning_rate": 1.4504773445195544e-06, + "loss": 1.315, + "step": 1969 + }, + { + "epoch": 1.9158924006978537, + "grad_norm": 0.40625, + "learning_rate": 1.4481567050689764e-06, + "loss": 1.303, + "step": 1970 + }, + { + "epoch": 1.9168661500385442, + "grad_norm": 0.40234375, + "learning_rate": 1.44583716637454e-06, + "loss": 1.3114, + "step": 1971 + }, + { + "epoch": 1.917839899379235, + "grad_norm": 0.3984375, + "learning_rate": 1.4435187308636486e-06, + "loss": 1.3197, + "step": 1972 + }, + { + "epoch": 1.9188136487199254, + "grad_norm": 0.40625, + "learning_rate": 1.4412014009625476e-06, + "loss": 1.3281, + "step": 1973 + }, + { + "epoch": 1.919787398060616, + "grad_norm": 0.404296875, + "learning_rate": 1.438885179096329e-06, + "loss": 1.3262, + "step": 1974 + }, + { + "epoch": 1.9207611474013064, + "grad_norm": 0.40625, + "learning_rate": 1.4365700676889227e-06, + "loss": 1.3261, + "step": 1975 + }, + { + "epoch": 1.921734896741997, + "grad_norm": 0.39453125, + "learning_rate": 1.4342560691630991e-06, + "loss": 1.3037, + "step": 1976 + }, + { + "epoch": 1.9227086460826874, + "grad_norm": 0.3984375, + "learning_rate": 1.4319431859404603e-06, + "loss": 1.3175, + "step": 1977 + }, + { + "epoch": 1.923682395423378, + "grad_norm": 0.40625, + "learning_rate": 1.4296314204414453e-06, + "loss": 1.3127, + "step": 1978 + }, + { + "epoch": 1.9246561447640687, + "grad_norm": 0.400390625, + "learning_rate": 1.427320775085319e-06, + "loss": 1.3142, + "step": 1979 + }, + { + "epoch": 1.9256298941047592, + "grad_norm": 0.396484375, + "learning_rate": 1.4250112522901794e-06, + "loss": 1.3255, + "step": 1980 + }, + { + "epoch": 1.9266036434454499, + "grad_norm": 0.40234375, + "learning_rate": 1.4227028544729448e-06, + "loss": 1.3185, + "step": 1981 + }, + { + "epoch": 1.9275773927861404, + "grad_norm": 0.400390625, + "learning_rate": 1.4203955840493588e-06, + "loss": 1.3147, + "step": 1982 + }, + { + "epoch": 1.9285511421268309, + "grad_norm": 0.412109375, + "learning_rate": 1.4180894434339836e-06, + "loss": 1.3391, + "step": 1983 + }, + { + "epoch": 1.9295248914675214, + "grad_norm": 0.400390625, + "learning_rate": 1.4157844350402017e-06, + "loss": 1.2939, + "step": 1984 + }, + { + "epoch": 1.9304986408082119, + "grad_norm": 0.3984375, + "learning_rate": 1.4134805612802072e-06, + "loss": 1.3273, + "step": 1985 + }, + { + "epoch": 1.9314723901489024, + "grad_norm": 0.41796875, + "learning_rate": 1.4111778245650107e-06, + "loss": 1.3191, + "step": 1986 + }, + { + "epoch": 1.932446139489593, + "grad_norm": 0.408203125, + "learning_rate": 1.4088762273044287e-06, + "loss": 1.3335, + "step": 1987 + }, + { + "epoch": 1.9334198888302836, + "grad_norm": 0.396484375, + "learning_rate": 1.4065757719070896e-06, + "loss": 1.3201, + "step": 1988 + }, + { + "epoch": 1.9343936381709743, + "grad_norm": 0.4140625, + "learning_rate": 1.4042764607804238e-06, + "loss": 1.3164, + "step": 1989 + }, + { + "epoch": 1.9353673875116648, + "grad_norm": 0.41015625, + "learning_rate": 1.4019782963306636e-06, + "loss": 1.3188, + "step": 1990 + }, + { + "epoch": 1.9363411368523553, + "grad_norm": 0.392578125, + "learning_rate": 1.399681280962845e-06, + "loss": 1.3138, + "step": 1991 + }, + { + "epoch": 1.9373148861930458, + "grad_norm": 0.396484375, + "learning_rate": 1.397385417080797e-06, + "loss": 1.3319, + "step": 1992 + }, + { + "epoch": 1.9382886355337363, + "grad_norm": 0.392578125, + "learning_rate": 1.3950907070871483e-06, + "loss": 1.2966, + "step": 1993 + }, + { + "epoch": 1.9392623848744268, + "grad_norm": 0.392578125, + "learning_rate": 1.3927971533833163e-06, + "loss": 1.3071, + "step": 1994 + }, + { + "epoch": 1.9402361342151173, + "grad_norm": 0.40625, + "learning_rate": 1.3905047583695086e-06, + "loss": 1.3234, + "step": 1995 + }, + { + "epoch": 1.941209883555808, + "grad_norm": 0.4140625, + "learning_rate": 1.388213524444723e-06, + "loss": 1.3245, + "step": 1996 + }, + { + "epoch": 1.9421836328964985, + "grad_norm": 0.40234375, + "learning_rate": 1.3859234540067397e-06, + "loss": 1.2904, + "step": 1997 + }, + { + "epoch": 1.9431573822371893, + "grad_norm": 0.3984375, + "learning_rate": 1.3836345494521215e-06, + "loss": 1.3329, + "step": 1998 + }, + { + "epoch": 1.9441311315778798, + "grad_norm": 0.40625, + "learning_rate": 1.3813468131762126e-06, + "loss": 1.3248, + "step": 1999 + }, + { + "epoch": 1.9451048809185703, + "grad_norm": 0.404296875, + "learning_rate": 1.3790602475731323e-06, + "loss": 1.3367, + "step": 2000 + }, + { + "epoch": 1.9460786302592608, + "grad_norm": 0.40234375, + "learning_rate": 1.3767748550357785e-06, + "loss": 1.3169, + "step": 2001 + }, + { + "epoch": 1.9470523795999513, + "grad_norm": 0.41015625, + "learning_rate": 1.3744906379558165e-06, + "loss": 1.3082, + "step": 2002 + }, + { + "epoch": 1.9480261289406418, + "grad_norm": 0.40625, + "learning_rate": 1.3722075987236865e-06, + "loss": 1.3302, + "step": 2003 + }, + { + "epoch": 1.9489998782813323, + "grad_norm": 0.390625, + "learning_rate": 1.3699257397285926e-06, + "loss": 1.3229, + "step": 2004 + }, + { + "epoch": 1.949973627622023, + "grad_norm": 0.3984375, + "learning_rate": 1.3676450633585037e-06, + "loss": 1.2998, + "step": 2005 + }, + { + "epoch": 1.9509473769627135, + "grad_norm": 0.404296875, + "learning_rate": 1.365365572000154e-06, + "loss": 1.3457, + "step": 2006 + }, + { + "epoch": 1.9519211263034042, + "grad_norm": 0.392578125, + "learning_rate": 1.3630872680390356e-06, + "loss": 1.3216, + "step": 2007 + }, + { + "epoch": 1.9528948756440947, + "grad_norm": 0.392578125, + "learning_rate": 1.3608101538593965e-06, + "loss": 1.3328, + "step": 2008 + }, + { + "epoch": 1.9538686249847852, + "grad_norm": 0.400390625, + "learning_rate": 1.3585342318442434e-06, + "loss": 1.2837, + "step": 2009 + }, + { + "epoch": 1.9548423743254757, + "grad_norm": 0.39453125, + "learning_rate": 1.3562595043753318e-06, + "loss": 1.32, + "step": 2010 + }, + { + "epoch": 1.9558161236661662, + "grad_norm": 0.388671875, + "learning_rate": 1.3539859738331707e-06, + "loss": 1.3208, + "step": 2011 + }, + { + "epoch": 1.9567898730068567, + "grad_norm": 0.3984375, + "learning_rate": 1.3517136425970115e-06, + "loss": 1.3032, + "step": 2012 + }, + { + "epoch": 1.9577636223475474, + "grad_norm": 0.40625, + "learning_rate": 1.3494425130448562e-06, + "loss": 1.3074, + "step": 2013 + }, + { + "epoch": 1.958737371688238, + "grad_norm": 0.400390625, + "learning_rate": 1.347172587553444e-06, + "loss": 1.3137, + "step": 2014 + }, + { + "epoch": 1.9597111210289284, + "grad_norm": 0.400390625, + "learning_rate": 1.3449038684982602e-06, + "loss": 1.3113, + "step": 2015 + }, + { + "epoch": 1.9606848703696191, + "grad_norm": 0.40625, + "learning_rate": 1.3426363582535193e-06, + "loss": 1.3086, + "step": 2016 + }, + { + "epoch": 1.9616586197103096, + "grad_norm": 0.388671875, + "learning_rate": 1.3403700591921787e-06, + "loss": 1.3039, + "step": 2017 + }, + { + "epoch": 1.9626323690510001, + "grad_norm": 0.3984375, + "learning_rate": 1.3381049736859225e-06, + "loss": 1.3114, + "step": 2018 + }, + { + "epoch": 1.9636061183916906, + "grad_norm": 0.408203125, + "learning_rate": 1.33584110410517e-06, + "loss": 1.319, + "step": 2019 + }, + { + "epoch": 1.9645798677323811, + "grad_norm": 0.40625, + "learning_rate": 1.3335784528190627e-06, + "loss": 1.3085, + "step": 2020 + }, + { + "epoch": 1.9655536170730716, + "grad_norm": 0.39453125, + "learning_rate": 1.3313170221954719e-06, + "loss": 1.3145, + "step": 2021 + }, + { + "epoch": 1.9665273664137624, + "grad_norm": 0.408203125, + "learning_rate": 1.3290568146009874e-06, + "loss": 1.3138, + "step": 2022 + }, + { + "epoch": 1.9675011157544529, + "grad_norm": 0.40234375, + "learning_rate": 1.3267978324009235e-06, + "loss": 1.3036, + "step": 2023 + }, + { + "epoch": 1.9684748650951436, + "grad_norm": 0.400390625, + "learning_rate": 1.3245400779593067e-06, + "loss": 1.3135, + "step": 2024 + }, + { + "epoch": 1.969448614435834, + "grad_norm": 0.39453125, + "learning_rate": 1.3222835536388845e-06, + "loss": 1.307, + "step": 2025 + }, + { + "epoch": 1.9704223637765246, + "grad_norm": 0.3984375, + "learning_rate": 1.3200282618011115e-06, + "loss": 1.3144, + "step": 2026 + }, + { + "epoch": 1.971396113117215, + "grad_norm": 0.404296875, + "learning_rate": 1.3177742048061587e-06, + "loss": 1.3344, + "step": 2027 + }, + { + "epoch": 1.9723698624579056, + "grad_norm": 0.40625, + "learning_rate": 1.3155213850128968e-06, + "loss": 1.2997, + "step": 2028 + }, + { + "epoch": 1.973343611798596, + "grad_norm": 0.39453125, + "learning_rate": 1.31326980477891e-06, + "loss": 1.3037, + "step": 2029 + }, + { + "epoch": 1.9743173611392866, + "grad_norm": 0.3984375, + "learning_rate": 1.3110194664604786e-06, + "loss": 1.3135, + "step": 2030 + }, + { + "epoch": 1.9752911104799773, + "grad_norm": 0.39453125, + "learning_rate": 1.3087703724125895e-06, + "loss": 1.3181, + "step": 2031 + }, + { + "epoch": 1.9762648598206678, + "grad_norm": 0.39453125, + "learning_rate": 1.3065225249889208e-06, + "loss": 1.3113, + "step": 2032 + }, + { + "epoch": 1.9772386091613585, + "grad_norm": 0.40625, + "learning_rate": 1.3042759265418523e-06, + "loss": 1.3233, + "step": 2033 + }, + { + "epoch": 1.978212358502049, + "grad_norm": 0.40234375, + "learning_rate": 1.3020305794224516e-06, + "loss": 1.3193, + "step": 2034 + }, + { + "epoch": 1.9791861078427395, + "grad_norm": 0.400390625, + "learning_rate": 1.299786485980481e-06, + "loss": 1.3201, + "step": 2035 + }, + { + "epoch": 1.98015985718343, + "grad_norm": 0.404296875, + "learning_rate": 1.2975436485643865e-06, + "loss": 1.3062, + "step": 2036 + }, + { + "epoch": 1.9811336065241205, + "grad_norm": 0.396484375, + "learning_rate": 1.2953020695213048e-06, + "loss": 1.3092, + "step": 2037 + }, + { + "epoch": 1.982107355864811, + "grad_norm": 0.412109375, + "learning_rate": 1.2930617511970514e-06, + "loss": 1.3274, + "step": 2038 + }, + { + "epoch": 1.9830811052055017, + "grad_norm": 0.40234375, + "learning_rate": 1.2908226959361241e-06, + "loss": 1.293, + "step": 2039 + }, + { + "epoch": 1.9840548545461922, + "grad_norm": 0.392578125, + "learning_rate": 1.2885849060816973e-06, + "loss": 1.304, + "step": 2040 + }, + { + "epoch": 1.9850286038868827, + "grad_norm": 0.400390625, + "learning_rate": 1.2863483839756254e-06, + "loss": 1.3259, + "step": 2041 + }, + { + "epoch": 1.9860023532275735, + "grad_norm": 0.396484375, + "learning_rate": 1.284113131958431e-06, + "loss": 1.3236, + "step": 2042 + }, + { + "epoch": 1.986976102568264, + "grad_norm": 0.412109375, + "learning_rate": 1.2818791523693114e-06, + "loss": 1.3307, + "step": 2043 + }, + { + "epoch": 1.9879498519089545, + "grad_norm": 0.390625, + "learning_rate": 1.2796464475461296e-06, + "loss": 1.317, + "step": 2044 + }, + { + "epoch": 1.988923601249645, + "grad_norm": 0.392578125, + "learning_rate": 1.277415019825417e-06, + "loss": 1.3171, + "step": 2045 + }, + { + "epoch": 1.9898973505903355, + "grad_norm": 0.404296875, + "learning_rate": 1.275184871542366e-06, + "loss": 1.3081, + "step": 2046 + }, + { + "epoch": 1.990871099931026, + "grad_norm": 0.408203125, + "learning_rate": 1.2729560050308325e-06, + "loss": 1.328, + "step": 2047 + }, + { + "epoch": 1.9918448492717167, + "grad_norm": 0.3984375, + "learning_rate": 1.2707284226233283e-06, + "loss": 1.3265, + "step": 2048 + }, + { + "epoch": 1.9928185986124072, + "grad_norm": 0.388671875, + "learning_rate": 1.2685021266510256e-06, + "loss": 1.3072, + "step": 2049 + }, + { + "epoch": 1.993792347953098, + "grad_norm": 0.408203125, + "learning_rate": 1.266277119443744e-06, + "loss": 1.3155, + "step": 2050 + }, + { + "epoch": 1.9947660972937884, + "grad_norm": 0.400390625, + "learning_rate": 1.264053403329961e-06, + "loss": 1.3152, + "step": 2051 + }, + { + "epoch": 1.995739846634479, + "grad_norm": 0.396484375, + "learning_rate": 1.261830980636798e-06, + "loss": 1.2953, + "step": 2052 + }, + { + "epoch": 1.9967135959751694, + "grad_norm": 0.400390625, + "learning_rate": 1.2596098536900263e-06, + "loss": 1.3345, + "step": 2053 + }, + { + "epoch": 1.99768734531586, + "grad_norm": 0.392578125, + "learning_rate": 1.2573900248140586e-06, + "loss": 1.3066, + "step": 2054 + }, + { + "epoch": 1.9986610946565504, + "grad_norm": 0.3984375, + "learning_rate": 1.255171496331952e-06, + "loss": 1.3078, + "step": 2055 + }, + { + "epoch": 1.999634843997241, + "grad_norm": 0.396484375, + "learning_rate": 1.2529542705653992e-06, + "loss": 1.3204, + "step": 2056 + }, + { + "epoch": 2.0006085933379314, + "grad_norm": 0.41015625, + "learning_rate": 1.2507383498347328e-06, + "loss": 1.3295, + "step": 2057 + }, + { + "epoch": 2.0015823426786223, + "grad_norm": 0.400390625, + "learning_rate": 1.248523736458917e-06, + "loss": 1.3092, + "step": 2058 + }, + { + "epoch": 2.0015823426786223, + "eval_loss": 1.337652564048767, + "eval_runtime": 1517.6913, + "eval_samples_per_second": 27.512, + "eval_steps_per_second": 3.439, + "step": 2058 + }, + { + "epoch": 2.000162265222506, + "grad_norm": 0.408203125, + "learning_rate": 1.246310432755551e-06, + "loss": 1.3348, + "step": 2059 + }, + { + "epoch": 2.0011358565575432, + "grad_norm": 0.412109375, + "learning_rate": 1.2440984410408607e-06, + "loss": 1.3299, + "step": 2060 + }, + { + "epoch": 2.0021094478925803, + "grad_norm": 0.390625, + "learning_rate": 1.2418877636297e-06, + "loss": 1.3285, + "step": 2061 + }, + { + "epoch": 2.0030830392276173, + "grad_norm": 0.41015625, + "learning_rate": 1.239678402835546e-06, + "loss": 1.2974, + "step": 2062 + }, + { + "epoch": 2.004056630562655, + "grad_norm": 0.40234375, + "learning_rate": 1.2374703609705014e-06, + "loss": 1.3299, + "step": 2063 + }, + { + "epoch": 2.005030221897692, + "grad_norm": 0.40234375, + "learning_rate": 1.2352636403452844e-06, + "loss": 1.3071, + "step": 2064 + }, + { + "epoch": 2.006003813232729, + "grad_norm": 0.39453125, + "learning_rate": 1.2330582432692349e-06, + "loss": 1.3317, + "step": 2065 + }, + { + "epoch": 2.006977404567766, + "grad_norm": 0.400390625, + "learning_rate": 1.2308541720503029e-06, + "loss": 1.3287, + "step": 2066 + }, + { + "epoch": 2.007950995902803, + "grad_norm": 0.3984375, + "learning_rate": 1.228651428995056e-06, + "loss": 1.3298, + "step": 2067 + }, + { + "epoch": 2.00892458723784, + "grad_norm": 0.396484375, + "learning_rate": 1.2264500164086667e-06, + "loss": 1.325, + "step": 2068 + }, + { + "epoch": 2.0098981785728776, + "grad_norm": 0.392578125, + "learning_rate": 1.2242499365949203e-06, + "loss": 1.3052, + "step": 2069 + }, + { + "epoch": 2.0108717699079146, + "grad_norm": 0.392578125, + "learning_rate": 1.2220511918562036e-06, + "loss": 1.3097, + "step": 2070 + }, + { + "epoch": 2.0118453612429517, + "grad_norm": 0.40234375, + "learning_rate": 1.219853784493507e-06, + "loss": 1.3139, + "step": 2071 + }, + { + "epoch": 2.0128189525779887, + "grad_norm": 0.40234375, + "learning_rate": 1.2176577168064213e-06, + "loss": 1.3104, + "step": 2072 + }, + { + "epoch": 2.0137925439130258, + "grad_norm": 0.40234375, + "learning_rate": 1.2154629910931376e-06, + "loss": 1.3206, + "step": 2073 + }, + { + "epoch": 2.014766135248063, + "grad_norm": 0.392578125, + "learning_rate": 1.213269609650439e-06, + "loss": 1.3183, + "step": 2074 + }, + { + "epoch": 2.0157397265831003, + "grad_norm": 0.3984375, + "learning_rate": 1.2110775747737052e-06, + "loss": 1.3096, + "step": 2075 + }, + { + "epoch": 2.0167133179181373, + "grad_norm": 0.400390625, + "learning_rate": 1.2088868887569036e-06, + "loss": 1.3128, + "step": 2076 + }, + { + "epoch": 2.0176869092531744, + "grad_norm": 0.400390625, + "learning_rate": 1.206697553892593e-06, + "loss": 1.3306, + "step": 2077 + }, + { + "epoch": 2.0186605005882114, + "grad_norm": 0.384765625, + "learning_rate": 1.2045095724719156e-06, + "loss": 1.3094, + "step": 2078 + }, + { + "epoch": 2.0196340919232485, + "grad_norm": 0.396484375, + "learning_rate": 1.2023229467845996e-06, + "loss": 1.342, + "step": 2079 + }, + { + "epoch": 2.0206076832582855, + "grad_norm": 0.388671875, + "learning_rate": 1.2001376791189526e-06, + "loss": 1.3286, + "step": 2080 + }, + { + "epoch": 2.0215812745933226, + "grad_norm": 0.40625, + "learning_rate": 1.1979537717618605e-06, + "loss": 1.3285, + "step": 2081 + }, + { + "epoch": 2.02255486592836, + "grad_norm": 0.396484375, + "learning_rate": 1.195771226998789e-06, + "loss": 1.3068, + "step": 2082 + }, + { + "epoch": 2.023528457263397, + "grad_norm": 0.400390625, + "learning_rate": 1.1935900471137742e-06, + "loss": 1.3012, + "step": 2083 + }, + { + "epoch": 2.024502048598434, + "grad_norm": 0.39453125, + "learning_rate": 1.191410234389425e-06, + "loss": 1.3158, + "step": 2084 + }, + { + "epoch": 2.0254756399334712, + "grad_norm": 0.39453125, + "learning_rate": 1.1892317911069212e-06, + "loss": 1.3224, + "step": 2085 + }, + { + "epoch": 2.0264492312685083, + "grad_norm": 0.39453125, + "learning_rate": 1.1870547195460063e-06, + "loss": 1.3266, + "step": 2086 + }, + { + "epoch": 2.0274228226035453, + "grad_norm": 0.3984375, + "learning_rate": 1.1848790219849923e-06, + "loss": 1.321, + "step": 2087 + }, + { + "epoch": 2.028396413938583, + "grad_norm": 0.396484375, + "learning_rate": 1.1827047007007497e-06, + "loss": 1.3121, + "step": 2088 + }, + { + "epoch": 2.02937000527362, + "grad_norm": 0.392578125, + "learning_rate": 1.1805317579687095e-06, + "loss": 1.3191, + "step": 2089 + }, + { + "epoch": 2.030343596608657, + "grad_norm": 0.404296875, + "learning_rate": 1.1783601960628624e-06, + "loss": 1.3041, + "step": 2090 + }, + { + "epoch": 2.031317187943694, + "grad_norm": 0.40625, + "learning_rate": 1.1761900172557508e-06, + "loss": 1.3053, + "step": 2091 + }, + { + "epoch": 2.032290779278731, + "grad_norm": 0.3984375, + "learning_rate": 1.1740212238184726e-06, + "loss": 1.3006, + "step": 2092 + }, + { + "epoch": 2.033264370613768, + "grad_norm": 0.400390625, + "learning_rate": 1.171853818020674e-06, + "loss": 1.3158, + "step": 2093 + }, + { + "epoch": 2.034237961948805, + "grad_norm": 0.392578125, + "learning_rate": 1.1696878021305483e-06, + "loss": 1.3132, + "step": 2094 + }, + { + "epoch": 2.0352115532838426, + "grad_norm": 0.400390625, + "learning_rate": 1.1675231784148378e-06, + "loss": 1.3303, + "step": 2095 + }, + { + "epoch": 2.0361851446188797, + "grad_norm": 0.390625, + "learning_rate": 1.1653599491388234e-06, + "loss": 1.3153, + "step": 2096 + }, + { + "epoch": 2.0371587359539167, + "grad_norm": 0.396484375, + "learning_rate": 1.1631981165663308e-06, + "loss": 1.3044, + "step": 2097 + }, + { + "epoch": 2.0381323272889538, + "grad_norm": 0.41015625, + "learning_rate": 1.1610376829597215e-06, + "loss": 1.3269, + "step": 2098 + }, + { + "epoch": 2.039105918623991, + "grad_norm": 0.40625, + "learning_rate": 1.1588786505798924e-06, + "loss": 1.3155, + "step": 2099 + }, + { + "epoch": 2.040079509959028, + "grad_norm": 0.3984375, + "learning_rate": 1.1567210216862773e-06, + "loss": 1.314, + "step": 2100 + }, + { + "epoch": 2.0410531012940654, + "grad_norm": 0.404296875, + "learning_rate": 1.1545647985368375e-06, + "loss": 1.3135, + "step": 2101 + }, + { + "epoch": 2.0420266926291024, + "grad_norm": 0.408203125, + "learning_rate": 1.1524099833880667e-06, + "loss": 1.3141, + "step": 2102 + }, + { + "epoch": 2.0430002839641395, + "grad_norm": 0.390625, + "learning_rate": 1.150256578494981e-06, + "loss": 1.3114, + "step": 2103 + }, + { + "epoch": 2.0439738752991765, + "grad_norm": 0.38671875, + "learning_rate": 1.1481045861111256e-06, + "loss": 1.3159, + "step": 2104 + }, + { + "epoch": 2.0449474666342136, + "grad_norm": 0.40234375, + "learning_rate": 1.1459540084885635e-06, + "loss": 1.3034, + "step": 2105 + }, + { + "epoch": 2.0459210579692506, + "grad_norm": 0.390625, + "learning_rate": 1.1438048478778786e-06, + "loss": 1.3174, + "step": 2106 + }, + { + "epoch": 2.0468946493042877, + "grad_norm": 0.3984375, + "learning_rate": 1.1416571065281706e-06, + "loss": 1.3028, + "step": 2107 + }, + { + "epoch": 2.047868240639325, + "grad_norm": 0.39453125, + "learning_rate": 1.1395107866870579e-06, + "loss": 1.3072, + "step": 2108 + }, + { + "epoch": 2.048841831974362, + "grad_norm": 0.39453125, + "learning_rate": 1.1373658906006656e-06, + "loss": 1.2953, + "step": 2109 + }, + { + "epoch": 2.0498154233093993, + "grad_norm": 0.392578125, + "learning_rate": 1.1352224205136348e-06, + "loss": 1.2935, + "step": 2110 + }, + { + "epoch": 2.0507890146444363, + "grad_norm": 0.408203125, + "learning_rate": 1.1330803786691086e-06, + "loss": 1.3207, + "step": 2111 + }, + { + "epoch": 2.0517626059794734, + "grad_norm": 0.40234375, + "learning_rate": 1.1309397673087405e-06, + "loss": 1.3186, + "step": 2112 + }, + { + "epoch": 2.0527361973145104, + "grad_norm": 0.400390625, + "learning_rate": 1.1288005886726823e-06, + "loss": 1.2882, + "step": 2113 + }, + { + "epoch": 2.053709788649548, + "grad_norm": 0.388671875, + "learning_rate": 1.1266628449995913e-06, + "loss": 1.2869, + "step": 2114 + }, + { + "epoch": 2.054683379984585, + "grad_norm": 0.3984375, + "learning_rate": 1.1245265385266187e-06, + "loss": 1.3011, + "step": 2115 + }, + { + "epoch": 2.055656971319622, + "grad_norm": 0.396484375, + "learning_rate": 1.1223916714894146e-06, + "loss": 1.3219, + "step": 2116 + }, + { + "epoch": 2.056630562654659, + "grad_norm": 0.40625, + "learning_rate": 1.1202582461221202e-06, + "loss": 1.3128, + "step": 2117 + }, + { + "epoch": 2.057604153989696, + "grad_norm": 0.396484375, + "learning_rate": 1.1181262646573712e-06, + "loss": 1.3329, + "step": 2118 + }, + { + "epoch": 2.058577745324733, + "grad_norm": 0.40234375, + "learning_rate": 1.1159957293262888e-06, + "loss": 1.2994, + "step": 2119 + }, + { + "epoch": 2.05955133665977, + "grad_norm": 0.404296875, + "learning_rate": 1.1138666423584848e-06, + "loss": 1.3412, + "step": 2120 + }, + { + "epoch": 2.0605249279948077, + "grad_norm": 0.400390625, + "learning_rate": 1.1117390059820507e-06, + "loss": 1.3094, + "step": 2121 + }, + { + "epoch": 2.0614985193298447, + "grad_norm": 0.388671875, + "learning_rate": 1.1096128224235644e-06, + "loss": 1.2937, + "step": 2122 + }, + { + "epoch": 2.062472110664882, + "grad_norm": 0.390625, + "learning_rate": 1.1074880939080793e-06, + "loss": 1.338, + "step": 2123 + }, + { + "epoch": 2.063445701999919, + "grad_norm": 0.396484375, + "learning_rate": 1.1053648226591298e-06, + "loss": 1.3177, + "step": 2124 + }, + { + "epoch": 2.064419293334956, + "grad_norm": 0.40625, + "learning_rate": 1.1032430108987221e-06, + "loss": 1.322, + "step": 2125 + }, + { + "epoch": 2.065392884669993, + "grad_norm": 0.39453125, + "learning_rate": 1.1011226608473391e-06, + "loss": 1.3191, + "step": 2126 + }, + { + "epoch": 2.0663664760050304, + "grad_norm": 0.400390625, + "learning_rate": 1.0990037747239278e-06, + "loss": 1.3111, + "step": 2127 + }, + { + "epoch": 2.0673400673400675, + "grad_norm": 0.396484375, + "learning_rate": 1.0968863547459096e-06, + "loss": 1.3288, + "step": 2128 + }, + { + "epoch": 2.0683136586751045, + "grad_norm": 0.400390625, + "learning_rate": 1.0947704031291668e-06, + "loss": 1.3115, + "step": 2129 + }, + { + "epoch": 2.0692872500101416, + "grad_norm": 0.390625, + "learning_rate": 1.092655922088049e-06, + "loss": 1.3068, + "step": 2130 + }, + { + "epoch": 2.0702608413451786, + "grad_norm": 0.39453125, + "learning_rate": 1.0905429138353628e-06, + "loss": 1.3019, + "step": 2131 + }, + { + "epoch": 2.0712344326802157, + "grad_norm": 0.400390625, + "learning_rate": 1.0884313805823783e-06, + "loss": 1.2997, + "step": 2132 + }, + { + "epoch": 2.0722080240152527, + "grad_norm": 0.396484375, + "learning_rate": 1.0863213245388167e-06, + "loss": 1.3196, + "step": 2133 + }, + { + "epoch": 2.07318161535029, + "grad_norm": 0.39453125, + "learning_rate": 1.0842127479128579e-06, + "loss": 1.3132, + "step": 2134 + }, + { + "epoch": 2.0741552066853273, + "grad_norm": 0.400390625, + "learning_rate": 1.0821056529111296e-06, + "loss": 1.3221, + "step": 2135 + }, + { + "epoch": 2.0751287980203643, + "grad_norm": 0.408203125, + "learning_rate": 1.0800000417387132e-06, + "loss": 1.3214, + "step": 2136 + }, + { + "epoch": 2.0761023893554014, + "grad_norm": 0.392578125, + "learning_rate": 1.0778959165991342e-06, + "loss": 1.3201, + "step": 2137 + }, + { + "epoch": 2.0770759806904384, + "grad_norm": 0.40234375, + "learning_rate": 1.0757932796943637e-06, + "loss": 1.3115, + "step": 2138 + }, + { + "epoch": 2.0780495720254755, + "grad_norm": 0.41015625, + "learning_rate": 1.0736921332248145e-06, + "loss": 1.3333, + "step": 2139 + }, + { + "epoch": 2.079023163360513, + "grad_norm": 0.404296875, + "learning_rate": 1.0715924793893425e-06, + "loss": 1.3109, + "step": 2140 + }, + { + "epoch": 2.07999675469555, + "grad_norm": 0.388671875, + "learning_rate": 1.0694943203852382e-06, + "loss": 1.3066, + "step": 2141 + }, + { + "epoch": 2.080970346030587, + "grad_norm": 0.40625, + "learning_rate": 1.0673976584082304e-06, + "loss": 1.3067, + "step": 2142 + }, + { + "epoch": 2.081943937365624, + "grad_norm": 0.40625, + "learning_rate": 1.0653024956524788e-06, + "loss": 1.3261, + "step": 2143 + }, + { + "epoch": 2.082917528700661, + "grad_norm": 0.408203125, + "learning_rate": 1.063208834310577e-06, + "loss": 1.3376, + "step": 2144 + }, + { + "epoch": 2.083891120035698, + "grad_norm": 0.404296875, + "learning_rate": 1.0611166765735442e-06, + "loss": 1.3078, + "step": 2145 + }, + { + "epoch": 2.0848647113707353, + "grad_norm": 0.400390625, + "learning_rate": 1.0590260246308296e-06, + "loss": 1.3155, + "step": 2146 + }, + { + "epoch": 2.0858383027057728, + "grad_norm": 0.384765625, + "learning_rate": 1.0569368806703029e-06, + "loss": 1.3193, + "step": 2147 + }, + { + "epoch": 2.08681189404081, + "grad_norm": 0.40234375, + "learning_rate": 1.0548492468782598e-06, + "loss": 1.3227, + "step": 2148 + }, + { + "epoch": 2.087785485375847, + "grad_norm": 0.3984375, + "learning_rate": 1.0527631254394103e-06, + "loss": 1.3228, + "step": 2149 + }, + { + "epoch": 2.088759076710884, + "grad_norm": 0.40234375, + "learning_rate": 1.050678518536887e-06, + "loss": 1.3149, + "step": 2150 + }, + { + "epoch": 2.089732668045921, + "grad_norm": 0.412109375, + "learning_rate": 1.0485954283522335e-06, + "loss": 1.3298, + "step": 2151 + }, + { + "epoch": 2.090706259380958, + "grad_norm": 0.396484375, + "learning_rate": 1.0465138570654096e-06, + "loss": 1.3265, + "step": 2152 + }, + { + "epoch": 2.0916798507159955, + "grad_norm": 0.40625, + "learning_rate": 1.0444338068547824e-06, + "loss": 1.3284, + "step": 2153 + }, + { + "epoch": 2.0926534420510325, + "grad_norm": 0.408203125, + "learning_rate": 1.0423552798971303e-06, + "loss": 1.3074, + "step": 2154 + }, + { + "epoch": 2.0936270333860696, + "grad_norm": 0.400390625, + "learning_rate": 1.0402782783676343e-06, + "loss": 1.2978, + "step": 2155 + }, + { + "epoch": 2.0946006247211066, + "grad_norm": 0.396484375, + "learning_rate": 1.0382028044398823e-06, + "loss": 1.3092, + "step": 2156 + }, + { + "epoch": 2.0955742160561437, + "grad_norm": 0.396484375, + "learning_rate": 1.0361288602858605e-06, + "loss": 1.3075, + "step": 2157 + }, + { + "epoch": 2.0965478073911807, + "grad_norm": 0.39453125, + "learning_rate": 1.0340564480759568e-06, + "loss": 1.3138, + "step": 2158 + }, + { + "epoch": 2.097521398726218, + "grad_norm": 0.404296875, + "learning_rate": 1.0319855699789546e-06, + "loss": 1.3134, + "step": 2159 + }, + { + "epoch": 2.0984949900612553, + "grad_norm": 0.404296875, + "learning_rate": 1.0299162281620318e-06, + "loss": 1.3094, + "step": 2160 + }, + { + "epoch": 2.0994685813962923, + "grad_norm": 0.396484375, + "learning_rate": 1.0278484247907578e-06, + "loss": 1.3168, + "step": 2161 + }, + { + "epoch": 2.1004421727313294, + "grad_norm": 0.412109375, + "learning_rate": 1.0257821620290948e-06, + "loss": 1.3154, + "step": 2162 + }, + { + "epoch": 2.1014157640663664, + "grad_norm": 0.390625, + "learning_rate": 1.0237174420393894e-06, + "loss": 1.3323, + "step": 2163 + }, + { + "epoch": 2.1023893554014035, + "grad_norm": 0.40234375, + "learning_rate": 1.0216542669823768e-06, + "loss": 1.3194, + "step": 2164 + }, + { + "epoch": 2.1033629467364405, + "grad_norm": 0.396484375, + "learning_rate": 1.0195926390171724e-06, + "loss": 1.3194, + "step": 2165 + }, + { + "epoch": 2.104336538071478, + "grad_norm": 0.3984375, + "learning_rate": 1.0175325603012754e-06, + "loss": 1.3186, + "step": 2166 + }, + { + "epoch": 2.105310129406515, + "grad_norm": 0.419921875, + "learning_rate": 1.0154740329905607e-06, + "loss": 1.32, + "step": 2167 + }, + { + "epoch": 2.106283720741552, + "grad_norm": 0.396484375, + "learning_rate": 1.0134170592392837e-06, + "loss": 1.3077, + "step": 2168 + }, + { + "epoch": 2.107257312076589, + "grad_norm": 0.396484375, + "learning_rate": 1.0113616412000688e-06, + "loss": 1.3067, + "step": 2169 + }, + { + "epoch": 2.1082309034116262, + "grad_norm": 0.3984375, + "learning_rate": 1.0093077810239185e-06, + "loss": 1.328, + "step": 2170 + }, + { + "epoch": 2.1092044947466633, + "grad_norm": 0.38671875, + "learning_rate": 1.0072554808601981e-06, + "loss": 1.3042, + "step": 2171 + }, + { + "epoch": 2.1101780860817003, + "grad_norm": 0.396484375, + "learning_rate": 1.0052047428566461e-06, + "loss": 1.3384, + "step": 2172 + }, + { + "epoch": 2.111151677416738, + "grad_norm": 0.40234375, + "learning_rate": 1.0031555691593627e-06, + "loss": 1.3132, + "step": 2173 + }, + { + "epoch": 2.112125268751775, + "grad_norm": 0.38671875, + "learning_rate": 1.001107961912814e-06, + "loss": 1.303, + "step": 2174 + }, + { + "epoch": 2.113098860086812, + "grad_norm": 0.408203125, + "learning_rate": 9.990619232598228e-07, + "loss": 1.2988, + "step": 2175 + }, + { + "epoch": 2.114072451421849, + "grad_norm": 0.400390625, + "learning_rate": 9.970174553415754e-07, + "loss": 1.2889, + "step": 2176 + }, + { + "epoch": 2.115046042756886, + "grad_norm": 0.404296875, + "learning_rate": 9.94974560297609e-07, + "loss": 1.3143, + "step": 2177 + }, + { + "epoch": 2.116019634091923, + "grad_norm": 0.39453125, + "learning_rate": 9.929332402658198e-07, + "loss": 1.3233, + "step": 2178 + }, + { + "epoch": 2.1169932254269606, + "grad_norm": 0.40625, + "learning_rate": 9.908934973824522e-07, + "loss": 1.3423, + "step": 2179 + }, + { + "epoch": 2.1179668167619976, + "grad_norm": 0.400390625, + "learning_rate": 9.888553337821005e-07, + "loss": 1.3449, + "step": 2180 + }, + { + "epoch": 2.1189404080970347, + "grad_norm": 0.3984375, + "learning_rate": 9.868187515977085e-07, + "loss": 1.3163, + "step": 2181 + }, + { + "epoch": 2.1199139994320717, + "grad_norm": 0.3984375, + "learning_rate": 9.847837529605631e-07, + "loss": 1.3289, + "step": 2182 + }, + { + "epoch": 2.1208875907671088, + "grad_norm": 0.396484375, + "learning_rate": 9.827503400002933e-07, + "loss": 1.3202, + "step": 2183 + }, + { + "epoch": 2.121861182102146, + "grad_norm": 0.400390625, + "learning_rate": 9.80718514844872e-07, + "loss": 1.3237, + "step": 2184 + }, + { + "epoch": 2.122834773437183, + "grad_norm": 0.404296875, + "learning_rate": 9.786882796206062e-07, + "loss": 1.3416, + "step": 2185 + }, + { + "epoch": 2.1238083647722203, + "grad_norm": 0.408203125, + "learning_rate": 9.76659636452143e-07, + "loss": 1.325, + "step": 2186 + }, + { + "epoch": 2.1247819561072574, + "grad_norm": 0.400390625, + "learning_rate": 9.74632587462461e-07, + "loss": 1.3403, + "step": 2187 + }, + { + "epoch": 2.1257555474422944, + "grad_norm": 0.400390625, + "learning_rate": 9.726071347728702e-07, + "loss": 1.2848, + "step": 2188 + }, + { + "epoch": 2.1267291387773315, + "grad_norm": 0.3984375, + "learning_rate": 9.705832805030124e-07, + "loss": 1.3335, + "step": 2189 + }, + { + "epoch": 2.1277027301123685, + "grad_norm": 0.404296875, + "learning_rate": 9.685610267708533e-07, + "loss": 1.3331, + "step": 2190 + }, + { + "epoch": 2.1286763214474056, + "grad_norm": 0.396484375, + "learning_rate": 9.66540375692688e-07, + "loss": 1.3259, + "step": 2191 + }, + { + "epoch": 2.129649912782443, + "grad_norm": 0.404296875, + "learning_rate": 9.645213293831304e-07, + "loss": 1.3321, + "step": 2192 + }, + { + "epoch": 2.13062350411748, + "grad_norm": 0.3984375, + "learning_rate": 9.625038899551162e-07, + "loss": 1.3234, + "step": 2193 + }, + { + "epoch": 2.131597095452517, + "grad_norm": 0.40234375, + "learning_rate": 9.604880595199011e-07, + "loss": 1.3221, + "step": 2194 + }, + { + "epoch": 2.1325706867875542, + "grad_norm": 0.404296875, + "learning_rate": 9.584738401870545e-07, + "loss": 1.3266, + "step": 2195 + }, + { + "epoch": 2.1335442781225913, + "grad_norm": 0.390625, + "learning_rate": 9.564612340644622e-07, + "loss": 1.2918, + "step": 2196 + }, + { + "epoch": 2.1345178694576283, + "grad_norm": 0.400390625, + "learning_rate": 9.5445024325832e-07, + "loss": 1.2987, + "step": 2197 + }, + { + "epoch": 2.1354914607926654, + "grad_norm": 0.3984375, + "learning_rate": 9.52440869873133e-07, + "loss": 1.338, + "step": 2198 + }, + { + "epoch": 2.136465052127703, + "grad_norm": 0.3984375, + "learning_rate": 9.504331160117164e-07, + "loss": 1.3319, + "step": 2199 + }, + { + "epoch": 2.13743864346274, + "grad_norm": 0.4140625, + "learning_rate": 9.484269837751869e-07, + "loss": 1.3222, + "step": 2200 + }, + { + "epoch": 2.138412234797777, + "grad_norm": 0.39453125, + "learning_rate": 9.464224752629675e-07, + "loss": 1.2974, + "step": 2201 + }, + { + "epoch": 2.139385826132814, + "grad_norm": 0.39453125, + "learning_rate": 9.444195925727786e-07, + "loss": 1.3124, + "step": 2202 + }, + { + "epoch": 2.140359417467851, + "grad_norm": 0.40234375, + "learning_rate": 9.424183378006433e-07, + "loss": 1.3252, + "step": 2203 + }, + { + "epoch": 2.141333008802888, + "grad_norm": 0.39453125, + "learning_rate": 9.404187130408773e-07, + "loss": 1.3056, + "step": 2204 + }, + { + "epoch": 2.1423066001379256, + "grad_norm": 0.39453125, + "learning_rate": 9.384207203860923e-07, + "loss": 1.3379, + "step": 2205 + }, + { + "epoch": 2.1432801914729627, + "grad_norm": 0.400390625, + "learning_rate": 9.364243619271906e-07, + "loss": 1.3085, + "step": 2206 + }, + { + "epoch": 2.1442537828079997, + "grad_norm": 0.40234375, + "learning_rate": 9.344296397533672e-07, + "loss": 1.3085, + "step": 2207 + }, + { + "epoch": 2.1452273741430368, + "grad_norm": 0.3984375, + "learning_rate": 9.324365559521015e-07, + "loss": 1.3084, + "step": 2208 + }, + { + "epoch": 2.146200965478074, + "grad_norm": 0.396484375, + "learning_rate": 9.30445112609161e-07, + "loss": 1.2956, + "step": 2209 + }, + { + "epoch": 2.147174556813111, + "grad_norm": 0.39453125, + "learning_rate": 9.284553118085934e-07, + "loss": 1.3094, + "step": 2210 + }, + { + "epoch": 2.148148148148148, + "grad_norm": 0.39453125, + "learning_rate": 9.264671556327315e-07, + "loss": 1.3072, + "step": 2211 + }, + { + "epoch": 2.1491217394831854, + "grad_norm": 0.388671875, + "learning_rate": 9.244806461621828e-07, + "loss": 1.2836, + "step": 2212 + }, + { + "epoch": 2.1500953308182225, + "grad_norm": 0.40625, + "learning_rate": 9.224957854758352e-07, + "loss": 1.2966, + "step": 2213 + }, + { + "epoch": 2.1510689221532595, + "grad_norm": 0.3984375, + "learning_rate": 9.205125756508485e-07, + "loss": 1.3254, + "step": 2214 + }, + { + "epoch": 2.1520425134882966, + "grad_norm": 0.392578125, + "learning_rate": 9.185310187626556e-07, + "loss": 1.3218, + "step": 2215 + }, + { + "epoch": 2.1530161048233336, + "grad_norm": 0.486328125, + "learning_rate": 9.165511168849594e-07, + "loss": 1.3148, + "step": 2216 + }, + { + "epoch": 2.1539896961583707, + "grad_norm": 0.396484375, + "learning_rate": 9.145728720897329e-07, + "loss": 1.2954, + "step": 2217 + }, + { + "epoch": 2.154963287493408, + "grad_norm": 0.416015625, + "learning_rate": 9.125962864472115e-07, + "loss": 1.329, + "step": 2218 + }, + { + "epoch": 2.155936878828445, + "grad_norm": 0.400390625, + "learning_rate": 9.106213620258977e-07, + "loss": 1.3077, + "step": 2219 + }, + { + "epoch": 2.1569104701634823, + "grad_norm": 0.408203125, + "learning_rate": 9.086481008925521e-07, + "loss": 1.308, + "step": 2220 + }, + { + "epoch": 2.1578840614985193, + "grad_norm": 0.396484375, + "learning_rate": 9.066765051121985e-07, + "loss": 1.3217, + "step": 2221 + }, + { + "epoch": 2.1588576528335564, + "grad_norm": 0.408203125, + "learning_rate": 9.047065767481139e-07, + "loss": 1.3222, + "step": 2222 + }, + { + "epoch": 2.1598312441685934, + "grad_norm": 0.3984375, + "learning_rate": 9.02738317861834e-07, + "loss": 1.3065, + "step": 2223 + }, + { + "epoch": 2.1608048355036304, + "grad_norm": 0.400390625, + "learning_rate": 9.007717305131444e-07, + "loss": 1.313, + "step": 2224 + }, + { + "epoch": 2.161778426838668, + "grad_norm": 0.396484375, + "learning_rate": 8.988068167600849e-07, + "loss": 1.3338, + "step": 2225 + }, + { + "epoch": 2.162752018173705, + "grad_norm": 0.396484375, + "learning_rate": 8.968435786589388e-07, + "loss": 1.2983, + "step": 2226 + }, + { + "epoch": 2.163725609508742, + "grad_norm": 0.39453125, + "learning_rate": 8.948820182642406e-07, + "loss": 1.3118, + "step": 2227 + }, + { + "epoch": 2.164699200843779, + "grad_norm": 0.40625, + "learning_rate": 8.929221376287661e-07, + "loss": 1.3278, + "step": 2228 + }, + { + "epoch": 2.165672792178816, + "grad_norm": 0.404296875, + "learning_rate": 8.90963938803536e-07, + "loss": 1.3068, + "step": 2229 + }, + { + "epoch": 2.166646383513853, + "grad_norm": 0.400390625, + "learning_rate": 8.890074238378074e-07, + "loss": 1.3066, + "step": 2230 + }, + { + "epoch": 2.1676199748488907, + "grad_norm": 0.396484375, + "learning_rate": 8.870525947790792e-07, + "loss": 1.3118, + "step": 2231 + }, + { + "epoch": 2.1685935661839277, + "grad_norm": 0.390625, + "learning_rate": 8.85099453673082e-07, + "loss": 1.2933, + "step": 2232 + }, + { + "epoch": 2.169567157518965, + "grad_norm": 0.400390625, + "learning_rate": 8.831480025637837e-07, + "loss": 1.3178, + "step": 2233 + }, + { + "epoch": 2.170540748854002, + "grad_norm": 0.396484375, + "learning_rate": 8.8119824349338e-07, + "loss": 1.306, + "step": 2234 + }, + { + "epoch": 2.171514340189039, + "grad_norm": 0.41015625, + "learning_rate": 8.792501785022997e-07, + "loss": 1.3078, + "step": 2235 + }, + { + "epoch": 2.172487931524076, + "grad_norm": 0.390625, + "learning_rate": 8.773038096291958e-07, + "loss": 1.3018, + "step": 2236 + }, + { + "epoch": 2.1734615228591134, + "grad_norm": 0.412109375, + "learning_rate": 8.753591389109475e-07, + "loss": 1.344, + "step": 2237 + }, + { + "epoch": 2.1744351141941505, + "grad_norm": 0.400390625, + "learning_rate": 8.734161683826554e-07, + "loss": 1.3137, + "step": 2238 + }, + { + "epoch": 2.1754087055291875, + "grad_norm": 0.396484375, + "learning_rate": 8.714749000776441e-07, + "loss": 1.3293, + "step": 2239 + }, + { + "epoch": 2.1763822968642246, + "grad_norm": 0.39453125, + "learning_rate": 8.695353360274527e-07, + "loss": 1.3067, + "step": 2240 + }, + { + "epoch": 2.1773558881992616, + "grad_norm": 0.404296875, + "learning_rate": 8.67597478261841e-07, + "loss": 1.3205, + "step": 2241 + }, + { + "epoch": 2.1783294795342987, + "grad_norm": 0.40234375, + "learning_rate": 8.656613288087795e-07, + "loss": 1.3126, + "step": 2242 + }, + { + "epoch": 2.1793030708693357, + "grad_norm": 0.404296875, + "learning_rate": 8.63726889694454e-07, + "loss": 1.2934, + "step": 2243 + }, + { + "epoch": 2.180276662204373, + "grad_norm": 0.388671875, + "learning_rate": 8.617941629432572e-07, + "loss": 1.2984, + "step": 2244 + }, + { + "epoch": 2.1812502535394103, + "grad_norm": 0.3984375, + "learning_rate": 8.598631505777932e-07, + "loss": 1.3209, + "step": 2245 + }, + { + "epoch": 2.1822238448744473, + "grad_norm": 0.3984375, + "learning_rate": 8.57933854618869e-07, + "loss": 1.314, + "step": 2246 + }, + { + "epoch": 2.1831974362094844, + "grad_norm": 0.39453125, + "learning_rate": 8.56006277085499e-07, + "loss": 1.3065, + "step": 2247 + }, + { + "epoch": 2.1841710275445214, + "grad_norm": 0.3984375, + "learning_rate": 8.540804199948938e-07, + "loss": 1.3275, + "step": 2248 + }, + { + "epoch": 2.1851446188795585, + "grad_norm": 0.3984375, + "learning_rate": 8.521562853624693e-07, + "loss": 1.3211, + "step": 2249 + }, + { + "epoch": 2.186118210214596, + "grad_norm": 0.40625, + "learning_rate": 8.502338752018344e-07, + "loss": 1.335, + "step": 2250 + }, + { + "epoch": 2.187091801549633, + "grad_norm": 0.396484375, + "learning_rate": 8.483131915247969e-07, + "loss": 1.3178, + "step": 2251 + }, + { + "epoch": 2.18806539288467, + "grad_norm": 0.40625, + "learning_rate": 8.463942363413546e-07, + "loss": 1.3232, + "step": 2252 + }, + { + "epoch": 2.189038984219707, + "grad_norm": 0.39453125, + "learning_rate": 8.444770116596998e-07, + "loss": 1.3106, + "step": 2253 + }, + { + "epoch": 2.190012575554744, + "grad_norm": 0.392578125, + "learning_rate": 8.425615194862102e-07, + "loss": 1.3297, + "step": 2254 + }, + { + "epoch": 2.190986166889781, + "grad_norm": 0.392578125, + "learning_rate": 8.406477618254538e-07, + "loss": 1.3106, + "step": 2255 + }, + { + "epoch": 2.1919597582248187, + "grad_norm": 0.3984375, + "learning_rate": 8.387357406801804e-07, + "loss": 1.3031, + "step": 2256 + }, + { + "epoch": 2.1929333495598557, + "grad_norm": 0.3984375, + "learning_rate": 8.368254580513261e-07, + "loss": 1.3099, + "step": 2257 + }, + { + "epoch": 2.193906940894893, + "grad_norm": 0.388671875, + "learning_rate": 8.349169159380027e-07, + "loss": 1.3115, + "step": 2258 + }, + { + "epoch": 2.19488053222993, + "grad_norm": 0.39453125, + "learning_rate": 8.33010116337507e-07, + "loss": 1.3064, + "step": 2259 + }, + { + "epoch": 2.195854123564967, + "grad_norm": 0.39453125, + "learning_rate": 8.311050612453048e-07, + "loss": 1.3139, + "step": 2260 + }, + { + "epoch": 2.196827714900004, + "grad_norm": 0.39453125, + "learning_rate": 8.292017526550425e-07, + "loss": 1.3012, + "step": 2261 + }, + { + "epoch": 2.197801306235041, + "grad_norm": 0.400390625, + "learning_rate": 8.273001925585353e-07, + "loss": 1.3185, + "step": 2262 + }, + { + "epoch": 2.1987748975700785, + "grad_norm": 0.466796875, + "learning_rate": 8.254003829457707e-07, + "loss": 1.3078, + "step": 2263 + }, + { + "epoch": 2.1997484889051155, + "grad_norm": 0.396484375, + "learning_rate": 8.235023258049024e-07, + "loss": 1.3091, + "step": 2264 + }, + { + "epoch": 2.2007220802401526, + "grad_norm": 0.39453125, + "learning_rate": 8.21606023122252e-07, + "loss": 1.3116, + "step": 2265 + }, + { + "epoch": 2.2016956715751896, + "grad_norm": 0.3984375, + "learning_rate": 8.197114768823027e-07, + "loss": 1.342, + "step": 2266 + }, + { + "epoch": 2.2026692629102267, + "grad_norm": 0.396484375, + "learning_rate": 8.178186890677029e-07, + "loss": 1.3108, + "step": 2267 + }, + { + "epoch": 2.2036428542452637, + "grad_norm": 0.40234375, + "learning_rate": 8.159276616592568e-07, + "loss": 1.3002, + "step": 2268 + }, + { + "epoch": 2.2046164455803012, + "grad_norm": 0.390625, + "learning_rate": 8.140383966359302e-07, + "loss": 1.3349, + "step": 2269 + }, + { + "epoch": 2.2055900369153383, + "grad_norm": 0.400390625, + "learning_rate": 8.121508959748423e-07, + "loss": 1.3135, + "step": 2270 + }, + { + "epoch": 2.2065636282503753, + "grad_norm": 0.3984375, + "learning_rate": 8.102651616512658e-07, + "loss": 1.3086, + "step": 2271 + }, + { + "epoch": 2.2075372195854124, + "grad_norm": 0.3984375, + "learning_rate": 8.083811956386253e-07, + "loss": 1.324, + "step": 2272 + }, + { + "epoch": 2.2085108109204494, + "grad_norm": 0.39453125, + "learning_rate": 8.06498999908496e-07, + "loss": 1.3149, + "step": 2273 + }, + { + "epoch": 2.2094844022554865, + "grad_norm": 0.390625, + "learning_rate": 8.046185764305986e-07, + "loss": 1.3083, + "step": 2274 + }, + { + "epoch": 2.2104579935905235, + "grad_norm": 0.404296875, + "learning_rate": 8.027399271728012e-07, + "loss": 1.292, + "step": 2275 + }, + { + "epoch": 2.211431584925561, + "grad_norm": 0.40625, + "learning_rate": 8.008630541011128e-07, + "loss": 1.3117, + "step": 2276 + }, + { + "epoch": 2.212405176260598, + "grad_norm": 0.390625, + "learning_rate": 7.989879591796862e-07, + "loss": 1.3037, + "step": 2277 + }, + { + "epoch": 2.213378767595635, + "grad_norm": 0.3984375, + "learning_rate": 7.971146443708117e-07, + "loss": 1.3396, + "step": 2278 + }, + { + "epoch": 2.214352358930672, + "grad_norm": 0.40625, + "learning_rate": 7.952431116349155e-07, + "loss": 1.3452, + "step": 2279 + }, + { + "epoch": 2.215325950265709, + "grad_norm": 0.3984375, + "learning_rate": 7.933733629305626e-07, + "loss": 1.3219, + "step": 2280 + }, + { + "epoch": 2.2162995416007463, + "grad_norm": 0.3984375, + "learning_rate": 7.915054002144478e-07, + "loss": 1.3486, + "step": 2281 + }, + { + "epoch": 2.2172731329357838, + "grad_norm": 0.390625, + "learning_rate": 7.89639225441397e-07, + "loss": 1.3059, + "step": 2282 + }, + { + "epoch": 2.218246724270821, + "grad_norm": 0.396484375, + "learning_rate": 7.877748405643676e-07, + "loss": 1.3048, + "step": 2283 + }, + { + "epoch": 2.219220315605858, + "grad_norm": 0.400390625, + "learning_rate": 7.859122475344408e-07, + "loss": 1.3334, + "step": 2284 + }, + { + "epoch": 2.220193906940895, + "grad_norm": 0.388671875, + "learning_rate": 7.840514483008252e-07, + "loss": 1.3033, + "step": 2285 + }, + { + "epoch": 2.221167498275932, + "grad_norm": 0.404296875, + "learning_rate": 7.821924448108492e-07, + "loss": 1.343, + "step": 2286 + }, + { + "epoch": 2.222141089610969, + "grad_norm": 0.400390625, + "learning_rate": 7.803352390099661e-07, + "loss": 1.3136, + "step": 2287 + }, + { + "epoch": 2.223114680946006, + "grad_norm": 0.392578125, + "learning_rate": 7.784798328417439e-07, + "loss": 1.3015, + "step": 2288 + }, + { + "epoch": 2.2240882722810436, + "grad_norm": 0.400390625, + "learning_rate": 7.766262282478687e-07, + "loss": 1.3181, + "step": 2289 + }, + { + "epoch": 2.2250618636160806, + "grad_norm": 0.3984375, + "learning_rate": 7.747744271681429e-07, + "loss": 1.2962, + "step": 2290 + }, + { + "epoch": 2.2260354549511177, + "grad_norm": 0.396484375, + "learning_rate": 7.729244315404782e-07, + "loss": 1.3244, + "step": 2291 + }, + { + "epoch": 2.2270090462861547, + "grad_norm": 0.40234375, + "learning_rate": 7.710762433009009e-07, + "loss": 1.3174, + "step": 2292 + }, + { + "epoch": 2.2279826376211918, + "grad_norm": 0.40234375, + "learning_rate": 7.692298643835424e-07, + "loss": 1.312, + "step": 2293 + }, + { + "epoch": 2.228956228956229, + "grad_norm": 0.396484375, + "learning_rate": 7.673852967206416e-07, + "loss": 1.306, + "step": 2294 + }, + { + "epoch": 2.2299298202912663, + "grad_norm": 0.396484375, + "learning_rate": 7.655425422425436e-07, + "loss": 1.3128, + "step": 2295 + }, + { + "epoch": 2.2309034116263033, + "grad_norm": 0.3984375, + "learning_rate": 7.637016028776941e-07, + "loss": 1.3313, + "step": 2296 + }, + { + "epoch": 2.2318770029613404, + "grad_norm": 0.408203125, + "learning_rate": 7.618624805526389e-07, + "loss": 1.3239, + "step": 2297 + }, + { + "epoch": 2.2328505942963774, + "grad_norm": 0.392578125, + "learning_rate": 7.600251771920253e-07, + "loss": 1.3198, + "step": 2298 + }, + { + "epoch": 2.2338241856314145, + "grad_norm": 0.4140625, + "learning_rate": 7.58189694718593e-07, + "loss": 1.305, + "step": 2299 + }, + { + "epoch": 2.2347977769664515, + "grad_norm": 0.396484375, + "learning_rate": 7.563560350531795e-07, + "loss": 1.32, + "step": 2300 + }, + { + "epoch": 2.2357713683014886, + "grad_norm": 0.416015625, + "learning_rate": 7.545242001147123e-07, + "loss": 1.2973, + "step": 2301 + }, + { + "epoch": 2.236744959636526, + "grad_norm": 0.400390625, + "learning_rate": 7.526941918202116e-07, + "loss": 1.3227, + "step": 2302 + }, + { + "epoch": 2.237718550971563, + "grad_norm": 0.392578125, + "learning_rate": 7.508660120847841e-07, + "loss": 1.3226, + "step": 2303 + }, + { + "epoch": 2.2386921423066, + "grad_norm": 0.400390625, + "learning_rate": 7.490396628216237e-07, + "loss": 1.3061, + "step": 2304 + }, + { + "epoch": 2.2396657336416372, + "grad_norm": 0.408203125, + "learning_rate": 7.472151459420079e-07, + "loss": 1.3107, + "step": 2305 + }, + { + "epoch": 2.2406393249766743, + "grad_norm": 0.396484375, + "learning_rate": 7.453924633552986e-07, + "loss": 1.312, + "step": 2306 + }, + { + "epoch": 2.2416129163117113, + "grad_norm": 0.408203125, + "learning_rate": 7.435716169689355e-07, + "loss": 1.3287, + "step": 2307 + }, + { + "epoch": 2.242586507646749, + "grad_norm": 0.396484375, + "learning_rate": 7.417526086884394e-07, + "loss": 1.3028, + "step": 2308 + }, + { + "epoch": 2.243560098981786, + "grad_norm": 0.390625, + "learning_rate": 7.399354404174047e-07, + "loss": 1.2748, + "step": 2309 + }, + { + "epoch": 2.244533690316823, + "grad_norm": 0.39453125, + "learning_rate": 7.381201140575031e-07, + "loss": 1.3154, + "step": 2310 + }, + { + "epoch": 2.24550728165186, + "grad_norm": 0.392578125, + "learning_rate": 7.363066315084761e-07, + "loss": 1.3212, + "step": 2311 + }, + { + "epoch": 2.246480872986897, + "grad_norm": 0.40234375, + "learning_rate": 7.344949946681382e-07, + "loss": 1.3299, + "step": 2312 + }, + { + "epoch": 2.247454464321934, + "grad_norm": 0.400390625, + "learning_rate": 7.326852054323693e-07, + "loss": 1.3035, + "step": 2313 + }, + { + "epoch": 2.248428055656971, + "grad_norm": 0.404296875, + "learning_rate": 7.308772656951202e-07, + "loss": 1.3132, + "step": 2314 + }, + { + "epoch": 2.2494016469920086, + "grad_norm": 0.388671875, + "learning_rate": 7.290711773484002e-07, + "loss": 1.3047, + "step": 2315 + }, + { + "epoch": 2.2503752383270457, + "grad_norm": 0.390625, + "learning_rate": 7.272669422822867e-07, + "loss": 1.3009, + "step": 2316 + }, + { + "epoch": 2.2513488296620827, + "grad_norm": 0.3984375, + "learning_rate": 7.254645623849141e-07, + "loss": 1.3161, + "step": 2317 + }, + { + "epoch": 2.2523224209971198, + "grad_norm": 0.3984375, + "learning_rate": 7.23664039542478e-07, + "loss": 1.2989, + "step": 2318 + }, + { + "epoch": 2.253296012332157, + "grad_norm": 0.40234375, + "learning_rate": 7.218653756392272e-07, + "loss": 1.3291, + "step": 2319 + }, + { + "epoch": 2.254269603667194, + "grad_norm": 0.3984375, + "learning_rate": 7.200685725574694e-07, + "loss": 1.2851, + "step": 2320 + }, + { + "epoch": 2.2552431950022314, + "grad_norm": 0.404296875, + "learning_rate": 7.182736321775607e-07, + "loss": 1.3195, + "step": 2321 + }, + { + "epoch": 2.2562167863372684, + "grad_norm": 0.40234375, + "learning_rate": 7.164805563779115e-07, + "loss": 1.2928, + "step": 2322 + }, + { + "epoch": 2.2571903776723055, + "grad_norm": 0.3984375, + "learning_rate": 7.146893470349778e-07, + "loss": 1.3178, + "step": 2323 + }, + { + "epoch": 2.2581639690073425, + "grad_norm": 0.39453125, + "learning_rate": 7.129000060232649e-07, + "loss": 1.2987, + "step": 2324 + }, + { + "epoch": 2.2591375603423796, + "grad_norm": 0.40234375, + "learning_rate": 7.111125352153217e-07, + "loss": 1.3162, + "step": 2325 + }, + { + "epoch": 2.2601111516774166, + "grad_norm": 0.40625, + "learning_rate": 7.093269364817398e-07, + "loss": 1.3139, + "step": 2326 + }, + { + "epoch": 2.2610847430124537, + "grad_norm": 0.40625, + "learning_rate": 7.07543211691151e-07, + "loss": 1.3175, + "step": 2327 + }, + { + "epoch": 2.262058334347491, + "grad_norm": 0.396484375, + "learning_rate": 7.057613627102286e-07, + "loss": 1.3247, + "step": 2328 + }, + { + "epoch": 2.263031925682528, + "grad_norm": 0.408203125, + "learning_rate": 7.039813914036792e-07, + "loss": 1.3079, + "step": 2329 + }, + { + "epoch": 2.2640055170175652, + "grad_norm": 0.396484375, + "learning_rate": 7.022032996342485e-07, + "loss": 1.3464, + "step": 2330 + }, + { + "epoch": 2.2649791083526023, + "grad_norm": 0.388671875, + "learning_rate": 7.004270892627113e-07, + "loss": 1.2976, + "step": 2331 + }, + { + "epoch": 2.2659526996876393, + "grad_norm": 0.3984375, + "learning_rate": 6.986527621478773e-07, + "loss": 1.3219, + "step": 2332 + }, + { + "epoch": 2.2669262910226764, + "grad_norm": 0.392578125, + "learning_rate": 6.96880320146581e-07, + "loss": 1.2983, + "step": 2333 + }, + { + "epoch": 2.267899882357714, + "grad_norm": 0.3984375, + "learning_rate": 6.95109765113689e-07, + "loss": 1.3336, + "step": 2334 + }, + { + "epoch": 2.268873473692751, + "grad_norm": 0.396484375, + "learning_rate": 6.933410989020886e-07, + "loss": 1.297, + "step": 2335 + }, + { + "epoch": 2.269847065027788, + "grad_norm": 0.404296875, + "learning_rate": 6.915743233626954e-07, + "loss": 1.3146, + "step": 2336 + }, + { + "epoch": 2.270820656362825, + "grad_norm": 0.40234375, + "learning_rate": 6.8980944034444e-07, + "loss": 1.313, + "step": 2337 + }, + { + "epoch": 2.271794247697862, + "grad_norm": 0.396484375, + "learning_rate": 6.880464516942787e-07, + "loss": 1.3195, + "step": 2338 + }, + { + "epoch": 2.272767839032899, + "grad_norm": 0.39453125, + "learning_rate": 6.86285359257181e-07, + "loss": 1.3295, + "step": 2339 + }, + { + "epoch": 2.273741430367936, + "grad_norm": 0.40234375, + "learning_rate": 6.845261648761353e-07, + "loss": 1.3069, + "step": 2340 + }, + { + "epoch": 2.2747150217029737, + "grad_norm": 0.39453125, + "learning_rate": 6.827688703921407e-07, + "loss": 1.3146, + "step": 2341 + }, + { + "epoch": 2.2756886130380107, + "grad_norm": 0.400390625, + "learning_rate": 6.810134776442107e-07, + "loss": 1.318, + "step": 2342 + }, + { + "epoch": 2.276662204373048, + "grad_norm": 0.400390625, + "learning_rate": 6.79259988469366e-07, + "loss": 1.2957, + "step": 2343 + }, + { + "epoch": 2.277635795708085, + "grad_norm": 0.3984375, + "learning_rate": 6.775084047026381e-07, + "loss": 1.3402, + "step": 2344 + }, + { + "epoch": 2.278609387043122, + "grad_norm": 0.404296875, + "learning_rate": 6.757587281770614e-07, + "loss": 1.3118, + "step": 2345 + }, + { + "epoch": 2.279582978378159, + "grad_norm": 0.39453125, + "learning_rate": 6.740109607236775e-07, + "loss": 1.3179, + "step": 2346 + }, + { + "epoch": 2.2805565697131964, + "grad_norm": 0.400390625, + "learning_rate": 6.722651041715278e-07, + "loss": 1.313, + "step": 2347 + }, + { + "epoch": 2.2815301610482335, + "grad_norm": 0.400390625, + "learning_rate": 6.705211603476547e-07, + "loss": 1.3097, + "step": 2348 + }, + { + "epoch": 2.2825037523832705, + "grad_norm": 0.3984375, + "learning_rate": 6.687791310770983e-07, + "loss": 1.3099, + "step": 2349 + }, + { + "epoch": 2.2834773437183076, + "grad_norm": 0.40234375, + "learning_rate": 6.670390181828973e-07, + "loss": 1.3247, + "step": 2350 + }, + { + "epoch": 2.2844509350533446, + "grad_norm": 0.40234375, + "learning_rate": 6.653008234860814e-07, + "loss": 1.3361, + "step": 2351 + }, + { + "epoch": 2.2854245263883817, + "grad_norm": 0.396484375, + "learning_rate": 6.635645488056771e-07, + "loss": 1.3343, + "step": 2352 + }, + { + "epoch": 2.2863981177234187, + "grad_norm": 0.404296875, + "learning_rate": 6.618301959586973e-07, + "loss": 1.3328, + "step": 2353 + }, + { + "epoch": 2.287371709058456, + "grad_norm": 0.404296875, + "learning_rate": 6.600977667601477e-07, + "loss": 1.3181, + "step": 2354 + }, + { + "epoch": 2.2883453003934933, + "grad_norm": 0.388671875, + "learning_rate": 6.58367263023017e-07, + "loss": 1.3199, + "step": 2355 + }, + { + "epoch": 2.2893188917285303, + "grad_norm": 0.404296875, + "learning_rate": 6.566386865582827e-07, + "loss": 1.3364, + "step": 2356 + }, + { + "epoch": 2.2902924830635674, + "grad_norm": 0.396484375, + "learning_rate": 6.549120391749023e-07, + "loss": 1.2999, + "step": 2357 + }, + { + "epoch": 2.2912660743986044, + "grad_norm": 0.40234375, + "learning_rate": 6.531873226798174e-07, + "loss": 1.3147, + "step": 2358 + }, + { + "epoch": 2.2922396657336415, + "grad_norm": 0.388671875, + "learning_rate": 6.514645388779447e-07, + "loss": 1.2999, + "step": 2359 + }, + { + "epoch": 2.293213257068679, + "grad_norm": 0.3984375, + "learning_rate": 6.497436895721834e-07, + "loss": 1.3116, + "step": 2360 + }, + { + "epoch": 2.294186848403716, + "grad_norm": 0.392578125, + "learning_rate": 6.48024776563404e-07, + "loss": 1.3199, + "step": 2361 + }, + { + "epoch": 2.295160439738753, + "grad_norm": 0.40234375, + "learning_rate": 6.46307801650454e-07, + "loss": 1.3237, + "step": 2362 + }, + { + "epoch": 2.29613403107379, + "grad_norm": 0.3984375, + "learning_rate": 6.445927666301494e-07, + "loss": 1.3241, + "step": 2363 + }, + { + "epoch": 2.297107622408827, + "grad_norm": 0.396484375, + "learning_rate": 6.428796732972797e-07, + "loss": 1.3209, + "step": 2364 + }, + { + "epoch": 2.298081213743864, + "grad_norm": 0.400390625, + "learning_rate": 6.411685234445989e-07, + "loss": 1.3029, + "step": 2365 + }, + { + "epoch": 2.2990548050789013, + "grad_norm": 0.392578125, + "learning_rate": 6.394593188628303e-07, + "loss": 1.2924, + "step": 2366 + }, + { + "epoch": 2.3000283964139387, + "grad_norm": 0.396484375, + "learning_rate": 6.377520613406585e-07, + "loss": 1.328, + "step": 2367 + }, + { + "epoch": 2.301001987748976, + "grad_norm": 0.404296875, + "learning_rate": 6.360467526647333e-07, + "loss": 1.3369, + "step": 2368 + }, + { + "epoch": 2.301975579084013, + "grad_norm": 0.392578125, + "learning_rate": 6.343433946196631e-07, + "loss": 1.3079, + "step": 2369 + }, + { + "epoch": 2.30294917041905, + "grad_norm": 0.3984375, + "learning_rate": 6.326419889880156e-07, + "loss": 1.3376, + "step": 2370 + }, + { + "epoch": 2.303922761754087, + "grad_norm": 0.40234375, + "learning_rate": 6.309425375503144e-07, + "loss": 1.298, + "step": 2371 + }, + { + "epoch": 2.304896353089124, + "grad_norm": 0.390625, + "learning_rate": 6.292450420850402e-07, + "loss": 1.3031, + "step": 2372 + }, + { + "epoch": 2.3058699444241615, + "grad_norm": 0.408203125, + "learning_rate": 6.275495043686244e-07, + "loss": 1.3244, + "step": 2373 + }, + { + "epoch": 2.3068435357591985, + "grad_norm": 0.404296875, + "learning_rate": 6.258559261754518e-07, + "loss": 1.3114, + "step": 2374 + }, + { + "epoch": 2.3078171270942356, + "grad_norm": 0.3984375, + "learning_rate": 6.24164309277854e-07, + "loss": 1.3265, + "step": 2375 + }, + { + "epoch": 2.3087907184292726, + "grad_norm": 0.400390625, + "learning_rate": 6.224746554461128e-07, + "loss": 1.2972, + "step": 2376 + }, + { + "epoch": 2.3097643097643097, + "grad_norm": 0.39453125, + "learning_rate": 6.207869664484542e-07, + "loss": 1.3105, + "step": 2377 + }, + { + "epoch": 2.3107379010993467, + "grad_norm": 0.400390625, + "learning_rate": 6.191012440510469e-07, + "loss": 1.3157, + "step": 2378 + }, + { + "epoch": 2.311711492434384, + "grad_norm": 0.4140625, + "learning_rate": 6.174174900180044e-07, + "loss": 1.3024, + "step": 2379 + }, + { + "epoch": 2.3126850837694213, + "grad_norm": 0.396484375, + "learning_rate": 6.157357061113781e-07, + "loss": 1.3134, + "step": 2380 + }, + { + "epoch": 2.3136586751044583, + "grad_norm": 0.392578125, + "learning_rate": 6.140558940911573e-07, + "loss": 1.3184, + "step": 2381 + }, + { + "epoch": 2.3146322664394954, + "grad_norm": 0.412109375, + "learning_rate": 6.123780557152703e-07, + "loss": 1.3197, + "step": 2382 + }, + { + "epoch": 2.3156058577745324, + "grad_norm": 0.39453125, + "learning_rate": 6.10702192739577e-07, + "loss": 1.337, + "step": 2383 + }, + { + "epoch": 2.3165794491095695, + "grad_norm": 0.404296875, + "learning_rate": 6.09028306917873e-07, + "loss": 1.303, + "step": 2384 + }, + { + "epoch": 2.3175530404446065, + "grad_norm": 0.41796875, + "learning_rate": 6.073564000018811e-07, + "loss": 1.3265, + "step": 2385 + }, + { + "epoch": 2.318526631779644, + "grad_norm": 0.40234375, + "learning_rate": 6.056864737412574e-07, + "loss": 1.3251, + "step": 2386 + }, + { + "epoch": 2.319500223114681, + "grad_norm": 0.453125, + "learning_rate": 6.040185298835821e-07, + "loss": 1.3304, + "step": 2387 + }, + { + "epoch": 2.320473814449718, + "grad_norm": 0.3984375, + "learning_rate": 6.02352570174361e-07, + "loss": 1.2987, + "step": 2388 + }, + { + "epoch": 2.321447405784755, + "grad_norm": 0.3984375, + "learning_rate": 6.006885963570261e-07, + "loss": 1.3073, + "step": 2389 + }, + { + "epoch": 2.322420997119792, + "grad_norm": 0.3984375, + "learning_rate": 5.990266101729278e-07, + "loss": 1.3253, + "step": 2390 + }, + { + "epoch": 2.3233945884548293, + "grad_norm": 0.392578125, + "learning_rate": 5.973666133613393e-07, + "loss": 1.2776, + "step": 2391 + }, + { + "epoch": 2.3243681797898663, + "grad_norm": 0.390625, + "learning_rate": 5.957086076594502e-07, + "loss": 1.3026, + "step": 2392 + }, + { + "epoch": 2.325341771124904, + "grad_norm": 0.400390625, + "learning_rate": 5.940525948023657e-07, + "loss": 1.3391, + "step": 2393 + }, + { + "epoch": 2.326315362459941, + "grad_norm": 0.400390625, + "learning_rate": 5.923985765231082e-07, + "loss": 1.307, + "step": 2394 + }, + { + "epoch": 2.327288953794978, + "grad_norm": 0.3984375, + "learning_rate": 5.907465545526109e-07, + "loss": 1.3212, + "step": 2395 + }, + { + "epoch": 2.328262545130015, + "grad_norm": 0.404296875, + "learning_rate": 5.890965306197166e-07, + "loss": 1.319, + "step": 2396 + }, + { + "epoch": 2.329236136465052, + "grad_norm": 0.39453125, + "learning_rate": 5.874485064511809e-07, + "loss": 1.3012, + "step": 2397 + }, + { + "epoch": 2.330209727800089, + "grad_norm": 0.388671875, + "learning_rate": 5.858024837716628e-07, + "loss": 1.3028, + "step": 2398 + }, + { + "epoch": 2.3311833191351266, + "grad_norm": 0.412109375, + "learning_rate": 5.841584643037295e-07, + "loss": 1.3361, + "step": 2399 + }, + { + "epoch": 2.3321569104701636, + "grad_norm": 0.400390625, + "learning_rate": 5.825164497678495e-07, + "loss": 1.3167, + "step": 2400 + }, + { + "epoch": 2.3331305018052007, + "grad_norm": 0.408203125, + "learning_rate": 5.808764418823959e-07, + "loss": 1.318, + "step": 2401 + }, + { + "epoch": 2.3341040931402377, + "grad_norm": 0.388671875, + "learning_rate": 5.792384423636394e-07, + "loss": 1.3019, + "step": 2402 + }, + { + "epoch": 2.3350776844752748, + "grad_norm": 0.392578125, + "learning_rate": 5.77602452925749e-07, + "loss": 1.3256, + "step": 2403 + }, + { + "epoch": 2.336051275810312, + "grad_norm": 0.3984375, + "learning_rate": 5.759684752807925e-07, + "loss": 1.3099, + "step": 2404 + }, + { + "epoch": 2.337024867145349, + "grad_norm": 0.3984375, + "learning_rate": 5.743365111387303e-07, + "loss": 1.319, + "step": 2405 + }, + { + "epoch": 2.3379984584803863, + "grad_norm": 0.3984375, + "learning_rate": 5.727065622074154e-07, + "loss": 1.3009, + "step": 2406 + }, + { + "epoch": 2.3389720498154234, + "grad_norm": 0.400390625, + "learning_rate": 5.710786301925938e-07, + "loss": 1.3294, + "step": 2407 + }, + { + "epoch": 2.3399456411504604, + "grad_norm": 0.3984375, + "learning_rate": 5.694527167978986e-07, + "loss": 1.2959, + "step": 2408 + }, + { + "epoch": 2.3409192324854975, + "grad_norm": 0.3984375, + "learning_rate": 5.678288237248525e-07, + "loss": 1.3163, + "step": 2409 + }, + { + "epoch": 2.3418928238205345, + "grad_norm": 0.3984375, + "learning_rate": 5.662069526728617e-07, + "loss": 1.3368, + "step": 2410 + }, + { + "epoch": 2.3428664151555716, + "grad_norm": 0.39453125, + "learning_rate": 5.645871053392191e-07, + "loss": 1.3278, + "step": 2411 + }, + { + "epoch": 2.343840006490609, + "grad_norm": 0.400390625, + "learning_rate": 5.629692834190963e-07, + "loss": 1.3079, + "step": 2412 + }, + { + "epoch": 2.344813597825646, + "grad_norm": 0.396484375, + "learning_rate": 5.613534886055502e-07, + "loss": 1.2973, + "step": 2413 + }, + { + "epoch": 2.345787189160683, + "grad_norm": 0.38671875, + "learning_rate": 5.597397225895098e-07, + "loss": 1.3207, + "step": 2414 + }, + { + "epoch": 2.3467607804957202, + "grad_norm": 0.39453125, + "learning_rate": 5.581279870597866e-07, + "loss": 1.3242, + "step": 2415 + }, + { + "epoch": 2.3477343718307573, + "grad_norm": 0.388671875, + "learning_rate": 5.565182837030645e-07, + "loss": 1.3133, + "step": 2416 + }, + { + "epoch": 2.3487079631657943, + "grad_norm": 0.390625, + "learning_rate": 5.549106142039018e-07, + "loss": 1.3171, + "step": 2417 + }, + { + "epoch": 2.3496815545008314, + "grad_norm": 0.390625, + "learning_rate": 5.533049802447268e-07, + "loss": 1.286, + "step": 2418 + }, + { + "epoch": 2.350655145835869, + "grad_norm": 0.388671875, + "learning_rate": 5.517013835058404e-07, + "loss": 1.3036, + "step": 2419 + }, + { + "epoch": 2.351628737170906, + "grad_norm": 0.400390625, + "learning_rate": 5.50099825665408e-07, + "loss": 1.3215, + "step": 2420 + }, + { + "epoch": 2.352602328505943, + "grad_norm": 0.39453125, + "learning_rate": 5.485003083994649e-07, + "loss": 1.3113, + "step": 2421 + }, + { + "epoch": 2.35357591984098, + "grad_norm": 0.40234375, + "learning_rate": 5.469028333819077e-07, + "loss": 1.3337, + "step": 2422 + }, + { + "epoch": 2.354549511176017, + "grad_norm": 0.390625, + "learning_rate": 5.453074022844984e-07, + "loss": 1.2914, + "step": 2423 + }, + { + "epoch": 2.3555231025110546, + "grad_norm": 0.390625, + "learning_rate": 5.437140167768582e-07, + "loss": 1.2937, + "step": 2424 + }, + { + "epoch": 2.3564966938460916, + "grad_norm": 0.40625, + "learning_rate": 5.421226785264685e-07, + "loss": 1.3063, + "step": 2425 + }, + { + "epoch": 2.3574702851811287, + "grad_norm": 0.40234375, + "learning_rate": 5.405333891986672e-07, + "loss": 1.3183, + "step": 2426 + }, + { + "epoch": 2.3584438765161657, + "grad_norm": 0.3984375, + "learning_rate": 5.389461504566503e-07, + "loss": 1.3231, + "step": 2427 + }, + { + "epoch": 2.3594174678512028, + "grad_norm": 0.404296875, + "learning_rate": 5.37360963961465e-07, + "loss": 1.3191, + "step": 2428 + }, + { + "epoch": 2.36039105918624, + "grad_norm": 0.392578125, + "learning_rate": 5.35777831372013e-07, + "loss": 1.3077, + "step": 2429 + }, + { + "epoch": 2.361364650521277, + "grad_norm": 0.3984375, + "learning_rate": 5.341967543450452e-07, + "loss": 1.3377, + "step": 2430 + }, + { + "epoch": 2.362338241856314, + "grad_norm": 0.400390625, + "learning_rate": 5.326177345351627e-07, + "loss": 1.3204, + "step": 2431 + }, + { + "epoch": 2.3633118331913514, + "grad_norm": 0.41015625, + "learning_rate": 5.31040773594812e-07, + "loss": 1.3141, + "step": 2432 + }, + { + "epoch": 2.3642854245263885, + "grad_norm": 0.392578125, + "learning_rate": 5.294658731742869e-07, + "loss": 1.3197, + "step": 2433 + }, + { + "epoch": 2.3652590158614255, + "grad_norm": 0.38671875, + "learning_rate": 5.278930349217226e-07, + "loss": 1.3094, + "step": 2434 + }, + { + "epoch": 2.3662326071964626, + "grad_norm": 0.40234375, + "learning_rate": 5.263222604831001e-07, + "loss": 1.3223, + "step": 2435 + }, + { + "epoch": 2.3672061985314996, + "grad_norm": 0.404296875, + "learning_rate": 5.247535515022345e-07, + "loss": 1.3183, + "step": 2436 + }, + { + "epoch": 2.368179789866537, + "grad_norm": 0.40234375, + "learning_rate": 5.231869096207853e-07, + "loss": 1.3117, + "step": 2437 + }, + { + "epoch": 2.369153381201574, + "grad_norm": 0.38671875, + "learning_rate": 5.216223364782455e-07, + "loss": 1.3197, + "step": 2438 + }, + { + "epoch": 2.370126972536611, + "grad_norm": 0.388671875, + "learning_rate": 5.200598337119447e-07, + "loss": 1.3198, + "step": 2439 + }, + { + "epoch": 2.3711005638716482, + "grad_norm": 0.39453125, + "learning_rate": 5.184994029570443e-07, + "loss": 1.3035, + "step": 2440 + }, + { + "epoch": 2.3720741552066853, + "grad_norm": 0.390625, + "learning_rate": 5.169410458465393e-07, + "loss": 1.3379, + "step": 2441 + }, + { + "epoch": 2.3730477465417223, + "grad_norm": 0.392578125, + "learning_rate": 5.153847640112528e-07, + "loss": 1.3303, + "step": 2442 + }, + { + "epoch": 2.3740213378767594, + "grad_norm": 0.3984375, + "learning_rate": 5.138305590798376e-07, + "loss": 1.3176, + "step": 2443 + }, + { + "epoch": 2.3749949292117964, + "grad_norm": 0.388671875, + "learning_rate": 5.122784326787711e-07, + "loss": 1.3209, + "step": 2444 + }, + { + "epoch": 2.375968520546834, + "grad_norm": 0.392578125, + "learning_rate": 5.107283864323584e-07, + "loss": 1.2801, + "step": 2445 + }, + { + "epoch": 2.376942111881871, + "grad_norm": 0.3984375, + "learning_rate": 5.091804219627253e-07, + "loss": 1.3275, + "step": 2446 + }, + { + "epoch": 2.377915703216908, + "grad_norm": 0.404296875, + "learning_rate": 5.076345408898192e-07, + "loss": 1.3178, + "step": 2447 + }, + { + "epoch": 2.378889294551945, + "grad_norm": 0.396484375, + "learning_rate": 5.060907448314078e-07, + "loss": 1.3144, + "step": 2448 + }, + { + "epoch": 2.379862885886982, + "grad_norm": 0.404296875, + "learning_rate": 5.04549035403078e-07, + "loss": 1.3149, + "step": 2449 + }, + { + "epoch": 2.3808364772220196, + "grad_norm": 0.396484375, + "learning_rate": 5.030094142182301e-07, + "loss": 1.3017, + "step": 2450 + }, + { + "epoch": 2.3818100685570567, + "grad_norm": 0.390625, + "learning_rate": 5.014718828880827e-07, + "loss": 1.3011, + "step": 2451 + }, + { + "epoch": 2.3827836598920937, + "grad_norm": 0.396484375, + "learning_rate": 4.999364430216639e-07, + "loss": 1.3088, + "step": 2452 + }, + { + "epoch": 2.383757251227131, + "grad_norm": 0.400390625, + "learning_rate": 4.984030962258158e-07, + "loss": 1.3191, + "step": 2453 + }, + { + "epoch": 2.384730842562168, + "grad_norm": 0.39453125, + "learning_rate": 4.968718441051876e-07, + "loss": 1.3256, + "step": 2454 + }, + { + "epoch": 2.385704433897205, + "grad_norm": 0.390625, + "learning_rate": 4.953426882622392e-07, + "loss": 1.3182, + "step": 2455 + }, + { + "epoch": 2.386678025232242, + "grad_norm": 0.392578125, + "learning_rate": 4.938156302972338e-07, + "loss": 1.3034, + "step": 2456 + }, + { + "epoch": 2.387651616567279, + "grad_norm": 0.3984375, + "learning_rate": 4.922906718082431e-07, + "loss": 1.3164, + "step": 2457 + }, + { + "epoch": 2.3886252079023165, + "grad_norm": 0.40234375, + "learning_rate": 4.907678143911363e-07, + "loss": 1.324, + "step": 2458 + }, + { + "epoch": 2.3895987992373535, + "grad_norm": 0.412109375, + "learning_rate": 4.892470596395887e-07, + "loss": 1.3342, + "step": 2459 + }, + { + "epoch": 2.3905723905723906, + "grad_norm": 0.392578125, + "learning_rate": 4.877284091450718e-07, + "loss": 1.3079, + "step": 2460 + }, + { + "epoch": 2.3915459819074276, + "grad_norm": 0.3984375, + "learning_rate": 4.862118644968584e-07, + "loss": 1.3134, + "step": 2461 + }, + { + "epoch": 2.3925195732424647, + "grad_norm": 0.390625, + "learning_rate": 4.846974272820131e-07, + "loss": 1.285, + "step": 2462 + }, + { + "epoch": 2.393493164577502, + "grad_norm": 0.400390625, + "learning_rate": 4.831850990854e-07, + "loss": 1.3168, + "step": 2463 + }, + { + "epoch": 2.394466755912539, + "grad_norm": 0.390625, + "learning_rate": 4.816748814896716e-07, + "loss": 1.326, + "step": 2464 + }, + { + "epoch": 2.3954403472475763, + "grad_norm": 0.390625, + "learning_rate": 4.801667760752754e-07, + "loss": 1.2944, + "step": 2465 + }, + { + "epoch": 2.3964139385826133, + "grad_norm": 0.388671875, + "learning_rate": 4.786607844204449e-07, + "loss": 1.3209, + "step": 2466 + }, + { + "epoch": 2.3973875299176504, + "grad_norm": 0.400390625, + "learning_rate": 4.771569081012053e-07, + "loss": 1.3157, + "step": 2467 + }, + { + "epoch": 2.3983611212526874, + "grad_norm": 0.390625, + "learning_rate": 4.756551486913655e-07, + "loss": 1.3212, + "step": 2468 + }, + { + "epoch": 2.3993347125877245, + "grad_norm": 0.390625, + "learning_rate": 4.741555077625193e-07, + "loss": 1.2997, + "step": 2469 + }, + { + "epoch": 2.4003083039227615, + "grad_norm": 0.3984375, + "learning_rate": 4.726579868840439e-07, + "loss": 1.3039, + "step": 2470 + }, + { + "epoch": 2.401281895257799, + "grad_norm": 0.400390625, + "learning_rate": 4.711625876230988e-07, + "loss": 1.3264, + "step": 2471 + }, + { + "epoch": 2.402255486592836, + "grad_norm": 0.392578125, + "learning_rate": 4.696693115446216e-07, + "loss": 1.295, + "step": 2472 + }, + { + "epoch": 2.403229077927873, + "grad_norm": 0.404296875, + "learning_rate": 4.6817816021132916e-07, + "loss": 1.3141, + "step": 2473 + }, + { + "epoch": 2.40420266926291, + "grad_norm": 0.421875, + "learning_rate": 4.6668913518371413e-07, + "loss": 1.2946, + "step": 2474 + }, + { + "epoch": 2.405176260597947, + "grad_norm": 0.408203125, + "learning_rate": 4.6520223802004464e-07, + "loss": 1.3183, + "step": 2475 + }, + { + "epoch": 2.4061498519329847, + "grad_norm": 0.404296875, + "learning_rate": 4.637174702763608e-07, + "loss": 1.308, + "step": 2476 + }, + { + "epoch": 2.4071234432680217, + "grad_norm": 0.400390625, + "learning_rate": 4.622348335064761e-07, + "loss": 1.2958, + "step": 2477 + }, + { + "epoch": 2.408097034603059, + "grad_norm": 0.400390625, + "learning_rate": 4.607543292619726e-07, + "loss": 1.3342, + "step": 2478 + }, + { + "epoch": 2.409070625938096, + "grad_norm": 0.39453125, + "learning_rate": 4.592759590922005e-07, + "loss": 1.3052, + "step": 2479 + }, + { + "epoch": 2.410044217273133, + "grad_norm": 0.404296875, + "learning_rate": 4.5779972454427694e-07, + "loss": 1.3167, + "step": 2480 + }, + { + "epoch": 2.41101780860817, + "grad_norm": 0.44921875, + "learning_rate": 4.563256271630856e-07, + "loss": 1.2966, + "step": 2481 + }, + { + "epoch": 2.411991399943207, + "grad_norm": 0.396484375, + "learning_rate": 4.5485366849127105e-07, + "loss": 1.3211, + "step": 2482 + }, + { + "epoch": 2.412964991278244, + "grad_norm": 0.392578125, + "learning_rate": 4.5338385006924223e-07, + "loss": 1.3314, + "step": 2483 + }, + { + "epoch": 2.4139385826132815, + "grad_norm": 0.40625, + "learning_rate": 4.5191617343516596e-07, + "loss": 1.3233, + "step": 2484 + }, + { + "epoch": 2.4149121739483186, + "grad_norm": 0.3984375, + "learning_rate": 4.5045064012497003e-07, + "loss": 1.3321, + "step": 2485 + }, + { + "epoch": 2.4158857652833556, + "grad_norm": 0.3984375, + "learning_rate": 4.489872516723373e-07, + "loss": 1.3154, + "step": 2486 + }, + { + "epoch": 2.4168593566183927, + "grad_norm": 0.392578125, + "learning_rate": 4.475260096087064e-07, + "loss": 1.3271, + "step": 2487 + }, + { + "epoch": 2.4178329479534297, + "grad_norm": 0.400390625, + "learning_rate": 4.46066915463271e-07, + "loss": 1.3055, + "step": 2488 + }, + { + "epoch": 2.4188065392884672, + "grad_norm": 0.392578125, + "learning_rate": 4.4460997076297504e-07, + "loss": 1.3123, + "step": 2489 + }, + { + "epoch": 2.4197801306235043, + "grad_norm": 0.396484375, + "learning_rate": 4.4315517703251514e-07, + "loss": 1.3276, + "step": 2490 + }, + { + "epoch": 2.4207537219585413, + "grad_norm": 0.390625, + "learning_rate": 4.417025357943355e-07, + "loss": 1.3138, + "step": 2491 + }, + { + "epoch": 2.4217273132935784, + "grad_norm": 0.38671875, + "learning_rate": 4.402520485686276e-07, + "loss": 1.2835, + "step": 2492 + }, + { + "epoch": 2.4227009046286154, + "grad_norm": 0.392578125, + "learning_rate": 4.3880371687333027e-07, + "loss": 1.3218, + "step": 2493 + }, + { + "epoch": 2.4236744959636525, + "grad_norm": 0.39453125, + "learning_rate": 4.3735754222412494e-07, + "loss": 1.3177, + "step": 2494 + }, + { + "epoch": 2.4246480872986895, + "grad_norm": 0.392578125, + "learning_rate": 4.3591352613443606e-07, + "loss": 1.3138, + "step": 2495 + }, + { + "epoch": 2.425621678633727, + "grad_norm": 0.39453125, + "learning_rate": 4.3447167011543e-07, + "loss": 1.3114, + "step": 2496 + }, + { + "epoch": 2.426595269968764, + "grad_norm": 0.392578125, + "learning_rate": 4.3303197567601125e-07, + "loss": 1.3237, + "step": 2497 + }, + { + "epoch": 2.427568861303801, + "grad_norm": 0.40234375, + "learning_rate": 4.3159444432282426e-07, + "loss": 1.3405, + "step": 2498 + }, + { + "epoch": 2.428542452638838, + "grad_norm": 0.390625, + "learning_rate": 4.3015907756024683e-07, + "loss": 1.3163, + "step": 2499 + }, + { + "epoch": 2.429516043973875, + "grad_norm": 0.396484375, + "learning_rate": 4.2872587689039486e-07, + "loss": 1.3104, + "step": 2500 + }, + { + "epoch": 2.4304896353089123, + "grad_norm": 0.396484375, + "learning_rate": 4.272948438131144e-07, + "loss": 1.2947, + "step": 2501 + }, + { + "epoch": 2.4314632266439498, + "grad_norm": 0.408203125, + "learning_rate": 4.2586597982598536e-07, + "loss": 1.3231, + "step": 2502 + }, + { + "epoch": 2.432436817978987, + "grad_norm": 0.400390625, + "learning_rate": 4.244392864243169e-07, + "loss": 1.3201, + "step": 2503 + }, + { + "epoch": 2.433410409314024, + "grad_norm": 0.388671875, + "learning_rate": 4.230147651011457e-07, + "loss": 1.3002, + "step": 2504 + }, + { + "epoch": 2.434384000649061, + "grad_norm": 0.400390625, + "learning_rate": 4.215924173472363e-07, + "loss": 1.3024, + "step": 2505 + }, + { + "epoch": 2.435357591984098, + "grad_norm": 0.38671875, + "learning_rate": 4.2017224465107947e-07, + "loss": 1.2984, + "step": 2506 + }, + { + "epoch": 2.436331183319135, + "grad_norm": 0.39453125, + "learning_rate": 4.187542484988874e-07, + "loss": 1.3176, + "step": 2507 + }, + { + "epoch": 2.437304774654172, + "grad_norm": 0.392578125, + "learning_rate": 4.1733843037459754e-07, + "loss": 1.3094, + "step": 2508 + }, + { + "epoch": 2.4382783659892096, + "grad_norm": 0.392578125, + "learning_rate": 4.1592479175986494e-07, + "loss": 1.2799, + "step": 2509 + }, + { + "epoch": 2.4392519573242466, + "grad_norm": 0.396484375, + "learning_rate": 4.145133341340665e-07, + "loss": 1.303, + "step": 2510 + }, + { + "epoch": 2.4402255486592836, + "grad_norm": 0.400390625, + "learning_rate": 4.1310405897429463e-07, + "loss": 1.3046, + "step": 2511 + }, + { + "epoch": 2.4411991399943207, + "grad_norm": 0.392578125, + "learning_rate": 4.1169696775535923e-07, + "loss": 1.3219, + "step": 2512 + }, + { + "epoch": 2.4421727313293577, + "grad_norm": 0.39453125, + "learning_rate": 4.102920619497841e-07, + "loss": 1.321, + "step": 2513 + }, + { + "epoch": 2.443146322664395, + "grad_norm": 0.40234375, + "learning_rate": 4.088893430278057e-07, + "loss": 1.3411, + "step": 2514 + }, + { + "epoch": 2.4441199139994323, + "grad_norm": 0.38671875, + "learning_rate": 4.0748881245737173e-07, + "loss": 1.2986, + "step": 2515 + }, + { + "epoch": 2.4450935053344693, + "grad_norm": 0.396484375, + "learning_rate": 4.060904717041417e-07, + "loss": 1.3207, + "step": 2516 + }, + { + "epoch": 2.4460670966695064, + "grad_norm": 0.39453125, + "learning_rate": 4.046943222314803e-07, + "loss": 1.3151, + "step": 2517 + }, + { + "epoch": 2.4470406880045434, + "grad_norm": 0.3984375, + "learning_rate": 4.033003655004622e-07, + "loss": 1.3021, + "step": 2518 + }, + { + "epoch": 2.4480142793395805, + "grad_norm": 0.400390625, + "learning_rate": 4.019086029698649e-07, + "loss": 1.3171, + "step": 2519 + }, + { + "epoch": 2.4489878706746175, + "grad_norm": 0.400390625, + "learning_rate": 4.0051903609617195e-07, + "loss": 1.3259, + "step": 2520 + }, + { + "epoch": 2.4499614620096546, + "grad_norm": 0.39453125, + "learning_rate": 3.9913166633356645e-07, + "loss": 1.3218, + "step": 2521 + }, + { + "epoch": 2.450935053344692, + "grad_norm": 0.392578125, + "learning_rate": 3.977464951339352e-07, + "loss": 1.3049, + "step": 2522 + }, + { + "epoch": 2.451908644679729, + "grad_norm": 0.396484375, + "learning_rate": 3.9636352394686134e-07, + "loss": 1.3084, + "step": 2523 + }, + { + "epoch": 2.452882236014766, + "grad_norm": 0.404296875, + "learning_rate": 3.9498275421962854e-07, + "loss": 1.3172, + "step": 2524 + }, + { + "epoch": 2.4538558273498032, + "grad_norm": 0.39453125, + "learning_rate": 3.9360418739721295e-07, + "loss": 1.3097, + "step": 2525 + }, + { + "epoch": 2.4548294186848403, + "grad_norm": 0.396484375, + "learning_rate": 3.922278249222894e-07, + "loss": 1.3161, + "step": 2526 + }, + { + "epoch": 2.4558030100198773, + "grad_norm": 0.3984375, + "learning_rate": 3.908536682352229e-07, + "loss": 1.3021, + "step": 2527 + }, + { + "epoch": 2.456776601354915, + "grad_norm": 0.3984375, + "learning_rate": 3.8948171877407207e-07, + "loss": 1.3062, + "step": 2528 + }, + { + "epoch": 2.457750192689952, + "grad_norm": 0.3984375, + "learning_rate": 3.88111977974584e-07, + "loss": 1.3074, + "step": 2529 + }, + { + "epoch": 2.458723784024989, + "grad_norm": 0.396484375, + "learning_rate": 3.867444472701959e-07, + "loss": 1.3241, + "step": 2530 + }, + { + "epoch": 2.459697375360026, + "grad_norm": 0.3984375, + "learning_rate": 3.8537912809203075e-07, + "loss": 1.2956, + "step": 2531 + }, + { + "epoch": 2.460670966695063, + "grad_norm": 0.4140625, + "learning_rate": 3.8401602186889904e-07, + "loss": 1.3255, + "step": 2532 + }, + { + "epoch": 2.4616445580301, + "grad_norm": 0.396484375, + "learning_rate": 3.826551300272924e-07, + "loss": 1.3229, + "step": 2533 + }, + { + "epoch": 2.462618149365137, + "grad_norm": 0.3984375, + "learning_rate": 3.812964539913888e-07, + "loss": 1.294, + "step": 2534 + }, + { + "epoch": 2.4635917407001746, + "grad_norm": 0.41015625, + "learning_rate": 3.7993999518304433e-07, + "loss": 1.3159, + "step": 2535 + }, + { + "epoch": 2.4645653320352117, + "grad_norm": 0.39453125, + "learning_rate": 3.7858575502179613e-07, + "loss": 1.3169, + "step": 2536 + }, + { + "epoch": 2.4655389233702487, + "grad_norm": 0.3984375, + "learning_rate": 3.772337349248589e-07, + "loss": 1.3146, + "step": 2537 + }, + { + "epoch": 2.4665125147052858, + "grad_norm": 0.39453125, + "learning_rate": 3.7588393630712513e-07, + "loss": 1.3161, + "step": 2538 + }, + { + "epoch": 2.467486106040323, + "grad_norm": 0.396484375, + "learning_rate": 3.745363605811611e-07, + "loss": 1.3002, + "step": 2539 + }, + { + "epoch": 2.46845969737536, + "grad_norm": 0.39453125, + "learning_rate": 3.731910091572083e-07, + "loss": 1.2935, + "step": 2540 + }, + { + "epoch": 2.4694332887103974, + "grad_norm": 0.390625, + "learning_rate": 3.718478834431788e-07, + "loss": 1.3054, + "step": 2541 + }, + { + "epoch": 2.4704068800454344, + "grad_norm": 0.39453125, + "learning_rate": 3.7050698484465767e-07, + "loss": 1.3202, + "step": 2542 + }, + { + "epoch": 2.4713804713804715, + "grad_norm": 0.40234375, + "learning_rate": 3.691683147648964e-07, + "loss": 1.3242, + "step": 2543 + }, + { + "epoch": 2.4723540627155085, + "grad_norm": 0.392578125, + "learning_rate": 3.6783187460481763e-07, + "loss": 1.3024, + "step": 2544 + }, + { + "epoch": 2.4733276540505456, + "grad_norm": 0.396484375, + "learning_rate": 3.6649766576300707e-07, + "loss": 1.3246, + "step": 2545 + }, + { + "epoch": 2.4743012453855826, + "grad_norm": 0.396484375, + "learning_rate": 3.651656896357189e-07, + "loss": 1.3226, + "step": 2546 + }, + { + "epoch": 2.4752748367206197, + "grad_norm": 0.400390625, + "learning_rate": 3.638359476168668e-07, + "loss": 1.305, + "step": 2547 + }, + { + "epoch": 2.476248428055657, + "grad_norm": 0.39453125, + "learning_rate": 3.625084410980298e-07, + "loss": 1.3425, + "step": 2548 + }, + { + "epoch": 2.477222019390694, + "grad_norm": 0.388671875, + "learning_rate": 3.6118317146844524e-07, + "loss": 1.309, + "step": 2549 + }, + { + "epoch": 2.4781956107257312, + "grad_norm": 0.400390625, + "learning_rate": 3.5986014011501175e-07, + "loss": 1.3193, + "step": 2550 + }, + { + "epoch": 2.4791692020607683, + "grad_norm": 0.404296875, + "learning_rate": 3.585393484222829e-07, + "loss": 1.3187, + "step": 2551 + }, + { + "epoch": 2.4801427933958053, + "grad_norm": 0.39453125, + "learning_rate": 3.572207977724709e-07, + "loss": 1.3145, + "step": 2552 + }, + { + "epoch": 2.4811163847308424, + "grad_norm": 0.40625, + "learning_rate": 3.559044895454411e-07, + "loss": 1.3195, + "step": 2553 + }, + { + "epoch": 2.48208997606588, + "grad_norm": 0.39453125, + "learning_rate": 3.545904251187135e-07, + "loss": 1.3314, + "step": 2554 + }, + { + "epoch": 2.483063567400917, + "grad_norm": 0.388671875, + "learning_rate": 3.532786058674581e-07, + "loss": 1.2886, + "step": 2555 + }, + { + "epoch": 2.484037158735954, + "grad_norm": 0.3828125, + "learning_rate": 3.519690331644973e-07, + "loss": 1.3003, + "step": 2556 + }, + { + "epoch": 2.485010750070991, + "grad_norm": 0.40234375, + "learning_rate": 3.5066170838030153e-07, + "loss": 1.3048, + "step": 2557 + }, + { + "epoch": 2.485984341406028, + "grad_norm": 0.404296875, + "learning_rate": 3.493566328829884e-07, + "loss": 1.3262, + "step": 2558 + }, + { + "epoch": 2.486957932741065, + "grad_norm": 0.392578125, + "learning_rate": 3.4805380803832167e-07, + "loss": 1.3179, + "step": 2559 + }, + { + "epoch": 2.487931524076102, + "grad_norm": 0.404296875, + "learning_rate": 3.4675323520971137e-07, + "loss": 1.3123, + "step": 2560 + }, + { + "epoch": 2.4889051154111397, + "grad_norm": 0.38671875, + "learning_rate": 3.454549157582082e-07, + "loss": 1.3129, + "step": 2561 + }, + { + "epoch": 2.4898787067461767, + "grad_norm": 0.392578125, + "learning_rate": 3.4415885104250787e-07, + "loss": 1.3066, + "step": 2562 + }, + { + "epoch": 2.490852298081214, + "grad_norm": 0.400390625, + "learning_rate": 3.4286504241894283e-07, + "loss": 1.3166, + "step": 2563 + }, + { + "epoch": 2.491825889416251, + "grad_norm": 0.400390625, + "learning_rate": 3.415734912414878e-07, + "loss": 1.3, + "step": 2564 + }, + { + "epoch": 2.492799480751288, + "grad_norm": 0.39453125, + "learning_rate": 3.4028419886175306e-07, + "loss": 1.3168, + "step": 2565 + }, + { + "epoch": 2.493773072086325, + "grad_norm": 0.3984375, + "learning_rate": 3.3899716662898623e-07, + "loss": 1.2988, + "step": 2566 + }, + { + "epoch": 2.4947466634213624, + "grad_norm": 0.400390625, + "learning_rate": 3.377123958900688e-07, + "loss": 1.314, + "step": 2567 + }, + { + "epoch": 2.4957202547563995, + "grad_norm": 0.392578125, + "learning_rate": 3.3642988798951587e-07, + "loss": 1.3127, + "step": 2568 + }, + { + "epoch": 2.4966938460914365, + "grad_norm": 0.412109375, + "learning_rate": 3.3514964426947433e-07, + "loss": 1.3219, + "step": 2569 + }, + { + "epoch": 2.4976674374264736, + "grad_norm": 0.3984375, + "learning_rate": 3.338716660697225e-07, + "loss": 1.3277, + "step": 2570 + }, + { + "epoch": 2.4986410287615106, + "grad_norm": 0.396484375, + "learning_rate": 3.3259595472766635e-07, + "loss": 1.3038, + "step": 2571 + }, + { + "epoch": 2.4996146200965477, + "grad_norm": 0.40625, + "learning_rate": 3.3132251157834137e-07, + "loss": 1.318, + "step": 2572 + }, + { + "epoch": 2.5005882114315847, + "grad_norm": 0.400390625, + "learning_rate": 3.3005133795440734e-07, + "loss": 1.315, + "step": 2573 + }, + { + "epoch": 2.5015618027666218, + "grad_norm": 0.388671875, + "learning_rate": 3.2878243518615127e-07, + "loss": 1.3067, + "step": 2574 + }, + { + "epoch": 2.5025353941016593, + "grad_norm": 0.400390625, + "learning_rate": 3.2751580460148094e-07, + "loss": 1.311, + "step": 2575 + }, + { + "epoch": 2.5035089854366963, + "grad_norm": 0.40234375, + "learning_rate": 3.262514475259296e-07, + "loss": 1.3077, + "step": 2576 + }, + { + "epoch": 2.5044825767717334, + "grad_norm": 0.384765625, + "learning_rate": 3.249893652826483e-07, + "loss": 1.3187, + "step": 2577 + }, + { + "epoch": 2.5054561681067704, + "grad_norm": 0.396484375, + "learning_rate": 3.2372955919240834e-07, + "loss": 1.3228, + "step": 2578 + }, + { + "epoch": 2.506429759441808, + "grad_norm": 0.396484375, + "learning_rate": 3.2247203057360066e-07, + "loss": 1.3064, + "step": 2579 + }, + { + "epoch": 2.507403350776845, + "grad_norm": 0.408203125, + "learning_rate": 3.212167807422306e-07, + "loss": 1.2972, + "step": 2580 + }, + { + "epoch": 2.508376942111882, + "grad_norm": 0.3984375, + "learning_rate": 3.1996381101191936e-07, + "loss": 1.3273, + "step": 2581 + }, + { + "epoch": 2.509350533446919, + "grad_norm": 0.416015625, + "learning_rate": 3.187131226939036e-07, + "loss": 1.3228, + "step": 2582 + }, + { + "epoch": 2.510324124781956, + "grad_norm": 0.396484375, + "learning_rate": 3.1746471709702963e-07, + "loss": 1.3191, + "step": 2583 + }, + { + "epoch": 2.511297716116993, + "grad_norm": 0.39453125, + "learning_rate": 3.1621859552775774e-07, + "loss": 1.2978, + "step": 2584 + }, + { + "epoch": 2.51227130745203, + "grad_norm": 0.3984375, + "learning_rate": 3.1497475929015614e-07, + "loss": 1.307, + "step": 2585 + }, + { + "epoch": 2.5132448987870672, + "grad_norm": 0.40234375, + "learning_rate": 3.137332096859014e-07, + "loss": 1.3124, + "step": 2586 + }, + { + "epoch": 2.5142184901221047, + "grad_norm": 0.408203125, + "learning_rate": 3.124939480142786e-07, + "loss": 1.3189, + "step": 2587 + }, + { + "epoch": 2.515192081457142, + "grad_norm": 0.392578125, + "learning_rate": 3.11256975572177e-07, + "loss": 1.327, + "step": 2588 + }, + { + "epoch": 2.516165672792179, + "grad_norm": 0.390625, + "learning_rate": 3.100222936540914e-07, + "loss": 1.322, + "step": 2589 + }, + { + "epoch": 2.517139264127216, + "grad_norm": 0.392578125, + "learning_rate": 3.0878990355211886e-07, + "loss": 1.316, + "step": 2590 + }, + { + "epoch": 2.518112855462253, + "grad_norm": 0.3984375, + "learning_rate": 3.075598065559571e-07, + "loss": 1.3309, + "step": 2591 + }, + { + "epoch": 2.5190864467972904, + "grad_norm": 0.392578125, + "learning_rate": 3.063320039529064e-07, + "loss": 1.2845, + "step": 2592 + }, + { + "epoch": 2.5200600381323275, + "grad_norm": 0.40234375, + "learning_rate": 3.051064970278633e-07, + "loss": 1.3211, + "step": 2593 + }, + { + "epoch": 2.5210336294673645, + "grad_norm": 0.396484375, + "learning_rate": 3.038832870633249e-07, + "loss": 1.3163, + "step": 2594 + }, + { + "epoch": 2.5220072208024016, + "grad_norm": 0.39453125, + "learning_rate": 3.0266237533938204e-07, + "loss": 1.3347, + "step": 2595 + }, + { + "epoch": 2.5229808121374386, + "grad_norm": 0.392578125, + "learning_rate": 3.014437631337211e-07, + "loss": 1.3153, + "step": 2596 + }, + { + "epoch": 2.5239544034724757, + "grad_norm": 0.396484375, + "learning_rate": 3.002274517216228e-07, + "loss": 1.3202, + "step": 2597 + }, + { + "epoch": 2.5249279948075127, + "grad_norm": 0.388671875, + "learning_rate": 2.9901344237595856e-07, + "loss": 1.2834, + "step": 2598 + }, + { + "epoch": 2.52590158614255, + "grad_norm": 0.3984375, + "learning_rate": 2.978017363671931e-07, + "loss": 1.3174, + "step": 2599 + }, + { + "epoch": 2.5268751774775873, + "grad_norm": 0.400390625, + "learning_rate": 2.965923349633779e-07, + "loss": 1.3101, + "step": 2600 + }, + { + "epoch": 2.5278487688126243, + "grad_norm": 0.396484375, + "learning_rate": 2.9538523943015455e-07, + "loss": 1.2955, + "step": 2601 + }, + { + "epoch": 2.5288223601476614, + "grad_norm": 0.40234375, + "learning_rate": 2.9418045103075137e-07, + "loss": 1.3198, + "step": 2602 + }, + { + "epoch": 2.5297959514826984, + "grad_norm": 0.400390625, + "learning_rate": 2.929779710259811e-07, + "loss": 1.2972, + "step": 2603 + }, + { + "epoch": 2.5307695428177355, + "grad_norm": 0.41015625, + "learning_rate": 2.917778006742414e-07, + "loss": 1.3272, + "step": 2604 + }, + { + "epoch": 2.531743134152773, + "grad_norm": 0.39453125, + "learning_rate": 2.905799412315141e-07, + "loss": 1.306, + "step": 2605 + }, + { + "epoch": 2.53271672548781, + "grad_norm": 0.3984375, + "learning_rate": 2.8938439395135995e-07, + "loss": 1.322, + "step": 2606 + }, + { + "epoch": 2.533690316822847, + "grad_norm": 0.39453125, + "learning_rate": 2.8819116008492327e-07, + "loss": 1.3134, + "step": 2607 + }, + { + "epoch": 2.534663908157884, + "grad_norm": 0.396484375, + "learning_rate": 2.8700024088092415e-07, + "loss": 1.2964, + "step": 2608 + }, + { + "epoch": 2.535637499492921, + "grad_norm": 0.396484375, + "learning_rate": 2.8581163758566346e-07, + "loss": 1.3143, + "step": 2609 + }, + { + "epoch": 2.536611090827958, + "grad_norm": 0.400390625, + "learning_rate": 2.8462535144301554e-07, + "loss": 1.3134, + "step": 2610 + }, + { + "epoch": 2.5375846821629953, + "grad_norm": 0.408203125, + "learning_rate": 2.834413836944325e-07, + "loss": 1.3109, + "step": 2611 + }, + { + "epoch": 2.5385582734980323, + "grad_norm": 0.404296875, + "learning_rate": 2.822597355789383e-07, + "loss": 1.2995, + "step": 2612 + }, + { + "epoch": 2.53953186483307, + "grad_norm": 0.3984375, + "learning_rate": 2.8108040833313035e-07, + "loss": 1.3202, + "step": 2613 + }, + { + "epoch": 2.540505456168107, + "grad_norm": 0.390625, + "learning_rate": 2.799034031911765e-07, + "loss": 1.3064, + "step": 2614 + }, + { + "epoch": 2.541479047503144, + "grad_norm": 0.40234375, + "learning_rate": 2.7872872138481557e-07, + "loss": 1.3254, + "step": 2615 + }, + { + "epoch": 2.542452638838181, + "grad_norm": 0.40234375, + "learning_rate": 2.77556364143354e-07, + "loss": 1.3129, + "step": 2616 + }, + { + "epoch": 2.543426230173218, + "grad_norm": 0.3984375, + "learning_rate": 2.7638633269366666e-07, + "loss": 1.3286, + "step": 2617 + }, + { + "epoch": 2.5443998215082555, + "grad_norm": 0.396484375, + "learning_rate": 2.7521862826019317e-07, + "loss": 1.3266, + "step": 2618 + }, + { + "epoch": 2.5453734128432925, + "grad_norm": 0.38671875, + "learning_rate": 2.7405325206493914e-07, + "loss": 1.2944, + "step": 2619 + }, + { + "epoch": 2.5463470041783296, + "grad_norm": 0.3984375, + "learning_rate": 2.7289020532747263e-07, + "loss": 1.3001, + "step": 2620 + }, + { + "epoch": 2.5473205955133666, + "grad_norm": 0.39453125, + "learning_rate": 2.7172948926492497e-07, + "loss": 1.3071, + "step": 2621 + }, + { + "epoch": 2.5482941868484037, + "grad_norm": 0.39453125, + "learning_rate": 2.705711050919871e-07, + "loss": 1.3266, + "step": 2622 + }, + { + "epoch": 2.5492677781834407, + "grad_norm": 0.392578125, + "learning_rate": 2.694150540209117e-07, + "loss": 1.3262, + "step": 2623 + }, + { + "epoch": 2.550241369518478, + "grad_norm": 0.396484375, + "learning_rate": 2.68261337261507e-07, + "loss": 1.3199, + "step": 2624 + }, + { + "epoch": 2.551214960853515, + "grad_norm": 0.40625, + "learning_rate": 2.67109956021141e-07, + "loss": 1.3013, + "step": 2625 + }, + { + "epoch": 2.5521885521885523, + "grad_norm": 0.39453125, + "learning_rate": 2.659609115047354e-07, + "loss": 1.3188, + "step": 2626 + }, + { + "epoch": 2.5531621435235894, + "grad_norm": 0.396484375, + "learning_rate": 2.648142049147692e-07, + "loss": 1.3065, + "step": 2627 + }, + { + "epoch": 2.5541357348586264, + "grad_norm": 0.390625, + "learning_rate": 2.6366983745127197e-07, + "loss": 1.3006, + "step": 2628 + }, + { + "epoch": 2.5551093261936635, + "grad_norm": 0.40625, + "learning_rate": 2.62527810311827e-07, + "loss": 1.3011, + "step": 2629 + }, + { + "epoch": 2.5560829175287005, + "grad_norm": 0.390625, + "learning_rate": 2.6138812469156784e-07, + "loss": 1.2994, + "step": 2630 + }, + { + "epoch": 2.557056508863738, + "grad_norm": 0.408203125, + "learning_rate": 2.602507817831784e-07, + "loss": 1.3111, + "step": 2631 + }, + { + "epoch": 2.558030100198775, + "grad_norm": 0.39453125, + "learning_rate": 2.591157827768892e-07, + "loss": 1.3137, + "step": 2632 + }, + { + "epoch": 2.559003691533812, + "grad_norm": 0.384765625, + "learning_rate": 2.5798312886048034e-07, + "loss": 1.3057, + "step": 2633 + }, + { + "epoch": 2.559977282868849, + "grad_norm": 0.396484375, + "learning_rate": 2.568528212192756e-07, + "loss": 1.3158, + "step": 2634 + }, + { + "epoch": 2.5609508742038862, + "grad_norm": 0.40234375, + "learning_rate": 2.557248610361443e-07, + "loss": 1.3071, + "step": 2635 + }, + { + "epoch": 2.5619244655389233, + "grad_norm": 0.40234375, + "learning_rate": 2.5459924949149896e-07, + "loss": 1.3166, + "step": 2636 + }, + { + "epoch": 2.5628980568739603, + "grad_norm": 0.390625, + "learning_rate": 2.53475987763295e-07, + "loss": 1.312, + "step": 2637 + }, + { + "epoch": 2.5638716482089974, + "grad_norm": 0.390625, + "learning_rate": 2.523550770270269e-07, + "loss": 1.2926, + "step": 2638 + }, + { + "epoch": 2.564845239544035, + "grad_norm": 0.404296875, + "learning_rate": 2.512365184557314e-07, + "loss": 1.3246, + "step": 2639 + }, + { + "epoch": 2.565818830879072, + "grad_norm": 0.390625, + "learning_rate": 2.5012031321998117e-07, + "loss": 1.3306, + "step": 2640 + }, + { + "epoch": 2.566792422214109, + "grad_norm": 0.38671875, + "learning_rate": 2.4900646248788823e-07, + "loss": 1.3108, + "step": 2641 + }, + { + "epoch": 2.567766013549146, + "grad_norm": 0.390625, + "learning_rate": 2.4789496742509883e-07, + "loss": 1.3182, + "step": 2642 + }, + { + "epoch": 2.568739604884183, + "grad_norm": 0.39453125, + "learning_rate": 2.4678582919479557e-07, + "loss": 1.3009, + "step": 2643 + }, + { + "epoch": 2.5697131962192206, + "grad_norm": 0.404296875, + "learning_rate": 2.4567904895769267e-07, + "loss": 1.3197, + "step": 2644 + }, + { + "epoch": 2.5706867875542576, + "grad_norm": 0.400390625, + "learning_rate": 2.445746278720398e-07, + "loss": 1.3177, + "step": 2645 + }, + { + "epoch": 2.5716603788892947, + "grad_norm": 0.38671875, + "learning_rate": 2.434725670936139e-07, + "loss": 1.3048, + "step": 2646 + }, + { + "epoch": 2.5726339702243317, + "grad_norm": 0.392578125, + "learning_rate": 2.423728677757248e-07, + "loss": 1.3191, + "step": 2647 + }, + { + "epoch": 2.5736075615593688, + "grad_norm": 0.3984375, + "learning_rate": 2.412755310692097e-07, + "loss": 1.3201, + "step": 2648 + }, + { + "epoch": 2.574581152894406, + "grad_norm": 0.400390625, + "learning_rate": 2.4018055812243394e-07, + "loss": 1.3176, + "step": 2649 + }, + { + "epoch": 2.575554744229443, + "grad_norm": 0.404296875, + "learning_rate": 2.3908795008128873e-07, + "loss": 1.3146, + "step": 2650 + }, + { + "epoch": 2.57652833556448, + "grad_norm": 0.3984375, + "learning_rate": 2.3799770808919126e-07, + "loss": 1.3261, + "step": 2651 + }, + { + "epoch": 2.5775019268995174, + "grad_norm": 0.3984375, + "learning_rate": 2.369098332870809e-07, + "loss": 1.3223, + "step": 2652 + }, + { + "epoch": 2.5784755182345545, + "grad_norm": 0.400390625, + "learning_rate": 2.3582432681342194e-07, + "loss": 1.3249, + "step": 2653 + }, + { + "epoch": 2.5794491095695915, + "grad_norm": 0.388671875, + "learning_rate": 2.347411898041979e-07, + "loss": 1.3046, + "step": 2654 + }, + { + "epoch": 2.5804227009046286, + "grad_norm": 0.390625, + "learning_rate": 2.3366042339291517e-07, + "loss": 1.3086, + "step": 2655 + }, + { + "epoch": 2.5813962922396656, + "grad_norm": 0.392578125, + "learning_rate": 2.325820287105973e-07, + "loss": 1.2968, + "step": 2656 + }, + { + "epoch": 2.582369883574703, + "grad_norm": 0.4453125, + "learning_rate": 2.3150600688578661e-07, + "loss": 1.302, + "step": 2657 + }, + { + "epoch": 2.58334347490974, + "grad_norm": 0.3984375, + "learning_rate": 2.3043235904454148e-07, + "loss": 1.3028, + "step": 2658 + }, + { + "epoch": 2.584317066244777, + "grad_norm": 0.388671875, + "learning_rate": 2.2936108631043785e-07, + "loss": 1.3239, + "step": 2659 + }, + { + "epoch": 2.5852906575798142, + "grad_norm": 0.39453125, + "learning_rate": 2.2829218980456342e-07, + "loss": 1.3075, + "step": 2660 + }, + { + "epoch": 2.5862642489148513, + "grad_norm": 0.400390625, + "learning_rate": 2.272256706455217e-07, + "loss": 1.3313, + "step": 2661 + }, + { + "epoch": 2.5872378402498883, + "grad_norm": 0.388671875, + "learning_rate": 2.261615299494263e-07, + "loss": 1.3042, + "step": 2662 + }, + { + "epoch": 2.5882114315849254, + "grad_norm": 0.404296875, + "learning_rate": 2.2509976882990364e-07, + "loss": 1.3205, + "step": 2663 + }, + { + "epoch": 2.5891850229199624, + "grad_norm": 0.41015625, + "learning_rate": 2.2404038839808766e-07, + "loss": 1.3346, + "step": 2664 + }, + { + "epoch": 2.590158614255, + "grad_norm": 0.40234375, + "learning_rate": 2.2298338976262324e-07, + "loss": 1.3188, + "step": 2665 + }, + { + "epoch": 2.591132205590037, + "grad_norm": 0.400390625, + "learning_rate": 2.219287740296605e-07, + "loss": 1.3062, + "step": 2666 + }, + { + "epoch": 2.592105796925074, + "grad_norm": 0.39453125, + "learning_rate": 2.208765423028586e-07, + "loss": 1.3295, + "step": 2667 + }, + { + "epoch": 2.593079388260111, + "grad_norm": 0.3984375, + "learning_rate": 2.1982669568337806e-07, + "loss": 1.3132, + "step": 2668 + }, + { + "epoch": 2.594052979595148, + "grad_norm": 0.40234375, + "learning_rate": 2.18779235269887e-07, + "loss": 1.3178, + "step": 2669 + }, + { + "epoch": 2.5950265709301856, + "grad_norm": 0.396484375, + "learning_rate": 2.1773416215855407e-07, + "loss": 1.3137, + "step": 2670 + }, + { + "epoch": 2.5960001622652227, + "grad_norm": 0.39453125, + "learning_rate": 2.166914774430512e-07, + "loss": 1.3098, + "step": 2671 + }, + { + "epoch": 2.5969737536002597, + "grad_norm": 0.400390625, + "learning_rate": 2.1565118221454905e-07, + "loss": 1.3295, + "step": 2672 + }, + { + "epoch": 2.5979473449352968, + "grad_norm": 0.400390625, + "learning_rate": 2.146132775617199e-07, + "loss": 1.3178, + "step": 2673 + }, + { + "epoch": 2.598920936270334, + "grad_norm": 0.390625, + "learning_rate": 2.135777645707318e-07, + "loss": 1.3179, + "step": 2674 + }, + { + "epoch": 2.599894527605371, + "grad_norm": 0.396484375, + "learning_rate": 2.1254464432525214e-07, + "loss": 1.3161, + "step": 2675 + }, + { + "epoch": 2.600868118940408, + "grad_norm": 0.400390625, + "learning_rate": 2.1151391790644322e-07, + "loss": 1.3216, + "step": 2676 + }, + { + "epoch": 2.601841710275445, + "grad_norm": 0.40234375, + "learning_rate": 2.104855863929617e-07, + "loss": 1.3194, + "step": 2677 + }, + { + "epoch": 2.6028153016104825, + "grad_norm": 0.388671875, + "learning_rate": 2.094596508609595e-07, + "loss": 1.31, + "step": 2678 + }, + { + "epoch": 2.6037888929455195, + "grad_norm": 0.404296875, + "learning_rate": 2.0843611238407945e-07, + "loss": 1.3241, + "step": 2679 + }, + { + "epoch": 2.6047624842805566, + "grad_norm": 0.40234375, + "learning_rate": 2.0741497203345673e-07, + "loss": 1.3255, + "step": 2680 + }, + { + "epoch": 2.6057360756155936, + "grad_norm": 0.400390625, + "learning_rate": 2.063962308777176e-07, + "loss": 1.2954, + "step": 2681 + }, + { + "epoch": 2.6067096669506307, + "grad_norm": 0.3984375, + "learning_rate": 2.0537988998297565e-07, + "loss": 1.3256, + "step": 2682 + }, + { + "epoch": 2.607683258285668, + "grad_norm": 0.392578125, + "learning_rate": 2.0436595041283454e-07, + "loss": 1.3044, + "step": 2683 + }, + { + "epoch": 2.608656849620705, + "grad_norm": 0.396484375, + "learning_rate": 2.033544132283838e-07, + "loss": 1.327, + "step": 2684 + }, + { + "epoch": 2.6096304409557423, + "grad_norm": 0.3984375, + "learning_rate": 2.023452794881986e-07, + "loss": 1.3128, + "step": 2685 + }, + { + "epoch": 2.6106040322907793, + "grad_norm": 0.39453125, + "learning_rate": 2.013385502483406e-07, + "loss": 1.3102, + "step": 2686 + }, + { + "epoch": 2.6115776236258164, + "grad_norm": 0.388671875, + "learning_rate": 2.0033422656235258e-07, + "loss": 1.3093, + "step": 2687 + }, + { + "epoch": 2.6125512149608534, + "grad_norm": 0.396484375, + "learning_rate": 1.993323094812627e-07, + "loss": 1.3181, + "step": 2688 + }, + { + "epoch": 2.6135248062958905, + "grad_norm": 0.392578125, + "learning_rate": 1.9833280005357864e-07, + "loss": 1.3326, + "step": 2689 + }, + { + "epoch": 2.6144983976309275, + "grad_norm": 0.400390625, + "learning_rate": 1.973356993252884e-07, + "loss": 1.3082, + "step": 2690 + }, + { + "epoch": 2.615471988965965, + "grad_norm": 0.39453125, + "learning_rate": 1.963410083398609e-07, + "loss": 1.3226, + "step": 2691 + }, + { + "epoch": 2.616445580301002, + "grad_norm": 0.40234375, + "learning_rate": 1.9534872813824158e-07, + "loss": 1.3134, + "step": 2692 + }, + { + "epoch": 2.617419171636039, + "grad_norm": 0.388671875, + "learning_rate": 1.9435885975885443e-07, + "loss": 1.3277, + "step": 2693 + }, + { + "epoch": 2.618392762971076, + "grad_norm": 0.400390625, + "learning_rate": 1.9337140423759838e-07, + "loss": 1.3206, + "step": 2694 + }, + { + "epoch": 2.619366354306113, + "grad_norm": 0.388671875, + "learning_rate": 1.9238636260784675e-07, + "loss": 1.3258, + "step": 2695 + }, + { + "epoch": 2.6203399456411507, + "grad_norm": 0.392578125, + "learning_rate": 1.914037359004489e-07, + "loss": 1.3277, + "step": 2696 + }, + { + "epoch": 2.6213135369761877, + "grad_norm": 0.392578125, + "learning_rate": 1.9042352514372504e-07, + "loss": 1.3244, + "step": 2697 + }, + { + "epoch": 2.622287128311225, + "grad_norm": 0.39453125, + "learning_rate": 1.894457313634679e-07, + "loss": 1.296, + "step": 2698 + }, + { + "epoch": 2.623260719646262, + "grad_norm": 0.380859375, + "learning_rate": 1.8847035558294037e-07, + "loss": 1.3252, + "step": 2699 + }, + { + "epoch": 2.624234310981299, + "grad_norm": 0.392578125, + "learning_rate": 1.8749739882287566e-07, + "loss": 1.304, + "step": 2700 + }, + { + "epoch": 2.625207902316336, + "grad_norm": 0.388671875, + "learning_rate": 1.8652686210147485e-07, + "loss": 1.3261, + "step": 2701 + }, + { + "epoch": 2.626181493651373, + "grad_norm": 0.392578125, + "learning_rate": 1.8555874643440662e-07, + "loss": 1.3186, + "step": 2702 + }, + { + "epoch": 2.62715508498641, + "grad_norm": 0.38671875, + "learning_rate": 1.8459305283480528e-07, + "loss": 1.3255, + "step": 2703 + }, + { + "epoch": 2.6281286763214475, + "grad_norm": 0.390625, + "learning_rate": 1.8362978231327184e-07, + "loss": 1.3142, + "step": 2704 + }, + { + "epoch": 2.6291022676564846, + "grad_norm": 0.396484375, + "learning_rate": 1.826689358778705e-07, + "loss": 1.2996, + "step": 2705 + }, + { + "epoch": 2.6300758589915216, + "grad_norm": 0.388671875, + "learning_rate": 1.817105145341297e-07, + "loss": 1.3071, + "step": 2706 + }, + { + "epoch": 2.6310494503265587, + "grad_norm": 0.3984375, + "learning_rate": 1.807545192850385e-07, + "loss": 1.3153, + "step": 2707 + }, + { + "epoch": 2.6320230416615957, + "grad_norm": 0.384765625, + "learning_rate": 1.7980095113104835e-07, + "loss": 1.2973, + "step": 2708 + }, + { + "epoch": 2.6329966329966332, + "grad_norm": 0.400390625, + "learning_rate": 1.7884981107006981e-07, + "loss": 1.3153, + "step": 2709 + }, + { + "epoch": 2.6339702243316703, + "grad_norm": 0.396484375, + "learning_rate": 1.7790110009747368e-07, + "loss": 1.3058, + "step": 2710 + }, + { + "epoch": 2.6349438156667073, + "grad_norm": 0.396484375, + "learning_rate": 1.7695481920608716e-07, + "loss": 1.3462, + "step": 2711 + }, + { + "epoch": 2.6359174070017444, + "grad_norm": 0.390625, + "learning_rate": 1.7601096938619556e-07, + "loss": 1.2838, + "step": 2712 + }, + { + "epoch": 2.6368909983367814, + "grad_norm": 0.392578125, + "learning_rate": 1.7506955162553908e-07, + "loss": 1.2936, + "step": 2713 + }, + { + "epoch": 2.6378645896718185, + "grad_norm": 0.404296875, + "learning_rate": 1.7413056690931406e-07, + "loss": 1.3187, + "step": 2714 + }, + { + "epoch": 2.6388381810068555, + "grad_norm": 0.3984375, + "learning_rate": 1.7319401622016897e-07, + "loss": 1.3051, + "step": 2715 + }, + { + "epoch": 2.6398117723418926, + "grad_norm": 0.39453125, + "learning_rate": 1.7225990053820724e-07, + "loss": 1.3195, + "step": 2716 + }, + { + "epoch": 2.64078536367693, + "grad_norm": 0.396484375, + "learning_rate": 1.7132822084098165e-07, + "loss": 1.3136, + "step": 2717 + }, + { + "epoch": 2.641758955011967, + "grad_norm": 0.396484375, + "learning_rate": 1.7039897810349786e-07, + "loss": 1.3162, + "step": 2718 + }, + { + "epoch": 2.642732546347004, + "grad_norm": 0.400390625, + "learning_rate": 1.6947217329820937e-07, + "loss": 1.3118, + "step": 2719 + }, + { + "epoch": 2.643706137682041, + "grad_norm": 0.3984375, + "learning_rate": 1.6854780739502003e-07, + "loss": 1.3109, + "step": 2720 + }, + { + "epoch": 2.6446797290170783, + "grad_norm": 0.39453125, + "learning_rate": 1.6762588136127995e-07, + "loss": 1.3261, + "step": 2721 + }, + { + "epoch": 2.6456533203521158, + "grad_norm": 0.40234375, + "learning_rate": 1.6670639616178792e-07, + "loss": 1.3326, + "step": 2722 + }, + { + "epoch": 2.646626911687153, + "grad_norm": 0.396484375, + "learning_rate": 1.6578935275878533e-07, + "loss": 1.3104, + "step": 2723 + }, + { + "epoch": 2.64760050302219, + "grad_norm": 0.390625, + "learning_rate": 1.6487475211196118e-07, + "loss": 1.3226, + "step": 2724 + }, + { + "epoch": 2.648574094357227, + "grad_norm": 0.400390625, + "learning_rate": 1.6396259517844598e-07, + "loss": 1.3035, + "step": 2725 + }, + { + "epoch": 2.649547685692264, + "grad_norm": 0.39453125, + "learning_rate": 1.6305288291281474e-07, + "loss": 1.3106, + "step": 2726 + }, + { + "epoch": 2.650521277027301, + "grad_norm": 0.400390625, + "learning_rate": 1.6214561626708258e-07, + "loss": 1.3109, + "step": 2727 + }, + { + "epoch": 2.651494868362338, + "grad_norm": 0.388671875, + "learning_rate": 1.6124079619070614e-07, + "loss": 1.3198, + "step": 2728 + }, + { + "epoch": 2.652468459697375, + "grad_norm": 0.396484375, + "learning_rate": 1.6033842363058134e-07, + "loss": 1.3292, + "step": 2729 + }, + { + "epoch": 2.6534420510324126, + "grad_norm": 0.400390625, + "learning_rate": 1.594384995310433e-07, + "loss": 1.2973, + "step": 2730 + }, + { + "epoch": 2.6544156423674496, + "grad_norm": 0.396484375, + "learning_rate": 1.585410248338634e-07, + "loss": 1.3261, + "step": 2731 + }, + { + "epoch": 2.6553892337024867, + "grad_norm": 0.400390625, + "learning_rate": 1.5764600047825207e-07, + "loss": 1.3151, + "step": 2732 + }, + { + "epoch": 2.6563628250375237, + "grad_norm": 0.404296875, + "learning_rate": 1.567534274008531e-07, + "loss": 1.3178, + "step": 2733 + }, + { + "epoch": 2.657336416372561, + "grad_norm": 0.392578125, + "learning_rate": 1.5586330653574704e-07, + "loss": 1.3188, + "step": 2734 + }, + { + "epoch": 2.6583100077075983, + "grad_norm": 0.3984375, + "learning_rate": 1.5497563881444577e-07, + "loss": 1.3305, + "step": 2735 + }, + { + "epoch": 2.6592835990426353, + "grad_norm": 0.392578125, + "learning_rate": 1.5409042516589646e-07, + "loss": 1.3147, + "step": 2736 + }, + { + "epoch": 2.6602571903776724, + "grad_norm": 0.400390625, + "learning_rate": 1.5320766651647613e-07, + "loss": 1.3299, + "step": 2737 + }, + { + "epoch": 2.6612307817127094, + "grad_norm": 0.408203125, + "learning_rate": 1.5232736378999468e-07, + "loss": 1.345, + "step": 2738 + }, + { + "epoch": 2.6622043730477465, + "grad_norm": 0.39453125, + "learning_rate": 1.5144951790768942e-07, + "loss": 1.3179, + "step": 2739 + }, + { + "epoch": 2.6631779643827835, + "grad_norm": 0.400390625, + "learning_rate": 1.5057412978822906e-07, + "loss": 1.3291, + "step": 2740 + }, + { + "epoch": 2.6641515557178206, + "grad_norm": 0.400390625, + "learning_rate": 1.4970120034770775e-07, + "loss": 1.3238, + "step": 2741 + }, + { + "epoch": 2.6651251470528576, + "grad_norm": 0.3984375, + "learning_rate": 1.488307304996492e-07, + "loss": 1.3265, + "step": 2742 + }, + { + "epoch": 2.666098738387895, + "grad_norm": 0.39453125, + "learning_rate": 1.4796272115500082e-07, + "loss": 1.3026, + "step": 2743 + }, + { + "epoch": 2.667072329722932, + "grad_norm": 0.388671875, + "learning_rate": 1.4709717322213712e-07, + "loss": 1.3354, + "step": 2744 + }, + { + "epoch": 2.6680459210579692, + "grad_norm": 0.390625, + "learning_rate": 1.462340876068549e-07, + "loss": 1.3258, + "step": 2745 + }, + { + "epoch": 2.6690195123930063, + "grad_norm": 0.39453125, + "learning_rate": 1.453734652123756e-07, + "loss": 1.3267, + "step": 2746 + }, + { + "epoch": 2.6699931037280433, + "grad_norm": 0.40234375, + "learning_rate": 1.445153069393418e-07, + "loss": 1.3174, + "step": 2747 + }, + { + "epoch": 2.670966695063081, + "grad_norm": 0.396484375, + "learning_rate": 1.4365961368581844e-07, + "loss": 1.3072, + "step": 2748 + }, + { + "epoch": 2.671940286398118, + "grad_norm": 0.392578125, + "learning_rate": 1.428063863472895e-07, + "loss": 1.3099, + "step": 2749 + }, + { + "epoch": 2.672913877733155, + "grad_norm": 0.404296875, + "learning_rate": 1.4195562581666017e-07, + "loss": 1.3071, + "step": 2750 + }, + { + "epoch": 2.673887469068192, + "grad_norm": 0.3984375, + "learning_rate": 1.411073329842519e-07, + "loss": 1.3049, + "step": 2751 + }, + { + "epoch": 2.674861060403229, + "grad_norm": 0.390625, + "learning_rate": 1.4026150873780564e-07, + "loss": 1.3209, + "step": 2752 + }, + { + "epoch": 2.675834651738266, + "grad_norm": 0.392578125, + "learning_rate": 1.3941815396247783e-07, + "loss": 1.3175, + "step": 2753 + }, + { + "epoch": 2.676808243073303, + "grad_norm": 0.40234375, + "learning_rate": 1.3857726954084134e-07, + "loss": 1.3318, + "step": 2754 + }, + { + "epoch": 2.67778183440834, + "grad_norm": 0.400390625, + "learning_rate": 1.3773885635288308e-07, + "loss": 1.3257, + "step": 2755 + }, + { + "epoch": 2.6787554257433777, + "grad_norm": 0.392578125, + "learning_rate": 1.3690291527600458e-07, + "loss": 1.2978, + "step": 2756 + }, + { + "epoch": 2.6797290170784147, + "grad_norm": 0.404296875, + "learning_rate": 1.3606944718501908e-07, + "loss": 1.3257, + "step": 2757 + }, + { + "epoch": 2.6807026084134518, + "grad_norm": 0.388671875, + "learning_rate": 1.3523845295215332e-07, + "loss": 1.3187, + "step": 2758 + }, + { + "epoch": 2.681676199748489, + "grad_norm": 0.396484375, + "learning_rate": 1.344099334470439e-07, + "loss": 1.3108, + "step": 2759 + }, + { + "epoch": 2.682649791083526, + "grad_norm": 0.38671875, + "learning_rate": 1.335838895367389e-07, + "loss": 1.3245, + "step": 2760 + }, + { + "epoch": 2.6836233824185634, + "grad_norm": 0.396484375, + "learning_rate": 1.3276032208569407e-07, + "loss": 1.309, + "step": 2761 + }, + { + "epoch": 2.6845969737536004, + "grad_norm": 0.400390625, + "learning_rate": 1.319392319557755e-07, + "loss": 1.3319, + "step": 2762 + }, + { + "epoch": 2.6855705650886375, + "grad_norm": 0.3984375, + "learning_rate": 1.3112062000625452e-07, + "loss": 1.3149, + "step": 2763 + }, + { + "epoch": 2.6865441564236745, + "grad_norm": 0.392578125, + "learning_rate": 1.3030448709381082e-07, + "loss": 1.3122, + "step": 2764 + }, + { + "epoch": 2.6875177477587116, + "grad_norm": 0.40234375, + "learning_rate": 1.2949083407252898e-07, + "loss": 1.303, + "step": 2765 + }, + { + "epoch": 2.6884913390937486, + "grad_norm": 0.4140625, + "learning_rate": 1.2867966179389902e-07, + "loss": 1.3133, + "step": 2766 + }, + { + "epoch": 2.6894649304287856, + "grad_norm": 0.400390625, + "learning_rate": 1.278709711068138e-07, + "loss": 1.3121, + "step": 2767 + }, + { + "epoch": 2.6904385217638227, + "grad_norm": 0.39453125, + "learning_rate": 1.2706476285756997e-07, + "loss": 1.323, + "step": 2768 + }, + { + "epoch": 2.69141211309886, + "grad_norm": 0.3984375, + "learning_rate": 1.26261037889866e-07, + "loss": 1.3148, + "step": 2769 + }, + { + "epoch": 2.6923857044338972, + "grad_norm": 0.39453125, + "learning_rate": 1.2545979704480181e-07, + "loss": 1.2991, + "step": 2770 + }, + { + "epoch": 2.6933592957689343, + "grad_norm": 0.40234375, + "learning_rate": 1.2466104116087728e-07, + "loss": 1.3178, + "step": 2771 + }, + { + "epoch": 2.6943328871039713, + "grad_norm": 0.404296875, + "learning_rate": 1.2386477107399264e-07, + "loss": 1.3164, + "step": 2772 + }, + { + "epoch": 2.695306478439009, + "grad_norm": 0.39453125, + "learning_rate": 1.2307098761744547e-07, + "loss": 1.3033, + "step": 2773 + }, + { + "epoch": 2.696280069774046, + "grad_norm": 0.404296875, + "learning_rate": 1.2227969162193238e-07, + "loss": 1.3221, + "step": 2774 + }, + { + "epoch": 2.697253661109083, + "grad_norm": 0.396484375, + "learning_rate": 1.2149088391554599e-07, + "loss": 1.3038, + "step": 2775 + }, + { + "epoch": 2.69822725244412, + "grad_norm": 0.40625, + "learning_rate": 1.2070456532377483e-07, + "loss": 1.2977, + "step": 2776 + }, + { + "epoch": 2.699200843779157, + "grad_norm": 0.3984375, + "learning_rate": 1.1992073666950376e-07, + "loss": 1.3398, + "step": 2777 + }, + { + "epoch": 2.700174435114194, + "grad_norm": 0.3984375, + "learning_rate": 1.1913939877301023e-07, + "loss": 1.3146, + "step": 2778 + }, + { + "epoch": 2.701148026449231, + "grad_norm": 0.40625, + "learning_rate": 1.1836055245196598e-07, + "loss": 1.3235, + "step": 2779 + }, + { + "epoch": 2.702121617784268, + "grad_norm": 0.390625, + "learning_rate": 1.1758419852143599e-07, + "loss": 1.2857, + "step": 2780 + }, + { + "epoch": 2.7030952091193052, + "grad_norm": 0.392578125, + "learning_rate": 1.1681033779387507e-07, + "loss": 1.3169, + "step": 2781 + }, + { + "epoch": 2.7040688004543427, + "grad_norm": 0.39453125, + "learning_rate": 1.1603897107913126e-07, + "loss": 1.3133, + "step": 2782 + }, + { + "epoch": 2.7050423917893798, + "grad_norm": 0.408203125, + "learning_rate": 1.1527009918444076e-07, + "loss": 1.3187, + "step": 2783 + }, + { + "epoch": 2.706015983124417, + "grad_norm": 0.40625, + "learning_rate": 1.145037229144294e-07, + "loss": 1.3237, + "step": 2784 + }, + { + "epoch": 2.706989574459454, + "grad_norm": 0.40625, + "learning_rate": 1.137398430711123e-07, + "loss": 1.3193, + "step": 2785 + }, + { + "epoch": 2.7079631657944914, + "grad_norm": 0.404296875, + "learning_rate": 1.1297846045389e-07, + "loss": 1.3331, + "step": 2786 + }, + { + "epoch": 2.7089367571295284, + "grad_norm": 0.392578125, + "learning_rate": 1.1221957585955207e-07, + "loss": 1.2998, + "step": 2787 + }, + { + "epoch": 2.7099103484645655, + "grad_norm": 0.40625, + "learning_rate": 1.1146319008227214e-07, + "loss": 1.3189, + "step": 2788 + }, + { + "epoch": 2.7108839397996025, + "grad_norm": 0.396484375, + "learning_rate": 1.1070930391361007e-07, + "loss": 1.3314, + "step": 2789 + }, + { + "epoch": 2.7118575311346396, + "grad_norm": 0.384765625, + "learning_rate": 1.0995791814250894e-07, + "loss": 1.2824, + "step": 2790 + }, + { + "epoch": 2.7128311224696766, + "grad_norm": 0.390625, + "learning_rate": 1.0920903355529505e-07, + "loss": 1.3075, + "step": 2791 + }, + { + "epoch": 2.7138047138047137, + "grad_norm": 0.400390625, + "learning_rate": 1.0846265093567815e-07, + "loss": 1.2965, + "step": 2792 + }, + { + "epoch": 2.7147783051397507, + "grad_norm": 0.396484375, + "learning_rate": 1.0771877106474927e-07, + "loss": 1.3235, + "step": 2793 + }, + { + "epoch": 2.715751896474788, + "grad_norm": 0.400390625, + "learning_rate": 1.0697739472097956e-07, + "loss": 1.3308, + "step": 2794 + }, + { + "epoch": 2.7167254878098253, + "grad_norm": 0.3984375, + "learning_rate": 1.062385226802215e-07, + "loss": 1.338, + "step": 2795 + }, + { + "epoch": 2.7176990791448623, + "grad_norm": 0.396484375, + "learning_rate": 1.0550215571570599e-07, + "loss": 1.3218, + "step": 2796 + }, + { + "epoch": 2.7186726704798994, + "grad_norm": 0.392578125, + "learning_rate": 1.0476829459804245e-07, + "loss": 1.3114, + "step": 2797 + }, + { + "epoch": 2.7196462618149364, + "grad_norm": 0.390625, + "learning_rate": 1.0403694009521793e-07, + "loss": 1.3237, + "step": 2798 + }, + { + "epoch": 2.720619853149974, + "grad_norm": 0.39453125, + "learning_rate": 1.0330809297259714e-07, + "loss": 1.33, + "step": 2799 + }, + { + "epoch": 2.721593444485011, + "grad_norm": 0.408203125, + "learning_rate": 1.0258175399291914e-07, + "loss": 1.3131, + "step": 2800 + }, + { + "epoch": 2.722567035820048, + "grad_norm": 0.3984375, + "learning_rate": 1.0185792391629978e-07, + "loss": 1.3231, + "step": 2801 + }, + { + "epoch": 2.723540627155085, + "grad_norm": 0.404296875, + "learning_rate": 1.0113660350022786e-07, + "loss": 1.3088, + "step": 2802 + }, + { + "epoch": 2.724514218490122, + "grad_norm": 0.404296875, + "learning_rate": 1.0041779349956788e-07, + "loss": 1.3269, + "step": 2803 + }, + { + "epoch": 2.725487809825159, + "grad_norm": 0.39453125, + "learning_rate": 9.970149466655477e-08, + "loss": 1.3081, + "step": 2804 + }, + { + "epoch": 2.726461401160196, + "grad_norm": 0.39453125, + "learning_rate": 9.898770775079752e-08, + "loss": 1.3188, + "step": 2805 + }, + { + "epoch": 2.7274349924952332, + "grad_norm": 0.396484375, + "learning_rate": 9.82764334992753e-08, + "loss": 1.303, + "step": 2806 + }, + { + "epoch": 2.7284085838302707, + "grad_norm": 0.400390625, + "learning_rate": 9.75676726563382e-08, + "loss": 1.312, + "step": 2807 + }, + { + "epoch": 2.729382175165308, + "grad_norm": 0.404296875, + "learning_rate": 9.686142596370545e-08, + "loss": 1.315, + "step": 2808 + }, + { + "epoch": 2.730355766500345, + "grad_norm": 0.39453125, + "learning_rate": 9.615769416046639e-08, + "loss": 1.2908, + "step": 2809 + }, + { + "epoch": 2.731329357835382, + "grad_norm": 0.39453125, + "learning_rate": 9.54564779830769e-08, + "loss": 1.3208, + "step": 2810 + }, + { + "epoch": 2.732302949170419, + "grad_norm": 0.39453125, + "learning_rate": 9.475777816536219e-08, + "loss": 1.3289, + "step": 2811 + }, + { + "epoch": 2.7332765405054564, + "grad_norm": 0.400390625, + "learning_rate": 9.406159543851184e-08, + "loss": 1.3351, + "step": 2812 + }, + { + "epoch": 2.7342501318404935, + "grad_norm": 0.400390625, + "learning_rate": 9.336793053108361e-08, + "loss": 1.3239, + "step": 2813 + }, + { + "epoch": 2.7352237231755305, + "grad_norm": 0.388671875, + "learning_rate": 9.267678416899823e-08, + "loss": 1.3134, + "step": 2814 + }, + { + "epoch": 2.7361973145105676, + "grad_norm": 0.400390625, + "learning_rate": 9.198815707554237e-08, + "loss": 1.3196, + "step": 2815 + }, + { + "epoch": 2.7371709058456046, + "grad_norm": 0.3984375, + "learning_rate": 9.130204997136543e-08, + "loss": 1.306, + "step": 2816 + }, + { + "epoch": 2.7381444971806417, + "grad_norm": 0.390625, + "learning_rate": 9.061846357448028e-08, + "loss": 1.2892, + "step": 2817 + }, + { + "epoch": 2.7391180885156787, + "grad_norm": 0.396484375, + "learning_rate": 8.993739860026108e-08, + "loss": 1.3408, + "step": 2818 + }, + { + "epoch": 2.7400916798507158, + "grad_norm": 0.396484375, + "learning_rate": 8.925885576144377e-08, + "loss": 1.3204, + "step": 2819 + }, + { + "epoch": 2.7410652711857533, + "grad_norm": 0.39453125, + "learning_rate": 8.858283576812482e-08, + "loss": 1.3129, + "step": 2820 + }, + { + "epoch": 2.7420388625207903, + "grad_norm": 0.39453125, + "learning_rate": 8.79093393277608e-08, + "loss": 1.3181, + "step": 2821 + }, + { + "epoch": 2.7430124538558274, + "grad_norm": 0.39453125, + "learning_rate": 8.723836714516681e-08, + "loss": 1.3159, + "step": 2822 + }, + { + "epoch": 2.7439860451908644, + "grad_norm": 0.39453125, + "learning_rate": 8.656991992251674e-08, + "loss": 1.3085, + "step": 2823 + }, + { + "epoch": 2.7449596365259015, + "grad_norm": 0.3984375, + "learning_rate": 8.590399835934154e-08, + "loss": 1.3067, + "step": 2824 + }, + { + "epoch": 2.745933227860939, + "grad_norm": 0.40234375, + "learning_rate": 8.524060315253019e-08, + "loss": 1.3227, + "step": 2825 + }, + { + "epoch": 2.746906819195976, + "grad_norm": 0.39453125, + "learning_rate": 8.45797349963265e-08, + "loss": 1.3254, + "step": 2826 + }, + { + "epoch": 2.747880410531013, + "grad_norm": 0.404296875, + "learning_rate": 8.392139458233056e-08, + "loss": 1.3098, + "step": 2827 + }, + { + "epoch": 2.74885400186605, + "grad_norm": 0.3984375, + "learning_rate": 8.326558259949713e-08, + "loss": 1.3429, + "step": 2828 + }, + { + "epoch": 2.749827593201087, + "grad_norm": 0.39453125, + "learning_rate": 8.26122997341347e-08, + "loss": 1.3122, + "step": 2829 + }, + { + "epoch": 2.750801184536124, + "grad_norm": 0.396484375, + "learning_rate": 8.196154666990475e-08, + "loss": 1.3015, + "step": 2830 + }, + { + "epoch": 2.7517747758711613, + "grad_norm": 0.404296875, + "learning_rate": 8.131332408782222e-08, + "loss": 1.3588, + "step": 2831 + }, + { + "epoch": 2.7527483672061983, + "grad_norm": 0.392578125, + "learning_rate": 8.066763266625283e-08, + "loss": 1.3171, + "step": 2832 + }, + { + "epoch": 2.753721958541236, + "grad_norm": 0.396484375, + "learning_rate": 8.002447308091466e-08, + "loss": 1.3136, + "step": 2833 + }, + { + "epoch": 2.754695549876273, + "grad_norm": 0.384765625, + "learning_rate": 7.938384600487486e-08, + "loss": 1.3255, + "step": 2834 + }, + { + "epoch": 2.75566914121131, + "grad_norm": 0.396484375, + "learning_rate": 7.874575210855134e-08, + "loss": 1.2997, + "step": 2835 + }, + { + "epoch": 2.756642732546347, + "grad_norm": 0.404296875, + "learning_rate": 7.811019205971021e-08, + "loss": 1.3134, + "step": 2836 + }, + { + "epoch": 2.757616323881384, + "grad_norm": 0.40234375, + "learning_rate": 7.747716652346692e-08, + "loss": 1.3239, + "step": 2837 + }, + { + "epoch": 2.7585899152164215, + "grad_norm": 0.404296875, + "learning_rate": 7.684667616228353e-08, + "loss": 1.3154, + "step": 2838 + }, + { + "epoch": 2.7595635065514585, + "grad_norm": 0.3984375, + "learning_rate": 7.621872163596999e-08, + "loss": 1.3054, + "step": 2839 + }, + { + "epoch": 2.7605370978864956, + "grad_norm": 0.400390625, + "learning_rate": 7.559330360168148e-08, + "loss": 1.3236, + "step": 2840 + }, + { + "epoch": 2.7615106892215326, + "grad_norm": 0.39453125, + "learning_rate": 7.497042271391947e-08, + "loss": 1.2818, + "step": 2841 + }, + { + "epoch": 2.7624842805565697, + "grad_norm": 0.40625, + "learning_rate": 7.435007962452972e-08, + "loss": 1.3123, + "step": 2842 + }, + { + "epoch": 2.7634578718916067, + "grad_norm": 0.400390625, + "learning_rate": 7.373227498270353e-08, + "loss": 1.3257, + "step": 2843 + }, + { + "epoch": 2.764431463226644, + "grad_norm": 0.40234375, + "learning_rate": 7.311700943497369e-08, + "loss": 1.3013, + "step": 2844 + }, + { + "epoch": 2.765405054561681, + "grad_norm": 0.390625, + "learning_rate": 7.250428362521711e-08, + "loss": 1.3261, + "step": 2845 + }, + { + "epoch": 2.7663786458967183, + "grad_norm": 0.39453125, + "learning_rate": 7.189409819465255e-08, + "loss": 1.3097, + "step": 2846 + }, + { + "epoch": 2.7673522372317554, + "grad_norm": 0.390625, + "learning_rate": 7.128645378184085e-08, + "loss": 1.2819, + "step": 2847 + }, + { + "epoch": 2.7683258285667924, + "grad_norm": 0.396484375, + "learning_rate": 7.068135102268226e-08, + "loss": 1.3146, + "step": 2848 + }, + { + "epoch": 2.7692994199018295, + "grad_norm": 0.396484375, + "learning_rate": 7.007879055041855e-08, + "loss": 1.3027, + "step": 2849 + }, + { + "epoch": 2.7702730112368665, + "grad_norm": 0.39453125, + "learning_rate": 6.947877299563032e-08, + "loss": 1.3155, + "step": 2850 + }, + { + "epoch": 2.771246602571904, + "grad_norm": 0.39453125, + "learning_rate": 6.888129898623752e-08, + "loss": 1.3179, + "step": 2851 + }, + { + "epoch": 2.772220193906941, + "grad_norm": 0.3984375, + "learning_rate": 6.828636914749748e-08, + "loss": 1.3121, + "step": 2852 + }, + { + "epoch": 2.773193785241978, + "grad_norm": 0.396484375, + "learning_rate": 6.769398410200579e-08, + "loss": 1.3119, + "step": 2853 + }, + { + "epoch": 2.774167376577015, + "grad_norm": 0.388671875, + "learning_rate": 6.710414446969405e-08, + "loss": 1.3194, + "step": 2854 + }, + { + "epoch": 2.7751409679120522, + "grad_norm": 0.390625, + "learning_rate": 6.651685086783155e-08, + "loss": 1.3126, + "step": 2855 + }, + { + "epoch": 2.7761145592470893, + "grad_norm": 0.392578125, + "learning_rate": 6.593210391102139e-08, + "loss": 1.3027, + "step": 2856 + }, + { + "epoch": 2.7770881505821263, + "grad_norm": 0.396484375, + "learning_rate": 6.534990421120296e-08, + "loss": 1.3011, + "step": 2857 + }, + { + "epoch": 2.7780617419171634, + "grad_norm": 0.396484375, + "learning_rate": 6.477025237764889e-08, + "loss": 1.3246, + "step": 2858 + }, + { + "epoch": 2.779035333252201, + "grad_norm": 0.400390625, + "learning_rate": 6.419314901696671e-08, + "loss": 1.3045, + "step": 2859 + }, + { + "epoch": 2.780008924587238, + "grad_norm": 0.392578125, + "learning_rate": 6.361859473309556e-08, + "loss": 1.3222, + "step": 2860 + }, + { + "epoch": 2.780982515922275, + "grad_norm": 0.396484375, + "learning_rate": 6.304659012730835e-08, + "loss": 1.3308, + "step": 2861 + }, + { + "epoch": 2.781956107257312, + "grad_norm": 0.38671875, + "learning_rate": 6.247713579820847e-08, + "loss": 1.3075, + "step": 2862 + }, + { + "epoch": 2.782929698592349, + "grad_norm": 0.396484375, + "learning_rate": 6.191023234173143e-08, + "loss": 1.3197, + "step": 2863 + }, + { + "epoch": 2.7839032899273866, + "grad_norm": 0.39453125, + "learning_rate": 6.134588035114242e-08, + "loss": 1.3026, + "step": 2864 + }, + { + "epoch": 2.7848768812624236, + "grad_norm": 0.388671875, + "learning_rate": 6.078408041703732e-08, + "loss": 1.311, + "step": 2865 + }, + { + "epoch": 2.7858504725974607, + "grad_norm": 0.396484375, + "learning_rate": 6.022483312734112e-08, + "loss": 1.3102, + "step": 2866 + }, + { + "epoch": 2.7868240639324977, + "grad_norm": 0.39453125, + "learning_rate": 5.966813906730679e-08, + "loss": 1.3189, + "step": 2867 + }, + { + "epoch": 2.7877976552675348, + "grad_norm": 0.388671875, + "learning_rate": 5.9113998819515515e-08, + "loss": 1.3078, + "step": 2868 + }, + { + "epoch": 2.788771246602572, + "grad_norm": 0.400390625, + "learning_rate": 5.8562412963877044e-08, + "loss": 1.2978, + "step": 2869 + }, + { + "epoch": 2.789744837937609, + "grad_norm": 0.3984375, + "learning_rate": 5.80133820776263e-08, + "loss": 1.3239, + "step": 2870 + }, + { + "epoch": 2.790718429272646, + "grad_norm": 0.392578125, + "learning_rate": 5.74669067353259e-08, + "loss": 1.3089, + "step": 2871 + }, + { + "epoch": 2.7916920206076834, + "grad_norm": 0.390625, + "learning_rate": 5.6922987508862546e-08, + "loss": 1.294, + "step": 2872 + }, + { + "epoch": 2.7926656119427204, + "grad_norm": 0.396484375, + "learning_rate": 5.638162496744981e-08, + "loss": 1.3267, + "step": 2873 + }, + { + "epoch": 2.7936392032777575, + "grad_norm": 0.408203125, + "learning_rate": 5.5842819677624225e-08, + "loss": 1.3228, + "step": 2874 + }, + { + "epoch": 2.7946127946127945, + "grad_norm": 0.392578125, + "learning_rate": 5.530657220324615e-08, + "loss": 1.3066, + "step": 2875 + }, + { + "epoch": 2.7955863859478316, + "grad_norm": 0.39453125, + "learning_rate": 5.477288310550055e-08, + "loss": 1.302, + "step": 2876 + }, + { + "epoch": 2.796559977282869, + "grad_norm": 0.408203125, + "learning_rate": 5.424175294289374e-08, + "loss": 1.3167, + "step": 2877 + }, + { + "epoch": 2.797533568617906, + "grad_norm": 0.3984375, + "learning_rate": 5.371318227125416e-08, + "loss": 1.2813, + "step": 2878 + }, + { + "epoch": 2.798507159952943, + "grad_norm": 0.388671875, + "learning_rate": 5.318717164373266e-08, + "loss": 1.3026, + "step": 2879 + }, + { + "epoch": 2.7994807512879802, + "grad_norm": 0.3984375, + "learning_rate": 5.266372161079975e-08, + "loss": 1.3161, + "step": 2880 + }, + { + "epoch": 2.8004543426230173, + "grad_norm": 0.40625, + "learning_rate": 5.2142832720247784e-08, + "loss": 1.3258, + "step": 2881 + }, + { + "epoch": 2.8014279339580543, + "grad_norm": 0.392578125, + "learning_rate": 5.16245055171874e-08, + "loss": 1.3059, + "step": 2882 + }, + { + "epoch": 2.8024015252930914, + "grad_norm": 0.388671875, + "learning_rate": 5.110874054404941e-08, + "loss": 1.306, + "step": 2883 + }, + { + "epoch": 2.8033751166281284, + "grad_norm": 0.404296875, + "learning_rate": 5.059553834058289e-08, + "loss": 1.3173, + "step": 2884 + }, + { + "epoch": 2.804348707963166, + "grad_norm": 0.400390625, + "learning_rate": 5.008489944385464e-08, + "loss": 1.3198, + "step": 2885 + }, + { + "epoch": 2.805322299298203, + "grad_norm": 0.39453125, + "learning_rate": 4.957682438824996e-08, + "loss": 1.3263, + "step": 2886 + }, + { + "epoch": 2.80629589063324, + "grad_norm": 0.396484375, + "learning_rate": 4.9071313705469635e-08, + "loss": 1.3138, + "step": 2887 + }, + { + "epoch": 2.807269481968277, + "grad_norm": 0.39453125, + "learning_rate": 4.856836792453218e-08, + "loss": 1.298, + "step": 2888 + }, + { + "epoch": 2.808243073303314, + "grad_norm": 0.396484375, + "learning_rate": 4.806798757177128e-08, + "loss": 1.3065, + "step": 2889 + }, + { + "epoch": 2.8092166646383516, + "grad_norm": 0.40234375, + "learning_rate": 4.7570173170835846e-08, + "loss": 1.3114, + "step": 2890 + }, + { + "epoch": 2.8101902559733887, + "grad_norm": 0.390625, + "learning_rate": 4.70749252426897e-08, + "loss": 1.3133, + "step": 2891 + }, + { + "epoch": 2.8111638473084257, + "grad_norm": 0.396484375, + "learning_rate": 4.6582244305611034e-08, + "loss": 1.3155, + "step": 2892 + }, + { + "epoch": 2.8121374386434628, + "grad_norm": 0.4140625, + "learning_rate": 4.6092130875190766e-08, + "loss": 1.3006, + "step": 2893 + }, + { + "epoch": 2.8131110299785, + "grad_norm": 0.390625, + "learning_rate": 4.5604585464334436e-08, + "loss": 1.3369, + "step": 2894 + }, + { + "epoch": 2.814084621313537, + "grad_norm": 0.38671875, + "learning_rate": 4.5119608583258646e-08, + "loss": 1.3226, + "step": 2895 + }, + { + "epoch": 2.815058212648574, + "grad_norm": 0.396484375, + "learning_rate": 4.4637200739493514e-08, + "loss": 1.3146, + "step": 2896 + }, + { + "epoch": 2.816031803983611, + "grad_norm": 0.396484375, + "learning_rate": 4.415736243787882e-08, + "loss": 1.308, + "step": 2897 + }, + { + "epoch": 2.8170053953186485, + "grad_norm": 0.3984375, + "learning_rate": 4.368009418056707e-08, + "loss": 1.3103, + "step": 2898 + }, + { + "epoch": 2.8179789866536855, + "grad_norm": 0.392578125, + "learning_rate": 4.32053964670201e-08, + "loss": 1.3023, + "step": 2899 + }, + { + "epoch": 2.8189525779887226, + "grad_norm": 0.41015625, + "learning_rate": 4.273326979400999e-08, + "loss": 1.3158, + "step": 2900 + }, + { + "epoch": 2.8199261693237596, + "grad_norm": 0.404296875, + "learning_rate": 4.226371465561846e-08, + "loss": 1.3245, + "step": 2901 + }, + { + "epoch": 2.8208997606587967, + "grad_norm": 0.40625, + "learning_rate": 4.179673154323605e-08, + "loss": 1.2983, + "step": 2902 + }, + { + "epoch": 2.821873351993834, + "grad_norm": 0.400390625, + "learning_rate": 4.1332320945561e-08, + "loss": 1.3317, + "step": 2903 + }, + { + "epoch": 2.822846943328871, + "grad_norm": 0.404296875, + "learning_rate": 4.0870483348600386e-08, + "loss": 1.3029, + "step": 2904 + }, + { + "epoch": 2.8238205346639083, + "grad_norm": 0.3984375, + "learning_rate": 4.0411219235667877e-08, + "loss": 1.3209, + "step": 2905 + }, + { + "epoch": 2.8247941259989453, + "grad_norm": 0.390625, + "learning_rate": 3.9954529087384844e-08, + "loss": 1.3222, + "step": 2906 + }, + { + "epoch": 2.8257677173339824, + "grad_norm": 0.390625, + "learning_rate": 3.950041338167787e-08, + "loss": 1.3242, + "step": 2907 + }, + { + "epoch": 2.8267413086690194, + "grad_norm": 0.404296875, + "learning_rate": 3.9048872593780426e-08, + "loss": 1.3201, + "step": 2908 + }, + { + "epoch": 2.8277149000040565, + "grad_norm": 0.39453125, + "learning_rate": 3.8599907196230636e-08, + "loss": 1.3172, + "step": 2909 + }, + { + "epoch": 2.8286884913390935, + "grad_norm": 0.396484375, + "learning_rate": 3.815351765887182e-08, + "loss": 1.3079, + "step": 2910 + }, + { + "epoch": 2.829662082674131, + "grad_norm": 0.396484375, + "learning_rate": 3.770970444885114e-08, + "loss": 1.3196, + "step": 2911 + }, + { + "epoch": 2.830635674009168, + "grad_norm": 0.384765625, + "learning_rate": 3.72684680306204e-08, + "loss": 1.3144, + "step": 2912 + }, + { + "epoch": 2.831609265344205, + "grad_norm": 0.400390625, + "learning_rate": 3.682980886593412e-08, + "loss": 1.3211, + "step": 2913 + }, + { + "epoch": 2.832582856679242, + "grad_norm": 0.408203125, + "learning_rate": 3.639372741385039e-08, + "loss": 1.3257, + "step": 2914 + }, + { + "epoch": 2.833556448014279, + "grad_norm": 0.3984375, + "learning_rate": 3.596022413072886e-08, + "loss": 1.3351, + "step": 2915 + }, + { + "epoch": 2.8345300393493167, + "grad_norm": 0.396484375, + "learning_rate": 3.5529299470232206e-08, + "loss": 1.3217, + "step": 2916 + }, + { + "epoch": 2.8355036306843537, + "grad_norm": 0.3984375, + "learning_rate": 3.51009538833233e-08, + "loss": 1.3064, + "step": 2917 + }, + { + "epoch": 2.836477222019391, + "grad_norm": 0.396484375, + "learning_rate": 3.4675187818267174e-08, + "loss": 1.3091, + "step": 2918 + }, + { + "epoch": 2.837450813354428, + "grad_norm": 0.3984375, + "learning_rate": 3.4252001720628816e-08, + "loss": 1.3193, + "step": 2919 + }, + { + "epoch": 2.838424404689465, + "grad_norm": 0.392578125, + "learning_rate": 3.383139603327312e-08, + "loss": 1.3046, + "step": 2920 + }, + { + "epoch": 2.839397996024502, + "grad_norm": 0.39453125, + "learning_rate": 3.3413371196364954e-08, + "loss": 1.3227, + "step": 2921 + }, + { + "epoch": 2.840371587359539, + "grad_norm": 0.388671875, + "learning_rate": 3.299792764736798e-08, + "loss": 1.2949, + "step": 2922 + }, + { + "epoch": 2.841345178694576, + "grad_norm": 0.3984375, + "learning_rate": 3.25850658210447e-08, + "loss": 1.3157, + "step": 2923 + }, + { + "epoch": 2.8423187700296135, + "grad_norm": 0.40234375, + "learning_rate": 3.2174786149456184e-08, + "loss": 1.32, + "step": 2924 + }, + { + "epoch": 2.8432923613646506, + "grad_norm": 0.39453125, + "learning_rate": 3.176708906196063e-08, + "loss": 1.3047, + "step": 2925 + }, + { + "epoch": 2.8442659526996876, + "grad_norm": 0.39453125, + "learning_rate": 3.136197498521398e-08, + "loss": 1.3176, + "step": 2926 + }, + { + "epoch": 2.8452395440347247, + "grad_norm": 0.390625, + "learning_rate": 3.0959444343169055e-08, + "loss": 1.3141, + "step": 2927 + }, + { + "epoch": 2.8462131353697617, + "grad_norm": 0.396484375, + "learning_rate": 3.0559497557074715e-08, + "loss": 1.3156, + "step": 2928 + }, + { + "epoch": 2.847186726704799, + "grad_norm": 0.392578125, + "learning_rate": 3.016213504547616e-08, + "loss": 1.3105, + "step": 2929 + }, + { + "epoch": 2.8481603180398363, + "grad_norm": 0.400390625, + "learning_rate": 2.9767357224214365e-08, + "loss": 1.3098, + "step": 2930 + }, + { + "epoch": 2.8491339093748733, + "grad_norm": 0.39453125, + "learning_rate": 2.9375164506424693e-08, + "loss": 1.311, + "step": 2931 + }, + { + "epoch": 2.8501075007099104, + "grad_norm": 0.3984375, + "learning_rate": 2.8985557302537994e-08, + "loss": 1.3018, + "step": 2932 + }, + { + "epoch": 2.8510810920449474, + "grad_norm": 0.39453125, + "learning_rate": 2.8598536020278678e-08, + "loss": 1.3184, + "step": 2933 + }, + { + "epoch": 2.8520546833799845, + "grad_norm": 0.41015625, + "learning_rate": 2.8214101064665545e-08, + "loss": 1.3352, + "step": 2934 + }, + { + "epoch": 2.8530282747150215, + "grad_norm": 0.390625, + "learning_rate": 2.7832252838010387e-08, + "loss": 1.3224, + "step": 2935 + }, + { + "epoch": 2.8540018660500586, + "grad_norm": 0.3984375, + "learning_rate": 2.7452991739918277e-08, + "loss": 1.3408, + "step": 2936 + }, + { + "epoch": 2.854975457385096, + "grad_norm": 0.41796875, + "learning_rate": 2.7076318167286452e-08, + "loss": 1.3088, + "step": 2937 + }, + { + "epoch": 2.855949048720133, + "grad_norm": 0.388671875, + "learning_rate": 2.670223251430515e-08, + "loss": 1.298, + "step": 2938 + }, + { + "epoch": 2.85692264005517, + "grad_norm": 0.39453125, + "learning_rate": 2.63307351724551e-08, + "loss": 1.2909, + "step": 2939 + }, + { + "epoch": 2.857896231390207, + "grad_norm": 0.416015625, + "learning_rate": 2.5961826530509214e-08, + "loss": 1.3313, + "step": 2940 + }, + { + "epoch": 2.8588698227252443, + "grad_norm": 0.396484375, + "learning_rate": 2.5595506974531448e-08, + "loss": 1.3273, + "step": 2941 + }, + { + "epoch": 2.8598434140602818, + "grad_norm": 0.38671875, + "learning_rate": 2.5231776887875703e-08, + "loss": 1.3218, + "step": 2942 + }, + { + "epoch": 2.860817005395319, + "grad_norm": 0.400390625, + "learning_rate": 2.4870636651186388e-08, + "loss": 1.3012, + "step": 2943 + }, + { + "epoch": 2.861790596730356, + "grad_norm": 0.3984375, + "learning_rate": 2.451208664239757e-08, + "loss": 1.3256, + "step": 2944 + }, + { + "epoch": 2.862764188065393, + "grad_norm": 0.39453125, + "learning_rate": 2.4156127236732162e-08, + "loss": 1.3031, + "step": 2945 + }, + { + "epoch": 2.86373777940043, + "grad_norm": 0.390625, + "learning_rate": 2.3802758806702463e-08, + "loss": 1.3073, + "step": 2946 + }, + { + "epoch": 2.864711370735467, + "grad_norm": 0.39453125, + "learning_rate": 2.3451981722109608e-08, + "loss": 1.3062, + "step": 2947 + }, + { + "epoch": 2.865684962070504, + "grad_norm": 0.40625, + "learning_rate": 2.310379635004245e-08, + "loss": 1.3162, + "step": 2948 + }, + { + "epoch": 2.866658553405541, + "grad_norm": 0.390625, + "learning_rate": 2.2758203054877302e-08, + "loss": 1.338, + "step": 2949 + }, + { + "epoch": 2.8676321447405786, + "grad_norm": 0.404296875, + "learning_rate": 2.241520219827903e-08, + "loss": 1.3093, + "step": 2950 + }, + { + "epoch": 2.8686057360756156, + "grad_norm": 0.390625, + "learning_rate": 2.207479413919772e-08, + "loss": 1.3074, + "step": 2951 + }, + { + "epoch": 2.8695793274106527, + "grad_norm": 0.3984375, + "learning_rate": 2.1736979233872025e-08, + "loss": 1.3125, + "step": 2952 + }, + { + "epoch": 2.8705529187456897, + "grad_norm": 0.388671875, + "learning_rate": 2.1401757835824988e-08, + "loss": 1.3036, + "step": 2953 + }, + { + "epoch": 2.871526510080727, + "grad_norm": 0.40625, + "learning_rate": 2.1069130295867656e-08, + "loss": 1.3257, + "step": 2954 + }, + { + "epoch": 2.8725001014157643, + "grad_norm": 0.384765625, + "learning_rate": 2.073909696209436e-08, + "loss": 1.3061, + "step": 2955 + }, + { + "epoch": 2.8734736927508013, + "grad_norm": 0.392578125, + "learning_rate": 2.0411658179886053e-08, + "loss": 1.3274, + "step": 2956 + }, + { + "epoch": 2.8744472840858384, + "grad_norm": 0.392578125, + "learning_rate": 2.0086814291908064e-08, + "loss": 1.3454, + "step": 2957 + }, + { + "epoch": 2.8754208754208754, + "grad_norm": 0.3984375, + "learning_rate": 1.976456563811041e-08, + "loss": 1.3012, + "step": 2958 + }, + { + "epoch": 2.8763944667559125, + "grad_norm": 0.39453125, + "learning_rate": 1.9444912555726668e-08, + "loss": 1.3053, + "step": 2959 + }, + { + "epoch": 2.8773680580909495, + "grad_norm": 0.40234375, + "learning_rate": 1.9127855379274528e-08, + "loss": 1.2956, + "step": 2960 + }, + { + "epoch": 2.8783416494259866, + "grad_norm": 0.3828125, + "learning_rate": 1.8813394440554966e-08, + "loss": 1.292, + "step": 2961 + }, + { + "epoch": 2.8793152407610236, + "grad_norm": 0.390625, + "learning_rate": 1.850153006865224e-08, + "loss": 1.323, + "step": 2962 + }, + { + "epoch": 2.880288832096061, + "grad_norm": 0.390625, + "learning_rate": 1.8192262589932507e-08, + "loss": 1.3126, + "step": 2963 + }, + { + "epoch": 2.881262423431098, + "grad_norm": 0.3984375, + "learning_rate": 1.788559232804521e-08, + "loss": 1.3386, + "step": 2964 + }, + { + "epoch": 2.8822360147661352, + "grad_norm": 0.390625, + "learning_rate": 1.7581519603921406e-08, + "loss": 1.3285, + "step": 2965 + }, + { + "epoch": 2.8832096061011723, + "grad_norm": 0.400390625, + "learning_rate": 1.7280044735773493e-08, + "loss": 1.3347, + "step": 2966 + }, + { + "epoch": 2.8841831974362093, + "grad_norm": 0.40234375, + "learning_rate": 1.6981168039095774e-08, + "loss": 1.3068, + "step": 2967 + }, + { + "epoch": 2.885156788771247, + "grad_norm": 0.396484375, + "learning_rate": 1.668488982666361e-08, + "loss": 1.3398, + "step": 2968 + }, + { + "epoch": 2.886130380106284, + "grad_norm": 0.396484375, + "learning_rate": 1.6391210408532032e-08, + "loss": 1.3475, + "step": 2969 + }, + { + "epoch": 2.887103971441321, + "grad_norm": 0.388671875, + "learning_rate": 1.6100130092037704e-08, + "loss": 1.3005, + "step": 2970 + }, + { + "epoch": 2.888077562776358, + "grad_norm": 0.388671875, + "learning_rate": 1.5811649181796673e-08, + "loss": 1.306, + "step": 2971 + }, + { + "epoch": 2.889051154111395, + "grad_norm": 0.39453125, + "learning_rate": 1.5525767979704675e-08, + "loss": 1.3159, + "step": 2972 + }, + { + "epoch": 2.890024745446432, + "grad_norm": 0.404296875, + "learning_rate": 1.5242486784937115e-08, + "loss": 1.3084, + "step": 2973 + }, + { + "epoch": 2.890998336781469, + "grad_norm": 0.392578125, + "learning_rate": 1.4961805893948245e-08, + "loss": 1.2883, + "step": 2974 + }, + { + "epoch": 2.891971928116506, + "grad_norm": 0.40234375, + "learning_rate": 1.4683725600471155e-08, + "loss": 1.3183, + "step": 2975 + }, + { + "epoch": 2.8929455194515437, + "grad_norm": 0.392578125, + "learning_rate": 1.4408246195517506e-08, + "loss": 1.3028, + "step": 2976 + }, + { + "epoch": 2.8939191107865807, + "grad_norm": 0.404296875, + "learning_rate": 1.4135367967377244e-08, + "loss": 1.3117, + "step": 2977 + }, + { + "epoch": 2.8948927021216178, + "grad_norm": 0.392578125, + "learning_rate": 1.3865091201617498e-08, + "loss": 1.3144, + "step": 2978 + }, + { + "epoch": 2.895866293456655, + "grad_norm": 0.3984375, + "learning_rate": 1.3597416181083678e-08, + "loss": 1.3064, + "step": 2979 + }, + { + "epoch": 2.896839884791692, + "grad_norm": 0.390625, + "learning_rate": 1.3332343185898377e-08, + "loss": 1.3151, + "step": 2980 + }, + { + "epoch": 2.8978134761267293, + "grad_norm": 0.38671875, + "learning_rate": 1.3069872493461089e-08, + "loss": 1.2951, + "step": 2981 + }, + { + "epoch": 2.8987870674617664, + "grad_norm": 0.396484375, + "learning_rate": 1.2810004378447649e-08, + "loss": 1.3181, + "step": 2982 + }, + { + "epoch": 2.8997606587968034, + "grad_norm": 0.38671875, + "learning_rate": 1.255273911281052e-08, + "loss": 1.3188, + "step": 2983 + }, + { + "epoch": 2.9007342501318405, + "grad_norm": 0.40234375, + "learning_rate": 1.2298076965778782e-08, + "loss": 1.3106, + "step": 2984 + }, + { + "epoch": 2.9017078414668775, + "grad_norm": 0.40234375, + "learning_rate": 1.204601820385648e-08, + "loss": 1.3223, + "step": 2985 + }, + { + "epoch": 2.9026814328019146, + "grad_norm": 0.392578125, + "learning_rate": 1.1796563090823443e-08, + "loss": 1.2958, + "step": 2986 + }, + { + "epoch": 2.9036550241369516, + "grad_norm": 0.400390625, + "learning_rate": 1.154971188773557e-08, + "loss": 1.2908, + "step": 2987 + }, + { + "epoch": 2.9046286154719887, + "grad_norm": 0.388671875, + "learning_rate": 1.1305464852922887e-08, + "loss": 1.3144, + "step": 2988 + }, + { + "epoch": 2.905602206807026, + "grad_norm": 0.396484375, + "learning_rate": 1.1063822241990097e-08, + "loss": 1.3207, + "step": 2989 + }, + { + "epoch": 2.9065757981420632, + "grad_norm": 0.39453125, + "learning_rate": 1.0824784307817138e-08, + "loss": 1.3336, + "step": 2990 + }, + { + "epoch": 2.9075493894771003, + "grad_norm": 0.400390625, + "learning_rate": 1.058835130055752e-08, + "loss": 1.2953, + "step": 2991 + }, + { + "epoch": 2.9085229808121373, + "grad_norm": 0.392578125, + "learning_rate": 1.03545234676386e-08, + "loss": 1.3062, + "step": 2992 + }, + { + "epoch": 2.909496572147175, + "grad_norm": 0.40234375, + "learning_rate": 1.0123301053762136e-08, + "loss": 1.3272, + "step": 2993 + }, + { + "epoch": 2.910470163482212, + "grad_norm": 0.390625, + "learning_rate": 9.894684300902347e-09, + "loss": 1.3112, + "step": 2994 + }, + { + "epoch": 2.911443754817249, + "grad_norm": 0.404296875, + "learning_rate": 9.668673448307575e-09, + "loss": 1.3279, + "step": 2995 + }, + { + "epoch": 2.912417346152286, + "grad_norm": 0.39453125, + "learning_rate": 9.445268732498625e-09, + "loss": 1.3128, + "step": 2996 + }, + { + "epoch": 2.913390937487323, + "grad_norm": 0.396484375, + "learning_rate": 9.224470387268759e-09, + "loss": 1.3207, + "step": 2997 + }, + { + "epoch": 2.91436452882236, + "grad_norm": 0.400390625, + "learning_rate": 9.006278643683697e-09, + "loss": 1.3169, + "step": 2998 + }, + { + "epoch": 2.915338120157397, + "grad_norm": 0.40234375, + "learning_rate": 8.790693730082179e-09, + "loss": 1.3137, + "step": 2999 + }, + { + "epoch": 2.916311711492434, + "grad_norm": 0.39453125, + "learning_rate": 8.577715872073734e-09, + "loss": 1.3052, + "step": 3000 + }, + { + "epoch": 2.9172853028274712, + "grad_norm": 0.392578125, + "learning_rate": 8.367345292540074e-09, + "loss": 1.3251, + "step": 3001 + }, + { + "epoch": 2.9182588941625087, + "grad_norm": 0.392578125, + "learning_rate": 8.15958221163482e-09, + "loss": 1.3077, + "step": 3002 + }, + { + "epoch": 2.9192324854975458, + "grad_norm": 0.392578125, + "learning_rate": 7.95442684678238e-09, + "loss": 1.3223, + "step": 3003 + }, + { + "epoch": 2.920206076832583, + "grad_norm": 0.39453125, + "learning_rate": 7.751879412677966e-09, + "loss": 1.3218, + "step": 3004 + }, + { + "epoch": 2.92117966816762, + "grad_norm": 0.39453125, + "learning_rate": 7.551940121288126e-09, + "loss": 1.2997, + "step": 3005 + }, + { + "epoch": 2.9221532595026574, + "grad_norm": 0.390625, + "learning_rate": 7.354609181849659e-09, + "loss": 1.3248, + "step": 3006 + }, + { + "epoch": 2.9231268508376944, + "grad_norm": 0.3984375, + "learning_rate": 7.159886800869875e-09, + "loss": 1.3029, + "step": 3007 + }, + { + "epoch": 2.9241004421727315, + "grad_norm": 0.396484375, + "learning_rate": 6.967773182126048e-09, + "loss": 1.3156, + "step": 3008 + }, + { + "epoch": 2.9250740335077685, + "grad_norm": 0.39453125, + "learning_rate": 6.77826852666541e-09, + "loss": 1.32, + "step": 3009 + }, + { + "epoch": 2.9260476248428056, + "grad_norm": 0.390625, + "learning_rate": 6.591373032805437e-09, + "loss": 1.2957, + "step": 3010 + }, + { + "epoch": 2.9270212161778426, + "grad_norm": 0.390625, + "learning_rate": 6.407086896131898e-09, + "loss": 1.3003, + "step": 3011 + }, + { + "epoch": 2.9279948075128797, + "grad_norm": 0.392578125, + "learning_rate": 6.225410309501079e-09, + "loss": 1.3187, + "step": 3012 + }, + { + "epoch": 2.9289683988479167, + "grad_norm": 0.3984375, + "learning_rate": 6.046343463037563e-09, + "loss": 1.3073, + "step": 3013 + }, + { + "epoch": 2.929941990182954, + "grad_norm": 0.390625, + "learning_rate": 5.869886544135617e-09, + "loss": 1.3149, + "step": 3014 + }, + { + "epoch": 2.9309155815179913, + "grad_norm": 0.388671875, + "learning_rate": 5.696039737457526e-09, + "loss": 1.3011, + "step": 3015 + }, + { + "epoch": 2.9318891728530283, + "grad_norm": 0.38671875, + "learning_rate": 5.524803224934427e-09, + "loss": 1.3196, + "step": 3016 + }, + { + "epoch": 2.9328627641880654, + "grad_norm": 0.3984375, + "learning_rate": 5.356177185765477e-09, + "loss": 1.3142, + "step": 3017 + }, + { + "epoch": 2.9338363555231024, + "grad_norm": 0.3984375, + "learning_rate": 5.190161796418125e-09, + "loss": 1.3129, + "step": 3018 + }, + { + "epoch": 2.93480994685814, + "grad_norm": 0.390625, + "learning_rate": 5.026757230628121e-09, + "loss": 1.3309, + "step": 3019 + }, + { + "epoch": 2.935783538193177, + "grad_norm": 0.404296875, + "learning_rate": 4.865963659398676e-09, + "loss": 1.3174, + "step": 3020 + }, + { + "epoch": 2.936757129528214, + "grad_norm": 0.40625, + "learning_rate": 4.7077812510001875e-09, + "loss": 1.317, + "step": 3021 + }, + { + "epoch": 2.937730720863251, + "grad_norm": 0.40234375, + "learning_rate": 4.5522101709713496e-09, + "loss": 1.3266, + "step": 3022 + }, + { + "epoch": 2.938704312198288, + "grad_norm": 0.400390625, + "learning_rate": 4.3992505821172095e-09, + "loss": 1.3269, + "step": 3023 + }, + { + "epoch": 2.939677903533325, + "grad_norm": 0.392578125, + "learning_rate": 4.248902644510555e-09, + "loss": 1.31, + "step": 3024 + }, + { + "epoch": 2.940651494868362, + "grad_norm": 0.40625, + "learning_rate": 4.1011665154905285e-09, + "loss": 1.3415, + "step": 3025 + }, + { + "epoch": 2.9416250862033992, + "grad_norm": 0.392578125, + "learning_rate": 3.956042349663736e-09, + "loss": 1.3199, + "step": 3026 + }, + { + "epoch": 2.9425986775384367, + "grad_norm": 0.392578125, + "learning_rate": 3.813530298902579e-09, + "loss": 1.3236, + "step": 3027 + }, + { + "epoch": 2.943572268873474, + "grad_norm": 0.39453125, + "learning_rate": 3.6736305123460938e-09, + "loss": 1.315, + "step": 3028 + }, + { + "epoch": 2.944545860208511, + "grad_norm": 0.40234375, + "learning_rate": 3.5363431363999447e-09, + "loss": 1.3063, + "step": 3029 + }, + { + "epoch": 2.945519451543548, + "grad_norm": 0.392578125, + "learning_rate": 3.4016683147355954e-09, + "loss": 1.2863, + "step": 3030 + }, + { + "epoch": 2.946493042878585, + "grad_norm": 0.392578125, + "learning_rate": 3.2696061882905862e-09, + "loss": 1.3141, + "step": 3031 + }, + { + "epoch": 2.9474666342136224, + "grad_norm": 0.388671875, + "learning_rate": 3.1401568952679763e-09, + "loss": 1.3131, + "step": 3032 + }, + { + "epoch": 2.9484402255486595, + "grad_norm": 0.396484375, + "learning_rate": 3.013320571136902e-09, + "loss": 1.3477, + "step": 3033 + }, + { + "epoch": 2.9494138168836965, + "grad_norm": 0.388671875, + "learning_rate": 2.8890973486320193e-09, + "loss": 1.2948, + "step": 3034 + }, + { + "epoch": 2.9503874082187336, + "grad_norm": 0.392578125, + "learning_rate": 2.767487357752674e-09, + "loss": 1.319, + "step": 3035 + }, + { + "epoch": 2.9513609995537706, + "grad_norm": 0.40234375, + "learning_rate": 2.6484907257642856e-09, + "loss": 1.2956, + "step": 3036 + }, + { + "epoch": 2.9523345908888077, + "grad_norm": 0.3984375, + "learning_rate": 2.5321075771969627e-09, + "loss": 1.3126, + "step": 3037 + }, + { + "epoch": 2.9533081822238447, + "grad_norm": 0.3984375, + "learning_rate": 2.418338033846057e-09, + "loss": 1.2997, + "step": 3038 + }, + { + "epoch": 2.9542817735588818, + "grad_norm": 0.396484375, + "learning_rate": 2.3071822147716083e-09, + "loss": 1.3128, + "step": 3039 + }, + { + "epoch": 2.9552553648939193, + "grad_norm": 0.390625, + "learning_rate": 2.198640236298344e-09, + "loss": 1.3263, + "step": 3040 + }, + { + "epoch": 2.9562289562289563, + "grad_norm": 0.40625, + "learning_rate": 2.092712212015402e-09, + "loss": 1.2956, + "step": 3041 + }, + { + "epoch": 2.9572025475639934, + "grad_norm": 0.400390625, + "learning_rate": 1.989398252777164e-09, + "loss": 1.3155, + "step": 3042 + }, + { + "epoch": 2.9581761388990304, + "grad_norm": 0.412109375, + "learning_rate": 1.8886984667015883e-09, + "loss": 1.3278, + "step": 3043 + }, + { + "epoch": 2.9591497302340675, + "grad_norm": 0.396484375, + "learning_rate": 1.7906129591713228e-09, + "loss": 1.3078, + "step": 3044 + }, + { + "epoch": 2.960123321569105, + "grad_norm": 0.400390625, + "learning_rate": 1.695141832833147e-09, + "loss": 1.3291, + "step": 3045 + }, + { + "epoch": 2.961096912904142, + "grad_norm": 0.390625, + "learning_rate": 1.6022851875974189e-09, + "loss": 1.338, + "step": 3046 + }, + { + "epoch": 2.962070504239179, + "grad_norm": 0.40234375, + "learning_rate": 1.5120431206391839e-09, + "loss": 1.3142, + "step": 3047 + }, + { + "epoch": 2.963044095574216, + "grad_norm": 0.3984375, + "learning_rate": 1.4244157263967884e-09, + "loss": 1.3227, + "step": 3048 + }, + { + "epoch": 2.964017686909253, + "grad_norm": 0.40234375, + "learning_rate": 1.3394030965724335e-09, + "loss": 1.2864, + "step": 3049 + }, + { + "epoch": 2.96499127824429, + "grad_norm": 0.39453125, + "learning_rate": 1.2570053201318987e-09, + "loss": 1.3075, + "step": 3050 + }, + { + "epoch": 2.9659648695793273, + "grad_norm": 0.40234375, + "learning_rate": 1.177222483305096e-09, + "loss": 1.3226, + "step": 3051 + }, + { + "epoch": 2.9669384609143643, + "grad_norm": 0.400390625, + "learning_rate": 1.100054669584405e-09, + "loss": 1.3121, + "step": 3052 + }, + { + "epoch": 2.967912052249402, + "grad_norm": 0.396484375, + "learning_rate": 1.0255019597266158e-09, + "loss": 1.3362, + "step": 3053 + }, + { + "epoch": 2.968885643584439, + "grad_norm": 0.40234375, + "learning_rate": 9.535644317507086e-10, + "loss": 1.3209, + "step": 3054 + }, + { + "epoch": 2.969859234919476, + "grad_norm": 0.404296875, + "learning_rate": 8.842421609397966e-10, + "loss": 1.3304, + "step": 3055 + }, + { + "epoch": 2.970832826254513, + "grad_norm": 0.40234375, + "learning_rate": 8.17535219839738e-10, + "loss": 1.3378, + "step": 3056 + }, + { + "epoch": 2.97180641758955, + "grad_norm": 0.40234375, + "learning_rate": 7.534436782594135e-10, + "loss": 1.3223, + "step": 3057 + }, + { + "epoch": 2.9727800089245875, + "grad_norm": 0.40625, + "learning_rate": 6.919676032710044e-10, + "loss": 1.327, + "step": 3058 + }, + { + "epoch": 2.9737536002596245, + "grad_norm": 0.3984375, + "learning_rate": 6.331070592088817e-10, + "loss": 1.3237, + "step": 3059 + }, + { + "epoch": 2.9747271915946616, + "grad_norm": 0.40234375, + "learning_rate": 5.768621076712721e-10, + "loss": 1.3338, + "step": 3060 + }, + { + "epoch": 2.9757007829296986, + "grad_norm": 0.39453125, + "learning_rate": 5.232328075180371e-10, + "loss": 1.3052, + "step": 3061 + }, + { + "epoch": 2.9766743742647357, + "grad_norm": 0.39453125, + "learning_rate": 4.722192148728932e-10, + "loss": 1.3246, + "step": 3062 + }, + { + "epoch": 2.9776479655997727, + "grad_norm": 0.40234375, + "learning_rate": 4.2382138312119237e-10, + "loss": 1.32, + "step": 3063 + }, + { + "epoch": 2.97862155693481, + "grad_norm": 0.396484375, + "learning_rate": 3.7803936291186395e-10, + "loss": 1.3072, + "step": 3064 + }, + { + "epoch": 2.979595148269847, + "grad_norm": 0.41015625, + "learning_rate": 3.3487320215547235e-10, + "loss": 1.3114, + "step": 3065 + }, + { + "epoch": 2.9805687396048843, + "grad_norm": 0.392578125, + "learning_rate": 2.9432294602560476e-10, + "loss": 1.313, + "step": 3066 + }, + { + "epoch": 2.9815423309399214, + "grad_norm": 0.396484375, + "learning_rate": 2.563886369583157e-10, + "loss": 1.3014, + "step": 3067 + }, + { + "epoch": 2.9825159222749584, + "grad_norm": 0.41015625, + "learning_rate": 2.2107031465185002e-10, + "loss": 1.3306, + "step": 3068 + }, + { + "epoch": 2.9834895136099955, + "grad_norm": 0.40234375, + "learning_rate": 1.8836801606664233e-10, + "loss": 1.3305, + "step": 3069 + }, + { + "epoch": 2.9844631049450325, + "grad_norm": 0.40625, + "learning_rate": 1.5828177542642765e-10, + "loss": 1.318, + "step": 3070 + }, + { + "epoch": 2.98543669628007, + "grad_norm": 0.400390625, + "learning_rate": 1.3081162421574312e-10, + "loss": 1.3064, + "step": 3071 + }, + { + "epoch": 2.986410287615107, + "grad_norm": 0.39453125, + "learning_rate": 1.059575911824262e-10, + "loss": 1.2982, + "step": 3072 + }, + { + "epoch": 2.987383878950144, + "grad_norm": 0.400390625, + "learning_rate": 8.37197023365044e-11, + "loss": 1.3013, + "step": 3073 + }, + { + "epoch": 2.988357470285181, + "grad_norm": 0.400390625, + "learning_rate": 6.40979809499176e-11, + "loss": 1.3008, + "step": 3074 + }, + { + "epoch": 2.989331061620218, + "grad_norm": 0.388671875, + "learning_rate": 4.7092447556518204e-11, + "loss": 1.3004, + "step": 3075 + }, + { + "epoch": 2.9903046529552553, + "grad_norm": 0.390625, + "learning_rate": 3.270311995262621e-11, + "loss": 1.3092, + "step": 3076 + }, + { + "epoch": 2.9912782442902923, + "grad_norm": 0.388671875, + "learning_rate": 2.0930013197306697e-11, + "loss": 1.297, + "step": 3077 + }, + { + "epoch": 2.9922518356253294, + "grad_norm": 0.396484375, + "learning_rate": 1.1773139610427031e-11, + "loss": 1.3032, + "step": 3078 + }, + { + "epoch": 2.993225426960367, + "grad_norm": 0.388671875, + "learning_rate": 5.232508775154799e-12, + "loss": 1.3081, + "step": 3079 + }, + { + "epoch": 2.994199018295404, + "grad_norm": 0.40234375, + "learning_rate": 1.308127536014947e-12, + "loss": 1.3196, + "step": 3080 + }, + { + "epoch": 2.995172609630441, + "grad_norm": 0.39453125, + "learning_rate": 0.0, + "loss": 1.3089, + "step": 3081 + } + ], + "logging_steps": 1, + "max_steps": 3081, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1027, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2.829516122582994e+19, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}