{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999207397622193, "eval_steps": 500, "global_step": 2838, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 54.60578280011217, "learning_rate": 3.4482758620689656e-07, "loss": 2.0852, "step": 1 }, { "epoch": 0.01, "grad_norm": 42.54745106429926, "learning_rate": 1.724137931034483e-06, "loss": 2.0381, "step": 5 }, { "epoch": 0.01, "grad_norm": 9.212853479586382, "learning_rate": 3.448275862068966e-06, "loss": 1.6632, "step": 10 }, { "epoch": 0.02, "grad_norm": 5.314912275996213, "learning_rate": 5.172413793103449e-06, "loss": 1.3468, "step": 15 }, { "epoch": 0.02, "grad_norm": 4.03433890035307, "learning_rate": 6.896551724137932e-06, "loss": 1.2538, "step": 20 }, { "epoch": 0.03, "grad_norm": 2.6447411660183016, "learning_rate": 8.620689655172414e-06, "loss": 1.2604, "step": 25 }, { "epoch": 0.03, "grad_norm": 2.1857065034640795, "learning_rate": 9.999996872939885e-06, "loss": 1.1935, "step": 30 }, { "epoch": 0.04, "grad_norm": 1.7526909326796276, "learning_rate": 9.999887426246524e-06, "loss": 1.1939, "step": 35 }, { "epoch": 0.04, "grad_norm": 1.5378704654179458, "learning_rate": 9.999621630458743e-06, "loss": 1.1626, "step": 40 }, { "epoch": 0.05, "grad_norm": 1.3071739468687105, "learning_rate": 9.999199493888118e-06, "loss": 1.122, "step": 45 }, { "epoch": 0.05, "grad_norm": 1.1982413397341172, "learning_rate": 9.998621029735082e-06, "loss": 1.1191, "step": 50 }, { "epoch": 0.06, "grad_norm": 1.0956550608235818, "learning_rate": 9.997886256088507e-06, "loss": 1.123, "step": 55 }, { "epoch": 0.06, "grad_norm": 1.0821639142212824, "learning_rate": 9.996995195925152e-06, "loss": 1.0751, "step": 60 }, { "epoch": 0.07, "grad_norm": 1.0865870956103822, "learning_rate": 9.995947877108933e-06, "loss": 1.114, "step": 65 }, { "epoch": 0.07, "grad_norm": 1.0582468505041362, "learning_rate": 9.99474433239006e-06, "loss": 1.0609, "step": 70 }, { "epoch": 0.08, "grad_norm": 1.0556893008132, "learning_rate": 9.993384599404001e-06, "loss": 1.0861, "step": 75 }, { "epoch": 0.08, "grad_norm": 1.053104954690895, "learning_rate": 9.991868720670322e-06, "loss": 1.0792, "step": 80 }, { "epoch": 0.09, "grad_norm": 1.069931327549148, "learning_rate": 9.990196743591341e-06, "loss": 1.0722, "step": 85 }, { "epoch": 0.1, "grad_norm": 1.0793525694046096, "learning_rate": 9.988368720450656e-06, "loss": 1.0561, "step": 90 }, { "epoch": 0.1, "grad_norm": 1.0449093375062701, "learning_rate": 9.986384708411507e-06, "loss": 1.0675, "step": 95 }, { "epoch": 0.11, "grad_norm": 0.9947694351867048, "learning_rate": 9.984244769514988e-06, "loss": 1.0395, "step": 100 }, { "epoch": 0.11, "grad_norm": 1.0281907914775885, "learning_rate": 9.981948970678107e-06, "loss": 1.0455, "step": 105 }, { "epoch": 0.12, "grad_norm": 1.0429607059919377, "learning_rate": 9.979497383691695e-06, "loss": 1.014, "step": 110 }, { "epoch": 0.12, "grad_norm": 1.1551790238118493, "learning_rate": 9.976890085218157e-06, "loss": 1.0335, "step": 115 }, { "epoch": 0.13, "grad_norm": 1.0958017153997939, "learning_rate": 9.974127156789082e-06, "loss": 1.0566, "step": 120 }, { "epoch": 0.13, "grad_norm": 1.0545164230640043, "learning_rate": 9.971208684802686e-06, "loss": 1.0234, "step": 125 }, { "epoch": 0.14, "grad_norm": 1.1119539198620567, "learning_rate": 9.968134760521114e-06, "loss": 0.9956, "step": 130 }, { "epoch": 0.14, "grad_norm": 1.045532676163788, "learning_rate": 9.964905480067585e-06, "loss": 1.0103, "step": 135 }, { "epoch": 0.15, "grad_norm": 1.0818099938062198, "learning_rate": 9.96152094442339e-06, "loss": 0.987, "step": 140 }, { "epoch": 0.15, "grad_norm": 1.06916616510137, "learning_rate": 9.957981259424724e-06, "loss": 1.0189, "step": 145 }, { "epoch": 0.16, "grad_norm": 1.1000812098052206, "learning_rate": 9.954286535759394e-06, "loss": 1.0025, "step": 150 }, { "epoch": 0.16, "grad_norm": 1.0740685860653156, "learning_rate": 9.950436888963337e-06, "loss": 1.0394, "step": 155 }, { "epoch": 0.17, "grad_norm": 1.0578416601226404, "learning_rate": 9.946432439417021e-06, "loss": 1.0419, "step": 160 }, { "epoch": 0.17, "grad_norm": 1.1378367934770748, "learning_rate": 9.942273312341679e-06, "loss": 1.04, "step": 165 }, { "epoch": 0.18, "grad_norm": 1.106141894903122, "learning_rate": 9.937959637795389e-06, "loss": 1.0112, "step": 170 }, { "epoch": 0.18, "grad_norm": 1.0459501547982482, "learning_rate": 9.93349155066901e-06, "loss": 0.9959, "step": 175 }, { "epoch": 0.19, "grad_norm": 1.1420602608538855, "learning_rate": 9.928869190681964e-06, "loss": 0.9952, "step": 180 }, { "epoch": 0.2, "grad_norm": 1.0748374838181862, "learning_rate": 9.924092702377863e-06, "loss": 1.0094, "step": 185 }, { "epoch": 0.2, "grad_norm": 1.0535011085546289, "learning_rate": 9.919162235119996e-06, "loss": 1.0054, "step": 190 }, { "epoch": 0.21, "grad_norm": 1.0310625793824704, "learning_rate": 9.91407794308665e-06, "loss": 1.0117, "step": 195 }, { "epoch": 0.21, "grad_norm": 1.0359842004906923, "learning_rate": 9.908839985266297e-06, "loss": 0.9982, "step": 200 }, { "epoch": 0.22, "grad_norm": 1.084059570369228, "learning_rate": 9.903448525452618e-06, "loss": 1.0127, "step": 205 }, { "epoch": 0.22, "grad_norm": 1.1227120329409497, "learning_rate": 9.89790373223938e-06, "loss": 1.048, "step": 210 }, { "epoch": 0.23, "grad_norm": 1.0615021518173307, "learning_rate": 9.892205779015167e-06, "loss": 1.0021, "step": 215 }, { "epoch": 0.23, "grad_norm": 1.070318683802529, "learning_rate": 9.886354843957953e-06, "loss": 1.0043, "step": 220 }, { "epoch": 0.24, "grad_norm": 1.0419755132096296, "learning_rate": 9.88035111002954e-06, "loss": 0.9743, "step": 225 }, { "epoch": 0.24, "grad_norm": 1.1796172322040084, "learning_rate": 9.874194764969827e-06, "loss": 0.9957, "step": 230 }, { "epoch": 0.25, "grad_norm": 1.0933963352790785, "learning_rate": 9.867886001290943e-06, "loss": 0.9814, "step": 235 }, { "epoch": 0.25, "grad_norm": 1.1066280030775704, "learning_rate": 9.861425016271227e-06, "loss": 0.9832, "step": 240 }, { "epoch": 0.26, "grad_norm": 1.1138948008724274, "learning_rate": 9.854812011949059e-06, "loss": 0.9871, "step": 245 }, { "epoch": 0.26, "grad_norm": 1.0644401239508805, "learning_rate": 9.848047195116543e-06, "loss": 0.9951, "step": 250 }, { "epoch": 0.27, "grad_norm": 1.1880183474724784, "learning_rate": 9.841130777313039e-06, "loss": 0.9902, "step": 255 }, { "epoch": 0.27, "grad_norm": 1.0747113009717828, "learning_rate": 9.834062974818547e-06, "loss": 0.9433, "step": 260 }, { "epoch": 0.28, "grad_norm": 1.1442114734348945, "learning_rate": 9.826844008646949e-06, "loss": 0.9703, "step": 265 }, { "epoch": 0.29, "grad_norm": 1.0895758630826766, "learning_rate": 9.81947410453909e-06, "loss": 1.0236, "step": 270 }, { "epoch": 0.29, "grad_norm": 0.996075250542336, "learning_rate": 9.811953492955728e-06, "loss": 0.9577, "step": 275 }, { "epoch": 0.3, "grad_norm": 1.1734623195649692, "learning_rate": 9.80428240907032e-06, "loss": 0.9752, "step": 280 }, { "epoch": 0.3, "grad_norm": 1.282701051609298, "learning_rate": 9.796461092761668e-06, "loss": 0.987, "step": 285 }, { "epoch": 0.31, "grad_norm": 1.0721992980205135, "learning_rate": 9.788489788606423e-06, "loss": 0.944, "step": 290 }, { "epoch": 0.31, "grad_norm": 1.105694230535082, "learning_rate": 9.780368745871438e-06, "loss": 0.9804, "step": 295 }, { "epoch": 0.32, "grad_norm": 1.1121587653939105, "learning_rate": 9.772098218505963e-06, "loss": 1.0099, "step": 300 }, { "epoch": 0.32, "grad_norm": 1.1073177873687883, "learning_rate": 9.763678465133712e-06, "loss": 0.9887, "step": 305 }, { "epoch": 0.33, "grad_norm": 1.1986141459298305, "learning_rate": 9.755109749044781e-06, "loss": 0.9749, "step": 310 }, { "epoch": 0.33, "grad_norm": 1.0864391212895972, "learning_rate": 9.7463923381874e-06, "loss": 0.9767, "step": 315 }, { "epoch": 0.34, "grad_norm": 1.0595953209575595, "learning_rate": 9.737526505159564e-06, "loss": 0.9297, "step": 320 }, { "epoch": 0.34, "grad_norm": 1.083224438455533, "learning_rate": 9.728512527200509e-06, "loss": 0.9498, "step": 325 }, { "epoch": 0.35, "grad_norm": 1.1306776282190978, "learning_rate": 9.719350686182041e-06, "loss": 0.982, "step": 330 }, { "epoch": 0.35, "grad_norm": 1.07939319367538, "learning_rate": 9.710041268599718e-06, "loss": 0.9669, "step": 335 }, { "epoch": 0.36, "grad_norm": 1.1100410279851476, "learning_rate": 9.700584565563897e-06, "loss": 0.956, "step": 340 }, { "epoch": 0.36, "grad_norm": 1.0917533373255544, "learning_rate": 9.690980872790627e-06, "loss": 0.9878, "step": 345 }, { "epoch": 0.37, "grad_norm": 1.1287494016251205, "learning_rate": 9.681230490592403e-06, "loss": 0.9604, "step": 350 }, { "epoch": 0.38, "grad_norm": 1.0366025693971206, "learning_rate": 9.671333723868773e-06, "loss": 0.9809, "step": 355 }, { "epoch": 0.38, "grad_norm": 1.1876939558601538, "learning_rate": 9.66129088209681e-06, "loss": 0.9324, "step": 360 }, { "epoch": 0.39, "grad_norm": 1.1296469706806582, "learning_rate": 9.651102279321429e-06, "loss": 0.98, "step": 365 }, { "epoch": 0.39, "grad_norm": 1.0920615981549329, "learning_rate": 9.640768234145563e-06, "loss": 0.9474, "step": 370 }, { "epoch": 0.4, "grad_norm": 1.045353192143218, "learning_rate": 9.630289069720213e-06, "loss": 0.9416, "step": 375 }, { "epoch": 0.4, "grad_norm": 1.0546831730532094, "learning_rate": 9.619665113734327e-06, "loss": 0.9583, "step": 380 }, { "epoch": 0.41, "grad_norm": 1.120397617115956, "learning_rate": 9.608896698404567e-06, "loss": 0.9739, "step": 385 }, { "epoch": 0.41, "grad_norm": 1.0897789727469696, "learning_rate": 9.597984160464908e-06, "loss": 0.9882, "step": 390 }, { "epoch": 0.42, "grad_norm": 1.0655227440534312, "learning_rate": 9.586927841156121e-06, "loss": 0.973, "step": 395 }, { "epoch": 0.42, "grad_norm": 1.024445190271631, "learning_rate": 9.575728086215093e-06, "loss": 0.9488, "step": 400 }, { "epoch": 0.43, "grad_norm": 1.0957551302719917, "learning_rate": 9.564385245864015e-06, "loss": 0.9395, "step": 405 }, { "epoch": 0.43, "grad_norm": 1.0348921383964815, "learning_rate": 9.552899674799438e-06, "loss": 0.9618, "step": 410 }, { "epoch": 0.44, "grad_norm": 1.1320917241343242, "learning_rate": 9.541271732181174e-06, "loss": 0.9737, "step": 415 }, { "epoch": 0.44, "grad_norm": 1.0955620287950987, "learning_rate": 9.52950178162107e-06, "loss": 0.9765, "step": 420 }, { "epoch": 0.45, "grad_norm": 1.0865957472837047, "learning_rate": 9.517590191171638e-06, "loss": 0.9402, "step": 425 }, { "epoch": 0.45, "grad_norm": 1.0608004961340336, "learning_rate": 9.505537333314534e-06, "loss": 0.938, "step": 430 }, { "epoch": 0.46, "grad_norm": 1.0436288259170787, "learning_rate": 9.493343584948931e-06, "loss": 0.9495, "step": 435 }, { "epoch": 0.46, "grad_norm": 1.0827000850655668, "learning_rate": 9.481009327379714e-06, "loss": 0.9505, "step": 440 }, { "epoch": 0.47, "grad_norm": 1.0958366892000795, "learning_rate": 9.46853494630557e-06, "loss": 0.9536, "step": 445 }, { "epoch": 0.48, "grad_norm": 1.0431220913897328, "learning_rate": 9.455920831806917e-06, "loss": 0.942, "step": 450 }, { "epoch": 0.48, "grad_norm": 1.1372655798293543, "learning_rate": 9.443167378333711e-06, "loss": 0.9447, "step": 455 }, { "epoch": 0.49, "grad_norm": 1.0890187843066097, "learning_rate": 9.43027498469311e-06, "loss": 0.9291, "step": 460 }, { "epoch": 0.49, "grad_norm": 1.128255566030822, "learning_rate": 9.41724405403701e-06, "loss": 0.9418, "step": 465 }, { "epoch": 0.5, "grad_norm": 1.0200134644324146, "learning_rate": 9.404074993849421e-06, "loss": 0.927, "step": 470 }, { "epoch": 0.5, "grad_norm": 1.0912622433950008, "learning_rate": 9.390768215933746e-06, "loss": 0.943, "step": 475 }, { "epoch": 0.51, "grad_norm": 1.1784430852167105, "learning_rate": 9.377324136399887e-06, "loss": 0.9409, "step": 480 }, { "epoch": 0.51, "grad_norm": 1.0732445497397998, "learning_rate": 9.36374317565124e-06, "loss": 0.9401, "step": 485 }, { "epoch": 0.52, "grad_norm": 1.1241973380928443, "learning_rate": 9.350025758371554e-06, "loss": 0.9188, "step": 490 }, { "epoch": 0.52, "grad_norm": 1.0680249447424572, "learning_rate": 9.336172313511636e-06, "loss": 0.9304, "step": 495 }, { "epoch": 0.53, "grad_norm": 1.0400938648362148, "learning_rate": 9.322183274275954e-06, "loss": 0.9465, "step": 500 }, { "epoch": 0.53, "grad_norm": 1.1484166178621282, "learning_rate": 9.308059078109078e-06, "loss": 0.9431, "step": 505 }, { "epoch": 0.54, "grad_norm": 1.0928763685485705, "learning_rate": 9.29380016668201e-06, "loss": 0.9368, "step": 510 }, { "epoch": 0.54, "grad_norm": 1.0470334802413224, "learning_rate": 9.279406985878367e-06, "loss": 0.9529, "step": 515 }, { "epoch": 0.55, "grad_norm": 1.055693577627048, "learning_rate": 9.264879985780436e-06, "loss": 0.9237, "step": 520 }, { "epoch": 0.55, "grad_norm": 1.0582407523485609, "learning_rate": 9.250219620655112e-06, "loss": 0.9455, "step": 525 }, { "epoch": 0.56, "grad_norm": 1.0392740863841614, "learning_rate": 9.235426348939674e-06, "loss": 0.9866, "step": 530 }, { "epoch": 0.57, "grad_norm": 1.087021743413759, "learning_rate": 9.220500633227467e-06, "loss": 0.9797, "step": 535 }, { "epoch": 0.57, "grad_norm": 1.0905659766649087, "learning_rate": 9.205442940253426e-06, "loss": 0.9231, "step": 540 }, { "epoch": 0.58, "grad_norm": 1.0838061353931883, "learning_rate": 9.190253740879484e-06, "loss": 0.9155, "step": 545 }, { "epoch": 0.58, "grad_norm": 1.1721559515157844, "learning_rate": 9.174933510079847e-06, "loss": 0.9132, "step": 550 }, { "epoch": 0.59, "grad_norm": 1.0711291424853389, "learning_rate": 9.159482726926147e-06, "loss": 0.9368, "step": 555 }, { "epoch": 0.59, "grad_norm": 1.0906836737125443, "learning_rate": 9.14390187457245e-06, "loss": 0.9652, "step": 560 }, { "epoch": 0.6, "grad_norm": 1.2147816750505283, "learning_rate": 9.128191440240159e-06, "loss": 0.922, "step": 565 }, { "epoch": 0.6, "grad_norm": 1.0745698856829782, "learning_rate": 9.11235191520277e-06, "loss": 0.9267, "step": 570 }, { "epoch": 0.61, "grad_norm": 1.1107563079565528, "learning_rate": 9.096383794770513e-06, "loss": 0.9403, "step": 575 }, { "epoch": 0.61, "grad_norm": 1.0645734678937102, "learning_rate": 9.080287578274866e-06, "loss": 0.9149, "step": 580 }, { "epoch": 0.62, "grad_norm": 1.1729380707889032, "learning_rate": 9.064063769052933e-06, "loss": 0.9236, "step": 585 }, { "epoch": 0.62, "grad_norm": 1.0634029251400858, "learning_rate": 9.047712874431716e-06, "loss": 0.9264, "step": 590 }, { "epoch": 0.63, "grad_norm": 1.185148731024843, "learning_rate": 9.031235405712239e-06, "loss": 0.9632, "step": 595 }, { "epoch": 0.63, "grad_norm": 1.1238661801404854, "learning_rate": 9.014631878153564e-06, "loss": 0.9364, "step": 600 }, { "epoch": 0.64, "grad_norm": 1.1101591200426506, "learning_rate": 8.997902810956682e-06, "loss": 0.9121, "step": 605 }, { "epoch": 0.64, "grad_norm": 1.1328306862765927, "learning_rate": 8.98104872724827e-06, "loss": 0.9637, "step": 610 }, { "epoch": 0.65, "grad_norm": 1.1182389860600772, "learning_rate": 8.964070154064343e-06, "loss": 0.9431, "step": 615 }, { "epoch": 0.66, "grad_norm": 1.2315329373588069, "learning_rate": 8.94696762233376e-06, "loss": 0.9261, "step": 620 }, { "epoch": 0.66, "grad_norm": 1.0785263989248792, "learning_rate": 8.92974166686163e-06, "loss": 0.9218, "step": 625 }, { "epoch": 0.67, "grad_norm": 1.0293877329539916, "learning_rate": 8.912392826312595e-06, "loss": 0.9516, "step": 630 }, { "epoch": 0.67, "grad_norm": 1.0797961930582287, "learning_rate": 8.894921643193966e-06, "loss": 0.94, "step": 635 }, { "epoch": 0.68, "grad_norm": 1.0052477432214972, "learning_rate": 8.877328663838776e-06, "loss": 0.9207, "step": 640 }, { "epoch": 0.68, "grad_norm": 1.0126272743426095, "learning_rate": 8.85961443838869e-06, "loss": 0.9292, "step": 645 }, { "epoch": 0.69, "grad_norm": 1.0166858946265631, "learning_rate": 8.841779520776803e-06, "loss": 0.9171, "step": 650 }, { "epoch": 0.69, "grad_norm": 1.0674058891203713, "learning_rate": 8.823824468710312e-06, "loss": 0.9238, "step": 655 }, { "epoch": 0.7, "grad_norm": 1.0826543746678357, "learning_rate": 8.805749843653086e-06, "loss": 0.8903, "step": 660 }, { "epoch": 0.7, "grad_norm": 1.0474293060948185, "learning_rate": 8.787556210808101e-06, "loss": 0.8952, "step": 665 }, { "epoch": 0.71, "grad_norm": 1.1092322508696293, "learning_rate": 8.769244139099774e-06, "loss": 0.9191, "step": 670 }, { "epoch": 0.71, "grad_norm": 1.0453618423472522, "learning_rate": 8.750814201156157e-06, "loss": 0.9287, "step": 675 }, { "epoch": 0.72, "grad_norm": 1.0150902528617922, "learning_rate": 8.732266973291053e-06, "loss": 0.9005, "step": 680 }, { "epoch": 0.72, "grad_norm": 1.111573072134849, "learning_rate": 8.713603035485972e-06, "loss": 0.9061, "step": 685 }, { "epoch": 0.73, "grad_norm": 1.0266552996471214, "learning_rate": 8.694822971372012e-06, "loss": 0.8981, "step": 690 }, { "epoch": 0.73, "grad_norm": 1.026959416886306, "learning_rate": 8.675927368211599e-06, "loss": 0.9119, "step": 695 }, { "epoch": 0.74, "grad_norm": 0.990879098356618, "learning_rate": 8.656916816880122e-06, "loss": 0.934, "step": 700 }, { "epoch": 0.75, "grad_norm": 1.016936193517629, "learning_rate": 8.637791911847462e-06, "loss": 0.9031, "step": 705 }, { "epoch": 0.75, "grad_norm": 1.0105346034407392, "learning_rate": 8.618553251159405e-06, "loss": 0.8918, "step": 710 }, { "epoch": 0.76, "grad_norm": 1.0219526658502593, "learning_rate": 8.599201436418927e-06, "loss": 0.9202, "step": 715 }, { "epoch": 0.76, "grad_norm": 1.0611008297726183, "learning_rate": 8.579737072767396e-06, "loss": 0.8956, "step": 720 }, { "epoch": 0.77, "grad_norm": 1.0532525094762688, "learning_rate": 8.560160768865642e-06, "loss": 0.8782, "step": 725 }, { "epoch": 0.77, "grad_norm": 1.0472370063073, "learning_rate": 8.540473136874926e-06, "loss": 0.9215, "step": 730 }, { "epoch": 0.78, "grad_norm": 1.0503901600633805, "learning_rate": 8.520674792437793e-06, "loss": 0.905, "step": 735 }, { "epoch": 0.78, "grad_norm": 1.0699401745712223, "learning_rate": 8.50076635465883e-06, "loss": 0.8914, "step": 740 }, { "epoch": 0.79, "grad_norm": 1.1604934245734189, "learning_rate": 8.480748446085293e-06, "loss": 0.923, "step": 745 }, { "epoch": 0.79, "grad_norm": 1.0575469862405844, "learning_rate": 8.460621692687656e-06, "loss": 0.91, "step": 750 }, { "epoch": 0.8, "grad_norm": 1.1861862918344839, "learning_rate": 8.44038672384002e-06, "loss": 0.9183, "step": 755 }, { "epoch": 0.8, "grad_norm": 1.0866238920331526, "learning_rate": 8.420044172300443e-06, "loss": 0.9012, "step": 760 }, { "epoch": 0.81, "grad_norm": 1.0963030089254635, "learning_rate": 8.399594674191147e-06, "loss": 0.8867, "step": 765 }, { "epoch": 0.81, "grad_norm": 1.0516263694748806, "learning_rate": 8.379038868978635e-06, "loss": 0.9204, "step": 770 }, { "epoch": 0.82, "grad_norm": 1.0602404388082067, "learning_rate": 8.358377399453684e-06, "loss": 0.8975, "step": 775 }, { "epoch": 0.82, "grad_norm": 1.0524212623827451, "learning_rate": 8.337610911711248e-06, "loss": 0.9182, "step": 780 }, { "epoch": 0.83, "grad_norm": 1.0486851629524967, "learning_rate": 8.316740055130263e-06, "loss": 0.8996, "step": 785 }, { "epoch": 0.83, "grad_norm": 1.0382393662171674, "learning_rate": 8.295765482353326e-06, "loss": 0.8898, "step": 790 }, { "epoch": 0.84, "grad_norm": 1.0801053233779676, "learning_rate": 8.274687849266295e-06, "loss": 0.8942, "step": 795 }, { "epoch": 0.85, "grad_norm": 1.082914632918619, "learning_rate": 8.253507814977779e-06, "loss": 0.9335, "step": 800 }, { "epoch": 0.85, "grad_norm": 1.115797305584172, "learning_rate": 8.232226041798528e-06, "loss": 0.8733, "step": 805 }, { "epoch": 0.86, "grad_norm": 1.0758274816242523, "learning_rate": 8.210843195220717e-06, "loss": 0.9121, "step": 810 }, { "epoch": 0.86, "grad_norm": 0.9966437564306923, "learning_rate": 8.189359943897137e-06, "loss": 0.9126, "step": 815 }, { "epoch": 0.87, "grad_norm": 1.1254388184304862, "learning_rate": 8.167776959620298e-06, "loss": 0.9113, "step": 820 }, { "epoch": 0.87, "grad_norm": 1.033615919920944, "learning_rate": 8.1460949173014e-06, "loss": 0.8863, "step": 825 }, { "epoch": 0.88, "grad_norm": 1.0126421627367477, "learning_rate": 8.124314494949247e-06, "loss": 0.9044, "step": 830 }, { "epoch": 0.88, "grad_norm": 1.0545539629522227, "learning_rate": 8.102436373649029e-06, "loss": 0.8942, "step": 835 }, { "epoch": 0.89, "grad_norm": 1.004956283976033, "learning_rate": 8.080461237541049e-06, "loss": 0.9255, "step": 840 }, { "epoch": 0.89, "grad_norm": 1.0862660155528163, "learning_rate": 8.0583897737993e-06, "loss": 0.9275, "step": 845 }, { "epoch": 0.9, "grad_norm": 1.0697124134441602, "learning_rate": 8.036222672609994e-06, "loss": 0.9161, "step": 850 }, { "epoch": 0.9, "grad_norm": 1.0639070724236763, "learning_rate": 8.013960627149981e-06, "loss": 0.8874, "step": 855 }, { "epoch": 0.91, "grad_norm": 1.166900094582672, "learning_rate": 7.991604333565062e-06, "loss": 0.8897, "step": 860 }, { "epoch": 0.91, "grad_norm": 1.1335592965754175, "learning_rate": 7.969154490948225e-06, "loss": 0.8964, "step": 865 }, { "epoch": 0.92, "grad_norm": 1.0520381511921073, "learning_rate": 7.946611801317794e-06, "loss": 0.8736, "step": 870 }, { "epoch": 0.92, "grad_norm": 1.16753848747216, "learning_rate": 7.923976969595459e-06, "loss": 0.9112, "step": 875 }, { "epoch": 0.93, "grad_norm": 1.0772133099773151, "learning_rate": 7.901250703584245e-06, "loss": 0.9155, "step": 880 }, { "epoch": 0.94, "grad_norm": 1.1464686627860388, "learning_rate": 7.878433713946373e-06, "loss": 0.8962, "step": 885 }, { "epoch": 0.94, "grad_norm": 1.0835779136854178, "learning_rate": 7.855526714181041e-06, "loss": 0.9058, "step": 890 }, { "epoch": 0.95, "grad_norm": 1.171366478493349, "learning_rate": 7.832530420602113e-06, "loss": 0.8756, "step": 895 }, { "epoch": 0.95, "grad_norm": 1.040168900901505, "learning_rate": 7.809445552315714e-06, "loss": 0.8594, "step": 900 }, { "epoch": 0.96, "grad_norm": 1.02166560480321, "learning_rate": 7.786272831197745e-06, "loss": 0.8935, "step": 905 }, { "epoch": 0.96, "grad_norm": 1.1107392454183416, "learning_rate": 7.763012981871314e-06, "loss": 0.904, "step": 910 }, { "epoch": 0.97, "grad_norm": 0.9896358057101541, "learning_rate": 7.739666731684073e-06, "loss": 0.9068, "step": 915 }, { "epoch": 0.97, "grad_norm": 0.9788741930391702, "learning_rate": 7.716234810685476e-06, "loss": 0.8846, "step": 920 }, { "epoch": 0.98, "grad_norm": 0.9931045191442167, "learning_rate": 7.692717951603942e-06, "loss": 0.8584, "step": 925 }, { "epoch": 0.98, "grad_norm": 1.0645481368236074, "learning_rate": 7.669116889823955e-06, "loss": 0.8992, "step": 930 }, { "epoch": 0.99, "grad_norm": 0.9816731950451545, "learning_rate": 7.645432363363057e-06, "loss": 0.8851, "step": 935 }, { "epoch": 0.99, "grad_norm": 0.9899142833993008, "learning_rate": 7.621665112848776e-06, "loss": 0.8845, "step": 940 }, { "epoch": 1.0, "grad_norm": 1.0638888300871174, "learning_rate": 7.597815881495465e-06, "loss": 0.8773, "step": 945 }, { "epoch": 1.0, "grad_norm": 1.031662431521578, "learning_rate": 7.573885415081059e-06, "loss": 0.8258, "step": 950 }, { "epoch": 1.01, "grad_norm": 1.040426497974828, "learning_rate": 7.54987446192376e-06, "loss": 0.7907, "step": 955 }, { "epoch": 1.01, "grad_norm": 0.9887566903005512, "learning_rate": 7.525783772858624e-06, "loss": 0.8091, "step": 960 }, { "epoch": 1.02, "grad_norm": 1.0542179478307365, "learning_rate": 7.5016141012141e-06, "loss": 0.7815, "step": 965 }, { "epoch": 1.03, "grad_norm": 1.0738731959256824, "learning_rate": 7.477366202788456e-06, "loss": 0.7734, "step": 970 }, { "epoch": 1.03, "grad_norm": 0.9975806760235982, "learning_rate": 7.45304083582616e-06, "loss": 0.7824, "step": 975 }, { "epoch": 1.04, "grad_norm": 1.005274019925314, "learning_rate": 7.4286387609941544e-06, "loss": 0.769, "step": 980 }, { "epoch": 1.04, "grad_norm": 1.0937329481520819, "learning_rate": 7.40416074135808e-06, "loss": 0.791, "step": 985 }, { "epoch": 1.05, "grad_norm": 0.9987999174071854, "learning_rate": 7.379607542358414e-06, "loss": 0.7983, "step": 990 }, { "epoch": 1.05, "grad_norm": 1.074721973505265, "learning_rate": 7.3549799317865235e-06, "loss": 0.8264, "step": 995 }, { "epoch": 1.06, "grad_norm": 1.0023766389640552, "learning_rate": 7.330278679760673e-06, "loss": 0.8166, "step": 1000 }, { "epoch": 1.06, "grad_norm": 1.0263488491446793, "learning_rate": 7.3055045587019315e-06, "loss": 0.7756, "step": 1005 }, { "epoch": 1.07, "grad_norm": 1.222252310199244, "learning_rate": 7.280658343310016e-06, "loss": 0.8113, "step": 1010 }, { "epoch": 1.07, "grad_norm": 1.0803171037496995, "learning_rate": 7.255740810539078e-06, "loss": 0.7773, "step": 1015 }, { "epoch": 1.08, "grad_norm": 1.0429385720996782, "learning_rate": 7.230752739573398e-06, "loss": 0.7959, "step": 1020 }, { "epoch": 1.08, "grad_norm": 1.0525788357504489, "learning_rate": 7.205694911803019e-06, "loss": 0.7962, "step": 1025 }, { "epoch": 1.09, "grad_norm": 0.986228023483833, "learning_rate": 7.18056811079932e-06, "loss": 0.79, "step": 1030 }, { "epoch": 1.09, "grad_norm": 1.031179895714868, "learning_rate": 7.155373122290508e-06, "loss": 0.8101, "step": 1035 }, { "epoch": 1.1, "grad_norm": 1.0379629517770603, "learning_rate": 7.13011073413705e-06, "loss": 0.781, "step": 1040 }, { "epoch": 1.1, "grad_norm": 1.033153108919124, "learning_rate": 7.1047817363070325e-06, "loss": 0.8418, "step": 1045 }, { "epoch": 1.11, "grad_norm": 1.0357203376239867, "learning_rate": 7.079386920851466e-06, "loss": 0.8065, "step": 1050 }, { "epoch": 1.11, "grad_norm": 1.0540192082846203, "learning_rate": 7.053927081879505e-06, "loss": 0.7956, "step": 1055 }, { "epoch": 1.12, "grad_norm": 1.0552828635725824, "learning_rate": 7.0284030155336315e-06, "loss": 0.7945, "step": 1060 }, { "epoch": 1.13, "grad_norm": 0.9810627289945896, "learning_rate": 7.002815519964745e-06, "loss": 0.7965, "step": 1065 }, { "epoch": 1.13, "grad_norm": 1.0916102744452092, "learning_rate": 6.977165395307215e-06, "loss": 0.7991, "step": 1070 }, { "epoch": 1.14, "grad_norm": 1.1543690326062077, "learning_rate": 6.951453443653852e-06, "loss": 0.7896, "step": 1075 }, { "epoch": 1.14, "grad_norm": 1.1170103600405488, "learning_rate": 6.9256804690308276e-06, "loss": 0.7828, "step": 1080 }, { "epoch": 1.15, "grad_norm": 1.0526733296614392, "learning_rate": 6.899847277372538e-06, "loss": 0.7923, "step": 1085 }, { "epoch": 1.15, "grad_norm": 1.0770254342023697, "learning_rate": 6.873954676496395e-06, "loss": 0.8128, "step": 1090 }, { "epoch": 1.16, "grad_norm": 1.037705594081886, "learning_rate": 6.848003476077567e-06, "loss": 0.7856, "step": 1095 }, { "epoch": 1.16, "grad_norm": 1.0319807068181204, "learning_rate": 6.8219944876236645e-06, "loss": 0.7949, "step": 1100 }, { "epoch": 1.17, "grad_norm": 1.0927555007584646, "learning_rate": 6.795928524449354e-06, "loss": 0.7941, "step": 1105 }, { "epoch": 1.17, "grad_norm": 0.9869897993273156, "learning_rate": 6.769806401650936e-06, "loss": 0.7667, "step": 1110 }, { "epoch": 1.18, "grad_norm": 1.0055956062759406, "learning_rate": 6.743628936080852e-06, "loss": 0.7855, "step": 1115 }, { "epoch": 1.18, "grad_norm": 1.0283367881989096, "learning_rate": 6.717396946322137e-06, "loss": 0.7745, "step": 1120 }, { "epoch": 1.19, "grad_norm": 1.0345829389670045, "learning_rate": 6.6911112526628295e-06, "loss": 0.7842, "step": 1125 }, { "epoch": 1.19, "grad_norm": 1.0711135328845822, "learning_rate": 6.664772677070316e-06, "loss": 0.7558, "step": 1130 }, { "epoch": 1.2, "grad_norm": 0.9877769296594265, "learning_rate": 6.638382043165628e-06, "loss": 0.7788, "step": 1135 }, { "epoch": 1.2, "grad_norm": 1.131836138091609, "learning_rate": 6.611940176197688e-06, "loss": 0.7901, "step": 1140 }, { "epoch": 1.21, "grad_norm": 1.058249641590972, "learning_rate": 6.585447903017506e-06, "loss": 0.7936, "step": 1145 }, { "epoch": 1.22, "grad_norm": 1.073971008814511, "learning_rate": 6.558906052052314e-06, "loss": 0.7835, "step": 1150 }, { "epoch": 1.22, "grad_norm": 1.0491301969369466, "learning_rate": 6.532315453279673e-06, "loss": 0.7902, "step": 1155 }, { "epoch": 1.23, "grad_norm": 1.046297097483487, "learning_rate": 6.505676938201512e-06, "loss": 0.7767, "step": 1160 }, { "epoch": 1.23, "grad_norm": 1.046022517875942, "learning_rate": 6.478991339818128e-06, "loss": 0.8091, "step": 1165 }, { "epoch": 1.24, "grad_norm": 1.0086633248074561, "learning_rate": 6.4522594926021355e-06, "loss": 0.7797, "step": 1170 }, { "epoch": 1.24, "grad_norm": 1.0965955454651117, "learning_rate": 6.425482232472377e-06, "loss": 0.7702, "step": 1175 }, { "epoch": 1.25, "grad_norm": 1.0362189192150881, "learning_rate": 6.3986603967677805e-06, "loss": 0.7931, "step": 1180 }, { "epoch": 1.25, "grad_norm": 1.110468197330772, "learning_rate": 6.371794824221173e-06, "loss": 0.7917, "step": 1185 }, { "epoch": 1.26, "grad_norm": 1.0163659020071605, "learning_rate": 6.344886354933058e-06, "loss": 0.7886, "step": 1190 }, { "epoch": 1.26, "grad_norm": 1.0115549227695064, "learning_rate": 6.3179358303453386e-06, "loss": 0.7511, "step": 1195 }, { "epoch": 1.27, "grad_norm": 1.0872016119161863, "learning_rate": 6.290944093215016e-06, "loss": 0.8036, "step": 1200 }, { "epoch": 1.27, "grad_norm": 1.0553500518484338, "learning_rate": 6.263911987587822e-06, "loss": 0.7938, "step": 1205 }, { "epoch": 1.28, "grad_norm": 0.993815270148442, "learning_rate": 6.236840358771837e-06, "loss": 0.7788, "step": 1210 }, { "epoch": 1.28, "grad_norm": 1.0605675582324252, "learning_rate": 6.20973005331105e-06, "loss": 0.7781, "step": 1215 }, { "epoch": 1.29, "grad_norm": 1.0965085071552372, "learning_rate": 6.1825819189588885e-06, "loss": 0.7872, "step": 1220 }, { "epoch": 1.29, "grad_norm": 1.040866195350916, "learning_rate": 6.155396804651714e-06, "loss": 0.7966, "step": 1225 }, { "epoch": 1.3, "grad_norm": 1.0593376609536802, "learning_rate": 6.128175560482264e-06, "loss": 0.7832, "step": 1230 }, { "epoch": 1.31, "grad_norm": 1.0081718313330637, "learning_rate": 6.1009190376730785e-06, "loss": 0.7772, "step": 1235 }, { "epoch": 1.31, "grad_norm": 0.9892554397828908, "learning_rate": 6.07362808854988e-06, "loss": 0.7856, "step": 1240 }, { "epoch": 1.32, "grad_norm": 1.0515874983049542, "learning_rate": 6.046303566514919e-06, "loss": 0.7812, "step": 1245 }, { "epoch": 1.32, "grad_norm": 1.01738547568124, "learning_rate": 6.018946326020287e-06, "loss": 0.7824, "step": 1250 }, { "epoch": 1.33, "grad_norm": 0.992994982201507, "learning_rate": 5.991557222541201e-06, "loss": 0.7842, "step": 1255 }, { "epoch": 1.33, "grad_norm": 0.9928822859609259, "learning_rate": 5.964137112549251e-06, "loss": 0.7906, "step": 1260 }, { "epoch": 1.34, "grad_norm": 1.0673862770846931, "learning_rate": 5.9366868534856115e-06, "loss": 0.7896, "step": 1265 }, { "epoch": 1.34, "grad_norm": 1.0627251705995355, "learning_rate": 5.909207303734241e-06, "loss": 0.7965, "step": 1270 }, { "epoch": 1.35, "grad_norm": 1.0050051635503012, "learning_rate": 5.881699322595031e-06, "loss": 0.7775, "step": 1275 }, { "epoch": 1.35, "grad_norm": 1.0049258262531797, "learning_rate": 5.854163770256934e-06, "loss": 0.7659, "step": 1280 }, { "epoch": 1.36, "grad_norm": 1.1097225296353777, "learning_rate": 5.826601507771073e-06, "loss": 0.7699, "step": 1285 }, { "epoch": 1.36, "grad_norm": 1.0610730723756006, "learning_rate": 5.799013397023806e-06, "loss": 0.7996, "step": 1290 }, { "epoch": 1.37, "grad_norm": 1.0285633823079718, "learning_rate": 5.771400300709785e-06, "loss": 0.7829, "step": 1295 }, { "epoch": 1.37, "grad_norm": 1.0484599021027985, "learning_rate": 5.743763082304973e-06, "loss": 0.7619, "step": 1300 }, { "epoch": 1.38, "grad_norm": 1.0137701786577156, "learning_rate": 5.7161026060396375e-06, "loss": 0.798, "step": 1305 }, { "epoch": 1.38, "grad_norm": 1.0289414598602742, "learning_rate": 5.688419736871341e-06, "loss": 0.7827, "step": 1310 }, { "epoch": 1.39, "grad_norm": 1.058376335913828, "learning_rate": 5.660715340457874e-06, "loss": 0.7921, "step": 1315 }, { "epoch": 1.39, "grad_norm": 1.0011219088912342, "learning_rate": 5.632990283130204e-06, "loss": 0.781, "step": 1320 }, { "epoch": 1.4, "grad_norm": 0.984264955084216, "learning_rate": 5.605245431865368e-06, "loss": 0.7772, "step": 1325 }, { "epoch": 1.41, "grad_norm": 1.0151072044919451, "learning_rate": 5.577481654259377e-06, "loss": 0.7735, "step": 1330 }, { "epoch": 1.41, "grad_norm": 1.063533843295668, "learning_rate": 5.549699818500074e-06, "loss": 0.7682, "step": 1335 }, { "epoch": 1.42, "grad_norm": 1.0434635789190496, "learning_rate": 5.521900793339989e-06, "loss": 0.7915, "step": 1340 }, { "epoch": 1.42, "grad_norm": 1.0587561050751115, "learning_rate": 5.494085448069181e-06, "loss": 0.7997, "step": 1345 }, { "epoch": 1.43, "grad_norm": 1.0758864296233028, "learning_rate": 5.466254652488036e-06, "loss": 0.7964, "step": 1350 }, { "epoch": 1.43, "grad_norm": 1.0556751372323996, "learning_rate": 5.438409276880089e-06, "loss": 0.8062, "step": 1355 }, { "epoch": 1.44, "grad_norm": 0.9792859835280993, "learning_rate": 5.410550191984798e-06, "loss": 0.787, "step": 1360 }, { "epoch": 1.44, "grad_norm": 1.0231438624972786, "learning_rate": 5.3826782689703115e-06, "loss": 0.7803, "step": 1365 }, { "epoch": 1.45, "grad_norm": 1.0660534726358564, "learning_rate": 5.354794379406242e-06, "loss": 0.78, "step": 1370 }, { "epoch": 1.45, "grad_norm": 0.9527414539128428, "learning_rate": 5.3268993952363936e-06, "loss": 0.796, "step": 1375 }, { "epoch": 1.46, "grad_norm": 0.9870931434726852, "learning_rate": 5.29899418875151e-06, "loss": 0.7652, "step": 1380 }, { "epoch": 1.46, "grad_norm": 1.0537299945885146, "learning_rate": 5.271079632561992e-06, "loss": 0.7854, "step": 1385 }, { "epoch": 1.47, "grad_norm": 1.1396368040574916, "learning_rate": 5.243156599570606e-06, "loss": 0.7617, "step": 1390 }, { "epoch": 1.47, "grad_norm": 1.0924704024745873, "learning_rate": 5.2152259629451986e-06, "loss": 0.7713, "step": 1395 }, { "epoch": 1.48, "grad_norm": 1.021493417245078, "learning_rate": 5.18728859609138e-06, "loss": 0.7609, "step": 1400 }, { "epoch": 1.48, "grad_norm": 1.0148194958691719, "learning_rate": 5.159345372625223e-06, "loss": 0.7788, "step": 1405 }, { "epoch": 1.49, "grad_norm": 1.0402765811164951, "learning_rate": 5.131397166345938e-06, "loss": 0.7599, "step": 1410 }, { "epoch": 1.5, "grad_norm": 0.9966250584272072, "learning_rate": 5.103444851208549e-06, "loss": 0.7874, "step": 1415 }, { "epoch": 1.5, "grad_norm": 0.9871275158697829, "learning_rate": 5.075489301296567e-06, "loss": 0.7566, "step": 1420 }, { "epoch": 1.51, "grad_norm": 1.0896451679213162, "learning_rate": 5.047531390794661e-06, "loss": 0.7699, "step": 1425 }, { "epoch": 1.51, "grad_norm": 1.1203863877988638, "learning_rate": 5.019571993961307e-06, "loss": 0.8088, "step": 1430 }, { "epoch": 1.52, "grad_norm": 1.03311513179617, "learning_rate": 4.9916119851014664e-06, "loss": 0.7739, "step": 1435 }, { "epoch": 1.52, "grad_norm": 1.0389351009988612, "learning_rate": 4.96365223853924e-06, "loss": 0.7816, "step": 1440 }, { "epoch": 1.53, "grad_norm": 0.9960641498632878, "learning_rate": 4.93569362859052e-06, "loss": 0.775, "step": 1445 }, { "epoch": 1.53, "grad_norm": 0.9388823495229471, "learning_rate": 4.907737029535664e-06, "loss": 0.756, "step": 1450 }, { "epoch": 1.54, "grad_norm": 1.0662538022442485, "learning_rate": 4.8797833155921396e-06, "loss": 0.7992, "step": 1455 }, { "epoch": 1.54, "grad_norm": 1.0350212904727674, "learning_rate": 4.8518333608872015e-06, "loss": 0.7595, "step": 1460 }, { "epoch": 1.55, "grad_norm": 0.9967538128228846, "learning_rate": 4.823888039430551e-06, "loss": 0.7582, "step": 1465 }, { "epoch": 1.55, "grad_norm": 1.0139079612075497, "learning_rate": 4.795948225087001e-06, "loss": 0.7709, "step": 1470 }, { "epoch": 1.56, "grad_norm": 1.0510044388149635, "learning_rate": 4.7680147915491585e-06, "loss": 0.7692, "step": 1475 }, { "epoch": 1.56, "grad_norm": 1.0641353890612333, "learning_rate": 4.740088612310096e-06, "loss": 0.7847, "step": 1480 }, { "epoch": 1.57, "grad_norm": 1.0192435995305715, "learning_rate": 4.7121705606360424e-06, "loss": 0.7732, "step": 1485 }, { "epoch": 1.57, "grad_norm": 1.0076325415256413, "learning_rate": 4.684261509539072e-06, "loss": 0.7701, "step": 1490 }, { "epoch": 1.58, "grad_norm": 0.9707102286396411, "learning_rate": 4.65636233174981e-06, "loss": 0.77, "step": 1495 }, { "epoch": 1.59, "grad_norm": 1.0835636202474823, "learning_rate": 4.628473899690133e-06, "loss": 0.7849, "step": 1500 }, { "epoch": 1.59, "grad_norm": 1.0157410126136626, "learning_rate": 4.600597085445894e-06, "loss": 0.784, "step": 1505 }, { "epoch": 1.6, "grad_norm": 1.0616186913926178, "learning_rate": 4.572732760739653e-06, "loss": 0.7785, "step": 1510 }, { "epoch": 1.6, "grad_norm": 1.006516145178769, "learning_rate": 4.5448817969034165e-06, "loss": 0.7753, "step": 1515 }, { "epoch": 1.61, "grad_norm": 1.0480529823653495, "learning_rate": 4.517045064851386e-06, "loss": 0.7989, "step": 1520 }, { "epoch": 1.61, "grad_norm": 1.0432567441250045, "learning_rate": 4.489223435052732e-06, "loss": 0.7946, "step": 1525 }, { "epoch": 1.62, "grad_norm": 1.0461342178531015, "learning_rate": 4.461417777504363e-06, "loss": 0.7676, "step": 1530 }, { "epoch": 1.62, "grad_norm": 1.0045382622138492, "learning_rate": 4.433628961703733e-06, "loss": 0.7651, "step": 1535 }, { "epoch": 1.63, "grad_norm": 0.9890094489435823, "learning_rate": 4.405857856621644e-06, "loss": 0.7943, "step": 1540 }, { "epoch": 1.63, "grad_norm": 1.0127639919495397, "learning_rate": 4.378105330675074e-06, "loss": 0.7895, "step": 1545 }, { "epoch": 1.64, "grad_norm": 1.0398544121817734, "learning_rate": 4.350372251700025e-06, "loss": 0.8004, "step": 1550 }, { "epoch": 1.64, "grad_norm": 1.037857459368961, "learning_rate": 4.322659486924373e-06, "loss": 0.7963, "step": 1555 }, { "epoch": 1.65, "grad_norm": 1.106103919813531, "learning_rate": 4.294967902940768e-06, "loss": 0.787, "step": 1560 }, { "epoch": 1.65, "grad_norm": 1.0865617469424886, "learning_rate": 4.267298365679522e-06, "loss": 0.788, "step": 1565 }, { "epoch": 1.66, "grad_norm": 1.0303226290700802, "learning_rate": 4.239651740381534e-06, "loss": 0.7642, "step": 1570 }, { "epoch": 1.66, "grad_norm": 1.0512505166055992, "learning_rate": 4.212028891571237e-06, "loss": 0.7832, "step": 1575 }, { "epoch": 1.67, "grad_norm": 1.0750316874597787, "learning_rate": 4.184430683029552e-06, "loss": 0.7599, "step": 1580 }, { "epoch": 1.68, "grad_norm": 1.0622608820174235, "learning_rate": 4.156857977766896e-06, "loss": 0.7841, "step": 1585 }, { "epoch": 1.68, "grad_norm": 1.0023528643121005, "learning_rate": 4.129311637996182e-06, "loss": 0.7845, "step": 1590 }, { "epoch": 1.69, "grad_norm": 1.0597451506484419, "learning_rate": 4.101792525105857e-06, "loss": 0.7802, "step": 1595 }, { "epoch": 1.69, "grad_norm": 0.9622973096022323, "learning_rate": 4.0743014996329764e-06, "loss": 0.7678, "step": 1600 }, { "epoch": 1.7, "grad_norm": 1.051095411122212, "learning_rate": 4.046839421236276e-06, "loss": 0.7972, "step": 1605 }, { "epoch": 1.7, "grad_norm": 1.0082128589578265, "learning_rate": 4.019407148669312e-06, "loss": 0.7948, "step": 1610 }, { "epoch": 1.71, "grad_norm": 1.0901759578931909, "learning_rate": 3.992005539753592e-06, "loss": 0.7914, "step": 1615 }, { "epoch": 1.71, "grad_norm": 1.0584302499373435, "learning_rate": 3.964635451351758e-06, "loss": 0.7821, "step": 1620 }, { "epoch": 1.72, "grad_norm": 1.043189384648134, "learning_rate": 3.937297739340783e-06, "loss": 0.778, "step": 1625 }, { "epoch": 1.72, "grad_norm": 1.0245392793145456, "learning_rate": 3.909993258585219e-06, "loss": 0.7908, "step": 1630 }, { "epoch": 1.73, "grad_norm": 1.0082519645854728, "learning_rate": 3.882722862910458e-06, "loss": 0.7793, "step": 1635 }, { "epoch": 1.73, "grad_norm": 1.0211341337802105, "learning_rate": 3.8554874050760345e-06, "loss": 0.8042, "step": 1640 }, { "epoch": 1.74, "grad_norm": 0.9920127978660441, "learning_rate": 3.828287736748957e-06, "loss": 0.758, "step": 1645 }, { "epoch": 1.74, "grad_norm": 1.0187229111502758, "learning_rate": 3.8011247084770754e-06, "loss": 0.7986, "step": 1650 }, { "epoch": 1.75, "grad_norm": 0.9982295207578855, "learning_rate": 3.773999169662489e-06, "loss": 0.7623, "step": 1655 }, { "epoch": 1.75, "grad_norm": 1.025180441312379, "learning_rate": 3.746911968534982e-06, "loss": 0.7454, "step": 1660 }, { "epoch": 1.76, "grad_norm": 0.9884338430346545, "learning_rate": 3.7198639521254988e-06, "loss": 0.7671, "step": 1665 }, { "epoch": 1.76, "grad_norm": 0.9685352318412103, "learning_rate": 3.6928559662396574e-06, "loss": 0.7583, "step": 1670 }, { "epoch": 1.77, "grad_norm": 1.029404957630594, "learning_rate": 3.6658888554312967e-06, "loss": 0.7868, "step": 1675 }, { "epoch": 1.78, "grad_norm": 0.9921023940146521, "learning_rate": 3.6389634629760763e-06, "loss": 0.7555, "step": 1680 }, { "epoch": 1.78, "grad_norm": 1.017350986680598, "learning_rate": 3.612080630845096e-06, "loss": 0.7905, "step": 1685 }, { "epoch": 1.79, "grad_norm": 1.0430603602540587, "learning_rate": 3.5852411996785776e-06, "loss": 0.7947, "step": 1690 }, { "epoch": 1.79, "grad_norm": 0.9737056004061376, "learning_rate": 3.558446008759569e-06, "loss": 0.7789, "step": 1695 }, { "epoch": 1.8, "grad_norm": 1.0212119960635129, "learning_rate": 3.5316958959876985e-06, "loss": 0.7671, "step": 1700 }, { "epoch": 1.8, "grad_norm": 1.0072141418910243, "learning_rate": 3.504991697852983e-06, "loss": 0.7844, "step": 1705 }, { "epoch": 1.81, "grad_norm": 1.059809521658242, "learning_rate": 3.4783342494096627e-06, "loss": 0.7845, "step": 1710 }, { "epoch": 1.81, "grad_norm": 1.032182317108509, "learning_rate": 3.451724384250091e-06, "loss": 0.7792, "step": 1715 }, { "epoch": 1.82, "grad_norm": 0.9779053888998924, "learning_rate": 3.4251629344786675e-06, "loss": 0.7591, "step": 1720 }, { "epoch": 1.82, "grad_norm": 1.0116163318504925, "learning_rate": 3.398650730685813e-06, "loss": 0.7556, "step": 1725 }, { "epoch": 1.83, "grad_norm": 1.0511489470052602, "learning_rate": 3.372188601922006e-06, "loss": 0.7637, "step": 1730 }, { "epoch": 1.83, "grad_norm": 1.0172930500825146, "learning_rate": 3.3457773756718513e-06, "loss": 0.7696, "step": 1735 }, { "epoch": 1.84, "grad_norm": 1.039493994412079, "learning_rate": 3.3194178778282046e-06, "loss": 0.7931, "step": 1740 }, { "epoch": 1.84, "grad_norm": 1.033662637919394, "learning_rate": 3.293110932666349e-06, "loss": 0.7692, "step": 1745 }, { "epoch": 1.85, "grad_norm": 1.0584694868797393, "learning_rate": 3.2668573628182145e-06, "loss": 0.7792, "step": 1750 }, { "epoch": 1.85, "grad_norm": 0.994626270021195, "learning_rate": 3.2406579892466582e-06, "loss": 0.7682, "step": 1755 }, { "epoch": 1.86, "grad_norm": 0.9270237802993908, "learning_rate": 3.2145136312197943e-06, "loss": 0.7552, "step": 1760 }, { "epoch": 1.87, "grad_norm": 2.0595234604236357, "learning_rate": 3.18842510628537e-06, "loss": 0.7749, "step": 1765 }, { "epoch": 1.87, "grad_norm": 1.0396319816767299, "learning_rate": 3.162393230245203e-06, "loss": 0.804, "step": 1770 }, { "epoch": 1.88, "grad_norm": 1.0214462086054552, "learning_rate": 3.1364188171296677e-06, "loss": 0.7744, "step": 1775 }, { "epoch": 1.88, "grad_norm": 1.0145502545771508, "learning_rate": 3.110502679172246e-06, "loss": 0.7824, "step": 1780 }, { "epoch": 1.89, "grad_norm": 1.0196641711891408, "learning_rate": 3.084645626784124e-06, "loss": 0.7745, "step": 1785 }, { "epoch": 1.89, "grad_norm": 1.0197064636159427, "learning_rate": 3.058848468528852e-06, "loss": 0.8031, "step": 1790 }, { "epoch": 1.9, "grad_norm": 0.9907125667454302, "learning_rate": 3.03311201109706e-06, "loss": 0.7919, "step": 1795 }, { "epoch": 1.9, "grad_norm": 1.017942513059757, "learning_rate": 3.0074370592812286e-06, "loss": 0.7907, "step": 1800 }, { "epoch": 1.91, "grad_norm": 1.0821499695866912, "learning_rate": 2.9818244159505265e-06, "loss": 0.7901, "step": 1805 }, { "epoch": 1.91, "grad_norm": 0.9934394662674368, "learning_rate": 2.956274882025706e-06, "loss": 0.7638, "step": 1810 }, { "epoch": 1.92, "grad_norm": 1.0313411208961847, "learning_rate": 2.930789256454052e-06, "loss": 0.7553, "step": 1815 }, { "epoch": 1.92, "grad_norm": 0.9950833531614097, "learning_rate": 2.905368336184406e-06, "loss": 0.7576, "step": 1820 }, { "epoch": 1.93, "grad_norm": 0.9936896686220547, "learning_rate": 2.8800129161422365e-06, "loss": 0.7671, "step": 1825 }, { "epoch": 1.93, "grad_norm": 0.9909860465997411, "learning_rate": 2.8547237892047852e-06, "loss": 0.74, "step": 1830 }, { "epoch": 1.94, "grad_norm": 0.9788752840880554, "learning_rate": 2.8295017461762806e-06, "loss": 0.767, "step": 1835 }, { "epoch": 1.94, "grad_norm": 0.9764110020200104, "learning_rate": 2.804347575763193e-06, "loss": 0.7668, "step": 1840 }, { "epoch": 1.95, "grad_norm": 0.9772254707929505, "learning_rate": 2.7792620645495917e-06, "loss": 0.7425, "step": 1845 }, { "epoch": 1.96, "grad_norm": 1.0000854462976456, "learning_rate": 2.7542459969725215e-06, "loss": 0.7466, "step": 1850 }, { "epoch": 1.96, "grad_norm": 1.0352323998365711, "learning_rate": 2.729300155297504e-06, "loss": 0.771, "step": 1855 }, { "epoch": 1.97, "grad_norm": 0.9811051893834364, "learning_rate": 2.704425319594049e-06, "loss": 0.7778, "step": 1860 }, { "epoch": 1.97, "grad_norm": 1.0284677234046133, "learning_rate": 2.6796222677112825e-06, "loss": 0.7796, "step": 1865 }, { "epoch": 1.98, "grad_norm": 0.9664217044137716, "learning_rate": 2.6548917752535997e-06, "loss": 0.771, "step": 1870 }, { "epoch": 1.98, "grad_norm": 1.0008524753186703, "learning_rate": 2.6302346155564385e-06, "loss": 0.7963, "step": 1875 }, { "epoch": 1.99, "grad_norm": 1.0088045948631796, "learning_rate": 2.6056515596620715e-06, "loss": 0.7571, "step": 1880 }, { "epoch": 1.99, "grad_norm": 0.9727997698934588, "learning_rate": 2.581143376295516e-06, "loss": 0.7968, "step": 1885 }, { "epoch": 2.0, "grad_norm": 0.9760428822299934, "learning_rate": 2.556710831840481e-06, "loss": 0.7829, "step": 1890 }, { "epoch": 2.0, "grad_norm": 1.1893585643467264, "learning_rate": 2.5323546903154074e-06, "loss": 0.7363, "step": 1895 }, { "epoch": 2.01, "grad_norm": 1.0408498899558132, "learning_rate": 2.508075713349575e-06, "loss": 0.683, "step": 1900 }, { "epoch": 2.01, "grad_norm": 1.0852218097728863, "learning_rate": 2.483874660159294e-06, "loss": 0.6388, "step": 1905 }, { "epoch": 2.02, "grad_norm": 1.0636193658435114, "learning_rate": 2.45975228752415e-06, "loss": 0.6785, "step": 1910 }, { "epoch": 2.02, "grad_norm": 1.05164052954354, "learning_rate": 2.435709349763354e-06, "loss": 0.7024, "step": 1915 }, { "epoch": 2.03, "grad_norm": 1.0744751292672923, "learning_rate": 2.4117465987121357e-06, "loss": 0.6714, "step": 1920 }, { "epoch": 2.03, "grad_norm": 1.0221167769747221, "learning_rate": 2.387864783698258e-06, "loss": 0.6441, "step": 1925 }, { "epoch": 2.04, "grad_norm": 1.0453109653021675, "learning_rate": 2.3640646515185596e-06, "loss": 0.6668, "step": 1930 }, { "epoch": 2.04, "grad_norm": 1.0035196656143317, "learning_rate": 2.3403469464156235e-06, "loss": 0.6711, "step": 1935 }, { "epoch": 2.05, "grad_norm": 1.0614923887712562, "learning_rate": 2.31671241005449e-06, "loss": 0.6801, "step": 1940 }, { "epoch": 2.06, "grad_norm": 1.0457688195463548, "learning_rate": 2.2931617814994704e-06, "loss": 0.6676, "step": 1945 }, { "epoch": 2.06, "grad_norm": 1.094973586743587, "learning_rate": 2.269695797191032e-06, "loss": 0.6467, "step": 1950 }, { "epoch": 2.07, "grad_norm": 1.0312304548353073, "learning_rate": 2.2463151909227804e-06, "loss": 0.6626, "step": 1955 }, { "epoch": 2.07, "grad_norm": 1.0435526510546405, "learning_rate": 2.223020693818495e-06, "loss": 0.6565, "step": 1960 }, { "epoch": 2.08, "grad_norm": 1.0361388218534178, "learning_rate": 2.1998130343092866e-06, "loss": 0.655, "step": 1965 }, { "epoch": 2.08, "grad_norm": 1.071971382261616, "learning_rate": 2.176692938110801e-06, "loss": 0.6628, "step": 1970 }, { "epoch": 2.09, "grad_norm": 1.0449189624346316, "learning_rate": 2.1536611282005374e-06, "loss": 0.6742, "step": 1975 }, { "epoch": 2.09, "grad_norm": 1.0076278447431801, "learning_rate": 2.130718324795234e-06, "loss": 0.6615, "step": 1980 }, { "epoch": 2.1, "grad_norm": 1.044357139317297, "learning_rate": 2.107865245328354e-06, "loss": 0.6707, "step": 1985 }, { "epoch": 2.1, "grad_norm": 1.0155250644507565, "learning_rate": 2.0851026044276405e-06, "loss": 0.6701, "step": 1990 }, { "epoch": 2.11, "grad_norm": 1.012020172763002, "learning_rate": 2.0624311138927795e-06, "loss": 0.6531, "step": 1995 }, { "epoch": 2.11, "grad_norm": 1.0209851165233697, "learning_rate": 2.0398514826731326e-06, "loss": 0.6685, "step": 2000 }, { "epoch": 2.12, "grad_norm": 1.0147123852944229, "learning_rate": 2.017364416845579e-06, "loss": 0.6506, "step": 2005 }, { "epoch": 2.12, "grad_norm": 1.06994559921509, "learning_rate": 1.9949706195924235e-06, "loss": 0.6743, "step": 2010 }, { "epoch": 2.13, "grad_norm": 0.9930487524595831, "learning_rate": 1.97267079117942e-06, "loss": 0.6596, "step": 2015 }, { "epoch": 2.13, "grad_norm": 1.0334858708046972, "learning_rate": 1.950465628933863e-06, "loss": 0.6679, "step": 2020 }, { "epoch": 2.14, "grad_norm": 1.060064879245556, "learning_rate": 1.9283558272227866e-06, "loss": 0.6749, "step": 2025 }, { "epoch": 2.15, "grad_norm": 1.0171368650427, "learning_rate": 1.9063420774312509e-06, "loss": 0.6703, "step": 2030 }, { "epoch": 2.15, "grad_norm": 0.9646165360014197, "learning_rate": 1.8844250679407272e-06, "loss": 0.6878, "step": 2035 }, { "epoch": 2.16, "grad_norm": 1.0209055430674492, "learning_rate": 1.862605484107562e-06, "loss": 0.7052, "step": 2040 }, { "epoch": 2.16, "grad_norm": 1.0216869737250995, "learning_rate": 1.840884008241549e-06, "loss": 0.6778, "step": 2045 }, { "epoch": 2.17, "grad_norm": 0.990030094537176, "learning_rate": 1.819261319584602e-06, "loss": 0.675, "step": 2050 }, { "epoch": 2.17, "grad_norm": 0.9972968188321764, "learning_rate": 1.7977380942895007e-06, "loss": 0.6832, "step": 2055 }, { "epoch": 2.18, "grad_norm": 1.002919858574642, "learning_rate": 1.7763150053987532e-06, "loss": 0.6669, "step": 2060 }, { "epoch": 2.18, "grad_norm": 1.040641077805689, "learning_rate": 1.7549927228235547e-06, "loss": 0.6874, "step": 2065 }, { "epoch": 2.19, "grad_norm": 1.0136593089712416, "learning_rate": 1.7337719133228308e-06, "loss": 0.6662, "step": 2070 }, { "epoch": 2.19, "grad_norm": 1.0032381970613455, "learning_rate": 1.7126532404823898e-06, "loss": 0.657, "step": 2075 }, { "epoch": 2.2, "grad_norm": 1.0107311218156156, "learning_rate": 1.6916373646941774e-06, "loss": 0.6706, "step": 2080 }, { "epoch": 2.2, "grad_norm": 1.0313882769598175, "learning_rate": 1.6707249431356188e-06, "loss": 0.6803, "step": 2085 }, { "epoch": 2.21, "grad_norm": 1.0013867402651844, "learning_rate": 1.6499166297490716e-06, "loss": 0.6896, "step": 2090 }, { "epoch": 2.21, "grad_norm": 0.9974367112606389, "learning_rate": 1.6292130752213747e-06, "loss": 0.6773, "step": 2095 }, { "epoch": 2.22, "grad_norm": 1.0457782650116, "learning_rate": 1.6086149269635081e-06, "loss": 0.668, "step": 2100 }, { "epoch": 2.22, "grad_norm": 0.9930241935385495, "learning_rate": 1.5881228290903367e-06, "loss": 0.6508, "step": 2105 }, { "epoch": 2.23, "grad_norm": 1.0059354322817335, "learning_rate": 1.5677374224004793e-06, "loss": 0.6529, "step": 2110 }, { "epoch": 2.24, "grad_norm": 1.0338579100235163, "learning_rate": 1.547459344356262e-06, "loss": 0.6614, "step": 2115 }, { "epoch": 2.24, "grad_norm": 1.0203126239591027, "learning_rate": 1.5272892290637892e-06, "loss": 0.6749, "step": 2120 }, { "epoch": 2.25, "grad_norm": 0.983643586611109, "learning_rate": 1.5072277072531127e-06, "loss": 0.6517, "step": 2125 }, { "epoch": 2.25, "grad_norm": 1.0203957676102433, "learning_rate": 1.4872754062585126e-06, "loss": 0.6716, "step": 2130 }, { "epoch": 2.26, "grad_norm": 1.036201909144992, "learning_rate": 1.4674329499988737e-06, "loss": 0.6574, "step": 2135 }, { "epoch": 2.26, "grad_norm": 1.0277085537623492, "learning_rate": 1.4477009589581787e-06, "loss": 0.6593, "step": 2140 }, { "epoch": 2.27, "grad_norm": 0.9713425669443266, "learning_rate": 1.4280800501661057e-06, "loss": 0.6621, "step": 2145 }, { "epoch": 2.27, "grad_norm": 1.028497947768737, "learning_rate": 1.408570837178735e-06, "loss": 0.6656, "step": 2150 }, { "epoch": 2.28, "grad_norm": 1.0565632370972053, "learning_rate": 1.3891739300593559e-06, "loss": 0.6644, "step": 2155 }, { "epoch": 2.28, "grad_norm": 1.0043346444991121, "learning_rate": 1.369889935359402e-06, "loss": 0.6539, "step": 2160 }, { "epoch": 2.29, "grad_norm": 1.0294689299797029, "learning_rate": 1.3507194560994657e-06, "loss": 0.6666, "step": 2165 }, { "epoch": 2.29, "grad_norm": 1.0123495429792864, "learning_rate": 1.331663091750463e-06, "loss": 0.6928, "step": 2170 }, { "epoch": 2.3, "grad_norm": 0.9951164224382856, "learning_rate": 1.312721438214869e-06, "loss": 0.6501, "step": 2175 }, { "epoch": 2.3, "grad_norm": 1.025832661356824, "learning_rate": 1.293895087808098e-06, "loss": 0.6658, "step": 2180 }, { "epoch": 2.31, "grad_norm": 0.9888366700648139, "learning_rate": 1.2751846292399705e-06, "loss": 0.6592, "step": 2185 }, { "epoch": 2.31, "grad_norm": 1.0208359350524125, "learning_rate": 1.2565906475963102e-06, "loss": 0.6483, "step": 2190 }, { "epoch": 2.32, "grad_norm": 1.0568986951058392, "learning_rate": 1.2381137243206455e-06, "loss": 0.6557, "step": 2195 }, { "epoch": 2.32, "grad_norm": 0.9849389521844061, "learning_rate": 1.2197544371960317e-06, "loss": 0.6488, "step": 2200 }, { "epoch": 2.33, "grad_norm": 1.0466426799607875, "learning_rate": 1.2015133603269753e-06, "loss": 0.6596, "step": 2205 }, { "epoch": 2.34, "grad_norm": 0.9985742048846067, "learning_rate": 1.183391064121493e-06, "loss": 0.6572, "step": 2210 }, { "epoch": 2.34, "grad_norm": 0.9661312369342807, "learning_rate": 1.1653881152732582e-06, "loss": 0.6439, "step": 2215 }, { "epoch": 2.35, "grad_norm": 1.0327058718249167, "learning_rate": 1.1475050767439e-06, "loss": 0.6811, "step": 2220 }, { "epoch": 2.35, "grad_norm": 1.0365200638536969, "learning_rate": 1.129742507745382e-06, "loss": 0.6588, "step": 2225 }, { "epoch": 2.36, "grad_norm": 0.9804079029045045, "learning_rate": 1.1121009637225283e-06, "loss": 0.6783, "step": 2230 }, { "epoch": 2.36, "grad_norm": 1.0326866018136251, "learning_rate": 1.0945809963356442e-06, "loss": 0.6705, "step": 2235 }, { "epoch": 2.37, "grad_norm": 1.0314679157662048, "learning_rate": 1.0771831534432714e-06, "loss": 0.6353, "step": 2240 }, { "epoch": 2.37, "grad_norm": 0.9589889108924486, "learning_rate": 1.0599079790850542e-06, "loss": 0.655, "step": 2245 }, { "epoch": 2.38, "grad_norm": 0.9894914192305704, "learning_rate": 1.0427560134647308e-06, "loss": 0.643, "step": 2250 }, { "epoch": 2.38, "grad_norm": 1.0693419775513076, "learning_rate": 1.0257277929332332e-06, "loss": 0.6611, "step": 2255 }, { "epoch": 2.39, "grad_norm": 0.9951590219864285, "learning_rate": 1.0088238499719254e-06, "loss": 0.6403, "step": 2260 }, { "epoch": 2.39, "grad_norm": 1.0105626202971048, "learning_rate": 9.920447131759392e-07, "loss": 0.6707, "step": 2265 }, { "epoch": 2.4, "grad_norm": 1.0186289750333066, "learning_rate": 9.753909072376594e-07, "loss": 0.6809, "step": 2270 }, { "epoch": 2.4, "grad_norm": 1.0267980845318398, "learning_rate": 9.58862952930304e-07, "loss": 0.6642, "step": 2275 }, { "epoch": 2.41, "grad_norm": 1.0314667402705489, "learning_rate": 9.424613670916499e-07, "loss": 0.6815, "step": 2280 }, { "epoch": 2.41, "grad_norm": 0.9818510396592551, "learning_rate": 9.261866626078625e-07, "loss": 0.6579, "step": 2285 }, { "epoch": 2.42, "grad_norm": 0.998040916561116, "learning_rate": 9.100393483974612e-07, "loss": 0.6815, "step": 2290 }, { "epoch": 2.43, "grad_norm": 1.007529165875462, "learning_rate": 8.940199293954033e-07, "loss": 0.6609, "step": 2295 }, { "epoch": 2.43, "grad_norm": 1.0489165413908048, "learning_rate": 8.781289065373016e-07, "loss": 0.6661, "step": 2300 }, { "epoch": 2.44, "grad_norm": 1.0586483881635766, "learning_rate": 8.623667767437483e-07, "loss": 0.6494, "step": 2305 }, { "epoch": 2.44, "grad_norm": 0.970861929985865, "learning_rate": 8.467340329047874e-07, "loss": 0.6403, "step": 2310 }, { "epoch": 2.45, "grad_norm": 1.0315170437890622, "learning_rate": 8.312311638644888e-07, "loss": 0.6802, "step": 2315 }, { "epoch": 2.45, "grad_norm": 1.018615901485097, "learning_rate": 8.158586544056791e-07, "loss": 0.6813, "step": 2320 }, { "epoch": 2.46, "grad_norm": 0.9991739019084611, "learning_rate": 8.00616985234764e-07, "loss": 0.6757, "step": 2325 }, { "epoch": 2.46, "grad_norm": 1.039226698329409, "learning_rate": 7.855066329667121e-07, "loss": 0.6421, "step": 2330 }, { "epoch": 2.47, "grad_norm": 1.0505394427255816, "learning_rate": 7.705280701101392e-07, "loss": 0.6655, "step": 2335 }, { "epoch": 2.47, "grad_norm": 0.9750027460632938, "learning_rate": 7.556817650525383e-07, "loss": 0.6526, "step": 2340 }, { "epoch": 2.48, "grad_norm": 0.989246982143368, "learning_rate": 7.409681820456315e-07, "loss": 0.667, "step": 2345 }, { "epoch": 2.48, "grad_norm": 0.9977414734019189, "learning_rate": 7.263877811908553e-07, "loss": 0.6647, "step": 2350 }, { "epoch": 2.49, "grad_norm": 0.9875292562685886, "learning_rate": 7.11941018424967e-07, "loss": 0.667, "step": 2355 }, { "epoch": 2.49, "grad_norm": 0.9932801930288735, "learning_rate": 6.97628345505797e-07, "loss": 0.6511, "step": 2360 }, { "epoch": 2.5, "grad_norm": 1.0199295886729471, "learning_rate": 6.83450209998106e-07, "loss": 0.6556, "step": 2365 }, { "epoch": 2.5, "grad_norm": 1.0279710885988984, "learning_rate": 6.694070552596105e-07, "loss": 0.6676, "step": 2370 }, { "epoch": 2.51, "grad_norm": 1.0221845787587531, "learning_rate": 6.554993204270993e-07, "loss": 0.6512, "step": 2375 }, { "epoch": 2.52, "grad_norm": 0.9597530531552908, "learning_rate": 6.417274404027163e-07, "loss": 0.6482, "step": 2380 }, { "epoch": 2.52, "grad_norm": 1.0201542647464452, "learning_rate": 6.280918458403506e-07, "loss": 0.6623, "step": 2385 }, { "epoch": 2.53, "grad_norm": 0.9818765108255797, "learning_rate": 6.14592963132174e-07, "loss": 0.6599, "step": 2390 }, { "epoch": 2.53, "grad_norm": 1.0020031777534095, "learning_rate": 6.012312143953075e-07, "loss": 0.6818, "step": 2395 }, { "epoch": 2.54, "grad_norm": 1.020601700800406, "learning_rate": 5.880070174586228e-07, "loss": 0.6794, "step": 2400 }, { "epoch": 2.54, "grad_norm": 0.9781529112263975, "learning_rate": 5.74920785849673e-07, "loss": 0.6612, "step": 2405 }, { "epoch": 2.55, "grad_norm": 1.020456830272749, "learning_rate": 5.619729287817621e-07, "loss": 0.6638, "step": 2410 }, { "epoch": 2.55, "grad_norm": 1.0134058298180835, "learning_rate": 5.49163851141154e-07, "loss": 0.6468, "step": 2415 }, { "epoch": 2.56, "grad_norm": 1.0051724307379968, "learning_rate": 5.36493953474404e-07, "loss": 0.6411, "step": 2420 }, { "epoch": 2.56, "grad_norm": 0.9963926377815217, "learning_rate": 5.239636319758356e-07, "loss": 0.668, "step": 2425 }, { "epoch": 2.57, "grad_norm": 0.9731428272925532, "learning_rate": 5.115732784751576e-07, "loss": 0.6444, "step": 2430 }, { "epoch": 2.57, "grad_norm": 1.0185774017291327, "learning_rate": 4.993232804252018e-07, "loss": 0.6529, "step": 2435 }, { "epoch": 2.58, "grad_norm": 1.00711656230006, "learning_rate": 4.872140208898118e-07, "loss": 0.6539, "step": 2440 }, { "epoch": 2.58, "grad_norm": 1.0045164786035452, "learning_rate": 4.7524587853186866e-07, "loss": 0.6629, "step": 2445 }, { "epoch": 2.59, "grad_norm": 0.9961645157673277, "learning_rate": 4.634192276014399e-07, "loss": 0.6738, "step": 2450 }, { "epoch": 2.59, "grad_norm": 1.0214318273829783, "learning_rate": 4.5173443792408625e-07, "loss": 0.6552, "step": 2455 }, { "epoch": 2.6, "grad_norm": 1.0163355618069994, "learning_rate": 4.4019187488928914e-07, "loss": 0.6638, "step": 2460 }, { "epoch": 2.61, "grad_norm": 1.032574771687925, "learning_rate": 4.2879189943903335e-07, "loss": 0.6877, "step": 2465 }, { "epoch": 2.61, "grad_norm": 0.9930486578442914, "learning_rate": 4.1753486805651e-07, "loss": 0.6832, "step": 2470 }, { "epoch": 2.62, "grad_norm": 0.969259241462703, "learning_rate": 4.064211327549794e-07, "loss": 0.6738, "step": 2475 }, { "epoch": 2.62, "grad_norm": 1.018380412495952, "learning_rate": 3.95451041066755e-07, "loss": 0.671, "step": 2480 }, { "epoch": 2.63, "grad_norm": 0.9735720562840744, "learning_rate": 3.8462493603234064e-07, "loss": 0.6433, "step": 2485 }, { "epoch": 2.63, "grad_norm": 1.023935871901339, "learning_rate": 3.739431561897011e-07, "loss": 0.6593, "step": 2490 }, { "epoch": 2.64, "grad_norm": 0.9931869209408388, "learning_rate": 3.634060355636798e-07, "loss": 0.6647, "step": 2495 }, { "epoch": 2.64, "grad_norm": 1.0007736035504975, "learning_rate": 3.53013903655548e-07, "loss": 0.6683, "step": 2500 }, { "epoch": 2.65, "grad_norm": 0.9926593135266999, "learning_rate": 3.427670854327042e-07, "loss": 0.6668, "step": 2505 }, { "epoch": 2.65, "grad_norm": 0.9870259704326787, "learning_rate": 3.3266590131851296e-07, "loss": 0.6583, "step": 2510 }, { "epoch": 2.66, "grad_norm": 1.0298553599069395, "learning_rate": 3.227106671822849e-07, "loss": 0.6835, "step": 2515 }, { "epoch": 2.66, "grad_norm": 0.9915918166378904, "learning_rate": 3.1290169432939556e-07, "loss": 0.6428, "step": 2520 }, { "epoch": 2.67, "grad_norm": 1.060474012796049, "learning_rate": 3.03239289491557e-07, "loss": 0.6571, "step": 2525 }, { "epoch": 2.67, "grad_norm": 1.0203183687136719, "learning_rate": 2.937237548172206e-07, "loss": 0.6511, "step": 2530 }, { "epoch": 2.68, "grad_norm": 0.989507237700814, "learning_rate": 2.8435538786213134e-07, "loss": 0.6746, "step": 2535 }, { "epoch": 2.68, "grad_norm": 0.9853274639882493, "learning_rate": 2.7513448158002334e-07, "loss": 0.6657, "step": 2540 }, { "epoch": 2.69, "grad_norm": 0.9957797339050202, "learning_rate": 2.66061324313458e-07, "loss": 0.6496, "step": 2545 }, { "epoch": 2.69, "grad_norm": 1.0073836211394178, "learning_rate": 2.5713619978480653e-07, "loss": 0.6596, "step": 2550 }, { "epoch": 2.7, "grad_norm": 0.9798969178233458, "learning_rate": 2.483593870873829e-07, "loss": 0.654, "step": 2555 }, { "epoch": 2.71, "grad_norm": 0.9936847658098146, "learning_rate": 2.3973116067670665e-07, "loss": 0.6457, "step": 2560 }, { "epoch": 2.71, "grad_norm": 1.0224466038654803, "learning_rate": 2.3125179036193214e-07, "loss": 0.6572, "step": 2565 }, { "epoch": 2.72, "grad_norm": 1.0378183041017084, "learning_rate": 2.2292154129740117e-07, "loss": 0.6554, "step": 2570 }, { "epoch": 2.72, "grad_norm": 0.9787357607930246, "learning_rate": 2.147406739743596e-07, "loss": 0.6689, "step": 2575 }, { "epoch": 2.73, "grad_norm": 1.003947207260689, "learning_rate": 2.0670944421280646e-07, "loss": 0.6458, "step": 2580 }, { "epoch": 2.73, "grad_norm": 1.0063190015667964, "learning_rate": 1.9882810315349554e-07, "loss": 0.6648, "step": 2585 }, { "epoch": 2.74, "grad_norm": 1.0148103533053272, "learning_rate": 1.9109689725008317e-07, "loss": 0.6738, "step": 2590 }, { "epoch": 2.74, "grad_norm": 1.0122729219524842, "learning_rate": 1.8351606826142176e-07, "loss": 0.6796, "step": 2595 }, { "epoch": 2.75, "grad_norm": 1.0170129872933447, "learning_rate": 1.7608585324399684e-07, "loss": 0.6798, "step": 2600 }, { "epoch": 2.75, "grad_norm": 0.992464215850126, "learning_rate": 1.688064845445192e-07, "loss": 0.6695, "step": 2605 }, { "epoch": 2.76, "grad_norm": 0.9778375876093532, "learning_rate": 1.6167818979265282e-07, "loss": 0.6563, "step": 2610 }, { "epoch": 2.76, "grad_norm": 1.0165595693382412, "learning_rate": 1.5470119189390342e-07, "loss": 0.6709, "step": 2615 }, { "epoch": 2.77, "grad_norm": 0.9846854115443192, "learning_rate": 1.4787570902264293e-07, "loss": 0.6468, "step": 2620 }, { "epoch": 2.77, "grad_norm": 1.0226129803358943, "learning_rate": 1.4120195461529097e-07, "loss": 0.6699, "step": 2625 }, { "epoch": 2.78, "grad_norm": 1.0082916511837874, "learning_rate": 1.3468013736363694e-07, "loss": 0.6516, "step": 2630 }, { "epoch": 2.78, "grad_norm": 1.0086534086914538, "learning_rate": 1.2831046120831692e-07, "loss": 0.6483, "step": 2635 }, { "epoch": 2.79, "grad_norm": 0.9957571698657345, "learning_rate": 1.2209312533243535e-07, "loss": 0.6632, "step": 2640 }, { "epoch": 2.8, "grad_norm": 1.0298383480420663, "learning_rate": 1.1602832415533616e-07, "loss": 0.6645, "step": 2645 }, { "epoch": 2.8, "grad_norm": 1.0188314052602203, "learning_rate": 1.1011624732652437e-07, "loss": 0.6752, "step": 2650 }, { "epoch": 2.81, "grad_norm": 1.0019681746822835, "learning_rate": 1.0435707971973297e-07, "loss": 0.6573, "step": 2655 }, { "epoch": 2.81, "grad_norm": 0.9926022445477827, "learning_rate": 9.875100142714478e-08, "loss": 0.6396, "step": 2660 }, { "epoch": 2.82, "grad_norm": 0.9847567872289796, "learning_rate": 9.329818775376088e-08, "loss": 0.672, "step": 2665 }, { "epoch": 2.82, "grad_norm": 1.0103069579844817, "learning_rate": 8.79988092119144e-08, "loss": 0.678, "step": 2670 }, { "epoch": 2.83, "grad_norm": 1.0092463732513441, "learning_rate": 8.285303151594537e-08, "loss": 0.6837, "step": 2675 }, { "epoch": 2.83, "grad_norm": 1.0032753352403014, "learning_rate": 7.786101557701209e-08, "loss": 0.6494, "step": 2680 }, { "epoch": 2.84, "grad_norm": 1.0278927407365124, "learning_rate": 7.302291749806345e-08, "loss": 0.6597, "step": 2685 }, { "epoch": 2.84, "grad_norm": 0.9985234255556347, "learning_rate": 6.833888856895676e-08, "loss": 0.6672, "step": 2690 }, { "epoch": 2.85, "grad_norm": 1.0086435046290338, "learning_rate": 6.380907526172597e-08, "loss": 0.6768, "step": 2695 }, { "epoch": 2.85, "grad_norm": 0.9639413787477988, "learning_rate": 5.943361922600255e-08, "loss": 0.6346, "step": 2700 }, { "epoch": 2.86, "grad_norm": 0.9898392259409212, "learning_rate": 5.521265728458347e-08, "loss": 0.6655, "step": 2705 }, { "epoch": 2.86, "grad_norm": 1.0000733408715612, "learning_rate": 5.114632142915687e-08, "loss": 0.638, "step": 2710 }, { "epoch": 2.87, "grad_norm": 0.990452054352071, "learning_rate": 4.723473881617147e-08, "loss": 0.6583, "step": 2715 }, { "epoch": 2.87, "grad_norm": 0.988717000145255, "learning_rate": 4.347803176286025e-08, "loss": 0.6708, "step": 2720 }, { "epoch": 2.88, "grad_norm": 0.9868081897157113, "learning_rate": 3.98763177434186e-08, "loss": 0.6583, "step": 2725 }, { "epoch": 2.89, "grad_norm": 1.001603936622736, "learning_rate": 3.642970938532553e-08, "loss": 0.6754, "step": 2730 }, { "epoch": 2.89, "grad_norm": 1.0028854813842756, "learning_rate": 3.313831446582816e-08, "loss": 0.6784, "step": 2735 }, { "epoch": 2.9, "grad_norm": 0.9840591494137083, "learning_rate": 3.000223590856666e-08, "loss": 0.6651, "step": 2740 }, { "epoch": 2.9, "grad_norm": 1.0425902900408417, "learning_rate": 2.7021571780356804e-08, "loss": 0.6489, "step": 2745 }, { "epoch": 2.91, "grad_norm": 1.0016271763738829, "learning_rate": 2.419641528812522e-08, "loss": 0.6501, "step": 2750 }, { "epoch": 2.91, "grad_norm": 0.9875844742537229, "learning_rate": 2.1526854775992255e-08, "loss": 0.667, "step": 2755 }, { "epoch": 2.92, "grad_norm": 0.9909068409835267, "learning_rate": 1.901297372251143e-08, "loss": 0.6649, "step": 2760 }, { "epoch": 2.92, "grad_norm": 1.0200770120528766, "learning_rate": 1.665485073805817e-08, "loss": 0.6542, "step": 2765 }, { "epoch": 2.93, "grad_norm": 0.9699214260408161, "learning_rate": 1.4452559562370683e-08, "loss": 0.6644, "step": 2770 }, { "epoch": 2.93, "grad_norm": 0.9962874170809767, "learning_rate": 1.2406169062246232e-08, "loss": 0.6502, "step": 2775 }, { "epoch": 2.94, "grad_norm": 1.0264867036759864, "learning_rate": 1.0515743229385645e-08, "loss": 0.6698, "step": 2780 }, { "epoch": 2.94, "grad_norm": 1.0133222133442825, "learning_rate": 8.781341178393244e-09, "loss": 0.6723, "step": 2785 }, { "epoch": 2.95, "grad_norm": 1.0159129157737807, "learning_rate": 7.203017144927771e-09, "loss": 0.6561, "step": 2790 }, { "epoch": 2.95, "grad_norm": 0.9931795490054022, "learning_rate": 5.780820484007632e-09, "loss": 0.6563, "step": 2795 }, { "epoch": 2.96, "grad_norm": 1.0195254872888724, "learning_rate": 4.514795668466576e-09, "loss": 0.6808, "step": 2800 }, { "epoch": 2.96, "grad_norm": 1.0210108366337896, "learning_rate": 3.4049822875614757e-09, "loss": 0.6723, "step": 2805 }, { "epoch": 2.97, "grad_norm": 0.9891130306027911, "learning_rate": 2.4514150457377594e-09, "loss": 0.6763, "step": 2810 }, { "epoch": 2.97, "grad_norm": 0.9876265686294937, "learning_rate": 1.654123761541393e-09, "loss": 0.6652, "step": 2815 }, { "epoch": 2.98, "grad_norm": 0.9719073327336301, "learning_rate": 1.0131333666885124e-09, "loss": 0.6793, "step": 2820 }, { "epoch": 2.99, "grad_norm": 1.004648101535836, "learning_rate": 5.284639052832718e-10, "loss": 0.6643, "step": 2825 }, { "epoch": 2.99, "grad_norm": 1.0172517540637482, "learning_rate": 2.0013053319334341e-10, "loss": 0.6768, "step": 2830 }, { "epoch": 3.0, "grad_norm": 0.9650966122076953, "learning_rate": 2.814351757529643e-11, "loss": 0.6356, "step": 2835 }, { "epoch": 3.0, "step": 2838, "total_flos": 1471706245890048.0, "train_loss": 0.8058284866381398, "train_runtime": 31310.8966, "train_samples_per_second": 5.802, "train_steps_per_second": 0.091 } ], "logging_steps": 5, "max_steps": 2838, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1471706245890048.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }