{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 1075, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009302325581395349, "grad_norm": 11.875020980834961, "learning_rate": 1.8518518518518518e-07, "loss": 2.051, "step": 1 }, { "epoch": 0.004651162790697674, "grad_norm": 10.683406829833984, "learning_rate": 9.259259259259259e-07, "loss": 2.08, "step": 5 }, { "epoch": 0.009302325581395349, "grad_norm": 7.126027584075928, "learning_rate": 1.8518518518518519e-06, "loss": 1.9888, "step": 10 }, { "epoch": 0.013953488372093023, "grad_norm": 4.20051908493042, "learning_rate": 2.7777777777777783e-06, "loss": 1.7001, "step": 15 }, { "epoch": 0.018604651162790697, "grad_norm": 3.7895281314849854, "learning_rate": 3.7037037037037037e-06, "loss": 1.4567, "step": 20 }, { "epoch": 0.023255813953488372, "grad_norm": 2.5122790336608887, "learning_rate": 4.62962962962963e-06, "loss": 1.223, "step": 25 }, { "epoch": 0.027906976744186046, "grad_norm": 1.9781367778778076, "learning_rate": 5.555555555555557e-06, "loss": 1.1428, "step": 30 }, { "epoch": 0.03255813953488372, "grad_norm": 1.9213618040084839, "learning_rate": 6.481481481481482e-06, "loss": 1.1077, "step": 35 }, { "epoch": 0.037209302325581395, "grad_norm": 1.7331029176712036, "learning_rate": 7.4074074074074075e-06, "loss": 1.067, "step": 40 }, { "epoch": 0.04186046511627907, "grad_norm": 1.8956832885742188, "learning_rate": 8.333333333333334e-06, "loss": 1.0386, "step": 45 }, { "epoch": 0.046511627906976744, "grad_norm": 1.9119031429290771, "learning_rate": 9.25925925925926e-06, "loss": 1.0394, "step": 50 }, { "epoch": 0.05116279069767442, "grad_norm": 2.192683696746826, "learning_rate": 1.0185185185185186e-05, "loss": 1.1009, "step": 55 }, { "epoch": 0.05581395348837209, "grad_norm": 1.9372894763946533, "learning_rate": 1.1111111111111113e-05, "loss": 1.0395, "step": 60 }, { "epoch": 0.06046511627906977, "grad_norm": 1.9333980083465576, "learning_rate": 1.2037037037037039e-05, "loss": 0.9987, "step": 65 }, { "epoch": 0.06511627906976744, "grad_norm": 1.9521620273590088, "learning_rate": 1.2962962962962964e-05, "loss": 0.9374, "step": 70 }, { "epoch": 0.06976744186046512, "grad_norm": 1.8810534477233887, "learning_rate": 1.388888888888889e-05, "loss": 1.0206, "step": 75 }, { "epoch": 0.07441860465116279, "grad_norm": 1.9386012554168701, "learning_rate": 1.4814814814814815e-05, "loss": 1.0259, "step": 80 }, { "epoch": 0.07906976744186046, "grad_norm": 2.0746471881866455, "learning_rate": 1.5740740740740744e-05, "loss": 0.9524, "step": 85 }, { "epoch": 0.08372093023255814, "grad_norm": 1.917472243309021, "learning_rate": 1.6666666666666667e-05, "loss": 0.9492, "step": 90 }, { "epoch": 0.08837209302325581, "grad_norm": 2.0328681468963623, "learning_rate": 1.7592592592592595e-05, "loss": 0.9659, "step": 95 }, { "epoch": 0.09302325581395349, "grad_norm": 1.9418821334838867, "learning_rate": 1.851851851851852e-05, "loss": 0.93, "step": 100 }, { "epoch": 0.09767441860465116, "grad_norm": 1.881958246231079, "learning_rate": 1.9444444444444445e-05, "loss": 0.8999, "step": 105 }, { "epoch": 0.10232558139534884, "grad_norm": 2.234872817993164, "learning_rate": 1.999978890630352e-05, "loss": 0.9269, "step": 110 }, { "epoch": 0.10697674418604651, "grad_norm": 1.785520315170288, "learning_rate": 1.9997414204566915e-05, "loss": 0.8628, "step": 115 }, { "epoch": 0.11162790697674418, "grad_norm": 1.9457143545150757, "learning_rate": 1.9992401562656023e-05, "loss": 0.8544, "step": 120 }, { "epoch": 0.11627906976744186, "grad_norm": 1.8361793756484985, "learning_rate": 1.9984752303217796e-05, "loss": 0.882, "step": 125 }, { "epoch": 0.12093023255813953, "grad_norm": 1.75851571559906, "learning_rate": 1.9974468444603022e-05, "loss": 0.8743, "step": 130 }, { "epoch": 0.12558139534883722, "grad_norm": 2.0207371711730957, "learning_rate": 1.9961552700333736e-05, "loss": 0.8875, "step": 135 }, { "epoch": 0.13023255813953488, "grad_norm": 1.6466691493988037, "learning_rate": 1.9946008478387238e-05, "loss": 0.8418, "step": 140 }, { "epoch": 0.13488372093023257, "grad_norm": 1.8543481826782227, "learning_rate": 1.992783988029686e-05, "loss": 0.8621, "step": 145 }, { "epoch": 0.13953488372093023, "grad_norm": 1.7923078536987305, "learning_rate": 1.9907051700069716e-05, "loss": 0.8261, "step": 150 }, { "epoch": 0.14418604651162792, "grad_norm": 1.9153462648391724, "learning_rate": 1.9883649422921746e-05, "loss": 0.8522, "step": 155 }, { "epoch": 0.14883720930232558, "grad_norm": 1.9015884399414062, "learning_rate": 1.985763922383038e-05, "loss": 0.8329, "step": 160 }, { "epoch": 0.15348837209302327, "grad_norm": 1.7250375747680664, "learning_rate": 1.982902796590519e-05, "loss": 0.8074, "step": 165 }, { "epoch": 0.15813953488372093, "grad_norm": 1.9175854921340942, "learning_rate": 1.979782319857697e-05, "loss": 0.8451, "step": 170 }, { "epoch": 0.16279069767441862, "grad_norm": 1.7632246017456055, "learning_rate": 1.9764033155605746e-05, "loss": 0.84, "step": 175 }, { "epoch": 0.16744186046511628, "grad_norm": 2.0264551639556885, "learning_rate": 1.9727666752908174e-05, "loss": 0.8402, "step": 180 }, { "epoch": 0.17209302325581396, "grad_norm": 1.8138450384140015, "learning_rate": 1.9688733586204977e-05, "loss": 0.7737, "step": 185 }, { "epoch": 0.17674418604651163, "grad_norm": 1.8395872116088867, "learning_rate": 1.9647243928489e-05, "loss": 0.7852, "step": 190 }, { "epoch": 0.1813953488372093, "grad_norm": 1.8603692054748535, "learning_rate": 1.960320872731454e-05, "loss": 0.7625, "step": 195 }, { "epoch": 0.18604651162790697, "grad_norm": 1.7493908405303955, "learning_rate": 1.955663960190873e-05, "loss": 0.74, "step": 200 }, { "epoch": 0.19069767441860466, "grad_norm": 1.9812886714935303, "learning_rate": 1.9507548840105618e-05, "loss": 0.7716, "step": 205 }, { "epoch": 0.19534883720930232, "grad_norm": 1.745795488357544, "learning_rate": 1.9455949395103917e-05, "loss": 0.7337, "step": 210 }, { "epoch": 0.2, "grad_norm": 2.082470178604126, "learning_rate": 1.9401854882049122e-05, "loss": 0.7687, "step": 215 }, { "epoch": 0.20465116279069767, "grad_norm": 1.8943344354629517, "learning_rate": 1.9345279574440982e-05, "loss": 0.74, "step": 220 }, { "epoch": 0.20930232558139536, "grad_norm": 2.0032765865325928, "learning_rate": 1.928623840036728e-05, "loss": 0.7029, "step": 225 }, { "epoch": 0.21395348837209302, "grad_norm": 1.9782109260559082, "learning_rate": 1.922474693856486e-05, "loss": 0.7164, "step": 230 }, { "epoch": 0.2186046511627907, "grad_norm": 1.9029327630996704, "learning_rate": 1.916082141430899e-05, "loss": 0.7358, "step": 235 }, { "epoch": 0.22325581395348837, "grad_norm": 2.0331649780273438, "learning_rate": 1.9094478695132138e-05, "loss": 0.741, "step": 240 }, { "epoch": 0.22790697674418606, "grad_norm": 1.7450425624847412, "learning_rate": 1.9025736286373232e-05, "loss": 0.7351, "step": 245 }, { "epoch": 0.23255813953488372, "grad_norm": 1.6871542930603027, "learning_rate": 1.895461232655871e-05, "loss": 0.6267, "step": 250 }, { "epoch": 0.2372093023255814, "grad_norm": 1.6395697593688965, "learning_rate": 1.8881125582616403e-05, "loss": 0.6552, "step": 255 }, { "epoch": 0.24186046511627907, "grad_norm": 1.7953416109085083, "learning_rate": 1.880529544492368e-05, "loss": 0.6172, "step": 260 }, { "epoch": 0.24651162790697675, "grad_norm": 1.5235698223114014, "learning_rate": 1.872714192219105e-05, "loss": 0.6552, "step": 265 }, { "epoch": 0.25116279069767444, "grad_norm": 1.8140872716903687, "learning_rate": 1.8646685636182616e-05, "loss": 0.6739, "step": 270 }, { "epoch": 0.2558139534883721, "grad_norm": 1.5865305662155151, "learning_rate": 1.856394781627477e-05, "loss": 0.6621, "step": 275 }, { "epoch": 0.26046511627906976, "grad_norm": 1.7430166006088257, "learning_rate": 1.847895029385458e-05, "loss": 0.6248, "step": 280 }, { "epoch": 0.2651162790697674, "grad_norm": 1.6584974527359009, "learning_rate": 1.8391715496559275e-05, "loss": 0.5992, "step": 285 }, { "epoch": 0.26976744186046514, "grad_norm": 1.7287700176239014, "learning_rate": 1.8302266442358474e-05, "loss": 0.6171, "step": 290 }, { "epoch": 0.2744186046511628, "grad_norm": 1.5523782968521118, "learning_rate": 1.821062673348059e-05, "loss": 0.6301, "step": 295 }, { "epoch": 0.27906976744186046, "grad_norm": 1.5865797996520996, "learning_rate": 1.8116820550185108e-05, "loss": 0.6035, "step": 300 }, { "epoch": 0.2837209302325581, "grad_norm": 1.5850545167922974, "learning_rate": 1.8020872644382313e-05, "loss": 0.5909, "step": 305 }, { "epoch": 0.28837209302325584, "grad_norm": 1.5747253894805908, "learning_rate": 1.7922808333102207e-05, "loss": 0.5466, "step": 310 }, { "epoch": 0.2930232558139535, "grad_norm": 1.623345136642456, "learning_rate": 1.7822653491814305e-05, "loss": 0.5297, "step": 315 }, { "epoch": 0.29767441860465116, "grad_norm": 1.6470658779144287, "learning_rate": 1.772043454760004e-05, "loss": 0.5613, "step": 320 }, { "epoch": 0.3023255813953488, "grad_norm": 1.6495023965835571, "learning_rate": 1.7616178472179718e-05, "loss": 0.6115, "step": 325 }, { "epoch": 0.30697674418604654, "grad_norm": 1.4790008068084717, "learning_rate": 1.750991277479563e-05, "loss": 0.5912, "step": 330 }, { "epoch": 0.3116279069767442, "grad_norm": 1.5336066484451294, "learning_rate": 1.740166549495345e-05, "loss": 0.5643, "step": 335 }, { "epoch": 0.31627906976744186, "grad_norm": 1.2903903722763062, "learning_rate": 1.7291465195023654e-05, "loss": 0.5232, "step": 340 }, { "epoch": 0.3209302325581395, "grad_norm": 1.5350030660629272, "learning_rate": 1.717934095270497e-05, "loss": 0.5611, "step": 345 }, { "epoch": 0.32558139534883723, "grad_norm": 1.7635351419448853, "learning_rate": 1.7065322353351904e-05, "loss": 0.5913, "step": 350 }, { "epoch": 0.3302325581395349, "grad_norm": 1.598311424255371, "learning_rate": 1.6949439482168254e-05, "loss": 0.5547, "step": 355 }, { "epoch": 0.33488372093023255, "grad_norm": 1.7346688508987427, "learning_rate": 1.6831722916268787e-05, "loss": 0.5869, "step": 360 }, { "epoch": 0.3395348837209302, "grad_norm": 1.4408459663391113, "learning_rate": 1.671220371661106e-05, "loss": 0.4912, "step": 365 }, { "epoch": 0.34418604651162793, "grad_norm": 1.4295024871826172, "learning_rate": 1.6590913419799635e-05, "loss": 0.5315, "step": 370 }, { "epoch": 0.3488372093023256, "grad_norm": 1.418042778968811, "learning_rate": 1.646788402976474e-05, "loss": 0.5181, "step": 375 }, { "epoch": 0.35348837209302325, "grad_norm": 1.5987695455551147, "learning_rate": 1.6343148009317658e-05, "loss": 0.5049, "step": 380 }, { "epoch": 0.3581395348837209, "grad_norm": 1.3464775085449219, "learning_rate": 1.6216738271585e-05, "loss": 0.4994, "step": 385 }, { "epoch": 0.3627906976744186, "grad_norm": 1.3645445108413696, "learning_rate": 1.6088688171324185e-05, "loss": 0.4882, "step": 390 }, { "epoch": 0.3674418604651163, "grad_norm": 1.6013556718826294, "learning_rate": 1.5959031496122365e-05, "loss": 0.4811, "step": 395 }, { "epoch": 0.37209302325581395, "grad_norm": 1.5721732378005981, "learning_rate": 1.582780245748118e-05, "loss": 0.4583, "step": 400 }, { "epoch": 0.3767441860465116, "grad_norm": 1.4035416841506958, "learning_rate": 1.569503568178961e-05, "loss": 0.48, "step": 405 }, { "epoch": 0.3813953488372093, "grad_norm": 1.390092134475708, "learning_rate": 1.5560766201187386e-05, "loss": 0.4064, "step": 410 }, { "epoch": 0.386046511627907, "grad_norm": 1.379281759262085, "learning_rate": 1.5425029444321346e-05, "loss": 0.4275, "step": 415 }, { "epoch": 0.39069767441860465, "grad_norm": 1.3608301877975464, "learning_rate": 1.5287861226997125e-05, "loss": 0.4429, "step": 420 }, { "epoch": 0.3953488372093023, "grad_norm": 1.401227355003357, "learning_rate": 1.5149297742728738e-05, "loss": 0.448, "step": 425 }, { "epoch": 0.4, "grad_norm": 1.491600513458252, "learning_rate": 1.500937555318847e-05, "loss": 0.4135, "step": 430 }, { "epoch": 0.4046511627906977, "grad_norm": 1.6559197902679443, "learning_rate": 1.4868131578559633e-05, "loss": 0.3997, "step": 435 }, { "epoch": 0.40930232558139534, "grad_norm": 1.5281630754470825, "learning_rate": 1.4725603087794716e-05, "loss": 0.4267, "step": 440 }, { "epoch": 0.413953488372093, "grad_norm": 1.2546498775482178, "learning_rate": 1.4581827688781531e-05, "loss": 0.3771, "step": 445 }, { "epoch": 0.4186046511627907, "grad_norm": 1.2490854263305664, "learning_rate": 1.4436843318419898e-05, "loss": 0.413, "step": 450 }, { "epoch": 0.4232558139534884, "grad_norm": 1.1986746788024902, "learning_rate": 1.4290688232611526e-05, "loss": 0.3392, "step": 455 }, { "epoch": 0.42790697674418604, "grad_norm": 1.1714967489242554, "learning_rate": 1.4143400996165748e-05, "loss": 0.3577, "step": 460 }, { "epoch": 0.4325581395348837, "grad_norm": 1.1918331384658813, "learning_rate": 1.3995020472623692e-05, "loss": 0.3689, "step": 465 }, { "epoch": 0.4372093023255814, "grad_norm": 1.3175437450408936, "learning_rate": 1.3845585814003686e-05, "loss": 0.402, "step": 470 }, { "epoch": 0.4418604651162791, "grad_norm": 1.1682705879211426, "learning_rate": 1.36951364504705e-05, "loss": 0.3189, "step": 475 }, { "epoch": 0.44651162790697674, "grad_norm": 1.3450775146484375, "learning_rate": 1.3543712079931233e-05, "loss": 0.4095, "step": 480 }, { "epoch": 0.4511627906976744, "grad_norm": 1.1513640880584717, "learning_rate": 1.3391352657560512e-05, "loss": 0.3358, "step": 485 }, { "epoch": 0.4558139534883721, "grad_norm": 1.440575361251831, "learning_rate": 1.323809838525785e-05, "loss": 0.3549, "step": 490 }, { "epoch": 0.4604651162790698, "grad_norm": 1.3695510625839233, "learning_rate": 1.3083989701039868e-05, "loss": 0.3339, "step": 495 }, { "epoch": 0.46511627906976744, "grad_norm": 1.2495150566101074, "learning_rate": 1.2929067268370234e-05, "loss": 0.3522, "step": 500 }, { "epoch": 0.4697674418604651, "grad_norm": 1.4666467905044556, "learning_rate": 1.2773371965430114e-05, "loss": 0.302, "step": 505 }, { "epoch": 0.4744186046511628, "grad_norm": 1.2039730548858643, "learning_rate": 1.2616944874331965e-05, "loss": 0.2903, "step": 510 }, { "epoch": 0.4790697674418605, "grad_norm": 1.35472571849823, "learning_rate": 1.2459827270279499e-05, "loss": 0.3078, "step": 515 }, { "epoch": 0.48372093023255813, "grad_norm": 1.094884991645813, "learning_rate": 1.2302060610676736e-05, "loss": 0.2858, "step": 520 }, { "epoch": 0.4883720930232558, "grad_norm": 1.1338796615600586, "learning_rate": 1.2143686524188954e-05, "loss": 0.2972, "step": 525 }, { "epoch": 0.4930232558139535, "grad_norm": 1.2089378833770752, "learning_rate": 1.1984746799758442e-05, "loss": 0.2982, "step": 530 }, { "epoch": 0.49767441860465117, "grad_norm": 1.0935381650924683, "learning_rate": 1.1825283375578006e-05, "loss": 0.2761, "step": 535 }, { "epoch": 0.5023255813953489, "grad_norm": 1.144473671913147, "learning_rate": 1.1665338328025027e-05, "loss": 0.2972, "step": 540 }, { "epoch": 0.5069767441860465, "grad_norm": 1.0310620069503784, "learning_rate": 1.1504953860559115e-05, "loss": 0.2793, "step": 545 }, { "epoch": 0.5116279069767442, "grad_norm": 1.4676309823989868, "learning_rate": 1.1344172292586218e-05, "loss": 0.344, "step": 550 }, { "epoch": 0.5162790697674419, "grad_norm": 1.2346969842910767, "learning_rate": 1.1183036048292099e-05, "loss": 0.2786, "step": 555 }, { "epoch": 0.5209302325581395, "grad_norm": 1.2141753435134888, "learning_rate": 1.1021587645448222e-05, "loss": 0.2714, "step": 560 }, { "epoch": 0.5255813953488372, "grad_norm": 1.0154790878295898, "learning_rate": 1.0859869684192907e-05, "loss": 0.2789, "step": 565 }, { "epoch": 0.5302325581395348, "grad_norm": 1.2959997653961182, "learning_rate": 1.0697924835790759e-05, "loss": 0.2942, "step": 570 }, { "epoch": 0.5348837209302325, "grad_norm": 1.1473537683486938, "learning_rate": 1.0535795831373337e-05, "loss": 0.2849, "step": 575 }, { "epoch": 0.5395348837209303, "grad_norm": 1.1317942142486572, "learning_rate": 1.0373525450664017e-05, "loss": 0.2832, "step": 580 }, { "epoch": 0.5441860465116279, "grad_norm": 1.0453064441680908, "learning_rate": 1.0211156510690043e-05, "loss": 0.2403, "step": 585 }, { "epoch": 0.5488372093023256, "grad_norm": 1.1681984663009644, "learning_rate": 1.0048731854484734e-05, "loss": 0.2288, "step": 590 }, { "epoch": 0.5534883720930233, "grad_norm": 1.335357904434204, "learning_rate": 9.886294339782805e-06, "loss": 0.2771, "step": 595 }, { "epoch": 0.5581395348837209, "grad_norm": 0.9871159195899963, "learning_rate": 9.723886827711858e-06, "loss": 0.1839, "step": 600 }, { "epoch": 0.5627906976744186, "grad_norm": 1.029579997062683, "learning_rate": 9.561552171482925e-06, "loss": 0.2361, "step": 605 }, { "epoch": 0.5674418604651162, "grad_norm": 1.2840648889541626, "learning_rate": 9.399333205083131e-06, "loss": 0.2674, "step": 610 }, { "epoch": 0.5720930232558139, "grad_norm": 1.0546510219573975, "learning_rate": 9.237272731973429e-06, "loss": 0.2438, "step": 615 }, { "epoch": 0.5767441860465117, "grad_norm": 1.0226281881332397, "learning_rate": 9.075413513794368e-06, "loss": 0.2418, "step": 620 }, { "epoch": 0.5813953488372093, "grad_norm": 0.9618441462516785, "learning_rate": 8.913798259082929e-06, "loss": 0.211, "step": 625 }, { "epoch": 0.586046511627907, "grad_norm": 1.1201444864273071, "learning_rate": 8.752469612003332e-06, "loss": 0.2234, "step": 630 }, { "epoch": 0.5906976744186047, "grad_norm": 0.9489670395851135, "learning_rate": 8.591470141094878e-06, "loss": 0.2159, "step": 635 }, { "epoch": 0.5953488372093023, "grad_norm": 1.0621603727340698, "learning_rate": 8.430842328039686e-06, "loss": 0.2343, "step": 640 }, { "epoch": 0.6, "grad_norm": 0.8708460927009583, "learning_rate": 8.270628556453417e-06, "loss": 0.2096, "step": 645 }, { "epoch": 0.6046511627906976, "grad_norm": 1.0906603336334229, "learning_rate": 8.110871100701807e-06, "loss": 0.2113, "step": 650 }, { "epoch": 0.6093023255813953, "grad_norm": 1.080915093421936, "learning_rate": 7.951612114746078e-06, "loss": 0.2193, "step": 655 }, { "epoch": 0.6139534883720931, "grad_norm": 2.644400119781494, "learning_rate": 7.792893621020083e-06, "loss": 0.1959, "step": 660 }, { "epoch": 0.6186046511627907, "grad_norm": 1.0472601652145386, "learning_rate": 7.634757499342191e-06, "loss": 0.2238, "step": 665 }, { "epoch": 0.6232558139534884, "grad_norm": 0.9129474759101868, "learning_rate": 7.477245475864772e-06, "loss": 0.1924, "step": 670 }, { "epoch": 0.627906976744186, "grad_norm": 1.0401651859283447, "learning_rate": 7.3203991120642335e-06, "loss": 0.2231, "step": 675 }, { "epoch": 0.6325581395348837, "grad_norm": 0.832706093788147, "learning_rate": 7.164259793774539e-06, "loss": 0.2334, "step": 680 }, { "epoch": 0.6372093023255814, "grad_norm": 1.1918411254882812, "learning_rate": 7.008868720267021e-06, "loss": 0.1935, "step": 685 }, { "epoch": 0.641860465116279, "grad_norm": 0.9831671714782715, "learning_rate": 6.854266893379471e-06, "loss": 0.1781, "step": 690 }, { "epoch": 0.6465116279069767, "grad_norm": 0.9214596152305603, "learning_rate": 6.700495106697296e-06, "loss": 0.2065, "step": 695 }, { "epoch": 0.6511627906976745, "grad_norm": 1.054748296737671, "learning_rate": 6.5475939347896185e-06, "loss": 0.1833, "step": 700 }, { "epoch": 0.6558139534883721, "grad_norm": 1.0860075950622559, "learning_rate": 6.395603722503205e-06, "loss": 0.2082, "step": 705 }, { "epoch": 0.6604651162790698, "grad_norm": 0.9816461205482483, "learning_rate": 6.244564574316958e-06, "loss": 0.19, "step": 710 }, { "epoch": 0.6651162790697674, "grad_norm": 0.8675970435142517, "learning_rate": 6.094516343759879e-06, "loss": 0.175, "step": 715 }, { "epoch": 0.6697674418604651, "grad_norm": 0.8550778031349182, "learning_rate": 5.945498622895205e-06, "loss": 0.1655, "step": 720 }, { "epoch": 0.6744186046511628, "grad_norm": 0.8525641560554504, "learning_rate": 5.7975507318735776e-06, "loss": 0.1705, "step": 725 }, { "epoch": 0.6790697674418604, "grad_norm": 0.8267152309417725, "learning_rate": 5.650711708557941e-06, "loss": 0.1377, "step": 730 }, { "epoch": 0.6837209302325581, "grad_norm": 0.9349808096885681, "learning_rate": 5.505020298222899e-06, "loss": 0.1853, "step": 735 }, { "epoch": 0.6883720930232559, "grad_norm": 0.8384344577789307, "learning_rate": 5.360514943331323e-06, "loss": 0.1537, "step": 740 }, { "epoch": 0.6930232558139535, "grad_norm": 0.846179723739624, "learning_rate": 5.217233773390835e-06, "loss": 0.1404, "step": 745 }, { "epoch": 0.6976744186046512, "grad_norm": 0.8937086462974548, "learning_rate": 5.075214594892857e-06, "loss": 0.1531, "step": 750 }, { "epoch": 0.7023255813953488, "grad_norm": 0.8686032295227051, "learning_rate": 4.934494881336925e-06, "loss": 0.17, "step": 755 }, { "epoch": 0.7069767441860465, "grad_norm": 0.9571747779846191, "learning_rate": 4.795111763342816e-06, "loss": 0.1855, "step": 760 }, { "epoch": 0.7116279069767442, "grad_norm": 0.8048515319824219, "learning_rate": 4.657102018853218e-06, "loss": 0.1686, "step": 765 }, { "epoch": 0.7162790697674418, "grad_norm": 0.8267503976821899, "learning_rate": 4.520502063429392e-06, "loss": 0.1724, "step": 770 }, { "epoch": 0.7209302325581395, "grad_norm": 0.8278976082801819, "learning_rate": 4.385347940642495e-06, "loss": 0.183, "step": 775 }, { "epoch": 0.7255813953488373, "grad_norm": 0.8425613045692444, "learning_rate": 4.2516753125630225e-06, "loss": 0.1593, "step": 780 }, { "epoch": 0.7302325581395349, "grad_norm": 0.9145649075508118, "learning_rate": 4.119519450350961e-06, "loss": 0.1787, "step": 785 }, { "epoch": 0.7348837209302326, "grad_norm": 0.8136866092681885, "learning_rate": 3.988915224949041e-06, "loss": 0.1595, "step": 790 }, { "epoch": 0.7395348837209302, "grad_norm": 0.8466197848320007, "learning_rate": 3.859897097881625e-06, "loss": 0.1673, "step": 795 }, { "epoch": 0.7441860465116279, "grad_norm": 0.7777575850486755, "learning_rate": 3.732499112161592e-06, "loss": 0.1482, "step": 800 }, { "epoch": 0.7488372093023256, "grad_norm": 0.9253669381141663, "learning_rate": 3.6067548833077002e-06, "loss": 0.176, "step": 805 }, { "epoch": 0.7534883720930232, "grad_norm": 0.8771972060203552, "learning_rate": 3.4826975904746917e-06, "loss": 0.1422, "step": 810 }, { "epoch": 0.7581395348837209, "grad_norm": 0.8986210227012634, "learning_rate": 3.360359967698589e-06, "loss": 0.1273, "step": 815 }, { "epoch": 0.7627906976744186, "grad_norm": 0.8482417464256287, "learning_rate": 3.2397742952594067e-06, "loss": 0.1462, "step": 820 }, { "epoch": 0.7674418604651163, "grad_norm": 0.6971471309661865, "learning_rate": 3.120972391163608e-06, "loss": 0.1232, "step": 825 }, { "epoch": 0.772093023255814, "grad_norm": 0.9284904599189758, "learning_rate": 3.003985602748537e-06, "loss": 0.1467, "step": 830 }, { "epoch": 0.7767441860465116, "grad_norm": 0.7171467542648315, "learning_rate": 2.8888447984110234e-06, "loss": 0.1365, "step": 835 }, { "epoch": 0.7813953488372093, "grad_norm": 0.9581470489501953, "learning_rate": 2.7755803594624043e-06, "loss": 0.1469, "step": 840 }, { "epoch": 0.786046511627907, "grad_norm": 0.8160574436187744, "learning_rate": 2.664222172112022e-06, "loss": 0.1676, "step": 845 }, { "epoch": 0.7906976744186046, "grad_norm": 0.835128128528595, "learning_rate": 2.554799619581393e-06, "loss": 0.1431, "step": 850 }, { "epoch": 0.7953488372093023, "grad_norm": 0.8313025832176208, "learning_rate": 2.447341574351081e-06, "loss": 0.1533, "step": 855 }, { "epoch": 0.8, "grad_norm": 0.8108298778533936, "learning_rate": 2.3418763905423337e-06, "loss": 0.1362, "step": 860 }, { "epoch": 0.8046511627906977, "grad_norm": 0.8765959739685059, "learning_rate": 2.2384318964355123e-06, "loss": 0.1418, "step": 865 }, { "epoch": 0.8093023255813954, "grad_norm": 0.609005331993103, "learning_rate": 2.137035387127253e-06, "loss": 0.1259, "step": 870 }, { "epoch": 0.813953488372093, "grad_norm": 0.7476295232772827, "learning_rate": 2.037713617328323e-06, "loss": 0.1577, "step": 875 }, { "epoch": 0.8186046511627907, "grad_norm": 0.8610689640045166, "learning_rate": 1.940492794304053e-06, "loss": 0.158, "step": 880 }, { "epoch": 0.8232558139534883, "grad_norm": 0.6711591482162476, "learning_rate": 1.845398570959247e-06, "loss": 0.1272, "step": 885 }, { "epoch": 0.827906976744186, "grad_norm": 0.8629456758499146, "learning_rate": 1.7524560390693312e-06, "loss": 0.1777, "step": 890 }, { "epoch": 0.8325581395348837, "grad_norm": 0.6571215987205505, "learning_rate": 1.6616897226595963e-06, "loss": 0.1256, "step": 895 }, { "epoch": 0.8372093023255814, "grad_norm": 0.9528214931488037, "learning_rate": 1.5731235715342242e-06, "loss": 0.1499, "step": 900 }, { "epoch": 0.8418604651162791, "grad_norm": 0.7758918404579163, "learning_rate": 1.4867809549568434e-06, "loss": 0.148, "step": 905 }, { "epoch": 0.8465116279069768, "grad_norm": 0.7741161584854126, "learning_rate": 1.40268465548426e-06, "loss": 0.1443, "step": 910 }, { "epoch": 0.8511627906976744, "grad_norm": 0.7391246557235718, "learning_rate": 1.3208568629549968e-06, "loss": 0.1024, "step": 915 }, { "epoch": 0.8558139534883721, "grad_norm": 0.8599534630775452, "learning_rate": 1.241319168634245e-06, "loss": 0.1398, "step": 920 }, { "epoch": 0.8604651162790697, "grad_norm": 0.7439330816268921, "learning_rate": 1.1640925595167342e-06, "loss": 0.1092, "step": 925 }, { "epoch": 0.8651162790697674, "grad_norm": 0.8411828875541687, "learning_rate": 1.089197412789058e-06, "loss": 0.1234, "step": 930 }, { "epoch": 0.8697674418604651, "grad_norm": 0.8133601546287537, "learning_rate": 1.0166534904529113e-06, "loss": 0.1281, "step": 935 }, { "epoch": 0.8744186046511628, "grad_norm": 0.9273954033851624, "learning_rate": 9.464799341106268e-07, "loss": 0.1468, "step": 940 }, { "epoch": 0.8790697674418605, "grad_norm": 0.916169285774231, "learning_rate": 8.786952599144528e-07, "loss": 0.1181, "step": 945 }, { "epoch": 0.8837209302325582, "grad_norm": 0.9071162939071655, "learning_rate": 8.133173536808204e-07, "loss": 0.1349, "step": 950 }, { "epoch": 0.8883720930232558, "grad_norm": 0.9697039723396301, "learning_rate": 7.503634661709613e-07, "loss": 0.1525, "step": 955 }, { "epoch": 0.8930232558139535, "grad_norm": 0.6566289663314819, "learning_rate": 6.898502085390757e-07, "loss": 0.1513, "step": 960 }, { "epoch": 0.8976744186046511, "grad_norm": 0.6727085113525391, "learning_rate": 6.317935479492887e-07, "loss": 0.1254, "step": 965 }, { "epoch": 0.9023255813953488, "grad_norm": 0.7659772038459778, "learning_rate": 5.762088033625012e-07, "loss": 0.1301, "step": 970 }, { "epoch": 0.9069767441860465, "grad_norm": 0.7282540798187256, "learning_rate": 5.23110641494311e-07, "loss": 0.1153, "step": 975 }, { "epoch": 0.9116279069767442, "grad_norm": 0.8052628040313721, "learning_rate": 4.7251307294500783e-07, "loss": 0.1255, "step": 980 }, { "epoch": 0.9162790697674419, "grad_norm": 0.7424523830413818, "learning_rate": 4.244294485027156e-07, "loss": 0.1189, "step": 985 }, { "epoch": 0.9209302325581395, "grad_norm": 0.7578551173210144, "learning_rate": 3.7887245562061137e-07, "loss": 0.1178, "step": 990 }, { "epoch": 0.9255813953488372, "grad_norm": 0.7584931254386902, "learning_rate": 3.358541150691952e-07, "loss": 0.1152, "step": 995 }, { "epoch": 0.9302325581395349, "grad_norm": 0.7564160823822021, "learning_rate": 2.9538577776445976e-07, "loss": 0.1246, "step": 1000 }, { "epoch": 0.9348837209302325, "grad_norm": 0.8552773594856262, "learning_rate": 2.5747812177280953e-07, "loss": 0.1196, "step": 1005 }, { "epoch": 0.9395348837209302, "grad_norm": 0.8341315984725952, "learning_rate": 2.2214114949352882e-07, "loss": 0.1289, "step": 1010 }, { "epoch": 0.9441860465116279, "grad_norm": 0.7836688756942749, "learning_rate": 1.8938418501951549e-07, "loss": 0.1342, "step": 1015 }, { "epoch": 0.9488372093023256, "grad_norm": 0.7940887212753296, "learning_rate": 1.5921587167701535e-07, "loss": 0.1396, "step": 1020 }, { "epoch": 0.9534883720930233, "grad_norm": 0.7409055233001709, "learning_rate": 1.3164416974496997e-07, "loss": 0.1532, "step": 1025 }, { "epoch": 0.958139534883721, "grad_norm": 0.8953080177307129, "learning_rate": 1.0667635435459788e-07, "loss": 0.1497, "step": 1030 }, { "epoch": 0.9627906976744186, "grad_norm": 1.0246886014938354, "learning_rate": 8.431901356976801e-08, "loss": 0.1201, "step": 1035 }, { "epoch": 0.9674418604651163, "grad_norm": 0.7326242923736572, "learning_rate": 6.457804664865119e-08, "loss": 0.1157, "step": 1040 }, { "epoch": 0.9720930232558139, "grad_norm": 0.779400110244751, "learning_rate": 4.745866248713204e-08, "loss": 0.1297, "step": 1045 }, { "epoch": 0.9767441860465116, "grad_norm": 0.7084469795227051, "learning_rate": 3.296537824438284e-08, "loss": 0.1329, "step": 1050 }, { "epoch": 0.9813953488372092, "grad_norm": 0.7685924768447876, "learning_rate": 2.1102018150943592e-08, "loss": 0.1248, "step": 1055 }, { "epoch": 0.986046511627907, "grad_norm": 0.8336236476898193, "learning_rate": 1.187171249966701e-08, "loss": 0.1359, "step": 1060 }, { "epoch": 0.9906976744186047, "grad_norm": 0.868739902973175, "learning_rate": 5.276896819738086e-09, "loss": 0.1422, "step": 1065 }, { "epoch": 0.9953488372093023, "grad_norm": 0.8377354145050049, "learning_rate": 1.3193112340414981e-09, "loss": 0.1383, "step": 1070 }, { "epoch": 1.0, "grad_norm": 0.7257831692695618, "learning_rate": 0.0, "loss": 0.1415, "step": 1075 }, { "epoch": 1.0, "eval_loss": 1.3583940267562866, "eval_runtime": 18.7721, "eval_samples_per_second": 52.684, "eval_steps_per_second": 1.651, "step": 1075 }, { "epoch": 1.0, "step": 1075, "total_flos": 1.4944007725514752e+18, "train_loss": 0.43516844599745996, "train_runtime": 3425.5343, "train_samples_per_second": 10.039, "train_steps_per_second": 0.314 } ], "logging_steps": 5, "max_steps": 1075, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.4944007725514752e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }