{ "best_metric": 1.1526199579238892, "best_model_checkpoint": "data/tinyllama_moe_sft_ultrachat-slimorca/checkpoint-2000", "epoch": 0.9997585124366095, "eval_steps": 100, "global_step": 2070, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.6666666666666668e-07, "loss": 2.9792, "step": 1 }, { "epoch": 0.0, "learning_rate": 8.333333333333333e-07, "loss": 2.9452, "step": 5 }, { "epoch": 0.0, "learning_rate": 1.6666666666666667e-06, "loss": 2.9114, "step": 10 }, { "epoch": 0.01, "learning_rate": 2.5e-06, "loss": 2.8843, "step": 15 }, { "epoch": 0.01, "learning_rate": 3.3333333333333333e-06, "loss": 2.6761, "step": 20 }, { "epoch": 0.01, "learning_rate": 4.166666666666667e-06, "loss": 2.3539, "step": 25 }, { "epoch": 0.01, "learning_rate": 5e-06, "loss": 2.1166, "step": 30 }, { "epoch": 0.02, "learning_rate": 5.833333333333334e-06, "loss": 2.0101, "step": 35 }, { "epoch": 0.02, "learning_rate": 6.666666666666667e-06, "loss": 1.8725, "step": 40 }, { "epoch": 0.02, "learning_rate": 7.500000000000001e-06, "loss": 1.7979, "step": 45 }, { "epoch": 0.02, "learning_rate": 8.333333333333334e-06, "loss": 1.7447, "step": 50 }, { "epoch": 0.03, "learning_rate": 9.166666666666666e-06, "loss": 1.6985, "step": 55 }, { "epoch": 0.03, "learning_rate": 1e-05, "loss": 1.6134, "step": 60 }, { "epoch": 0.03, "learning_rate": 1.0833333333333334e-05, "loss": 1.6174, "step": 65 }, { "epoch": 0.03, "learning_rate": 1.1666666666666668e-05, "loss": 1.5786, "step": 70 }, { "epoch": 0.04, "learning_rate": 1.25e-05, "loss": 1.5374, "step": 75 }, { "epoch": 0.04, "learning_rate": 1.3333333333333333e-05, "loss": 1.523, "step": 80 }, { "epoch": 0.04, "learning_rate": 1.416666666666667e-05, "loss": 1.5384, "step": 85 }, { "epoch": 0.04, "learning_rate": 1.5000000000000002e-05, "loss": 1.4798, "step": 90 }, { "epoch": 0.05, "learning_rate": 1.5833333333333333e-05, "loss": 1.4584, "step": 95 }, { "epoch": 0.05, "learning_rate": 1.6666666666666667e-05, "loss": 1.4601, "step": 100 }, { "epoch": 0.05, "eval_loss": 1.3360612392425537, "eval_runtime": 428.8225, "eval_samples_per_second": 37.699, "eval_steps_per_second": 1.18, "step": 100 }, { "epoch": 0.05, "learning_rate": 1.7500000000000002e-05, "loss": 1.4256, "step": 105 }, { "epoch": 0.05, "learning_rate": 1.8333333333333333e-05, "loss": 1.4164, "step": 110 }, { "epoch": 0.06, "learning_rate": 1.916666666666667e-05, "loss": 1.427, "step": 115 }, { "epoch": 0.06, "learning_rate": 2e-05, "loss": 1.4041, "step": 120 }, { "epoch": 0.06, "learning_rate": 1.9999675557165282e-05, "loss": 1.404, "step": 125 }, { "epoch": 0.06, "learning_rate": 1.9998702249713747e-05, "loss": 1.4163, "step": 130 }, { "epoch": 0.07, "learning_rate": 1.9997080140801932e-05, "loss": 1.3984, "step": 135 }, { "epoch": 0.07, "learning_rate": 1.9994809335686152e-05, "loss": 1.3717, "step": 140 }, { "epoch": 0.07, "learning_rate": 1.9991889981715696e-05, "loss": 1.3905, "step": 145 }, { "epoch": 0.07, "learning_rate": 1.998832226832327e-05, "loss": 1.3486, "step": 150 }, { "epoch": 0.07, "learning_rate": 1.9984106427012667e-05, "loss": 1.3748, "step": 155 }, { "epoch": 0.08, "learning_rate": 1.9979242731343803e-05, "loss": 1.3651, "step": 160 }, { "epoch": 0.08, "learning_rate": 1.9973731496914914e-05, "loss": 1.3452, "step": 165 }, { "epoch": 0.08, "learning_rate": 1.9967573081342103e-05, "loss": 1.3533, "step": 170 }, { "epoch": 0.08, "learning_rate": 1.9960767884236132e-05, "loss": 1.3422, "step": 175 }, { "epoch": 0.09, "learning_rate": 1.995331634717649e-05, "loss": 1.3161, "step": 180 }, { "epoch": 0.09, "learning_rate": 1.9945218953682736e-05, "loss": 1.341, "step": 185 }, { "epoch": 0.09, "learning_rate": 1.9936476229183133e-05, "loss": 1.3373, "step": 190 }, { "epoch": 0.09, "learning_rate": 1.992708874098054e-05, "loss": 1.3365, "step": 195 }, { "epoch": 0.1, "learning_rate": 1.9917057098215624e-05, "loss": 1.3324, "step": 200 }, { "epoch": 0.1, "eval_loss": 1.2566393613815308, "eval_runtime": 585.8351, "eval_samples_per_second": 27.595, "eval_steps_per_second": 0.864, "step": 200 }, { "epoch": 0.1, "learning_rate": 1.9906381951827295e-05, "loss": 1.3309, "step": 205 }, { "epoch": 0.1, "learning_rate": 1.9895063994510512e-05, "loss": 1.3236, "step": 210 }, { "epoch": 0.1, "learning_rate": 1.9883103960671305e-05, "loss": 1.3254, "step": 215 }, { "epoch": 0.11, "learning_rate": 1.9870502626379127e-05, "loss": 1.3107, "step": 220 }, { "epoch": 0.11, "learning_rate": 1.985726080931651e-05, "loss": 1.3345, "step": 225 }, { "epoch": 0.11, "learning_rate": 1.9843379368725978e-05, "loss": 1.3303, "step": 230 }, { "epoch": 0.11, "learning_rate": 1.9828859205354326e-05, "loss": 1.3179, "step": 235 }, { "epoch": 0.12, "learning_rate": 1.9813701261394136e-05, "loss": 1.2992, "step": 240 }, { "epoch": 0.12, "learning_rate": 1.979790652042268e-05, "loss": 1.3117, "step": 245 }, { "epoch": 0.12, "learning_rate": 1.9781476007338058e-05, "loss": 1.3035, "step": 250 }, { "epoch": 0.12, "learning_rate": 1.9764410788292724e-05, "loss": 1.2918, "step": 255 }, { "epoch": 0.13, "learning_rate": 1.9746711970624282e-05, "loss": 1.3105, "step": 260 }, { "epoch": 0.13, "learning_rate": 1.9728380702783644e-05, "loss": 1.3266, "step": 265 }, { "epoch": 0.13, "learning_rate": 1.9709418174260523e-05, "loss": 1.3068, "step": 270 }, { "epoch": 0.13, "learning_rate": 1.968982561550621e-05, "loss": 1.3045, "step": 275 }, { "epoch": 0.14, "learning_rate": 1.9669604297853766e-05, "loss": 1.3042, "step": 280 }, { "epoch": 0.14, "learning_rate": 1.9648755533435517e-05, "loss": 1.3033, "step": 285 }, { "epoch": 0.14, "learning_rate": 1.962728067509791e-05, "loss": 1.2891, "step": 290 }, { "epoch": 0.14, "learning_rate": 1.9605181116313725e-05, "loss": 1.2984, "step": 295 }, { "epoch": 0.14, "learning_rate": 1.9582458291091664e-05, "loss": 1.2946, "step": 300 }, { "epoch": 0.14, "eval_loss": 1.2279455661773682, "eval_runtime": 426.1287, "eval_samples_per_second": 37.937, "eval_steps_per_second": 1.187, "step": 300 }, { "epoch": 0.15, "learning_rate": 1.955911367388329e-05, "loss": 1.2973, "step": 305 }, { "epoch": 0.15, "learning_rate": 1.9535148779487365e-05, "loss": 1.2933, "step": 310 }, { "epoch": 0.15, "learning_rate": 1.9510565162951538e-05, "loss": 1.2931, "step": 315 }, { "epoch": 0.15, "learning_rate": 1.9485364419471454e-05, "loss": 1.2918, "step": 320 }, { "epoch": 0.16, "learning_rate": 1.9459548184287254e-05, "loss": 1.2965, "step": 325 }, { "epoch": 0.16, "learning_rate": 1.9433118132577432e-05, "loss": 1.2924, "step": 330 }, { "epoch": 0.16, "learning_rate": 1.9406075979350175e-05, "loss": 1.3012, "step": 335 }, { "epoch": 0.16, "learning_rate": 1.9378423479332045e-05, "loss": 1.2948, "step": 340 }, { "epoch": 0.17, "learning_rate": 1.9350162426854152e-05, "loss": 1.2708, "step": 345 }, { "epoch": 0.17, "learning_rate": 1.932129465573568e-05, "loss": 1.2749, "step": 350 }, { "epoch": 0.17, "learning_rate": 1.9291822039164934e-05, "loss": 1.2849, "step": 355 }, { "epoch": 0.17, "learning_rate": 1.9261746489577767e-05, "loss": 1.2926, "step": 360 }, { "epoch": 0.18, "learning_rate": 1.923106995853349e-05, "loss": 1.2743, "step": 365 }, { "epoch": 0.18, "learning_rate": 1.9199794436588244e-05, "loss": 1.2767, "step": 370 }, { "epoch": 0.18, "learning_rate": 1.9167921953165827e-05, "loss": 1.2673, "step": 375 }, { "epoch": 0.18, "learning_rate": 1.913545457642601e-05, "loss": 1.2751, "step": 380 }, { "epoch": 0.19, "learning_rate": 1.9102394413130348e-05, "loss": 1.2782, "step": 385 }, { "epoch": 0.19, "learning_rate": 1.9068743608505454e-05, "loss": 1.2673, "step": 390 }, { "epoch": 0.19, "learning_rate": 1.9034504346103825e-05, "loss": 1.2675, "step": 395 }, { "epoch": 0.19, "learning_rate": 1.8999678847662124e-05, "loss": 1.2767, "step": 400 }, { "epoch": 0.19, "eval_loss": 1.2110730409622192, "eval_runtime": 424.7414, "eval_samples_per_second": 38.061, "eval_steps_per_second": 1.191, "step": 400 }, { "epoch": 0.2, "learning_rate": 1.896426937295704e-05, "loss": 1.2654, "step": 405 }, { "epoch": 0.2, "learning_rate": 1.892827821965864e-05, "loss": 1.2771, "step": 410 }, { "epoch": 0.2, "learning_rate": 1.8891707723181294e-05, "loss": 1.2757, "step": 415 }, { "epoch": 0.2, "learning_rate": 1.8854560256532098e-05, "loss": 1.2621, "step": 420 }, { "epoch": 0.21, "learning_rate": 1.881683823015694e-05, "loss": 1.2785, "step": 425 }, { "epoch": 0.21, "learning_rate": 1.8778544091784047e-05, "loss": 1.252, "step": 430 }, { "epoch": 0.21, "learning_rate": 1.873968032626518e-05, "loss": 1.2634, "step": 435 }, { "epoch": 0.21, "learning_rate": 1.8700249455414394e-05, "loss": 1.2811, "step": 440 }, { "epoch": 0.21, "learning_rate": 1.866025403784439e-05, "loss": 1.2621, "step": 445 }, { "epoch": 0.22, "learning_rate": 1.8619696668800494e-05, "loss": 1.266, "step": 450 }, { "epoch": 0.22, "learning_rate": 1.8578579979992266e-05, "loss": 1.2579, "step": 455 }, { "epoch": 0.22, "learning_rate": 1.8536906639422724e-05, "loss": 1.2456, "step": 460 }, { "epoch": 0.22, "learning_rate": 1.8494679351215212e-05, "loss": 1.236, "step": 465 }, { "epoch": 0.23, "learning_rate": 1.845190085543795e-05, "loss": 1.2619, "step": 470 }, { "epoch": 0.23, "learning_rate": 1.8408573927926225e-05, "loss": 1.2551, "step": 475 }, { "epoch": 0.23, "learning_rate": 1.8364701380102267e-05, "loss": 1.2534, "step": 480 }, { "epoch": 0.23, "learning_rate": 1.8320286058792845e-05, "loss": 1.2637, "step": 485 }, { "epoch": 0.24, "learning_rate": 1.82753308460445e-05, "loss": 1.2571, "step": 490 }, { "epoch": 0.24, "learning_rate": 1.8229838658936566e-05, "loss": 1.2657, "step": 495 }, { "epoch": 0.24, "learning_rate": 1.818381244939187e-05, "loss": 1.2298, "step": 500 }, { "epoch": 0.24, "eval_loss": 1.1995348930358887, "eval_runtime": 424.3499, "eval_samples_per_second": 38.096, "eval_steps_per_second": 1.192, "step": 500 }, { "epoch": 0.24, "learning_rate": 1.81372552039852e-05, "loss": 1.2547, "step": 505 }, { "epoch": 0.25, "learning_rate": 1.8090169943749477e-05, "loss": 1.2452, "step": 510 }, { "epoch": 0.25, "learning_rate": 1.804255972397977e-05, "loss": 1.246, "step": 515 }, { "epoch": 0.25, "learning_rate": 1.7994427634035016e-05, "loss": 1.2402, "step": 520 }, { "epoch": 0.25, "learning_rate": 1.7945776797137544e-05, "loss": 1.2531, "step": 525 }, { "epoch": 0.26, "learning_rate": 1.7896610370170452e-05, "loss": 1.243, "step": 530 }, { "epoch": 0.26, "learning_rate": 1.7846931543472722e-05, "loss": 1.2344, "step": 535 }, { "epoch": 0.26, "learning_rate": 1.7796743540632226e-05, "loss": 1.2555, "step": 540 }, { "epoch": 0.26, "learning_rate": 1.7746049618276545e-05, "loss": 1.2283, "step": 545 }, { "epoch": 0.27, "learning_rate": 1.769485306586166e-05, "loss": 1.235, "step": 550 }, { "epoch": 0.27, "learning_rate": 1.7643157205458483e-05, "loss": 1.2255, "step": 555 }, { "epoch": 0.27, "learning_rate": 1.7590965391537316e-05, "loss": 1.2409, "step": 560 }, { "epoch": 0.27, "learning_rate": 1.753828101075017e-05, "loss": 1.2303, "step": 565 }, { "epoch": 0.28, "learning_rate": 1.7485107481711014e-05, "loss": 1.2293, "step": 570 }, { "epoch": 0.28, "learning_rate": 1.7431448254773943e-05, "loss": 1.2354, "step": 575 }, { "epoch": 0.28, "learning_rate": 1.7377306811809306e-05, "loss": 1.2277, "step": 580 }, { "epoch": 0.28, "learning_rate": 1.7322686665977738e-05, "loss": 1.2437, "step": 585 }, { "epoch": 0.28, "learning_rate": 1.7267591361502233e-05, "loss": 1.2332, "step": 590 }, { "epoch": 0.29, "learning_rate": 1.7212024473438145e-05, "loss": 1.2539, "step": 595 }, { "epoch": 0.29, "learning_rate": 1.715598960744121e-05, "loss": 1.2247, "step": 600 }, { "epoch": 0.29, "eval_loss": 1.190222978591919, "eval_runtime": 425.341, "eval_samples_per_second": 38.007, "eval_steps_per_second": 1.19, "step": 600 }, { "epoch": 0.29, "learning_rate": 1.7099490399533583e-05, "loss": 1.2454, "step": 605 }, { "epoch": 0.29, "learning_rate": 1.7042530515867897e-05, "loss": 1.2263, "step": 610 }, { "epoch": 0.3, "learning_rate": 1.6985113652489374e-05, "loss": 1.2203, "step": 615 }, { "epoch": 0.3, "learning_rate": 1.6927243535095995e-05, "loss": 1.2256, "step": 620 }, { "epoch": 0.3, "learning_rate": 1.6868923918796753e-05, "loss": 1.236, "step": 625 }, { "epoch": 0.3, "learning_rate": 1.6810158587867973e-05, "loss": 1.244, "step": 630 }, { "epoch": 0.31, "learning_rate": 1.6750951355507763e-05, "loss": 1.2408, "step": 635 }, { "epoch": 0.31, "learning_rate": 1.6691306063588583e-05, "loss": 1.216, "step": 640 }, { "epoch": 0.31, "learning_rate": 1.6631226582407954e-05, "loss": 1.2315, "step": 645 }, { "epoch": 0.31, "learning_rate": 1.657071681043731e-05, "loss": 1.2348, "step": 650 }, { "epoch": 0.32, "learning_rate": 1.650978067406904e-05, "loss": 1.2338, "step": 655 }, { "epoch": 0.32, "learning_rate": 1.6448422127361707e-05, "loss": 1.2434, "step": 660 }, { "epoch": 0.32, "learning_rate": 1.638664515178348e-05, "loss": 1.2237, "step": 665 }, { "epoch": 0.32, "learning_rate": 1.6324453755953772e-05, "loss": 1.2026, "step": 670 }, { "epoch": 0.33, "learning_rate": 1.626185197538314e-05, "loss": 1.2498, "step": 675 }, { "epoch": 0.33, "learning_rate": 1.6198843872211404e-05, "loss": 1.2291, "step": 680 }, { "epoch": 0.33, "learning_rate": 1.613543353494409e-05, "loss": 1.2269, "step": 685 }, { "epoch": 0.33, "learning_rate": 1.6071625078187113e-05, "loss": 1.2363, "step": 690 }, { "epoch": 0.34, "learning_rate": 1.600742264237979e-05, "loss": 1.207, "step": 695 }, { "epoch": 0.34, "learning_rate": 1.5942830393526176e-05, "loss": 1.2208, "step": 700 }, { "epoch": 0.34, "eval_loss": 1.18331778049469, "eval_runtime": 427.6678, "eval_samples_per_second": 37.8, "eval_steps_per_second": 1.183, "step": 700 }, { "epoch": 0.34, "learning_rate": 1.5877852522924733e-05, "loss": 1.2137, "step": 705 }, { "epoch": 0.34, "learning_rate": 1.5812493246896368e-05, "loss": 1.2171, "step": 710 }, { "epoch": 0.35, "learning_rate": 1.574675680651084e-05, "loss": 1.2311, "step": 715 }, { "epoch": 0.35, "learning_rate": 1.568064746731156e-05, "loss": 1.2106, "step": 720 }, { "epoch": 0.35, "learning_rate": 1.561416951903881e-05, "loss": 1.2061, "step": 725 }, { "epoch": 0.35, "learning_rate": 1.554732727535139e-05, "loss": 1.2039, "step": 730 }, { "epoch": 0.35, "learning_rate": 1.5480125073546705e-05, "loss": 1.1872, "step": 735 }, { "epoch": 0.36, "learning_rate": 1.5412567274279316e-05, "loss": 1.2143, "step": 740 }, { "epoch": 0.36, "learning_rate": 1.5344658261278013e-05, "loss": 1.21, "step": 745 }, { "epoch": 0.36, "learning_rate": 1.527640244106133e-05, "loss": 1.198, "step": 750 }, { "epoch": 0.36, "learning_rate": 1.5207804242651625e-05, "loss": 1.2096, "step": 755 }, { "epoch": 0.37, "learning_rate": 1.5138868117287689e-05, "loss": 1.2292, "step": 760 }, { "epoch": 0.37, "learning_rate": 1.5069598538135905e-05, "loss": 1.2208, "step": 765 }, { "epoch": 0.37, "learning_rate": 1.5000000000000002e-05, "loss": 1.2113, "step": 770 }, { "epoch": 0.37, "learning_rate": 1.4930077019029376e-05, "loss": 1.2139, "step": 775 }, { "epoch": 0.38, "learning_rate": 1.485983413242606e-05, "loss": 1.2268, "step": 780 }, { "epoch": 0.38, "learning_rate": 1.4789275898150309e-05, "loss": 1.2414, "step": 785 }, { "epoch": 0.38, "learning_rate": 1.471840689462482e-05, "loss": 1.2141, "step": 790 }, { "epoch": 0.38, "learning_rate": 1.4647231720437687e-05, "loss": 1.1926, "step": 795 }, { "epoch": 0.39, "learning_rate": 1.4575754994043956e-05, "loss": 1.2375, "step": 800 }, { "epoch": 0.39, "eval_loss": 1.1774698495864868, "eval_runtime": 422.1872, "eval_samples_per_second": 38.291, "eval_steps_per_second": 1.199, "step": 800 }, { "epoch": 0.39, "learning_rate": 1.450398135346597e-05, "loss": 1.2201, "step": 805 }, { "epoch": 0.39, "learning_rate": 1.4431915455992416e-05, "loss": 1.2204, "step": 810 }, { "epoch": 0.39, "learning_rate": 1.4359561977876102e-05, "loss": 1.2133, "step": 815 }, { "epoch": 0.4, "learning_rate": 1.4286925614030542e-05, "loss": 1.2086, "step": 820 }, { "epoch": 0.4, "learning_rate": 1.4214011077725293e-05, "loss": 1.2039, "step": 825 }, { "epoch": 0.4, "learning_rate": 1.414082310028012e-05, "loss": 1.1965, "step": 830 }, { "epoch": 0.4, "learning_rate": 1.4067366430758004e-05, "loss": 1.2217, "step": 835 }, { "epoch": 0.41, "learning_rate": 1.3993645835656955e-05, "loss": 1.231, "step": 840 }, { "epoch": 0.41, "learning_rate": 1.3919666098600753e-05, "loss": 1.206, "step": 845 }, { "epoch": 0.41, "learning_rate": 1.3845432020028511e-05, "loss": 1.2025, "step": 850 }, { "epoch": 0.41, "learning_rate": 1.3770948416883205e-05, "loss": 1.2339, "step": 855 }, { "epoch": 0.42, "learning_rate": 1.369622012229911e-05, "loss": 1.2021, "step": 860 }, { "epoch": 0.42, "learning_rate": 1.362125198528817e-05, "loss": 1.2036, "step": 865 }, { "epoch": 0.42, "learning_rate": 1.3546048870425356e-05, "loss": 1.2242, "step": 870 }, { "epoch": 0.42, "learning_rate": 1.347061565753303e-05, "loss": 1.2259, "step": 875 }, { "epoch": 0.43, "learning_rate": 1.3394957241364273e-05, "loss": 1.1964, "step": 880 }, { "epoch": 0.43, "learning_rate": 1.3319078531285286e-05, "loss": 1.2042, "step": 885 }, { "epoch": 0.43, "learning_rate": 1.3242984450956829e-05, "loss": 1.2158, "step": 890 }, { "epoch": 0.43, "learning_rate": 1.3166679938014728e-05, "loss": 1.2052, "step": 895 }, { "epoch": 0.43, "learning_rate": 1.3090169943749475e-05, "loss": 1.2038, "step": 900 }, { "epoch": 0.43, "eval_loss": 1.1725637912750244, "eval_runtime": 421.5684, "eval_samples_per_second": 38.347, "eval_steps_per_second": 1.2, "step": 900 }, { "epoch": 0.44, "learning_rate": 1.301345943278496e-05, "loss": 1.2096, "step": 905 }, { "epoch": 0.44, "learning_rate": 1.293655338275631e-05, "loss": 1.2106, "step": 910 }, { "epoch": 0.44, "learning_rate": 1.2859456783986892e-05, "loss": 1.189, "step": 915 }, { "epoch": 0.44, "learning_rate": 1.2782174639164528e-05, "loss": 1.1913, "step": 920 }, { "epoch": 0.45, "learning_rate": 1.270471196301684e-05, "loss": 1.214, "step": 925 }, { "epoch": 0.45, "learning_rate": 1.262707378198587e-05, "loss": 1.2046, "step": 930 }, { "epoch": 0.45, "learning_rate": 1.2549265133901934e-05, "loss": 1.1957, "step": 935 }, { "epoch": 0.45, "learning_rate": 1.2471291067656696e-05, "loss": 1.1824, "step": 940 }, { "epoch": 0.46, "learning_rate": 1.2393156642875579e-05, "loss": 1.2013, "step": 945 }, { "epoch": 0.46, "learning_rate": 1.2314866929589434e-05, "loss": 1.209, "step": 950 }, { "epoch": 0.46, "learning_rate": 1.2236427007905558e-05, "loss": 1.1864, "step": 955 }, { "epoch": 0.46, "learning_rate": 1.2157841967678064e-05, "loss": 1.2149, "step": 960 }, { "epoch": 0.47, "learning_rate": 1.2079116908177592e-05, "loss": 1.1946, "step": 965 }, { "epoch": 0.47, "learning_rate": 1.2000256937760446e-05, "loss": 1.2138, "step": 970 }, { "epoch": 0.47, "learning_rate": 1.1921267173537085e-05, "loss": 1.2145, "step": 975 }, { "epoch": 0.47, "learning_rate": 1.1842152741040117e-05, "loss": 1.1983, "step": 980 }, { "epoch": 0.48, "learning_rate": 1.1762918773891691e-05, "loss": 1.1901, "step": 985 }, { "epoch": 0.48, "learning_rate": 1.1683570413470384e-05, "loss": 1.2152, "step": 990 }, { "epoch": 0.48, "learning_rate": 1.1604112808577603e-05, "loss": 1.2067, "step": 995 }, { "epoch": 0.48, "learning_rate": 1.1524551115103455e-05, "loss": 1.1926, "step": 1000 }, { "epoch": 0.48, "eval_loss": 1.168326735496521, "eval_runtime": 421.3368, "eval_samples_per_second": 38.368, "eval_steps_per_second": 1.201, "step": 1000 }, { "epoch": 0.49, "learning_rate": 1.1444890495692214e-05, "loss": 1.1972, "step": 1005 }, { "epoch": 0.49, "learning_rate": 1.1365136119407318e-05, "loss": 1.1905, "step": 1010 }, { "epoch": 0.49, "learning_rate": 1.1285293161395948e-05, "loss": 1.1882, "step": 1015 }, { "epoch": 0.49, "learning_rate": 1.1205366802553231e-05, "loss": 1.2066, "step": 1020 }, { "epoch": 0.5, "learning_rate": 1.1125362229186056e-05, "loss": 1.1901, "step": 1025 }, { "epoch": 0.5, "learning_rate": 1.1045284632676535e-05, "loss": 1.1869, "step": 1030 }, { "epoch": 0.5, "learning_rate": 1.0965139209145153e-05, "loss": 1.1932, "step": 1035 }, { "epoch": 0.5, "learning_rate": 1.0884931159113585e-05, "loss": 1.182, "step": 1040 }, { "epoch": 0.5, "learning_rate": 1.0804665687167262e-05, "loss": 1.214, "step": 1045 }, { "epoch": 0.51, "learning_rate": 1.0724348001617626e-05, "loss": 1.193, "step": 1050 }, { "epoch": 0.51, "learning_rate": 1.0643983314164195e-05, "loss": 1.207, "step": 1055 }, { "epoch": 0.51, "learning_rate": 1.0563576839556375e-05, "loss": 1.2182, "step": 1060 }, { "epoch": 0.51, "learning_rate": 1.0483133795255072e-05, "loss": 1.1954, "step": 1065 }, { "epoch": 0.52, "learning_rate": 1.0402659401094154e-05, "loss": 1.1991, "step": 1070 }, { "epoch": 0.52, "learning_rate": 1.0322158878941733e-05, "loss": 1.1939, "step": 1075 }, { "epoch": 0.52, "learning_rate": 1.0241637452361323e-05, "loss": 1.184, "step": 1080 }, { "epoch": 0.52, "learning_rate": 1.0161100346272913e-05, "loss": 1.2052, "step": 1085 }, { "epoch": 0.53, "learning_rate": 1.0080552786613899e-05, "loss": 1.2077, "step": 1090 }, { "epoch": 0.53, "learning_rate": 1e-05, "loss": 1.1848, "step": 1095 }, { "epoch": 0.53, "learning_rate": 9.919447213386103e-06, "loss": 1.1933, "step": 1100 }, { "epoch": 0.53, "eval_loss": 1.1648716926574707, "eval_runtime": 424.4832, "eval_samples_per_second": 38.084, "eval_steps_per_second": 1.192, "step": 1100 }, { "epoch": 0.53, "learning_rate": 9.838899653727088e-06, "loss": 1.1803, "step": 1105 }, { "epoch": 0.54, "learning_rate": 9.75836254763868e-06, "loss": 1.1981, "step": 1110 }, { "epoch": 0.54, "learning_rate": 9.677841121058274e-06, "loss": 1.199, "step": 1115 }, { "epoch": 0.54, "learning_rate": 9.597340598905851e-06, "loss": 1.215, "step": 1120 }, { "epoch": 0.54, "learning_rate": 9.516866204744932e-06, "loss": 1.1851, "step": 1125 }, { "epoch": 0.55, "learning_rate": 9.436423160443625e-06, "loss": 1.1746, "step": 1130 }, { "epoch": 0.55, "learning_rate": 9.356016685835807e-06, "loss": 1.1955, "step": 1135 }, { "epoch": 0.55, "learning_rate": 9.275651998382377e-06, "loss": 1.1971, "step": 1140 }, { "epoch": 0.55, "learning_rate": 9.195334312832742e-06, "loss": 1.1919, "step": 1145 }, { "epoch": 0.56, "learning_rate": 9.115068840886418e-06, "loss": 1.1744, "step": 1150 }, { "epoch": 0.56, "learning_rate": 9.034860790854848e-06, "loss": 1.1884, "step": 1155 }, { "epoch": 0.56, "learning_rate": 8.954715367323468e-06, "loss": 1.1796, "step": 1160 }, { "epoch": 0.56, "learning_rate": 8.874637770813947e-06, "loss": 1.191, "step": 1165 }, { "epoch": 0.57, "learning_rate": 8.79463319744677e-06, "loss": 1.196, "step": 1170 }, { "epoch": 0.57, "learning_rate": 8.714706838604056e-06, "loss": 1.2032, "step": 1175 }, { "epoch": 0.57, "learning_rate": 8.634863880592687e-06, "loss": 1.1897, "step": 1180 }, { "epoch": 0.57, "learning_rate": 8.55510950430779e-06, "loss": 1.1964, "step": 1185 }, { "epoch": 0.57, "learning_rate": 8.475448884896546e-06, "loss": 1.1858, "step": 1190 }, { "epoch": 0.58, "learning_rate": 8.395887191422397e-06, "loss": 1.1918, "step": 1195 }, { "epoch": 0.58, "learning_rate": 8.316429586529616e-06, "loss": 1.1893, "step": 1200 }, { "epoch": 0.58, "eval_loss": 1.1618335247039795, "eval_runtime": 423.2525, "eval_samples_per_second": 38.195, "eval_steps_per_second": 1.196, "step": 1200 }, { "epoch": 0.58, "learning_rate": 8.23708122610831e-06, "loss": 1.1907, "step": 1205 }, { "epoch": 0.58, "learning_rate": 8.157847258959885e-06, "loss": 1.2021, "step": 1210 }, { "epoch": 0.59, "learning_rate": 8.078732826462917e-06, "loss": 1.1999, "step": 1215 }, { "epoch": 0.59, "learning_rate": 7.999743062239557e-06, "loss": 1.1716, "step": 1220 }, { "epoch": 0.59, "learning_rate": 7.92088309182241e-06, "loss": 1.1864, "step": 1225 }, { "epoch": 0.59, "learning_rate": 7.84215803232194e-06, "loss": 1.1855, "step": 1230 }, { "epoch": 0.6, "learning_rate": 7.763572992094448e-06, "loss": 1.1899, "step": 1235 }, { "epoch": 0.6, "learning_rate": 7.685133070410571e-06, "loss": 1.1814, "step": 1240 }, { "epoch": 0.6, "learning_rate": 7.606843357124426e-06, "loss": 1.2001, "step": 1245 }, { "epoch": 0.6, "learning_rate": 7.5287089323433035e-06, "loss": 1.1886, "step": 1250 }, { "epoch": 0.61, "learning_rate": 7.450734866098066e-06, "loss": 1.2065, "step": 1255 }, { "epoch": 0.61, "learning_rate": 7.372926218014131e-06, "loss": 1.1812, "step": 1260 }, { "epoch": 0.61, "learning_rate": 7.2952880369831635e-06, "loss": 1.1729, "step": 1265 }, { "epoch": 0.61, "learning_rate": 7.217825360835475e-06, "loss": 1.1843, "step": 1270 }, { "epoch": 0.62, "learning_rate": 7.140543216013109e-06, "loss": 1.1864, "step": 1275 }, { "epoch": 0.62, "learning_rate": 7.063446617243695e-06, "loss": 1.1875, "step": 1280 }, { "epoch": 0.62, "learning_rate": 6.986540567215043e-06, "loss": 1.209, "step": 1285 }, { "epoch": 0.62, "learning_rate": 6.909830056250527e-06, "loss": 1.2043, "step": 1290 }, { "epoch": 0.63, "learning_rate": 6.833320061985278e-06, "loss": 1.1849, "step": 1295 }, { "epoch": 0.63, "learning_rate": 6.757015549043174e-06, "loss": 1.2029, "step": 1300 }, { "epoch": 0.63, "eval_loss": 1.1593303680419922, "eval_runtime": 425.0269, "eval_samples_per_second": 38.035, "eval_steps_per_second": 1.191, "step": 1300 }, { "epoch": 0.63, "learning_rate": 6.680921468714718e-06, "loss": 1.182, "step": 1305 }, { "epoch": 0.63, "learning_rate": 6.605042758635729e-06, "loss": 1.1861, "step": 1310 }, { "epoch": 0.64, "learning_rate": 6.529384342466971e-06, "loss": 1.1725, "step": 1315 }, { "epoch": 0.64, "learning_rate": 6.453951129574644e-06, "loss": 1.1873, "step": 1320 }, { "epoch": 0.64, "learning_rate": 6.378748014711834e-06, "loss": 1.1856, "step": 1325 }, { "epoch": 0.64, "learning_rate": 6.30377987770089e-06, "loss": 1.1836, "step": 1330 }, { "epoch": 0.64, "learning_rate": 6.229051583116796e-06, "loss": 1.1768, "step": 1335 }, { "epoch": 0.65, "learning_rate": 6.154567979971493e-06, "loss": 1.1871, "step": 1340 }, { "epoch": 0.65, "learning_rate": 6.080333901399252e-06, "loss": 1.1747, "step": 1345 }, { "epoch": 0.65, "learning_rate": 6.006354164343047e-06, "loss": 1.2114, "step": 1350 }, { "epoch": 0.65, "learning_rate": 5.932633569242e-06, "loss": 1.1872, "step": 1355 }, { "epoch": 0.66, "learning_rate": 5.859176899719883e-06, "loss": 1.1945, "step": 1360 }, { "epoch": 0.66, "learning_rate": 5.785988922274711e-06, "loss": 1.1785, "step": 1365 }, { "epoch": 0.66, "learning_rate": 5.713074385969457e-06, "loss": 1.2026, "step": 1370 }, { "epoch": 0.66, "learning_rate": 5.640438022123898e-06, "loss": 1.1928, "step": 1375 }, { "epoch": 0.67, "learning_rate": 5.5680845440075885e-06, "loss": 1.1775, "step": 1380 }, { "epoch": 0.67, "learning_rate": 5.496018646534032e-06, "loss": 1.1988, "step": 1385 }, { "epoch": 0.67, "learning_rate": 5.424245005956048e-06, "loss": 1.199, "step": 1390 }, { "epoch": 0.67, "learning_rate": 5.352768279562315e-06, "loss": 1.2145, "step": 1395 }, { "epoch": 0.68, "learning_rate": 5.28159310537518e-06, "loss": 1.2201, "step": 1400 }, { "epoch": 0.68, "eval_loss": 1.1572028398513794, "eval_runtime": 422.5597, "eval_samples_per_second": 38.257, "eval_steps_per_second": 1.197, "step": 1400 }, { "epoch": 0.68, "learning_rate": 5.210724101849696e-06, "loss": 1.2036, "step": 1405 }, { "epoch": 0.68, "learning_rate": 5.14016586757394e-06, "loss": 1.1714, "step": 1410 }, { "epoch": 0.68, "learning_rate": 5.069922980970626e-06, "loss": 1.164, "step": 1415 }, { "epoch": 0.69, "learning_rate": 5.000000000000003e-06, "loss": 1.1985, "step": 1420 }, { "epoch": 0.69, "learning_rate": 4.930401461864099e-06, "loss": 1.1966, "step": 1425 }, { "epoch": 0.69, "learning_rate": 4.861131882712314e-06, "loss": 1.1939, "step": 1430 }, { "epoch": 0.69, "learning_rate": 4.7921957573483756e-06, "loss": 1.2031, "step": 1435 }, { "epoch": 0.7, "learning_rate": 4.7235975589386715e-06, "loss": 1.1858, "step": 1440 }, { "epoch": 0.7, "learning_rate": 4.655341738721989e-06, "loss": 1.1909, "step": 1445 }, { "epoch": 0.7, "learning_rate": 4.587432725720687e-06, "loss": 1.1826, "step": 1450 }, { "epoch": 0.7, "learning_rate": 4.519874926453303e-06, "loss": 1.1905, "step": 1455 }, { "epoch": 0.71, "learning_rate": 4.4526727246486116e-06, "loss": 1.1671, "step": 1460 }, { "epoch": 0.71, "learning_rate": 4.385830480961192e-06, "loss": 1.196, "step": 1465 }, { "epoch": 0.71, "learning_rate": 4.319352532688444e-06, "loss": 1.1855, "step": 1470 }, { "epoch": 0.71, "learning_rate": 4.2532431934891646e-06, "loss": 1.1721, "step": 1475 }, { "epoch": 0.71, "learning_rate": 4.187506753103637e-06, "loss": 1.1905, "step": 1480 }, { "epoch": 0.72, "learning_rate": 4.12214747707527e-06, "loss": 1.1937, "step": 1485 }, { "epoch": 0.72, "learning_rate": 4.057169606473828e-06, "loss": 1.1809, "step": 1490 }, { "epoch": 0.72, "learning_rate": 3.99257735762021e-06, "loss": 1.1805, "step": 1495 }, { "epoch": 0.72, "learning_rate": 3.9283749218128885e-06, "loss": 1.1741, "step": 1500 }, { "epoch": 0.72, "eval_loss": 1.155676007270813, "eval_runtime": 424.7643, "eval_samples_per_second": 38.059, "eval_steps_per_second": 1.191, "step": 1500 }, { "epoch": 0.73, "learning_rate": 3.864566465055913e-06, "loss": 1.178, "step": 1505 }, { "epoch": 0.73, "learning_rate": 3.8011561277885965e-06, "loss": 1.1738, "step": 1510 }, { "epoch": 0.73, "learning_rate": 3.738148024616863e-06, "loss": 1.2035, "step": 1515 }, { "epoch": 0.73, "learning_rate": 3.6755462440462288e-06, "loss": 1.1699, "step": 1520 }, { "epoch": 0.74, "learning_rate": 3.6133548482165225e-06, "loss": 1.1839, "step": 1525 }, { "epoch": 0.74, "learning_rate": 3.5515778726382967e-06, "loss": 1.1988, "step": 1530 }, { "epoch": 0.74, "learning_rate": 3.4902193259309627e-06, "loss": 1.1747, "step": 1535 }, { "epoch": 0.74, "learning_rate": 3.4292831895626944e-06, "loss": 1.1824, "step": 1540 }, { "epoch": 0.75, "learning_rate": 3.3687734175920505e-06, "loss": 1.1772, "step": 1545 }, { "epoch": 0.75, "learning_rate": 3.308693936411421e-06, "loss": 1.1655, "step": 1550 }, { "epoch": 0.75, "learning_rate": 3.2490486444922396e-06, "loss": 1.1734, "step": 1555 }, { "epoch": 0.75, "learning_rate": 3.1898414121320277e-06, "loss": 1.1759, "step": 1560 }, { "epoch": 0.76, "learning_rate": 3.131076081203247e-06, "loss": 1.1901, "step": 1565 }, { "epoch": 0.76, "learning_rate": 3.0727564649040066e-06, "loss": 1.174, "step": 1570 }, { "epoch": 0.76, "learning_rate": 3.0148863475106315e-06, "loss": 1.1773, "step": 1575 }, { "epoch": 0.76, "learning_rate": 2.9574694841321082e-06, "loss": 1.1741, "step": 1580 }, { "epoch": 0.77, "learning_rate": 2.900509600466418e-06, "loss": 1.179, "step": 1585 }, { "epoch": 0.77, "learning_rate": 2.8440103925587904e-06, "loss": 1.1896, "step": 1590 }, { "epoch": 0.77, "learning_rate": 2.7879755265618558e-06, "loss": 1.1705, "step": 1595 }, { "epoch": 0.77, "learning_rate": 2.73240863849777e-06, "loss": 1.1813, "step": 1600 }, { "epoch": 0.77, "eval_loss": 1.1545099020004272, "eval_runtime": 425.9638, "eval_samples_per_second": 37.952, "eval_steps_per_second": 1.188, "step": 1600 }, { "epoch": 0.78, "learning_rate": 2.6773133340222677e-06, "loss": 1.1822, "step": 1605 }, { "epoch": 0.78, "learning_rate": 2.622693188190699e-06, "loss": 1.1801, "step": 1610 }, { "epoch": 0.78, "learning_rate": 2.5685517452260566e-06, "loss": 1.1689, "step": 1615 }, { "epoch": 0.78, "learning_rate": 2.514892518288988e-06, "loss": 1.1649, "step": 1620 }, { "epoch": 0.78, "learning_rate": 2.4617189892498326e-06, "loss": 1.1727, "step": 1625 }, { "epoch": 0.79, "learning_rate": 2.4090346084626857e-06, "loss": 1.1716, "step": 1630 }, { "epoch": 0.79, "learning_rate": 2.3568427945415163e-06, "loss": 1.1942, "step": 1635 }, { "epoch": 0.79, "learning_rate": 2.3051469341383403e-06, "loss": 1.1729, "step": 1640 }, { "epoch": 0.79, "learning_rate": 2.2539503817234553e-06, "loss": 1.1856, "step": 1645 }, { "epoch": 0.8, "learning_rate": 2.2032564593677773e-06, "loss": 1.1631, "step": 1650 }, { "epoch": 0.8, "learning_rate": 2.153068456527283e-06, "loss": 1.1741, "step": 1655 }, { "epoch": 0.8, "learning_rate": 2.103389629829551e-06, "loss": 1.1656, "step": 1660 }, { "epoch": 0.8, "learning_rate": 2.0542232028624585e-06, "loss": 1.1743, "step": 1665 }, { "epoch": 0.81, "learning_rate": 2.0055723659649907e-06, "loss": 1.1863, "step": 1670 }, { "epoch": 0.81, "learning_rate": 1.9574402760202315e-06, "loss": 1.1892, "step": 1675 }, { "epoch": 0.81, "learning_rate": 1.9098300562505266e-06, "loss": 1.1999, "step": 1680 }, { "epoch": 0.81, "learning_rate": 1.8627447960148036e-06, "loss": 1.173, "step": 1685 }, { "epoch": 0.82, "learning_rate": 1.8161875506081294e-06, "loss": 1.1957, "step": 1690 }, { "epoch": 0.82, "learning_rate": 1.7701613410634367e-06, "loss": 1.178, "step": 1695 }, { "epoch": 0.82, "learning_rate": 1.7246691539555027e-06, "loss": 1.1668, "step": 1700 }, { "epoch": 0.82, "eval_loss": 1.153558611869812, "eval_runtime": 426.9817, "eval_samples_per_second": 37.861, "eval_steps_per_second": 1.185, "step": 1700 }, { "epoch": 0.82, "learning_rate": 1.6797139412071583e-06, "loss": 1.1847, "step": 1705 }, { "epoch": 0.83, "learning_rate": 1.6352986198977327e-06, "loss": 1.1803, "step": 1710 }, { "epoch": 0.83, "learning_rate": 1.5914260720737796e-06, "loss": 1.1755, "step": 1715 }, { "epoch": 0.83, "learning_rate": 1.5480991445620541e-06, "loss": 1.1839, "step": 1720 }, { "epoch": 0.83, "learning_rate": 1.5053206487847916e-06, "loss": 1.1679, "step": 1725 }, { "epoch": 0.84, "learning_rate": 1.4630933605772801e-06, "loss": 1.1912, "step": 1730 }, { "epoch": 0.84, "learning_rate": 1.4214200200077343e-06, "loss": 1.172, "step": 1735 }, { "epoch": 0.84, "learning_rate": 1.3803033311995072e-06, "loss": 1.193, "step": 1740 }, { "epoch": 0.84, "learning_rate": 1.339745962155613e-06, "loss": 1.1819, "step": 1745 }, { "epoch": 0.85, "learning_rate": 1.2997505445856085e-06, "loss": 1.1911, "step": 1750 }, { "epoch": 0.85, "learning_rate": 1.2603196737348211e-06, "loss": 1.1804, "step": 1755 }, { "epoch": 0.85, "learning_rate": 1.2214559082159538e-06, "loss": 1.1797, "step": 1760 }, { "epoch": 0.85, "learning_rate": 1.1831617698430609e-06, "loss": 1.1686, "step": 1765 }, { "epoch": 0.85, "learning_rate": 1.1454397434679022e-06, "loss": 1.1825, "step": 1770 }, { "epoch": 0.86, "learning_rate": 1.1082922768187098e-06, "loss": 1.1881, "step": 1775 }, { "epoch": 0.86, "learning_rate": 1.0717217803413605e-06, "loss": 1.1746, "step": 1780 }, { "epoch": 0.86, "learning_rate": 1.0357306270429623e-06, "loss": 1.1833, "step": 1785 }, { "epoch": 0.86, "learning_rate": 1.0003211523378798e-06, "loss": 1.1932, "step": 1790 }, { "epoch": 0.87, "learning_rate": 9.65495653896179e-07, "loss": 1.1858, "step": 1795 }, { "epoch": 0.87, "learning_rate": 9.312563914945461e-07, "loss": 1.1495, "step": 1800 }, { "epoch": 0.87, "eval_loss": 1.1530314683914185, "eval_runtime": 426.2825, "eval_samples_per_second": 37.923, "eval_steps_per_second": 1.187, "step": 1800 }, { "epoch": 0.87, "learning_rate": 8.976055868696543e-07, "loss": 1.1573, "step": 1805 }, { "epoch": 0.87, "learning_rate": 8.645454235739903e-07, "loss": 1.1824, "step": 1810 }, { "epoch": 0.88, "learning_rate": 8.320780468341761e-07, "loss": 1.1805, "step": 1815 }, { "epoch": 0.88, "learning_rate": 8.002055634117578e-07, "loss": 1.1705, "step": 1820 }, { "epoch": 0.88, "learning_rate": 7.689300414665124e-07, "loss": 1.1755, "step": 1825 }, { "epoch": 0.88, "learning_rate": 7.382535104222366e-07, "loss": 1.1529, "step": 1830 }, { "epoch": 0.89, "learning_rate": 7.08177960835068e-07, "loss": 1.1716, "step": 1835 }, { "epoch": 0.89, "learning_rate": 6.787053442643233e-07, "loss": 1.1828, "step": 1840 }, { "epoch": 0.89, "learning_rate": 6.498375731458529e-07, "loss": 1.1944, "step": 1845 }, { "epoch": 0.89, "learning_rate": 6.215765206679569e-07, "loss": 1.1898, "step": 1850 }, { "epoch": 0.9, "learning_rate": 5.939240206498287e-07, "loss": 1.169, "step": 1855 }, { "epoch": 0.9, "learning_rate": 5.668818674225684e-07, "loss": 1.1748, "step": 1860 }, { "epoch": 0.9, "learning_rate": 5.404518157127481e-07, "loss": 1.1664, "step": 1865 }, { "epoch": 0.9, "learning_rate": 5.146355805285452e-07, "loss": 1.1663, "step": 1870 }, { "epoch": 0.91, "learning_rate": 4.894348370484648e-07, "loss": 1.1845, "step": 1875 }, { "epoch": 0.91, "learning_rate": 4.6485122051263764e-07, "loss": 1.168, "step": 1880 }, { "epoch": 0.91, "learning_rate": 4.408863261167096e-07, "loss": 1.1911, "step": 1885 }, { "epoch": 0.91, "learning_rate": 4.1754170890833777e-07, "loss": 1.1901, "step": 1890 }, { "epoch": 0.92, "learning_rate": 3.9481888368627764e-07, "loss": 1.1845, "step": 1895 }, { "epoch": 0.92, "learning_rate": 3.7271932490209327e-07, "loss": 1.1595, "step": 1900 }, { "epoch": 0.92, "eval_loss": 1.1527146100997925, "eval_runtime": 427.0976, "eval_samples_per_second": 37.851, "eval_steps_per_second": 1.185, "step": 1900 }, { "epoch": 0.92, "learning_rate": 3.5124446656448654e-07, "loss": 1.1582, "step": 1905 }, { "epoch": 0.92, "learning_rate": 3.303957021462378e-07, "loss": 1.1688, "step": 1910 }, { "epoch": 0.92, "learning_rate": 3.101743844937943e-07, "loss": 1.1785, "step": 1915 }, { "epoch": 0.93, "learning_rate": 2.905818257394799e-07, "loss": 1.182, "step": 1920 }, { "epoch": 0.93, "learning_rate": 2.716192972163556e-07, "loss": 1.1625, "step": 1925 }, { "epoch": 0.93, "learning_rate": 2.532880293757223e-07, "loss": 1.1795, "step": 1930 }, { "epoch": 0.93, "learning_rate": 2.355892117072789e-07, "loss": 1.1623, "step": 1935 }, { "epoch": 0.94, "learning_rate": 2.1852399266194312e-07, "loss": 1.1802, "step": 1940 }, { "epoch": 0.94, "learning_rate": 2.0209347957732328e-07, "loss": 1.1782, "step": 1945 }, { "epoch": 0.94, "learning_rate": 1.8629873860586567e-07, "loss": 1.153, "step": 1950 }, { "epoch": 0.94, "learning_rate": 1.711407946456789e-07, "loss": 1.1797, "step": 1955 }, { "epoch": 0.95, "learning_rate": 1.5662063127402262e-07, "loss": 1.1657, "step": 1960 }, { "epoch": 0.95, "learning_rate": 1.4273919068349184e-07, "loss": 1.1736, "step": 1965 }, { "epoch": 0.95, "learning_rate": 1.2949737362087156e-07, "loss": 1.1771, "step": 1970 }, { "epoch": 0.95, "learning_rate": 1.1689603932869664e-07, "loss": 1.1638, "step": 1975 }, { "epoch": 0.96, "learning_rate": 1.0493600548948879e-07, "loss": 1.1764, "step": 1980 }, { "epoch": 0.96, "learning_rate": 9.36180481727067e-08, "loss": 1.1829, "step": 1985 }, { "epoch": 0.96, "learning_rate": 8.29429017843797e-08, "loss": 1.1888, "step": 1990 }, { "epoch": 0.96, "learning_rate": 7.291125901946027e-08, "loss": 1.1743, "step": 1995 }, { "epoch": 0.97, "learning_rate": 6.352377081687011e-08, "loss": 1.1607, "step": 2000 }, { "epoch": 0.97, "eval_loss": 1.1526199579238892, "eval_runtime": 426.4931, "eval_samples_per_second": 37.904, "eval_steps_per_second": 1.186, "step": 2000 }, { "epoch": 0.97, "learning_rate": 5.4781046317267103e-08, "loss": 1.1562, "step": 2005 }, { "epoch": 0.97, "learning_rate": 4.6683652823513725e-08, "loss": 1.1882, "step": 2010 }, { "epoch": 0.97, "learning_rate": 3.923211576387087e-08, "loss": 1.178, "step": 2015 }, { "epoch": 0.98, "learning_rate": 3.242691865790071e-08, "loss": 1.165, "step": 2020 }, { "epoch": 0.98, "learning_rate": 2.6268503085089547e-08, "loss": 1.1853, "step": 2025 }, { "epoch": 0.98, "learning_rate": 2.0757268656198536e-08, "loss": 1.1643, "step": 2030 }, { "epoch": 0.98, "learning_rate": 1.5893572987333293e-08, "loss": 1.1776, "step": 2035 }, { "epoch": 0.99, "learning_rate": 1.1677731676733584e-08, "loss": 1.1739, "step": 2040 }, { "epoch": 0.99, "learning_rate": 8.110018284304132e-09, "loss": 1.1549, "step": 2045 }, { "epoch": 0.99, "learning_rate": 5.190664313851068e-09, "loss": 1.1693, "step": 2050 }, { "epoch": 0.99, "learning_rate": 2.9198591980705847e-09, "loss": 1.1632, "step": 2055 }, { "epoch": 0.99, "learning_rate": 1.2977502862532298e-09, "loss": 1.1761, "step": 2060 }, { "epoch": 1.0, "learning_rate": 3.244428347204398e-10, "loss": 1.1803, "step": 2065 }, { "epoch": 1.0, "learning_rate": 0.0, "loss": 1.1949, "step": 2070 }, { "epoch": 1.0, "step": 2070, "total_flos": 2.0719233559829676e+19, "train_loss": 1.2525324755820675, "train_runtime": 32325.2724, "train_samples_per_second": 8.198, "train_steps_per_second": 0.064 } ], "logging_steps": 5, "max_steps": 2070, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 2.0719233559829676e+19, "train_batch_size": 16, "trial_name": null, "trial_params": null }