{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9993739889922224, "eval_steps": 500, "global_step": 1559, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006410352719642222, "grad_norm": 0.10498046875, "learning_rate": 1.282051282051282e-06, "loss": 1.8493, "step": 1 }, { "epoch": 0.003205176359821111, "grad_norm": 0.1103515625, "learning_rate": 6.41025641025641e-06, "loss": 1.8865, "step": 5 }, { "epoch": 0.006410352719642222, "grad_norm": 0.1005859375, "learning_rate": 1.282051282051282e-05, "loss": 1.8383, "step": 10 }, { "epoch": 0.009615529079463333, "grad_norm": 0.10693359375, "learning_rate": 1.923076923076923e-05, "loss": 1.8385, "step": 15 }, { "epoch": 0.012820705439284444, "grad_norm": 0.1103515625, "learning_rate": 2.564102564102564e-05, "loss": 1.8346, "step": 20 }, { "epoch": 0.016025881799105555, "grad_norm": 0.1298828125, "learning_rate": 3.205128205128206e-05, "loss": 1.8127, "step": 25 }, { "epoch": 0.019231058158926666, "grad_norm": 0.1435546875, "learning_rate": 3.846153846153846e-05, "loss": 1.7981, "step": 30 }, { "epoch": 0.022436234518747777, "grad_norm": 0.1494140625, "learning_rate": 4.4871794871794874e-05, "loss": 1.7907, "step": 35 }, { "epoch": 0.025641410878568888, "grad_norm": 0.1416015625, "learning_rate": 5.128205128205128e-05, "loss": 1.7468, "step": 40 }, { "epoch": 0.02884658723839, "grad_norm": 0.1328125, "learning_rate": 5.769230769230769e-05, "loss": 1.7105, "step": 45 }, { "epoch": 0.03205176359821111, "grad_norm": 0.126953125, "learning_rate": 6.410256410256412e-05, "loss": 1.6887, "step": 50 }, { "epoch": 0.035256939958032224, "grad_norm": 0.107421875, "learning_rate": 7.051282051282052e-05, "loss": 1.6757, "step": 55 }, { "epoch": 0.03846211631785333, "grad_norm": 0.10009765625, "learning_rate": 7.692307692307693e-05, "loss": 1.6736, "step": 60 }, { "epoch": 0.041667292677674446, "grad_norm": 0.078125, "learning_rate": 8.333333333333334e-05, "loss": 1.6252, "step": 65 }, { "epoch": 0.04487246903749555, "grad_norm": 0.06201171875, "learning_rate": 8.974358974358975e-05, "loss": 1.5655, "step": 70 }, { "epoch": 0.04807764539731667, "grad_norm": 0.049072265625, "learning_rate": 9.615384615384617e-05, "loss": 1.5646, "step": 75 }, { "epoch": 0.051282821757137775, "grad_norm": 0.04345703125, "learning_rate": 0.00010256410256410256, "loss": 1.5861, "step": 80 }, { "epoch": 0.05448799811695889, "grad_norm": 0.039794921875, "learning_rate": 0.00010897435897435896, "loss": 1.5379, "step": 85 }, { "epoch": 0.05769317447678, "grad_norm": 0.0400390625, "learning_rate": 0.00011538461538461538, "loss": 1.5557, "step": 90 }, { "epoch": 0.06089835083660111, "grad_norm": 0.037841796875, "learning_rate": 0.00012179487179487179, "loss": 1.5102, "step": 95 }, { "epoch": 0.06410352719642222, "grad_norm": 0.038330078125, "learning_rate": 0.00012820512820512823, "loss": 1.5048, "step": 100 }, { "epoch": 0.06730870355624333, "grad_norm": 0.033935546875, "learning_rate": 0.00013461538461538464, "loss": 1.5127, "step": 105 }, { "epoch": 0.07051387991606445, "grad_norm": 0.03173828125, "learning_rate": 0.00014102564102564104, "loss": 1.5161, "step": 110 }, { "epoch": 0.07371905627588556, "grad_norm": 0.0301513671875, "learning_rate": 0.00014743589743589745, "loss": 1.4948, "step": 115 }, { "epoch": 0.07692423263570666, "grad_norm": 0.03125, "learning_rate": 0.00015384615384615385, "loss": 1.4584, "step": 120 }, { "epoch": 0.08012940899552777, "grad_norm": 0.029052734375, "learning_rate": 0.00016025641025641028, "loss": 1.4704, "step": 125 }, { "epoch": 0.08333458535534889, "grad_norm": 0.0279541015625, "learning_rate": 0.0001666666666666667, "loss": 1.4411, "step": 130 }, { "epoch": 0.08653976171517, "grad_norm": 0.0263671875, "learning_rate": 0.0001730769230769231, "loss": 1.4723, "step": 135 }, { "epoch": 0.0897449380749911, "grad_norm": 0.02685546875, "learning_rate": 0.0001794871794871795, "loss": 1.4505, "step": 140 }, { "epoch": 0.09295011443481223, "grad_norm": 0.0291748046875, "learning_rate": 0.0001858974358974359, "loss": 1.4367, "step": 145 }, { "epoch": 0.09615529079463334, "grad_norm": 0.0262451171875, "learning_rate": 0.00019230769230769233, "loss": 1.4291, "step": 150 }, { "epoch": 0.09936046715445444, "grad_norm": 0.0390625, "learning_rate": 0.00019871794871794874, "loss": 1.4075, "step": 155 }, { "epoch": 0.10256564351427555, "grad_norm": 0.03857421875, "learning_rate": 0.00019999598882613538, "loss": 1.4203, "step": 160 }, { "epoch": 0.10577081987409667, "grad_norm": 0.029541015625, "learning_rate": 0.00019997969398381457, "loss": 1.4188, "step": 165 }, { "epoch": 0.10897599623391778, "grad_norm": 0.025146484375, "learning_rate": 0.00019995086681563726, "loss": 1.4512, "step": 170 }, { "epoch": 0.11218117259373889, "grad_norm": 0.025146484375, "learning_rate": 0.0001999095109350519, "loss": 1.417, "step": 175 }, { "epoch": 0.11538634895356, "grad_norm": 0.02734375, "learning_rate": 0.0001998556315259648, "loss": 1.4309, "step": 180 }, { "epoch": 0.11859152531338112, "grad_norm": 0.0255126953125, "learning_rate": 0.00019978923534209054, "loss": 1.4201, "step": 185 }, { "epoch": 0.12179670167320222, "grad_norm": 0.0286865234375, "learning_rate": 0.00019971033070610518, "loss": 1.4187, "step": 190 }, { "epoch": 0.12500187803302334, "grad_norm": 0.030517578125, "learning_rate": 0.0001996189275086033, "loss": 1.4153, "step": 195 }, { "epoch": 0.12820705439284444, "grad_norm": 0.0272216796875, "learning_rate": 0.00019951503720685784, "loss": 1.4279, "step": 200 }, { "epoch": 0.13141223075266556, "grad_norm": 0.0267333984375, "learning_rate": 0.0001993986728233844, "loss": 1.4052, "step": 205 }, { "epoch": 0.13461740711248665, "grad_norm": 0.0264892578125, "learning_rate": 0.0001992698489443085, "loss": 1.3943, "step": 210 }, { "epoch": 0.13782258347230777, "grad_norm": 0.032470703125, "learning_rate": 0.0001991285817175375, "loss": 1.3931, "step": 215 }, { "epoch": 0.1410277598321289, "grad_norm": 0.0291748046875, "learning_rate": 0.0001989748888507363, "loss": 1.3931, "step": 220 }, { "epoch": 0.14423293619195, "grad_norm": 0.03125, "learning_rate": 0.00019880878960910772, "loss": 1.3899, "step": 225 }, { "epoch": 0.1474381125517711, "grad_norm": 0.0322265625, "learning_rate": 0.0001986303048129778, "loss": 1.4305, "step": 230 }, { "epoch": 0.15064328891159223, "grad_norm": 0.033203125, "learning_rate": 0.0001984394568351858, "loss": 1.4028, "step": 235 }, { "epoch": 0.15384846527141333, "grad_norm": 0.03369140625, "learning_rate": 0.00019823626959827997, "loss": 1.3758, "step": 240 }, { "epoch": 0.15705364163123445, "grad_norm": 0.041015625, "learning_rate": 0.0001980207685715186, "loss": 1.407, "step": 245 }, { "epoch": 0.16025881799105554, "grad_norm": 0.034912109375, "learning_rate": 0.00019779298076767795, "loss": 1.3923, "step": 250 }, { "epoch": 0.16346399435087666, "grad_norm": 0.047119140625, "learning_rate": 0.00019755293473966572, "loss": 1.3967, "step": 255 }, { "epoch": 0.16666917071069778, "grad_norm": 0.043701171875, "learning_rate": 0.00019730066057694235, "loss": 1.4007, "step": 260 }, { "epoch": 0.16987434707051888, "grad_norm": 0.050048828125, "learning_rate": 0.00019703618990174918, "loss": 1.3978, "step": 265 }, { "epoch": 0.17307952343034, "grad_norm": 0.048095703125, "learning_rate": 0.00019675955586514468, "loss": 1.3744, "step": 270 }, { "epoch": 0.17628469979016112, "grad_norm": 0.033935546875, "learning_rate": 0.00019647079314284897, "loss": 1.3929, "step": 275 }, { "epoch": 0.1794898761499822, "grad_norm": 0.033935546875, "learning_rate": 0.0001961699379308974, "loss": 1.4031, "step": 280 }, { "epoch": 0.18269505250980334, "grad_norm": 0.04052734375, "learning_rate": 0.0001958570279411032, "loss": 1.3813, "step": 285 }, { "epoch": 0.18590022886962446, "grad_norm": 0.052734375, "learning_rate": 0.00019553210239633056, "loss": 1.3956, "step": 290 }, { "epoch": 0.18910540522944555, "grad_norm": 0.048095703125, "learning_rate": 0.00019519520202557797, "loss": 1.3988, "step": 295 }, { "epoch": 0.19231058158926667, "grad_norm": 0.037109375, "learning_rate": 0.00019484636905887296, "loss": 1.3925, "step": 300 }, { "epoch": 0.19551575794908777, "grad_norm": 0.036865234375, "learning_rate": 0.00019448564722197853, "loss": 1.376, "step": 305 }, { "epoch": 0.1987209343089089, "grad_norm": 0.04052734375, "learning_rate": 0.00019411308173091228, "loss": 1.3974, "step": 310 }, { "epoch": 0.20192611066873, "grad_norm": 0.052490234375, "learning_rate": 0.0001937287192862787, "loss": 1.3765, "step": 315 }, { "epoch": 0.2051312870285511, "grad_norm": 0.059326171875, "learning_rate": 0.00019333260806741502, "loss": 1.3769, "step": 320 }, { "epoch": 0.20833646338837222, "grad_norm": 0.052490234375, "learning_rate": 0.00019292479772635237, "loss": 1.3792, "step": 325 }, { "epoch": 0.21154163974819334, "grad_norm": 0.048583984375, "learning_rate": 0.00019250533938159166, "loss": 1.3968, "step": 330 }, { "epoch": 0.21474681610801444, "grad_norm": 0.040283203125, "learning_rate": 0.00019207428561169608, "loss": 1.38, "step": 335 }, { "epoch": 0.21795199246783556, "grad_norm": 0.043701171875, "learning_rate": 0.0001916316904487005, "loss": 1.3737, "step": 340 }, { "epoch": 0.22115716882765665, "grad_norm": 0.03759765625, "learning_rate": 0.00019117760937133844, "loss": 1.4065, "step": 345 }, { "epoch": 0.22436234518747777, "grad_norm": 0.038330078125, "learning_rate": 0.00019071209929808806, "loss": 1.4012, "step": 350 }, { "epoch": 0.2275675215472989, "grad_norm": 0.041748046875, "learning_rate": 0.00019023521858003742, "loss": 1.3941, "step": 355 }, { "epoch": 0.23077269790712, "grad_norm": 0.037841796875, "learning_rate": 0.00018974702699357029, "loss": 1.4072, "step": 360 }, { "epoch": 0.2339778742669411, "grad_norm": 0.03759765625, "learning_rate": 0.00018924758573287315, "loss": 1.3531, "step": 365 }, { "epoch": 0.23718305062676223, "grad_norm": 0.03662109375, "learning_rate": 0.00018873695740226468, "loss": 1.3682, "step": 370 }, { "epoch": 0.24038822698658333, "grad_norm": 0.047607421875, "learning_rate": 0.0001882152060083484, "loss": 1.3796, "step": 375 }, { "epoch": 0.24359340334640445, "grad_norm": 0.041015625, "learning_rate": 0.00018768239695198945, "loss": 1.3835, "step": 380 }, { "epoch": 0.24679857970622554, "grad_norm": 0.04541015625, "learning_rate": 0.0001871385970201168, "loss": 1.3678, "step": 385 }, { "epoch": 0.2500037560660467, "grad_norm": 0.04345703125, "learning_rate": 0.00018658387437735135, "loss": 1.3778, "step": 390 }, { "epoch": 0.2532089324258678, "grad_norm": 0.06396484375, "learning_rate": 0.00018601829855746185, "loss": 1.3811, "step": 395 }, { "epoch": 0.2564141087856889, "grad_norm": 0.057373046875, "learning_rate": 0.00018544194045464886, "loss": 1.3851, "step": 400 }, { "epoch": 0.25961928514551, "grad_norm": 0.0458984375, "learning_rate": 0.0001848548723146581, "loss": 1.3865, "step": 405 }, { "epoch": 0.2628244615053311, "grad_norm": 0.047119140625, "learning_rate": 0.00018425716772572473, "loss": 1.3638, "step": 410 }, { "epoch": 0.2660296378651522, "grad_norm": 0.04443359375, "learning_rate": 0.00018364890160934904, "loss": 1.3918, "step": 415 }, { "epoch": 0.2692348142249733, "grad_norm": 0.042236328125, "learning_rate": 0.00018303015021090525, "loss": 1.3794, "step": 420 }, { "epoch": 0.27243999058479446, "grad_norm": 0.06005859375, "learning_rate": 0.00018240099109008412, "loss": 1.3836, "step": 425 }, { "epoch": 0.27564516694461555, "grad_norm": 0.05419921875, "learning_rate": 0.000181761503111171, "loss": 1.3676, "step": 430 }, { "epoch": 0.27885034330443664, "grad_norm": 0.04443359375, "learning_rate": 0.0001811117664331604, "loss": 1.3513, "step": 435 }, { "epoch": 0.2820555196642578, "grad_norm": 0.047607421875, "learning_rate": 0.00018045186249970784, "loss": 1.3602, "step": 440 }, { "epoch": 0.2852606960240789, "grad_norm": 0.043212890625, "learning_rate": 0.00017978187402892148, "loss": 1.3468, "step": 445 }, { "epoch": 0.2884658723839, "grad_norm": 0.05078125, "learning_rate": 0.00017910188500299304, "loss": 1.3651, "step": 450 }, { "epoch": 0.29167104874372113, "grad_norm": 0.04296875, "learning_rate": 0.00017841198065767107, "loss": 1.3763, "step": 455 }, { "epoch": 0.2948762251035422, "grad_norm": 0.044921875, "learning_rate": 0.00017771224747157652, "loss": 1.3597, "step": 460 }, { "epoch": 0.2980814014633633, "grad_norm": 0.0654296875, "learning_rate": 0.00017700277315536305, "loss": 1.3558, "step": 465 }, { "epoch": 0.30128657782318446, "grad_norm": 0.052978515625, "learning_rate": 0.00017628364664072218, "loss": 1.3534, "step": 470 }, { "epoch": 0.30449175418300556, "grad_norm": 0.04248046875, "learning_rate": 0.00017555495806923635, "loss": 1.3525, "step": 475 }, { "epoch": 0.30769693054282665, "grad_norm": 0.044189453125, "learning_rate": 0.00017481679878107926, "loss": 1.3715, "step": 480 }, { "epoch": 0.3109021069026478, "grad_norm": 0.058837890625, "learning_rate": 0.00017406926130356692, "loss": 1.3689, "step": 485 }, { "epoch": 0.3141072832624689, "grad_norm": 0.095703125, "learning_rate": 0.00017331243933955918, "loss": 1.3686, "step": 490 }, { "epoch": 0.31731245962229, "grad_norm": 0.059326171875, "learning_rate": 0.00017254642775571438, "loss": 1.3784, "step": 495 }, { "epoch": 0.3205176359821111, "grad_norm": 0.07373046875, "learning_rate": 0.00017177132257059787, "loss": 1.3488, "step": 500 }, { "epoch": 0.32372281234193223, "grad_norm": 0.0439453125, "learning_rate": 0.00017098722094264617, "loss": 1.3789, "step": 505 }, { "epoch": 0.3269279887017533, "grad_norm": 0.052490234375, "learning_rate": 0.00017019422115798833, "loss": 1.3414, "step": 510 }, { "epoch": 0.3301331650615744, "grad_norm": 0.0458984375, "learning_rate": 0.0001693924226181259, "loss": 1.3667, "step": 515 }, { "epoch": 0.33333834142139557, "grad_norm": 0.05322265625, "learning_rate": 0.00016858192582747304, "loss": 1.3749, "step": 520 }, { "epoch": 0.33654351778121666, "grad_norm": 0.0634765625, "learning_rate": 0.00016776283238075851, "loss": 1.3929, "step": 525 }, { "epoch": 0.33974869414103775, "grad_norm": 0.050537109375, "learning_rate": 0.00016693524495029068, "loss": 1.3527, "step": 530 }, { "epoch": 0.3429538705008589, "grad_norm": 0.059814453125, "learning_rate": 0.00016609926727308806, "loss": 1.3577, "step": 535 }, { "epoch": 0.34615904686068, "grad_norm": 0.07861328125, "learning_rate": 0.00016525500413787554, "loss": 1.3639, "step": 540 }, { "epoch": 0.3493642232205011, "grad_norm": 0.0595703125, "learning_rate": 0.00016440256137194965, "loss": 1.3608, "step": 545 }, { "epoch": 0.35256939958032224, "grad_norm": 0.052978515625, "learning_rate": 0.0001635420458279131, "loss": 1.3324, "step": 550 }, { "epoch": 0.35577457594014333, "grad_norm": 0.062255859375, "learning_rate": 0.0001626735653702809, "loss": 1.3283, "step": 555 }, { "epoch": 0.3589797522999644, "grad_norm": 0.04931640625, "learning_rate": 0.00016179722886195967, "loss": 1.3287, "step": 560 }, { "epoch": 0.3621849286597856, "grad_norm": 0.0703125, "learning_rate": 0.00016091314615060195, "loss": 1.3799, "step": 565 }, { "epoch": 0.36539010501960667, "grad_norm": 0.051025390625, "learning_rate": 0.00016002142805483685, "loss": 1.3399, "step": 570 }, { "epoch": 0.36859528137942776, "grad_norm": 0.05908203125, "learning_rate": 0.00015912218635037896, "loss": 1.3698, "step": 575 }, { "epoch": 0.3718004577392489, "grad_norm": 0.05078125, "learning_rate": 0.0001582155337560177, "loss": 1.3378, "step": 580 }, { "epoch": 0.37500563409907, "grad_norm": 0.051025390625, "learning_rate": 0.00015730158391948784, "loss": 1.337, "step": 585 }, { "epoch": 0.3782108104588911, "grad_norm": 0.0498046875, "learning_rate": 0.0001563804514032242, "loss": 1.3527, "step": 590 }, { "epoch": 0.3814159868187122, "grad_norm": 0.052734375, "learning_rate": 0.0001554522516700011, "loss": 1.3583, "step": 595 }, { "epoch": 0.38462116317853334, "grad_norm": 0.06201171875, "learning_rate": 0.00015451710106845955, "loss": 1.3421, "step": 600 }, { "epoch": 0.38782633953835444, "grad_norm": 0.050537109375, "learning_rate": 0.0001535751168185228, "loss": 1.3577, "step": 605 }, { "epoch": 0.39103151589817553, "grad_norm": 0.05517578125, "learning_rate": 0.00015262641699670328, "loss": 1.3706, "step": 610 }, { "epoch": 0.3942366922579967, "grad_norm": 0.054931640625, "learning_rate": 0.0001516711205213016, "loss": 1.3439, "step": 615 }, { "epoch": 0.3974418686178178, "grad_norm": 0.0478515625, "learning_rate": 0.00015070934713750042, "loss": 1.3353, "step": 620 }, { "epoch": 0.40064704497763887, "grad_norm": 0.048583984375, "learning_rate": 0.00014974121740235456, "loss": 1.3489, "step": 625 }, { "epoch": 0.40385222133746, "grad_norm": 0.057373046875, "learning_rate": 0.00014876685266967924, "loss": 1.3481, "step": 630 }, { "epoch": 0.4070573976972811, "grad_norm": 0.053466796875, "learning_rate": 0.00014778637507483866, "loss": 1.3533, "step": 635 }, { "epoch": 0.4102625740571022, "grad_norm": 0.06494140625, "learning_rate": 0.0001467999075194363, "loss": 1.3522, "step": 640 }, { "epoch": 0.41346775041692335, "grad_norm": 0.06689453125, "learning_rate": 0.00014580757365590963, "loss": 1.3712, "step": 645 }, { "epoch": 0.41667292677674445, "grad_norm": 0.053955078125, "learning_rate": 0.00014480949787203014, "loss": 1.3606, "step": 650 }, { "epoch": 0.41987810313656554, "grad_norm": 0.046875, "learning_rate": 0.0001438058052753118, "loss": 1.3488, "step": 655 }, { "epoch": 0.4230832794963867, "grad_norm": 0.058837890625, "learning_rate": 0.00014279662167732867, "loss": 1.342, "step": 660 }, { "epoch": 0.4262884558562078, "grad_norm": 0.07080078125, "learning_rate": 0.00014178207357794486, "loss": 1.3712, "step": 665 }, { "epoch": 0.4294936322160289, "grad_norm": 0.05029296875, "learning_rate": 0.00014076228814945778, "loss": 1.3227, "step": 670 }, { "epoch": 0.43269880857585, "grad_norm": 0.06982421875, "learning_rate": 0.00013973739322065728, "loss": 1.3201, "step": 675 }, { "epoch": 0.4359039849356711, "grad_norm": 0.05029296875, "learning_rate": 0.00013870751726080256, "loss": 1.3406, "step": 680 }, { "epoch": 0.4391091612954922, "grad_norm": 0.06201171875, "learning_rate": 0.00013767278936351854, "loss": 1.3636, "step": 685 }, { "epoch": 0.4423143376553133, "grad_norm": 0.0458984375, "learning_rate": 0.0001366333392306143, "loss": 1.3576, "step": 690 }, { "epoch": 0.44551951401513445, "grad_norm": 0.06005859375, "learning_rate": 0.00013558929715582515, "loss": 1.3517, "step": 695 }, { "epoch": 0.44872469037495555, "grad_norm": 0.05126953125, "learning_rate": 0.00013454079400848027, "loss": 1.3376, "step": 700 }, { "epoch": 0.45192986673477664, "grad_norm": 0.059326171875, "learning_rate": 0.00013348796121709862, "loss": 1.3633, "step": 705 }, { "epoch": 0.4551350430945978, "grad_norm": 0.05078125, "learning_rate": 0.00013243093075291444, "loss": 1.3217, "step": 710 }, { "epoch": 0.4583402194544189, "grad_norm": 0.056884765625, "learning_rate": 0.00013136983511333482, "loss": 1.3265, "step": 715 }, { "epoch": 0.46154539581424, "grad_norm": 0.05859375, "learning_rate": 0.00013030480730533145, "loss": 1.3451, "step": 720 }, { "epoch": 0.4647505721740611, "grad_norm": 0.054443359375, "learning_rate": 0.00012923598082876812, "loss": 1.376, "step": 725 }, { "epoch": 0.4679557485338822, "grad_norm": 0.058349609375, "learning_rate": 0.0001281634896596669, "loss": 1.3524, "step": 730 }, { "epoch": 0.4711609248937033, "grad_norm": 0.0634765625, "learning_rate": 0.00012708746823341446, "loss": 1.3599, "step": 735 }, { "epoch": 0.47436610125352446, "grad_norm": 0.053466796875, "learning_rate": 0.00012600805142791042, "loss": 1.3416, "step": 740 }, { "epoch": 0.47757127761334556, "grad_norm": 0.055419921875, "learning_rate": 0.000124925374546661, "loss": 1.3574, "step": 745 }, { "epoch": 0.48077645397316665, "grad_norm": 0.052978515625, "learning_rate": 0.0001238395733018187, "loss": 1.3574, "step": 750 }, { "epoch": 0.4839816303329878, "grad_norm": 0.053466796875, "learning_rate": 0.00012275078379717089, "loss": 1.3341, "step": 755 }, { "epoch": 0.4871868066928089, "grad_norm": 0.0556640625, "learning_rate": 0.00012165914251107952, "loss": 1.3241, "step": 760 }, { "epoch": 0.49039198305263, "grad_norm": 0.054443359375, "learning_rate": 0.00012056478627937365, "loss": 1.3788, "step": 765 }, { "epoch": 0.4935971594124511, "grad_norm": 0.049560546875, "learning_rate": 0.00011946785227819726, "loss": 1.3581, "step": 770 }, { "epoch": 0.49680233577227223, "grad_norm": 0.05615234375, "learning_rate": 0.00011836847800681443, "loss": 1.3328, "step": 775 }, { "epoch": 0.5000075121320934, "grad_norm": 0.0556640625, "learning_rate": 0.00011726680127037401, "loss": 1.3533, "step": 780 }, { "epoch": 0.5032126884919145, "grad_norm": 0.05419921875, "learning_rate": 0.00011616296016263582, "loss": 1.3622, "step": 785 }, { "epoch": 0.5064178648517356, "grad_norm": 0.049072265625, "learning_rate": 0.00011505709304866084, "loss": 1.3446, "step": 790 }, { "epoch": 0.5096230412115567, "grad_norm": 0.0712890625, "learning_rate": 0.00011394933854746733, "loss": 1.3384, "step": 795 }, { "epoch": 0.5128282175713778, "grad_norm": 0.055908203125, "learning_rate": 0.00011283983551465511, "loss": 1.3378, "step": 800 }, { "epoch": 0.5160333939311988, "grad_norm": 0.060791015625, "learning_rate": 0.00011172872302500017, "loss": 1.3656, "step": 805 }, { "epoch": 0.51923857029102, "grad_norm": 0.0791015625, "learning_rate": 0.00011061614035502193, "loss": 1.3521, "step": 810 }, { "epoch": 0.5224437466508411, "grad_norm": 0.05859375, "learning_rate": 0.00010950222696552486, "loss": 1.3614, "step": 815 }, { "epoch": 0.5256489230106622, "grad_norm": 0.08203125, "learning_rate": 0.00010838712248411753, "loss": 1.3314, "step": 820 }, { "epoch": 0.5288540993704833, "grad_norm": 0.05322265625, "learning_rate": 0.00010727096668771036, "loss": 1.338, "step": 825 }, { "epoch": 0.5320592757303044, "grad_norm": 0.0556640625, "learning_rate": 0.0001061538994849946, "loss": 1.3611, "step": 830 }, { "epoch": 0.5352644520901255, "grad_norm": 0.06201171875, "learning_rate": 0.00010503606089890529, "loss": 1.3175, "step": 835 }, { "epoch": 0.5384696284499466, "grad_norm": 0.05712890625, "learning_rate": 0.00010391759104906928, "loss": 1.3525, "step": 840 }, { "epoch": 0.5416748048097678, "grad_norm": 0.0498046875, "learning_rate": 0.00010279863013424154, "loss": 1.3313, "step": 845 }, { "epoch": 0.5448799811695889, "grad_norm": 0.051025390625, "learning_rate": 0.00010167931841473142, "loss": 1.3349, "step": 850 }, { "epoch": 0.54808515752941, "grad_norm": 0.06298828125, "learning_rate": 0.00010055979619482112, "loss": 1.3408, "step": 855 }, { "epoch": 0.5512903338892311, "grad_norm": 0.058837890625, "learning_rate": 9.944020380517889e-05, "loss": 1.3175, "step": 860 }, { "epoch": 0.5544955102490522, "grad_norm": 0.050048828125, "learning_rate": 9.832068158526862e-05, "loss": 1.3375, "step": 865 }, { "epoch": 0.5577006866088733, "grad_norm": 0.0498046875, "learning_rate": 9.720136986575848e-05, "loss": 1.3475, "step": 870 }, { "epoch": 0.5609058629686945, "grad_norm": 0.051513671875, "learning_rate": 9.608240895093076e-05, "loss": 1.3295, "step": 875 }, { "epoch": 0.5641110393285156, "grad_norm": 0.046142578125, "learning_rate": 9.496393910109472e-05, "loss": 1.3429, "step": 880 }, { "epoch": 0.5673162156883367, "grad_norm": 0.04443359375, "learning_rate": 9.384610051500545e-05, "loss": 1.3293, "step": 885 }, { "epoch": 0.5705213920481578, "grad_norm": 0.052734375, "learning_rate": 9.272903331228968e-05, "loss": 1.3498, "step": 890 }, { "epoch": 0.5737265684079789, "grad_norm": 0.062255859375, "learning_rate": 9.161287751588248e-05, "loss": 1.3351, "step": 895 }, { "epoch": 0.5769317447678, "grad_norm": 0.064453125, "learning_rate": 9.049777303447516e-05, "loss": 1.353, "step": 900 }, { "epoch": 0.5801369211276212, "grad_norm": 0.0556640625, "learning_rate": 8.938385964497808e-05, "loss": 1.3363, "step": 905 }, { "epoch": 0.5833420974874423, "grad_norm": 0.06201171875, "learning_rate": 8.827127697499984e-05, "loss": 1.3696, "step": 910 }, { "epoch": 0.5865472738472634, "grad_norm": 0.080078125, "learning_rate": 8.71601644853449e-05, "loss": 1.3481, "step": 915 }, { "epoch": 0.5897524502070844, "grad_norm": 0.06884765625, "learning_rate": 8.605066145253268e-05, "loss": 1.3256, "step": 920 }, { "epoch": 0.5929576265669055, "grad_norm": 0.051513671875, "learning_rate": 8.494290695133917e-05, "loss": 1.3544, "step": 925 }, { "epoch": 0.5961628029267266, "grad_norm": 0.05810546875, "learning_rate": 8.383703983736419e-05, "loss": 1.3443, "step": 930 }, { "epoch": 0.5993679792865477, "grad_norm": 0.06103515625, "learning_rate": 8.2733198729626e-05, "loss": 1.3816, "step": 935 }, { "epoch": 0.6025731556463689, "grad_norm": 0.046142578125, "learning_rate": 8.163152199318558e-05, "loss": 1.3247, "step": 940 }, { "epoch": 0.60577833200619, "grad_norm": 0.053466796875, "learning_rate": 8.053214772180277e-05, "loss": 1.3532, "step": 945 }, { "epoch": 0.6089835083660111, "grad_norm": 0.05419921875, "learning_rate": 7.94352137206264e-05, "loss": 1.3443, "step": 950 }, { "epoch": 0.6121886847258322, "grad_norm": 0.047119140625, "learning_rate": 7.83408574889205e-05, "loss": 1.3327, "step": 955 }, { "epoch": 0.6153938610856533, "grad_norm": 0.0537109375, "learning_rate": 7.724921620282916e-05, "loss": 1.334, "step": 960 }, { "epoch": 0.6185990374454744, "grad_norm": 0.0703125, "learning_rate": 7.616042669818133e-05, "loss": 1.3572, "step": 965 }, { "epoch": 0.6218042138052956, "grad_norm": 0.055419921875, "learning_rate": 7.507462545333903e-05, "loss": 1.3322, "step": 970 }, { "epoch": 0.6250093901651167, "grad_norm": 0.07958984375, "learning_rate": 7.399194857208961e-05, "loss": 1.3222, "step": 975 }, { "epoch": 0.6282145665249378, "grad_norm": 0.05078125, "learning_rate": 7.291253176658561e-05, "loss": 1.3375, "step": 980 }, { "epoch": 0.6314197428847589, "grad_norm": 0.08251953125, "learning_rate": 7.183651034033313e-05, "loss": 1.3397, "step": 985 }, { "epoch": 0.63462491924458, "grad_norm": 0.04931640625, "learning_rate": 7.07640191712319e-05, "loss": 1.34, "step": 990 }, { "epoch": 0.6378300956044011, "grad_norm": 0.049072265625, "learning_rate": 6.969519269466857e-05, "loss": 1.3344, "step": 995 }, { "epoch": 0.6410352719642222, "grad_norm": 0.052490234375, "learning_rate": 6.863016488666517e-05, "loss": 1.3475, "step": 1000 }, { "epoch": 0.6442404483240434, "grad_norm": 0.04736328125, "learning_rate": 6.756906924708558e-05, "loss": 1.3317, "step": 1005 }, { "epoch": 0.6474456246838645, "grad_norm": 0.050537109375, "learning_rate": 6.651203878290139e-05, "loss": 1.3243, "step": 1010 }, { "epoch": 0.6506508010436856, "grad_norm": 0.053955078125, "learning_rate": 6.545920599151975e-05, "loss": 1.3351, "step": 1015 }, { "epoch": 0.6538559774035066, "grad_norm": 0.058837890625, "learning_rate": 6.441070284417487e-05, "loss": 1.3536, "step": 1020 }, { "epoch": 0.6570611537633277, "grad_norm": 0.060791015625, "learning_rate": 6.336666076938572e-05, "loss": 1.3064, "step": 1025 }, { "epoch": 0.6602663301231488, "grad_norm": 0.056396484375, "learning_rate": 6.232721063648148e-05, "loss": 1.3496, "step": 1030 }, { "epoch": 0.66347150648297, "grad_norm": 0.0478515625, "learning_rate": 6.12924827391975e-05, "loss": 1.3487, "step": 1035 }, { "epoch": 0.6666766828427911, "grad_norm": 0.05126953125, "learning_rate": 6.026260677934272e-05, "loss": 1.3241, "step": 1040 }, { "epoch": 0.6698818592026122, "grad_norm": 0.0478515625, "learning_rate": 5.9237711850542246e-05, "loss": 1.3454, "step": 1045 }, { "epoch": 0.6730870355624333, "grad_norm": 0.046142578125, "learning_rate": 5.8217926422055126e-05, "loss": 1.3364, "step": 1050 }, { "epoch": 0.6762922119222544, "grad_norm": 0.054443359375, "learning_rate": 5.7203378322671355e-05, "loss": 1.3152, "step": 1055 }, { "epoch": 0.6794973882820755, "grad_norm": 0.0546875, "learning_rate": 5.619419472468823e-05, "loss": 1.3486, "step": 1060 }, { "epoch": 0.6827025646418967, "grad_norm": 0.05029296875, "learning_rate": 5.519050212796986e-05, "loss": 1.3301, "step": 1065 }, { "epoch": 0.6859077410017178, "grad_norm": 0.051513671875, "learning_rate": 5.419242634409039e-05, "loss": 1.3279, "step": 1070 }, { "epoch": 0.6891129173615389, "grad_norm": 0.0478515625, "learning_rate": 5.32000924805637e-05, "loss": 1.3415, "step": 1075 }, { "epoch": 0.69231809372136, "grad_norm": 0.04638671875, "learning_rate": 5.2213624925161386e-05, "loss": 1.3449, "step": 1080 }, { "epoch": 0.6955232700811811, "grad_norm": 0.04541015625, "learning_rate": 5.123314733032074e-05, "loss": 1.3442, "step": 1085 }, { "epoch": 0.6987284464410022, "grad_norm": 0.04736328125, "learning_rate": 5.0258782597645446e-05, "loss": 1.3309, "step": 1090 }, { "epoch": 0.7019336228008233, "grad_norm": 0.0478515625, "learning_rate": 4.929065286249959e-05, "loss": 1.3564, "step": 1095 }, { "epoch": 0.7051387991606445, "grad_norm": 0.048095703125, "learning_rate": 4.832887947869841e-05, "loss": 1.3578, "step": 1100 }, { "epoch": 0.7083439755204656, "grad_norm": 0.047119140625, "learning_rate": 4.737358300329673e-05, "loss": 1.3417, "step": 1105 }, { "epoch": 0.7115491518802867, "grad_norm": 0.05029296875, "learning_rate": 4.642488318147723e-05, "loss": 1.3259, "step": 1110 }, { "epoch": 0.7147543282401078, "grad_norm": 0.052001953125, "learning_rate": 4.548289893154051e-05, "loss": 1.3568, "step": 1115 }, { "epoch": 0.7179595045999289, "grad_norm": 0.047607421875, "learning_rate": 4.4547748329998925e-05, "loss": 1.3211, "step": 1120 }, { "epoch": 0.72116468095975, "grad_norm": 0.05126953125, "learning_rate": 4.361954859677584e-05, "loss": 1.3398, "step": 1125 }, { "epoch": 0.7243698573195712, "grad_norm": 0.048095703125, "learning_rate": 4.2698416080512204e-05, "loss": 1.3266, "step": 1130 }, { "epoch": 0.7275750336793922, "grad_norm": 0.050048828125, "learning_rate": 4.1784466243982324e-05, "loss": 1.3447, "step": 1135 }, { "epoch": 0.7307802100392133, "grad_norm": 0.052001953125, "learning_rate": 4.0877813649621076e-05, "loss": 1.3385, "step": 1140 }, { "epoch": 0.7339853863990344, "grad_norm": 0.04638671875, "learning_rate": 3.997857194516319e-05, "loss": 1.3403, "step": 1145 }, { "epoch": 0.7371905627588555, "grad_norm": 0.05078125, "learning_rate": 3.9086853849398065e-05, "loss": 1.3503, "step": 1150 }, { "epoch": 0.7403957391186766, "grad_norm": 0.06396484375, "learning_rate": 3.8202771138040336e-05, "loss": 1.3354, "step": 1155 }, { "epoch": 0.7436009154784978, "grad_norm": 0.05078125, "learning_rate": 3.732643462971912e-05, "loss": 1.3258, "step": 1160 }, { "epoch": 0.7468060918383189, "grad_norm": 0.049560546875, "learning_rate": 3.6457954172086896e-05, "loss": 1.3493, "step": 1165 }, { "epoch": 0.75001126819814, "grad_norm": 0.046875, "learning_rate": 3.559743862805034e-05, "loss": 1.3275, "step": 1170 }, { "epoch": 0.7532164445579611, "grad_norm": 0.045654296875, "learning_rate": 3.47449958621245e-05, "loss": 1.3148, "step": 1175 }, { "epoch": 0.7564216209177822, "grad_norm": 0.051513671875, "learning_rate": 3.390073272691198e-05, "loss": 1.3338, "step": 1180 }, { "epoch": 0.7596267972776033, "grad_norm": 0.049072265625, "learning_rate": 3.306475504970931e-05, "loss": 1.2935, "step": 1185 }, { "epoch": 0.7628319736374244, "grad_norm": 0.04833984375, "learning_rate": 3.2237167619241495e-05, "loss": 1.3275, "step": 1190 }, { "epoch": 0.7660371499972456, "grad_norm": 0.056396484375, "learning_rate": 3.141807417252697e-05, "loss": 1.3461, "step": 1195 }, { "epoch": 0.7692423263570667, "grad_norm": 0.04345703125, "learning_rate": 3.060757738187409e-05, "loss": 1.3394, "step": 1200 }, { "epoch": 0.7724475027168878, "grad_norm": 0.053955078125, "learning_rate": 2.980577884201169e-05, "loss": 1.3511, "step": 1205 }, { "epoch": 0.7756526790767089, "grad_norm": 0.04736328125, "learning_rate": 2.9012779057353855e-05, "loss": 1.3213, "step": 1210 }, { "epoch": 0.77885785543653, "grad_norm": 0.0576171875, "learning_rate": 2.822867742940214e-05, "loss": 1.3384, "step": 1215 }, { "epoch": 0.7820630317963511, "grad_norm": 0.04833984375, "learning_rate": 2.745357224428563e-05, "loss": 1.343, "step": 1220 }, { "epoch": 0.7852682081561723, "grad_norm": 0.049560546875, "learning_rate": 2.6687560660440858e-05, "loss": 1.3541, "step": 1225 }, { "epoch": 0.7884733845159934, "grad_norm": 0.047607421875, "learning_rate": 2.593073869643312e-05, "loss": 1.3491, "step": 1230 }, { "epoch": 0.7916785608758145, "grad_norm": 0.04248046875, "learning_rate": 2.518320121892076e-05, "loss": 1.3439, "step": 1235 }, { "epoch": 0.7948837372356355, "grad_norm": 0.04736328125, "learning_rate": 2.4445041930763678e-05, "loss": 1.3236, "step": 1240 }, { "epoch": 0.7980889135954566, "grad_norm": 0.0478515625, "learning_rate": 2.371635335927781e-05, "loss": 1.3505, "step": 1245 }, { "epoch": 0.8012940899552777, "grad_norm": 0.0517578125, "learning_rate": 2.2997226844636977e-05, "loss": 1.3223, "step": 1250 }, { "epoch": 0.8044992663150989, "grad_norm": 0.046630859375, "learning_rate": 2.2287752528423468e-05, "loss": 1.3282, "step": 1255 }, { "epoch": 0.80770444267492, "grad_norm": 0.046875, "learning_rate": 2.1588019342328968e-05, "loss": 1.3294, "step": 1260 }, { "epoch": 0.8109096190347411, "grad_norm": 0.0439453125, "learning_rate": 2.089811499700699e-05, "loss": 1.3356, "step": 1265 }, { "epoch": 0.8141147953945622, "grad_norm": 0.045654296875, "learning_rate": 2.021812597107855e-05, "loss": 1.3486, "step": 1270 }, { "epoch": 0.8173199717543833, "grad_norm": 0.04931640625, "learning_rate": 1.954813750029216e-05, "loss": 1.3492, "step": 1275 }, { "epoch": 0.8205251481142044, "grad_norm": 0.05126953125, "learning_rate": 1.8888233566839653e-05, "loss": 1.329, "step": 1280 }, { "epoch": 0.8237303244740255, "grad_norm": 0.048095703125, "learning_rate": 1.8238496888828982e-05, "loss": 1.317, "step": 1285 }, { "epoch": 0.8269355008338467, "grad_norm": 0.051513671875, "learning_rate": 1.759900890991589e-05, "loss": 1.3177, "step": 1290 }, { "epoch": 0.8301406771936678, "grad_norm": 0.0458984375, "learning_rate": 1.696984978909476e-05, "loss": 1.323, "step": 1295 }, { "epoch": 0.8333458535534889, "grad_norm": 0.0439453125, "learning_rate": 1.6351098390650966e-05, "loss": 1.3517, "step": 1300 }, { "epoch": 0.83655102991331, "grad_norm": 0.052978515625, "learning_rate": 1.5742832274275288e-05, "loss": 1.35, "step": 1305 }, { "epoch": 0.8397562062731311, "grad_norm": 0.049072265625, "learning_rate": 1.514512768534193e-05, "loss": 1.3614, "step": 1310 }, { "epoch": 0.8429613826329522, "grad_norm": 0.0439453125, "learning_rate": 1.4558059545351143e-05, "loss": 1.3389, "step": 1315 }, { "epoch": 0.8461665589927734, "grad_norm": 0.04541015625, "learning_rate": 1.3981701442538153e-05, "loss": 1.3272, "step": 1320 }, { "epoch": 0.8493717353525945, "grad_norm": 0.048583984375, "learning_rate": 1.3416125622648668e-05, "loss": 1.3324, "step": 1325 }, { "epoch": 0.8525769117124156, "grad_norm": 0.04541015625, "learning_rate": 1.286140297988323e-05, "loss": 1.3352, "step": 1330 }, { "epoch": 0.8557820880722367, "grad_norm": 0.04443359375, "learning_rate": 1.231760304801054e-05, "loss": 1.3361, "step": 1335 }, { "epoch": 0.8589872644320578, "grad_norm": 0.047119140625, "learning_rate": 1.1784793991651621e-05, "loss": 1.3252, "step": 1340 }, { "epoch": 0.8621924407918788, "grad_norm": 0.044189453125, "learning_rate": 1.1263042597735362e-05, "loss": 1.3468, "step": 1345 }, { "epoch": 0.8653976171517, "grad_norm": 0.046630859375, "learning_rate": 1.0752414267126875e-05, "loss": 1.3301, "step": 1350 }, { "epoch": 0.8686027935115211, "grad_norm": 0.05029296875, "learning_rate": 1.0252973006429733e-05, "loss": 1.36, "step": 1355 }, { "epoch": 0.8718079698713422, "grad_norm": 0.047119140625, "learning_rate": 9.764781419962577e-06, "loss": 1.3482, "step": 1360 }, { "epoch": 0.8750131462311633, "grad_norm": 0.04638671875, "learning_rate": 9.287900701911944e-06, "loss": 1.3232, "step": 1365 }, { "epoch": 0.8782183225909844, "grad_norm": 0.04931640625, "learning_rate": 8.822390628661582e-06, "loss": 1.3571, "step": 1370 }, { "epoch": 0.8814234989508055, "grad_norm": 0.044921875, "learning_rate": 8.368309551299536e-06, "loss": 1.3274, "step": 1375 }, { "epoch": 0.8846286753106266, "grad_norm": 0.04541015625, "learning_rate": 7.92571438830394e-06, "loss": 1.3656, "step": 1380 }, { "epoch": 0.8878338516704478, "grad_norm": 0.046142578125, "learning_rate": 7.494660618408378e-06, "loss": 1.3659, "step": 1385 }, { "epoch": 0.8910390280302689, "grad_norm": 0.04541015625, "learning_rate": 7.075202273647652e-06, "loss": 1.3305, "step": 1390 }, { "epoch": 0.89424420439009, "grad_norm": 0.046875, "learning_rate": 6.667391932584999e-06, "loss": 1.36, "step": 1395 }, { "epoch": 0.8974493807499111, "grad_norm": 0.0458984375, "learning_rate": 6.271280713721317e-06, "loss": 1.3382, "step": 1400 }, { "epoch": 0.9006545571097322, "grad_norm": 0.04638671875, "learning_rate": 5.886918269087716e-06, "loss": 1.326, "step": 1405 }, { "epoch": 0.9038597334695533, "grad_norm": 0.046875, "learning_rate": 5.514352778021492e-06, "loss": 1.3602, "step": 1410 }, { "epoch": 0.9070649098293745, "grad_norm": 0.046142578125, "learning_rate": 5.153630941127063e-06, "loss": 1.3407, "step": 1415 }, { "epoch": 0.9102700861891956, "grad_norm": 0.046875, "learning_rate": 4.804797974422026e-06, "loss": 1.3241, "step": 1420 }, { "epoch": 0.9134752625490167, "grad_norm": 0.050537109375, "learning_rate": 4.4678976036694355e-06, "loss": 1.3324, "step": 1425 }, { "epoch": 0.9166804389088378, "grad_norm": 0.04443359375, "learning_rate": 4.142972058896811e-06, "loss": 1.3267, "step": 1430 }, { "epoch": 0.9198856152686589, "grad_norm": 0.044921875, "learning_rate": 3.830062069102602e-06, "loss": 1.3496, "step": 1435 }, { "epoch": 0.92309079162848, "grad_norm": 0.046630859375, "learning_rate": 3.529206857151035e-06, "loss": 1.3481, "step": 1440 }, { "epoch": 0.9262959679883012, "grad_norm": 0.04345703125, "learning_rate": 3.240444134855347e-06, "loss": 1.3433, "step": 1445 }, { "epoch": 0.9295011443481223, "grad_norm": 0.045654296875, "learning_rate": 2.963810098250841e-06, "loss": 1.3555, "step": 1450 }, { "epoch": 0.9327063207079433, "grad_norm": 0.044921875, "learning_rate": 2.6993394230576674e-06, "loss": 1.3218, "step": 1455 }, { "epoch": 0.9359114970677644, "grad_norm": 0.04638671875, "learning_rate": 2.4470652603343023e-06, "loss": 1.346, "step": 1460 }, { "epoch": 0.9391166734275855, "grad_norm": 0.044677734375, "learning_rate": 2.2070192323220607e-06, "loss": 1.3551, "step": 1465 }, { "epoch": 0.9423218497874066, "grad_norm": 0.0439453125, "learning_rate": 1.9792314284813986e-06, "loss": 1.3262, "step": 1470 }, { "epoch": 0.9455270261472277, "grad_norm": 0.04736328125, "learning_rate": 1.763730401720065e-06, "loss": 1.3257, "step": 1475 }, { "epoch": 0.9487322025070489, "grad_norm": 0.046142578125, "learning_rate": 1.5605431648141878e-06, "loss": 1.3158, "step": 1480 }, { "epoch": 0.95193737886687, "grad_norm": 0.044677734375, "learning_rate": 1.3696951870222018e-06, "loss": 1.3637, "step": 1485 }, { "epoch": 0.9551425552266911, "grad_norm": 0.053466796875, "learning_rate": 1.1912103908922945e-06, "loss": 1.3337, "step": 1490 }, { "epoch": 0.9583477315865122, "grad_norm": 0.050048828125, "learning_rate": 1.0251111492637244e-06, "loss": 1.3557, "step": 1495 }, { "epoch": 0.9615529079463333, "grad_norm": 0.05126953125, "learning_rate": 8.714182824624883e-07, "loss": 1.3373, "step": 1500 }, { "epoch": 0.9647580843061544, "grad_norm": 0.0458984375, "learning_rate": 7.301510556914859e-07, "loss": 1.3274, "step": 1505 }, { "epoch": 0.9679632606659756, "grad_norm": 0.05224609375, "learning_rate": 6.01327176615607e-07, "loss": 1.3894, "step": 1510 }, { "epoch": 0.9711684370257967, "grad_norm": 0.045166015625, "learning_rate": 4.84962793142163e-07, "loss": 1.3419, "step": 1515 }, { "epoch": 0.9743736133856178, "grad_norm": 0.044677734375, "learning_rate": 3.8107249139672783e-07, "loss": 1.3321, "step": 1520 }, { "epoch": 0.9775787897454389, "grad_norm": 0.0478515625, "learning_rate": 2.89669293894812e-07, "loss": 1.3497, "step": 1525 }, { "epoch": 0.98078396610526, "grad_norm": 0.049072265625, "learning_rate": 2.1076465790946798e-07, "loss": 1.3518, "step": 1530 }, { "epoch": 0.9839891424650811, "grad_norm": 0.04638671875, "learning_rate": 1.443684740351947e-07, "loss": 1.3224, "step": 1535 }, { "epoch": 0.9871943188249022, "grad_norm": 0.049072265625, "learning_rate": 9.048906494811826e-08, "loss": 1.3513, "step": 1540 }, { "epoch": 0.9903994951847234, "grad_norm": 0.050048828125, "learning_rate": 4.9133184362748497e-08, "loss": 1.3494, "step": 1545 }, { "epoch": 0.9936046715445445, "grad_norm": 0.04443359375, "learning_rate": 2.0306016185456244e-08, "loss": 1.3344, "step": 1550 }, { "epoch": 0.9968098479043656, "grad_norm": 0.047119140625, "learning_rate": 4.011173864637563e-09, "loss": 1.3662, "step": 1555 }, { "epoch": 0.9993739889922224, "eval_loss": 1.4191993474960327, "eval_runtime": 1938.5869, "eval_samples_per_second": 7.3, "eval_steps_per_second": 7.3, "step": 1559 }, { "epoch": 0.9993739889922224, "step": 1559, "total_flos": 3.232184148701479e+18, "train_loss": 0.016414370117774753, "train_runtime": 2971.8566, "train_samples_per_second": 67.189, "train_steps_per_second": 0.525 } ], "logging_steps": 5, "max_steps": 1559, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 3.232184148701479e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }