{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998211091234347, "eval_steps": 500, "global_step": 1257, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.011926058437686345, "grad_norm": 0.5661849975585938, "learning_rate": 4.999804802810596e-05, "loss": 1.4421, "step": 5 }, { "epoch": 0.02385211687537269, "grad_norm": 0.5908129811286926, "learning_rate": 4.999219241723937e-05, "loss": 1.3306, "step": 10 }, { "epoch": 0.03577817531305903, "grad_norm": 0.6965810060501099, "learning_rate": 4.998243408179925e-05, "loss": 1.1873, "step": 15 }, { "epoch": 0.04770423375074538, "grad_norm": 0.5752276182174683, "learning_rate": 4.9968774545625344e-05, "loss": 1.0583, "step": 20 }, { "epoch": 0.05963029218843172, "grad_norm": 0.6488135457038879, "learning_rate": 4.9951215941760075e-05, "loss": 1.0045, "step": 25 }, { "epoch": 0.07155635062611806, "grad_norm": 0.7977299094200134, "learning_rate": 4.992976101211558e-05, "loss": 0.9836, "step": 30 }, { "epoch": 0.08348240906380441, "grad_norm": 0.8842712044715881, "learning_rate": 4.99044131070454e-05, "loss": 0.8925, "step": 35 }, { "epoch": 0.09540846750149076, "grad_norm": 0.798170804977417, "learning_rate": 4.987517618482142e-05, "loss": 0.8783, "step": 40 }, { "epoch": 0.1073345259391771, "grad_norm": 0.8196467757225037, "learning_rate": 4.984205481101565e-05, "loss": 0.8681, "step": 45 }, { "epoch": 0.11926058437686345, "grad_norm": 0.9973539113998413, "learning_rate": 4.980505415778738e-05, "loss": 0.8251, "step": 50 }, { "epoch": 0.13118664281454978, "grad_norm": 0.8836011290550232, "learning_rate": 4.97641800030754e-05, "loss": 0.8079, "step": 55 }, { "epoch": 0.14311270125223613, "grad_norm": 0.9255298972129822, "learning_rate": 4.971943872969582e-05, "loss": 0.7769, "step": 60 }, { "epoch": 0.15503875968992248, "grad_norm": 0.9712087512016296, "learning_rate": 4.967083732434529e-05, "loss": 0.777, "step": 65 }, { "epoch": 0.16696481812760883, "grad_norm": 0.7609415054321289, "learning_rate": 4.961838337650997e-05, "loss": 0.7303, "step": 70 }, { "epoch": 0.17889087656529518, "grad_norm": 0.8297658562660217, "learning_rate": 4.9562085077280443e-05, "loss": 0.7442, "step": 75 }, { "epoch": 0.19081693500298152, "grad_norm": 1.01498281955719, "learning_rate": 4.950195121807251e-05, "loss": 0.7077, "step": 80 }, { "epoch": 0.20274299344066785, "grad_norm": 0.9629709720611572, "learning_rate": 4.943799118925443e-05, "loss": 0.673, "step": 85 }, { "epoch": 0.2146690518783542, "grad_norm": 0.9121557474136353, "learning_rate": 4.937021497868047e-05, "loss": 0.7109, "step": 90 }, { "epoch": 0.22659511031604054, "grad_norm": 0.841573178768158, "learning_rate": 4.9298633170131304e-05, "loss": 0.6798, "step": 95 }, { "epoch": 0.2385211687537269, "grad_norm": 0.9693652391433716, "learning_rate": 4.922325694166119e-05, "loss": 0.6471, "step": 100 }, { "epoch": 0.2504472271914132, "grad_norm": 1.1198012828826904, "learning_rate": 4.9144098063852485e-05, "loss": 0.6456, "step": 105 }, { "epoch": 0.26237328562909956, "grad_norm": 1.1725982427597046, "learning_rate": 4.9061168897977564e-05, "loss": 0.6462, "step": 110 }, { "epoch": 0.2742993440667859, "grad_norm": 0.9855665564537048, "learning_rate": 4.8974482394068514e-05, "loss": 0.6594, "step": 115 }, { "epoch": 0.28622540250447226, "grad_norm": 0.8321542739868164, "learning_rate": 4.888405208889486e-05, "loss": 0.6372, "step": 120 }, { "epoch": 0.2981514609421586, "grad_norm": 0.9451864957809448, "learning_rate": 4.878989210384972e-05, "loss": 0.6594, "step": 125 }, { "epoch": 0.31007751937984496, "grad_norm": 1.1336216926574707, "learning_rate": 4.869201714274467e-05, "loss": 0.6544, "step": 130 }, { "epoch": 0.3220035778175313, "grad_norm": 0.8466928601264954, "learning_rate": 4.8590442489513543e-05, "loss": 0.6275, "step": 135 }, { "epoch": 0.33392963625521765, "grad_norm": 0.979365885257721, "learning_rate": 4.8485184005825815e-05, "loss": 0.6348, "step": 140 }, { "epoch": 0.345855694692904, "grad_norm": 0.9159572124481201, "learning_rate": 4.837625812860961e-05, "loss": 0.6045, "step": 145 }, { "epoch": 0.35778175313059035, "grad_norm": 1.0542089939117432, "learning_rate": 4.8263681867485e-05, "loss": 0.6487, "step": 150 }, { "epoch": 0.3697078115682767, "grad_norm": 1.0264719724655151, "learning_rate": 4.814747280210782e-05, "loss": 0.6026, "step": 155 }, { "epoch": 0.38163387000596305, "grad_norm": 0.9389889240264893, "learning_rate": 4.80276490794244e-05, "loss": 0.6083, "step": 160 }, { "epoch": 0.3935599284436494, "grad_norm": 1.1090476512908936, "learning_rate": 4.790422941083786e-05, "loss": 0.5957, "step": 165 }, { "epoch": 0.4054859868813357, "grad_norm": 1.0268605947494507, "learning_rate": 4.7777233069286154e-05, "loss": 0.6538, "step": 170 }, { "epoch": 0.41741204531902204, "grad_norm": 1.240955114364624, "learning_rate": 4.7646679886232414e-05, "loss": 0.62, "step": 175 }, { "epoch": 0.4293381037567084, "grad_norm": 1.2029043436050415, "learning_rate": 4.7512590248568163e-05, "loss": 0.6461, "step": 180 }, { "epoch": 0.44126416219439474, "grad_norm": 1.0567660331726074, "learning_rate": 4.7374985095429725e-05, "loss": 0.633, "step": 185 }, { "epoch": 0.4531902206320811, "grad_norm": 0.928354024887085, "learning_rate": 4.723388591492841e-05, "loss": 0.633, "step": 190 }, { "epoch": 0.46511627906976744, "grad_norm": 0.8453586101531982, "learning_rate": 4.708931474079499e-05, "loss": 0.6082, "step": 195 }, { "epoch": 0.4770423375074538, "grad_norm": 1.33816397190094, "learning_rate": 4.6941294148938954e-05, "loss": 0.6176, "step": 200 }, { "epoch": 0.48896839594514013, "grad_norm": 0.9532414078712463, "learning_rate": 4.678984725392309e-05, "loss": 0.6168, "step": 205 }, { "epoch": 0.5008944543828264, "grad_norm": 0.9017360210418701, "learning_rate": 4.6634997705354024e-05, "loss": 0.6454, "step": 210 }, { "epoch": 0.5128205128205128, "grad_norm": 1.1095216274261475, "learning_rate": 4.6476769684189065e-05, "loss": 0.5738, "step": 215 }, { "epoch": 0.5247465712581991, "grad_norm": 0.8017715811729431, "learning_rate": 4.631518789896023e-05, "loss": 0.5978, "step": 220 }, { "epoch": 0.5366726296958855, "grad_norm": 1.0233415365219116, "learning_rate": 4.6150277581915804e-05, "loss": 0.6375, "step": 225 }, { "epoch": 0.5485986881335718, "grad_norm": 0.9298461675643921, "learning_rate": 4.598206448508007e-05, "loss": 0.606, "step": 230 }, { "epoch": 0.5605247465712582, "grad_norm": 0.9300760626792908, "learning_rate": 4.581057487623204e-05, "loss": 0.6261, "step": 235 }, { "epoch": 0.5724508050089445, "grad_norm": 1.4629548788070679, "learning_rate": 4.5635835534803406e-05, "loss": 0.5877, "step": 240 }, { "epoch": 0.5843768634466309, "grad_norm": 0.916437029838562, "learning_rate": 4.545787374769686e-05, "loss": 0.5821, "step": 245 }, { "epoch": 0.5963029218843172, "grad_norm": 0.9588505625724792, "learning_rate": 4.527671730502491e-05, "loss": 0.6126, "step": 250 }, { "epoch": 0.6082289803220036, "grad_norm": 1.068697214126587, "learning_rate": 4.5092394495770335e-05, "loss": 0.6034, "step": 255 }, { "epoch": 0.6201550387596899, "grad_norm": 0.816047191619873, "learning_rate": 4.490493410336857e-05, "loss": 0.6088, "step": 260 }, { "epoch": 0.6320810971973763, "grad_norm": 0.8465267419815063, "learning_rate": 4.4714365401213e-05, "loss": 0.6067, "step": 265 }, { "epoch": 0.6440071556350626, "grad_norm": 0.9819373488426208, "learning_rate": 4.4520718148083665e-05, "loss": 0.6334, "step": 270 }, { "epoch": 0.655933214072749, "grad_norm": 0.7895939350128174, "learning_rate": 4.43240225835002e-05, "loss": 0.6093, "step": 275 }, { "epoch": 0.6678592725104353, "grad_norm": 1.0516682863235474, "learning_rate": 4.41243094229997e-05, "loss": 0.5939, "step": 280 }, { "epoch": 0.6797853309481217, "grad_norm": 0.8521412014961243, "learning_rate": 4.392160985334027e-05, "loss": 0.5483, "step": 285 }, { "epoch": 0.691711389385808, "grad_norm": 0.847909152507782, "learning_rate": 4.371595552763093e-05, "loss": 0.5946, "step": 290 }, { "epoch": 0.7036374478234944, "grad_norm": 1.1502934694290161, "learning_rate": 4.350737856038878e-05, "loss": 0.5651, "step": 295 }, { "epoch": 0.7155635062611807, "grad_norm": 1.0434402227401733, "learning_rate": 4.3295911522524044e-05, "loss": 0.5965, "step": 300 }, { "epoch": 0.727489564698867, "grad_norm": 1.1517246961593628, "learning_rate": 4.308158743625388e-05, "loss": 0.5751, "step": 305 }, { "epoch": 0.7394156231365534, "grad_norm": 1.1459076404571533, "learning_rate": 4.286443976994569e-05, "loss": 0.6043, "step": 310 }, { "epoch": 0.7513416815742398, "grad_norm": 1.0111889839172363, "learning_rate": 4.264450243289079e-05, "loss": 0.5948, "step": 315 }, { "epoch": 0.7632677400119261, "grad_norm": 1.0213602781295776, "learning_rate": 4.2421809770009225e-05, "loss": 0.5538, "step": 320 }, { "epoch": 0.7751937984496124, "grad_norm": 1.0203680992126465, "learning_rate": 4.219639655648651e-05, "loss": 0.5696, "step": 325 }, { "epoch": 0.7871198568872988, "grad_norm": 0.8697240352630615, "learning_rate": 4.196829799234321e-05, "loss": 0.5613, "step": 330 }, { "epoch": 0.7990459153249851, "grad_norm": 0.8935692310333252, "learning_rate": 4.173754969693826e-05, "loss": 0.5712, "step": 335 }, { "epoch": 0.8109719737626714, "grad_norm": 0.7936813831329346, "learning_rate": 4.1504187703406604e-05, "loss": 0.5693, "step": 340 }, { "epoch": 0.8228980322003577, "grad_norm": 1.0851740837097168, "learning_rate": 4.126824845303248e-05, "loss": 0.5911, "step": 345 }, { "epoch": 0.8348240906380441, "grad_norm": 0.9009485244750977, "learning_rate": 4.102976878955869e-05, "loss": 0.5551, "step": 350 }, { "epoch": 0.8467501490757304, "grad_norm": 1.1172417402267456, "learning_rate": 4.0788785953433286e-05, "loss": 0.564, "step": 355 }, { "epoch": 0.8586762075134168, "grad_norm": 1.2822695970535278, "learning_rate": 4.05453375759941e-05, "loss": 0.5766, "step": 360 }, { "epoch": 0.8706022659511031, "grad_norm": 1.327506422996521, "learning_rate": 4.0299461673592376e-05, "loss": 0.5289, "step": 365 }, { "epoch": 0.8825283243887895, "grad_norm": 1.0519084930419922, "learning_rate": 4.0051196641656185e-05, "loss": 0.5839, "step": 370 }, { "epoch": 0.8944543828264758, "grad_norm": 0.8493732810020447, "learning_rate": 3.980058124869469e-05, "loss": 0.583, "step": 375 }, { "epoch": 0.9063804412641622, "grad_norm": 0.9003544449806213, "learning_rate": 3.9547654630244156e-05, "loss": 0.5645, "step": 380 }, { "epoch": 0.9183064997018485, "grad_norm": 0.8397880792617798, "learning_rate": 3.929245628275662e-05, "loss": 0.6013, "step": 385 }, { "epoch": 0.9302325581395349, "grad_norm": 1.000611424446106, "learning_rate": 3.903502605743222e-05, "loss": 0.5657, "step": 390 }, { "epoch": 0.9421586165772212, "grad_norm": 0.9028176665306091, "learning_rate": 3.877540415399612e-05, "loss": 0.6077, "step": 395 }, { "epoch": 0.9540846750149076, "grad_norm": 0.8427708745002747, "learning_rate": 3.851363111442101e-05, "loss": 0.5553, "step": 400 }, { "epoch": 0.9660107334525939, "grad_norm": 1.1009222269058228, "learning_rate": 3.8249747816596136e-05, "loss": 0.5806, "step": 405 }, { "epoch": 0.9779367918902803, "grad_norm": 1.3395967483520508, "learning_rate": 3.7983795467943975e-05, "loss": 0.5463, "step": 410 }, { "epoch": 0.9898628503279666, "grad_norm": 0.856643557548523, "learning_rate": 3.77158155989853e-05, "loss": 0.6032, "step": 415 }, { "epoch": 1.0017889087656529, "grad_norm": 1.985956072807312, "learning_rate": 3.74458500568539e-05, "loss": 0.663, "step": 420 }, { "epoch": 1.0137149672033392, "grad_norm": 1.099455714225769, "learning_rate": 3.717394099876182e-05, "loss": 0.5532, "step": 425 }, { "epoch": 1.0256410256410255, "grad_norm": 0.8070193529129028, "learning_rate": 3.690013088541619e-05, "loss": 0.537, "step": 430 }, { "epoch": 1.037567084078712, "grad_norm": 1.1234451532363892, "learning_rate": 3.662446247438867e-05, "loss": 0.5192, "step": 435 }, { "epoch": 1.0494931425163982, "grad_norm": 1.0410645008087158, "learning_rate": 3.6346978813438464e-05, "loss": 0.5525, "step": 440 }, { "epoch": 1.0614192009540846, "grad_norm": 1.0676367282867432, "learning_rate": 3.606772323379017e-05, "loss": 0.5546, "step": 445 }, { "epoch": 1.073345259391771, "grad_norm": 1.1925772428512573, "learning_rate": 3.57867393433672e-05, "loss": 0.5298, "step": 450 }, { "epoch": 1.0852713178294573, "grad_norm": 1.0415118932724, "learning_rate": 3.55040710199821e-05, "loss": 0.5354, "step": 455 }, { "epoch": 1.0971973762671436, "grad_norm": 0.9862761497497559, "learning_rate": 3.521976240448468e-05, "loss": 0.5173, "step": 460 }, { "epoch": 1.10912343470483, "grad_norm": 1.0136168003082275, "learning_rate": 3.493385789386906e-05, "loss": 0.5424, "step": 465 }, { "epoch": 1.1210494931425163, "grad_norm": 0.999031126499176, "learning_rate": 3.464640213434079e-05, "loss": 0.574, "step": 470 }, { "epoch": 1.1329755515802027, "grad_norm": 1.0496968030929565, "learning_rate": 3.435744001434492e-05, "loss": 0.5152, "step": 475 }, { "epoch": 1.144901610017889, "grad_norm": 1.0987663269042969, "learning_rate": 3.40670166575564e-05, "loss": 0.5504, "step": 480 }, { "epoch": 1.1568276684555754, "grad_norm": 1.0069026947021484, "learning_rate": 3.3775177415833605e-05, "loss": 0.5547, "step": 485 }, { "epoch": 1.1687537268932617, "grad_norm": 1.194651484489441, "learning_rate": 3.348196786213633e-05, "loss": 0.5319, "step": 490 }, { "epoch": 1.180679785330948, "grad_norm": 0.9666855335235596, "learning_rate": 3.3187433783409216e-05, "loss": 0.5541, "step": 495 }, { "epoch": 1.1926058437686344, "grad_norm": 0.8826591372489929, "learning_rate": 3.289162117343173e-05, "loss": 0.5329, "step": 500 }, { "epoch": 1.2045319022063208, "grad_norm": 1.1290160417556763, "learning_rate": 3.259457622563593e-05, "loss": 0.5964, "step": 505 }, { "epoch": 1.2164579606440071, "grad_norm": 1.078009009361267, "learning_rate": 3.229634532589296e-05, "loss": 0.5315, "step": 510 }, { "epoch": 1.2283840190816935, "grad_norm": 1.1144967079162598, "learning_rate": 3.199697504526955e-05, "loss": 0.5698, "step": 515 }, { "epoch": 1.2403100775193798, "grad_norm": 1.3256494998931885, "learning_rate": 3.169651213275562e-05, "loss": 0.5234, "step": 520 }, { "epoch": 1.2522361359570662, "grad_norm": 1.0332247018814087, "learning_rate": 3.139500350796397e-05, "loss": 0.5386, "step": 525 }, { "epoch": 1.2641621943947525, "grad_norm": 1.2091331481933594, "learning_rate": 3.1092496253803546e-05, "loss": 0.5136, "step": 530 }, { "epoch": 1.2760882528324389, "grad_norm": 1.0742837190628052, "learning_rate": 3.078903760912695e-05, "loss": 0.5341, "step": 535 }, { "epoch": 1.2880143112701252, "grad_norm": 1.5551700592041016, "learning_rate": 3.048467496135384e-05, "loss": 0.5613, "step": 540 }, { "epoch": 1.2999403697078116, "grad_norm": 1.0054267644882202, "learning_rate": 3.017945583907092e-05, "loss": 0.5076, "step": 545 }, { "epoch": 1.311866428145498, "grad_norm": 1.081240177154541, "learning_rate": 2.9873427904610057e-05, "loss": 0.5115, "step": 550 }, { "epoch": 1.3237924865831843, "grad_norm": 1.120842456817627, "learning_rate": 2.956663894660539e-05, "loss": 0.5467, "step": 555 }, { "epoch": 1.3357185450208706, "grad_norm": 1.022741675376892, "learning_rate": 2.9259136872530812e-05, "loss": 0.523, "step": 560 }, { "epoch": 1.347644603458557, "grad_norm": 1.0805561542510986, "learning_rate": 2.8950969701218783e-05, "loss": 0.5541, "step": 565 }, { "epoch": 1.3595706618962433, "grad_norm": 1.1941672563552856, "learning_rate": 2.864218555536188e-05, "loss": 0.5303, "step": 570 }, { "epoch": 1.3714967203339297, "grad_norm": 1.0475640296936035, "learning_rate": 2.833283265399801e-05, "loss": 0.544, "step": 575 }, { "epoch": 1.383422778771616, "grad_norm": 1.258143424987793, "learning_rate": 2.8022959304980695e-05, "loss": 0.5596, "step": 580 }, { "epoch": 1.3953488372093024, "grad_norm": 1.044134259223938, "learning_rate": 2.7712613897435357e-05, "loss": 0.5003, "step": 585 }, { "epoch": 1.4072748956469887, "grad_norm": 1.100220799446106, "learning_rate": 2.7401844894203056e-05, "loss": 0.5629, "step": 590 }, { "epoch": 1.419200954084675, "grad_norm": 0.9259990453720093, "learning_rate": 2.7090700824272557e-05, "loss": 0.4516, "step": 595 }, { "epoch": 1.4311270125223614, "grad_norm": 0.8607438206672668, "learning_rate": 2.6779230275202243e-05, "loss": 0.538, "step": 600 }, { "epoch": 1.4430530709600478, "grad_norm": 1.2486215829849243, "learning_rate": 2.6467481885532704e-05, "loss": 0.5314, "step": 605 }, { "epoch": 1.454979129397734, "grad_norm": 1.1418797969818115, "learning_rate": 2.6155504337191516e-05, "loss": 0.4841, "step": 610 }, { "epoch": 1.4669051878354205, "grad_norm": 1.3612860441207886, "learning_rate": 2.5843346347891163e-05, "loss": 0.5133, "step": 615 }, { "epoch": 1.4788312462731068, "grad_norm": 1.1785390377044678, "learning_rate": 2.5531056663521362e-05, "loss": 0.5309, "step": 620 }, { "epoch": 1.4907573047107932, "grad_norm": 0.8908374905586243, "learning_rate": 2.521868405053706e-05, "loss": 0.5022, "step": 625 }, { "epoch": 1.5026833631484795, "grad_norm": 1.031840443611145, "learning_rate": 2.4906277288343123e-05, "loss": 0.5254, "step": 630 }, { "epoch": 1.5146094215861656, "grad_norm": 1.5957837104797363, "learning_rate": 2.459388516167711e-05, "loss": 0.5507, "step": 635 }, { "epoch": 1.5265354800238522, "grad_norm": 1.2121431827545166, "learning_rate": 2.428155645299111e-05, "loss": 0.4943, "step": 640 }, { "epoch": 1.5384615384615383, "grad_norm": 0.9426692724227905, "learning_rate": 2.3969339934834012e-05, "loss": 0.5111, "step": 645 }, { "epoch": 1.550387596899225, "grad_norm": 1.2159509658813477, "learning_rate": 2.3657284362235274e-05, "loss": 0.5302, "step": 650 }, { "epoch": 1.562313655336911, "grad_norm": 1.1508508920669556, "learning_rate": 2.3345438465091455e-05, "loss": 0.5299, "step": 655 }, { "epoch": 1.5742397137745976, "grad_norm": 1.2826497554779053, "learning_rate": 2.303385094055669e-05, "loss": 0.5453, "step": 660 }, { "epoch": 1.5861657722122837, "grad_norm": 1.520505666732788, "learning_rate": 2.2722570445438214e-05, "loss": 0.5804, "step": 665 }, { "epoch": 1.5980918306499703, "grad_norm": 1.1682236194610596, "learning_rate": 2.2411645588598232e-05, "loss": 0.5072, "step": 670 }, { "epoch": 1.6100178890876564, "grad_norm": 1.1223719120025635, "learning_rate": 2.2101124923363267e-05, "loss": 0.5287, "step": 675 }, { "epoch": 1.621943947525343, "grad_norm": 1.0129245519638062, "learning_rate": 2.1791056939942228e-05, "loss": 0.5343, "step": 680 }, { "epoch": 1.6338700059630291, "grad_norm": 1.00519597530365, "learning_rate": 2.1481490057854217e-05, "loss": 0.5329, "step": 685 }, { "epoch": 1.6457960644007157, "grad_norm": 1.1941732168197632, "learning_rate": 2.1172472618367483e-05, "loss": 0.5591, "step": 690 }, { "epoch": 1.6577221228384018, "grad_norm": 0.9398036599159241, "learning_rate": 2.0864052876950552e-05, "loss": 0.5254, "step": 695 }, { "epoch": 1.6696481812760884, "grad_norm": 0.8543565273284912, "learning_rate": 2.0556278995736782e-05, "loss": 0.5212, "step": 700 }, { "epoch": 1.6815742397137745, "grad_norm": 1.0366357564926147, "learning_rate": 2.024919903600344e-05, "loss": 0.5276, "step": 705 }, { "epoch": 1.693500298151461, "grad_norm": 1.2778667211532593, "learning_rate": 1.9942860950666574e-05, "loss": 0.5203, "step": 710 }, { "epoch": 1.7054263565891472, "grad_norm": 1.2391997575759888, "learning_rate": 1.9637312576792776e-05, "loss": 0.5291, "step": 715 }, { "epoch": 1.7173524150268338, "grad_norm": 1.0726789236068726, "learning_rate": 1.9332601628129128e-05, "loss": 0.5299, "step": 720 }, { "epoch": 1.72927847346452, "grad_norm": 1.1761037111282349, "learning_rate": 1.9028775687652217e-05, "loss": 0.5476, "step": 725 }, { "epoch": 1.7412045319022065, "grad_norm": 1.257358431816101, "learning_rate": 1.8725882200137762e-05, "loss": 0.5333, "step": 730 }, { "epoch": 1.7531305903398926, "grad_norm": 1.121039628982544, "learning_rate": 1.8423968464751722e-05, "loss": 0.4962, "step": 735 }, { "epoch": 1.7650566487775792, "grad_norm": 1.1800827980041504, "learning_rate": 1.812308162766418e-05, "loss": 0.4986, "step": 740 }, { "epoch": 1.7769827072152653, "grad_norm": 1.262669324874878, "learning_rate": 1.7823268674687077e-05, "loss": 0.5461, "step": 745 }, { "epoch": 1.7889087656529516, "grad_norm": 1.1524207592010498, "learning_rate": 1.7524576423937025e-05, "loss": 0.5392, "step": 750 }, { "epoch": 1.800834824090638, "grad_norm": 0.9975520968437195, "learning_rate": 1.7227051518524286e-05, "loss": 0.5162, "step": 755 }, { "epoch": 1.8127608825283243, "grad_norm": 1.1616145372390747, "learning_rate": 1.6930740419269132e-05, "loss": 0.5431, "step": 760 }, { "epoch": 1.8246869409660107, "grad_norm": 0.9507209658622742, "learning_rate": 1.6635689397446562e-05, "loss": 0.5037, "step": 765 }, { "epoch": 1.836612999403697, "grad_norm": 1.0409976243972778, "learning_rate": 1.6341944527560736e-05, "loss": 0.5164, "step": 770 }, { "epoch": 1.8485390578413834, "grad_norm": 0.8729709386825562, "learning_rate": 1.6049551680150047e-05, "loss": 0.495, "step": 775 }, { "epoch": 1.8604651162790697, "grad_norm": 1.223853588104248, "learning_rate": 1.5758556514624118e-05, "loss": 0.5834, "step": 780 }, { "epoch": 1.872391174716756, "grad_norm": 1.184877634048462, "learning_rate": 1.5469004472133696e-05, "loss": 0.5459, "step": 785 }, { "epoch": 1.8843172331544424, "grad_norm": 1.163509488105774, "learning_rate": 1.5180940768474689e-05, "loss": 0.516, "step": 790 }, { "epoch": 1.8962432915921288, "grad_norm": 1.2189282178878784, "learning_rate": 1.489441038702735e-05, "loss": 0.5519, "step": 795 }, { "epoch": 1.9081693500298151, "grad_norm": 1.1891826391220093, "learning_rate": 1.4609458071731796e-05, "loss": 0.5173, "step": 800 }, { "epoch": 1.9200954084675015, "grad_norm": 1.3610830307006836, "learning_rate": 1.4326128320100867e-05, "loss": 0.517, "step": 805 }, { "epoch": 1.9320214669051878, "grad_norm": 1.148544192314148, "learning_rate": 1.4044465376271532e-05, "loss": 0.4906, "step": 810 }, { "epoch": 1.9439475253428742, "grad_norm": 1.404956340789795, "learning_rate": 1.3764513224095762e-05, "loss": 0.5598, "step": 815 }, { "epoch": 1.9558735837805605, "grad_norm": 1.206635594367981, "learning_rate": 1.3486315580272202e-05, "loss": 0.5276, "step": 820 }, { "epoch": 1.9677996422182469, "grad_norm": 0.8971341848373413, "learning_rate": 1.320991588751938e-05, "loss": 0.4966, "step": 825 }, { "epoch": 1.9797257006559332, "grad_norm": 1.1437649726867676, "learning_rate": 1.2935357307791826e-05, "loss": 0.5133, "step": 830 }, { "epoch": 1.9916517590936196, "grad_norm": 1.2897300720214844, "learning_rate": 1.2662682715540031e-05, "loss": 0.4814, "step": 835 }, { "epoch": 2.0035778175313057, "grad_norm": 1.0928142070770264, "learning_rate": 1.2391934691015213e-05, "loss": 0.6167, "step": 840 }, { "epoch": 2.0155038759689923, "grad_norm": 1.1107832193374634, "learning_rate": 1.2123155513620108e-05, "loss": 0.4915, "step": 845 }, { "epoch": 2.0274299344066784, "grad_norm": 1.2382787466049194, "learning_rate": 1.1856387155306715e-05, "loss": 0.5116, "step": 850 }, { "epoch": 2.039355992844365, "grad_norm": 1.46440589427948, "learning_rate": 1.1591671274022035e-05, "loss": 0.4754, "step": 855 }, { "epoch": 2.051282051282051, "grad_norm": 1.4522420167922974, "learning_rate": 1.1329049207202904e-05, "loss": 0.4426, "step": 860 }, { "epoch": 2.0632081097197377, "grad_norm": 1.3376868963241577, "learning_rate": 1.1068561965320764e-05, "loss": 0.5026, "step": 865 }, { "epoch": 2.075134168157424, "grad_norm": 1.1326589584350586, "learning_rate": 1.0810250225477611e-05, "loss": 0.4872, "step": 870 }, { "epoch": 2.0870602265951104, "grad_norm": 1.5386810302734375, "learning_rate": 1.055415432505393e-05, "loss": 0.5022, "step": 875 }, { "epoch": 2.0989862850327965, "grad_norm": 1.2370398044586182, "learning_rate": 1.0300314255409704e-05, "loss": 0.5388, "step": 880 }, { "epoch": 2.110912343470483, "grad_norm": 1.2834645509719849, "learning_rate": 1.004876965563945e-05, "loss": 0.481, "step": 885 }, { "epoch": 2.122838401908169, "grad_norm": 1.3606491088867188, "learning_rate": 9.79955980638229e-06, "loss": 0.4829, "step": 890 }, { "epoch": 2.1347644603458558, "grad_norm": 1.3206753730773926, "learning_rate": 9.552723623687934e-06, "loss": 0.4902, "step": 895 }, { "epoch": 2.146690518783542, "grad_norm": 1.1643896102905273, "learning_rate": 9.308299652939666e-06, "loss": 0.5297, "step": 900 }, { "epoch": 2.1586165772212285, "grad_norm": 1.0991599559783936, "learning_rate": 9.066326062835179e-06, "loss": 0.4845, "step": 905 }, { "epoch": 2.1705426356589146, "grad_norm": 1.1832717657089233, "learning_rate": 8.826840639426218e-06, "loss": 0.5086, "step": 910 }, { "epoch": 2.182468694096601, "grad_norm": 1.385024070739746, "learning_rate": 8.589880780218049e-06, "loss": 0.5136, "step": 915 }, { "epoch": 2.1943947525342873, "grad_norm": 1.1657204627990723, "learning_rate": 8.355483488329471e-06, "loss": 0.5137, "step": 920 }, { "epoch": 2.206320810971974, "grad_norm": 1.0423134565353394, "learning_rate": 8.123685366714556e-06, "loss": 0.5032, "step": 925 }, { "epoch": 2.21824686940966, "grad_norm": 1.2326374053955078, "learning_rate": 7.89452261244677e-06, "loss": 0.4658, "step": 930 }, { "epoch": 2.2301729278473466, "grad_norm": 1.3008525371551514, "learning_rate": 7.66803101106657e-06, "loss": 0.5261, "step": 935 }, { "epoch": 2.2420989862850327, "grad_norm": 1.296058177947998, "learning_rate": 7.44424593099316e-06, "loss": 0.5216, "step": 940 }, { "epoch": 2.2540250447227193, "grad_norm": 1.3323789834976196, "learning_rate": 7.223202318001465e-06, "loss": 0.4949, "step": 945 }, { "epoch": 2.2659511031604054, "grad_norm": 1.2150744199752808, "learning_rate": 7.0049346897650745e-06, "loss": 0.464, "step": 950 }, { "epoch": 2.277877161598092, "grad_norm": 1.1837234497070312, "learning_rate": 6.789477130466057e-06, "loss": 0.5057, "step": 955 }, { "epoch": 2.289803220035778, "grad_norm": 1.0720417499542236, "learning_rate": 6.576863285472415e-06, "loss": 0.4875, "step": 960 }, { "epoch": 2.3017292784734646, "grad_norm": 1.4399152994155884, "learning_rate": 6.367126356084127e-06, "loss": 0.4753, "step": 965 }, { "epoch": 2.3136553369111508, "grad_norm": 1.3286051750183105, "learning_rate": 6.160299094348488e-06, "loss": 0.4725, "step": 970 }, { "epoch": 2.3255813953488373, "grad_norm": 1.3082605600357056, "learning_rate": 5.956413797945657e-06, "loss": 0.5095, "step": 975 }, { "epoch": 2.3375074537865235, "grad_norm": 1.4694650173187256, "learning_rate": 5.755502305145089e-06, "loss": 0.5099, "step": 980 }, { "epoch": 2.34943351222421, "grad_norm": 1.1760421991348267, "learning_rate": 5.557595989833747e-06, "loss": 0.4744, "step": 985 }, { "epoch": 2.361359570661896, "grad_norm": 1.5401874780654907, "learning_rate": 5.36272575661684e-06, "loss": 0.5126, "step": 990 }, { "epoch": 2.3732856290995827, "grad_norm": 1.114990234375, "learning_rate": 5.170922035991838e-06, "loss": 0.4636, "step": 995 }, { "epoch": 2.385211687537269, "grad_norm": 1.2573448419570923, "learning_rate": 4.9822147795964805e-06, "loss": 0.4817, "step": 1000 }, { "epoch": 2.3971377459749554, "grad_norm": 1.2121353149414062, "learning_rate": 4.7966334555316265e-06, "loss": 0.5238, "step": 1005 }, { "epoch": 2.4090638044126416, "grad_norm": 1.6261403560638428, "learning_rate": 4.614207043759556e-06, "loss": 0.4462, "step": 1010 }, { "epoch": 2.420989862850328, "grad_norm": 1.122673511505127, "learning_rate": 4.434964031578562e-06, "loss": 0.473, "step": 1015 }, { "epoch": 2.4329159212880143, "grad_norm": 1.2363194227218628, "learning_rate": 4.258932409174402e-06, "loss": 0.5256, "step": 1020 }, { "epoch": 2.444841979725701, "grad_norm": 1.0842331647872925, "learning_rate": 4.086139665249414e-06, "loss": 0.4561, "step": 1025 }, { "epoch": 2.456768038163387, "grad_norm": 1.498393177986145, "learning_rate": 3.9166127827299864e-06, "loss": 0.5378, "step": 1030 }, { "epoch": 2.4686940966010735, "grad_norm": 1.6119855642318726, "learning_rate": 3.7503782345529094e-06, "loss": 0.5014, "step": 1035 }, { "epoch": 2.4806201550387597, "grad_norm": 1.502034306526184, "learning_rate": 3.5874619795314537e-06, "loss": 0.4967, "step": 1040 }, { "epoch": 2.4925462134764462, "grad_norm": 1.162150263786316, "learning_rate": 3.427889458301678e-06, "loss": 0.4403, "step": 1045 }, { "epoch": 2.5044722719141324, "grad_norm": 1.1627933979034424, "learning_rate": 3.271685589349721e-06, "loss": 0.5167, "step": 1050 }, { "epoch": 2.5163983303518185, "grad_norm": 1.4371596574783325, "learning_rate": 3.1188747651205274e-06, "loss": 0.4748, "step": 1055 }, { "epoch": 2.528324388789505, "grad_norm": 1.236284852027893, "learning_rate": 2.9694808482088166e-06, "loss": 0.5, "step": 1060 }, { "epoch": 2.5402504472271916, "grad_norm": 1.4691346883773804, "learning_rate": 2.8235271676327448e-06, "loss": 0.4844, "step": 1065 }, { "epoch": 2.5521765056648777, "grad_norm": 1.3840566873550415, "learning_rate": 2.6810365151909045e-06, "loss": 0.4588, "step": 1070 }, { "epoch": 2.564102564102564, "grad_norm": 1.2048531770706177, "learning_rate": 2.5420311419031867e-06, "loss": 0.4561, "step": 1075 }, { "epoch": 2.5760286225402504, "grad_norm": 1.124703049659729, "learning_rate": 2.4065327545361394e-06, "loss": 0.4869, "step": 1080 }, { "epoch": 2.587954680977937, "grad_norm": 1.2013908624649048, "learning_rate": 2.2745625122132577e-06, "loss": 0.4966, "step": 1085 }, { "epoch": 2.599880739415623, "grad_norm": 1.136554479598999, "learning_rate": 2.1461410231108647e-06, "loss": 0.4629, "step": 1090 }, { "epoch": 2.6118067978533093, "grad_norm": 1.4465354681015015, "learning_rate": 2.0212883412399407e-06, "loss": 0.5041, "step": 1095 }, { "epoch": 2.623732856290996, "grad_norm": 1.2693179845809937, "learning_rate": 1.9000239633145567e-06, "loss": 0.523, "step": 1100 }, { "epoch": 2.6356589147286824, "grad_norm": 1.115979552268982, "learning_rate": 1.7823668257073046e-06, "loss": 0.4703, "step": 1105 }, { "epoch": 2.6475849731663685, "grad_norm": 1.3214530944824219, "learning_rate": 1.668335301492255e-06, "loss": 0.4987, "step": 1110 }, { "epoch": 2.6595110316040547, "grad_norm": 1.200873851776123, "learning_rate": 1.5579471975758265e-06, "loss": 0.5113, "step": 1115 }, { "epoch": 2.6714370900417412, "grad_norm": 1.3861937522888184, "learning_rate": 1.4512197519161247e-06, "loss": 0.472, "step": 1120 }, { "epoch": 2.683363148479428, "grad_norm": 1.4684505462646484, "learning_rate": 1.3481696308310783e-06, "loss": 0.511, "step": 1125 }, { "epoch": 2.695289206917114, "grad_norm": 1.1317168474197388, "learning_rate": 1.2488129263959065e-06, "loss": 0.5292, "step": 1130 }, { "epoch": 2.7072152653548, "grad_norm": 1.110571026802063, "learning_rate": 1.1531651539301634e-06, "loss": 0.4581, "step": 1135 }, { "epoch": 2.7191413237924866, "grad_norm": 1.1490678787231445, "learning_rate": 1.0612412495749348e-06, "loss": 0.5219, "step": 1140 }, { "epoch": 2.731067382230173, "grad_norm": 1.3028978109359741, "learning_rate": 9.730555679604374e-07, "loss": 0.4821, "step": 1145 }, { "epoch": 2.7429934406678593, "grad_norm": 1.3833311796188354, "learning_rate": 8.886218799644325e-07, "loss": 0.4483, "step": 1150 }, { "epoch": 2.7549194991055455, "grad_norm": 1.686269998550415, "learning_rate": 8.07953370561787e-07, "loss": 0.5451, "step": 1155 }, { "epoch": 2.766845557543232, "grad_norm": 1.3892123699188232, "learning_rate": 7.310626367655476e-07, "loss": 0.4644, "step": 1160 }, { "epoch": 2.778771615980918, "grad_norm": 1.3619730472564697, "learning_rate": 6.579616856598165e-07, "loss": 0.5294, "step": 1165 }, { "epoch": 2.7906976744186047, "grad_norm": 1.2040001153945923, "learning_rate": 5.886619325247561e-07, "loss": 0.4743, "step": 1170 }, { "epoch": 2.802623732856291, "grad_norm": 1.2743037939071655, "learning_rate": 5.23174199053994e-07, "loss": 0.5137, "step": 1175 }, { "epoch": 2.8145497912939774, "grad_norm": 1.5897459983825684, "learning_rate": 4.615087116647432e-07, "loss": 0.4973, "step": 1180 }, { "epoch": 2.8264758497316635, "grad_norm": 1.2881656885147095, "learning_rate": 4.0367509990085764e-07, "loss": 0.4836, "step": 1185 }, { "epoch": 2.83840190816935, "grad_norm": 1.1827319860458374, "learning_rate": 3.4968239492911557e-07, "loss": 0.5003, "step": 1190 }, { "epoch": 2.8503279666070362, "grad_norm": 1.4347820281982422, "learning_rate": 2.9953902812892563e-07, "loss": 0.4861, "step": 1195 }, { "epoch": 2.862254025044723, "grad_norm": 1.0762193202972412, "learning_rate": 2.532528297756992e-07, "loss": 0.4464, "step": 1200 }, { "epoch": 2.874180083482409, "grad_norm": 1.3653563261032104, "learning_rate": 2.1083102781809805e-07, "loss": 0.5013, "step": 1205 }, { "epoch": 2.8861061419200955, "grad_norm": 1.1515675783157349, "learning_rate": 1.7228024674932896e-07, "loss": 0.4679, "step": 1210 }, { "epoch": 2.8980322003577816, "grad_norm": 1.3256670236587524, "learning_rate": 1.3760650657268782e-07, "loss": 0.4972, "step": 1215 }, { "epoch": 2.909958258795468, "grad_norm": 1.1949636936187744, "learning_rate": 1.0681522186147264e-07, "loss": 0.4953, "step": 1220 }, { "epoch": 2.9218843172331543, "grad_norm": 1.1865308284759521, "learning_rate": 7.991120091347392e-08, "loss": 0.4842, "step": 1225 }, { "epoch": 2.933810375670841, "grad_norm": 1.071702480316162, "learning_rate": 5.6898645000108554e-08, "loss": 0.4684, "step": 1230 }, { "epoch": 2.945736434108527, "grad_norm": 1.2573881149291992, "learning_rate": 3.7781147710364006e-08, "loss": 0.4477, "step": 1235 }, { "epoch": 2.9576624925462136, "grad_norm": 1.0853211879730225, "learning_rate": 2.256169438963618e-08, "loss": 0.4787, "step": 1240 }, { "epoch": 2.9695885509838997, "grad_norm": 1.2400619983673096, "learning_rate": 1.1242661673527277e-08, "loss": 0.532, "step": 1245 }, { "epoch": 2.9815146094215863, "grad_norm": 1.2700461149215698, "learning_rate": 3.82581711674268e-09, "loss": 0.5079, "step": 1250 }, { "epoch": 2.9934406678592724, "grad_norm": 1.2555512189865112, "learning_rate": 3.123189170506358e-10, "loss": 0.478, "step": 1255 }, { "epoch": 2.998211091234347, "step": 1257, "total_flos": 4.3809870198708634e+17, "train_loss": 0.564498576457859, "train_runtime": 5097.4001, "train_samples_per_second": 1.974, "train_steps_per_second": 0.247 } ], "logging_steps": 5, "max_steps": 1257, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.3809870198708634e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }