{ "best_metric": 0.16189107298851013, "best_model_checkpoint": "checkpoints/rft-finetune-llama-3.1-8b-math50k/math50k/finetune-llama-3.1-8b-math50k-step-1/checkpoint-2421", "epoch": 0.999721059972106, "eval_steps": 269, "global_step": 2688, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018596001859600185, "grad_norm": 23.25, "learning_rate": 1.8587360594795542e-07, "loss": 0.5644, "step": 5 }, { "epoch": 0.003719200371920037, "grad_norm": 24.625, "learning_rate": 3.7174721189591085e-07, "loss": 0.5677, "step": 10 }, { "epoch": 0.005578800557880056, "grad_norm": 20.625, "learning_rate": 5.576208178438662e-07, "loss": 0.5478, "step": 15 }, { "epoch": 0.007438400743840074, "grad_norm": 18.125, "learning_rate": 7.434944237918217e-07, "loss": 0.5364, "step": 20 }, { "epoch": 0.009298000929800094, "grad_norm": 17.875, "learning_rate": 9.29368029739777e-07, "loss": 0.4842, "step": 25 }, { "epoch": 0.011157601115760111, "grad_norm": 14.3125, "learning_rate": 1.1152416356877324e-06, "loss": 0.4142, "step": 30 }, { "epoch": 0.01301720130172013, "grad_norm": 14.75, "learning_rate": 1.3011152416356879e-06, "loss": 0.3353, "step": 35 }, { "epoch": 0.014876801487680148, "grad_norm": 8.4375, "learning_rate": 1.4869888475836434e-06, "loss": 0.2517, "step": 40 }, { "epoch": 0.016736401673640166, "grad_norm": 5.875, "learning_rate": 1.6728624535315987e-06, "loss": 0.1879, "step": 45 }, { "epoch": 0.018596001859600187, "grad_norm": 3.5625, "learning_rate": 1.858736059479554e-06, "loss": 0.1624, "step": 50 }, { "epoch": 0.020455602045560205, "grad_norm": 2.984375, "learning_rate": 2.0446096654275095e-06, "loss": 0.1526, "step": 55 }, { "epoch": 0.022315202231520222, "grad_norm": 3.328125, "learning_rate": 2.2304832713754648e-06, "loss": 0.1651, "step": 60 }, { "epoch": 0.02417480241748024, "grad_norm": 3.046875, "learning_rate": 2.41635687732342e-06, "loss": 0.1557, "step": 65 }, { "epoch": 0.02603440260344026, "grad_norm": 2.890625, "learning_rate": 2.6022304832713758e-06, "loss": 0.1586, "step": 70 }, { "epoch": 0.02789400278940028, "grad_norm": 2.546875, "learning_rate": 2.788104089219331e-06, "loss": 0.1491, "step": 75 }, { "epoch": 0.029753602975360297, "grad_norm": 2.671875, "learning_rate": 2.973977695167287e-06, "loss": 0.1569, "step": 80 }, { "epoch": 0.03161320316132032, "grad_norm": 2.640625, "learning_rate": 3.159851301115242e-06, "loss": 0.1491, "step": 85 }, { "epoch": 0.03347280334728033, "grad_norm": 2.234375, "learning_rate": 3.3457249070631974e-06, "loss": 0.1459, "step": 90 }, { "epoch": 0.03533240353324035, "grad_norm": 2.453125, "learning_rate": 3.531598513011153e-06, "loss": 0.1548, "step": 95 }, { "epoch": 0.037192003719200374, "grad_norm": 2.265625, "learning_rate": 3.717472118959108e-06, "loss": 0.1529, "step": 100 }, { "epoch": 0.03905160390516039, "grad_norm": 2.5625, "learning_rate": 3.903345724907064e-06, "loss": 0.1543, "step": 105 }, { "epoch": 0.04091120409112041, "grad_norm": 2.453125, "learning_rate": 4.089219330855019e-06, "loss": 0.1521, "step": 110 }, { "epoch": 0.04277080427708043, "grad_norm": 2.5, "learning_rate": 4.275092936802974e-06, "loss": 0.153, "step": 115 }, { "epoch": 0.044630404463040445, "grad_norm": 2.515625, "learning_rate": 4.4609665427509296e-06, "loss": 0.1423, "step": 120 }, { "epoch": 0.046490004649000466, "grad_norm": 2.609375, "learning_rate": 4.646840148698885e-06, "loss": 0.1479, "step": 125 }, { "epoch": 0.04834960483496048, "grad_norm": 2.328125, "learning_rate": 4.83271375464684e-06, "loss": 0.1501, "step": 130 }, { "epoch": 0.0502092050209205, "grad_norm": 2.796875, "learning_rate": 5.0185873605947954e-06, "loss": 0.1611, "step": 135 }, { "epoch": 0.05206880520688052, "grad_norm": 2.484375, "learning_rate": 5.2044609665427516e-06, "loss": 0.1506, "step": 140 }, { "epoch": 0.05392840539284054, "grad_norm": 2.578125, "learning_rate": 5.390334572490706e-06, "loss": 0.1532, "step": 145 }, { "epoch": 0.05578800557880056, "grad_norm": 2.5625, "learning_rate": 5.576208178438662e-06, "loss": 0.1523, "step": 150 }, { "epoch": 0.05764760576476058, "grad_norm": 2.859375, "learning_rate": 5.7620817843866174e-06, "loss": 0.1584, "step": 155 }, { "epoch": 0.05950720595072059, "grad_norm": 2.59375, "learning_rate": 5.947955390334574e-06, "loss": 0.1504, "step": 160 }, { "epoch": 0.061366806136680614, "grad_norm": 2.609375, "learning_rate": 6.133828996282528e-06, "loss": 0.1609, "step": 165 }, { "epoch": 0.06322640632264064, "grad_norm": 2.28125, "learning_rate": 6.319702602230484e-06, "loss": 0.1546, "step": 170 }, { "epoch": 0.06508600650860065, "grad_norm": 2.421875, "learning_rate": 6.5055762081784395e-06, "loss": 0.1492, "step": 175 }, { "epoch": 0.06694560669456066, "grad_norm": 3.34375, "learning_rate": 6.691449814126395e-06, "loss": 0.1427, "step": 180 }, { "epoch": 0.06880520688052069, "grad_norm": 2.625, "learning_rate": 6.87732342007435e-06, "loss": 0.1569, "step": 185 }, { "epoch": 0.0706648070664807, "grad_norm": 2.21875, "learning_rate": 7.063197026022306e-06, "loss": 0.1523, "step": 190 }, { "epoch": 0.07252440725244072, "grad_norm": 3.046875, "learning_rate": 7.249070631970261e-06, "loss": 0.1609, "step": 195 }, { "epoch": 0.07438400743840075, "grad_norm": 2.359375, "learning_rate": 7.434944237918216e-06, "loss": 0.1586, "step": 200 }, { "epoch": 0.07624360762436076, "grad_norm": 2.609375, "learning_rate": 7.620817843866172e-06, "loss": 0.1536, "step": 205 }, { "epoch": 0.07810320781032078, "grad_norm": 2.671875, "learning_rate": 7.806691449814127e-06, "loss": 0.1632, "step": 210 }, { "epoch": 0.0799628079962808, "grad_norm": 2.4375, "learning_rate": 7.992565055762083e-06, "loss": 0.1558, "step": 215 }, { "epoch": 0.08182240818224082, "grad_norm": 2.4375, "learning_rate": 8.178438661710038e-06, "loss": 0.1534, "step": 220 }, { "epoch": 0.08368200836820083, "grad_norm": 2.78125, "learning_rate": 8.364312267657993e-06, "loss": 0.1576, "step": 225 }, { "epoch": 0.08554160855416086, "grad_norm": 2.40625, "learning_rate": 8.550185873605949e-06, "loss": 0.1643, "step": 230 }, { "epoch": 0.08740120874012088, "grad_norm": 2.609375, "learning_rate": 8.736059479553904e-06, "loss": 0.1599, "step": 235 }, { "epoch": 0.08926080892608089, "grad_norm": 2.5625, "learning_rate": 8.921933085501859e-06, "loss": 0.1573, "step": 240 }, { "epoch": 0.09112040911204092, "grad_norm": 2.828125, "learning_rate": 9.107806691449816e-06, "loss": 0.1614, "step": 245 }, { "epoch": 0.09298000929800093, "grad_norm": 2.296875, "learning_rate": 9.29368029739777e-06, "loss": 0.152, "step": 250 }, { "epoch": 0.09483960948396095, "grad_norm": 2.625, "learning_rate": 9.479553903345727e-06, "loss": 0.1531, "step": 255 }, { "epoch": 0.09669920966992096, "grad_norm": 2.4375, "learning_rate": 9.66542750929368e-06, "loss": 0.1592, "step": 260 }, { "epoch": 0.09855880985588099, "grad_norm": 2.609375, "learning_rate": 9.851301115241636e-06, "loss": 0.158, "step": 265 }, { "epoch": 0.100046490004649, "eval_loss": 0.182636097073555, "eval_runtime": 33.5333, "eval_samples_per_second": 306.71, "eval_steps_per_second": 9.602, "step": 269 }, { "epoch": 0.100418410041841, "grad_norm": 2.71875, "learning_rate": 9.99586606035552e-06, "loss": 0.1626, "step": 270 }, { "epoch": 0.10227801022780102, "grad_norm": 2.609375, "learning_rate": 9.975196362133114e-06, "loss": 0.1612, "step": 275 }, { "epoch": 0.10413761041376104, "grad_norm": 2.875, "learning_rate": 9.954526663910708e-06, "loss": 0.1575, "step": 280 }, { "epoch": 0.10599721059972106, "grad_norm": 2.421875, "learning_rate": 9.933856965688302e-06, "loss": 0.1661, "step": 285 }, { "epoch": 0.10785681078568107, "grad_norm": 2.46875, "learning_rate": 9.913187267465896e-06, "loss": 0.1663, "step": 290 }, { "epoch": 0.1097164109716411, "grad_norm": 2.578125, "learning_rate": 9.89251756924349e-06, "loss": 0.1742, "step": 295 }, { "epoch": 0.11157601115760112, "grad_norm": 2.578125, "learning_rate": 9.871847871021084e-06, "loss": 0.1564, "step": 300 }, { "epoch": 0.11343561134356113, "grad_norm": 4.125, "learning_rate": 9.851178172798678e-06, "loss": 0.1633, "step": 305 }, { "epoch": 0.11529521152952116, "grad_norm": 2.765625, "learning_rate": 9.830508474576272e-06, "loss": 0.1665, "step": 310 }, { "epoch": 0.11715481171548117, "grad_norm": 2.921875, "learning_rate": 9.809838776353866e-06, "loss": 0.1739, "step": 315 }, { "epoch": 0.11901441190144119, "grad_norm": 2.296875, "learning_rate": 9.78916907813146e-06, "loss": 0.1621, "step": 320 }, { "epoch": 0.12087401208740121, "grad_norm": 2.5625, "learning_rate": 9.768499379909055e-06, "loss": 0.167, "step": 325 }, { "epoch": 0.12273361227336123, "grad_norm": 2.703125, "learning_rate": 9.747829681686649e-06, "loss": 0.167, "step": 330 }, { "epoch": 0.12459321245932124, "grad_norm": 2.75, "learning_rate": 9.727159983464243e-06, "loss": 0.1601, "step": 335 }, { "epoch": 0.12645281264528127, "grad_norm": 2.703125, "learning_rate": 9.706490285241837e-06, "loss": 0.1705, "step": 340 }, { "epoch": 0.12831241283124128, "grad_norm": 2.59375, "learning_rate": 9.685820587019429e-06, "loss": 0.1575, "step": 345 }, { "epoch": 0.1301720130172013, "grad_norm": 2.40625, "learning_rate": 9.665150888797025e-06, "loss": 0.161, "step": 350 }, { "epoch": 0.1320316132031613, "grad_norm": 2.3125, "learning_rate": 9.644481190574619e-06, "loss": 0.1734, "step": 355 }, { "epoch": 0.13389121338912133, "grad_norm": 2.40625, "learning_rate": 9.623811492352211e-06, "loss": 0.1635, "step": 360 }, { "epoch": 0.13575081357508137, "grad_norm": 2.53125, "learning_rate": 9.603141794129807e-06, "loss": 0.1536, "step": 365 }, { "epoch": 0.13761041376104138, "grad_norm": 2.5625, "learning_rate": 9.5824720959074e-06, "loss": 0.1595, "step": 370 }, { "epoch": 0.1394700139470014, "grad_norm": 2.96875, "learning_rate": 9.561802397684995e-06, "loss": 0.1638, "step": 375 }, { "epoch": 0.1413296141329614, "grad_norm": 2.46875, "learning_rate": 9.54113269946259e-06, "loss": 0.1649, "step": 380 }, { "epoch": 0.14318921431892143, "grad_norm": 2.65625, "learning_rate": 9.520463001240182e-06, "loss": 0.1559, "step": 385 }, { "epoch": 0.14504881450488144, "grad_norm": 2.84375, "learning_rate": 9.499793303017778e-06, "loss": 0.1628, "step": 390 }, { "epoch": 0.14690841469084148, "grad_norm": 2.484375, "learning_rate": 9.47912360479537e-06, "loss": 0.1653, "step": 395 }, { "epoch": 0.1487680148768015, "grad_norm": 2.78125, "learning_rate": 9.458453906572966e-06, "loss": 0.1589, "step": 400 }, { "epoch": 0.1506276150627615, "grad_norm": 2.265625, "learning_rate": 9.437784208350558e-06, "loss": 0.1548, "step": 405 }, { "epoch": 0.15248721524872152, "grad_norm": 2.34375, "learning_rate": 9.417114510128152e-06, "loss": 0.1567, "step": 410 }, { "epoch": 0.15434681543468154, "grad_norm": 3.0625, "learning_rate": 9.396444811905748e-06, "loss": 0.1611, "step": 415 }, { "epoch": 0.15620641562064155, "grad_norm": 2.65625, "learning_rate": 9.37577511368334e-06, "loss": 0.1647, "step": 420 }, { "epoch": 0.15806601580660157, "grad_norm": 2.796875, "learning_rate": 9.355105415460936e-06, "loss": 0.1594, "step": 425 }, { "epoch": 0.1599256159925616, "grad_norm": 2.34375, "learning_rate": 9.334435717238529e-06, "loss": 0.1607, "step": 430 }, { "epoch": 0.16178521617852162, "grad_norm": 2.421875, "learning_rate": 9.313766019016123e-06, "loss": 0.1591, "step": 435 }, { "epoch": 0.16364481636448164, "grad_norm": 2.359375, "learning_rate": 9.293096320793717e-06, "loss": 0.1596, "step": 440 }, { "epoch": 0.16550441655044165, "grad_norm": 2.53125, "learning_rate": 9.27242662257131e-06, "loss": 0.1562, "step": 445 }, { "epoch": 0.16736401673640167, "grad_norm": 2.359375, "learning_rate": 9.251756924348905e-06, "loss": 0.1604, "step": 450 }, { "epoch": 0.16922361692236168, "grad_norm": 2.578125, "learning_rate": 9.231087226126499e-06, "loss": 0.1637, "step": 455 }, { "epoch": 0.17108321710832172, "grad_norm": 2.734375, "learning_rate": 9.210417527904093e-06, "loss": 0.1543, "step": 460 }, { "epoch": 0.17294281729428174, "grad_norm": 2.375, "learning_rate": 9.189747829681687e-06, "loss": 0.1573, "step": 465 }, { "epoch": 0.17480241748024175, "grad_norm": 2.34375, "learning_rate": 9.169078131459281e-06, "loss": 0.1511, "step": 470 }, { "epoch": 0.17666201766620176, "grad_norm": 2.3125, "learning_rate": 9.148408433236875e-06, "loss": 0.152, "step": 475 }, { "epoch": 0.17852161785216178, "grad_norm": 2.90625, "learning_rate": 9.12773873501447e-06, "loss": 0.1561, "step": 480 }, { "epoch": 0.1803812180381218, "grad_norm": 2.375, "learning_rate": 9.107069036792063e-06, "loss": 0.1557, "step": 485 }, { "epoch": 0.18224081822408184, "grad_norm": 2.234375, "learning_rate": 9.086399338569657e-06, "loss": 0.159, "step": 490 }, { "epoch": 0.18410041841004185, "grad_norm": 2.359375, "learning_rate": 9.065729640347252e-06, "loss": 0.1493, "step": 495 }, { "epoch": 0.18596001859600186, "grad_norm": 2.765625, "learning_rate": 9.045059942124846e-06, "loss": 0.1604, "step": 500 }, { "epoch": 0.18781961878196188, "grad_norm": 2.28125, "learning_rate": 9.02439024390244e-06, "loss": 0.159, "step": 505 }, { "epoch": 0.1896792189679219, "grad_norm": 2.625, "learning_rate": 9.003720545680034e-06, "loss": 0.1663, "step": 510 }, { "epoch": 0.1915388191538819, "grad_norm": 2.609375, "learning_rate": 8.983050847457628e-06, "loss": 0.1559, "step": 515 }, { "epoch": 0.19339841933984192, "grad_norm": 2.421875, "learning_rate": 8.962381149235222e-06, "loss": 0.1528, "step": 520 }, { "epoch": 0.19525801952580196, "grad_norm": 2.234375, "learning_rate": 8.941711451012816e-06, "loss": 0.1544, "step": 525 }, { "epoch": 0.19711761971176198, "grad_norm": 2.609375, "learning_rate": 8.92104175279041e-06, "loss": 0.1559, "step": 530 }, { "epoch": 0.198977219897722, "grad_norm": 2.640625, "learning_rate": 8.900372054568004e-06, "loss": 0.1648, "step": 535 }, { "epoch": 0.200092980009298, "eval_loss": 0.17836953699588776, "eval_runtime": 33.5274, "eval_samples_per_second": 306.764, "eval_steps_per_second": 9.604, "step": 538 }, { "epoch": 0.200836820083682, "grad_norm": 2.3125, "learning_rate": 8.879702356345598e-06, "loss": 0.168, "step": 540 }, { "epoch": 0.20269642026964202, "grad_norm": 2.109375, "learning_rate": 8.859032658123192e-06, "loss": 0.1605, "step": 545 }, { "epoch": 0.20455602045560203, "grad_norm": 2.796875, "learning_rate": 8.838362959900786e-06, "loss": 0.1626, "step": 550 }, { "epoch": 0.20641562064156208, "grad_norm": 2.203125, "learning_rate": 8.81769326167838e-06, "loss": 0.1535, "step": 555 }, { "epoch": 0.2082752208275221, "grad_norm": 2.546875, "learning_rate": 8.797023563455975e-06, "loss": 0.1566, "step": 560 }, { "epoch": 0.2101348210134821, "grad_norm": 2.4375, "learning_rate": 8.776353865233567e-06, "loss": 0.1587, "step": 565 }, { "epoch": 0.21199442119944212, "grad_norm": 2.640625, "learning_rate": 8.755684167011163e-06, "loss": 0.1609, "step": 570 }, { "epoch": 0.21385402138540213, "grad_norm": 2.828125, "learning_rate": 8.735014468788757e-06, "loss": 0.1522, "step": 575 }, { "epoch": 0.21571362157136215, "grad_norm": 2.28125, "learning_rate": 8.714344770566351e-06, "loss": 0.1631, "step": 580 }, { "epoch": 0.2175732217573222, "grad_norm": 2.28125, "learning_rate": 8.693675072343945e-06, "loss": 0.1662, "step": 585 }, { "epoch": 0.2194328219432822, "grad_norm": 2.328125, "learning_rate": 8.673005374121537e-06, "loss": 0.154, "step": 590 }, { "epoch": 0.22129242212924222, "grad_norm": 2.171875, "learning_rate": 8.652335675899133e-06, "loss": 0.1576, "step": 595 }, { "epoch": 0.22315202231520223, "grad_norm": 2.53125, "learning_rate": 8.631665977676727e-06, "loss": 0.1599, "step": 600 }, { "epoch": 0.22501162250116225, "grad_norm": 2.046875, "learning_rate": 8.610996279454321e-06, "loss": 0.1563, "step": 605 }, { "epoch": 0.22687122268712226, "grad_norm": 2.453125, "learning_rate": 8.590326581231915e-06, "loss": 0.1585, "step": 610 }, { "epoch": 0.22873082287308227, "grad_norm": 2.40625, "learning_rate": 8.569656883009508e-06, "loss": 0.1566, "step": 615 }, { "epoch": 0.23059042305904232, "grad_norm": 2.328125, "learning_rate": 8.548987184787104e-06, "loss": 0.16, "step": 620 }, { "epoch": 0.23245002324500233, "grad_norm": 2.109375, "learning_rate": 8.528317486564696e-06, "loss": 0.1503, "step": 625 }, { "epoch": 0.23430962343096234, "grad_norm": 2.375, "learning_rate": 8.507647788342292e-06, "loss": 0.158, "step": 630 }, { "epoch": 0.23616922361692236, "grad_norm": 2.609375, "learning_rate": 8.486978090119886e-06, "loss": 0.1615, "step": 635 }, { "epoch": 0.23802882380288237, "grad_norm": 2.296875, "learning_rate": 8.466308391897478e-06, "loss": 0.1569, "step": 640 }, { "epoch": 0.2398884239888424, "grad_norm": 2.953125, "learning_rate": 8.445638693675074e-06, "loss": 0.1536, "step": 645 }, { "epoch": 0.24174802417480243, "grad_norm": 2.4375, "learning_rate": 8.424968995452666e-06, "loss": 0.1525, "step": 650 }, { "epoch": 0.24360762436076244, "grad_norm": 2.359375, "learning_rate": 8.40429929723026e-06, "loss": 0.1461, "step": 655 }, { "epoch": 0.24546722454672246, "grad_norm": 2.546875, "learning_rate": 8.383629599007855e-06, "loss": 0.1585, "step": 660 }, { "epoch": 0.24732682473268247, "grad_norm": 2.390625, "learning_rate": 8.362959900785449e-06, "loss": 0.1494, "step": 665 }, { "epoch": 0.24918642491864249, "grad_norm": 2.53125, "learning_rate": 8.342290202563044e-06, "loss": 0.1589, "step": 670 }, { "epoch": 0.2510460251046025, "grad_norm": 2.5, "learning_rate": 8.321620504340637e-06, "loss": 0.1554, "step": 675 }, { "epoch": 0.25290562529056254, "grad_norm": 2.515625, "learning_rate": 8.30095080611823e-06, "loss": 0.1539, "step": 680 }, { "epoch": 0.2547652254765225, "grad_norm": 2.296875, "learning_rate": 8.280281107895825e-06, "loss": 0.1518, "step": 685 }, { "epoch": 0.25662482566248257, "grad_norm": 2.578125, "learning_rate": 8.259611409673419e-06, "loss": 0.1545, "step": 690 }, { "epoch": 0.2584844258484426, "grad_norm": 2.515625, "learning_rate": 8.238941711451015e-06, "loss": 0.1535, "step": 695 }, { "epoch": 0.2603440260344026, "grad_norm": 2.484375, "learning_rate": 8.218272013228607e-06, "loss": 0.1541, "step": 700 }, { "epoch": 0.26220362622036264, "grad_norm": 2.3125, "learning_rate": 8.197602315006201e-06, "loss": 0.1518, "step": 705 }, { "epoch": 0.2640632264063226, "grad_norm": 2.328125, "learning_rate": 8.176932616783795e-06, "loss": 0.1532, "step": 710 }, { "epoch": 0.26592282659228267, "grad_norm": 2.3125, "learning_rate": 8.15626291856139e-06, "loss": 0.1554, "step": 715 }, { "epoch": 0.26778242677824265, "grad_norm": 2.09375, "learning_rate": 8.135593220338983e-06, "loss": 0.1485, "step": 720 }, { "epoch": 0.2696420269642027, "grad_norm": 2.234375, "learning_rate": 8.114923522116578e-06, "loss": 0.1564, "step": 725 }, { "epoch": 0.27150162715016274, "grad_norm": 2.171875, "learning_rate": 8.094253823894172e-06, "loss": 0.1523, "step": 730 }, { "epoch": 0.2733612273361227, "grad_norm": 2.28125, "learning_rate": 8.073584125671766e-06, "loss": 0.1565, "step": 735 }, { "epoch": 0.27522082752208277, "grad_norm": 2.171875, "learning_rate": 8.05291442744936e-06, "loss": 0.1482, "step": 740 }, { "epoch": 0.27708042770804275, "grad_norm": 2.296875, "learning_rate": 8.032244729226954e-06, "loss": 0.155, "step": 745 }, { "epoch": 0.2789400278940028, "grad_norm": 2.390625, "learning_rate": 8.011575031004548e-06, "loss": 0.1449, "step": 750 }, { "epoch": 0.2807996280799628, "grad_norm": 2.109375, "learning_rate": 7.990905332782142e-06, "loss": 0.151, "step": 755 }, { "epoch": 0.2826592282659228, "grad_norm": 2.5, "learning_rate": 7.970235634559736e-06, "loss": 0.1576, "step": 760 }, { "epoch": 0.28451882845188287, "grad_norm": 2.296875, "learning_rate": 7.94956593633733e-06, "loss": 0.1625, "step": 765 }, { "epoch": 0.28637842863784285, "grad_norm": 2.5, "learning_rate": 7.928896238114924e-06, "loss": 0.1558, "step": 770 }, { "epoch": 0.2882380288238029, "grad_norm": 2.453125, "learning_rate": 7.908226539892518e-06, "loss": 0.1512, "step": 775 }, { "epoch": 0.2900976290097629, "grad_norm": 2.171875, "learning_rate": 7.887556841670112e-06, "loss": 0.1568, "step": 780 }, { "epoch": 0.2919572291957229, "grad_norm": 2.34375, "learning_rate": 7.866887143447707e-06, "loss": 0.1561, "step": 785 }, { "epoch": 0.29381682938168296, "grad_norm": 2.40625, "learning_rate": 7.8462174452253e-06, "loss": 0.1592, "step": 790 }, { "epoch": 0.29567642956764295, "grad_norm": 2.1875, "learning_rate": 7.825547747002895e-06, "loss": 0.1481, "step": 795 }, { "epoch": 0.297536029753603, "grad_norm": 2.015625, "learning_rate": 7.804878048780489e-06, "loss": 0.1478, "step": 800 }, { "epoch": 0.299395629939563, "grad_norm": 2.125, "learning_rate": 7.784208350558083e-06, "loss": 0.1562, "step": 805 }, { "epoch": 0.300139470013947, "eval_loss": 0.17311781644821167, "eval_runtime": 33.5454, "eval_samples_per_second": 306.599, "eval_steps_per_second": 9.599, "step": 807 }, { "epoch": 0.301255230125523, "grad_norm": 2.234375, "learning_rate": 7.763538652335677e-06, "loss": 0.1632, "step": 810 }, { "epoch": 0.303114830311483, "grad_norm": 2.25, "learning_rate": 7.742868954113271e-06, "loss": 0.1533, "step": 815 }, { "epoch": 0.30497443049744305, "grad_norm": 2.3125, "learning_rate": 7.722199255890865e-06, "loss": 0.1567, "step": 820 }, { "epoch": 0.3068340306834031, "grad_norm": 2.46875, "learning_rate": 7.70152955766846e-06, "loss": 0.1494, "step": 825 }, { "epoch": 0.3086936308693631, "grad_norm": 2.375, "learning_rate": 7.680859859446053e-06, "loss": 0.1539, "step": 830 }, { "epoch": 0.3105532310553231, "grad_norm": 2.453125, "learning_rate": 7.660190161223646e-06, "loss": 0.152, "step": 835 }, { "epoch": 0.3124128312412831, "grad_norm": 2.234375, "learning_rate": 7.639520463001241e-06, "loss": 0.1491, "step": 840 }, { "epoch": 0.31427243142724315, "grad_norm": 2.4375, "learning_rate": 7.618850764778835e-06, "loss": 0.1475, "step": 845 }, { "epoch": 0.31613203161320313, "grad_norm": 2.546875, "learning_rate": 7.59818106655643e-06, "loss": 0.1527, "step": 850 }, { "epoch": 0.3179916317991632, "grad_norm": 2.453125, "learning_rate": 7.577511368334023e-06, "loss": 0.1527, "step": 855 }, { "epoch": 0.3198512319851232, "grad_norm": 2.09375, "learning_rate": 7.556841670111617e-06, "loss": 0.1491, "step": 860 }, { "epoch": 0.3217108321710832, "grad_norm": 2.578125, "learning_rate": 7.536171971889211e-06, "loss": 0.1509, "step": 865 }, { "epoch": 0.32357043235704325, "grad_norm": 2.390625, "learning_rate": 7.515502273666805e-06, "loss": 0.1501, "step": 870 }, { "epoch": 0.32543003254300323, "grad_norm": 2.46875, "learning_rate": 7.4948325754444e-06, "loss": 0.1509, "step": 875 }, { "epoch": 0.3272896327289633, "grad_norm": 1.9296875, "learning_rate": 7.474162877221993e-06, "loss": 0.1449, "step": 880 }, { "epoch": 0.3291492329149233, "grad_norm": 2.125, "learning_rate": 7.4534931789995864e-06, "loss": 0.148, "step": 885 }, { "epoch": 0.3310088331008833, "grad_norm": 1.8984375, "learning_rate": 7.432823480777181e-06, "loss": 0.1459, "step": 890 }, { "epoch": 0.33286843328684335, "grad_norm": 2.28125, "learning_rate": 7.4121537825547755e-06, "loss": 0.1604, "step": 895 }, { "epoch": 0.33472803347280333, "grad_norm": 2.171875, "learning_rate": 7.3914840843323695e-06, "loss": 0.1491, "step": 900 }, { "epoch": 0.3365876336587634, "grad_norm": 1.8828125, "learning_rate": 7.370814386109964e-06, "loss": 0.1453, "step": 905 }, { "epoch": 0.33844723384472336, "grad_norm": 2.203125, "learning_rate": 7.350144687887557e-06, "loss": 0.1485, "step": 910 }, { "epoch": 0.3403068340306834, "grad_norm": 2.921875, "learning_rate": 7.329474989665152e-06, "loss": 0.1503, "step": 915 }, { "epoch": 0.34216643421664344, "grad_norm": 2.21875, "learning_rate": 7.308805291442745e-06, "loss": 0.1553, "step": 920 }, { "epoch": 0.34402603440260343, "grad_norm": 2.1875, "learning_rate": 7.288135593220339e-06, "loss": 0.1485, "step": 925 }, { "epoch": 0.3458856345885635, "grad_norm": 2.125, "learning_rate": 7.267465894997934e-06, "loss": 0.1616, "step": 930 }, { "epoch": 0.34774523477452346, "grad_norm": 2.09375, "learning_rate": 7.246796196775527e-06, "loss": 0.1444, "step": 935 }, { "epoch": 0.3496048349604835, "grad_norm": 2.03125, "learning_rate": 7.226126498553122e-06, "loss": 0.1489, "step": 940 }, { "epoch": 0.3514644351464435, "grad_norm": 2.421875, "learning_rate": 7.205456800330715e-06, "loss": 0.1459, "step": 945 }, { "epoch": 0.35332403533240353, "grad_norm": 2.21875, "learning_rate": 7.1847871021083095e-06, "loss": 0.1472, "step": 950 }, { "epoch": 0.35518363551836357, "grad_norm": 2.546875, "learning_rate": 7.164117403885904e-06, "loss": 0.1488, "step": 955 }, { "epoch": 0.35704323570432356, "grad_norm": 2.1875, "learning_rate": 7.143447705663498e-06, "loss": 0.1418, "step": 960 }, { "epoch": 0.3589028358902836, "grad_norm": 2.359375, "learning_rate": 7.122778007441093e-06, "loss": 0.1523, "step": 965 }, { "epoch": 0.3607624360762436, "grad_norm": 1.9375, "learning_rate": 7.102108309218686e-06, "loss": 0.1517, "step": 970 }, { "epoch": 0.36262203626220363, "grad_norm": 2.234375, "learning_rate": 7.08143861099628e-06, "loss": 0.1486, "step": 975 }, { "epoch": 0.36448163644816367, "grad_norm": 2.46875, "learning_rate": 7.060768912773874e-06, "loss": 0.1517, "step": 980 }, { "epoch": 0.36634123663412366, "grad_norm": 2.25, "learning_rate": 7.040099214551468e-06, "loss": 0.1514, "step": 985 }, { "epoch": 0.3682008368200837, "grad_norm": 2.171875, "learning_rate": 7.019429516329063e-06, "loss": 0.1442, "step": 990 }, { "epoch": 0.3700604370060437, "grad_norm": 2.234375, "learning_rate": 6.998759818106656e-06, "loss": 0.1558, "step": 995 }, { "epoch": 0.3719200371920037, "grad_norm": 2.015625, "learning_rate": 6.97809011988425e-06, "loss": 0.1462, "step": 1000 }, { "epoch": 0.3737796373779637, "grad_norm": 2.234375, "learning_rate": 6.957420421661844e-06, "loss": 0.1455, "step": 1005 }, { "epoch": 0.37563923756392376, "grad_norm": 1.9765625, "learning_rate": 6.9367507234394385e-06, "loss": 0.1477, "step": 1010 }, { "epoch": 0.3774988377498838, "grad_norm": 2.5, "learning_rate": 6.9160810252170325e-06, "loss": 0.1542, "step": 1015 }, { "epoch": 0.3793584379358438, "grad_norm": 2.046875, "learning_rate": 6.895411326994627e-06, "loss": 0.1518, "step": 1020 }, { "epoch": 0.3812180381218038, "grad_norm": 2.21875, "learning_rate": 6.87474162877222e-06, "loss": 0.1554, "step": 1025 }, { "epoch": 0.3830776383077638, "grad_norm": 2.21875, "learning_rate": 6.854071930549815e-06, "loss": 0.1465, "step": 1030 }, { "epoch": 0.38493723849372385, "grad_norm": 2.5, "learning_rate": 6.833402232327409e-06, "loss": 0.1462, "step": 1035 }, { "epoch": 0.38679683867968384, "grad_norm": 2.90625, "learning_rate": 6.812732534105002e-06, "loss": 0.1509, "step": 1040 }, { "epoch": 0.3886564388656439, "grad_norm": 2.046875, "learning_rate": 6.792062835882597e-06, "loss": 0.1497, "step": 1045 }, { "epoch": 0.3905160390516039, "grad_norm": 2.375, "learning_rate": 6.77139313766019e-06, "loss": 0.1496, "step": 1050 }, { "epoch": 0.3923756392375639, "grad_norm": 2.328125, "learning_rate": 6.750723439437785e-06, "loss": 0.1439, "step": 1055 }, { "epoch": 0.39423523942352395, "grad_norm": 2.1875, "learning_rate": 6.730053741215378e-06, "loss": 0.1433, "step": 1060 }, { "epoch": 0.39609483960948394, "grad_norm": 2.015625, "learning_rate": 6.7093840429929725e-06, "loss": 0.1542, "step": 1065 }, { "epoch": 0.397954439795444, "grad_norm": 2.125, "learning_rate": 6.6887143447705674e-06, "loss": 0.1373, "step": 1070 }, { "epoch": 0.399814039981404, "grad_norm": 2.328125, "learning_rate": 6.668044646548161e-06, "loss": 0.1475, "step": 1075 }, { "epoch": 0.400185960018596, "eval_loss": 0.16861507296562195, "eval_runtime": 33.5378, "eval_samples_per_second": 306.669, "eval_steps_per_second": 9.601, "step": 1076 }, { "epoch": 0.401673640167364, "grad_norm": 2.3125, "learning_rate": 6.647374948325756e-06, "loss": 0.1518, "step": 1080 }, { "epoch": 0.40353324035332405, "grad_norm": 2.125, "learning_rate": 6.626705250103349e-06, "loss": 0.1466, "step": 1085 }, { "epoch": 0.40539284053928404, "grad_norm": 2.328125, "learning_rate": 6.606035551880943e-06, "loss": 0.1472, "step": 1090 }, { "epoch": 0.4072524407252441, "grad_norm": 2.46875, "learning_rate": 6.585365853658538e-06, "loss": 0.1469, "step": 1095 }, { "epoch": 0.40911204091120407, "grad_norm": 2.015625, "learning_rate": 6.564696155436131e-06, "loss": 0.1413, "step": 1100 }, { "epoch": 0.4109716410971641, "grad_norm": 2.078125, "learning_rate": 6.544026457213726e-06, "loss": 0.151, "step": 1105 }, { "epoch": 0.41283124128312415, "grad_norm": 2.1875, "learning_rate": 6.523356758991319e-06, "loss": 0.1492, "step": 1110 }, { "epoch": 0.41469084146908414, "grad_norm": 2.640625, "learning_rate": 6.502687060768913e-06, "loss": 0.1415, "step": 1115 }, { "epoch": 0.4165504416550442, "grad_norm": 2.40625, "learning_rate": 6.482017362546507e-06, "loss": 0.1425, "step": 1120 }, { "epoch": 0.41841004184100417, "grad_norm": 2.25, "learning_rate": 6.4613476643241015e-06, "loss": 0.1482, "step": 1125 }, { "epoch": 0.4202696420269642, "grad_norm": 2.140625, "learning_rate": 6.440677966101695e-06, "loss": 0.1435, "step": 1130 }, { "epoch": 0.4221292422129242, "grad_norm": 2.1875, "learning_rate": 6.42000826787929e-06, "loss": 0.1448, "step": 1135 }, { "epoch": 0.42398884239888424, "grad_norm": 2.015625, "learning_rate": 6.399338569656883e-06, "loss": 0.1452, "step": 1140 }, { "epoch": 0.4258484425848443, "grad_norm": 2.234375, "learning_rate": 6.378668871434478e-06, "loss": 0.1498, "step": 1145 }, { "epoch": 0.42770804277080426, "grad_norm": 2.03125, "learning_rate": 6.357999173212072e-06, "loss": 0.146, "step": 1150 }, { "epoch": 0.4295676429567643, "grad_norm": 2.234375, "learning_rate": 6.337329474989665e-06, "loss": 0.1426, "step": 1155 }, { "epoch": 0.4314272431427243, "grad_norm": 2.046875, "learning_rate": 6.31665977676726e-06, "loss": 0.1387, "step": 1160 }, { "epoch": 0.43328684332868433, "grad_norm": 2.171875, "learning_rate": 6.295990078544853e-06, "loss": 0.1495, "step": 1165 }, { "epoch": 0.4351464435146444, "grad_norm": 2.15625, "learning_rate": 6.275320380322448e-06, "loss": 0.1367, "step": 1170 }, { "epoch": 0.43700604370060436, "grad_norm": 2.171875, "learning_rate": 6.254650682100042e-06, "loss": 0.1469, "step": 1175 }, { "epoch": 0.4388656438865644, "grad_norm": 2.328125, "learning_rate": 6.2339809838776355e-06, "loss": 0.1466, "step": 1180 }, { "epoch": 0.4407252440725244, "grad_norm": 2.09375, "learning_rate": 6.2133112856552304e-06, "loss": 0.1382, "step": 1185 }, { "epoch": 0.44258484425848443, "grad_norm": 2.078125, "learning_rate": 6.192641587432824e-06, "loss": 0.146, "step": 1190 }, { "epoch": 0.4444444444444444, "grad_norm": 2.28125, "learning_rate": 6.171971889210419e-06, "loss": 0.1453, "step": 1195 }, { "epoch": 0.44630404463040446, "grad_norm": 2.28125, "learning_rate": 6.151302190988012e-06, "loss": 0.1559, "step": 1200 }, { "epoch": 0.4481636448163645, "grad_norm": 2.203125, "learning_rate": 6.130632492765606e-06, "loss": 0.1461, "step": 1205 }, { "epoch": 0.4500232450023245, "grad_norm": 2.6875, "learning_rate": 6.109962794543201e-06, "loss": 0.1512, "step": 1210 }, { "epoch": 0.45188284518828453, "grad_norm": 2.03125, "learning_rate": 6.089293096320794e-06, "loss": 0.1455, "step": 1215 }, { "epoch": 0.4537424453742445, "grad_norm": 2.203125, "learning_rate": 6.068623398098388e-06, "loss": 0.1458, "step": 1220 }, { "epoch": 0.45560204556020456, "grad_norm": 2.078125, "learning_rate": 6.047953699875982e-06, "loss": 0.1452, "step": 1225 }, { "epoch": 0.45746164574616455, "grad_norm": 2.109375, "learning_rate": 6.027284001653576e-06, "loss": 0.1427, "step": 1230 }, { "epoch": 0.4593212459321246, "grad_norm": 2.1875, "learning_rate": 6.00661430343117e-06, "loss": 0.1433, "step": 1235 }, { "epoch": 0.46118084611808463, "grad_norm": 2.28125, "learning_rate": 5.9859446052087645e-06, "loss": 0.143, "step": 1240 }, { "epoch": 0.4630404463040446, "grad_norm": 2.40625, "learning_rate": 5.965274906986358e-06, "loss": 0.1357, "step": 1245 }, { "epoch": 0.46490004649000466, "grad_norm": 2.046875, "learning_rate": 5.944605208763953e-06, "loss": 0.1469, "step": 1250 }, { "epoch": 0.46675964667596465, "grad_norm": 2.09375, "learning_rate": 5.923935510541547e-06, "loss": 0.1466, "step": 1255 }, { "epoch": 0.4686192468619247, "grad_norm": 2.25, "learning_rate": 5.903265812319141e-06, "loss": 0.1451, "step": 1260 }, { "epoch": 0.47047884704788473, "grad_norm": 2.125, "learning_rate": 5.882596114096735e-06, "loss": 0.1475, "step": 1265 }, { "epoch": 0.4723384472338447, "grad_norm": 1.8515625, "learning_rate": 5.861926415874328e-06, "loss": 0.1412, "step": 1270 }, { "epoch": 0.47419804741980476, "grad_norm": 2.078125, "learning_rate": 5.841256717651923e-06, "loss": 0.1461, "step": 1275 }, { "epoch": 0.47605764760576474, "grad_norm": 2.0625, "learning_rate": 5.820587019429516e-06, "loss": 0.1437, "step": 1280 }, { "epoch": 0.4779172477917248, "grad_norm": 2.625, "learning_rate": 5.799917321207111e-06, "loss": 0.145, "step": 1285 }, { "epoch": 0.4797768479776848, "grad_norm": 2.359375, "learning_rate": 5.779247622984705e-06, "loss": 0.1407, "step": 1290 }, { "epoch": 0.4816364481636448, "grad_norm": 2.171875, "learning_rate": 5.7585779247622985e-06, "loss": 0.1455, "step": 1295 }, { "epoch": 0.48349604834960486, "grad_norm": 2.015625, "learning_rate": 5.7379082265398934e-06, "loss": 0.1437, "step": 1300 }, { "epoch": 0.48535564853556484, "grad_norm": 2.171875, "learning_rate": 5.717238528317487e-06, "loss": 0.1423, "step": 1305 }, { "epoch": 0.4872152487215249, "grad_norm": 2.265625, "learning_rate": 5.696568830095081e-06, "loss": 0.1538, "step": 1310 }, { "epoch": 0.48907484890748487, "grad_norm": 2.125, "learning_rate": 5.675899131872676e-06, "loss": 0.1477, "step": 1315 }, { "epoch": 0.4909344490934449, "grad_norm": 2.21875, "learning_rate": 5.655229433650269e-06, "loss": 0.1467, "step": 1320 }, { "epoch": 0.49279404927940496, "grad_norm": 2.140625, "learning_rate": 5.634559735427864e-06, "loss": 0.1357, "step": 1325 }, { "epoch": 0.49465364946536494, "grad_norm": 2.328125, "learning_rate": 5.613890037205457e-06, "loss": 0.1483, "step": 1330 }, { "epoch": 0.496513249651325, "grad_norm": 2.359375, "learning_rate": 5.593220338983051e-06, "loss": 0.1485, "step": 1335 }, { "epoch": 0.49837284983728497, "grad_norm": 2.078125, "learning_rate": 5.572550640760645e-06, "loss": 0.1472, "step": 1340 }, { "epoch": 0.500232450023245, "grad_norm": 2.46875, "learning_rate": 5.551880942538239e-06, "loss": 0.1515, "step": 1345 }, { "epoch": 0.500232450023245, "eval_loss": 0.16567149758338928, "eval_runtime": 33.5148, "eval_samples_per_second": 306.88, "eval_steps_per_second": 9.608, "step": 1345 }, { "epoch": 0.502092050209205, "grad_norm": 2.25, "learning_rate": 5.531211244315834e-06, "loss": 0.1434, "step": 1350 }, { "epoch": 0.503951650395165, "grad_norm": 2.515625, "learning_rate": 5.5105415460934275e-06, "loss": 0.1489, "step": 1355 }, { "epoch": 0.5058112505811251, "grad_norm": 2.125, "learning_rate": 5.489871847871021e-06, "loss": 0.1423, "step": 1360 }, { "epoch": 0.5076708507670851, "grad_norm": 1.984375, "learning_rate": 5.469202149648616e-06, "loss": 0.1532, "step": 1365 }, { "epoch": 0.509530450953045, "grad_norm": 2.328125, "learning_rate": 5.44853245142621e-06, "loss": 0.1474, "step": 1370 }, { "epoch": 0.5113900511390052, "grad_norm": 2.125, "learning_rate": 5.427862753203804e-06, "loss": 0.141, "step": 1375 }, { "epoch": 0.5132496513249651, "grad_norm": 2.21875, "learning_rate": 5.407193054981398e-06, "loss": 0.1432, "step": 1380 }, { "epoch": 0.5151092515109251, "grad_norm": 1.984375, "learning_rate": 5.386523356758991e-06, "loss": 0.1511, "step": 1385 }, { "epoch": 0.5169688516968852, "grad_norm": 2.265625, "learning_rate": 5.365853658536586e-06, "loss": 0.1479, "step": 1390 }, { "epoch": 0.5188284518828452, "grad_norm": 2.078125, "learning_rate": 5.34518396031418e-06, "loss": 0.1377, "step": 1395 }, { "epoch": 0.5206880520688052, "grad_norm": 1.9609375, "learning_rate": 5.324514262091773e-06, "loss": 0.1413, "step": 1400 }, { "epoch": 0.5225476522547652, "grad_norm": 2.25, "learning_rate": 5.303844563869368e-06, "loss": 0.1357, "step": 1405 }, { "epoch": 0.5244072524407253, "grad_norm": 1.96875, "learning_rate": 5.2831748656469615e-06, "loss": 0.1454, "step": 1410 }, { "epoch": 0.5262668526266853, "grad_norm": 2.125, "learning_rate": 5.2625051674245564e-06, "loss": 0.1455, "step": 1415 }, { "epoch": 0.5281264528126453, "grad_norm": 2.203125, "learning_rate": 5.24183546920215e-06, "loss": 0.1477, "step": 1420 }, { "epoch": 0.5299860529986054, "grad_norm": 1.890625, "learning_rate": 5.221165770979744e-06, "loss": 0.1383, "step": 1425 }, { "epoch": 0.5318456531845653, "grad_norm": 2.21875, "learning_rate": 5.200496072757339e-06, "loss": 0.1435, "step": 1430 }, { "epoch": 0.5337052533705253, "grad_norm": 2.328125, "learning_rate": 5.179826374534932e-06, "loss": 0.1492, "step": 1435 }, { "epoch": 0.5355648535564853, "grad_norm": 2.078125, "learning_rate": 5.159156676312527e-06, "loss": 0.1387, "step": 1440 }, { "epoch": 0.5374244537424454, "grad_norm": 2.578125, "learning_rate": 5.13848697809012e-06, "loss": 0.1475, "step": 1445 }, { "epoch": 0.5392840539284054, "grad_norm": 2.296875, "learning_rate": 5.117817279867714e-06, "loss": 0.1459, "step": 1450 }, { "epoch": 0.5411436541143654, "grad_norm": 2.453125, "learning_rate": 5.097147581645308e-06, "loss": 0.1536, "step": 1455 }, { "epoch": 0.5430032543003255, "grad_norm": 2.0625, "learning_rate": 5.076477883422902e-06, "loss": 0.1467, "step": 1460 }, { "epoch": 0.5448628544862855, "grad_norm": 2.34375, "learning_rate": 5.055808185200497e-06, "loss": 0.1452, "step": 1465 }, { "epoch": 0.5467224546722455, "grad_norm": 2.25, "learning_rate": 5.0351384869780905e-06, "loss": 0.1467, "step": 1470 }, { "epoch": 0.5485820548582054, "grad_norm": 2.25, "learning_rate": 5.0144687887556846e-06, "loss": 0.1422, "step": 1475 }, { "epoch": 0.5504416550441655, "grad_norm": 2.046875, "learning_rate": 4.993799090533279e-06, "loss": 0.1348, "step": 1480 }, { "epoch": 0.5523012552301255, "grad_norm": 2.359375, "learning_rate": 4.973129392310873e-06, "loss": 0.1468, "step": 1485 }, { "epoch": 0.5541608554160855, "grad_norm": 2.046875, "learning_rate": 4.952459694088467e-06, "loss": 0.1431, "step": 1490 }, { "epoch": 0.5560204556020456, "grad_norm": 2.234375, "learning_rate": 4.931789995866061e-06, "loss": 0.1513, "step": 1495 }, { "epoch": 0.5578800557880056, "grad_norm": 2.03125, "learning_rate": 4.911120297643655e-06, "loss": 0.1388, "step": 1500 }, { "epoch": 0.5597396559739656, "grad_norm": 2.078125, "learning_rate": 4.890450599421249e-06, "loss": 0.1392, "step": 1505 }, { "epoch": 0.5615992561599256, "grad_norm": 1.9609375, "learning_rate": 4.869780901198843e-06, "loss": 0.1417, "step": 1510 }, { "epoch": 0.5634588563458857, "grad_norm": 2.234375, "learning_rate": 4.849111202976437e-06, "loss": 0.1361, "step": 1515 }, { "epoch": 0.5653184565318456, "grad_norm": 2.25, "learning_rate": 4.828441504754031e-06, "loss": 0.1434, "step": 1520 }, { "epoch": 0.5671780567178056, "grad_norm": 2.390625, "learning_rate": 4.8077718065316245e-06, "loss": 0.1496, "step": 1525 }, { "epoch": 0.5690376569037657, "grad_norm": 2.046875, "learning_rate": 4.787102108309219e-06, "loss": 0.1463, "step": 1530 }, { "epoch": 0.5708972570897257, "grad_norm": 2.046875, "learning_rate": 4.7664324100868135e-06, "loss": 0.1323, "step": 1535 }, { "epoch": 0.5727568572756857, "grad_norm": 1.9609375, "learning_rate": 4.745762711864408e-06, "loss": 0.1437, "step": 1540 }, { "epoch": 0.5746164574616457, "grad_norm": 2.25, "learning_rate": 4.725093013642002e-06, "loss": 0.1543, "step": 1545 }, { "epoch": 0.5764760576476058, "grad_norm": 2.171875, "learning_rate": 4.704423315419595e-06, "loss": 0.1409, "step": 1550 }, { "epoch": 0.5783356578335658, "grad_norm": 1.9375, "learning_rate": 4.683753617197189e-06, "loss": 0.1439, "step": 1555 }, { "epoch": 0.5801952580195258, "grad_norm": 2.015625, "learning_rate": 4.663083918974783e-06, "loss": 0.1395, "step": 1560 }, { "epoch": 0.5820548582054859, "grad_norm": 2.0625, "learning_rate": 4.642414220752377e-06, "loss": 0.1416, "step": 1565 }, { "epoch": 0.5839144583914458, "grad_norm": 2.734375, "learning_rate": 4.621744522529971e-06, "loss": 0.1385, "step": 1570 }, { "epoch": 0.5857740585774058, "grad_norm": 2.1875, "learning_rate": 4.601074824307565e-06, "loss": 0.1382, "step": 1575 }, { "epoch": 0.5876336587633659, "grad_norm": 2.109375, "learning_rate": 4.580405126085159e-06, "loss": 0.1378, "step": 1580 }, { "epoch": 0.5894932589493259, "grad_norm": 2.078125, "learning_rate": 4.5597354278627535e-06, "loss": 0.1369, "step": 1585 }, { "epoch": 0.5913528591352859, "grad_norm": 1.9921875, "learning_rate": 4.5390657296403476e-06, "loss": 0.145, "step": 1590 }, { "epoch": 0.5932124593212459, "grad_norm": 2.078125, "learning_rate": 4.518396031417942e-06, "loss": 0.1376, "step": 1595 }, { "epoch": 0.595072059507206, "grad_norm": 2.46875, "learning_rate": 4.497726333195536e-06, "loss": 0.1398, "step": 1600 }, { "epoch": 0.596931659693166, "grad_norm": 2.0, "learning_rate": 4.47705663497313e-06, "loss": 0.144, "step": 1605 }, { "epoch": 0.598791259879126, "grad_norm": 2.078125, "learning_rate": 4.456386936750724e-06, "loss": 0.1344, "step": 1610 }, { "epoch": 0.600278940027894, "eval_loss": 0.16357110440731049, "eval_runtime": 33.5259, "eval_samples_per_second": 306.778, "eval_steps_per_second": 9.605, "step": 1614 }, { "epoch": 0.6006508600650861, "grad_norm": 2.140625, "learning_rate": 4.435717238528318e-06, "loss": 0.1411, "step": 1615 }, { "epoch": 0.602510460251046, "grad_norm": 2.15625, "learning_rate": 4.415047540305912e-06, "loss": 0.1454, "step": 1620 }, { "epoch": 0.604370060437006, "grad_norm": 1.984375, "learning_rate": 4.394377842083506e-06, "loss": 0.1457, "step": 1625 }, { "epoch": 0.606229660622966, "grad_norm": 2.78125, "learning_rate": 4.3737081438611e-06, "loss": 0.1404, "step": 1630 }, { "epoch": 0.6080892608089261, "grad_norm": 1.9375, "learning_rate": 4.353038445638694e-06, "loss": 0.1401, "step": 1635 }, { "epoch": 0.6099488609948861, "grad_norm": 1.9921875, "learning_rate": 4.3323687474162875e-06, "loss": 0.1373, "step": 1640 }, { "epoch": 0.6118084611808461, "grad_norm": 2.078125, "learning_rate": 4.3116990491938824e-06, "loss": 0.1409, "step": 1645 }, { "epoch": 0.6136680613668062, "grad_norm": 1.984375, "learning_rate": 4.2910293509714765e-06, "loss": 0.1416, "step": 1650 }, { "epoch": 0.6155276615527662, "grad_norm": 2.140625, "learning_rate": 4.270359652749071e-06, "loss": 0.1458, "step": 1655 }, { "epoch": 0.6173872617387262, "grad_norm": 2.25, "learning_rate": 4.249689954526664e-06, "loss": 0.1459, "step": 1660 }, { "epoch": 0.6192468619246861, "grad_norm": 1.7734375, "learning_rate": 4.229020256304258e-06, "loss": 0.1383, "step": 1665 }, { "epoch": 0.6211064621106462, "grad_norm": 2.109375, "learning_rate": 4.208350558081852e-06, "loss": 0.1451, "step": 1670 }, { "epoch": 0.6229660622966062, "grad_norm": 2.03125, "learning_rate": 4.187680859859447e-06, "loss": 0.1385, "step": 1675 }, { "epoch": 0.6248256624825662, "grad_norm": 2.3125, "learning_rate": 4.167011161637041e-06, "loss": 0.141, "step": 1680 }, { "epoch": 0.6266852626685263, "grad_norm": 2.265625, "learning_rate": 4.146341463414634e-06, "loss": 0.1428, "step": 1685 }, { "epoch": 0.6285448628544863, "grad_norm": 2.03125, "learning_rate": 4.125671765192228e-06, "loss": 0.1425, "step": 1690 }, { "epoch": 0.6304044630404463, "grad_norm": 1.890625, "learning_rate": 4.105002066969822e-06, "loss": 0.1369, "step": 1695 }, { "epoch": 0.6322640632264063, "grad_norm": 2.0625, "learning_rate": 4.0843323687474165e-06, "loss": 0.1351, "step": 1700 }, { "epoch": 0.6341236634123664, "grad_norm": 2.09375, "learning_rate": 4.0636626705250106e-06, "loss": 0.1393, "step": 1705 }, { "epoch": 0.6359832635983264, "grad_norm": 2.203125, "learning_rate": 4.042992972302605e-06, "loss": 0.137, "step": 1710 }, { "epoch": 0.6378428637842863, "grad_norm": 2.265625, "learning_rate": 4.022323274080199e-06, "loss": 0.1409, "step": 1715 }, { "epoch": 0.6397024639702464, "grad_norm": 1.9453125, "learning_rate": 4.001653575857793e-06, "loss": 0.1459, "step": 1720 }, { "epoch": 0.6415620641562064, "grad_norm": 1.8359375, "learning_rate": 3.980983877635387e-06, "loss": 0.1385, "step": 1725 }, { "epoch": 0.6434216643421664, "grad_norm": 2.015625, "learning_rate": 3.960314179412981e-06, "loss": 0.153, "step": 1730 }, { "epoch": 0.6452812645281265, "grad_norm": 2.1875, "learning_rate": 3.939644481190575e-06, "loss": 0.1397, "step": 1735 }, { "epoch": 0.6471408647140865, "grad_norm": 2.078125, "learning_rate": 3.918974782968169e-06, "loss": 0.1405, "step": 1740 }, { "epoch": 0.6490004649000465, "grad_norm": 1.8984375, "learning_rate": 3.898305084745763e-06, "loss": 0.1391, "step": 1745 }, { "epoch": 0.6508600650860065, "grad_norm": 2.21875, "learning_rate": 3.8776353865233564e-06, "loss": 0.1409, "step": 1750 }, { "epoch": 0.6527196652719666, "grad_norm": 2.21875, "learning_rate": 3.856965688300951e-06, "loss": 0.1402, "step": 1755 }, { "epoch": 0.6545792654579266, "grad_norm": 2.21875, "learning_rate": 3.8362959900785454e-06, "loss": 0.1502, "step": 1760 }, { "epoch": 0.6564388656438865, "grad_norm": 2.109375, "learning_rate": 3.8156262918561395e-06, "loss": 0.1408, "step": 1765 }, { "epoch": 0.6582984658298466, "grad_norm": 2.25, "learning_rate": 3.7949565936337336e-06, "loss": 0.1445, "step": 1770 }, { "epoch": 0.6601580660158066, "grad_norm": 1.890625, "learning_rate": 3.7742868954113273e-06, "loss": 0.1417, "step": 1775 }, { "epoch": 0.6620176662017666, "grad_norm": 2.09375, "learning_rate": 3.7536171971889213e-06, "loss": 0.1402, "step": 1780 }, { "epoch": 0.6638772663877266, "grad_norm": 2.03125, "learning_rate": 3.7329474989665154e-06, "loss": 0.1428, "step": 1785 }, { "epoch": 0.6657368665736867, "grad_norm": 2.015625, "learning_rate": 3.7122778007441095e-06, "loss": 0.1408, "step": 1790 }, { "epoch": 0.6675964667596467, "grad_norm": 1.921875, "learning_rate": 3.6916081025217036e-06, "loss": 0.1428, "step": 1795 }, { "epoch": 0.6694560669456067, "grad_norm": 2.515625, "learning_rate": 3.6709384042992972e-06, "loss": 0.1463, "step": 1800 }, { "epoch": 0.6713156671315668, "grad_norm": 2.109375, "learning_rate": 3.6502687060768917e-06, "loss": 0.1365, "step": 1805 }, { "epoch": 0.6731752673175267, "grad_norm": 2.015625, "learning_rate": 3.629599007854486e-06, "loss": 0.1362, "step": 1810 }, { "epoch": 0.6750348675034867, "grad_norm": 2.3125, "learning_rate": 3.60892930963208e-06, "loss": 0.1378, "step": 1815 }, { "epoch": 0.6768944676894467, "grad_norm": 1.96875, "learning_rate": 3.5882596114096736e-06, "loss": 0.1323, "step": 1820 }, { "epoch": 0.6787540678754068, "grad_norm": 2.015625, "learning_rate": 3.5675899131872676e-06, "loss": 0.1406, "step": 1825 }, { "epoch": 0.6806136680613668, "grad_norm": 2.15625, "learning_rate": 3.5469202149648617e-06, "loss": 0.1416, "step": 1830 }, { "epoch": 0.6824732682473268, "grad_norm": 1.953125, "learning_rate": 3.526250516742456e-06, "loss": 0.1347, "step": 1835 }, { "epoch": 0.6843328684332869, "grad_norm": 2.21875, "learning_rate": 3.5055808185200503e-06, "loss": 0.148, "step": 1840 }, { "epoch": 0.6861924686192469, "grad_norm": 2.015625, "learning_rate": 3.484911120297644e-06, "loss": 0.1443, "step": 1845 }, { "epoch": 0.6880520688052069, "grad_norm": 1.953125, "learning_rate": 3.464241422075238e-06, "loss": 0.138, "step": 1850 }, { "epoch": 0.6899116689911668, "grad_norm": 2.015625, "learning_rate": 3.443571723852832e-06, "loss": 0.1466, "step": 1855 }, { "epoch": 0.691771269177127, "grad_norm": 2.34375, "learning_rate": 3.422902025630426e-06, "loss": 0.1403, "step": 1860 }, { "epoch": 0.6936308693630869, "grad_norm": 2.03125, "learning_rate": 3.40223232740802e-06, "loss": 0.1361, "step": 1865 }, { "epoch": 0.6954904695490469, "grad_norm": 2.265625, "learning_rate": 3.381562629185614e-06, "loss": 0.1393, "step": 1870 }, { "epoch": 0.697350069735007, "grad_norm": 2.09375, "learning_rate": 3.360892930963208e-06, "loss": 0.1352, "step": 1875 }, { "epoch": 0.699209669920967, "grad_norm": 2.34375, "learning_rate": 3.3402232327408025e-06, "loss": 0.1387, "step": 1880 }, { "epoch": 0.700325430032543, "eval_loss": 0.16304655373096466, "eval_runtime": 33.5474, "eval_samples_per_second": 306.581, "eval_steps_per_second": 9.598, "step": 1883 }, { "epoch": 0.701069270106927, "grad_norm": 2.015625, "learning_rate": 3.3195535345183966e-06, "loss": 0.1373, "step": 1885 }, { "epoch": 0.702928870292887, "grad_norm": 2.046875, "learning_rate": 3.2988838362959903e-06, "loss": 0.1392, "step": 1890 }, { "epoch": 0.7047884704788471, "grad_norm": 1.9375, "learning_rate": 3.2782141380735843e-06, "loss": 0.1453, "step": 1895 }, { "epoch": 0.7066480706648071, "grad_norm": 1.8359375, "learning_rate": 3.2575444398511784e-06, "loss": 0.141, "step": 1900 }, { "epoch": 0.708507670850767, "grad_norm": 2.015625, "learning_rate": 3.2368747416287725e-06, "loss": 0.1346, "step": 1905 }, { "epoch": 0.7103672710367271, "grad_norm": 2.046875, "learning_rate": 3.216205043406366e-06, "loss": 0.1375, "step": 1910 }, { "epoch": 0.7122268712226871, "grad_norm": 1.9609375, "learning_rate": 3.1955353451839607e-06, "loss": 0.1433, "step": 1915 }, { "epoch": 0.7140864714086471, "grad_norm": 2.375, "learning_rate": 3.1748656469615547e-06, "loss": 0.1487, "step": 1920 }, { "epoch": 0.7159460715946072, "grad_norm": 2.25, "learning_rate": 3.154195948739149e-06, "loss": 0.1439, "step": 1925 }, { "epoch": 0.7178056717805672, "grad_norm": 2.140625, "learning_rate": 3.133526250516743e-06, "loss": 0.1362, "step": 1930 }, { "epoch": 0.7196652719665272, "grad_norm": 2.15625, "learning_rate": 3.1128565522943366e-06, "loss": 0.1373, "step": 1935 }, { "epoch": 0.7215248721524872, "grad_norm": 2.453125, "learning_rate": 3.0921868540719306e-06, "loss": 0.1476, "step": 1940 }, { "epoch": 0.7233844723384473, "grad_norm": 2.15625, "learning_rate": 3.0715171558495247e-06, "loss": 0.1363, "step": 1945 }, { "epoch": 0.7252440725244073, "grad_norm": 2.265625, "learning_rate": 3.0508474576271192e-06, "loss": 0.1338, "step": 1950 }, { "epoch": 0.7271036727103672, "grad_norm": 2.140625, "learning_rate": 3.030177759404713e-06, "loss": 0.1338, "step": 1955 }, { "epoch": 0.7289632728963273, "grad_norm": 2.25, "learning_rate": 3.009508061182307e-06, "loss": 0.1391, "step": 1960 }, { "epoch": 0.7308228730822873, "grad_norm": 2.140625, "learning_rate": 2.988838362959901e-06, "loss": 0.146, "step": 1965 }, { "epoch": 0.7326824732682473, "grad_norm": 2.140625, "learning_rate": 2.968168664737495e-06, "loss": 0.138, "step": 1970 }, { "epoch": 0.7345420734542073, "grad_norm": 2.078125, "learning_rate": 2.947498966515089e-06, "loss": 0.1395, "step": 1975 }, { "epoch": 0.7364016736401674, "grad_norm": 2.125, "learning_rate": 2.926829268292683e-06, "loss": 0.1409, "step": 1980 }, { "epoch": 0.7382612738261274, "grad_norm": 2.265625, "learning_rate": 2.906159570070277e-06, "loss": 0.1404, "step": 1985 }, { "epoch": 0.7401208740120874, "grad_norm": 2.046875, "learning_rate": 2.8854898718478715e-06, "loss": 0.1371, "step": 1990 }, { "epoch": 0.7419804741980475, "grad_norm": 2.234375, "learning_rate": 2.8648201736254655e-06, "loss": 0.1385, "step": 1995 }, { "epoch": 0.7438400743840075, "grad_norm": 2.265625, "learning_rate": 2.844150475403059e-06, "loss": 0.1393, "step": 2000 }, { "epoch": 0.7456996745699674, "grad_norm": 1.8515625, "learning_rate": 2.8234807771806533e-06, "loss": 0.1329, "step": 2005 }, { "epoch": 0.7475592747559274, "grad_norm": 1.9765625, "learning_rate": 2.8028110789582473e-06, "loss": 0.1418, "step": 2010 }, { "epoch": 0.7494188749418875, "grad_norm": 2.046875, "learning_rate": 2.7821413807358414e-06, "loss": 0.1366, "step": 2015 }, { "epoch": 0.7512784751278475, "grad_norm": 2.140625, "learning_rate": 2.761471682513436e-06, "loss": 0.1378, "step": 2020 }, { "epoch": 0.7531380753138075, "grad_norm": 2.078125, "learning_rate": 2.7408019842910296e-06, "loss": 0.1392, "step": 2025 }, { "epoch": 0.7549976754997676, "grad_norm": 2.34375, "learning_rate": 2.7201322860686237e-06, "loss": 0.141, "step": 2030 }, { "epoch": 0.7568572756857276, "grad_norm": 2.46875, "learning_rate": 2.6994625878462178e-06, "loss": 0.1391, "step": 2035 }, { "epoch": 0.7587168758716876, "grad_norm": 2.140625, "learning_rate": 2.678792889623812e-06, "loss": 0.1408, "step": 2040 }, { "epoch": 0.7605764760576476, "grad_norm": 2.140625, "learning_rate": 2.6581231914014055e-06, "loss": 0.1382, "step": 2045 }, { "epoch": 0.7624360762436077, "grad_norm": 2.171875, "learning_rate": 2.6374534931789996e-06, "loss": 0.1424, "step": 2050 }, { "epoch": 0.7642956764295676, "grad_norm": 2.0, "learning_rate": 2.6167837949565936e-06, "loss": 0.1297, "step": 2055 }, { "epoch": 0.7661552766155276, "grad_norm": 1.921875, "learning_rate": 2.596114096734188e-06, "loss": 0.1398, "step": 2060 }, { "epoch": 0.7680148768014877, "grad_norm": 2.5, "learning_rate": 2.5754443985117822e-06, "loss": 0.1418, "step": 2065 }, { "epoch": 0.7698744769874477, "grad_norm": 2.15625, "learning_rate": 2.554774700289376e-06, "loss": 0.137, "step": 2070 }, { "epoch": 0.7717340771734077, "grad_norm": 1.9296875, "learning_rate": 2.53410500206697e-06, "loss": 0.1409, "step": 2075 }, { "epoch": 0.7735936773593677, "grad_norm": 2.015625, "learning_rate": 2.513435303844564e-06, "loss": 0.1345, "step": 2080 }, { "epoch": 0.7754532775453278, "grad_norm": 1.8828125, "learning_rate": 2.492765605622158e-06, "loss": 0.1445, "step": 2085 }, { "epoch": 0.7773128777312878, "grad_norm": 2.34375, "learning_rate": 2.4720959073997522e-06, "loss": 0.1414, "step": 2090 }, { "epoch": 0.7791724779172478, "grad_norm": 2.015625, "learning_rate": 2.4514262091773463e-06, "loss": 0.1373, "step": 2095 }, { "epoch": 0.7810320781032078, "grad_norm": 2.265625, "learning_rate": 2.4307565109549404e-06, "loss": 0.1367, "step": 2100 }, { "epoch": 0.7828916782891678, "grad_norm": 2.109375, "learning_rate": 2.4100868127325345e-06, "loss": 0.1332, "step": 2105 }, { "epoch": 0.7847512784751278, "grad_norm": 2.125, "learning_rate": 2.389417114510128e-06, "loss": 0.1368, "step": 2110 }, { "epoch": 0.7866108786610879, "grad_norm": 1.984375, "learning_rate": 2.3687474162877226e-06, "loss": 0.1439, "step": 2115 }, { "epoch": 0.7884704788470479, "grad_norm": 2.21875, "learning_rate": 2.3480777180653163e-06, "loss": 0.1423, "step": 2120 }, { "epoch": 0.7903300790330079, "grad_norm": 2.09375, "learning_rate": 2.3274080198429104e-06, "loss": 0.1384, "step": 2125 }, { "epoch": 0.7921896792189679, "grad_norm": 2.34375, "learning_rate": 2.3067383216205044e-06, "loss": 0.138, "step": 2130 }, { "epoch": 0.794049279404928, "grad_norm": 2.03125, "learning_rate": 2.2860686233980985e-06, "loss": 0.1368, "step": 2135 }, { "epoch": 0.795908879590888, "grad_norm": 2.0625, "learning_rate": 2.2653989251756926e-06, "loss": 0.1315, "step": 2140 }, { "epoch": 0.797768479776848, "grad_norm": 2.34375, "learning_rate": 2.2447292269532867e-06, "loss": 0.1433, "step": 2145 }, { "epoch": 0.799628079962808, "grad_norm": 2.390625, "learning_rate": 2.2240595287308808e-06, "loss": 0.1403, "step": 2150 }, { "epoch": 0.800371920037192, "eval_loss": 0.16228820383548737, "eval_runtime": 33.5429, "eval_samples_per_second": 306.622, "eval_steps_per_second": 9.6, "step": 2152 }, { "epoch": 0.801487680148768, "grad_norm": 2.171875, "learning_rate": 2.203389830508475e-06, "loss": 0.138, "step": 2155 }, { "epoch": 0.803347280334728, "grad_norm": 2.234375, "learning_rate": 2.182720132286069e-06, "loss": 0.1403, "step": 2160 }, { "epoch": 0.805206880520688, "grad_norm": 1.984375, "learning_rate": 2.1620504340636626e-06, "loss": 0.1398, "step": 2165 }, { "epoch": 0.8070664807066481, "grad_norm": 2.078125, "learning_rate": 2.141380735841257e-06, "loss": 0.1459, "step": 2170 }, { "epoch": 0.8089260808926081, "grad_norm": 2.171875, "learning_rate": 2.1207110376188507e-06, "loss": 0.1374, "step": 2175 }, { "epoch": 0.8107856810785681, "grad_norm": 1.9453125, "learning_rate": 2.100041339396445e-06, "loss": 0.138, "step": 2180 }, { "epoch": 0.8126452812645282, "grad_norm": 2.265625, "learning_rate": 2.079371641174039e-06, "loss": 0.1389, "step": 2185 }, { "epoch": 0.8145048814504882, "grad_norm": 1.890625, "learning_rate": 2.058701942951633e-06, "loss": 0.1393, "step": 2190 }, { "epoch": 0.8163644816364481, "grad_norm": 2.203125, "learning_rate": 2.038032244729227e-06, "loss": 0.1341, "step": 2195 }, { "epoch": 0.8182240818224081, "grad_norm": 2.140625, "learning_rate": 2.017362546506821e-06, "loss": 0.1413, "step": 2200 }, { "epoch": 0.8200836820083682, "grad_norm": 2.21875, "learning_rate": 1.9966928482844152e-06, "loss": 0.1401, "step": 2205 }, { "epoch": 0.8219432821943282, "grad_norm": 2.296875, "learning_rate": 1.9760231500620093e-06, "loss": 0.1442, "step": 2210 }, { "epoch": 0.8238028823802882, "grad_norm": 2.140625, "learning_rate": 1.9553534518396034e-06, "loss": 0.144, "step": 2215 }, { "epoch": 0.8256624825662483, "grad_norm": 2.125, "learning_rate": 1.934683753617197e-06, "loss": 0.1353, "step": 2220 }, { "epoch": 0.8275220827522083, "grad_norm": 2.0, "learning_rate": 1.9140140553947915e-06, "loss": 0.1394, "step": 2225 }, { "epoch": 0.8293816829381683, "grad_norm": 1.9765625, "learning_rate": 1.8933443571723856e-06, "loss": 0.1385, "step": 2230 }, { "epoch": 0.8312412831241283, "grad_norm": 2.28125, "learning_rate": 1.8726746589499795e-06, "loss": 0.1451, "step": 2235 }, { "epoch": 0.8331008833100884, "grad_norm": 2.015625, "learning_rate": 1.8520049607275736e-06, "loss": 0.1366, "step": 2240 }, { "epoch": 0.8349604834960483, "grad_norm": 2.203125, "learning_rate": 1.8313352625051674e-06, "loss": 0.1348, "step": 2245 }, { "epoch": 0.8368200836820083, "grad_norm": 2.09375, "learning_rate": 1.8106655642827617e-06, "loss": 0.1365, "step": 2250 }, { "epoch": 0.8386796838679684, "grad_norm": 2.09375, "learning_rate": 1.7899958660603556e-06, "loss": 0.1366, "step": 2255 }, { "epoch": 0.8405392840539284, "grad_norm": 2.09375, "learning_rate": 1.7693261678379497e-06, "loss": 0.1444, "step": 2260 }, { "epoch": 0.8423988842398884, "grad_norm": 2.375, "learning_rate": 1.7486564696155435e-06, "loss": 0.144, "step": 2265 }, { "epoch": 0.8442584844258484, "grad_norm": 2.09375, "learning_rate": 1.7279867713931378e-06, "loss": 0.1403, "step": 2270 }, { "epoch": 0.8461180846118085, "grad_norm": 2.0, "learning_rate": 1.707317073170732e-06, "loss": 0.1427, "step": 2275 }, { "epoch": 0.8479776847977685, "grad_norm": 1.9921875, "learning_rate": 1.6866473749483258e-06, "loss": 0.1367, "step": 2280 }, { "epoch": 0.8498372849837285, "grad_norm": 2.140625, "learning_rate": 1.66597767672592e-06, "loss": 0.1474, "step": 2285 }, { "epoch": 0.8516968851696886, "grad_norm": 2.125, "learning_rate": 1.645307978503514e-06, "loss": 0.1476, "step": 2290 }, { "epoch": 0.8535564853556485, "grad_norm": 2.140625, "learning_rate": 1.624638280281108e-06, "loss": 0.1423, "step": 2295 }, { "epoch": 0.8554160855416085, "grad_norm": 2.296875, "learning_rate": 1.603968582058702e-06, "loss": 0.137, "step": 2300 }, { "epoch": 0.8572756857275686, "grad_norm": 2.046875, "learning_rate": 1.5832988838362962e-06, "loss": 0.1332, "step": 2305 }, { "epoch": 0.8591352859135286, "grad_norm": 2.5, "learning_rate": 1.56262918561389e-06, "loss": 0.141, "step": 2310 }, { "epoch": 0.8609948860994886, "grad_norm": 2.390625, "learning_rate": 1.5419594873914841e-06, "loss": 0.1375, "step": 2315 }, { "epoch": 0.8628544862854486, "grad_norm": 2.046875, "learning_rate": 1.5212897891690784e-06, "loss": 0.1413, "step": 2320 }, { "epoch": 0.8647140864714087, "grad_norm": 1.8828125, "learning_rate": 1.5006200909466723e-06, "loss": 0.1455, "step": 2325 }, { "epoch": 0.8665736866573687, "grad_norm": 2.234375, "learning_rate": 1.4799503927242664e-06, "loss": 0.1442, "step": 2330 }, { "epoch": 0.8684332868433287, "grad_norm": 2.109375, "learning_rate": 1.4592806945018602e-06, "loss": 0.1326, "step": 2335 }, { "epoch": 0.8702928870292888, "grad_norm": 2.265625, "learning_rate": 1.4386109962794545e-06, "loss": 0.1418, "step": 2340 }, { "epoch": 0.8721524872152487, "grad_norm": 2.125, "learning_rate": 1.4179412980570484e-06, "loss": 0.1398, "step": 2345 }, { "epoch": 0.8740120874012087, "grad_norm": 2.0, "learning_rate": 1.3972715998346425e-06, "loss": 0.139, "step": 2350 }, { "epoch": 0.8758716875871687, "grad_norm": 2.671875, "learning_rate": 1.3766019016122364e-06, "loss": 0.1506, "step": 2355 }, { "epoch": 0.8777312877731288, "grad_norm": 2.09375, "learning_rate": 1.3559322033898307e-06, "loss": 0.1396, "step": 2360 }, { "epoch": 0.8795908879590888, "grad_norm": 1.953125, "learning_rate": 1.3352625051674247e-06, "loss": 0.1293, "step": 2365 }, { "epoch": 0.8814504881450488, "grad_norm": 2.03125, "learning_rate": 1.3145928069450186e-06, "loss": 0.1399, "step": 2370 }, { "epoch": 0.8833100883310089, "grad_norm": 2.046875, "learning_rate": 1.2939231087226129e-06, "loss": 0.1365, "step": 2375 }, { "epoch": 0.8851696885169689, "grad_norm": 2.0625, "learning_rate": 1.2732534105002068e-06, "loss": 0.1397, "step": 2380 }, { "epoch": 0.8870292887029289, "grad_norm": 2.015625, "learning_rate": 1.2525837122778008e-06, "loss": 0.1437, "step": 2385 }, { "epoch": 0.8888888888888888, "grad_norm": 1.984375, "learning_rate": 1.231914014055395e-06, "loss": 0.1418, "step": 2390 }, { "epoch": 0.8907484890748489, "grad_norm": 2.03125, "learning_rate": 1.211244315832989e-06, "loss": 0.1384, "step": 2395 }, { "epoch": 0.8926080892608089, "grad_norm": 2.359375, "learning_rate": 1.190574617610583e-06, "loss": 0.1445, "step": 2400 }, { "epoch": 0.8944676894467689, "grad_norm": 2.140625, "learning_rate": 1.169904919388177e-06, "loss": 0.1358, "step": 2405 }, { "epoch": 0.896327289632729, "grad_norm": 2.171875, "learning_rate": 1.149235221165771e-06, "loss": 0.1532, "step": 2410 }, { "epoch": 0.898186889818689, "grad_norm": 1.96875, "learning_rate": 1.1285655229433651e-06, "loss": 0.1416, "step": 2415 }, { "epoch": 0.900046490004649, "grad_norm": 1.9140625, "learning_rate": 1.1078958247209592e-06, "loss": 0.1383, "step": 2420 }, { "epoch": 0.900418410041841, "eval_loss": 0.16189107298851013, "eval_runtime": 33.532, "eval_samples_per_second": 306.722, "eval_steps_per_second": 9.603, "step": 2421 }, { "epoch": 0.901906090190609, "grad_norm": 2.234375, "learning_rate": 1.0872261264985533e-06, "loss": 0.1319, "step": 2425 }, { "epoch": 0.9037656903765691, "grad_norm": 2.28125, "learning_rate": 1.0665564282761474e-06, "loss": 0.1448, "step": 2430 }, { "epoch": 0.905625290562529, "grad_norm": 2.265625, "learning_rate": 1.0458867300537414e-06, "loss": 0.1479, "step": 2435 }, { "epoch": 0.907484890748489, "grad_norm": 2.15625, "learning_rate": 1.0252170318313353e-06, "loss": 0.1396, "step": 2440 }, { "epoch": 0.9093444909344491, "grad_norm": 2.203125, "learning_rate": 1.0045473336089294e-06, "loss": 0.1494, "step": 2445 }, { "epoch": 0.9112040911204091, "grad_norm": 2.0625, "learning_rate": 9.838776353865235e-07, "loss": 0.1362, "step": 2450 }, { "epoch": 0.9130636913063691, "grad_norm": 2.0625, "learning_rate": 9.632079371641175e-07, "loss": 0.1371, "step": 2455 }, { "epoch": 0.9149232914923291, "grad_norm": 2.1875, "learning_rate": 9.425382389417115e-07, "loss": 0.139, "step": 2460 }, { "epoch": 0.9167828916782892, "grad_norm": 2.03125, "learning_rate": 9.218685407193055e-07, "loss": 0.1339, "step": 2465 }, { "epoch": 0.9186424918642492, "grad_norm": 2.21875, "learning_rate": 9.011988424968997e-07, "loss": 0.1427, "step": 2470 }, { "epoch": 0.9205020920502092, "grad_norm": 1.9140625, "learning_rate": 8.805291442744937e-07, "loss": 0.1302, "step": 2475 }, { "epoch": 0.9223616922361693, "grad_norm": 1.9296875, "learning_rate": 8.598594460520877e-07, "loss": 0.1385, "step": 2480 }, { "epoch": 0.9242212924221292, "grad_norm": 2.3125, "learning_rate": 8.391897478296818e-07, "loss": 0.1396, "step": 2485 }, { "epoch": 0.9260808926080892, "grad_norm": 2.34375, "learning_rate": 8.185200496072758e-07, "loss": 0.1447, "step": 2490 }, { "epoch": 0.9279404927940493, "grad_norm": 2.109375, "learning_rate": 7.978503513848699e-07, "loss": 0.1362, "step": 2495 }, { "epoch": 0.9298000929800093, "grad_norm": 2.03125, "learning_rate": 7.771806531624638e-07, "loss": 0.1418, "step": 2500 }, { "epoch": 0.9316596931659693, "grad_norm": 1.96875, "learning_rate": 7.565109549400579e-07, "loss": 0.1372, "step": 2505 }, { "epoch": 0.9335192933519293, "grad_norm": 1.8984375, "learning_rate": 7.358412567176519e-07, "loss": 0.1421, "step": 2510 }, { "epoch": 0.9353788935378894, "grad_norm": 2.171875, "learning_rate": 7.151715584952461e-07, "loss": 0.1426, "step": 2515 }, { "epoch": 0.9372384937238494, "grad_norm": 2.125, "learning_rate": 6.945018602728401e-07, "loss": 0.1361, "step": 2520 }, { "epoch": 0.9390980939098094, "grad_norm": 2.140625, "learning_rate": 6.738321620504341e-07, "loss": 0.1367, "step": 2525 }, { "epoch": 0.9409576940957695, "grad_norm": 2.015625, "learning_rate": 6.531624638280282e-07, "loss": 0.1408, "step": 2530 }, { "epoch": 0.9428172942817294, "grad_norm": 2.265625, "learning_rate": 6.324927656056222e-07, "loss": 0.141, "step": 2535 }, { "epoch": 0.9446768944676894, "grad_norm": 2.359375, "learning_rate": 6.118230673832163e-07, "loss": 0.1346, "step": 2540 }, { "epoch": 0.9465364946536494, "grad_norm": 2.09375, "learning_rate": 5.911533691608104e-07, "loss": 0.14, "step": 2545 }, { "epoch": 0.9483960948396095, "grad_norm": 1.8984375, "learning_rate": 5.704836709384043e-07, "loss": 0.1347, "step": 2550 }, { "epoch": 0.9502556950255695, "grad_norm": 1.96875, "learning_rate": 5.498139727159984e-07, "loss": 0.1354, "step": 2555 }, { "epoch": 0.9521152952115295, "grad_norm": 2.203125, "learning_rate": 5.291442744935924e-07, "loss": 0.1394, "step": 2560 }, { "epoch": 0.9539748953974896, "grad_norm": 1.9453125, "learning_rate": 5.084745762711865e-07, "loss": 0.1392, "step": 2565 }, { "epoch": 0.9558344955834496, "grad_norm": 1.9140625, "learning_rate": 4.878048780487805e-07, "loss": 0.1394, "step": 2570 }, { "epoch": 0.9576940957694096, "grad_norm": 2.015625, "learning_rate": 4.671351798263746e-07, "loss": 0.1401, "step": 2575 }, { "epoch": 0.9595536959553695, "grad_norm": 1.8984375, "learning_rate": 4.464654816039686e-07, "loss": 0.1324, "step": 2580 }, { "epoch": 0.9614132961413296, "grad_norm": 2.0, "learning_rate": 4.2579578338156263e-07, "loss": 0.1408, "step": 2585 }, { "epoch": 0.9632728963272896, "grad_norm": 2.078125, "learning_rate": 4.051260851591567e-07, "loss": 0.1336, "step": 2590 }, { "epoch": 0.9651324965132496, "grad_norm": 2.234375, "learning_rate": 3.8445638693675074e-07, "loss": 0.14, "step": 2595 }, { "epoch": 0.9669920966992097, "grad_norm": 2.296875, "learning_rate": 3.6378668871434477e-07, "loss": 0.1442, "step": 2600 }, { "epoch": 0.9688516968851697, "grad_norm": 2.140625, "learning_rate": 3.431169904919388e-07, "loss": 0.1428, "step": 2605 }, { "epoch": 0.9707112970711297, "grad_norm": 2.0625, "learning_rate": 3.2244729226953293e-07, "loss": 0.1405, "step": 2610 }, { "epoch": 0.9725708972570897, "grad_norm": 2.0625, "learning_rate": 3.0177759404712695e-07, "loss": 0.1325, "step": 2615 }, { "epoch": 0.9744304974430498, "grad_norm": 1.921875, "learning_rate": 2.81107895824721e-07, "loss": 0.139, "step": 2620 }, { "epoch": 0.9762900976290098, "grad_norm": 2.171875, "learning_rate": 2.6043819760231506e-07, "loss": 0.139, "step": 2625 }, { "epoch": 0.9781496978149697, "grad_norm": 2.03125, "learning_rate": 2.397684993799091e-07, "loss": 0.1393, "step": 2630 }, { "epoch": 0.9800092980009298, "grad_norm": 2.421875, "learning_rate": 2.1909880115750314e-07, "loss": 0.1484, "step": 2635 }, { "epoch": 0.9818688981868898, "grad_norm": 2.09375, "learning_rate": 1.9842910293509717e-07, "loss": 0.1426, "step": 2640 }, { "epoch": 0.9837284983728498, "grad_norm": 2.46875, "learning_rate": 1.777594047126912e-07, "loss": 0.1441, "step": 2645 }, { "epoch": 0.9855880985588099, "grad_norm": 2.140625, "learning_rate": 1.5708970649028525e-07, "loss": 0.1423, "step": 2650 }, { "epoch": 0.9874476987447699, "grad_norm": 2.03125, "learning_rate": 1.364200082678793e-07, "loss": 0.139, "step": 2655 }, { "epoch": 0.9893072989307299, "grad_norm": 2.0, "learning_rate": 1.1575031004547335e-07, "loss": 0.1355, "step": 2660 }, { "epoch": 0.9911668991166899, "grad_norm": 2.0625, "learning_rate": 9.50806118230674e-08, "loss": 0.1443, "step": 2665 }, { "epoch": 0.99302649930265, "grad_norm": 2.015625, "learning_rate": 7.441091360066144e-08, "loss": 0.1404, "step": 2670 }, { "epoch": 0.99488609948861, "grad_norm": 2.28125, "learning_rate": 5.3741215378255483e-08, "loss": 0.1501, "step": 2675 }, { "epoch": 0.9967456996745699, "grad_norm": 2.078125, "learning_rate": 3.3071517155849524e-08, "loss": 0.1454, "step": 2680 }, { "epoch": 0.99860529986053, "grad_norm": 1.890625, "learning_rate": 1.2401818933443573e-08, "loss": 0.1473, "step": 2685 }, { "epoch": 0.999721059972106, "step": 2688, "total_flos": 2.4410094732092375e+18, "train_loss": 0.15163021730924292, "train_runtime": 3273.2323, "train_samples_per_second": 52.569, "train_steps_per_second": 0.821 } ], "logging_steps": 5, "max_steps": 2688, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 269, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4410094732092375e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }