bhaviktheslider's picture
Model save
1c9851f verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.1244167962674965,
"eval_steps": 500,
"global_step": 250,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 736.4702529907227,
"epoch": 0.049766718506998445,
"grad_norm": 0.2507069706916809,
"kl": 0.0,
"learning_rate": 7.142857142857142e-08,
"loss": 0.0,
"reward": 0.04415178840281442,
"reward_std": 0.07034091584500857,
"rewards/equation_reward_func": 0.04415178793715313,
"rewards/format_reward_func": 0.0,
"step": 2
},
{
"completion_length": 723.1704015731812,
"epoch": 0.09953343701399689,
"grad_norm": 0.19884330034255981,
"kl": 2.0936699339557663e-05,
"learning_rate": 1.4285714285714285e-07,
"loss": 0.0,
"reward": 0.040647323767188936,
"reward_std": 0.0637543131451821,
"rewards/equation_reward_func": 0.04064732347615063,
"rewards/format_reward_func": 0.0,
"step": 4
},
{
"completion_length": 726.5163822174072,
"epoch": 0.14930015552099535,
"grad_norm": 0.21145105361938477,
"kl": 0.00019492170304147294,
"learning_rate": 2.1428571428571426e-07,
"loss": 0.0,
"reward": 0.04095238326408435,
"reward_std": 0.06441530691517983,
"rewards/equation_reward_func": 0.040952383089461364,
"rewards/format_reward_func": 0.0,
"step": 6
},
{
"completion_length": 737.8207015991211,
"epoch": 0.19906687402799378,
"grad_norm": 0.2020396590232849,
"kl": 0.020334478189397487,
"learning_rate": 2.857142857142857e-07,
"loss": 0.0,
"reward": 0.03635416827455629,
"reward_std": 0.05670656039728783,
"rewards/equation_reward_func": 0.03635416833276395,
"rewards/format_reward_func": 0.0,
"step": 8
},
{
"completion_length": 718.5461435317993,
"epoch": 0.24883359253499224,
"grad_norm": 27.479997634887695,
"kl": 9.990212610488015,
"learning_rate": 3.5714285714285716e-07,
"loss": 0.01,
"reward": 0.04543898967676796,
"reward_std": 0.07194261607946828,
"rewards/equation_reward_func": 0.04543899020063691,
"rewards/format_reward_func": 0.0,
"step": 10
},
{
"completion_length": 721.6964378356934,
"epoch": 0.2986003110419907,
"grad_norm": 0.19479116797447205,
"kl": 0.005109551766508957,
"learning_rate": 4.285714285714285e-07,
"loss": 0.0,
"reward": 0.04148065741173923,
"reward_std": 0.0677548690000549,
"rewards/equation_reward_func": 0.04148065741173923,
"rewards/format_reward_func": 0.0,
"step": 12
},
{
"completion_length": 725.9003086090088,
"epoch": 0.3483670295489891,
"grad_norm": 0.24158786237239838,
"kl": 0.502096803898894,
"learning_rate": 5e-07,
"loss": 0.0005,
"reward": 0.04821428797731642,
"reward_std": 0.07803500922454987,
"rewards/equation_reward_func": 0.048214288559393026,
"rewards/format_reward_func": 0.0,
"step": 14
},
{
"completion_length": 722.5781373977661,
"epoch": 0.39813374805598756,
"grad_norm": 0.290544331073761,
"kl": 0.23321715661586495,
"learning_rate": 4.999740409224932e-07,
"loss": 0.0002,
"reward": 0.05134672833082732,
"reward_std": 0.07946415679180063,
"rewards/equation_reward_func": 0.05134672856365796,
"rewards/format_reward_func": 0.0,
"step": 16
},
{
"completion_length": 723.7433156967163,
"epoch": 0.447900466562986,
"grad_norm": 4.716856479644775,
"kl": 0.8582769820350222,
"learning_rate": 4.998961690809627e-07,
"loss": 0.0009,
"reward": 0.050446430934243836,
"reward_std": 0.07721506280358881,
"rewards/equation_reward_func": 0.050446430992451496,
"rewards/format_reward_func": 0.0,
"step": 18
},
{
"completion_length": 729.6198072433472,
"epoch": 0.4976671850699845,
"grad_norm": 0.23771615326404572,
"kl": 0.3220994914881885,
"learning_rate": 4.997664006472578e-07,
"loss": 0.0003,
"reward": 0.045706847246037796,
"reward_std": 0.07288302374945488,
"rewards/equation_reward_func": 0.045706847246037796,
"rewards/format_reward_func": 0.0,
"step": 20
},
{
"completion_length": 712.8631057739258,
"epoch": 0.5474339035769828,
"grad_norm": 0.3750320374965668,
"kl": 0.27479040302569047,
"learning_rate": 4.995847625707292e-07,
"loss": 0.0003,
"reward": 0.05489583619055338,
"reward_std": 0.0833382241835352,
"rewards/equation_reward_func": 0.054895836423384026,
"rewards/format_reward_func": 0.0,
"step": 22
},
{
"completion_length": 731.4576015472412,
"epoch": 0.5972006220839814,
"grad_norm": 0.2089901864528656,
"kl": 0.12606932656490244,
"learning_rate": 4.993512925726318e-07,
"loss": 0.0001,
"reward": 0.0600520860607503,
"reward_std": 0.0899482914537657,
"rewards/equation_reward_func": 0.06005208553688135,
"rewards/format_reward_func": 0.0,
"step": 24
},
{
"completion_length": 706.6056671142578,
"epoch": 0.6469673405909798,
"grad_norm": 0.17620234191417694,
"kl": 0.1556346261058934,
"learning_rate": 4.990660391382923e-07,
"loss": 0.0002,
"reward": 0.05229166932986118,
"reward_std": 0.07350753628998064,
"rewards/equation_reward_func": 0.05229166956269182,
"rewards/format_reward_func": 0.0,
"step": 26
},
{
"completion_length": 727.1808137893677,
"epoch": 0.6967340590979783,
"grad_norm": 0.19479602575302124,
"kl": 0.12168441573157907,
"learning_rate": 4.987290615070384e-07,
"loss": 0.0001,
"reward": 0.053683038655435666,
"reward_std": 0.08042177859169897,
"rewards/equation_reward_func": 0.053683039121096954,
"rewards/format_reward_func": 0.0,
"step": 28
},
{
"completion_length": 720.8891496658325,
"epoch": 0.7465007776049767,
"grad_norm": 0.1857473999261856,
"kl": 0.1632600230514072,
"learning_rate": 4.983404296598978e-07,
"loss": 0.0002,
"reward": 0.05391369271092117,
"reward_std": 0.08413292915793136,
"rewards/equation_reward_func": 0.053913692419882864,
"rewards/format_reward_func": 0.0,
"step": 30
},
{
"completion_length": 720.4337940216064,
"epoch": 0.7962674961119751,
"grad_norm": 0.23092247545719147,
"kl": 0.15535293571883813,
"learning_rate": 4.979002243050646e-07,
"loss": 0.0002,
"reward": 0.05988095561042428,
"reward_std": 0.09168167802272364,
"rewards/equation_reward_func": 0.05988095601787791,
"rewards/format_reward_func": 0.0,
"step": 32
},
{
"completion_length": 718.6845378875732,
"epoch": 0.8460342146189735,
"grad_norm": 0.23407958447933197,
"kl": 0.25782948260894045,
"learning_rate": 4.974085368611381e-07,
"loss": 0.0003,
"reward": 0.06691220620996319,
"reward_std": 0.09768064138188493,
"rewards/equation_reward_func": 0.06691220562788658,
"rewards/format_reward_func": 0.0,
"step": 34
},
{
"completion_length": 718.0454006195068,
"epoch": 0.895800933125972,
"grad_norm": 0.3076172471046448,
"kl": 0.2404527408652939,
"learning_rate": 4.968654694381379e-07,
"loss": 0.0002,
"reward": 0.07349702704232186,
"reward_std": 0.10955648736853618,
"rewards/equation_reward_func": 0.07349702733336017,
"rewards/format_reward_func": 0.0,
"step": 36
},
{
"completion_length": 704.1488237380981,
"epoch": 0.9455676516329704,
"grad_norm": 0.2561110258102417,
"kl": 0.43795167771168053,
"learning_rate": 4.962711348162987e-07,
"loss": 0.0004,
"reward": 0.06241815793327987,
"reward_std": 0.09217380215704907,
"rewards/equation_reward_func": 0.0624181583407335,
"rewards/format_reward_func": 0.0,
"step": 38
},
{
"completion_length": 707.3921279907227,
"epoch": 0.995334370139969,
"grad_norm": 0.3400561511516571,
"kl": 0.5494289128109813,
"learning_rate": 4.956256564226487e-07,
"loss": 0.0005,
"reward": 0.0764508958091028,
"reward_std": 0.11110821401234716,
"rewards/equation_reward_func": 0.07645089708967134,
"rewards/format_reward_func": 0.0,
"step": 40
},
{
"completion_length": 715.0272221156529,
"epoch": 1.0497667185069985,
"grad_norm": 0.26081565022468567,
"kl": 0.4236157455614635,
"learning_rate": 4.949291683053768e-07,
"loss": 0.0005,
"reward": 0.07186394860701902,
"reward_std": 0.10362207902861494,
"rewards/equation_reward_func": 0.07186394876667432,
"rewards/format_reward_func": 0.0,
"step": 42
},
{
"completion_length": 714.9486722946167,
"epoch": 1.0995334370139969,
"grad_norm": 0.29378727078437805,
"kl": 0.3755593653768301,
"learning_rate": 4.941818151059955e-07,
"loss": 0.0004,
"reward": 0.0799404798890464,
"reward_std": 0.11443577655882109,
"rewards/equation_reward_func": 0.07994047965621576,
"rewards/format_reward_func": 0.0,
"step": 44
},
{
"completion_length": 727.829628944397,
"epoch": 1.1493001555209954,
"grad_norm": 2045.599365234375,
"kl": 128.7541933595203,
"learning_rate": 4.933837520293017e-07,
"loss": 0.1288,
"reward": 0.06808780113351531,
"reward_std": 0.09949399236938916,
"rewards/equation_reward_func": 0.06808780090068467,
"rewards/format_reward_func": 0.0,
"step": 46
},
{
"completion_length": 709.632453918457,
"epoch": 1.1990668740279937,
"grad_norm": 0.2698291838169098,
"kl": 0.4989726666826755,
"learning_rate": 4.925351448111454e-07,
"loss": 0.0005,
"reward": 0.09389881315291859,
"reward_std": 0.13221543522377033,
"rewards/equation_reward_func": 0.09389881303650327,
"rewards/format_reward_func": 0.0,
"step": 48
},
{
"completion_length": 719.485878944397,
"epoch": 1.2488335925349923,
"grad_norm": 0.36381521821022034,
"kl": 0.550471473718062,
"learning_rate": 4.91636169684011e-07,
"loss": 0.0006,
"reward": 0.08360863462439738,
"reward_std": 0.11854775344545487,
"rewards/equation_reward_func": 0.08360863421694376,
"rewards/format_reward_func": 0.0,
"step": 50
},
{
"completion_length": 725.6599855422974,
"epoch": 1.2986003110419908,
"grad_norm": 0.3374347686767578,
"kl": 0.663099701050669,
"learning_rate": 4.906870133404186e-07,
"loss": 0.0007,
"reward": 0.08503720644512214,
"reward_std": 0.12180299674218986,
"rewards/equation_reward_func": 0.0850372067943681,
"rewards/format_reward_func": 0.0,
"step": 52
},
{
"completion_length": 723.972484588623,
"epoch": 1.3483670295489891,
"grad_norm": 1.0345810651779175,
"kl": 0.9573397457133979,
"learning_rate": 4.896878728941531e-07,
"loss": 0.001,
"reward": 0.09177827867097221,
"reward_std": 0.12253864679951221,
"rewards/equation_reward_func": 0.09177827744861133,
"rewards/format_reward_func": 0.0,
"step": 54
},
{
"completion_length": 712.2269496917725,
"epoch": 1.3981337480559874,
"grad_norm": 0.27968963980674744,
"kl": 0.8391579431481659,
"learning_rate": 4.886389558393284e-07,
"loss": 0.0008,
"reward": 0.08570684934966266,
"reward_std": 0.1181660912843654,
"rewards/equation_reward_func": 0.08570684841834009,
"rewards/format_reward_func": 0.0,
"step": 56
},
{
"completion_length": 730.5327529907227,
"epoch": 1.447900466562986,
"grad_norm": 0.28138798475265503,
"kl": 0.9094656470697373,
"learning_rate": 4.875404800072976e-07,
"loss": 0.0009,
"reward": 0.08794643338478636,
"reward_std": 0.12104765651747584,
"rewards/equation_reward_func": 0.08794643309374806,
"rewards/format_reward_func": 0.0,
"step": 58
},
{
"completion_length": 732.3861742019653,
"epoch": 1.4976671850699845,
"grad_norm": 0.34412360191345215,
"kl": 1.009782899171114,
"learning_rate": 4.86392673521415e-07,
"loss": 0.001,
"reward": 0.10000744601711631,
"reward_std": 0.13957228315121029,
"rewards/equation_reward_func": 0.10000744566787034,
"rewards/format_reward_func": 0.0,
"step": 60
},
{
"completion_length": 725.0677175521851,
"epoch": 1.5474339035769828,
"grad_norm": 0.3454972207546234,
"kl": 1.0763904643245041,
"learning_rate": 4.851957747496606e-07,
"loss": 0.0011,
"reward": 0.10212798128486611,
"reward_std": 0.13816983328433707,
"rewards/equation_reward_func": 0.10212798012071289,
"rewards/format_reward_func": 0.0,
"step": 62
},
{
"completion_length": 730.5171251296997,
"epoch": 1.5972006220839814,
"grad_norm": 0.3473067581653595,
"kl": 1.4565551071427763,
"learning_rate": 4.839500322551386e-07,
"loss": 0.0015,
"reward": 0.10485119439545088,
"reward_std": 0.14129075466189533,
"rewards/equation_reward_func": 0.10485119334771298,
"rewards/format_reward_func": 0.0,
"step": 64
},
{
"completion_length": 735.0320043563843,
"epoch": 1.64696734059098,
"grad_norm": 0.3159619867801666,
"kl": 1.5041364189237356,
"learning_rate": 4.826557047444563e-07,
"loss": 0.0015,
"reward": 0.10093006424722262,
"reward_std": 0.13811934839759488,
"rewards/equation_reward_func": 0.1009300641308073,
"rewards/format_reward_func": 0.0,
"step": 66
},
{
"completion_length": 730.7455463409424,
"epoch": 1.6967340590979783,
"grad_norm": 1.146909236907959,
"kl": 2.238507369533181,
"learning_rate": 4.813130610139993e-07,
"loss": 0.0022,
"reward": 0.10973958898102865,
"reward_std": 0.13851106038782746,
"rewards/equation_reward_func": 0.10973958781687543,
"rewards/format_reward_func": 0.0,
"step": 68
},
{
"completion_length": 712.6971893310547,
"epoch": 1.7465007776049766,
"grad_norm": 7.27742338180542,
"kl": 3.2542791040614247,
"learning_rate": 4.799223798941089e-07,
"loss": 0.0033,
"reward": 0.12900298138265498,
"reward_std": 0.15667404458508827,
"rewards/equation_reward_func": 0.1290029831288848,
"rewards/format_reward_func": 0.0,
"step": 70
},
{
"completion_length": 729.6331987380981,
"epoch": 1.7962674961119751,
"grad_norm": 10.986953735351562,
"kl": 4.106183127500117,
"learning_rate": 4.78483950191177e-07,
"loss": 0.0041,
"reward": 0.12543899397132918,
"reward_std": 0.16567694948753342,
"rewards/equation_reward_func": 0.12543899344746023,
"rewards/format_reward_func": 0.0,
"step": 72
},
{
"completion_length": 737.0245656967163,
"epoch": 1.8460342146189737,
"grad_norm": 1.6122727394104004,
"kl": 3.731540434062481,
"learning_rate": 4.769980706276687e-07,
"loss": 0.0037,
"reward": 0.12507440976332873,
"reward_std": 0.159569505834952,
"rewards/equation_reward_func": 0.12507440929766744,
"rewards/format_reward_func": 0.0,
"step": 74
},
{
"completion_length": 729.0632581710815,
"epoch": 1.895800933125972,
"grad_norm": 0.5852969288825989,
"kl": 2.9793617641553283,
"learning_rate": 4.7546504978008595e-07,
"loss": 0.003,
"reward": 0.12817708833608776,
"reward_std": 0.1600989469443448,
"rewards/equation_reward_func": 0.1281770879868418,
"rewards/format_reward_func": 0.0,
"step": 76
},
{
"completion_length": 734.6302223205566,
"epoch": 1.9455676516329703,
"grad_norm": 0.9090600609779358,
"kl": 3.139740688726306,
"learning_rate": 4.738852060148848e-07,
"loss": 0.0031,
"reward": 0.13495536311529577,
"reward_std": 0.1720278718858026,
"rewards/equation_reward_func": 0.13495536299888045,
"rewards/format_reward_func": 0.0,
"step": 78
},
{
"completion_length": 742.833345413208,
"epoch": 1.995334370139969,
"grad_norm": 0.5681818723678589,
"kl": 3.712686972692609,
"learning_rate": 4.722588674223593e-07,
"loss": 0.0037,
"reward": 0.13085565919755027,
"reward_std": 0.15991040458902717,
"rewards/equation_reward_func": 0.1308556593139656,
"rewards/format_reward_func": 0.0,
"step": 80
},
{
"completion_length": 717.2042718184622,
"epoch": 2.0248833592534994,
"grad_norm": 1.5164953470230103,
"kl": 5.466580171334116,
"learning_rate": 4.70586371748506e-07,
"loss": 0.0032,
"reward": 0.14641604347056464,
"reward_std": 0.18159407436063416,
"rewards/equation_reward_func": 0.1464160444509042,
"rewards/format_reward_func": 0.0,
"step": 82
},
{
"completion_length": 730.2589464187622,
"epoch": 2.0746500777604977,
"grad_norm": 0.6375504732131958,
"kl": 4.280845553614199,
"learning_rate": 4.6886806632488363e-07,
"loss": 0.0043,
"reward": 0.14213542238576338,
"reward_std": 0.1740714008337818,
"rewards/equation_reward_func": 0.14213542168727145,
"rewards/format_reward_func": 0.0,
"step": 84
},
{
"completion_length": 744.4538831710815,
"epoch": 2.124416796267496,
"grad_norm": 0.9480769038200378,
"kl": 7.16812994517386,
"learning_rate": 4.6710430799648143e-07,
"loss": 0.0072,
"reward": 0.12831845637992956,
"reward_std": 0.1582361755426973,
"rewards/equation_reward_func": 0.12831845649634488,
"rewards/format_reward_func": 0.0,
"step": 86
},
{
"completion_length": 732.5520973205566,
"epoch": 2.1741835147744943,
"grad_norm": 16.496623992919922,
"kl": 10.49539315700531,
"learning_rate": 4.652954630476127e-07,
"loss": 0.0105,
"reward": 0.14677828032290563,
"reward_std": 0.1764058277476579,
"rewards/equation_reward_func": 0.1467782796244137,
"rewards/format_reward_func": 0.0,
"step": 88
},
{
"completion_length": 736.1361722946167,
"epoch": 2.223950233281493,
"grad_norm": 2.352017879486084,
"kl": 10.109702784568071,
"learning_rate": 4.6344190712584713e-07,
"loss": 0.0101,
"reward": 0.13781250565079972,
"reward_std": 0.1627702646655962,
"rewards/equation_reward_func": 0.13781250413740054,
"rewards/format_reward_func": 0.0,
"step": 90
},
{
"completion_length": 749.1317129135132,
"epoch": 2.2737169517884914,
"grad_norm": 3.804121255874634,
"kl": 15.052036292850971,
"learning_rate": 4.615440251639995e-07,
"loss": 0.0151,
"reward": 0.14105655340244994,
"reward_std": 0.17247924709226936,
"rewards/equation_reward_func": 0.14105655369348824,
"rewards/format_reward_func": 0.0,
"step": 92
},
{
"completion_length": 717.3884019851685,
"epoch": 2.3234836702954897,
"grad_norm": 2.226238489151001,
"kl": 12.018643591552973,
"learning_rate": 4.596022113001894e-07,
"loss": 0.012,
"reward": 0.15741816238733009,
"reward_std": 0.17923290858743712,
"rewards/equation_reward_func": 0.15741816128138453,
"rewards/format_reward_func": 0.0,
"step": 94
},
{
"completion_length": 726.2500143051147,
"epoch": 2.3732503888024885,
"grad_norm": 2.1459925174713135,
"kl": 12.27118530496955,
"learning_rate": 4.576168687959895e-07,
"loss": 0.0123,
"reward": 0.16154762578662485,
"reward_std": 0.18940409342758358,
"rewards/equation_reward_func": 0.16154762508813292,
"rewards/format_reward_func": 0.0,
"step": 96
},
{
"completion_length": 711.6696538925171,
"epoch": 2.423017107309487,
"grad_norm": 1.4883497953414917,
"kl": 15.596692271530628,
"learning_rate": 4.555884099526793e-07,
"loss": 0.0156,
"reward": 0.15925595845328644,
"reward_std": 0.1815938005456701,
"rewards/equation_reward_func": 0.1592559577547945,
"rewards/format_reward_func": 0.0,
"step": 98
},
{
"completion_length": 719.6242723464966,
"epoch": 2.472783825816485,
"grad_norm": 4.10906982421875,
"kl": 17.258602559566498,
"learning_rate": 4.5351725602562174e-07,
"loss": 0.0173,
"reward": 0.17212054354604334,
"reward_std": 0.18435519566992298,
"rewards/equation_reward_func": 0.17212054308038205,
"rewards/format_reward_func": 0.0,
"step": 100
},
{
"completion_length": 697.6637020111084,
"epoch": 2.522550544323484,
"grad_norm": 1.1079808473587036,
"kl": 14.344636462628841,
"learning_rate": 4.514038371367791e-07,
"loss": 0.0143,
"reward": 0.17430060362676159,
"reward_std": 0.19522728596348315,
"rewards/equation_reward_func": 0.17430060246260837,
"rewards/format_reward_func": 0.0,
"step": 102
},
{
"completion_length": 695.2105755805969,
"epoch": 2.5723172628304822,
"grad_norm": 1.298901081085205,
"kl": 15.563006613403559,
"learning_rate": 4.4924859218538936e-07,
"loss": 0.0156,
"reward": 0.17871280398685485,
"reward_std": 0.19645729020703584,
"rewards/equation_reward_func": 0.17871280352119356,
"rewards/format_reward_func": 0.0,
"step": 104
},
{
"completion_length": 687.2507581710815,
"epoch": 2.6220839813374806,
"grad_norm": 1.333657145500183,
"kl": 14.787582196295261,
"learning_rate": 4.470519687568185e-07,
"loss": 0.0148,
"reward": 0.19031250709667802,
"reward_std": 0.2006249635014683,
"rewards/equation_reward_func": 0.19031250721309334,
"rewards/format_reward_func": 0.0,
"step": 106
},
{
"completion_length": 672.3839402198792,
"epoch": 2.671850699844479,
"grad_norm": 1.4585353136062622,
"kl": 20.08526621758938,
"learning_rate": 4.4481442302960923e-07,
"loss": 0.0201,
"reward": 0.18158482806757092,
"reward_std": 0.1955818484420888,
"rewards/equation_reward_func": 0.18158482783474028,
"rewards/format_reward_func": 0.0,
"step": 108
},
{
"completion_length": 651.4077491760254,
"epoch": 2.721617418351477,
"grad_norm": 1.516221523284912,
"kl": 17.027776926755905,
"learning_rate": 4.4253641968074505e-07,
"loss": 0.017,
"reward": 0.1995759003330022,
"reward_std": 0.21349556557834148,
"rewards/equation_reward_func": 0.19957590056583285,
"rewards/format_reward_func": 0.0,
"step": 110
},
{
"completion_length": 672.9442043304443,
"epoch": 2.771384136858476,
"grad_norm": 2.0658159255981445,
"kl": 20.176754418760538,
"learning_rate": 4.402184317891501e-07,
"loss": 0.0202,
"reward": 0.20375744753982872,
"reward_std": 0.18776777852326632,
"rewards/equation_reward_func": 0.2037574463756755,
"rewards/format_reward_func": 0.0,
"step": 112
},
{
"completion_length": 665.7247114181519,
"epoch": 2.8211508553654743,
"grad_norm": 2.339445114135742,
"kl": 22.64492540061474,
"learning_rate": 4.37860940737443e-07,
"loss": 0.0226,
"reward": 0.1926413766341284,
"reward_std": 0.2001927924575284,
"rewards/equation_reward_func": 0.19264137593563646,
"rewards/format_reward_func": 0.0,
"step": 114
},
{
"completion_length": 669.665937423706,
"epoch": 2.8709175738724726,
"grad_norm": 2.852607011795044,
"kl": 32.22943264245987,
"learning_rate": 4.354644361119671e-07,
"loss": 0.0322,
"reward": 0.19950893591158092,
"reward_std": 0.1933421454159543,
"rewards/equation_reward_func": 0.19950893614441156,
"rewards/format_reward_func": 0.0,
"step": 116
},
{
"completion_length": 670.7053713798523,
"epoch": 2.9206842923794714,
"grad_norm": 2.6619129180908203,
"kl": 27.73328886926174,
"learning_rate": 4.3302941560111716e-07,
"loss": 0.0277,
"reward": 0.19388393545523286,
"reward_std": 0.19777346146292984,
"rewards/equation_reward_func": 0.1938839361537248,
"rewards/format_reward_func": 0.0,
"step": 118
},
{
"completion_length": 676.3571548461914,
"epoch": 2.9704510108864697,
"grad_norm": 3.816153049468994,
"kl": 27.2223904132843,
"learning_rate": 4.3055638489198236e-07,
"loss": 0.0272,
"reward": 0.20729167491663247,
"reward_std": 0.20934273721650243,
"rewards/equation_reward_func": 0.20729167328681797,
"rewards/format_reward_func": 0.0,
"step": 120
},
{
"completion_length": 659.7907361482319,
"epoch": 3.0,
"grad_norm": 0.624527633190155,
"kl": 27.528421577654388,
"learning_rate": 4.280458575653296e-07,
"loss": 0.0163,
"reward": 0.20659148869545838,
"reward_std": 0.19081004316869535,
"rewards/equation_reward_func": 0.20659148947973,
"rewards/format_reward_func": 0.0,
"step": 122
},
{
"completion_length": 659.4025421142578,
"epoch": 3.0497667185069983,
"grad_norm": 3.345853567123413,
"kl": 21.34368522465229,
"learning_rate": 4.2549835498894665e-07,
"loss": 0.0213,
"reward": 0.22118304355535656,
"reward_std": 0.21869899448938668,
"rewards/equation_reward_func": 0.22118304437026381,
"rewards/format_reward_func": 0.0,
"step": 124
},
{
"completion_length": 672.1183128356934,
"epoch": 3.099533437013997,
"grad_norm": 6.106723785400391,
"kl": 23.556977652013302,
"learning_rate": 4.229144062093679e-07,
"loss": 0.0236,
"reward": 0.21467262762598693,
"reward_std": 0.2053254572674632,
"rewards/equation_reward_func": 0.21467262762598693,
"rewards/format_reward_func": 0.0,
"step": 126
},
{
"completion_length": 653.0297751426697,
"epoch": 3.1493001555209954,
"grad_norm": 5.746135234832764,
"kl": 26.1618300229311,
"learning_rate": 4.2029454784200675e-07,
"loss": 0.0262,
"reward": 0.21742560202255845,
"reward_std": 0.2172505116323009,
"rewards/equation_reward_func": 0.217425603303127,
"rewards/format_reward_func": 0.0,
"step": 128
},
{
"completion_length": 645.058048248291,
"epoch": 3.1990668740279937,
"grad_norm": 60.6376953125,
"kl": 53.1397475451231,
"learning_rate": 4.1763932395971433e-07,
"loss": 0.0531,
"reward": 0.2241517937509343,
"reward_std": 0.20952896296512336,
"rewards/equation_reward_func": 0.22415179491508752,
"rewards/format_reward_func": 0.0,
"step": 130
},
{
"completion_length": 632.6659345626831,
"epoch": 3.248833592534992,
"grad_norm": 5.82427978515625,
"kl": 41.686398059129715,
"learning_rate": 4.1494928597979117e-07,
"loss": 0.0417,
"reward": 0.22440477029886097,
"reward_std": 0.2128691952675581,
"rewards/equation_reward_func": 0.22440477076452225,
"rewards/format_reward_func": 0.0,
"step": 132
},
{
"completion_length": 639.6711411476135,
"epoch": 3.298600311041991,
"grad_norm": 3.375183343887329,
"kl": 36.797510489821434,
"learning_rate": 4.122249925494726e-07,
"loss": 0.0368,
"reward": 0.2161235201638192,
"reward_std": 0.20362528192345053,
"rewards/equation_reward_func": 0.21612352062948048,
"rewards/format_reward_func": 0.0,
"step": 134
},
{
"completion_length": 651.2276935577393,
"epoch": 3.348367029548989,
"grad_norm": 5.04212760925293,
"kl": 37.60325849056244,
"learning_rate": 4.094670094299131e-07,
"loss": 0.0376,
"reward": 0.22996280749794096,
"reward_std": 0.214357816032134,
"rewards/equation_reward_func": 0.22996280703227967,
"rewards/format_reward_func": 0.0,
"step": 136
},
{
"completion_length": 631.5751585960388,
"epoch": 3.3981337480559874,
"grad_norm": 4.119243144989014,
"kl": 43.57139265537262,
"learning_rate": 4.066759093786931e-07,
"loss": 0.0436,
"reward": 0.2285714359022677,
"reward_std": 0.21766341011971235,
"rewards/equation_reward_func": 0.22857143532019109,
"rewards/format_reward_func": 0.0,
"step": 138
},
{
"completion_length": 647.8214359283447,
"epoch": 3.447900466562986,
"grad_norm": 7.117722988128662,
"kl": 60.4551947414875,
"learning_rate": 4.038522720308732e-07,
"loss": 0.0605,
"reward": 0.21806548640597612,
"reward_std": 0.20702184177935123,
"rewards/equation_reward_func": 0.2180654831463471,
"rewards/format_reward_func": 0.0,
"step": 140
},
{
"completion_length": 609.9583463668823,
"epoch": 3.4976671850699845,
"grad_norm": 4.748437881469727,
"kl": 58.59304141998291,
"learning_rate": 4.009966837786194e-07,
"loss": 0.0586,
"reward": 0.2300297737820074,
"reward_std": 0.20853826915845275,
"rewards/equation_reward_func": 0.23002976982388645,
"rewards/format_reward_func": 0.0,
"step": 142
},
{
"completion_length": 631.8430180549622,
"epoch": 3.547433903576983,
"grad_norm": 8.042330741882324,
"kl": 82.30807757377625,
"learning_rate": 3.981097376494259e-07,
"loss": 0.0823,
"reward": 0.21836310264188796,
"reward_std": 0.20933940180111676,
"rewards/equation_reward_func": 0.21836310101207346,
"rewards/format_reward_func": 0.0,
"step": 144
},
{
"completion_length": 624.0669736862183,
"epoch": 3.5972006220839816,
"grad_norm": 7.811219692230225,
"kl": 77.89375275373459,
"learning_rate": 3.951920331829592e-07,
"loss": 0.0779,
"reward": 0.2207961401436478,
"reward_std": 0.21105306909885257,
"rewards/equation_reward_func": 0.22079613932874054,
"rewards/format_reward_func": 0.0,
"step": 146
},
{
"completion_length": 623.5215888023376,
"epoch": 3.64696734059098,
"grad_norm": 8.836230278015137,
"kl": 65.97143815457821,
"learning_rate": 3.922441763065506e-07,
"loss": 0.066,
"reward": 0.2193824496353045,
"reward_std": 0.20604081987403333,
"rewards/equation_reward_func": 0.21938244777265936,
"rewards/format_reward_func": 0.0,
"step": 148
},
{
"completion_length": 634.7611751556396,
"epoch": 3.6967340590979783,
"grad_norm": 5.354574680328369,
"kl": 56.36278319358826,
"learning_rate": 3.8926677920936093e-07,
"loss": 0.0564,
"reward": 0.2112648879410699,
"reward_std": 0.2029515573522076,
"rewards/equation_reward_func": 0.21126488805748522,
"rewards/format_reward_func": 0.0,
"step": 150
},
{
"completion_length": 636.0297775268555,
"epoch": 3.7465007776049766,
"grad_norm": 5.276882648468018,
"kl": 65.72037261724472,
"learning_rate": 3.862604602152464e-07,
"loss": 0.0657,
"reward": 0.20753721124492586,
"reward_std": 0.20195745571982116,
"rewards/equation_reward_func": 0.20753721171058714,
"rewards/format_reward_func": 0.0,
"step": 152
},
{
"completion_length": 634.954626083374,
"epoch": 3.796267496111975,
"grad_norm": 8.027347564697266,
"kl": 77.93326985836029,
"learning_rate": 3.8322584365434934e-07,
"loss": 0.0779,
"reward": 0.2165699511533603,
"reward_std": 0.2101849897298962,
"rewards/equation_reward_func": 0.2165699495235458,
"rewards/format_reward_func": 0.0,
"step": 154
},
{
"completion_length": 638.3660817146301,
"epoch": 3.8460342146189737,
"grad_norm": 4.954690456390381,
"kl": 83.4894488453865,
"learning_rate": 3.8016355973344173e-07,
"loss": 0.0835,
"reward": 0.21200893796049058,
"reward_std": 0.21022081119008362,
"rewards/equation_reward_func": 0.21200893679633737,
"rewards/format_reward_func": 0.0,
"step": 156
},
{
"completion_length": 620.3281378746033,
"epoch": 3.895800933125972,
"grad_norm": 4.270212650299072,
"kl": 82.2349089384079,
"learning_rate": 3.7707424440504863e-07,
"loss": 0.0822,
"reward": 0.211755960714072,
"reward_std": 0.20715959300287068,
"rewards/equation_reward_func": 0.21175595885142684,
"rewards/format_reward_func": 0.0,
"step": 158
},
{
"completion_length": 632.0409350395203,
"epoch": 3.9455676516329703,
"grad_norm": 4.687271595001221,
"kl": 90.35439342260361,
"learning_rate": 3.739585392353787e-07,
"loss": 0.0904,
"reward": 0.21921131818089634,
"reward_std": 0.20252067118417472,
"rewards/equation_reward_func": 0.21921131608542055,
"rewards/format_reward_func": 0.0,
"step": 160
},
{
"completion_length": 630.2678661346436,
"epoch": 3.995334370139969,
"grad_norm": 5.595997333526611,
"kl": 95.46352458000183,
"learning_rate": 3.7081709127108767e-07,
"loss": 0.0955,
"reward": 0.22013393603265285,
"reward_std": 0.2177246706560254,
"rewards/equation_reward_func": 0.2201339368475601,
"rewards/format_reward_func": 0.0,
"step": 162
},
{
"completion_length": 632.1065288342928,
"epoch": 4.024883359253499,
"grad_norm": 8.787236213684082,
"kl": 144.07192611694336,
"learning_rate": 3.6765055290490513e-07,
"loss": 0.0855,
"reward": 0.20649123721216855,
"reward_std": 0.21240881752026708,
"rewards/equation_reward_func": 0.2064912359377271,
"rewards/format_reward_func": 0.0,
"step": 164
},
{
"completion_length": 619.5156345367432,
"epoch": 4.074650077760498,
"grad_norm": 7.552036762237549,
"kl": 137.199125289917,
"learning_rate": 3.644595817401501e-07,
"loss": 0.1372,
"reward": 0.2162797685014084,
"reward_std": 0.21547920361626893,
"rewards/equation_reward_func": 0.2162797685014084,
"rewards/format_reward_func": 0.0,
"step": 166
},
{
"completion_length": 618.7634057998657,
"epoch": 4.1244167962674965,
"grad_norm": 6.8007354736328125,
"kl": 103.6235063970089,
"learning_rate": 3.6124484045416483e-07,
"loss": 0.1036,
"reward": 0.23168899782467633,
"reward_std": 0.21457487577572465,
"rewards/equation_reward_func": 0.23168899829033762,
"rewards/format_reward_func": 0.0,
"step": 168
},
{
"completion_length": 637.4136991500854,
"epoch": 4.174183514774494,
"grad_norm": 8.004964828491211,
"kl": 113.37393373250961,
"learning_rate": 3.580069966606949e-07,
"loss": 0.1134,
"reward": 0.21156250836793333,
"reward_std": 0.2123116059228778,
"rewards/equation_reward_func": 0.21156250790227205,
"rewards/format_reward_func": 0.0,
"step": 170
},
{
"completion_length": 634.7485208511353,
"epoch": 4.223950233281493,
"grad_norm": 7.898318290710449,
"kl": 109.72896337509155,
"learning_rate": 3.547467227712444e-07,
"loss": 0.1097,
"reward": 0.2029910811688751,
"reward_std": 0.20662414643447846,
"rewards/equation_reward_func": 0.20299108081962913,
"rewards/format_reward_func": 0.0,
"step": 172
},
{
"completion_length": 621.2730751037598,
"epoch": 4.273716951788492,
"grad_norm": 7.211435317993164,
"kl": 99.61057341098785,
"learning_rate": 3.5146469585543386e-07,
"loss": 0.0996,
"reward": 0.22819941327907145,
"reward_std": 0.2186455992050469,
"rewards/equation_reward_func": 0.22819941234774888,
"rewards/format_reward_func": 0.0,
"step": 174
},
{
"completion_length": 640.9628086090088,
"epoch": 4.32348367029549,
"grad_norm": 7.790672302246094,
"kl": 93.87813127040863,
"learning_rate": 3.481615975003922e-07,
"loss": 0.0939,
"reward": 0.2149925670819357,
"reward_std": 0.20749260939192027,
"rewards/equation_reward_func": 0.2149925702251494,
"rewards/format_reward_func": 0.0,
"step": 176
},
{
"completion_length": 615.1093888282776,
"epoch": 4.3732503888024885,
"grad_norm": 22.329519271850586,
"kl": 87.78260296583176,
"learning_rate": 3.448381136692089e-07,
"loss": 0.0878,
"reward": 0.21617560542654246,
"reward_std": 0.20247984025627375,
"rewards/equation_reward_func": 0.2161756035638973,
"rewards/format_reward_func": 0.0,
"step": 178
},
{
"completion_length": 629.4829001426697,
"epoch": 4.423017107309486,
"grad_norm": 13.893996238708496,
"kl": 98.21013808250427,
"learning_rate": 3.4149493455847897e-07,
"loss": 0.0982,
"reward": 0.21152530901599675,
"reward_std": 0.2093647257424891,
"rewards/equation_reward_func": 0.21152530668769032,
"rewards/format_reward_func": 0.0,
"step": 180
},
{
"completion_length": 623.7224802970886,
"epoch": 4.472783825816485,
"grad_norm": 7.4938130378723145,
"kl": 149.59339570999146,
"learning_rate": 3.3813275445496766e-07,
"loss": 0.1496,
"reward": 0.2145535812014714,
"reward_std": 0.2063142586266622,
"rewards/equation_reward_func": 0.214553578523919,
"rewards/format_reward_func": 0.0,
"step": 182
},
{
"completion_length": 639.263400554657,
"epoch": 4.522550544323484,
"grad_norm": 6.325891494750977,
"kl": 147.64970636367798,
"learning_rate": 3.347522715914262e-07,
"loss": 0.1476,
"reward": 0.20923363824840635,
"reward_std": 0.20685563085135072,
"rewards/equation_reward_func": 0.20923363824840635,
"rewards/format_reward_func": 0.0,
"step": 184
},
{
"completion_length": 636.6897439956665,
"epoch": 4.572317262830482,
"grad_norm": 4.635812759399414,
"kl": 130.48132091760635,
"learning_rate": 3.313541880015877e-07,
"loss": 0.1305,
"reward": 0.21598215226549655,
"reward_std": 0.2006415540818125,
"rewards/equation_reward_func": 0.21598214923869818,
"rewards/format_reward_func": 0.0,
"step": 186
},
{
"completion_length": 631.9933152198792,
"epoch": 4.6220839813374806,
"grad_norm": 7.933198928833008,
"kl": 118.75544810295105,
"learning_rate": 3.279392093743747e-07,
"loss": 0.1188,
"reward": 0.22688244911842048,
"reward_std": 0.22052743670064956,
"rewards/equation_reward_func": 0.22688244772143662,
"rewards/format_reward_func": 0.0,
"step": 188
},
{
"completion_length": 632.7038769721985,
"epoch": 4.671850699844479,
"grad_norm": 6.763364791870117,
"kl": 112.75827008485794,
"learning_rate": 3.245080449073459e-07,
"loss": 0.1128,
"reward": 0.2060937569476664,
"reward_std": 0.20044768252409995,
"rewards/equation_reward_func": 0.2060937574133277,
"rewards/format_reward_func": 0.0,
"step": 190
},
{
"completion_length": 632.4464421272278,
"epoch": 4.721617418351477,
"grad_norm": 4.295353412628174,
"kl": 108.82453501224518,
"learning_rate": 3.210614071594162e-07,
"loss": 0.1088,
"reward": 0.20745536405593157,
"reward_std": 0.21275918127503246,
"rewards/equation_reward_func": 0.2074553637066856,
"rewards/format_reward_func": 0.0,
"step": 192
},
{
"completion_length": 634.1763515472412,
"epoch": 4.771384136858476,
"grad_norm": 4.46217679977417,
"kl": 118.317107796669,
"learning_rate": 3.1760001190287695e-07,
"loss": 0.1183,
"reward": 0.20520090113859624,
"reward_std": 0.2021206704666838,
"rewards/equation_reward_func": 0.20520090113859624,
"rewards/format_reward_func": 0.0,
"step": 194
},
{
"completion_length": 620.2395968437195,
"epoch": 4.821150855365475,
"grad_norm": 4.841196060180664,
"kl": 119.24478554725647,
"learning_rate": 3.141245779747502e-07,
"loss": 0.1192,
"reward": 0.21259673358872533,
"reward_std": 0.21422103908844292,
"rewards/equation_reward_func": 0.21259673358872533,
"rewards/format_reward_func": 0.0,
"step": 196
},
{
"completion_length": 609.0446557998657,
"epoch": 4.870917573872473,
"grad_norm": 4.3330559730529785,
"kl": 119.67610502243042,
"learning_rate": 3.106358271275056e-07,
"loss": 0.1197,
"reward": 0.22683036630041897,
"reward_std": 0.20717181416694075,
"rewards/equation_reward_func": 0.22683036653324962,
"rewards/format_reward_func": 0.0,
"step": 198
},
{
"completion_length": 614.8869152069092,
"epoch": 4.920684292379471,
"grad_norm": 92.09661102294922,
"kl": 144.53644692897797,
"learning_rate": 3.0713448387917227e-07,
"loss": 0.1445,
"reward": 0.21901042643003166,
"reward_std": 0.20682094641961157,
"rewards/equation_reward_func": 0.2190104245673865,
"rewards/format_reward_func": 0.0,
"step": 200
},
{
"completion_length": 631.4241156578064,
"epoch": 4.970451010886469,
"grad_norm": 6.355322360992432,
"kl": 154.4233751296997,
"learning_rate": 3.0362127536287636e-07,
"loss": 0.1544,
"reward": 0.21773066406603903,
"reward_std": 0.21250074298586696,
"rewards/equation_reward_func": 0.2177306618541479,
"rewards/format_reward_func": 0.0,
"step": 202
},
{
"completion_length": 624.7180488987973,
"epoch": 5.0,
"grad_norm": 5.770173072814941,
"kl": 161.87928571199117,
"learning_rate": 3.0009693117583523e-07,
"loss": 0.0961,
"reward": 0.21541354177813782,
"reward_std": 0.20374000229333578,
"rewards/equation_reward_func": 0.215413541386002,
"rewards/format_reward_func": 0.0,
"step": 204
},
{
"completion_length": 624.5647420883179,
"epoch": 5.049766718506999,
"grad_norm": 6.884070873260498,
"kl": 157.92570447921753,
"learning_rate": 2.965621832278401e-07,
"loss": 0.1579,
"reward": 0.22669643780682236,
"reward_std": 0.20801680884324014,
"rewards/equation_reward_func": 0.22669643454719335,
"rewards/format_reward_func": 0.0,
"step": 206
},
{
"completion_length": 614.1570081710815,
"epoch": 5.099533437013997,
"grad_norm": 4.670907497406006,
"kl": 134.14546036720276,
"learning_rate": 2.9301776558925875e-07,
"loss": 0.1341,
"reward": 0.2188244123244658,
"reward_std": 0.20453347032889724,
"rewards/equation_reward_func": 0.21882441325578839,
"rewards/format_reward_func": 0.0,
"step": 208
},
{
"completion_length": 614.4702506065369,
"epoch": 5.149300155520995,
"grad_norm": 14.716873168945312,
"kl": 109.80421262979507,
"learning_rate": 2.894644143385885e-07,
"loss": 0.1098,
"reward": 0.21839286445174366,
"reward_std": 0.20062782417517155,
"rewards/equation_reward_func": 0.21839286398608238,
"rewards/format_reward_func": 0.0,
"step": 210
},
{
"completion_length": 622.4672718048096,
"epoch": 5.199066874027994,
"grad_norm": 10.858051300048828,
"kl": 114.28983092308044,
"learning_rate": 2.859028674095937e-07,
"loss": 0.1143,
"reward": 0.2192782819038257,
"reward_std": 0.2128367607947439,
"rewards/equation_reward_func": 0.21927828167099506,
"rewards/format_reward_func": 0.0,
"step": 212
},
{
"completion_length": 612.6160840988159,
"epoch": 5.248833592534992,
"grad_norm": 3.8785901069641113,
"kl": 125.06462055444717,
"learning_rate": 2.823338644380566e-07,
"loss": 0.1251,
"reward": 0.23020090232603252,
"reward_std": 0.2176531965378672,
"rewards/equation_reward_func": 0.23020089999772608,
"rewards/format_reward_func": 0.0,
"step": 214
},
{
"completion_length": 635.8995633125305,
"epoch": 5.298600311041991,
"grad_norm": 5.062567234039307,
"kl": 148.21274209022522,
"learning_rate": 2.7875814660817504e-07,
"loss": 0.1482,
"reward": 0.2193973324028775,
"reward_std": 0.22195886494591832,
"rewards/equation_reward_func": 0.21939733054023236,
"rewards/format_reward_func": 0.0,
"step": 216
},
{
"completion_length": 630.8229269981384,
"epoch": 5.348367029548989,
"grad_norm": 5.181402206420898,
"kl": 165.8618984222412,
"learning_rate": 2.751764564986396e-07,
"loss": 0.1659,
"reward": 0.2077009006170556,
"reward_std": 0.2193935844115913,
"rewards/equation_reward_func": 0.2077009001513943,
"rewards/format_reward_func": 0.0,
"step": 218
},
{
"completion_length": 628.6517939567566,
"epoch": 5.3981337480559874,
"grad_norm": 4.105767726898193,
"kl": 148.7712802886963,
"learning_rate": 2.715895379284194e-07,
"loss": 0.1488,
"reward": 0.2191815583501011,
"reward_std": 0.20989621221087873,
"rewards/equation_reward_func": 0.21918155602179468,
"rewards/format_reward_func": 0.0,
"step": 220
},
{
"completion_length": 629.8006067276001,
"epoch": 5.447900466562986,
"grad_norm": 3.895611524581909,
"kl": 142.22095596790314,
"learning_rate": 2.6799813580229174e-07,
"loss": 0.1422,
"reward": 0.22290923492982984,
"reward_std": 0.21323461562860757,
"rewards/equation_reward_func": 0.2229092346969992,
"rewards/format_reward_func": 0.0,
"step": 222
},
{
"completion_length": 608.6183171272278,
"epoch": 5.497667185069984,
"grad_norm": 6.331876277923584,
"kl": 135.1478552222252,
"learning_rate": 2.6440299595614606e-07,
"loss": 0.1351,
"reward": 0.21991072362288833,
"reward_std": 0.22133340197615325,
"rewards/equation_reward_func": 0.21991072269156575,
"rewards/format_reward_func": 0.0,
"step": 224
},
{
"completion_length": 611.6756086349487,
"epoch": 5.547433903576983,
"grad_norm": 3.41554594039917,
"kl": 135.47022581100464,
"learning_rate": 2.6080486500209347e-07,
"loss": 0.1355,
"reward": 0.21784971025772393,
"reward_std": 0.21086209290660918,
"rewards/equation_reward_func": 0.2178497090935707,
"rewards/format_reward_func": 0.0,
"step": 226
},
{
"completion_length": 609.0922722816467,
"epoch": 5.597200622083982,
"grad_norm": 4.638352870941162,
"kl": 149.68241280317307,
"learning_rate": 2.572044901734166e-07,
"loss": 0.1497,
"reward": 0.22438989242073148,
"reward_std": 0.2241612394573167,
"rewards/equation_reward_func": 0.2243898919550702,
"rewards/format_reward_func": 0.0,
"step": 228
},
{
"completion_length": 629.8534321784973,
"epoch": 5.6469673405909795,
"grad_norm": 4.474099159240723,
"kl": 164.97060561180115,
"learning_rate": 2.536026191693893e-07,
"loss": 0.165,
"reward": 0.2060565553838387,
"reward_std": 0.21067888580728322,
"rewards/equation_reward_func": 0.20605655445251614,
"rewards/format_reward_func": 0.0,
"step": 230
},
{
"completion_length": 626.8482217788696,
"epoch": 5.696734059097978,
"grad_norm": 9.778329849243164,
"kl": 169.21773087978363,
"learning_rate": 2.5e-07,
"loss": 0.1692,
"reward": 0.20911459170747548,
"reward_std": 0.21599237713962793,
"rewards/equation_reward_func": 0.2091145912418142,
"rewards/format_reward_func": 0.0,
"step": 232
},
{
"completion_length": 629.8660821914673,
"epoch": 5.746500777604977,
"grad_norm": 5.210114479064941,
"kl": 171.0250325202942,
"learning_rate": 2.4639738083061073e-07,
"loss": 0.171,
"reward": 0.2135788791347295,
"reward_std": 0.20587447995785624,
"rewards/equation_reward_func": 0.21357887890189886,
"rewards/format_reward_func": 0.0,
"step": 234
},
{
"completion_length": 628.7165260314941,
"epoch": 5.796267496111975,
"grad_norm": 4.644392490386963,
"kl": 149.7915449142456,
"learning_rate": 2.4279550982658345e-07,
"loss": 0.1498,
"reward": 0.20833334070630372,
"reward_std": 0.21195052459370345,
"rewards/equation_reward_func": 0.20833334047347307,
"rewards/format_reward_func": 0.0,
"step": 236
},
{
"completion_length": 628.755964756012,
"epoch": 5.846034214618974,
"grad_norm": 6.456798076629639,
"kl": 442.08424025774,
"learning_rate": 2.3919513499790646e-07,
"loss": 0.4421,
"reward": 0.22005209047347307,
"reward_std": 0.21488765871617943,
"rewards/equation_reward_func": 0.22005209024064243,
"rewards/format_reward_func": 0.0,
"step": 238
},
{
"completion_length": 612.3988199234009,
"epoch": 5.895800933125972,
"grad_norm": 9.304161071777344,
"kl": 118.21684062480927,
"learning_rate": 2.3559700404385394e-07,
"loss": 0.1182,
"reward": 0.22447917505633086,
"reward_std": 0.211615604814142,
"rewards/equation_reward_func": 0.22447917482350022,
"rewards/format_reward_func": 0.0,
"step": 240
},
{
"completion_length": 633.3660821914673,
"epoch": 5.94556765163297,
"grad_norm": 5.745642185211182,
"kl": 133.20424818992615,
"learning_rate": 2.3200186419770823e-07,
"loss": 0.1332,
"reward": 0.2242708442499861,
"reward_std": 0.2152464333921671,
"rewards/equation_reward_func": 0.22427084331866354,
"rewards/format_reward_func": 0.0,
"step": 242
},
{
"completion_length": 618.1235270500183,
"epoch": 5.995334370139969,
"grad_norm": 4.167017936706543,
"kl": 143.97905486822128,
"learning_rate": 2.284104620715807e-07,
"loss": 0.144,
"reward": 0.22046875627711415,
"reward_std": 0.21442426112480462,
"rewards/equation_reward_func": 0.22046875732485205,
"rewards/format_reward_func": 0.0,
"step": 244
},
{
"completion_length": 634.5175580476459,
"epoch": 6.024883359253499,
"grad_norm": 3.44785213470459,
"kl": 167.55113441065737,
"learning_rate": 2.2482354350136043e-07,
"loss": 0.0995,
"reward": 0.21961153769179395,
"reward_std": 0.2146961924276854,
"rewards/equation_reward_func": 0.21961153769179395,
"rewards/format_reward_func": 0.0,
"step": 246
},
{
"completion_length": 634.5863180160522,
"epoch": 6.074650077760498,
"grad_norm": 7.954348564147949,
"kl": 163.61565399169922,
"learning_rate": 2.2124185339182496e-07,
"loss": 0.1636,
"reward": 0.23546131700277328,
"reward_std": 0.2178129724925384,
"rewards/equation_reward_func": 0.23546131781768054,
"rewards/format_reward_func": 0.0,
"step": 248
},
{
"completion_length": 610.0825996398926,
"epoch": 6.1244167962674965,
"grad_norm": 4.648006439208984,
"kl": 167.8152883052826,
"learning_rate": 2.1766613556194344e-07,
"loss": 0.1678,
"reward": 0.22144346224376932,
"reward_std": 0.21030379901640117,
"rewards/equation_reward_func": 0.22144346177810803,
"rewards/format_reward_func": 0.0,
"step": 250
},
{
"epoch": 6.1244167962674965,
"step": 250,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 0.0058,
"train_samples_per_second": 3851297.791,
"train_steps_per_second": 17193.294
}
],
"logging_steps": 2,
"max_steps": 100,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 25,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}