{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.1244167962674965, "eval_steps": 500, "global_step": 250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 736.4702529907227, "epoch": 0.049766718506998445, "grad_norm": 0.2507069706916809, "kl": 0.0, "learning_rate": 7.142857142857142e-08, "loss": 0.0, "reward": 0.04415178840281442, "reward_std": 0.07034091584500857, "rewards/equation_reward_func": 0.04415178793715313, "rewards/format_reward_func": 0.0, "step": 2 }, { "completion_length": 723.1704015731812, "epoch": 0.09953343701399689, "grad_norm": 0.19884330034255981, "kl": 2.0936699339557663e-05, "learning_rate": 1.4285714285714285e-07, "loss": 0.0, "reward": 0.040647323767188936, "reward_std": 0.0637543131451821, "rewards/equation_reward_func": 0.04064732347615063, "rewards/format_reward_func": 0.0, "step": 4 }, { "completion_length": 726.5163822174072, "epoch": 0.14930015552099535, "grad_norm": 0.21145105361938477, "kl": 0.00019492170304147294, "learning_rate": 2.1428571428571426e-07, "loss": 0.0, "reward": 0.04095238326408435, "reward_std": 0.06441530691517983, "rewards/equation_reward_func": 0.040952383089461364, "rewards/format_reward_func": 0.0, "step": 6 }, { "completion_length": 737.8207015991211, "epoch": 0.19906687402799378, "grad_norm": 0.2020396590232849, "kl": 0.020334478189397487, "learning_rate": 2.857142857142857e-07, "loss": 0.0, "reward": 0.03635416827455629, "reward_std": 0.05670656039728783, "rewards/equation_reward_func": 0.03635416833276395, "rewards/format_reward_func": 0.0, "step": 8 }, { "completion_length": 718.5461435317993, "epoch": 0.24883359253499224, "grad_norm": 27.479997634887695, "kl": 9.990212610488015, "learning_rate": 3.5714285714285716e-07, "loss": 0.01, "reward": 0.04543898967676796, "reward_std": 0.07194261607946828, "rewards/equation_reward_func": 0.04543899020063691, "rewards/format_reward_func": 0.0, "step": 10 }, { "completion_length": 721.6964378356934, "epoch": 0.2986003110419907, "grad_norm": 0.19479116797447205, "kl": 0.005109551766508957, "learning_rate": 4.285714285714285e-07, "loss": 0.0, "reward": 0.04148065741173923, "reward_std": 0.0677548690000549, "rewards/equation_reward_func": 0.04148065741173923, "rewards/format_reward_func": 0.0, "step": 12 }, { "completion_length": 725.9003086090088, "epoch": 0.3483670295489891, "grad_norm": 0.24158786237239838, "kl": 0.502096803898894, "learning_rate": 5e-07, "loss": 0.0005, "reward": 0.04821428797731642, "reward_std": 0.07803500922454987, "rewards/equation_reward_func": 0.048214288559393026, "rewards/format_reward_func": 0.0, "step": 14 }, { "completion_length": 722.5781373977661, "epoch": 0.39813374805598756, "grad_norm": 0.290544331073761, "kl": 0.23321715661586495, "learning_rate": 4.999740409224932e-07, "loss": 0.0002, "reward": 0.05134672833082732, "reward_std": 0.07946415679180063, "rewards/equation_reward_func": 0.05134672856365796, "rewards/format_reward_func": 0.0, "step": 16 }, { "completion_length": 723.7433156967163, "epoch": 0.447900466562986, "grad_norm": 4.716856479644775, "kl": 0.8582769820350222, "learning_rate": 4.998961690809627e-07, "loss": 0.0009, "reward": 0.050446430934243836, "reward_std": 0.07721506280358881, "rewards/equation_reward_func": 0.050446430992451496, "rewards/format_reward_func": 0.0, "step": 18 }, { "completion_length": 729.6198072433472, "epoch": 0.4976671850699845, "grad_norm": 0.23771615326404572, "kl": 0.3220994914881885, "learning_rate": 4.997664006472578e-07, "loss": 0.0003, "reward": 0.045706847246037796, "reward_std": 0.07288302374945488, "rewards/equation_reward_func": 0.045706847246037796, "rewards/format_reward_func": 0.0, "step": 20 }, { "completion_length": 712.8631057739258, "epoch": 0.5474339035769828, "grad_norm": 0.3750320374965668, "kl": 0.27479040302569047, "learning_rate": 4.995847625707292e-07, "loss": 0.0003, "reward": 0.05489583619055338, "reward_std": 0.0833382241835352, "rewards/equation_reward_func": 0.054895836423384026, "rewards/format_reward_func": 0.0, "step": 22 }, { "completion_length": 731.4576015472412, "epoch": 0.5972006220839814, "grad_norm": 0.2089901864528656, "kl": 0.12606932656490244, "learning_rate": 4.993512925726318e-07, "loss": 0.0001, "reward": 0.0600520860607503, "reward_std": 0.0899482914537657, "rewards/equation_reward_func": 0.06005208553688135, "rewards/format_reward_func": 0.0, "step": 24 }, { "completion_length": 706.6056671142578, "epoch": 0.6469673405909798, "grad_norm": 0.17620234191417694, "kl": 0.1556346261058934, "learning_rate": 4.990660391382923e-07, "loss": 0.0002, "reward": 0.05229166932986118, "reward_std": 0.07350753628998064, "rewards/equation_reward_func": 0.05229166956269182, "rewards/format_reward_func": 0.0, "step": 26 }, { "completion_length": 727.1808137893677, "epoch": 0.6967340590979783, "grad_norm": 0.19479602575302124, "kl": 0.12168441573157907, "learning_rate": 4.987290615070384e-07, "loss": 0.0001, "reward": 0.053683038655435666, "reward_std": 0.08042177859169897, "rewards/equation_reward_func": 0.053683039121096954, "rewards/format_reward_func": 0.0, "step": 28 }, { "completion_length": 720.8891496658325, "epoch": 0.7465007776049767, "grad_norm": 0.1857473999261856, "kl": 0.1632600230514072, "learning_rate": 4.983404296598978e-07, "loss": 0.0002, "reward": 0.05391369271092117, "reward_std": 0.08413292915793136, "rewards/equation_reward_func": 0.053913692419882864, "rewards/format_reward_func": 0.0, "step": 30 }, { "completion_length": 720.4337940216064, "epoch": 0.7962674961119751, "grad_norm": 0.23092247545719147, "kl": 0.15535293571883813, "learning_rate": 4.979002243050646e-07, "loss": 0.0002, "reward": 0.05988095561042428, "reward_std": 0.09168167802272364, "rewards/equation_reward_func": 0.05988095601787791, "rewards/format_reward_func": 0.0, "step": 32 }, { "completion_length": 718.6845378875732, "epoch": 0.8460342146189735, "grad_norm": 0.23407958447933197, "kl": 0.25782948260894045, "learning_rate": 4.974085368611381e-07, "loss": 0.0003, "reward": 0.06691220620996319, "reward_std": 0.09768064138188493, "rewards/equation_reward_func": 0.06691220562788658, "rewards/format_reward_func": 0.0, "step": 34 }, { "completion_length": 718.0454006195068, "epoch": 0.895800933125972, "grad_norm": 0.3076172471046448, "kl": 0.2404527408652939, "learning_rate": 4.968654694381379e-07, "loss": 0.0002, "reward": 0.07349702704232186, "reward_std": 0.10955648736853618, "rewards/equation_reward_func": 0.07349702733336017, "rewards/format_reward_func": 0.0, "step": 36 }, { "completion_length": 704.1488237380981, "epoch": 0.9455676516329704, "grad_norm": 0.2561110258102417, "kl": 0.43795167771168053, "learning_rate": 4.962711348162987e-07, "loss": 0.0004, "reward": 0.06241815793327987, "reward_std": 0.09217380215704907, "rewards/equation_reward_func": 0.0624181583407335, "rewards/format_reward_func": 0.0, "step": 38 }, { "completion_length": 707.3921279907227, "epoch": 0.995334370139969, "grad_norm": 0.3400561511516571, "kl": 0.5494289128109813, "learning_rate": 4.956256564226487e-07, "loss": 0.0005, "reward": 0.0764508958091028, "reward_std": 0.11110821401234716, "rewards/equation_reward_func": 0.07645089708967134, "rewards/format_reward_func": 0.0, "step": 40 }, { "completion_length": 715.0272221156529, "epoch": 1.0497667185069985, "grad_norm": 0.26081565022468567, "kl": 0.4236157455614635, "learning_rate": 4.949291683053768e-07, "loss": 0.0005, "reward": 0.07186394860701902, "reward_std": 0.10362207902861494, "rewards/equation_reward_func": 0.07186394876667432, "rewards/format_reward_func": 0.0, "step": 42 }, { "completion_length": 714.9486722946167, "epoch": 1.0995334370139969, "grad_norm": 0.29378727078437805, "kl": 0.3755593653768301, "learning_rate": 4.941818151059955e-07, "loss": 0.0004, "reward": 0.0799404798890464, "reward_std": 0.11443577655882109, "rewards/equation_reward_func": 0.07994047965621576, "rewards/format_reward_func": 0.0, "step": 44 }, { "completion_length": 727.829628944397, "epoch": 1.1493001555209954, "grad_norm": 2045.599365234375, "kl": 128.7541933595203, "learning_rate": 4.933837520293017e-07, "loss": 0.1288, "reward": 0.06808780113351531, "reward_std": 0.09949399236938916, "rewards/equation_reward_func": 0.06808780090068467, "rewards/format_reward_func": 0.0, "step": 46 }, { "completion_length": 709.632453918457, "epoch": 1.1990668740279937, "grad_norm": 0.2698291838169098, "kl": 0.4989726666826755, "learning_rate": 4.925351448111454e-07, "loss": 0.0005, "reward": 0.09389881315291859, "reward_std": 0.13221543522377033, "rewards/equation_reward_func": 0.09389881303650327, "rewards/format_reward_func": 0.0, "step": 48 }, { "completion_length": 719.485878944397, "epoch": 1.2488335925349923, "grad_norm": 0.36381521821022034, "kl": 0.550471473718062, "learning_rate": 4.91636169684011e-07, "loss": 0.0006, "reward": 0.08360863462439738, "reward_std": 0.11854775344545487, "rewards/equation_reward_func": 0.08360863421694376, "rewards/format_reward_func": 0.0, "step": 50 }, { "completion_length": 725.6599855422974, "epoch": 1.2986003110419908, "grad_norm": 0.3374347686767578, "kl": 0.663099701050669, "learning_rate": 4.906870133404186e-07, "loss": 0.0007, "reward": 0.08503720644512214, "reward_std": 0.12180299674218986, "rewards/equation_reward_func": 0.0850372067943681, "rewards/format_reward_func": 0.0, "step": 52 }, { "completion_length": 723.972484588623, "epoch": 1.3483670295489891, "grad_norm": 1.0345810651779175, "kl": 0.9573397457133979, "learning_rate": 4.896878728941531e-07, "loss": 0.001, "reward": 0.09177827867097221, "reward_std": 0.12253864679951221, "rewards/equation_reward_func": 0.09177827744861133, "rewards/format_reward_func": 0.0, "step": 54 }, { "completion_length": 712.2269496917725, "epoch": 1.3981337480559874, "grad_norm": 0.27968963980674744, "kl": 0.8391579431481659, "learning_rate": 4.886389558393284e-07, "loss": 0.0008, "reward": 0.08570684934966266, "reward_std": 0.1181660912843654, "rewards/equation_reward_func": 0.08570684841834009, "rewards/format_reward_func": 0.0, "step": 56 }, { "completion_length": 730.5327529907227, "epoch": 1.447900466562986, "grad_norm": 0.28138798475265503, "kl": 0.9094656470697373, "learning_rate": 4.875404800072976e-07, "loss": 0.0009, "reward": 0.08794643338478636, "reward_std": 0.12104765651747584, "rewards/equation_reward_func": 0.08794643309374806, "rewards/format_reward_func": 0.0, "step": 58 }, { "completion_length": 732.3861742019653, "epoch": 1.4976671850699845, "grad_norm": 0.34412360191345215, "kl": 1.009782899171114, "learning_rate": 4.86392673521415e-07, "loss": 0.001, "reward": 0.10000744601711631, "reward_std": 0.13957228315121029, "rewards/equation_reward_func": 0.10000744566787034, "rewards/format_reward_func": 0.0, "step": 60 }, { "completion_length": 725.0677175521851, "epoch": 1.5474339035769828, "grad_norm": 0.3454972207546234, "kl": 1.0763904643245041, "learning_rate": 4.851957747496606e-07, "loss": 0.0011, "reward": 0.10212798128486611, "reward_std": 0.13816983328433707, "rewards/equation_reward_func": 0.10212798012071289, "rewards/format_reward_func": 0.0, "step": 62 }, { "completion_length": 730.5171251296997, "epoch": 1.5972006220839814, "grad_norm": 0.3473067581653595, "kl": 1.4565551071427763, "learning_rate": 4.839500322551386e-07, "loss": 0.0015, "reward": 0.10485119439545088, "reward_std": 0.14129075466189533, "rewards/equation_reward_func": 0.10485119334771298, "rewards/format_reward_func": 0.0, "step": 64 }, { "completion_length": 735.0320043563843, "epoch": 1.64696734059098, "grad_norm": 0.3159619867801666, "kl": 1.5041364189237356, "learning_rate": 4.826557047444563e-07, "loss": 0.0015, "reward": 0.10093006424722262, "reward_std": 0.13811934839759488, "rewards/equation_reward_func": 0.1009300641308073, "rewards/format_reward_func": 0.0, "step": 66 }, { "completion_length": 730.7455463409424, "epoch": 1.6967340590979783, "grad_norm": 1.146909236907959, "kl": 2.238507369533181, "learning_rate": 4.813130610139993e-07, "loss": 0.0022, "reward": 0.10973958898102865, "reward_std": 0.13851106038782746, "rewards/equation_reward_func": 0.10973958781687543, "rewards/format_reward_func": 0.0, "step": 68 }, { "completion_length": 712.6971893310547, "epoch": 1.7465007776049766, "grad_norm": 7.27742338180542, "kl": 3.2542791040614247, "learning_rate": 4.799223798941089e-07, "loss": 0.0033, "reward": 0.12900298138265498, "reward_std": 0.15667404458508827, "rewards/equation_reward_func": 0.1290029831288848, "rewards/format_reward_func": 0.0, "step": 70 }, { "completion_length": 729.6331987380981, "epoch": 1.7962674961119751, "grad_norm": 10.986953735351562, "kl": 4.106183127500117, "learning_rate": 4.78483950191177e-07, "loss": 0.0041, "reward": 0.12543899397132918, "reward_std": 0.16567694948753342, "rewards/equation_reward_func": 0.12543899344746023, "rewards/format_reward_func": 0.0, "step": 72 }, { "completion_length": 737.0245656967163, "epoch": 1.8460342146189737, "grad_norm": 1.6122727394104004, "kl": 3.731540434062481, "learning_rate": 4.769980706276687e-07, "loss": 0.0037, "reward": 0.12507440976332873, "reward_std": 0.159569505834952, "rewards/equation_reward_func": 0.12507440929766744, "rewards/format_reward_func": 0.0, "step": 74 }, { "completion_length": 729.0632581710815, "epoch": 1.895800933125972, "grad_norm": 0.5852969288825989, "kl": 2.9793617641553283, "learning_rate": 4.7546504978008595e-07, "loss": 0.003, "reward": 0.12817708833608776, "reward_std": 0.1600989469443448, "rewards/equation_reward_func": 0.1281770879868418, "rewards/format_reward_func": 0.0, "step": 76 }, { "completion_length": 734.6302223205566, "epoch": 1.9455676516329703, "grad_norm": 0.9090600609779358, "kl": 3.139740688726306, "learning_rate": 4.738852060148848e-07, "loss": 0.0031, "reward": 0.13495536311529577, "reward_std": 0.1720278718858026, "rewards/equation_reward_func": 0.13495536299888045, "rewards/format_reward_func": 0.0, "step": 78 }, { "completion_length": 742.833345413208, "epoch": 1.995334370139969, "grad_norm": 0.5681818723678589, "kl": 3.712686972692609, "learning_rate": 4.722588674223593e-07, "loss": 0.0037, "reward": 0.13085565919755027, "reward_std": 0.15991040458902717, "rewards/equation_reward_func": 0.1308556593139656, "rewards/format_reward_func": 0.0, "step": 80 }, { "completion_length": 717.2042718184622, "epoch": 2.0248833592534994, "grad_norm": 1.5164953470230103, "kl": 5.466580171334116, "learning_rate": 4.70586371748506e-07, "loss": 0.0032, "reward": 0.14641604347056464, "reward_std": 0.18159407436063416, "rewards/equation_reward_func": 0.1464160444509042, "rewards/format_reward_func": 0.0, "step": 82 }, { "completion_length": 730.2589464187622, "epoch": 2.0746500777604977, "grad_norm": 0.6375504732131958, "kl": 4.280845553614199, "learning_rate": 4.6886806632488363e-07, "loss": 0.0043, "reward": 0.14213542238576338, "reward_std": 0.1740714008337818, "rewards/equation_reward_func": 0.14213542168727145, "rewards/format_reward_func": 0.0, "step": 84 }, { "completion_length": 744.4538831710815, "epoch": 2.124416796267496, "grad_norm": 0.9480769038200378, "kl": 7.16812994517386, "learning_rate": 4.6710430799648143e-07, "loss": 0.0072, "reward": 0.12831845637992956, "reward_std": 0.1582361755426973, "rewards/equation_reward_func": 0.12831845649634488, "rewards/format_reward_func": 0.0, "step": 86 }, { "completion_length": 732.5520973205566, "epoch": 2.1741835147744943, "grad_norm": 16.496623992919922, "kl": 10.49539315700531, "learning_rate": 4.652954630476127e-07, "loss": 0.0105, "reward": 0.14677828032290563, "reward_std": 0.1764058277476579, "rewards/equation_reward_func": 0.1467782796244137, "rewards/format_reward_func": 0.0, "step": 88 }, { "completion_length": 736.1361722946167, "epoch": 2.223950233281493, "grad_norm": 2.352017879486084, "kl": 10.109702784568071, "learning_rate": 4.6344190712584713e-07, "loss": 0.0101, "reward": 0.13781250565079972, "reward_std": 0.1627702646655962, "rewards/equation_reward_func": 0.13781250413740054, "rewards/format_reward_func": 0.0, "step": 90 }, { "completion_length": 749.1317129135132, "epoch": 2.2737169517884914, "grad_norm": 3.804121255874634, "kl": 15.052036292850971, "learning_rate": 4.615440251639995e-07, "loss": 0.0151, "reward": 0.14105655340244994, "reward_std": 0.17247924709226936, "rewards/equation_reward_func": 0.14105655369348824, "rewards/format_reward_func": 0.0, "step": 92 }, { "completion_length": 717.3884019851685, "epoch": 2.3234836702954897, "grad_norm": 2.226238489151001, "kl": 12.018643591552973, "learning_rate": 4.596022113001894e-07, "loss": 0.012, "reward": 0.15741816238733009, "reward_std": 0.17923290858743712, "rewards/equation_reward_func": 0.15741816128138453, "rewards/format_reward_func": 0.0, "step": 94 }, { "completion_length": 726.2500143051147, "epoch": 2.3732503888024885, "grad_norm": 2.1459925174713135, "kl": 12.27118530496955, "learning_rate": 4.576168687959895e-07, "loss": 0.0123, "reward": 0.16154762578662485, "reward_std": 0.18940409342758358, "rewards/equation_reward_func": 0.16154762508813292, "rewards/format_reward_func": 0.0, "step": 96 }, { "completion_length": 711.6696538925171, "epoch": 2.423017107309487, "grad_norm": 1.4883497953414917, "kl": 15.596692271530628, "learning_rate": 4.555884099526793e-07, "loss": 0.0156, "reward": 0.15925595845328644, "reward_std": 0.1815938005456701, "rewards/equation_reward_func": 0.1592559577547945, "rewards/format_reward_func": 0.0, "step": 98 }, { "completion_length": 719.6242723464966, "epoch": 2.472783825816485, "grad_norm": 4.10906982421875, "kl": 17.258602559566498, "learning_rate": 4.5351725602562174e-07, "loss": 0.0173, "reward": 0.17212054354604334, "reward_std": 0.18435519566992298, "rewards/equation_reward_func": 0.17212054308038205, "rewards/format_reward_func": 0.0, "step": 100 }, { "completion_length": 697.6637020111084, "epoch": 2.522550544323484, "grad_norm": 1.1079808473587036, "kl": 14.344636462628841, "learning_rate": 4.514038371367791e-07, "loss": 0.0143, "reward": 0.17430060362676159, "reward_std": 0.19522728596348315, "rewards/equation_reward_func": 0.17430060246260837, "rewards/format_reward_func": 0.0, "step": 102 }, { "completion_length": 695.2105755805969, "epoch": 2.5723172628304822, "grad_norm": 1.298901081085205, "kl": 15.563006613403559, "learning_rate": 4.4924859218538936e-07, "loss": 0.0156, "reward": 0.17871280398685485, "reward_std": 0.19645729020703584, "rewards/equation_reward_func": 0.17871280352119356, "rewards/format_reward_func": 0.0, "step": 104 }, { "completion_length": 687.2507581710815, "epoch": 2.6220839813374806, "grad_norm": 1.333657145500183, "kl": 14.787582196295261, "learning_rate": 4.470519687568185e-07, "loss": 0.0148, "reward": 0.19031250709667802, "reward_std": 0.2006249635014683, "rewards/equation_reward_func": 0.19031250721309334, "rewards/format_reward_func": 0.0, "step": 106 }, { "completion_length": 672.3839402198792, "epoch": 2.671850699844479, "grad_norm": 1.4585353136062622, "kl": 20.08526621758938, "learning_rate": 4.4481442302960923e-07, "loss": 0.0201, "reward": 0.18158482806757092, "reward_std": 0.1955818484420888, "rewards/equation_reward_func": 0.18158482783474028, "rewards/format_reward_func": 0.0, "step": 108 }, { "completion_length": 651.4077491760254, "epoch": 2.721617418351477, "grad_norm": 1.516221523284912, "kl": 17.027776926755905, "learning_rate": 4.4253641968074505e-07, "loss": 0.017, "reward": 0.1995759003330022, "reward_std": 0.21349556557834148, "rewards/equation_reward_func": 0.19957590056583285, "rewards/format_reward_func": 0.0, "step": 110 }, { "completion_length": 672.9442043304443, "epoch": 2.771384136858476, "grad_norm": 2.0658159255981445, "kl": 20.176754418760538, "learning_rate": 4.402184317891501e-07, "loss": 0.0202, "reward": 0.20375744753982872, "reward_std": 0.18776777852326632, "rewards/equation_reward_func": 0.2037574463756755, "rewards/format_reward_func": 0.0, "step": 112 }, { "completion_length": 665.7247114181519, "epoch": 2.8211508553654743, "grad_norm": 2.339445114135742, "kl": 22.64492540061474, "learning_rate": 4.37860940737443e-07, "loss": 0.0226, "reward": 0.1926413766341284, "reward_std": 0.2001927924575284, "rewards/equation_reward_func": 0.19264137593563646, "rewards/format_reward_func": 0.0, "step": 114 }, { "completion_length": 669.665937423706, "epoch": 2.8709175738724726, "grad_norm": 2.852607011795044, "kl": 32.22943264245987, "learning_rate": 4.354644361119671e-07, "loss": 0.0322, "reward": 0.19950893591158092, "reward_std": 0.1933421454159543, "rewards/equation_reward_func": 0.19950893614441156, "rewards/format_reward_func": 0.0, "step": 116 }, { "completion_length": 670.7053713798523, "epoch": 2.9206842923794714, "grad_norm": 2.6619129180908203, "kl": 27.73328886926174, "learning_rate": 4.3302941560111716e-07, "loss": 0.0277, "reward": 0.19388393545523286, "reward_std": 0.19777346146292984, "rewards/equation_reward_func": 0.1938839361537248, "rewards/format_reward_func": 0.0, "step": 118 }, { "completion_length": 676.3571548461914, "epoch": 2.9704510108864697, "grad_norm": 3.816153049468994, "kl": 27.2223904132843, "learning_rate": 4.3055638489198236e-07, "loss": 0.0272, "reward": 0.20729167491663247, "reward_std": 0.20934273721650243, "rewards/equation_reward_func": 0.20729167328681797, "rewards/format_reward_func": 0.0, "step": 120 }, { "completion_length": 659.7907361482319, "epoch": 3.0, "grad_norm": 0.624527633190155, "kl": 27.528421577654388, "learning_rate": 4.280458575653296e-07, "loss": 0.0163, "reward": 0.20659148869545838, "reward_std": 0.19081004316869535, "rewards/equation_reward_func": 0.20659148947973, "rewards/format_reward_func": 0.0, "step": 122 }, { "completion_length": 659.4025421142578, "epoch": 3.0497667185069983, "grad_norm": 3.345853567123413, "kl": 21.34368522465229, "learning_rate": 4.2549835498894665e-07, "loss": 0.0213, "reward": 0.22118304355535656, "reward_std": 0.21869899448938668, "rewards/equation_reward_func": 0.22118304437026381, "rewards/format_reward_func": 0.0, "step": 124 }, { "completion_length": 672.1183128356934, "epoch": 3.099533437013997, "grad_norm": 6.106723785400391, "kl": 23.556977652013302, "learning_rate": 4.229144062093679e-07, "loss": 0.0236, "reward": 0.21467262762598693, "reward_std": 0.2053254572674632, "rewards/equation_reward_func": 0.21467262762598693, "rewards/format_reward_func": 0.0, "step": 126 }, { "completion_length": 653.0297751426697, "epoch": 3.1493001555209954, "grad_norm": 5.746135234832764, "kl": 26.1618300229311, "learning_rate": 4.2029454784200675e-07, "loss": 0.0262, "reward": 0.21742560202255845, "reward_std": 0.2172505116323009, "rewards/equation_reward_func": 0.217425603303127, "rewards/format_reward_func": 0.0, "step": 128 }, { "completion_length": 645.058048248291, "epoch": 3.1990668740279937, "grad_norm": 60.6376953125, "kl": 53.1397475451231, "learning_rate": 4.1763932395971433e-07, "loss": 0.0531, "reward": 0.2241517937509343, "reward_std": 0.20952896296512336, "rewards/equation_reward_func": 0.22415179491508752, "rewards/format_reward_func": 0.0, "step": 130 }, { "completion_length": 632.6659345626831, "epoch": 3.248833592534992, "grad_norm": 5.82427978515625, "kl": 41.686398059129715, "learning_rate": 4.1494928597979117e-07, "loss": 0.0417, "reward": 0.22440477029886097, "reward_std": 0.2128691952675581, "rewards/equation_reward_func": 0.22440477076452225, "rewards/format_reward_func": 0.0, "step": 132 }, { "completion_length": 639.6711411476135, "epoch": 3.298600311041991, "grad_norm": 3.375183343887329, "kl": 36.797510489821434, "learning_rate": 4.122249925494726e-07, "loss": 0.0368, "reward": 0.2161235201638192, "reward_std": 0.20362528192345053, "rewards/equation_reward_func": 0.21612352062948048, "rewards/format_reward_func": 0.0, "step": 134 }, { "completion_length": 651.2276935577393, "epoch": 3.348367029548989, "grad_norm": 5.04212760925293, "kl": 37.60325849056244, "learning_rate": 4.094670094299131e-07, "loss": 0.0376, "reward": 0.22996280749794096, "reward_std": 0.214357816032134, "rewards/equation_reward_func": 0.22996280703227967, "rewards/format_reward_func": 0.0, "step": 136 }, { "completion_length": 631.5751585960388, "epoch": 3.3981337480559874, "grad_norm": 4.119243144989014, "kl": 43.57139265537262, "learning_rate": 4.066759093786931e-07, "loss": 0.0436, "reward": 0.2285714359022677, "reward_std": 0.21766341011971235, "rewards/equation_reward_func": 0.22857143532019109, "rewards/format_reward_func": 0.0, "step": 138 }, { "completion_length": 647.8214359283447, "epoch": 3.447900466562986, "grad_norm": 7.117722988128662, "kl": 60.4551947414875, "learning_rate": 4.038522720308732e-07, "loss": 0.0605, "reward": 0.21806548640597612, "reward_std": 0.20702184177935123, "rewards/equation_reward_func": 0.2180654831463471, "rewards/format_reward_func": 0.0, "step": 140 }, { "completion_length": 609.9583463668823, "epoch": 3.4976671850699845, "grad_norm": 4.748437881469727, "kl": 58.59304141998291, "learning_rate": 4.009966837786194e-07, "loss": 0.0586, "reward": 0.2300297737820074, "reward_std": 0.20853826915845275, "rewards/equation_reward_func": 0.23002976982388645, "rewards/format_reward_func": 0.0, "step": 142 }, { "completion_length": 631.8430180549622, "epoch": 3.547433903576983, "grad_norm": 8.042330741882324, "kl": 82.30807757377625, "learning_rate": 3.981097376494259e-07, "loss": 0.0823, "reward": 0.21836310264188796, "reward_std": 0.20933940180111676, "rewards/equation_reward_func": 0.21836310101207346, "rewards/format_reward_func": 0.0, "step": 144 }, { "completion_length": 624.0669736862183, "epoch": 3.5972006220839816, "grad_norm": 7.811219692230225, "kl": 77.89375275373459, "learning_rate": 3.951920331829592e-07, "loss": 0.0779, "reward": 0.2207961401436478, "reward_std": 0.21105306909885257, "rewards/equation_reward_func": 0.22079613932874054, "rewards/format_reward_func": 0.0, "step": 146 }, { "completion_length": 623.5215888023376, "epoch": 3.64696734059098, "grad_norm": 8.836230278015137, "kl": 65.97143815457821, "learning_rate": 3.922441763065506e-07, "loss": 0.066, "reward": 0.2193824496353045, "reward_std": 0.20604081987403333, "rewards/equation_reward_func": 0.21938244777265936, "rewards/format_reward_func": 0.0, "step": 148 }, { "completion_length": 634.7611751556396, "epoch": 3.6967340590979783, "grad_norm": 5.354574680328369, "kl": 56.36278319358826, "learning_rate": 3.8926677920936093e-07, "loss": 0.0564, "reward": 0.2112648879410699, "reward_std": 0.2029515573522076, "rewards/equation_reward_func": 0.21126488805748522, "rewards/format_reward_func": 0.0, "step": 150 }, { "completion_length": 636.0297775268555, "epoch": 3.7465007776049766, "grad_norm": 5.276882648468018, "kl": 65.72037261724472, "learning_rate": 3.862604602152464e-07, "loss": 0.0657, "reward": 0.20753721124492586, "reward_std": 0.20195745571982116, "rewards/equation_reward_func": 0.20753721171058714, "rewards/format_reward_func": 0.0, "step": 152 }, { "completion_length": 634.954626083374, "epoch": 3.796267496111975, "grad_norm": 8.027347564697266, "kl": 77.93326985836029, "learning_rate": 3.8322584365434934e-07, "loss": 0.0779, "reward": 0.2165699511533603, "reward_std": 0.2101849897298962, "rewards/equation_reward_func": 0.2165699495235458, "rewards/format_reward_func": 0.0, "step": 154 }, { "completion_length": 638.3660817146301, "epoch": 3.8460342146189737, "grad_norm": 4.954690456390381, "kl": 83.4894488453865, "learning_rate": 3.8016355973344173e-07, "loss": 0.0835, "reward": 0.21200893796049058, "reward_std": 0.21022081119008362, "rewards/equation_reward_func": 0.21200893679633737, "rewards/format_reward_func": 0.0, "step": 156 }, { "completion_length": 620.3281378746033, "epoch": 3.895800933125972, "grad_norm": 4.270212650299072, "kl": 82.2349089384079, "learning_rate": 3.7707424440504863e-07, "loss": 0.0822, "reward": 0.211755960714072, "reward_std": 0.20715959300287068, "rewards/equation_reward_func": 0.21175595885142684, "rewards/format_reward_func": 0.0, "step": 158 }, { "completion_length": 632.0409350395203, "epoch": 3.9455676516329703, "grad_norm": 4.687271595001221, "kl": 90.35439342260361, "learning_rate": 3.739585392353787e-07, "loss": 0.0904, "reward": 0.21921131818089634, "reward_std": 0.20252067118417472, "rewards/equation_reward_func": 0.21921131608542055, "rewards/format_reward_func": 0.0, "step": 160 }, { "completion_length": 630.2678661346436, "epoch": 3.995334370139969, "grad_norm": 5.595997333526611, "kl": 95.46352458000183, "learning_rate": 3.7081709127108767e-07, "loss": 0.0955, "reward": 0.22013393603265285, "reward_std": 0.2177246706560254, "rewards/equation_reward_func": 0.2201339368475601, "rewards/format_reward_func": 0.0, "step": 162 }, { "completion_length": 632.1065288342928, "epoch": 4.024883359253499, "grad_norm": 8.787236213684082, "kl": 144.07192611694336, "learning_rate": 3.6765055290490513e-07, "loss": 0.0855, "reward": 0.20649123721216855, "reward_std": 0.21240881752026708, "rewards/equation_reward_func": 0.2064912359377271, "rewards/format_reward_func": 0.0, "step": 164 }, { "completion_length": 619.5156345367432, "epoch": 4.074650077760498, "grad_norm": 7.552036762237549, "kl": 137.199125289917, "learning_rate": 3.644595817401501e-07, "loss": 0.1372, "reward": 0.2162797685014084, "reward_std": 0.21547920361626893, "rewards/equation_reward_func": 0.2162797685014084, "rewards/format_reward_func": 0.0, "step": 166 }, { "completion_length": 618.7634057998657, "epoch": 4.1244167962674965, "grad_norm": 6.8007354736328125, "kl": 103.6235063970089, "learning_rate": 3.6124484045416483e-07, "loss": 0.1036, "reward": 0.23168899782467633, "reward_std": 0.21457487577572465, "rewards/equation_reward_func": 0.23168899829033762, "rewards/format_reward_func": 0.0, "step": 168 }, { "completion_length": 637.4136991500854, "epoch": 4.174183514774494, "grad_norm": 8.004964828491211, "kl": 113.37393373250961, "learning_rate": 3.580069966606949e-07, "loss": 0.1134, "reward": 0.21156250836793333, "reward_std": 0.2123116059228778, "rewards/equation_reward_func": 0.21156250790227205, "rewards/format_reward_func": 0.0, "step": 170 }, { "completion_length": 634.7485208511353, "epoch": 4.223950233281493, "grad_norm": 7.898318290710449, "kl": 109.72896337509155, "learning_rate": 3.547467227712444e-07, "loss": 0.1097, "reward": 0.2029910811688751, "reward_std": 0.20662414643447846, "rewards/equation_reward_func": 0.20299108081962913, "rewards/format_reward_func": 0.0, "step": 172 }, { "completion_length": 621.2730751037598, "epoch": 4.273716951788492, "grad_norm": 7.211435317993164, "kl": 99.61057341098785, "learning_rate": 3.5146469585543386e-07, "loss": 0.0996, "reward": 0.22819941327907145, "reward_std": 0.2186455992050469, "rewards/equation_reward_func": 0.22819941234774888, "rewards/format_reward_func": 0.0, "step": 174 }, { "completion_length": 640.9628086090088, "epoch": 4.32348367029549, "grad_norm": 7.790672302246094, "kl": 93.87813127040863, "learning_rate": 3.481615975003922e-07, "loss": 0.0939, "reward": 0.2149925670819357, "reward_std": 0.20749260939192027, "rewards/equation_reward_func": 0.2149925702251494, "rewards/format_reward_func": 0.0, "step": 176 }, { "completion_length": 615.1093888282776, "epoch": 4.3732503888024885, "grad_norm": 22.329519271850586, "kl": 87.78260296583176, "learning_rate": 3.448381136692089e-07, "loss": 0.0878, "reward": 0.21617560542654246, "reward_std": 0.20247984025627375, "rewards/equation_reward_func": 0.2161756035638973, "rewards/format_reward_func": 0.0, "step": 178 }, { "completion_length": 629.4829001426697, "epoch": 4.423017107309486, "grad_norm": 13.893996238708496, "kl": 98.21013808250427, "learning_rate": 3.4149493455847897e-07, "loss": 0.0982, "reward": 0.21152530901599675, "reward_std": 0.2093647257424891, "rewards/equation_reward_func": 0.21152530668769032, "rewards/format_reward_func": 0.0, "step": 180 }, { "completion_length": 623.7224802970886, "epoch": 4.472783825816485, "grad_norm": 7.4938130378723145, "kl": 149.59339570999146, "learning_rate": 3.3813275445496766e-07, "loss": 0.1496, "reward": 0.2145535812014714, "reward_std": 0.2063142586266622, "rewards/equation_reward_func": 0.214553578523919, "rewards/format_reward_func": 0.0, "step": 182 }, { "completion_length": 639.263400554657, "epoch": 4.522550544323484, "grad_norm": 6.325891494750977, "kl": 147.64970636367798, "learning_rate": 3.347522715914262e-07, "loss": 0.1476, "reward": 0.20923363824840635, "reward_std": 0.20685563085135072, "rewards/equation_reward_func": 0.20923363824840635, "rewards/format_reward_func": 0.0, "step": 184 }, { "completion_length": 636.6897439956665, "epoch": 4.572317262830482, "grad_norm": 4.635812759399414, "kl": 130.48132091760635, "learning_rate": 3.313541880015877e-07, "loss": 0.1305, "reward": 0.21598215226549655, "reward_std": 0.2006415540818125, "rewards/equation_reward_func": 0.21598214923869818, "rewards/format_reward_func": 0.0, "step": 186 }, { "completion_length": 631.9933152198792, "epoch": 4.6220839813374806, "grad_norm": 7.933198928833008, "kl": 118.75544810295105, "learning_rate": 3.279392093743747e-07, "loss": 0.1188, "reward": 0.22688244911842048, "reward_std": 0.22052743670064956, "rewards/equation_reward_func": 0.22688244772143662, "rewards/format_reward_func": 0.0, "step": 188 }, { "completion_length": 632.7038769721985, "epoch": 4.671850699844479, "grad_norm": 6.763364791870117, "kl": 112.75827008485794, "learning_rate": 3.245080449073459e-07, "loss": 0.1128, "reward": 0.2060937569476664, "reward_std": 0.20044768252409995, "rewards/equation_reward_func": 0.2060937574133277, "rewards/format_reward_func": 0.0, "step": 190 }, { "completion_length": 632.4464421272278, "epoch": 4.721617418351477, "grad_norm": 4.295353412628174, "kl": 108.82453501224518, "learning_rate": 3.210614071594162e-07, "loss": 0.1088, "reward": 0.20745536405593157, "reward_std": 0.21275918127503246, "rewards/equation_reward_func": 0.2074553637066856, "rewards/format_reward_func": 0.0, "step": 192 }, { "completion_length": 634.1763515472412, "epoch": 4.771384136858476, "grad_norm": 4.46217679977417, "kl": 118.317107796669, "learning_rate": 3.1760001190287695e-07, "loss": 0.1183, "reward": 0.20520090113859624, "reward_std": 0.2021206704666838, "rewards/equation_reward_func": 0.20520090113859624, "rewards/format_reward_func": 0.0, "step": 194 }, { "completion_length": 620.2395968437195, "epoch": 4.821150855365475, "grad_norm": 4.841196060180664, "kl": 119.24478554725647, "learning_rate": 3.141245779747502e-07, "loss": 0.1192, "reward": 0.21259673358872533, "reward_std": 0.21422103908844292, "rewards/equation_reward_func": 0.21259673358872533, "rewards/format_reward_func": 0.0, "step": 196 }, { "completion_length": 609.0446557998657, "epoch": 4.870917573872473, "grad_norm": 4.3330559730529785, "kl": 119.67610502243042, "learning_rate": 3.106358271275056e-07, "loss": 0.1197, "reward": 0.22683036630041897, "reward_std": 0.20717181416694075, "rewards/equation_reward_func": 0.22683036653324962, "rewards/format_reward_func": 0.0, "step": 198 }, { "completion_length": 614.8869152069092, "epoch": 4.920684292379471, "grad_norm": 92.09661102294922, "kl": 144.53644692897797, "learning_rate": 3.0713448387917227e-07, "loss": 0.1445, "reward": 0.21901042643003166, "reward_std": 0.20682094641961157, "rewards/equation_reward_func": 0.2190104245673865, "rewards/format_reward_func": 0.0, "step": 200 }, { "completion_length": 631.4241156578064, "epoch": 4.970451010886469, "grad_norm": 6.355322360992432, "kl": 154.4233751296997, "learning_rate": 3.0362127536287636e-07, "loss": 0.1544, "reward": 0.21773066406603903, "reward_std": 0.21250074298586696, "rewards/equation_reward_func": 0.2177306618541479, "rewards/format_reward_func": 0.0, "step": 202 }, { "completion_length": 624.7180488987973, "epoch": 5.0, "grad_norm": 5.770173072814941, "kl": 161.87928571199117, "learning_rate": 3.0009693117583523e-07, "loss": 0.0961, "reward": 0.21541354177813782, "reward_std": 0.20374000229333578, "rewards/equation_reward_func": 0.215413541386002, "rewards/format_reward_func": 0.0, "step": 204 }, { "completion_length": 624.5647420883179, "epoch": 5.049766718506999, "grad_norm": 6.884070873260498, "kl": 157.92570447921753, "learning_rate": 2.965621832278401e-07, "loss": 0.1579, "reward": 0.22669643780682236, "reward_std": 0.20801680884324014, "rewards/equation_reward_func": 0.22669643454719335, "rewards/format_reward_func": 0.0, "step": 206 }, { "completion_length": 614.1570081710815, "epoch": 5.099533437013997, "grad_norm": 4.670907497406006, "kl": 134.14546036720276, "learning_rate": 2.9301776558925875e-07, "loss": 0.1341, "reward": 0.2188244123244658, "reward_std": 0.20453347032889724, "rewards/equation_reward_func": 0.21882441325578839, "rewards/format_reward_func": 0.0, "step": 208 }, { "completion_length": 614.4702506065369, "epoch": 5.149300155520995, "grad_norm": 14.716873168945312, "kl": 109.80421262979507, "learning_rate": 2.894644143385885e-07, "loss": 0.1098, "reward": 0.21839286445174366, "reward_std": 0.20062782417517155, "rewards/equation_reward_func": 0.21839286398608238, "rewards/format_reward_func": 0.0, "step": 210 }, { "completion_length": 622.4672718048096, "epoch": 5.199066874027994, "grad_norm": 10.858051300048828, "kl": 114.28983092308044, "learning_rate": 2.859028674095937e-07, "loss": 0.1143, "reward": 0.2192782819038257, "reward_std": 0.2128367607947439, "rewards/equation_reward_func": 0.21927828167099506, "rewards/format_reward_func": 0.0, "step": 212 }, { "completion_length": 612.6160840988159, "epoch": 5.248833592534992, "grad_norm": 3.8785901069641113, "kl": 125.06462055444717, "learning_rate": 2.823338644380566e-07, "loss": 0.1251, "reward": 0.23020090232603252, "reward_std": 0.2176531965378672, "rewards/equation_reward_func": 0.23020089999772608, "rewards/format_reward_func": 0.0, "step": 214 }, { "completion_length": 635.8995633125305, "epoch": 5.298600311041991, "grad_norm": 5.062567234039307, "kl": 148.21274209022522, "learning_rate": 2.7875814660817504e-07, "loss": 0.1482, "reward": 0.2193973324028775, "reward_std": 0.22195886494591832, "rewards/equation_reward_func": 0.21939733054023236, "rewards/format_reward_func": 0.0, "step": 216 }, { "completion_length": 630.8229269981384, "epoch": 5.348367029548989, "grad_norm": 5.181402206420898, "kl": 165.8618984222412, "learning_rate": 2.751764564986396e-07, "loss": 0.1659, "reward": 0.2077009006170556, "reward_std": 0.2193935844115913, "rewards/equation_reward_func": 0.2077009001513943, "rewards/format_reward_func": 0.0, "step": 218 }, { "completion_length": 628.6517939567566, "epoch": 5.3981337480559874, "grad_norm": 4.105767726898193, "kl": 148.7712802886963, "learning_rate": 2.715895379284194e-07, "loss": 0.1488, "reward": 0.2191815583501011, "reward_std": 0.20989621221087873, "rewards/equation_reward_func": 0.21918155602179468, "rewards/format_reward_func": 0.0, "step": 220 }, { "completion_length": 629.8006067276001, "epoch": 5.447900466562986, "grad_norm": 3.895611524581909, "kl": 142.22095596790314, "learning_rate": 2.6799813580229174e-07, "loss": 0.1422, "reward": 0.22290923492982984, "reward_std": 0.21323461562860757, "rewards/equation_reward_func": 0.2229092346969992, "rewards/format_reward_func": 0.0, "step": 222 }, { "completion_length": 608.6183171272278, "epoch": 5.497667185069984, "grad_norm": 6.331876277923584, "kl": 135.1478552222252, "learning_rate": 2.6440299595614606e-07, "loss": 0.1351, "reward": 0.21991072362288833, "reward_std": 0.22133340197615325, "rewards/equation_reward_func": 0.21991072269156575, "rewards/format_reward_func": 0.0, "step": 224 }, { "completion_length": 611.6756086349487, "epoch": 5.547433903576983, "grad_norm": 3.41554594039917, "kl": 135.47022581100464, "learning_rate": 2.6080486500209347e-07, "loss": 0.1355, "reward": 0.21784971025772393, "reward_std": 0.21086209290660918, "rewards/equation_reward_func": 0.2178497090935707, "rewards/format_reward_func": 0.0, "step": 226 }, { "completion_length": 609.0922722816467, "epoch": 5.597200622083982, "grad_norm": 4.638352870941162, "kl": 149.68241280317307, "learning_rate": 2.572044901734166e-07, "loss": 0.1497, "reward": 0.22438989242073148, "reward_std": 0.2241612394573167, "rewards/equation_reward_func": 0.2243898919550702, "rewards/format_reward_func": 0.0, "step": 228 }, { "completion_length": 629.8534321784973, "epoch": 5.6469673405909795, "grad_norm": 4.474099159240723, "kl": 164.97060561180115, "learning_rate": 2.536026191693893e-07, "loss": 0.165, "reward": 0.2060565553838387, "reward_std": 0.21067888580728322, "rewards/equation_reward_func": 0.20605655445251614, "rewards/format_reward_func": 0.0, "step": 230 }, { "completion_length": 626.8482217788696, "epoch": 5.696734059097978, "grad_norm": 9.778329849243164, "kl": 169.21773087978363, "learning_rate": 2.5e-07, "loss": 0.1692, "reward": 0.20911459170747548, "reward_std": 0.21599237713962793, "rewards/equation_reward_func": 0.2091145912418142, "rewards/format_reward_func": 0.0, "step": 232 }, { "completion_length": 629.8660821914673, "epoch": 5.746500777604977, "grad_norm": 5.210114479064941, "kl": 171.0250325202942, "learning_rate": 2.4639738083061073e-07, "loss": 0.171, "reward": 0.2135788791347295, "reward_std": 0.20587447995785624, "rewards/equation_reward_func": 0.21357887890189886, "rewards/format_reward_func": 0.0, "step": 234 }, { "completion_length": 628.7165260314941, "epoch": 5.796267496111975, "grad_norm": 4.644392490386963, "kl": 149.7915449142456, "learning_rate": 2.4279550982658345e-07, "loss": 0.1498, "reward": 0.20833334070630372, "reward_std": 0.21195052459370345, "rewards/equation_reward_func": 0.20833334047347307, "rewards/format_reward_func": 0.0, "step": 236 }, { "completion_length": 628.755964756012, "epoch": 5.846034214618974, "grad_norm": 6.456798076629639, "kl": 442.08424025774, "learning_rate": 2.3919513499790646e-07, "loss": 0.4421, "reward": 0.22005209047347307, "reward_std": 0.21488765871617943, "rewards/equation_reward_func": 0.22005209024064243, "rewards/format_reward_func": 0.0, "step": 238 }, { "completion_length": 612.3988199234009, "epoch": 5.895800933125972, "grad_norm": 9.304161071777344, "kl": 118.21684062480927, "learning_rate": 2.3559700404385394e-07, "loss": 0.1182, "reward": 0.22447917505633086, "reward_std": 0.211615604814142, "rewards/equation_reward_func": 0.22447917482350022, "rewards/format_reward_func": 0.0, "step": 240 }, { "completion_length": 633.3660821914673, "epoch": 5.94556765163297, "grad_norm": 5.745642185211182, "kl": 133.20424818992615, "learning_rate": 2.3200186419770823e-07, "loss": 0.1332, "reward": 0.2242708442499861, "reward_std": 0.2152464333921671, "rewards/equation_reward_func": 0.22427084331866354, "rewards/format_reward_func": 0.0, "step": 242 }, { "completion_length": 618.1235270500183, "epoch": 5.995334370139969, "grad_norm": 4.167017936706543, "kl": 143.97905486822128, "learning_rate": 2.284104620715807e-07, "loss": 0.144, "reward": 0.22046875627711415, "reward_std": 0.21442426112480462, "rewards/equation_reward_func": 0.22046875732485205, "rewards/format_reward_func": 0.0, "step": 244 }, { "completion_length": 634.5175580476459, "epoch": 6.024883359253499, "grad_norm": 3.44785213470459, "kl": 167.55113441065737, "learning_rate": 2.2482354350136043e-07, "loss": 0.0995, "reward": 0.21961153769179395, "reward_std": 0.2146961924276854, "rewards/equation_reward_func": 0.21961153769179395, "rewards/format_reward_func": 0.0, "step": 246 }, { "completion_length": 634.5863180160522, "epoch": 6.074650077760498, "grad_norm": 7.954348564147949, "kl": 163.61565399169922, "learning_rate": 2.2124185339182496e-07, "loss": 0.1636, "reward": 0.23546131700277328, "reward_std": 0.2178129724925384, "rewards/equation_reward_func": 0.23546131781768054, "rewards/format_reward_func": 0.0, "step": 248 }, { "completion_length": 610.0825996398926, "epoch": 6.1244167962674965, "grad_norm": 4.648006439208984, "kl": 167.8152883052826, "learning_rate": 2.1766613556194344e-07, "loss": 0.1678, "reward": 0.22144346224376932, "reward_std": 0.21030379901640117, "rewards/equation_reward_func": 0.22144346177810803, "rewards/format_reward_func": 0.0, "step": 250 }, { "epoch": 6.1244167962674965, "step": 250, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 0.0058, "train_samples_per_second": 3851297.791, "train_steps_per_second": 17193.294 } ], "logging_steps": 2, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }