|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 6.1244167962674965, |
|
"eval_steps": 500, |
|
"global_step": 250, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"completion_length": 736.4702529907227, |
|
"epoch": 0.049766718506998445, |
|
"grad_norm": 0.2507069706916809, |
|
"kl": 0.0, |
|
"learning_rate": 7.142857142857142e-08, |
|
"loss": 0.0, |
|
"reward": 0.04415178840281442, |
|
"reward_std": 0.07034091584500857, |
|
"rewards/equation_reward_func": 0.04415178793715313, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 2 |
|
}, |
|
{ |
|
"completion_length": 723.1704015731812, |
|
"epoch": 0.09953343701399689, |
|
"grad_norm": 0.19884330034255981, |
|
"kl": 2.0936699339557663e-05, |
|
"learning_rate": 1.4285714285714285e-07, |
|
"loss": 0.0, |
|
"reward": 0.040647323767188936, |
|
"reward_std": 0.0637543131451821, |
|
"rewards/equation_reward_func": 0.04064732347615063, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 4 |
|
}, |
|
{ |
|
"completion_length": 726.5163822174072, |
|
"epoch": 0.14930015552099535, |
|
"grad_norm": 0.21145105361938477, |
|
"kl": 0.00019492170304147294, |
|
"learning_rate": 2.1428571428571426e-07, |
|
"loss": 0.0, |
|
"reward": 0.04095238326408435, |
|
"reward_std": 0.06441530691517983, |
|
"rewards/equation_reward_func": 0.040952383089461364, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 6 |
|
}, |
|
{ |
|
"completion_length": 737.8207015991211, |
|
"epoch": 0.19906687402799378, |
|
"grad_norm": 0.2020396590232849, |
|
"kl": 0.020334478189397487, |
|
"learning_rate": 2.857142857142857e-07, |
|
"loss": 0.0, |
|
"reward": 0.03635416827455629, |
|
"reward_std": 0.05670656039728783, |
|
"rewards/equation_reward_func": 0.03635416833276395, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 8 |
|
}, |
|
{ |
|
"completion_length": 718.5461435317993, |
|
"epoch": 0.24883359253499224, |
|
"grad_norm": 27.479997634887695, |
|
"kl": 9.990212610488015, |
|
"learning_rate": 3.5714285714285716e-07, |
|
"loss": 0.01, |
|
"reward": 0.04543898967676796, |
|
"reward_std": 0.07194261607946828, |
|
"rewards/equation_reward_func": 0.04543899020063691, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 10 |
|
}, |
|
{ |
|
"completion_length": 721.6964378356934, |
|
"epoch": 0.2986003110419907, |
|
"grad_norm": 0.19479116797447205, |
|
"kl": 0.005109551766508957, |
|
"learning_rate": 4.285714285714285e-07, |
|
"loss": 0.0, |
|
"reward": 0.04148065741173923, |
|
"reward_std": 0.0677548690000549, |
|
"rewards/equation_reward_func": 0.04148065741173923, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 12 |
|
}, |
|
{ |
|
"completion_length": 725.9003086090088, |
|
"epoch": 0.3483670295489891, |
|
"grad_norm": 0.24158786237239838, |
|
"kl": 0.502096803898894, |
|
"learning_rate": 5e-07, |
|
"loss": 0.0005, |
|
"reward": 0.04821428797731642, |
|
"reward_std": 0.07803500922454987, |
|
"rewards/equation_reward_func": 0.048214288559393026, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 14 |
|
}, |
|
{ |
|
"completion_length": 722.5781373977661, |
|
"epoch": 0.39813374805598756, |
|
"grad_norm": 0.290544331073761, |
|
"kl": 0.23321715661586495, |
|
"learning_rate": 4.999740409224932e-07, |
|
"loss": 0.0002, |
|
"reward": 0.05134672833082732, |
|
"reward_std": 0.07946415679180063, |
|
"rewards/equation_reward_func": 0.05134672856365796, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 16 |
|
}, |
|
{ |
|
"completion_length": 723.7433156967163, |
|
"epoch": 0.447900466562986, |
|
"grad_norm": 4.716856479644775, |
|
"kl": 0.8582769820350222, |
|
"learning_rate": 4.998961690809627e-07, |
|
"loss": 0.0009, |
|
"reward": 0.050446430934243836, |
|
"reward_std": 0.07721506280358881, |
|
"rewards/equation_reward_func": 0.050446430992451496, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 18 |
|
}, |
|
{ |
|
"completion_length": 729.6198072433472, |
|
"epoch": 0.4976671850699845, |
|
"grad_norm": 0.23771615326404572, |
|
"kl": 0.3220994914881885, |
|
"learning_rate": 4.997664006472578e-07, |
|
"loss": 0.0003, |
|
"reward": 0.045706847246037796, |
|
"reward_std": 0.07288302374945488, |
|
"rewards/equation_reward_func": 0.045706847246037796, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 20 |
|
}, |
|
{ |
|
"completion_length": 712.8631057739258, |
|
"epoch": 0.5474339035769828, |
|
"grad_norm": 0.3750320374965668, |
|
"kl": 0.27479040302569047, |
|
"learning_rate": 4.995847625707292e-07, |
|
"loss": 0.0003, |
|
"reward": 0.05489583619055338, |
|
"reward_std": 0.0833382241835352, |
|
"rewards/equation_reward_func": 0.054895836423384026, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 22 |
|
}, |
|
{ |
|
"completion_length": 731.4576015472412, |
|
"epoch": 0.5972006220839814, |
|
"grad_norm": 0.2089901864528656, |
|
"kl": 0.12606932656490244, |
|
"learning_rate": 4.993512925726318e-07, |
|
"loss": 0.0001, |
|
"reward": 0.0600520860607503, |
|
"reward_std": 0.0899482914537657, |
|
"rewards/equation_reward_func": 0.06005208553688135, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 24 |
|
}, |
|
{ |
|
"completion_length": 706.6056671142578, |
|
"epoch": 0.6469673405909798, |
|
"grad_norm": 0.17620234191417694, |
|
"kl": 0.1556346261058934, |
|
"learning_rate": 4.990660391382923e-07, |
|
"loss": 0.0002, |
|
"reward": 0.05229166932986118, |
|
"reward_std": 0.07350753628998064, |
|
"rewards/equation_reward_func": 0.05229166956269182, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 26 |
|
}, |
|
{ |
|
"completion_length": 727.1808137893677, |
|
"epoch": 0.6967340590979783, |
|
"grad_norm": 0.19479602575302124, |
|
"kl": 0.12168441573157907, |
|
"learning_rate": 4.987290615070384e-07, |
|
"loss": 0.0001, |
|
"reward": 0.053683038655435666, |
|
"reward_std": 0.08042177859169897, |
|
"rewards/equation_reward_func": 0.053683039121096954, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 28 |
|
}, |
|
{ |
|
"completion_length": 720.8891496658325, |
|
"epoch": 0.7465007776049767, |
|
"grad_norm": 0.1857473999261856, |
|
"kl": 0.1632600230514072, |
|
"learning_rate": 4.983404296598978e-07, |
|
"loss": 0.0002, |
|
"reward": 0.05391369271092117, |
|
"reward_std": 0.08413292915793136, |
|
"rewards/equation_reward_func": 0.053913692419882864, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 30 |
|
}, |
|
{ |
|
"completion_length": 720.4337940216064, |
|
"epoch": 0.7962674961119751, |
|
"grad_norm": 0.23092247545719147, |
|
"kl": 0.15535293571883813, |
|
"learning_rate": 4.979002243050646e-07, |
|
"loss": 0.0002, |
|
"reward": 0.05988095561042428, |
|
"reward_std": 0.09168167802272364, |
|
"rewards/equation_reward_func": 0.05988095601787791, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 32 |
|
}, |
|
{ |
|
"completion_length": 718.6845378875732, |
|
"epoch": 0.8460342146189735, |
|
"grad_norm": 0.23407958447933197, |
|
"kl": 0.25782948260894045, |
|
"learning_rate": 4.974085368611381e-07, |
|
"loss": 0.0003, |
|
"reward": 0.06691220620996319, |
|
"reward_std": 0.09768064138188493, |
|
"rewards/equation_reward_func": 0.06691220562788658, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 34 |
|
}, |
|
{ |
|
"completion_length": 718.0454006195068, |
|
"epoch": 0.895800933125972, |
|
"grad_norm": 0.3076172471046448, |
|
"kl": 0.2404527408652939, |
|
"learning_rate": 4.968654694381379e-07, |
|
"loss": 0.0002, |
|
"reward": 0.07349702704232186, |
|
"reward_std": 0.10955648736853618, |
|
"rewards/equation_reward_func": 0.07349702733336017, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 36 |
|
}, |
|
{ |
|
"completion_length": 704.1488237380981, |
|
"epoch": 0.9455676516329704, |
|
"grad_norm": 0.2561110258102417, |
|
"kl": 0.43795167771168053, |
|
"learning_rate": 4.962711348162987e-07, |
|
"loss": 0.0004, |
|
"reward": 0.06241815793327987, |
|
"reward_std": 0.09217380215704907, |
|
"rewards/equation_reward_func": 0.0624181583407335, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 38 |
|
}, |
|
{ |
|
"completion_length": 707.3921279907227, |
|
"epoch": 0.995334370139969, |
|
"grad_norm": 0.3400561511516571, |
|
"kl": 0.5494289128109813, |
|
"learning_rate": 4.956256564226487e-07, |
|
"loss": 0.0005, |
|
"reward": 0.0764508958091028, |
|
"reward_std": 0.11110821401234716, |
|
"rewards/equation_reward_func": 0.07645089708967134, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 40 |
|
}, |
|
{ |
|
"completion_length": 715.0272221156529, |
|
"epoch": 1.0497667185069985, |
|
"grad_norm": 0.26081565022468567, |
|
"kl": 0.4236157455614635, |
|
"learning_rate": 4.949291683053768e-07, |
|
"loss": 0.0005, |
|
"reward": 0.07186394860701902, |
|
"reward_std": 0.10362207902861494, |
|
"rewards/equation_reward_func": 0.07186394876667432, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 42 |
|
}, |
|
{ |
|
"completion_length": 714.9486722946167, |
|
"epoch": 1.0995334370139969, |
|
"grad_norm": 0.29378727078437805, |
|
"kl": 0.3755593653768301, |
|
"learning_rate": 4.941818151059955e-07, |
|
"loss": 0.0004, |
|
"reward": 0.0799404798890464, |
|
"reward_std": 0.11443577655882109, |
|
"rewards/equation_reward_func": 0.07994047965621576, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 44 |
|
}, |
|
{ |
|
"completion_length": 727.829628944397, |
|
"epoch": 1.1493001555209954, |
|
"grad_norm": 2045.599365234375, |
|
"kl": 128.7541933595203, |
|
"learning_rate": 4.933837520293017e-07, |
|
"loss": 0.1288, |
|
"reward": 0.06808780113351531, |
|
"reward_std": 0.09949399236938916, |
|
"rewards/equation_reward_func": 0.06808780090068467, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 46 |
|
}, |
|
{ |
|
"completion_length": 709.632453918457, |
|
"epoch": 1.1990668740279937, |
|
"grad_norm": 0.2698291838169098, |
|
"kl": 0.4989726666826755, |
|
"learning_rate": 4.925351448111454e-07, |
|
"loss": 0.0005, |
|
"reward": 0.09389881315291859, |
|
"reward_std": 0.13221543522377033, |
|
"rewards/equation_reward_func": 0.09389881303650327, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 48 |
|
}, |
|
{ |
|
"completion_length": 719.485878944397, |
|
"epoch": 1.2488335925349923, |
|
"grad_norm": 0.36381521821022034, |
|
"kl": 0.550471473718062, |
|
"learning_rate": 4.91636169684011e-07, |
|
"loss": 0.0006, |
|
"reward": 0.08360863462439738, |
|
"reward_std": 0.11854775344545487, |
|
"rewards/equation_reward_func": 0.08360863421694376, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 50 |
|
}, |
|
{ |
|
"completion_length": 725.6599855422974, |
|
"epoch": 1.2986003110419908, |
|
"grad_norm": 0.3374347686767578, |
|
"kl": 0.663099701050669, |
|
"learning_rate": 4.906870133404186e-07, |
|
"loss": 0.0007, |
|
"reward": 0.08503720644512214, |
|
"reward_std": 0.12180299674218986, |
|
"rewards/equation_reward_func": 0.0850372067943681, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 52 |
|
}, |
|
{ |
|
"completion_length": 723.972484588623, |
|
"epoch": 1.3483670295489891, |
|
"grad_norm": 1.0345810651779175, |
|
"kl": 0.9573397457133979, |
|
"learning_rate": 4.896878728941531e-07, |
|
"loss": 0.001, |
|
"reward": 0.09177827867097221, |
|
"reward_std": 0.12253864679951221, |
|
"rewards/equation_reward_func": 0.09177827744861133, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 54 |
|
}, |
|
{ |
|
"completion_length": 712.2269496917725, |
|
"epoch": 1.3981337480559874, |
|
"grad_norm": 0.27968963980674744, |
|
"kl": 0.8391579431481659, |
|
"learning_rate": 4.886389558393284e-07, |
|
"loss": 0.0008, |
|
"reward": 0.08570684934966266, |
|
"reward_std": 0.1181660912843654, |
|
"rewards/equation_reward_func": 0.08570684841834009, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 56 |
|
}, |
|
{ |
|
"completion_length": 730.5327529907227, |
|
"epoch": 1.447900466562986, |
|
"grad_norm": 0.28138798475265503, |
|
"kl": 0.9094656470697373, |
|
"learning_rate": 4.875404800072976e-07, |
|
"loss": 0.0009, |
|
"reward": 0.08794643338478636, |
|
"reward_std": 0.12104765651747584, |
|
"rewards/equation_reward_func": 0.08794643309374806, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 58 |
|
}, |
|
{ |
|
"completion_length": 732.3861742019653, |
|
"epoch": 1.4976671850699845, |
|
"grad_norm": 0.34412360191345215, |
|
"kl": 1.009782899171114, |
|
"learning_rate": 4.86392673521415e-07, |
|
"loss": 0.001, |
|
"reward": 0.10000744601711631, |
|
"reward_std": 0.13957228315121029, |
|
"rewards/equation_reward_func": 0.10000744566787034, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 60 |
|
}, |
|
{ |
|
"completion_length": 725.0677175521851, |
|
"epoch": 1.5474339035769828, |
|
"grad_norm": 0.3454972207546234, |
|
"kl": 1.0763904643245041, |
|
"learning_rate": 4.851957747496606e-07, |
|
"loss": 0.0011, |
|
"reward": 0.10212798128486611, |
|
"reward_std": 0.13816983328433707, |
|
"rewards/equation_reward_func": 0.10212798012071289, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 62 |
|
}, |
|
{ |
|
"completion_length": 730.5171251296997, |
|
"epoch": 1.5972006220839814, |
|
"grad_norm": 0.3473067581653595, |
|
"kl": 1.4565551071427763, |
|
"learning_rate": 4.839500322551386e-07, |
|
"loss": 0.0015, |
|
"reward": 0.10485119439545088, |
|
"reward_std": 0.14129075466189533, |
|
"rewards/equation_reward_func": 0.10485119334771298, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 64 |
|
}, |
|
{ |
|
"completion_length": 735.0320043563843, |
|
"epoch": 1.64696734059098, |
|
"grad_norm": 0.3159619867801666, |
|
"kl": 1.5041364189237356, |
|
"learning_rate": 4.826557047444563e-07, |
|
"loss": 0.0015, |
|
"reward": 0.10093006424722262, |
|
"reward_std": 0.13811934839759488, |
|
"rewards/equation_reward_func": 0.1009300641308073, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 66 |
|
}, |
|
{ |
|
"completion_length": 730.7455463409424, |
|
"epoch": 1.6967340590979783, |
|
"grad_norm": 1.146909236907959, |
|
"kl": 2.238507369533181, |
|
"learning_rate": 4.813130610139993e-07, |
|
"loss": 0.0022, |
|
"reward": 0.10973958898102865, |
|
"reward_std": 0.13851106038782746, |
|
"rewards/equation_reward_func": 0.10973958781687543, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 68 |
|
}, |
|
{ |
|
"completion_length": 712.6971893310547, |
|
"epoch": 1.7465007776049766, |
|
"grad_norm": 7.27742338180542, |
|
"kl": 3.2542791040614247, |
|
"learning_rate": 4.799223798941089e-07, |
|
"loss": 0.0033, |
|
"reward": 0.12900298138265498, |
|
"reward_std": 0.15667404458508827, |
|
"rewards/equation_reward_func": 0.1290029831288848, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 70 |
|
}, |
|
{ |
|
"completion_length": 729.6331987380981, |
|
"epoch": 1.7962674961119751, |
|
"grad_norm": 10.986953735351562, |
|
"kl": 4.106183127500117, |
|
"learning_rate": 4.78483950191177e-07, |
|
"loss": 0.0041, |
|
"reward": 0.12543899397132918, |
|
"reward_std": 0.16567694948753342, |
|
"rewards/equation_reward_func": 0.12543899344746023, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 72 |
|
}, |
|
{ |
|
"completion_length": 737.0245656967163, |
|
"epoch": 1.8460342146189737, |
|
"grad_norm": 1.6122727394104004, |
|
"kl": 3.731540434062481, |
|
"learning_rate": 4.769980706276687e-07, |
|
"loss": 0.0037, |
|
"reward": 0.12507440976332873, |
|
"reward_std": 0.159569505834952, |
|
"rewards/equation_reward_func": 0.12507440929766744, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 74 |
|
}, |
|
{ |
|
"completion_length": 729.0632581710815, |
|
"epoch": 1.895800933125972, |
|
"grad_norm": 0.5852969288825989, |
|
"kl": 2.9793617641553283, |
|
"learning_rate": 4.7546504978008595e-07, |
|
"loss": 0.003, |
|
"reward": 0.12817708833608776, |
|
"reward_std": 0.1600989469443448, |
|
"rewards/equation_reward_func": 0.1281770879868418, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 76 |
|
}, |
|
{ |
|
"completion_length": 734.6302223205566, |
|
"epoch": 1.9455676516329703, |
|
"grad_norm": 0.9090600609779358, |
|
"kl": 3.139740688726306, |
|
"learning_rate": 4.738852060148848e-07, |
|
"loss": 0.0031, |
|
"reward": 0.13495536311529577, |
|
"reward_std": 0.1720278718858026, |
|
"rewards/equation_reward_func": 0.13495536299888045, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 78 |
|
}, |
|
{ |
|
"completion_length": 742.833345413208, |
|
"epoch": 1.995334370139969, |
|
"grad_norm": 0.5681818723678589, |
|
"kl": 3.712686972692609, |
|
"learning_rate": 4.722588674223593e-07, |
|
"loss": 0.0037, |
|
"reward": 0.13085565919755027, |
|
"reward_std": 0.15991040458902717, |
|
"rewards/equation_reward_func": 0.1308556593139656, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 80 |
|
}, |
|
{ |
|
"completion_length": 717.2042718184622, |
|
"epoch": 2.0248833592534994, |
|
"grad_norm": 1.5164953470230103, |
|
"kl": 5.466580171334116, |
|
"learning_rate": 4.70586371748506e-07, |
|
"loss": 0.0032, |
|
"reward": 0.14641604347056464, |
|
"reward_std": 0.18159407436063416, |
|
"rewards/equation_reward_func": 0.1464160444509042, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 82 |
|
}, |
|
{ |
|
"completion_length": 730.2589464187622, |
|
"epoch": 2.0746500777604977, |
|
"grad_norm": 0.6375504732131958, |
|
"kl": 4.280845553614199, |
|
"learning_rate": 4.6886806632488363e-07, |
|
"loss": 0.0043, |
|
"reward": 0.14213542238576338, |
|
"reward_std": 0.1740714008337818, |
|
"rewards/equation_reward_func": 0.14213542168727145, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 84 |
|
}, |
|
{ |
|
"completion_length": 744.4538831710815, |
|
"epoch": 2.124416796267496, |
|
"grad_norm": 0.9480769038200378, |
|
"kl": 7.16812994517386, |
|
"learning_rate": 4.6710430799648143e-07, |
|
"loss": 0.0072, |
|
"reward": 0.12831845637992956, |
|
"reward_std": 0.1582361755426973, |
|
"rewards/equation_reward_func": 0.12831845649634488, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 86 |
|
}, |
|
{ |
|
"completion_length": 732.5520973205566, |
|
"epoch": 2.1741835147744943, |
|
"grad_norm": 16.496623992919922, |
|
"kl": 10.49539315700531, |
|
"learning_rate": 4.652954630476127e-07, |
|
"loss": 0.0105, |
|
"reward": 0.14677828032290563, |
|
"reward_std": 0.1764058277476579, |
|
"rewards/equation_reward_func": 0.1467782796244137, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 88 |
|
}, |
|
{ |
|
"completion_length": 736.1361722946167, |
|
"epoch": 2.223950233281493, |
|
"grad_norm": 2.352017879486084, |
|
"kl": 10.109702784568071, |
|
"learning_rate": 4.6344190712584713e-07, |
|
"loss": 0.0101, |
|
"reward": 0.13781250565079972, |
|
"reward_std": 0.1627702646655962, |
|
"rewards/equation_reward_func": 0.13781250413740054, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 90 |
|
}, |
|
{ |
|
"completion_length": 749.1317129135132, |
|
"epoch": 2.2737169517884914, |
|
"grad_norm": 3.804121255874634, |
|
"kl": 15.052036292850971, |
|
"learning_rate": 4.615440251639995e-07, |
|
"loss": 0.0151, |
|
"reward": 0.14105655340244994, |
|
"reward_std": 0.17247924709226936, |
|
"rewards/equation_reward_func": 0.14105655369348824, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 92 |
|
}, |
|
{ |
|
"completion_length": 717.3884019851685, |
|
"epoch": 2.3234836702954897, |
|
"grad_norm": 2.226238489151001, |
|
"kl": 12.018643591552973, |
|
"learning_rate": 4.596022113001894e-07, |
|
"loss": 0.012, |
|
"reward": 0.15741816238733009, |
|
"reward_std": 0.17923290858743712, |
|
"rewards/equation_reward_func": 0.15741816128138453, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 94 |
|
}, |
|
{ |
|
"completion_length": 726.2500143051147, |
|
"epoch": 2.3732503888024885, |
|
"grad_norm": 2.1459925174713135, |
|
"kl": 12.27118530496955, |
|
"learning_rate": 4.576168687959895e-07, |
|
"loss": 0.0123, |
|
"reward": 0.16154762578662485, |
|
"reward_std": 0.18940409342758358, |
|
"rewards/equation_reward_func": 0.16154762508813292, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 96 |
|
}, |
|
{ |
|
"completion_length": 711.6696538925171, |
|
"epoch": 2.423017107309487, |
|
"grad_norm": 1.4883497953414917, |
|
"kl": 15.596692271530628, |
|
"learning_rate": 4.555884099526793e-07, |
|
"loss": 0.0156, |
|
"reward": 0.15925595845328644, |
|
"reward_std": 0.1815938005456701, |
|
"rewards/equation_reward_func": 0.1592559577547945, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 98 |
|
}, |
|
{ |
|
"completion_length": 719.6242723464966, |
|
"epoch": 2.472783825816485, |
|
"grad_norm": 4.10906982421875, |
|
"kl": 17.258602559566498, |
|
"learning_rate": 4.5351725602562174e-07, |
|
"loss": 0.0173, |
|
"reward": 0.17212054354604334, |
|
"reward_std": 0.18435519566992298, |
|
"rewards/equation_reward_func": 0.17212054308038205, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 100 |
|
}, |
|
{ |
|
"completion_length": 697.6637020111084, |
|
"epoch": 2.522550544323484, |
|
"grad_norm": 1.1079808473587036, |
|
"kl": 14.344636462628841, |
|
"learning_rate": 4.514038371367791e-07, |
|
"loss": 0.0143, |
|
"reward": 0.17430060362676159, |
|
"reward_std": 0.19522728596348315, |
|
"rewards/equation_reward_func": 0.17430060246260837, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 102 |
|
}, |
|
{ |
|
"completion_length": 695.2105755805969, |
|
"epoch": 2.5723172628304822, |
|
"grad_norm": 1.298901081085205, |
|
"kl": 15.563006613403559, |
|
"learning_rate": 4.4924859218538936e-07, |
|
"loss": 0.0156, |
|
"reward": 0.17871280398685485, |
|
"reward_std": 0.19645729020703584, |
|
"rewards/equation_reward_func": 0.17871280352119356, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 104 |
|
}, |
|
{ |
|
"completion_length": 687.2507581710815, |
|
"epoch": 2.6220839813374806, |
|
"grad_norm": 1.333657145500183, |
|
"kl": 14.787582196295261, |
|
"learning_rate": 4.470519687568185e-07, |
|
"loss": 0.0148, |
|
"reward": 0.19031250709667802, |
|
"reward_std": 0.2006249635014683, |
|
"rewards/equation_reward_func": 0.19031250721309334, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 106 |
|
}, |
|
{ |
|
"completion_length": 672.3839402198792, |
|
"epoch": 2.671850699844479, |
|
"grad_norm": 1.4585353136062622, |
|
"kl": 20.08526621758938, |
|
"learning_rate": 4.4481442302960923e-07, |
|
"loss": 0.0201, |
|
"reward": 0.18158482806757092, |
|
"reward_std": 0.1955818484420888, |
|
"rewards/equation_reward_func": 0.18158482783474028, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 108 |
|
}, |
|
{ |
|
"completion_length": 651.4077491760254, |
|
"epoch": 2.721617418351477, |
|
"grad_norm": 1.516221523284912, |
|
"kl": 17.027776926755905, |
|
"learning_rate": 4.4253641968074505e-07, |
|
"loss": 0.017, |
|
"reward": 0.1995759003330022, |
|
"reward_std": 0.21349556557834148, |
|
"rewards/equation_reward_func": 0.19957590056583285, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 110 |
|
}, |
|
{ |
|
"completion_length": 672.9442043304443, |
|
"epoch": 2.771384136858476, |
|
"grad_norm": 2.0658159255981445, |
|
"kl": 20.176754418760538, |
|
"learning_rate": 4.402184317891501e-07, |
|
"loss": 0.0202, |
|
"reward": 0.20375744753982872, |
|
"reward_std": 0.18776777852326632, |
|
"rewards/equation_reward_func": 0.2037574463756755, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 112 |
|
}, |
|
{ |
|
"completion_length": 665.7247114181519, |
|
"epoch": 2.8211508553654743, |
|
"grad_norm": 2.339445114135742, |
|
"kl": 22.64492540061474, |
|
"learning_rate": 4.37860940737443e-07, |
|
"loss": 0.0226, |
|
"reward": 0.1926413766341284, |
|
"reward_std": 0.2001927924575284, |
|
"rewards/equation_reward_func": 0.19264137593563646, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 114 |
|
}, |
|
{ |
|
"completion_length": 669.665937423706, |
|
"epoch": 2.8709175738724726, |
|
"grad_norm": 2.852607011795044, |
|
"kl": 32.22943264245987, |
|
"learning_rate": 4.354644361119671e-07, |
|
"loss": 0.0322, |
|
"reward": 0.19950893591158092, |
|
"reward_std": 0.1933421454159543, |
|
"rewards/equation_reward_func": 0.19950893614441156, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 116 |
|
}, |
|
{ |
|
"completion_length": 670.7053713798523, |
|
"epoch": 2.9206842923794714, |
|
"grad_norm": 2.6619129180908203, |
|
"kl": 27.73328886926174, |
|
"learning_rate": 4.3302941560111716e-07, |
|
"loss": 0.0277, |
|
"reward": 0.19388393545523286, |
|
"reward_std": 0.19777346146292984, |
|
"rewards/equation_reward_func": 0.1938839361537248, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 118 |
|
}, |
|
{ |
|
"completion_length": 676.3571548461914, |
|
"epoch": 2.9704510108864697, |
|
"grad_norm": 3.816153049468994, |
|
"kl": 27.2223904132843, |
|
"learning_rate": 4.3055638489198236e-07, |
|
"loss": 0.0272, |
|
"reward": 0.20729167491663247, |
|
"reward_std": 0.20934273721650243, |
|
"rewards/equation_reward_func": 0.20729167328681797, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 120 |
|
}, |
|
{ |
|
"completion_length": 659.7907361482319, |
|
"epoch": 3.0, |
|
"grad_norm": 0.624527633190155, |
|
"kl": 27.528421577654388, |
|
"learning_rate": 4.280458575653296e-07, |
|
"loss": 0.0163, |
|
"reward": 0.20659148869545838, |
|
"reward_std": 0.19081004316869535, |
|
"rewards/equation_reward_func": 0.20659148947973, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 122 |
|
}, |
|
{ |
|
"completion_length": 659.4025421142578, |
|
"epoch": 3.0497667185069983, |
|
"grad_norm": 3.345853567123413, |
|
"kl": 21.34368522465229, |
|
"learning_rate": 4.2549835498894665e-07, |
|
"loss": 0.0213, |
|
"reward": 0.22118304355535656, |
|
"reward_std": 0.21869899448938668, |
|
"rewards/equation_reward_func": 0.22118304437026381, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 124 |
|
}, |
|
{ |
|
"completion_length": 672.1183128356934, |
|
"epoch": 3.099533437013997, |
|
"grad_norm": 6.106723785400391, |
|
"kl": 23.556977652013302, |
|
"learning_rate": 4.229144062093679e-07, |
|
"loss": 0.0236, |
|
"reward": 0.21467262762598693, |
|
"reward_std": 0.2053254572674632, |
|
"rewards/equation_reward_func": 0.21467262762598693, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 126 |
|
}, |
|
{ |
|
"completion_length": 653.0297751426697, |
|
"epoch": 3.1493001555209954, |
|
"grad_norm": 5.746135234832764, |
|
"kl": 26.1618300229311, |
|
"learning_rate": 4.2029454784200675e-07, |
|
"loss": 0.0262, |
|
"reward": 0.21742560202255845, |
|
"reward_std": 0.2172505116323009, |
|
"rewards/equation_reward_func": 0.217425603303127, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 128 |
|
}, |
|
{ |
|
"completion_length": 645.058048248291, |
|
"epoch": 3.1990668740279937, |
|
"grad_norm": 60.6376953125, |
|
"kl": 53.1397475451231, |
|
"learning_rate": 4.1763932395971433e-07, |
|
"loss": 0.0531, |
|
"reward": 0.2241517937509343, |
|
"reward_std": 0.20952896296512336, |
|
"rewards/equation_reward_func": 0.22415179491508752, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 130 |
|
}, |
|
{ |
|
"completion_length": 632.6659345626831, |
|
"epoch": 3.248833592534992, |
|
"grad_norm": 5.82427978515625, |
|
"kl": 41.686398059129715, |
|
"learning_rate": 4.1494928597979117e-07, |
|
"loss": 0.0417, |
|
"reward": 0.22440477029886097, |
|
"reward_std": 0.2128691952675581, |
|
"rewards/equation_reward_func": 0.22440477076452225, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 132 |
|
}, |
|
{ |
|
"completion_length": 639.6711411476135, |
|
"epoch": 3.298600311041991, |
|
"grad_norm": 3.375183343887329, |
|
"kl": 36.797510489821434, |
|
"learning_rate": 4.122249925494726e-07, |
|
"loss": 0.0368, |
|
"reward": 0.2161235201638192, |
|
"reward_std": 0.20362528192345053, |
|
"rewards/equation_reward_func": 0.21612352062948048, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 134 |
|
}, |
|
{ |
|
"completion_length": 651.2276935577393, |
|
"epoch": 3.348367029548989, |
|
"grad_norm": 5.04212760925293, |
|
"kl": 37.60325849056244, |
|
"learning_rate": 4.094670094299131e-07, |
|
"loss": 0.0376, |
|
"reward": 0.22996280749794096, |
|
"reward_std": 0.214357816032134, |
|
"rewards/equation_reward_func": 0.22996280703227967, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 136 |
|
}, |
|
{ |
|
"completion_length": 631.5751585960388, |
|
"epoch": 3.3981337480559874, |
|
"grad_norm": 4.119243144989014, |
|
"kl": 43.57139265537262, |
|
"learning_rate": 4.066759093786931e-07, |
|
"loss": 0.0436, |
|
"reward": 0.2285714359022677, |
|
"reward_std": 0.21766341011971235, |
|
"rewards/equation_reward_func": 0.22857143532019109, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 138 |
|
}, |
|
{ |
|
"completion_length": 647.8214359283447, |
|
"epoch": 3.447900466562986, |
|
"grad_norm": 7.117722988128662, |
|
"kl": 60.4551947414875, |
|
"learning_rate": 4.038522720308732e-07, |
|
"loss": 0.0605, |
|
"reward": 0.21806548640597612, |
|
"reward_std": 0.20702184177935123, |
|
"rewards/equation_reward_func": 0.2180654831463471, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 140 |
|
}, |
|
{ |
|
"completion_length": 609.9583463668823, |
|
"epoch": 3.4976671850699845, |
|
"grad_norm": 4.748437881469727, |
|
"kl": 58.59304141998291, |
|
"learning_rate": 4.009966837786194e-07, |
|
"loss": 0.0586, |
|
"reward": 0.2300297737820074, |
|
"reward_std": 0.20853826915845275, |
|
"rewards/equation_reward_func": 0.23002976982388645, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 142 |
|
}, |
|
{ |
|
"completion_length": 631.8430180549622, |
|
"epoch": 3.547433903576983, |
|
"grad_norm": 8.042330741882324, |
|
"kl": 82.30807757377625, |
|
"learning_rate": 3.981097376494259e-07, |
|
"loss": 0.0823, |
|
"reward": 0.21836310264188796, |
|
"reward_std": 0.20933940180111676, |
|
"rewards/equation_reward_func": 0.21836310101207346, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 144 |
|
}, |
|
{ |
|
"completion_length": 624.0669736862183, |
|
"epoch": 3.5972006220839816, |
|
"grad_norm": 7.811219692230225, |
|
"kl": 77.89375275373459, |
|
"learning_rate": 3.951920331829592e-07, |
|
"loss": 0.0779, |
|
"reward": 0.2207961401436478, |
|
"reward_std": 0.21105306909885257, |
|
"rewards/equation_reward_func": 0.22079613932874054, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 146 |
|
}, |
|
{ |
|
"completion_length": 623.5215888023376, |
|
"epoch": 3.64696734059098, |
|
"grad_norm": 8.836230278015137, |
|
"kl": 65.97143815457821, |
|
"learning_rate": 3.922441763065506e-07, |
|
"loss": 0.066, |
|
"reward": 0.2193824496353045, |
|
"reward_std": 0.20604081987403333, |
|
"rewards/equation_reward_func": 0.21938244777265936, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 148 |
|
}, |
|
{ |
|
"completion_length": 634.7611751556396, |
|
"epoch": 3.6967340590979783, |
|
"grad_norm": 5.354574680328369, |
|
"kl": 56.36278319358826, |
|
"learning_rate": 3.8926677920936093e-07, |
|
"loss": 0.0564, |
|
"reward": 0.2112648879410699, |
|
"reward_std": 0.2029515573522076, |
|
"rewards/equation_reward_func": 0.21126488805748522, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 150 |
|
}, |
|
{ |
|
"completion_length": 636.0297775268555, |
|
"epoch": 3.7465007776049766, |
|
"grad_norm": 5.276882648468018, |
|
"kl": 65.72037261724472, |
|
"learning_rate": 3.862604602152464e-07, |
|
"loss": 0.0657, |
|
"reward": 0.20753721124492586, |
|
"reward_std": 0.20195745571982116, |
|
"rewards/equation_reward_func": 0.20753721171058714, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 152 |
|
}, |
|
{ |
|
"completion_length": 634.954626083374, |
|
"epoch": 3.796267496111975, |
|
"grad_norm": 8.027347564697266, |
|
"kl": 77.93326985836029, |
|
"learning_rate": 3.8322584365434934e-07, |
|
"loss": 0.0779, |
|
"reward": 0.2165699511533603, |
|
"reward_std": 0.2101849897298962, |
|
"rewards/equation_reward_func": 0.2165699495235458, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 154 |
|
}, |
|
{ |
|
"completion_length": 638.3660817146301, |
|
"epoch": 3.8460342146189737, |
|
"grad_norm": 4.954690456390381, |
|
"kl": 83.4894488453865, |
|
"learning_rate": 3.8016355973344173e-07, |
|
"loss": 0.0835, |
|
"reward": 0.21200893796049058, |
|
"reward_std": 0.21022081119008362, |
|
"rewards/equation_reward_func": 0.21200893679633737, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 156 |
|
}, |
|
{ |
|
"completion_length": 620.3281378746033, |
|
"epoch": 3.895800933125972, |
|
"grad_norm": 4.270212650299072, |
|
"kl": 82.2349089384079, |
|
"learning_rate": 3.7707424440504863e-07, |
|
"loss": 0.0822, |
|
"reward": 0.211755960714072, |
|
"reward_std": 0.20715959300287068, |
|
"rewards/equation_reward_func": 0.21175595885142684, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 158 |
|
}, |
|
{ |
|
"completion_length": 632.0409350395203, |
|
"epoch": 3.9455676516329703, |
|
"grad_norm": 4.687271595001221, |
|
"kl": 90.35439342260361, |
|
"learning_rate": 3.739585392353787e-07, |
|
"loss": 0.0904, |
|
"reward": 0.21921131818089634, |
|
"reward_std": 0.20252067118417472, |
|
"rewards/equation_reward_func": 0.21921131608542055, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 160 |
|
}, |
|
{ |
|
"completion_length": 630.2678661346436, |
|
"epoch": 3.995334370139969, |
|
"grad_norm": 5.595997333526611, |
|
"kl": 95.46352458000183, |
|
"learning_rate": 3.7081709127108767e-07, |
|
"loss": 0.0955, |
|
"reward": 0.22013393603265285, |
|
"reward_std": 0.2177246706560254, |
|
"rewards/equation_reward_func": 0.2201339368475601, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 162 |
|
}, |
|
{ |
|
"completion_length": 632.1065288342928, |
|
"epoch": 4.024883359253499, |
|
"grad_norm": 8.787236213684082, |
|
"kl": 144.07192611694336, |
|
"learning_rate": 3.6765055290490513e-07, |
|
"loss": 0.0855, |
|
"reward": 0.20649123721216855, |
|
"reward_std": 0.21240881752026708, |
|
"rewards/equation_reward_func": 0.2064912359377271, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 164 |
|
}, |
|
{ |
|
"completion_length": 619.5156345367432, |
|
"epoch": 4.074650077760498, |
|
"grad_norm": 7.552036762237549, |
|
"kl": 137.199125289917, |
|
"learning_rate": 3.644595817401501e-07, |
|
"loss": 0.1372, |
|
"reward": 0.2162797685014084, |
|
"reward_std": 0.21547920361626893, |
|
"rewards/equation_reward_func": 0.2162797685014084, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 166 |
|
}, |
|
{ |
|
"completion_length": 618.7634057998657, |
|
"epoch": 4.1244167962674965, |
|
"grad_norm": 6.8007354736328125, |
|
"kl": 103.6235063970089, |
|
"learning_rate": 3.6124484045416483e-07, |
|
"loss": 0.1036, |
|
"reward": 0.23168899782467633, |
|
"reward_std": 0.21457487577572465, |
|
"rewards/equation_reward_func": 0.23168899829033762, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 168 |
|
}, |
|
{ |
|
"completion_length": 637.4136991500854, |
|
"epoch": 4.174183514774494, |
|
"grad_norm": 8.004964828491211, |
|
"kl": 113.37393373250961, |
|
"learning_rate": 3.580069966606949e-07, |
|
"loss": 0.1134, |
|
"reward": 0.21156250836793333, |
|
"reward_std": 0.2123116059228778, |
|
"rewards/equation_reward_func": 0.21156250790227205, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 170 |
|
}, |
|
{ |
|
"completion_length": 634.7485208511353, |
|
"epoch": 4.223950233281493, |
|
"grad_norm": 7.898318290710449, |
|
"kl": 109.72896337509155, |
|
"learning_rate": 3.547467227712444e-07, |
|
"loss": 0.1097, |
|
"reward": 0.2029910811688751, |
|
"reward_std": 0.20662414643447846, |
|
"rewards/equation_reward_func": 0.20299108081962913, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 172 |
|
}, |
|
{ |
|
"completion_length": 621.2730751037598, |
|
"epoch": 4.273716951788492, |
|
"grad_norm": 7.211435317993164, |
|
"kl": 99.61057341098785, |
|
"learning_rate": 3.5146469585543386e-07, |
|
"loss": 0.0996, |
|
"reward": 0.22819941327907145, |
|
"reward_std": 0.2186455992050469, |
|
"rewards/equation_reward_func": 0.22819941234774888, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 174 |
|
}, |
|
{ |
|
"completion_length": 640.9628086090088, |
|
"epoch": 4.32348367029549, |
|
"grad_norm": 7.790672302246094, |
|
"kl": 93.87813127040863, |
|
"learning_rate": 3.481615975003922e-07, |
|
"loss": 0.0939, |
|
"reward": 0.2149925670819357, |
|
"reward_std": 0.20749260939192027, |
|
"rewards/equation_reward_func": 0.2149925702251494, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 176 |
|
}, |
|
{ |
|
"completion_length": 615.1093888282776, |
|
"epoch": 4.3732503888024885, |
|
"grad_norm": 22.329519271850586, |
|
"kl": 87.78260296583176, |
|
"learning_rate": 3.448381136692089e-07, |
|
"loss": 0.0878, |
|
"reward": 0.21617560542654246, |
|
"reward_std": 0.20247984025627375, |
|
"rewards/equation_reward_func": 0.2161756035638973, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 178 |
|
}, |
|
{ |
|
"completion_length": 629.4829001426697, |
|
"epoch": 4.423017107309486, |
|
"grad_norm": 13.893996238708496, |
|
"kl": 98.21013808250427, |
|
"learning_rate": 3.4149493455847897e-07, |
|
"loss": 0.0982, |
|
"reward": 0.21152530901599675, |
|
"reward_std": 0.2093647257424891, |
|
"rewards/equation_reward_func": 0.21152530668769032, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 180 |
|
}, |
|
{ |
|
"completion_length": 623.7224802970886, |
|
"epoch": 4.472783825816485, |
|
"grad_norm": 7.4938130378723145, |
|
"kl": 149.59339570999146, |
|
"learning_rate": 3.3813275445496766e-07, |
|
"loss": 0.1496, |
|
"reward": 0.2145535812014714, |
|
"reward_std": 0.2063142586266622, |
|
"rewards/equation_reward_func": 0.214553578523919, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 182 |
|
}, |
|
{ |
|
"completion_length": 639.263400554657, |
|
"epoch": 4.522550544323484, |
|
"grad_norm": 6.325891494750977, |
|
"kl": 147.64970636367798, |
|
"learning_rate": 3.347522715914262e-07, |
|
"loss": 0.1476, |
|
"reward": 0.20923363824840635, |
|
"reward_std": 0.20685563085135072, |
|
"rewards/equation_reward_func": 0.20923363824840635, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 184 |
|
}, |
|
{ |
|
"completion_length": 636.6897439956665, |
|
"epoch": 4.572317262830482, |
|
"grad_norm": 4.635812759399414, |
|
"kl": 130.48132091760635, |
|
"learning_rate": 3.313541880015877e-07, |
|
"loss": 0.1305, |
|
"reward": 0.21598215226549655, |
|
"reward_std": 0.2006415540818125, |
|
"rewards/equation_reward_func": 0.21598214923869818, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 186 |
|
}, |
|
{ |
|
"completion_length": 631.9933152198792, |
|
"epoch": 4.6220839813374806, |
|
"grad_norm": 7.933198928833008, |
|
"kl": 118.75544810295105, |
|
"learning_rate": 3.279392093743747e-07, |
|
"loss": 0.1188, |
|
"reward": 0.22688244911842048, |
|
"reward_std": 0.22052743670064956, |
|
"rewards/equation_reward_func": 0.22688244772143662, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 188 |
|
}, |
|
{ |
|
"completion_length": 632.7038769721985, |
|
"epoch": 4.671850699844479, |
|
"grad_norm": 6.763364791870117, |
|
"kl": 112.75827008485794, |
|
"learning_rate": 3.245080449073459e-07, |
|
"loss": 0.1128, |
|
"reward": 0.2060937569476664, |
|
"reward_std": 0.20044768252409995, |
|
"rewards/equation_reward_func": 0.2060937574133277, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 190 |
|
}, |
|
{ |
|
"completion_length": 632.4464421272278, |
|
"epoch": 4.721617418351477, |
|
"grad_norm": 4.295353412628174, |
|
"kl": 108.82453501224518, |
|
"learning_rate": 3.210614071594162e-07, |
|
"loss": 0.1088, |
|
"reward": 0.20745536405593157, |
|
"reward_std": 0.21275918127503246, |
|
"rewards/equation_reward_func": 0.2074553637066856, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 192 |
|
}, |
|
{ |
|
"completion_length": 634.1763515472412, |
|
"epoch": 4.771384136858476, |
|
"grad_norm": 4.46217679977417, |
|
"kl": 118.317107796669, |
|
"learning_rate": 3.1760001190287695e-07, |
|
"loss": 0.1183, |
|
"reward": 0.20520090113859624, |
|
"reward_std": 0.2021206704666838, |
|
"rewards/equation_reward_func": 0.20520090113859624, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 194 |
|
}, |
|
{ |
|
"completion_length": 620.2395968437195, |
|
"epoch": 4.821150855365475, |
|
"grad_norm": 4.841196060180664, |
|
"kl": 119.24478554725647, |
|
"learning_rate": 3.141245779747502e-07, |
|
"loss": 0.1192, |
|
"reward": 0.21259673358872533, |
|
"reward_std": 0.21422103908844292, |
|
"rewards/equation_reward_func": 0.21259673358872533, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 196 |
|
}, |
|
{ |
|
"completion_length": 609.0446557998657, |
|
"epoch": 4.870917573872473, |
|
"grad_norm": 4.3330559730529785, |
|
"kl": 119.67610502243042, |
|
"learning_rate": 3.106358271275056e-07, |
|
"loss": 0.1197, |
|
"reward": 0.22683036630041897, |
|
"reward_std": 0.20717181416694075, |
|
"rewards/equation_reward_func": 0.22683036653324962, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 198 |
|
}, |
|
{ |
|
"completion_length": 614.8869152069092, |
|
"epoch": 4.920684292379471, |
|
"grad_norm": 92.09661102294922, |
|
"kl": 144.53644692897797, |
|
"learning_rate": 3.0713448387917227e-07, |
|
"loss": 0.1445, |
|
"reward": 0.21901042643003166, |
|
"reward_std": 0.20682094641961157, |
|
"rewards/equation_reward_func": 0.2190104245673865, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 200 |
|
}, |
|
{ |
|
"completion_length": 631.4241156578064, |
|
"epoch": 4.970451010886469, |
|
"grad_norm": 6.355322360992432, |
|
"kl": 154.4233751296997, |
|
"learning_rate": 3.0362127536287636e-07, |
|
"loss": 0.1544, |
|
"reward": 0.21773066406603903, |
|
"reward_std": 0.21250074298586696, |
|
"rewards/equation_reward_func": 0.2177306618541479, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 202 |
|
}, |
|
{ |
|
"completion_length": 624.7180488987973, |
|
"epoch": 5.0, |
|
"grad_norm": 5.770173072814941, |
|
"kl": 161.87928571199117, |
|
"learning_rate": 3.0009693117583523e-07, |
|
"loss": 0.0961, |
|
"reward": 0.21541354177813782, |
|
"reward_std": 0.20374000229333578, |
|
"rewards/equation_reward_func": 0.215413541386002, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 204 |
|
}, |
|
{ |
|
"completion_length": 624.5647420883179, |
|
"epoch": 5.049766718506999, |
|
"grad_norm": 6.884070873260498, |
|
"kl": 157.92570447921753, |
|
"learning_rate": 2.965621832278401e-07, |
|
"loss": 0.1579, |
|
"reward": 0.22669643780682236, |
|
"reward_std": 0.20801680884324014, |
|
"rewards/equation_reward_func": 0.22669643454719335, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 206 |
|
}, |
|
{ |
|
"completion_length": 614.1570081710815, |
|
"epoch": 5.099533437013997, |
|
"grad_norm": 4.670907497406006, |
|
"kl": 134.14546036720276, |
|
"learning_rate": 2.9301776558925875e-07, |
|
"loss": 0.1341, |
|
"reward": 0.2188244123244658, |
|
"reward_std": 0.20453347032889724, |
|
"rewards/equation_reward_func": 0.21882441325578839, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 208 |
|
}, |
|
{ |
|
"completion_length": 614.4702506065369, |
|
"epoch": 5.149300155520995, |
|
"grad_norm": 14.716873168945312, |
|
"kl": 109.80421262979507, |
|
"learning_rate": 2.894644143385885e-07, |
|
"loss": 0.1098, |
|
"reward": 0.21839286445174366, |
|
"reward_std": 0.20062782417517155, |
|
"rewards/equation_reward_func": 0.21839286398608238, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 210 |
|
}, |
|
{ |
|
"completion_length": 622.4672718048096, |
|
"epoch": 5.199066874027994, |
|
"grad_norm": 10.858051300048828, |
|
"kl": 114.28983092308044, |
|
"learning_rate": 2.859028674095937e-07, |
|
"loss": 0.1143, |
|
"reward": 0.2192782819038257, |
|
"reward_std": 0.2128367607947439, |
|
"rewards/equation_reward_func": 0.21927828167099506, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 212 |
|
}, |
|
{ |
|
"completion_length": 612.6160840988159, |
|
"epoch": 5.248833592534992, |
|
"grad_norm": 3.8785901069641113, |
|
"kl": 125.06462055444717, |
|
"learning_rate": 2.823338644380566e-07, |
|
"loss": 0.1251, |
|
"reward": 0.23020090232603252, |
|
"reward_std": 0.2176531965378672, |
|
"rewards/equation_reward_func": 0.23020089999772608, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 214 |
|
}, |
|
{ |
|
"completion_length": 635.8995633125305, |
|
"epoch": 5.298600311041991, |
|
"grad_norm": 5.062567234039307, |
|
"kl": 148.21274209022522, |
|
"learning_rate": 2.7875814660817504e-07, |
|
"loss": 0.1482, |
|
"reward": 0.2193973324028775, |
|
"reward_std": 0.22195886494591832, |
|
"rewards/equation_reward_func": 0.21939733054023236, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 216 |
|
}, |
|
{ |
|
"completion_length": 630.8229269981384, |
|
"epoch": 5.348367029548989, |
|
"grad_norm": 5.181402206420898, |
|
"kl": 165.8618984222412, |
|
"learning_rate": 2.751764564986396e-07, |
|
"loss": 0.1659, |
|
"reward": 0.2077009006170556, |
|
"reward_std": 0.2193935844115913, |
|
"rewards/equation_reward_func": 0.2077009001513943, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 218 |
|
}, |
|
{ |
|
"completion_length": 628.6517939567566, |
|
"epoch": 5.3981337480559874, |
|
"grad_norm": 4.105767726898193, |
|
"kl": 148.7712802886963, |
|
"learning_rate": 2.715895379284194e-07, |
|
"loss": 0.1488, |
|
"reward": 0.2191815583501011, |
|
"reward_std": 0.20989621221087873, |
|
"rewards/equation_reward_func": 0.21918155602179468, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 220 |
|
}, |
|
{ |
|
"completion_length": 629.8006067276001, |
|
"epoch": 5.447900466562986, |
|
"grad_norm": 3.895611524581909, |
|
"kl": 142.22095596790314, |
|
"learning_rate": 2.6799813580229174e-07, |
|
"loss": 0.1422, |
|
"reward": 0.22290923492982984, |
|
"reward_std": 0.21323461562860757, |
|
"rewards/equation_reward_func": 0.2229092346969992, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 222 |
|
}, |
|
{ |
|
"completion_length": 608.6183171272278, |
|
"epoch": 5.497667185069984, |
|
"grad_norm": 6.331876277923584, |
|
"kl": 135.1478552222252, |
|
"learning_rate": 2.6440299595614606e-07, |
|
"loss": 0.1351, |
|
"reward": 0.21991072362288833, |
|
"reward_std": 0.22133340197615325, |
|
"rewards/equation_reward_func": 0.21991072269156575, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 224 |
|
}, |
|
{ |
|
"completion_length": 611.6756086349487, |
|
"epoch": 5.547433903576983, |
|
"grad_norm": 3.41554594039917, |
|
"kl": 135.47022581100464, |
|
"learning_rate": 2.6080486500209347e-07, |
|
"loss": 0.1355, |
|
"reward": 0.21784971025772393, |
|
"reward_std": 0.21086209290660918, |
|
"rewards/equation_reward_func": 0.2178497090935707, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 226 |
|
}, |
|
{ |
|
"completion_length": 609.0922722816467, |
|
"epoch": 5.597200622083982, |
|
"grad_norm": 4.638352870941162, |
|
"kl": 149.68241280317307, |
|
"learning_rate": 2.572044901734166e-07, |
|
"loss": 0.1497, |
|
"reward": 0.22438989242073148, |
|
"reward_std": 0.2241612394573167, |
|
"rewards/equation_reward_func": 0.2243898919550702, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 228 |
|
}, |
|
{ |
|
"completion_length": 629.8534321784973, |
|
"epoch": 5.6469673405909795, |
|
"grad_norm": 4.474099159240723, |
|
"kl": 164.97060561180115, |
|
"learning_rate": 2.536026191693893e-07, |
|
"loss": 0.165, |
|
"reward": 0.2060565553838387, |
|
"reward_std": 0.21067888580728322, |
|
"rewards/equation_reward_func": 0.20605655445251614, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 230 |
|
}, |
|
{ |
|
"completion_length": 626.8482217788696, |
|
"epoch": 5.696734059097978, |
|
"grad_norm": 9.778329849243164, |
|
"kl": 169.21773087978363, |
|
"learning_rate": 2.5e-07, |
|
"loss": 0.1692, |
|
"reward": 0.20911459170747548, |
|
"reward_std": 0.21599237713962793, |
|
"rewards/equation_reward_func": 0.2091145912418142, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 232 |
|
}, |
|
{ |
|
"completion_length": 629.8660821914673, |
|
"epoch": 5.746500777604977, |
|
"grad_norm": 5.210114479064941, |
|
"kl": 171.0250325202942, |
|
"learning_rate": 2.4639738083061073e-07, |
|
"loss": 0.171, |
|
"reward": 0.2135788791347295, |
|
"reward_std": 0.20587447995785624, |
|
"rewards/equation_reward_func": 0.21357887890189886, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 234 |
|
}, |
|
{ |
|
"completion_length": 628.7165260314941, |
|
"epoch": 5.796267496111975, |
|
"grad_norm": 4.644392490386963, |
|
"kl": 149.7915449142456, |
|
"learning_rate": 2.4279550982658345e-07, |
|
"loss": 0.1498, |
|
"reward": 0.20833334070630372, |
|
"reward_std": 0.21195052459370345, |
|
"rewards/equation_reward_func": 0.20833334047347307, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 236 |
|
}, |
|
{ |
|
"completion_length": 628.755964756012, |
|
"epoch": 5.846034214618974, |
|
"grad_norm": 6.456798076629639, |
|
"kl": 442.08424025774, |
|
"learning_rate": 2.3919513499790646e-07, |
|
"loss": 0.4421, |
|
"reward": 0.22005209047347307, |
|
"reward_std": 0.21488765871617943, |
|
"rewards/equation_reward_func": 0.22005209024064243, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 238 |
|
}, |
|
{ |
|
"completion_length": 612.3988199234009, |
|
"epoch": 5.895800933125972, |
|
"grad_norm": 9.304161071777344, |
|
"kl": 118.21684062480927, |
|
"learning_rate": 2.3559700404385394e-07, |
|
"loss": 0.1182, |
|
"reward": 0.22447917505633086, |
|
"reward_std": 0.211615604814142, |
|
"rewards/equation_reward_func": 0.22447917482350022, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 240 |
|
}, |
|
{ |
|
"completion_length": 633.3660821914673, |
|
"epoch": 5.94556765163297, |
|
"grad_norm": 5.745642185211182, |
|
"kl": 133.20424818992615, |
|
"learning_rate": 2.3200186419770823e-07, |
|
"loss": 0.1332, |
|
"reward": 0.2242708442499861, |
|
"reward_std": 0.2152464333921671, |
|
"rewards/equation_reward_func": 0.22427084331866354, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 242 |
|
}, |
|
{ |
|
"completion_length": 618.1235270500183, |
|
"epoch": 5.995334370139969, |
|
"grad_norm": 4.167017936706543, |
|
"kl": 143.97905486822128, |
|
"learning_rate": 2.284104620715807e-07, |
|
"loss": 0.144, |
|
"reward": 0.22046875627711415, |
|
"reward_std": 0.21442426112480462, |
|
"rewards/equation_reward_func": 0.22046875732485205, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 244 |
|
}, |
|
{ |
|
"completion_length": 634.5175580476459, |
|
"epoch": 6.024883359253499, |
|
"grad_norm": 3.44785213470459, |
|
"kl": 167.55113441065737, |
|
"learning_rate": 2.2482354350136043e-07, |
|
"loss": 0.0995, |
|
"reward": 0.21961153769179395, |
|
"reward_std": 0.2146961924276854, |
|
"rewards/equation_reward_func": 0.21961153769179395, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 246 |
|
}, |
|
{ |
|
"completion_length": 634.5863180160522, |
|
"epoch": 6.074650077760498, |
|
"grad_norm": 7.954348564147949, |
|
"kl": 163.61565399169922, |
|
"learning_rate": 2.2124185339182496e-07, |
|
"loss": 0.1636, |
|
"reward": 0.23546131700277328, |
|
"reward_std": 0.2178129724925384, |
|
"rewards/equation_reward_func": 0.23546131781768054, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 248 |
|
}, |
|
{ |
|
"completion_length": 610.0825996398926, |
|
"epoch": 6.1244167962674965, |
|
"grad_norm": 4.648006439208984, |
|
"kl": 167.8152883052826, |
|
"learning_rate": 2.1766613556194344e-07, |
|
"loss": 0.1678, |
|
"reward": 0.22144346224376932, |
|
"reward_std": 0.21030379901640117, |
|
"rewards/equation_reward_func": 0.22144346177810803, |
|
"rewards/format_reward_func": 0.0, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 6.1244167962674965, |
|
"step": 250, |
|
"total_flos": 0.0, |
|
"train_loss": 0.0, |
|
"train_runtime": 0.0058, |
|
"train_samples_per_second": 3851297.791, |
|
"train_steps_per_second": 17193.294 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|