{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.22251891410769917, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 196.6666717529297, "epoch": 0.0004450378282153983, "grad_norm": 0.7149407267570496, "kl": 0.00043882918544113636, "learning_rate": 2.2222222222222224e-08, "loss": 0.0, "reward": -0.312666654586792, "reward_std": 0.39005160331726074, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.312666654586792, "step": 1 }, { "completion_length": 179.0, "epoch": 0.0008900756564307966, "grad_norm": 1.2034916877746582, "kl": 0.00043298103264532983, "learning_rate": 4.444444444444445e-08, "loss": 0.0, "reward": -0.2953333556652069, "reward_std": 0.3479785621166229, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2953333556652069, "step": 2 }, { "completion_length": 200.0, "epoch": 0.0013351134846461949, "grad_norm": 0.00107863440643996, "kl": 0.0003771593910641968, "learning_rate": 6.666666666666668e-08, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 3 }, { "completion_length": 200.0, "epoch": 0.0017801513128615932, "grad_norm": 0.7310553789138794, "kl": 0.00044770282693207264, "learning_rate": 8.88888888888889e-08, "loss": 0.0, "reward": -0.085999995470047, "reward_std": 0.3784494996070862, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.085999995470047, "step": 4 }, { "completion_length": 174.83334350585938, "epoch": 0.0022251891410769915, "grad_norm": 0.861896276473999, "kl": 0.00042549317004159093, "learning_rate": 1.1111111111111112e-07, "loss": 0.0, "reward": -0.09316667169332504, "reward_std": 0.2892870306968689, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09316667169332504, "step": 5 }, { "completion_length": 200.0, "epoch": 0.0026702269692923898, "grad_norm": 0.0017526125302538276, "kl": 0.0004182992852292955, "learning_rate": 1.3333333333333336e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 6 }, { "completion_length": 166.5, "epoch": 0.003115264797507788, "grad_norm": 0.9585131406784058, "kl": 0.00047177166561596096, "learning_rate": 1.5555555555555556e-07, "loss": 0.0, "reward": -0.33766669034957886, "reward_std": 0.2008279412984848, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3376666307449341, "step": 7 }, { "completion_length": 200.0, "epoch": 0.0035603026257231864, "grad_norm": 0.6275889873504639, "kl": 0.0004587940056808293, "learning_rate": 1.777777777777778e-07, "loss": 0.0, "reward": -0.4321666657924652, "reward_std": 0.2741382122039795, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4321666657924652, "step": 8 }, { "completion_length": 152.5, "epoch": 0.004005340453938585, "grad_norm": 0.7820535898208618, "kl": 0.0003722615947481245, "learning_rate": 2.0000000000000002e-07, "loss": 0.0, "reward": -0.24449998140335083, "reward_std": 0.2453835904598236, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24449998140335083, "step": 9 }, { "completion_length": 200.0, "epoch": 0.004450378282153983, "grad_norm": 0.6632036566734314, "kl": 0.00043898209696635604, "learning_rate": 2.2222222222222224e-07, "loss": 0.0, "reward": -0.006833334919065237, "reward_std": 0.3229244351387024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.006833334919065237, "step": 10 }, { "completion_length": 200.0, "epoch": 0.004895416110369382, "grad_norm": 0.7768407464027405, "kl": 0.00042121915612369776, "learning_rate": 2.444444444444445e-07, "loss": 0.0, "reward": -0.5506666898727417, "reward_std": 0.06373277306556702, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5506666898727417, "step": 11 }, { "completion_length": 199.6666717529297, "epoch": 0.0053404539385847796, "grad_norm": 0.7278463244438171, "kl": 0.0004068611597176641, "learning_rate": 2.666666666666667e-07, "loss": 0.0, "reward": -0.2864999771118164, "reward_std": 0.334197998046875, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2864999771118164, "step": 12 }, { "completion_length": 200.0, "epoch": 0.005785491766800178, "grad_norm": 0.6500892043113708, "kl": 0.0003313750494271517, "learning_rate": 2.888888888888889e-07, "loss": 0.0, "reward": -0.32199999690055847, "reward_std": 0.34629470109939575, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32199999690055847, "step": 13 }, { "completion_length": 200.0, "epoch": 0.006230529595015576, "grad_norm": 0.7193189859390259, "kl": 0.0003894752007909119, "learning_rate": 3.111111111111111e-07, "loss": 0.0, "reward": -0.08683334290981293, "reward_std": 0.32831722497940063, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08683334290981293, "step": 14 }, { "completion_length": 200.0, "epoch": 0.006675567423230975, "grad_norm": 0.6580405235290527, "kl": 0.00035931920865550637, "learning_rate": 3.3333333333333335e-07, "loss": 0.0, "reward": 0.006666670553386211, "reward_std": 0.28985628485679626, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.006666670553386211, "step": 15 }, { "completion_length": 200.0, "epoch": 0.007120605251446373, "grad_norm": 0.7829955220222473, "kl": 0.0004430452245287597, "learning_rate": 3.555555555555556e-07, "loss": 0.0, "reward": -0.4845000207424164, "reward_std": 0.30278095602989197, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4845000207424164, "step": 16 }, { "completion_length": 199.6666717529297, "epoch": 0.0075656430796617715, "grad_norm": 0.7301868796348572, "kl": 0.0004354792181402445, "learning_rate": 3.777777777777778e-07, "loss": 0.0, "reward": -0.5509999990463257, "reward_std": 0.09809383004903793, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5509999990463257, "step": 17 }, { "completion_length": 195.33334350585938, "epoch": 0.00801068090787717, "grad_norm": 0.627457320690155, "kl": 0.0004219269612804055, "learning_rate": 4.0000000000000003e-07, "loss": 0.0, "reward": -0.382999986410141, "reward_std": 0.258131742477417, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.382999986410141, "step": 18 }, { "completion_length": 198.5, "epoch": 0.008455718736092568, "grad_norm": 0.6126062273979187, "kl": 0.00037288008024916053, "learning_rate": 4.2222222222222226e-07, "loss": 0.0, "reward": -0.5198333263397217, "reward_std": 0.06735106557607651, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5198333859443665, "step": 19 }, { "completion_length": 200.0, "epoch": 0.008900756564307966, "grad_norm": 0.7298072576522827, "kl": 0.00042569750803522766, "learning_rate": 4.444444444444445e-07, "loss": 0.0, "reward": -0.12316666543483734, "reward_std": 0.38789814710617065, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12316666543483734, "step": 20 }, { "completion_length": 200.0, "epoch": 0.009345794392523364, "grad_norm": 0.7508992552757263, "kl": 0.0003784544242080301, "learning_rate": 4.666666666666667e-07, "loss": 0.0, "reward": 0.007999996654689312, "reward_std": 0.2865903079509735, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007999996654689312, "step": 21 }, { "completion_length": 200.0, "epoch": 0.009790832220738763, "grad_norm": 0.7774270176887512, "kl": 0.0004591545439325273, "learning_rate": 4.88888888888889e-07, "loss": 0.0, "reward": -0.12583334743976593, "reward_std": 0.3894747495651245, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12583334743976593, "step": 22 }, { "completion_length": 178.6666717529297, "epoch": 0.010235870048954161, "grad_norm": 0.9822273254394531, "kl": 0.0003580343909561634, "learning_rate": 5.111111111111112e-07, "loss": 0.0, "reward": -0.40816670656204224, "reward_std": 0.2941179871559143, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.40816670656204224, "step": 23 }, { "completion_length": 195.6666717529297, "epoch": 0.010680907877169559, "grad_norm": 0.7280099987983704, "kl": 0.0003752989578060806, "learning_rate": 5.333333333333335e-07, "loss": 0.0, "reward": -0.5036666393280029, "reward_std": 0.06944254040718079, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5036666393280029, "step": 24 }, { "completion_length": 194.1666717529297, "epoch": 0.011125945705384957, "grad_norm": 0.7669874429702759, "kl": 0.0003995794686488807, "learning_rate": 5.555555555555555e-07, "loss": 0.0, "reward": -0.1613333374261856, "reward_std": 0.3338997960090637, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1613333374261856, "step": 25 }, { "completion_length": 200.0, "epoch": 0.011570983533600357, "grad_norm": 0.0017349894624203444, "kl": 0.00044584478018805385, "learning_rate": 5.777777777777778e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 26 }, { "completion_length": 200.0, "epoch": 0.012016021361815754, "grad_norm": 0.7146498560905457, "kl": 0.0004711821093223989, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 0.004333337303251028, "reward_std": 0.2955717444419861, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.004333337303251028, "step": 27 }, { "completion_length": 194.0, "epoch": 0.012461059190031152, "grad_norm": 0.7583353519439697, "kl": 0.0003412925580050796, "learning_rate": 6.222222222222223e-07, "loss": 0.0, "reward": -0.32066667079925537, "reward_std": 0.34830141067504883, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32066667079925537, "step": 28 }, { "completion_length": 200.0, "epoch": 0.01290609701824655, "grad_norm": 0.6495766639709473, "kl": 0.0003407001495361328, "learning_rate": 6.444444444444445e-07, "loss": 0.0, "reward": 0.013833334669470787, "reward_std": 0.2723016142845154, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.013833334669470787, "step": 29 }, { "completion_length": 200.0, "epoch": 0.01335113484646195, "grad_norm": 0.6830177307128906, "kl": 0.00040841297595761716, "learning_rate": 6.666666666666667e-07, "loss": 0.0, "reward": -0.38466668128967285, "reward_std": 0.3953356444835663, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.38466668128967285, "step": 30 }, { "completion_length": 197.0, "epoch": 0.013796172674677348, "grad_norm": 0.6479209661483765, "kl": 0.00035427496186457574, "learning_rate": 6.88888888888889e-07, "loss": 0.0, "reward": -0.41850003600120544, "reward_std": 0.27061542868614197, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.41850003600120544, "step": 31 }, { "completion_length": 200.0, "epoch": 0.014241210502892745, "grad_norm": 0.001505751977674663, "kl": 0.00040443878970108926, "learning_rate": 7.111111111111112e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 32 }, { "completion_length": 194.83334350585938, "epoch": 0.014686248331108143, "grad_norm": 0.8729995489120483, "kl": 0.00045898579992353916, "learning_rate": 7.333333333333334e-07, "loss": 0.0, "reward": -0.4713333547115326, "reward_std": 0.0908794030547142, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4713333547115326, "step": 33 }, { "completion_length": 200.0, "epoch": 0.015131286159323543, "grad_norm": 0.83029705286026, "kl": 0.0004323194734752178, "learning_rate": 7.555555555555556e-07, "loss": 0.0, "reward": -0.3368333578109741, "reward_std": 0.35846197605133057, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.33683332800865173, "step": 34 }, { "completion_length": 200.0, "epoch": 0.01557632398753894, "grad_norm": 0.678088903427124, "kl": 0.0004401813494041562, "learning_rate": 7.777777777777779e-07, "loss": 0.0, "reward": -0.22800001502037048, "reward_std": 0.38724154233932495, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22800001502037048, "step": 35 }, { "completion_length": 200.0, "epoch": 0.01602136181575434, "grad_norm": 0.7426117658615112, "kl": 0.0003528599627315998, "learning_rate": 8.000000000000001e-07, "loss": 0.0, "reward": -0.19583332538604736, "reward_std": 0.3525362014770508, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19583332538604736, "step": 36 }, { "completion_length": 200.0, "epoch": 0.016466399643969738, "grad_norm": 0.0014730676775798202, "kl": 0.000418979674577713, "learning_rate": 8.222222222222223e-07, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 37 }, { "completion_length": 200.0, "epoch": 0.016911437472185136, "grad_norm": 0.6773275136947632, "kl": 0.0003312948392704129, "learning_rate": 8.444444444444445e-07, "loss": 0.0, "reward": 0.016833335161209106, "reward_std": 0.2649531364440918, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016833335161209106, "step": 38 }, { "completion_length": 200.0, "epoch": 0.017356475300400534, "grad_norm": 0.4805554449558258, "kl": 0.00037392970989458263, "learning_rate": 8.666666666666668e-07, "loss": 0.0, "reward": -0.4266667068004608, "reward_std": 0.2752167582511902, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4266667068004608, "step": 39 }, { "completion_length": 188.33334350585938, "epoch": 0.017801513128615932, "grad_norm": 0.8252691626548767, "kl": 0.00034736632369458675, "learning_rate": 8.88888888888889e-07, "loss": 0.0, "reward": -0.43516668677330017, "reward_std": 0.10999348759651184, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.43516668677330017, "step": 40 }, { "completion_length": 195.83334350585938, "epoch": 0.01824655095683133, "grad_norm": 0.6390711069107056, "kl": 0.00039921089773997664, "learning_rate": 9.111111111111113e-07, "loss": 0.0, "reward": -0.44216665625572205, "reward_std": 0.1345755159854889, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.44216665625572205, "step": 41 }, { "completion_length": 200.0, "epoch": 0.018691588785046728, "grad_norm": 0.584726095199585, "kl": 0.000410672917496413, "learning_rate": 9.333333333333334e-07, "loss": 0.0, "reward": -0.08250001072883606, "reward_std": 0.3218010365962982, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08250000327825546, "step": 42 }, { "completion_length": 200.0, "epoch": 0.01913662661326213, "grad_norm": 0.7584457993507385, "kl": 0.0004279993590898812, "learning_rate": 9.555555555555556e-07, "loss": 0.0, "reward": -0.1798333376646042, "reward_std": 0.4852597117424011, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1798333376646042, "step": 43 }, { "completion_length": 200.0, "epoch": 0.019581664441477527, "grad_norm": 0.6866025328636169, "kl": 0.0004159708914812654, "learning_rate": 9.77777777777778e-07, "loss": 0.0, "reward": 0.022333335131406784, "reward_std": 0.25148093700408936, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.022333335131406784, "step": 44 }, { "completion_length": 200.0, "epoch": 0.020026702269692925, "grad_norm": 0.6524875164031982, "kl": 0.0004553616454359144, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "reward": -0.30799999833106995, "reward_std": 0.33663925528526306, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.30799999833106995, "step": 45 }, { "completion_length": 197.1666717529297, "epoch": 0.020471740097908322, "grad_norm": 1.1161606311798096, "kl": 0.0004416520241647959, "learning_rate": 1.0222222222222223e-06, "loss": 0.0, "reward": -0.609333336353302, "reward_std": 0.2037426233291626, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.609333336353302, "step": 46 }, { "completion_length": 200.0, "epoch": 0.02091677792612372, "grad_norm": 0.0014684926718473434, "kl": 0.0004343845648691058, "learning_rate": 1.0444444444444445e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 47 }, { "completion_length": 200.0, "epoch": 0.021361815754339118, "grad_norm": 0.7850491404533386, "kl": 0.0004720585129689425, "learning_rate": 1.066666666666667e-06, "loss": 0.0, "reward": -0.24550001323223114, "reward_std": 0.4167482554912567, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24550001323223114, "step": 48 }, { "completion_length": 174.0, "epoch": 0.021806853582554516, "grad_norm": 0.6916747093200684, "kl": 0.0003871291410177946, "learning_rate": 1.0888888888888889e-06, "loss": 0.0, "reward": -0.3348333239555359, "reward_std": 0.15262427926063538, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3348333537578583, "step": 49 }, { "completion_length": 200.0, "epoch": 0.022251891410769914, "grad_norm": 0.6415843367576599, "kl": 0.0004480450297705829, "learning_rate": 1.111111111111111e-06, "loss": 0.0, "reward": -0.5586666464805603, "reward_std": 0.04939500615000725, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5586667060852051, "step": 50 }, { "completion_length": 200.0, "epoch": 0.022696929238985315, "grad_norm": 0.7905615568161011, "kl": 0.00043076984002254903, "learning_rate": 1.1333333333333334e-06, "loss": 0.0, "reward": -0.5353333353996277, "reward_std": 0.01973491534590721, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5353333353996277, "step": 51 }, { "completion_length": 195.33334350585938, "epoch": 0.023141967067200713, "grad_norm": 0.6714709401130676, "kl": 0.000462042837170884, "learning_rate": 1.1555555555555556e-06, "loss": 0.0, "reward": -0.45649999380111694, "reward_std": 0.0799018144607544, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45650002360343933, "step": 52 }, { "completion_length": 200.0, "epoch": 0.02358700489541611, "grad_norm": 0.002091527683660388, "kl": 0.0004879847401753068, "learning_rate": 1.1777777777777778e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 53 }, { "completion_length": 200.0, "epoch": 0.02403204272363151, "grad_norm": 0.7136774659156799, "kl": 0.00044702840386889875, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": 0.019333332777023315, "reward_std": 0.25882941484451294, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019333332777023315, "step": 54 }, { "completion_length": 191.6666717529297, "epoch": 0.024477080551846907, "grad_norm": 1.0719506740570068, "kl": 0.00041414215229451656, "learning_rate": 1.2222222222222223e-06, "loss": 0.0, "reward": 0.0806666687130928, "reward_std": 0.10859404504299164, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0806666687130928, "step": 55 }, { "completion_length": 200.0, "epoch": 0.024922118380062305, "grad_norm": 0.002473922213539481, "kl": 0.0005147390766069293, "learning_rate": 1.2444444444444445e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 56 }, { "completion_length": 200.0, "epoch": 0.025367156208277702, "grad_norm": 0.7581794857978821, "kl": 0.0004257292894180864, "learning_rate": 1.2666666666666669e-06, "loss": 0.0, "reward": -0.09583333134651184, "reward_std": 0.34220486879348755, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09583333134651184, "step": 57 }, { "completion_length": 200.0, "epoch": 0.0258121940364931, "grad_norm": 0.6923669576644897, "kl": 0.0004169998865108937, "learning_rate": 1.288888888888889e-06, "loss": 0.0, "reward": -0.3475000262260437, "reward_std": 0.3798324763774872, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3475000262260437, "step": 58 }, { "completion_length": 109.0, "epoch": 0.0262572318647085, "grad_norm": 1.0712668895721436, "kl": 0.00041128776501864195, "learning_rate": 1.3111111111111112e-06, "loss": 0.0, "reward": -0.06133333593606949, "reward_std": 0.20354819297790527, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06133333966135979, "step": 59 }, { "completion_length": 200.0, "epoch": 0.0267022696929239, "grad_norm": 0.6178426742553711, "kl": 0.0004559112712740898, "learning_rate": 1.3333333333333334e-06, "loss": 0.0, "reward": 0.023666664958000183, "reward_std": 0.2482149600982666, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.023666664958000183, "step": 60 }, { "completion_length": 200.0, "epoch": 0.027147307521139297, "grad_norm": 0.6320557594299316, "kl": 0.0005056136287748814, "learning_rate": 1.3555555555555558e-06, "loss": 0.0, "reward": -0.4624999761581421, "reward_std": 0.29319530725479126, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4624999761581421, "step": 61 }, { "completion_length": 196.83334350585938, "epoch": 0.027592345349354695, "grad_norm": 0.7537997364997864, "kl": 0.0004290025099180639, "learning_rate": 1.377777777777778e-06, "loss": 0.0, "reward": -0.48366665840148926, "reward_std": 0.12520170211791992, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.48366665840148926, "step": 62 }, { "completion_length": 200.0, "epoch": 0.028037383177570093, "grad_norm": 0.0018171067349612713, "kl": 0.0004468559636734426, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 63 }, { "completion_length": 200.0, "epoch": 0.02848242100578549, "grad_norm": 0.6882118582725525, "kl": 0.0004662362043745816, "learning_rate": 1.4222222222222223e-06, "loss": 0.0, "reward": 0.01850000023841858, "reward_std": 0.26087066531181335, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01850000023841858, "step": 64 }, { "completion_length": 195.33334350585938, "epoch": 0.02892745883400089, "grad_norm": 0.7403289675712585, "kl": 0.00043322655255906284, "learning_rate": 1.4444444444444445e-06, "loss": 0.0, "reward": -0.47933337092399597, "reward_std": 0.09026111662387848, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.47933337092399597, "step": 65 }, { "completion_length": 200.0, "epoch": 0.029372496662216287, "grad_norm": 0.0017793107545003295, "kl": 0.00043216149788349867, "learning_rate": 1.4666666666666669e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 66 }, { "completion_length": 199.0, "epoch": 0.029817534490431688, "grad_norm": 0.6182725429534912, "kl": 0.00037733028875663877, "learning_rate": 1.4888888888888888e-06, "loss": 0.0, "reward": -0.07999999821186066, "reward_std": 0.32039913535118103, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08000000566244125, "step": 67 }, { "completion_length": 130.6666717529297, "epoch": 0.030262572318647086, "grad_norm": 1.1008906364440918, "kl": 0.0003855052054859698, "learning_rate": 1.5111111111111112e-06, "loss": 0.0, "reward": -0.04516666755080223, "reward_std": 0.15019378066062927, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04516666755080223, "step": 68 }, { "completion_length": 200.0, "epoch": 0.030707610146862484, "grad_norm": 0.6984473466873169, "kl": 0.00044030696153640747, "learning_rate": 1.5333333333333334e-06, "loss": 0.0, "reward": -0.34150001406669617, "reward_std": 0.364574134349823, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34150001406669617, "step": 69 }, { "completion_length": 195.33334350585938, "epoch": 0.03115264797507788, "grad_norm": 0.6155555248260498, "kl": 0.00039145484333857894, "learning_rate": 1.5555555555555558e-06, "loss": 0.0, "reward": -0.3800000548362732, "reward_std": 0.2700370252132416, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3799999952316284, "step": 70 }, { "completion_length": 200.0, "epoch": 0.03159768580329328, "grad_norm": 0.5811487436294556, "kl": 0.0003899557632394135, "learning_rate": 1.5777777777777778e-06, "loss": 0.0, "reward": 0.014166663400828838, "reward_std": 0.2714851498603821, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.014166663400828838, "step": 71 }, { "completion_length": 200.0, "epoch": 0.03204272363150868, "grad_norm": 0.6491996049880981, "kl": 0.0004053341690450907, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "reward": 0.01066666841506958, "reward_std": 0.2800583243370056, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01066666841506958, "step": 72 }, { "completion_length": 192.6666717529297, "epoch": 0.032487761459724075, "grad_norm": 0.7621211409568787, "kl": 0.0005313883302733302, "learning_rate": 1.6222222222222223e-06, "loss": 0.0, "reward": -0.4951666593551636, "reward_std": 0.05715736746788025, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4951666593551636, "step": 73 }, { "completion_length": 200.0, "epoch": 0.032932799287939477, "grad_norm": 0.7010623216629028, "kl": 0.00042472081258893013, "learning_rate": 1.6444444444444447e-06, "loss": 0.0, "reward": -0.0989999994635582, "reward_std": 0.34915727376937866, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0989999994635582, "step": 74 }, { "completion_length": 200.0, "epoch": 0.03337783711615487, "grad_norm": 0.7583237886428833, "kl": 0.00046256266068667173, "learning_rate": 1.6666666666666667e-06, "loss": 0.0, "reward": -0.09333333373069763, "reward_std": 0.33824294805526733, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09333333373069763, "step": 75 }, { "completion_length": 200.0, "epoch": 0.03382287494437027, "grad_norm": 0.7639651894569397, "kl": 0.00039581506280228496, "learning_rate": 1.688888888888889e-06, "loss": 0.0, "reward": -0.2276666760444641, "reward_std": 0.3896222710609436, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2276666760444641, "step": 76 }, { "completion_length": 164.1666717529297, "epoch": 0.03426791277258567, "grad_norm": 0.8515862822532654, "kl": 0.0004627097805496305, "learning_rate": 1.7111111111111112e-06, "loss": 0.0, "reward": -0.3383333683013916, "reward_std": 0.19050845503807068, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3383333683013916, "step": 77 }, { "completion_length": 200.0, "epoch": 0.03471295060080107, "grad_norm": 0.6962697505950928, "kl": 0.000421873846789822, "learning_rate": 1.7333333333333336e-06, "loss": 0.0, "reward": 0.013333330862224102, "reward_std": 0.2735263705253601, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.013333330862224102, "step": 78 }, { "completion_length": 200.0, "epoch": 0.03515798842901647, "grad_norm": 0.6623450517654419, "kl": 0.0004432189743965864, "learning_rate": 1.7555555555555556e-06, "loss": 0.0, "reward": -0.0011666715145111084, "reward_std": 0.3090439736843109, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0011666715145111084, "step": 79 }, { "completion_length": 145.5, "epoch": 0.035603026257231864, "grad_norm": 0.9290313720703125, "kl": 0.0004976950585842133, "learning_rate": 1.777777777777778e-06, "loss": 0.0, "reward": -0.1913333386182785, "reward_std": 0.20369061827659607, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1913333535194397, "step": 80 }, { "completion_length": 200.0, "epoch": 0.036048064085447265, "grad_norm": 0.00226940237917006, "kl": 0.00044577824883162975, "learning_rate": 1.8000000000000001e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 81 }, { "completion_length": 200.0, "epoch": 0.03649310191366266, "grad_norm": 0.7341117262840271, "kl": 0.0003961599140893668, "learning_rate": 1.8222222222222225e-06, "loss": 0.0, "reward": 0.012833337299525738, "reward_std": 0.27475112676620483, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.012833337299525738, "step": 82 }, { "completion_length": 176.83334350585938, "epoch": 0.03693813974187806, "grad_norm": 1.1084420680999756, "kl": 0.0004944322863593698, "learning_rate": 1.8444444444444445e-06, "loss": 0.0, "reward": -0.4231666922569275, "reward_std": 0.38362035155296326, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4231666922569275, "step": 83 }, { "completion_length": 200.0, "epoch": 0.037383177570093455, "grad_norm": 0.5538046956062317, "kl": 0.0004035194288007915, "learning_rate": 1.8666666666666669e-06, "loss": 0.0, "reward": 0.020999997854232788, "reward_std": 0.2547469735145569, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.020999997854232788, "step": 84 }, { "completion_length": 199.6666717529297, "epoch": 0.037828215398308856, "grad_norm": 0.6329994797706604, "kl": 0.00045469467295333743, "learning_rate": 1.888888888888889e-06, "loss": 0.0, "reward": -0.1913333386182785, "reward_std": 0.3470242917537689, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1913333386182785, "step": 85 }, { "completion_length": 200.0, "epoch": 0.03827325322652426, "grad_norm": 0.7051041722297668, "kl": 0.00048318642075173557, "learning_rate": 1.9111111111111112e-06, "loss": 0.0, "reward": 0.012499998323619366, "reward_std": 0.27556759119033813, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.012499998323619366, "step": 86 }, { "completion_length": 200.0, "epoch": 0.03871829105473965, "grad_norm": 0.9833978414535522, "kl": 0.0004773414693772793, "learning_rate": 1.9333333333333336e-06, "loss": 0.0, "reward": -0.37816667556762695, "reward_std": 0.35067784786224365, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.37816667556762695, "step": 87 }, { "completion_length": 200.0, "epoch": 0.039163328882955054, "grad_norm": 0.7608519196510315, "kl": 0.0004284613241907209, "learning_rate": 1.955555555555556e-06, "loss": 0.0, "reward": -0.4285000264644623, "reward_std": 0.27370041608810425, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4285000264644623, "step": 88 }, { "completion_length": 200.0, "epoch": 0.03960836671117045, "grad_norm": 0.8064699769020081, "kl": 0.00046465283958241343, "learning_rate": 1.977777777777778e-06, "loss": 0.0, "reward": -0.6141666769981384, "reward_std": 0.05563242360949516, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.6141666769981384, "step": 89 }, { "completion_length": 200.0, "epoch": 0.04005340453938585, "grad_norm": 0.7693816423416138, "kl": 0.0004991068853996694, "learning_rate": 2.0000000000000003e-06, "loss": 0.0, "reward": -0.21366667747497559, "reward_std": 0.42142030596733093, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21366667747497559, "step": 90 }, { "completion_length": 170.83334350585938, "epoch": 0.040498442367601244, "grad_norm": 0.8475862145423889, "kl": 0.0004466826212592423, "learning_rate": 2.0222222222222223e-06, "loss": 0.0, "reward": -0.25600001215934753, "reward_std": 0.3458733558654785, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.25600001215934753, "step": 91 }, { "completion_length": 200.0, "epoch": 0.040943480195816645, "grad_norm": 0.001588360988534987, "kl": 0.00045517744729295373, "learning_rate": 2.0444444444444447e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 92 }, { "completion_length": 168.83334350585938, "epoch": 0.04138851802403204, "grad_norm": 0.7196416854858398, "kl": 0.0005219130543991923, "learning_rate": 2.0666666666666666e-06, "loss": 0.0, "reward": -0.28700000047683716, "reward_std": 0.23125484585762024, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.28700003027915955, "step": 93 }, { "completion_length": 200.0, "epoch": 0.04183355585224744, "grad_norm": 0.0016818700823932886, "kl": 0.0004399843164719641, "learning_rate": 2.088888888888889e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 94 }, { "completion_length": 187.33334350585938, "epoch": 0.04227859368046284, "grad_norm": 0.662451446056366, "kl": 0.0005664011696353555, "learning_rate": 2.1111111111111114e-06, "loss": 0.0, "reward": -0.07083334028720856, "reward_std": 0.4042125344276428, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07083334028720856, "step": 95 }, { "completion_length": 200.0, "epoch": 0.042723631508678236, "grad_norm": 0.8778610229492188, "kl": 0.0005331834545359015, "learning_rate": 2.133333333333334e-06, "loss": 0.0, "reward": 0.013499995693564415, "reward_std": 0.27311810851097107, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.013499995693564415, "step": 96 }, { "completion_length": 200.0, "epoch": 0.04316866933689364, "grad_norm": 0.7499815821647644, "kl": 0.0005722627975046635, "learning_rate": 2.1555555555555558e-06, "loss": 0.0, "reward": -0.36016663908958435, "reward_std": 0.37692034244537354, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.36016666889190674, "step": 97 }, { "completion_length": 199.6666717529297, "epoch": 0.04361370716510903, "grad_norm": 0.6992683410644531, "kl": 0.0006015513790771365, "learning_rate": 2.1777777777777777e-06, "loss": 0.0, "reward": -0.3303333520889282, "reward_std": 0.3531128168106079, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3303333520889282, "step": 98 }, { "completion_length": 200.0, "epoch": 0.044058744993324434, "grad_norm": 0.7034227848052979, "kl": 0.0005352528532966971, "learning_rate": 2.2e-06, "loss": 0.0, "reward": -0.5301666855812073, "reward_std": 0.03528975695371628, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5301666855812073, "step": 99 }, { "completion_length": 200.0, "epoch": 0.04450378282153983, "grad_norm": 0.7136642932891846, "kl": 0.0005618830909952521, "learning_rate": 2.222222222222222e-06, "loss": 0.0, "reward": 0.023666664958000183, "reward_std": 0.2482149600982666, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.023666664958000183, "step": 100 }, { "completion_length": 200.0, "epoch": 0.04494882064975523, "grad_norm": 0.9584755301475525, "kl": 0.0008303936338052154, "learning_rate": 2.2444444444444445e-06, "loss": 0.0, "reward": -0.34933334589004517, "reward_std": 0.369188129901886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.34933334589004517, "step": 101 }, { "completion_length": 200.0, "epoch": 0.04539385847797063, "grad_norm": 0.810990035533905, "kl": 0.0007378787267953157, "learning_rate": 2.266666666666667e-06, "loss": 0.0, "reward": -0.5558333396911621, "reward_std": 0.02642286941409111, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5558333396911621, "step": 102 }, { "completion_length": 200.0, "epoch": 0.045838896306186025, "grad_norm": 0.8462039232254028, "kl": 0.0007519976934418082, "learning_rate": 2.2888888888888892e-06, "loss": 0.0, "reward": -0.5571666955947876, "reward_std": 0.042873844504356384, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5571666955947876, "step": 103 }, { "completion_length": 152.33334350585938, "epoch": 0.046283934134401426, "grad_norm": 0.8582832217216492, "kl": 0.0006616115570068359, "learning_rate": 2.311111111111111e-06, "loss": 0.0, "reward": -0.20516666769981384, "reward_std": 0.3065997064113617, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.20516668260097504, "step": 104 }, { "completion_length": 200.0, "epoch": 0.04672897196261682, "grad_norm": 0.751700758934021, "kl": 0.0007129245204851031, "learning_rate": 2.3333333333333336e-06, "loss": 0.0, "reward": -0.24383334815502167, "reward_std": 0.40426644682884216, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24383334815502167, "step": 105 }, { "completion_length": 200.0, "epoch": 0.04717400979083222, "grad_norm": 0.7948376536369324, "kl": 0.0009101564064621925, "learning_rate": 2.3555555555555555e-06, "loss": 0.0, "reward": -0.26483333110809326, "reward_std": 0.44175583124160767, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.26483333110809326, "step": 106 }, { "completion_length": 200.0, "epoch": 0.047619047619047616, "grad_norm": 0.6845387816429138, "kl": 0.0005481390398927033, "learning_rate": 2.377777777777778e-06, "loss": 0.0, "reward": -0.08950000256299973, "reward_std": 0.3333369195461273, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08950000256299973, "step": 107 }, { "completion_length": 174.33334350585938, "epoch": 0.04806408544726302, "grad_norm": 0.9041478633880615, "kl": 0.0010797386057674885, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "reward": -0.3813333511352539, "reward_std": 0.1361376941204071, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3813333511352539, "step": 108 }, { "completion_length": 200.0, "epoch": 0.04850912327547842, "grad_norm": 0.8546884059906006, "kl": 0.0009791944175958633, "learning_rate": 2.4222222222222223e-06, "loss": 0.0, "reward": -0.17666666209697723, "reward_std": 0.3318732678890228, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17666666209697723, "step": 109 }, { "completion_length": 200.0, "epoch": 0.04895416110369381, "grad_norm": 0.002719539450481534, "kl": 0.0005873936461284757, "learning_rate": 2.4444444444444447e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 110 }, { "completion_length": 200.0, "epoch": 0.049399198931909215, "grad_norm": 0.7770993113517761, "kl": 0.0009914538823068142, "learning_rate": 2.466666666666667e-06, "loss": 0.0, "reward": -0.3736667037010193, "reward_std": 0.3873085379600525, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3736667037010193, "step": 111 }, { "completion_length": 200.0, "epoch": 0.04984423676012461, "grad_norm": 0.6850268244743347, "kl": 0.0007792222313582897, "learning_rate": 2.488888888888889e-06, "loss": 0.0, "reward": -0.5395000576972961, "reward_std": 0.025041967630386353, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5395000576972961, "step": 112 }, { "completion_length": 198.33334350585938, "epoch": 0.05028927458834001, "grad_norm": 0.7239099144935608, "kl": 0.001012115739285946, "learning_rate": 2.5111111111111114e-06, "loss": 0.0, "reward": -0.42133331298828125, "reward_std": 0.27339982986450195, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.42133331298828125, "step": 113 }, { "completion_length": 200.0, "epoch": 0.050734312416555405, "grad_norm": 0.5771994590759277, "kl": 0.0015950507950037718, "learning_rate": 2.5333333333333338e-06, "loss": 0.0001, "reward": -0.621666669845581, "reward_std": 0.031443070620298386, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.621666669845581, "step": 114 }, { "completion_length": 187.6666717529297, "epoch": 0.051179350244770806, "grad_norm": 0.8059555292129517, "kl": 0.0011016380740329623, "learning_rate": 2.5555555555555557e-06, "loss": 0.0, "reward": -0.31283336877822876, "reward_std": 0.25179630517959595, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.31283336877822876, "step": 115 }, { "completion_length": 200.0, "epoch": 0.0516243880729862, "grad_norm": 0.7699070572853088, "kl": 0.000991647131741047, "learning_rate": 2.577777777777778e-06, "loss": 0.0, "reward": -0.4051666855812073, "reward_std": 0.26059579849243164, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4051666855812073, "step": 116 }, { "completion_length": 200.0, "epoch": 0.0520694259012016, "grad_norm": 0.6547927856445312, "kl": 0.0008666824433021247, "learning_rate": 2.6e-06, "loss": 0.0, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 117 }, { "completion_length": 198.1666717529297, "epoch": 0.052514463729417, "grad_norm": 0.8903563022613525, "kl": 0.0012721801176667213, "learning_rate": 2.6222222222222225e-06, "loss": 0.0001, "reward": -0.5, "reward_std": 0.10164842009544373, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.5, "step": 118 }, { "completion_length": 200.0, "epoch": 0.0529595015576324, "grad_norm": 0.7130769491195679, "kl": 0.0009466246119700372, "learning_rate": 2.6444444444444444e-06, "loss": 0.0, "reward": -0.07766667008399963, "reward_std": 0.36592990159988403, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07766667008399963, "step": 119 }, { "completion_length": 200.0, "epoch": 0.0534045393858478, "grad_norm": 0.010055916383862495, "kl": 0.0016336208209395409, "learning_rate": 2.666666666666667e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 120 }, { "completion_length": 200.0, "epoch": 0.05384957721406319, "grad_norm": 0.7626153826713562, "kl": 0.0019612584728747606, "learning_rate": 2.6888888888888892e-06, "loss": 0.0001, "reward": -0.44099998474121094, "reward_std": 0.27870485186576843, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4410000443458557, "step": 121 }, { "completion_length": 200.0, "epoch": 0.054294615042278595, "grad_norm": 0.7897221446037292, "kl": 0.001940418384037912, "learning_rate": 2.7111111111111116e-06, "loss": 0.0001, "reward": -0.20816665887832642, "reward_std": 0.36510735750198364, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.20816665887832642, "step": 122 }, { "completion_length": 200.0, "epoch": 0.05473965287049399, "grad_norm": 0.004020801745355129, "kl": 0.0008703676867298782, "learning_rate": 2.7333333333333336e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 123 }, { "completion_length": 200.0, "epoch": 0.05518469069870939, "grad_norm": 0.005421373061835766, "kl": 0.0008717196760699153, "learning_rate": 2.755555555555556e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 124 }, { "completion_length": 178.33334350585938, "epoch": 0.05562972852692479, "grad_norm": 0.8171259164810181, "kl": 0.0014178266283124685, "learning_rate": 2.7777777777777783e-06, "loss": 0.0001, "reward": -0.16316668689250946, "reward_std": 0.3424929082393646, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.16316668689250946, "step": 125 }, { "completion_length": 200.0, "epoch": 0.056074766355140186, "grad_norm": 0.811120331287384, "kl": 0.00203678198158741, "learning_rate": 2.8000000000000003e-06, "loss": 0.0001, "reward": -0.4819999933242798, "reward_std": 0.3060758113861084, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4819999933242798, "step": 126 }, { "completion_length": 200.0, "epoch": 0.05651980418335559, "grad_norm": 0.004288077354431152, "kl": 0.00076559919398278, "learning_rate": 2.8222222222222223e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 127 }, { "completion_length": 200.0, "epoch": 0.05696484201157098, "grad_norm": 0.7662628293037415, "kl": 0.0013816291466355324, "learning_rate": 2.8444444444444446e-06, "loss": 0.0001, "reward": -0.4233333468437195, "reward_std": 0.2736119031906128, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.4233333468437195, "step": 128 }, { "completion_length": 200.0, "epoch": 0.05740987983978638, "grad_norm": 0.7000979781150818, "kl": 0.0022010253742337227, "learning_rate": 2.866666666666667e-06, "loss": 0.0001, "reward": -0.43416666984558105, "reward_std": 0.2743176221847534, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.43416666984558105, "step": 129 }, { "completion_length": 189.5, "epoch": 0.05785491766800178, "grad_norm": 0.6776527762413025, "kl": 0.0026975106447935104, "learning_rate": 2.888888888888889e-06, "loss": 0.0001, "reward": -0.035999998450279236, "reward_std": 0.27313145995140076, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03599999472498894, "step": 130 }, { "completion_length": 200.0, "epoch": 0.05829995549621718, "grad_norm": 0.7056086659431458, "kl": 0.001560688717290759, "learning_rate": 2.9111111111111114e-06, "loss": 0.0001, "reward": -0.31833332777023315, "reward_std": 0.3454664647579193, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.31833332777023315, "step": 131 }, { "completion_length": 172.5, "epoch": 0.05874499332443257, "grad_norm": 1.0837831497192383, "kl": 0.0017425650730729103, "learning_rate": 2.9333333333333338e-06, "loss": 0.0001, "reward": -0.3088333308696747, "reward_std": 0.3001648783683777, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3088333308696747, "step": 132 }, { "completion_length": 200.0, "epoch": 0.059190031152647975, "grad_norm": 0.7270302176475525, "kl": 0.0014150540810078382, "learning_rate": 2.955555555555556e-06, "loss": 0.0001, "reward": -0.53083336353302, "reward_std": 0.02674633078277111, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.53083336353302, "step": 133 }, { "completion_length": 181.5, "epoch": 0.059635068980863376, "grad_norm": 0.6710302233695984, "kl": 0.0028289398178458214, "learning_rate": 2.9777777777777777e-06, "loss": 0.0001, "reward": -0.15183335542678833, "reward_std": 0.3105810284614563, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15183335542678833, "step": 134 }, { "completion_length": 200.0, "epoch": 0.06008010680907877, "grad_norm": 0.8479624390602112, "kl": 0.0018316200003027916, "learning_rate": 3e-06, "loss": 0.0001, "reward": 0.02383333444595337, "reward_std": 0.24780671298503876, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02383333444595337, "step": 135 }, { "completion_length": 200.0, "epoch": 0.06052514463729417, "grad_norm": 0.8236755728721619, "kl": 0.002227437449619174, "learning_rate": 3.0222222222222225e-06, "loss": 0.0001, "reward": -0.3340000510215759, "reward_std": 0.35908883810043335, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3340000510215759, "step": 136 }, { "completion_length": 200.0, "epoch": 0.060970182465509566, "grad_norm": 0.011570720933377743, "kl": 0.0022140974178910255, "learning_rate": 3.044444444444445e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 137 }, { "completion_length": 200.0, "epoch": 0.06141522029372497, "grad_norm": 0.7572616338729858, "kl": 0.003316813614219427, "learning_rate": 3.066666666666667e-06, "loss": 0.0001, "reward": -0.18433332443237305, "reward_std": 0.340387225151062, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18433332443237305, "step": 138 }, { "completion_length": 200.0, "epoch": 0.06186025812194036, "grad_norm": 0.00493138050660491, "kl": 0.0011617927812039852, "learning_rate": 3.088888888888889e-06, "loss": 0.0, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 139 }, { "completion_length": 200.0, "epoch": 0.06230529595015576, "grad_norm": 0.6994268894195557, "kl": 0.003730988595634699, "learning_rate": 3.1111111111111116e-06, "loss": 0.0001, "reward": -0.10350000858306885, "reward_std": 0.35402247309684753, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10350000858306885, "step": 140 }, { "completion_length": 188.6666717529297, "epoch": 0.06275033377837116, "grad_norm": 0.7533054351806641, "kl": 0.007150403223931789, "learning_rate": 3.133333333333334e-06, "loss": 0.0003, "reward": -0.21416668593883514, "reward_std": 0.36423152685165405, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21416667103767395, "step": 141 }, { "completion_length": 200.0, "epoch": 0.06319537160658656, "grad_norm": 0.853130042552948, "kl": 0.0037856251001358032, "learning_rate": 3.1555555555555555e-06, "loss": 0.0002, "reward": -0.45483335852622986, "reward_std": 0.23466865718364716, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.45483335852622986, "step": 142 }, { "completion_length": 184.5, "epoch": 0.06364040943480195, "grad_norm": 0.6948954463005066, "kl": 0.003319720271974802, "learning_rate": 3.177777777777778e-06, "loss": 0.0001, "reward": -0.33100003004074097, "reward_std": 0.12057197093963623, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.33100003004074097, "step": 143 }, { "completion_length": 200.0, "epoch": 0.06408544726301736, "grad_norm": 0.6649389266967773, "kl": 0.004403320141136646, "learning_rate": 3.2000000000000003e-06, "loss": 0.0002, "reward": -0.33233335614204407, "reward_std": 0.3574983775615692, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3323333263397217, "step": 144 }, { "completion_length": 200.0, "epoch": 0.06453048509123276, "grad_norm": 0.008188747800886631, "kl": 0.0016727236798033118, "learning_rate": 3.2222222222222227e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 145 }, { "completion_length": 197.0, "epoch": 0.06497552291944815, "grad_norm": 0.7850582599639893, "kl": 0.004564880859106779, "learning_rate": 3.2444444444444446e-06, "loss": 0.0002, "reward": -0.18733333051204681, "reward_std": 0.36280059814453125, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.18733333051204681, "step": 146 }, { "completion_length": 200.0, "epoch": 0.06542056074766354, "grad_norm": 0.8551555275917053, "kl": 0.005018829368054867, "learning_rate": 3.266666666666667e-06, "loss": 0.0002, "reward": -0.22599999606609344, "reward_std": 0.3867609202861786, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22599999606609344, "step": 147 }, { "completion_length": 200.0, "epoch": 0.06586559857587895, "grad_norm": 0.011043811216950417, "kl": 0.0033470559865236282, "learning_rate": 3.2888888888888894e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 148 }, { "completion_length": 187.6666717529297, "epoch": 0.06631063640409435, "grad_norm": 0.7791420221328735, "kl": 0.007582447957247496, "learning_rate": 3.3111111111111118e-06, "loss": 0.0003, "reward": -0.24450001120567322, "reward_std": 0.23151740431785583, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24450001120567322, "step": 149 }, { "completion_length": 200.0, "epoch": 0.06675567423230974, "grad_norm": 0.7961557507514954, "kl": 0.005281232297420502, "learning_rate": 3.3333333333333333e-06, "loss": 0.0002, "reward": 0.025166666135191917, "reward_std": 0.2445407211780548, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025166666135191917, "step": 150 }, { "completion_length": 191.1666717529297, "epoch": 0.06720071206052515, "grad_norm": 0.7541393041610718, "kl": 0.008166976273059845, "learning_rate": 3.3555555555555557e-06, "loss": 0.0003, "reward": -0.26350000500679016, "reward_std": 0.3062899112701416, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.26350000500679016, "step": 151 }, { "completion_length": 138.5, "epoch": 0.06764574988874054, "grad_norm": 1.3180961608886719, "kl": 0.0019082196522504091, "learning_rate": 3.377777777777778e-06, "loss": 0.0001, "reward": -0.19033333659172058, "reward_std": 0.2720842957496643, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19033333659172058, "step": 152 }, { "completion_length": 200.0, "epoch": 0.06809078771695594, "grad_norm": 0.014968675561249256, "kl": 0.0043890452943742275, "learning_rate": 3.4000000000000005e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 153 }, { "completion_length": 200.0, "epoch": 0.06853582554517133, "grad_norm": 1.0916509628295898, "kl": 0.006353219039738178, "learning_rate": 3.4222222222222224e-06, "loss": 0.0003, "reward": -0.014833331108093262, "reward_std": 0.4068416953086853, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.014833331108093262, "step": 154 }, { "completion_length": 200.0, "epoch": 0.06898086337338674, "grad_norm": 0.7793182730674744, "kl": 0.00863957405090332, "learning_rate": 3.444444444444445e-06, "loss": 0.0003, "reward": -0.10183333605527878, "reward_std": 0.4037016034126282, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10183333605527878, "step": 155 }, { "completion_length": 200.0, "epoch": 0.06942590120160214, "grad_norm": 0.012789091095328331, "kl": 0.0035248759668320417, "learning_rate": 3.4666666666666672e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 156 }, { "completion_length": 199.83334350585938, "epoch": 0.06987093902981753, "grad_norm": 0.8552259802818298, "kl": 0.007797297090291977, "learning_rate": 3.4888888888888896e-06, "loss": 0.0003, "reward": -0.32883334159851074, "reward_std": 0.3524216115474701, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32883334159851074, "step": 157 }, { "completion_length": 200.0, "epoch": 0.07031597685803294, "grad_norm": 0.7439639568328857, "kl": 0.008432665839791298, "learning_rate": 3.511111111111111e-06, "loss": 0.0003, "reward": -0.3968333601951599, "reward_std": 0.25783050060272217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3968333601951599, "step": 158 }, { "completion_length": 200.0, "epoch": 0.07076101468624833, "grad_norm": 0.8293086290359497, "kl": 0.015361580066382885, "learning_rate": 3.5333333333333335e-06, "loss": 0.0006, "reward": -0.09950000047683716, "reward_std": 0.34847769141197205, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09950000047683716, "step": 159 }, { "completion_length": 200.0, "epoch": 0.07120605251446373, "grad_norm": 0.02794010564684868, "kl": 0.011896876618266106, "learning_rate": 3.555555555555556e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 160 }, { "completion_length": 168.5, "epoch": 0.07165109034267912, "grad_norm": 1.1416176557540894, "kl": 0.01758456416428089, "learning_rate": 3.577777777777778e-06, "loss": 0.0007, "reward": -0.21533334255218506, "reward_std": 0.36748045682907104, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.21533334255218506, "step": 161 }, { "completion_length": 200.0, "epoch": 0.07209612817089453, "grad_norm": 0.8033695220947266, "kl": 0.016339905560016632, "learning_rate": 3.6000000000000003e-06, "loss": 0.0007, "reward": 0.022333335131406784, "reward_std": 0.25148093700408936, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.022333335131406784, "step": 162 }, { "completion_length": 200.0, "epoch": 0.07254116599910992, "grad_norm": 0.013050896115601063, "kl": 0.005038695875555277, "learning_rate": 3.6222222222222226e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 163 }, { "completion_length": 200.0, "epoch": 0.07298620382732532, "grad_norm": 0.8717074990272522, "kl": 0.01940556988120079, "learning_rate": 3.644444444444445e-06, "loss": 0.0008, "reward": -0.44333335757255554, "reward_std": 0.27983972430229187, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.44333332777023315, "step": 164 }, { "completion_length": 200.0, "epoch": 0.07343124165554073, "grad_norm": 0.7769482135772705, "kl": 0.008692685514688492, "learning_rate": 3.6666666666666666e-06, "loss": 0.0003, "reward": -0.1525000035762787, "reward_std": 0.4397725462913513, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1525000035762787, "step": 165 }, { "completion_length": 200.0, "epoch": 0.07387627948375612, "grad_norm": 0.009532845579087734, "kl": 0.003257167525589466, "learning_rate": 3.688888888888889e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 166 }, { "completion_length": 200.0, "epoch": 0.07432131731197152, "grad_norm": 0.766445517539978, "kl": 0.01774817332625389, "learning_rate": 3.7111111111111113e-06, "loss": 0.0007, "reward": 0.01966666243970394, "reward_std": 0.25801295042037964, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01966666243970394, "step": 167 }, { "completion_length": 192.83334350585938, "epoch": 0.07476635514018691, "grad_norm": 0.7344871759414673, "kl": 0.019676849246025085, "learning_rate": 3.7333333333333337e-06, "loss": 0.0008, "reward": -0.3711666762828827, "reward_std": 0.27725324034690857, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.3711666762828827, "step": 168 }, { "completion_length": 200.0, "epoch": 0.07521139296840232, "grad_norm": 0.708225429058075, "kl": 0.01603546366095543, "learning_rate": 3.7555555555555557e-06, "loss": 0.0006, "reward": -0.007833331823348999, "reward_std": 0.32537388801574707, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.007833331823348999, "step": 169 }, { "completion_length": 200.0, "epoch": 0.07565643079661771, "grad_norm": 0.012713441625237465, "kl": 0.005252152215689421, "learning_rate": 3.777777777777778e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 170 }, { "completion_length": 200.0, "epoch": 0.07610146862483311, "grad_norm": 0.013440362177789211, "kl": 0.006324666552245617, "learning_rate": 3.8000000000000005e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 171 }, { "completion_length": 200.0, "epoch": 0.07654650645304852, "grad_norm": 0.018537871539592743, "kl": 0.0071646831929683685, "learning_rate": 3.8222222222222224e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 172 }, { "completion_length": 200.0, "epoch": 0.07699154428126391, "grad_norm": 0.01842048391699791, "kl": 0.008270082995295525, "learning_rate": 3.844444444444445e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 173 }, { "completion_length": 200.0, "epoch": 0.0774365821094793, "grad_norm": 0.7180718779563904, "kl": 0.023133087903261185, "learning_rate": 3.866666666666667e-06, "loss": 0.0009, "reward": -0.32500001788139343, "reward_std": 0.35587525367736816, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.32500001788139343, "step": 174 }, { "completion_length": 200.0, "epoch": 0.0778816199376947, "grad_norm": 0.8401187062263489, "kl": 0.013387969695031643, "learning_rate": 3.88888888888889e-06, "loss": 0.0005, "reward": 0.029499998316168785, "reward_std": 0.2993685007095337, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.029499998316168785, "step": 175 }, { "completion_length": 200.0, "epoch": 0.07832665776591011, "grad_norm": 0.0299467034637928, "kl": 0.015433305874466896, "learning_rate": 3.911111111111112e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 176 }, { "completion_length": 200.0, "epoch": 0.0787716955941255, "grad_norm": 0.01781369000673294, "kl": 0.00665412237867713, "learning_rate": 3.9333333333333335e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 177 }, { "completion_length": 200.0, "epoch": 0.0792167334223409, "grad_norm": 0.029804501682519913, "kl": 0.015158621594309807, "learning_rate": 3.955555555555556e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 178 }, { "completion_length": 200.0, "epoch": 0.0796617712505563, "grad_norm": 0.020057376474142075, "kl": 0.005600334610790014, "learning_rate": 3.977777777777778e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 179 }, { "completion_length": 200.0, "epoch": 0.0801068090787717, "grad_norm": 0.6821267008781433, "kl": 0.01735800690948963, "learning_rate": 4.000000000000001e-06, "loss": 0.0007, "reward": -0.2288333624601364, "reward_std": 0.3883763253688812, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2288333624601364, "step": 180 }, { "completion_length": 200.0, "epoch": 0.08055184690698709, "grad_norm": 0.8316364884376526, "kl": 0.03854802995920181, "learning_rate": 4.022222222222222e-06, "loss": 0.0015, "reward": -0.11349999904632568, "reward_std": 0.37036940455436707, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11349999904632568, "step": 181 }, { "completion_length": 200.0, "epoch": 0.08099688473520249, "grad_norm": 0.8583576679229736, "kl": 0.014207671396434307, "learning_rate": 4.044444444444445e-06, "loss": 0.0006, "reward": 0.002499997615814209, "reward_std": 0.30006250739097595, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.002499997615814209, "step": 182 }, { "completion_length": 200.0, "epoch": 0.0814419225634179, "grad_norm": 0.0153994495049119, "kl": 0.007102414965629578, "learning_rate": 4.066666666666667e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 183 }, { "completion_length": 200.0, "epoch": 0.08188696039163329, "grad_norm": 0.8327325582504272, "kl": 0.022632954642176628, "learning_rate": 4.088888888888889e-06, "loss": 0.0009, "reward": 0.014666667208075523, "reward_std": 0.27026039361953735, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.014666667208075523, "step": 184 }, { "completion_length": 200.0, "epoch": 0.08233199821984868, "grad_norm": 0.010787020437419415, "kl": 0.004412154667079449, "learning_rate": 4.111111111111111e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 185 }, { "completion_length": 200.0, "epoch": 0.08277703604806408, "grad_norm": 0.029312826693058014, "kl": 0.01270313374698162, "learning_rate": 4.133333333333333e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 186 }, { "completion_length": 175.1666717529297, "epoch": 0.08322207387627949, "grad_norm": 0.8411778211593628, "kl": 0.028145212680101395, "learning_rate": 4.155555555555556e-06, "loss": 0.0011, "reward": -0.01066666841506958, "reward_std": 0.17476919293403625, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.01066666841506958, "step": 187 }, { "completion_length": 200.0, "epoch": 0.08366711170449488, "grad_norm": 0.023354342207312584, "kl": 0.00851635355502367, "learning_rate": 4.177777777777778e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 188 }, { "completion_length": 192.5, "epoch": 0.08411214953271028, "grad_norm": 0.8554210662841797, "kl": 0.01744459569454193, "learning_rate": 4.2000000000000004e-06, "loss": 0.0007, "reward": -0.04233333468437195, "reward_std": 0.2719578444957733, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04233333468437195, "step": 189 }, { "completion_length": 177.33334350585938, "epoch": 0.08455718736092568, "grad_norm": 0.8024124503135681, "kl": 0.015208413824439049, "learning_rate": 4.222222222222223e-06, "loss": 0.0006, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 190 }, { "completion_length": 200.0, "epoch": 0.08500222518914108, "grad_norm": 0.6993169188499451, "kl": 0.014287407509982586, "learning_rate": 4.244444444444445e-06, "loss": 0.0006, "reward": -0.1301666796207428, "reward_std": 0.39577287435531616, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1301666796207428, "step": 191 }, { "completion_length": 200.0, "epoch": 0.08544726301735647, "grad_norm": 0.6678306460380554, "kl": 0.013930333778262138, "learning_rate": 4.266666666666668e-06, "loss": 0.0006, "reward": 0.043666668236255646, "reward_std": 0.26521819829940796, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.043666668236255646, "step": 192 }, { "completion_length": 200.0, "epoch": 0.08589230084557187, "grad_norm": 0.028331147506833076, "kl": 0.017971333116292953, "learning_rate": 4.288888888888889e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 193 }, { "completion_length": 200.0, "epoch": 0.08633733867378728, "grad_norm": 0.027453351765871048, "kl": 0.009431993588805199, "learning_rate": 4.3111111111111115e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 194 }, { "completion_length": 200.0, "epoch": 0.08678237650200267, "grad_norm": 0.773445725440979, "kl": 0.023339729756116867, "learning_rate": 4.333333333333334e-06, "loss": 0.0009, "reward": -0.10633333772420883, "reward_std": 0.35838064551353455, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10633333772420883, "step": 195 }, { "completion_length": 200.0, "epoch": 0.08722741433021806, "grad_norm": 0.014650012366473675, "kl": 0.007890871725976467, "learning_rate": 4.3555555555555555e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 196 }, { "completion_length": 200.0, "epoch": 0.08767245215843347, "grad_norm": 0.025273794308304787, "kl": 0.01200440526008606, "learning_rate": 4.377777777777778e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 197 }, { "completion_length": 192.33334350585938, "epoch": 0.08811748998664887, "grad_norm": 0.7459567785263062, "kl": 0.01451034378260374, "learning_rate": 4.4e-06, "loss": 0.0006, "reward": -0.26500001549720764, "reward_std": 0.33120569586753845, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.26500001549720764, "step": 198 }, { "completion_length": 200.0, "epoch": 0.08856252781486426, "grad_norm": 0.010312313213944435, "kl": 0.0038611218333244324, "learning_rate": 4.422222222222223e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 199 }, { "completion_length": 200.0, "epoch": 0.08900756564307966, "grad_norm": 0.6166565418243408, "kl": 0.009738167747855186, "learning_rate": 4.444444444444444e-06, "loss": 0.0004, "reward": 0.0035000047646462917, "reward_std": 0.2976129949092865, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0035000047646462917, "step": 200 }, { "completion_length": 200.0, "epoch": 0.08945260347129506, "grad_norm": 0.008906065486371517, "kl": 0.0037507950328290462, "learning_rate": 4.4666666666666665e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 201 }, { "completion_length": 198.1666717529297, "epoch": 0.08989764129951046, "grad_norm": 0.9748014807701111, "kl": 0.009199577383697033, "learning_rate": 4.488888888888889e-06, "loss": 0.0004, "reward": 0.04983333498239517, "reward_std": 0.18411998450756073, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04983333498239517, "step": 202 }, { "completion_length": 200.0, "epoch": 0.09034267912772585, "grad_norm": 0.04424808546900749, "kl": 0.01891401596367359, "learning_rate": 4.511111111111111e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 203 }, { "completion_length": 200.0, "epoch": 0.09078771695594126, "grad_norm": 0.008351677097380161, "kl": 0.003573081223294139, "learning_rate": 4.533333333333334e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 204 }, { "completion_length": 200.0, "epoch": 0.09123275478415666, "grad_norm": 0.7445893883705139, "kl": 0.014485888183116913, "learning_rate": 4.555555555555556e-06, "loss": 0.0006, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 205 }, { "completion_length": 200.0, "epoch": 0.09167779261237205, "grad_norm": 0.6814647316932678, "kl": 0.017133373767137527, "learning_rate": 4.5777777777777785e-06, "loss": 0.0007, "reward": -0.08250000327825546, "reward_std": 0.37661266326904297, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08250000327825546, "step": 206 }, { "completion_length": 200.0, "epoch": 0.09212283044058744, "grad_norm": 0.013401404023170471, "kl": 0.0045086476020514965, "learning_rate": 4.600000000000001e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 207 }, { "completion_length": 200.0, "epoch": 0.09256786826880285, "grad_norm": 0.015258646570146084, "kl": 0.005985723342746496, "learning_rate": 4.622222222222222e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 208 }, { "completion_length": 200.0, "epoch": 0.09301290609701825, "grad_norm": 0.017956389114260674, "kl": 0.0076329647563397884, "learning_rate": 4.644444444444445e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 209 }, { "completion_length": 200.0, "epoch": 0.09345794392523364, "grad_norm": 0.6878367066383362, "kl": 0.00822862982749939, "learning_rate": 4.666666666666667e-06, "loss": 0.0003, "reward": 0.025333335623145103, "reward_std": 0.24413248896598816, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025333335623145103, "step": 210 }, { "completion_length": 185.6666717529297, "epoch": 0.09390298175344905, "grad_norm": 0.7083204388618469, "kl": 0.008970928378403187, "learning_rate": 4.6888888888888895e-06, "loss": 0.0004, "reward": -0.10883334279060364, "reward_std": 0.31478020548820496, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10883334279060364, "step": 211 }, { "completion_length": 200.0, "epoch": 0.09434801958166444, "grad_norm": 0.010259282775223255, "kl": 0.005621683783829212, "learning_rate": 4.711111111111111e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 212 }, { "completion_length": 200.0, "epoch": 0.09479305740987984, "grad_norm": 0.63719642162323, "kl": 0.017996463924646378, "learning_rate": 4.7333333333333335e-06, "loss": 0.0007, "reward": -0.2148333489894867, "reward_std": 0.37843701243400574, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.2148333489894867, "step": 213 }, { "completion_length": 200.0, "epoch": 0.09523809523809523, "grad_norm": 0.7535067796707153, "kl": 0.009188219904899597, "learning_rate": 4.755555555555556e-06, "loss": 0.0004, "reward": 0.025833334773778915, "reward_std": 0.24290774762630463, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025833334773778915, "step": 214 }, { "completion_length": 200.0, "epoch": 0.09568313306631064, "grad_norm": 0.03754749149084091, "kl": 0.023878587409853935, "learning_rate": 4.777777777777778e-06, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 215 }, { "completion_length": 195.1666717529297, "epoch": 0.09612817089452604, "grad_norm": 0.7495411038398743, "kl": 0.020265337079763412, "learning_rate": 4.800000000000001e-06, "loss": 0.0008, "reward": -0.1301666796207428, "reward_std": 0.2998015582561493, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1301666796207428, "step": 216 }, { "completion_length": 200.0, "epoch": 0.09657320872274143, "grad_norm": 0.8328439593315125, "kl": 0.012733696028590202, "learning_rate": 4.822222222222222e-06, "loss": 0.0005, "reward": -0.10983332991600037, "reward_std": 0.36434730887413025, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10983332991600037, "step": 217 }, { "completion_length": 200.0, "epoch": 0.09701824655095684, "grad_norm": 0.014833502471446991, "kl": 0.005706641357392073, "learning_rate": 4.8444444444444446e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 218 }, { "completion_length": 200.0, "epoch": 0.09746328437917223, "grad_norm": 0.01487264595925808, "kl": 0.007111959159374237, "learning_rate": 4.866666666666667e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 219 }, { "completion_length": 191.5, "epoch": 0.09790832220738763, "grad_norm": 0.6546903252601624, "kl": 0.03114478290081024, "learning_rate": 4.888888888888889e-06, "loss": 0.0012, "reward": -0.02316666767001152, "reward_std": 0.3049527406692505, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.02316666767001152, "step": 220 }, { "completion_length": 200.0, "epoch": 0.09835336003560302, "grad_norm": 0.012690722942352295, "kl": 0.004840874578803778, "learning_rate": 4.911111111111112e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 221 }, { "completion_length": 200.0, "epoch": 0.09879839786381843, "grad_norm": 0.8420057892799377, "kl": 0.01995580643415451, "learning_rate": 4.933333333333334e-06, "loss": 0.0008, "reward": -0.11483334004878998, "reward_std": 0.37201637029647827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11483334004878998, "step": 222 }, { "completion_length": 162.6666717529297, "epoch": 0.09924343569203382, "grad_norm": 0.7063876390457153, "kl": 0.018741153180599213, "learning_rate": 4.9555555555555565e-06, "loss": 0.0007, "reward": -0.15850001573562622, "reward_std": 0.2223670333623886, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.15850001573562622, "step": 223 }, { "completion_length": 142.1666717529297, "epoch": 0.09968847352024922, "grad_norm": 1.3013415336608887, "kl": 0.0186506025493145, "learning_rate": 4.977777777777778e-06, "loss": 0.0007, "reward": 0.08866667002439499, "reward_std": 0.11559354513883591, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08866667002439499, "step": 224 }, { "completion_length": 200.0, "epoch": 0.10013351134846461, "grad_norm": 0.8148112297058105, "kl": 0.014559872448444366, "learning_rate": 5e-06, "loss": 0.0006, "reward": 0.011333337053656578, "reward_std": 0.27842533588409424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.011333337053656578, "step": 225 }, { "completion_length": 200.0, "epoch": 0.10057854917668002, "grad_norm": 0.7223935127258301, "kl": 0.01846488006412983, "learning_rate": 4.999996982499377e-06, "loss": 0.0007, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 226 }, { "completion_length": 176.83334350585938, "epoch": 0.10102358700489542, "grad_norm": 0.855940580368042, "kl": 0.023898562416434288, "learning_rate": 4.9999879300047904e-06, "loss": 0.001, "reward": -0.06350000202655792, "reward_std": 0.24804334342479706, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06350000202655792, "step": 227 }, { "completion_length": 178.1666717529297, "epoch": 0.10146862483311081, "grad_norm": 1.3559764623641968, "kl": 0.012699100188910961, "learning_rate": 4.999972842538094e-06, "loss": 0.0005, "reward": 0.0833333358168602, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0833333358168602, "step": 228 }, { "completion_length": 200.0, "epoch": 0.10191366266132622, "grad_norm": 0.11357201635837555, "kl": 0.02485606074333191, "learning_rate": 4.999951720135707e-06, "loss": 0.001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 229 }, { "completion_length": 193.5, "epoch": 0.10235870048954161, "grad_norm": 1.0450315475463867, "kl": 0.022413700819015503, "learning_rate": 4.999924562848623e-06, "loss": 0.0009, "reward": -0.27666670083999634, "reward_std": 0.32778817415237427, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.27666670083999634, "step": 230 }, { "completion_length": 200.0, "epoch": 0.102803738317757, "grad_norm": 0.022118445485830307, "kl": 0.011586411856114864, "learning_rate": 4.999891370742395e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 231 }, { "completion_length": 200.0, "epoch": 0.1032487761459724, "grad_norm": 0.032451968640089035, "kl": 0.01942615956068039, "learning_rate": 4.999852143897152e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 232 }, { "completion_length": 187.33334350585938, "epoch": 0.10369381397418781, "grad_norm": 1.0973916053771973, "kl": 0.028928395360708237, "learning_rate": 4.999806882407586e-06, "loss": 0.0012, "reward": 0.09416666626930237, "reward_std": 0.07552593946456909, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09416666626930237, "step": 233 }, { "completion_length": 200.0, "epoch": 0.1041388518024032, "grad_norm": 0.8492021560668945, "kl": 0.020361032336950302, "learning_rate": 4.9997555863829584e-06, "loss": 0.0008, "reward": -0.0728333443403244, "reward_std": 0.3074478507041931, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0728333443403244, "step": 234 }, { "completion_length": 200.0, "epoch": 0.1045838896306186, "grad_norm": 0.7689980268478394, "kl": 0.010467594489455223, "learning_rate": 4.999698255947099e-06, "loss": 0.0004, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 235 }, { "completion_length": 200.0, "epoch": 0.105028927458834, "grad_norm": 0.022514864802360535, "kl": 0.016714762896299362, "learning_rate": 4.9996348912384025e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 236 }, { "completion_length": 200.0, "epoch": 0.1054739652870494, "grad_norm": 0.7497798800468445, "kl": 0.035260215401649475, "learning_rate": 4.999565492409831e-06, "loss": 0.0014, "reward": -0.07899999618530273, "reward_std": 0.3160455822944641, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07899999618530273, "step": 237 }, { "completion_length": 180.33334350585938, "epoch": 0.1059190031152648, "grad_norm": 1.1552839279174805, "kl": 0.020859047770500183, "learning_rate": 4.999490059628914e-06, "loss": 0.0008, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 238 }, { "completion_length": 200.0, "epoch": 0.10636404094348019, "grad_norm": 0.014450768008828163, "kl": 0.00946769304573536, "learning_rate": 4.999408593077747e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 239 }, { "completion_length": 200.0, "epoch": 0.1068090787716956, "grad_norm": 0.6490687131881714, "kl": 0.013119819574058056, "learning_rate": 4.999321092952989e-06, "loss": 0.0005, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 240 }, { "completion_length": 198.1666717529297, "epoch": 0.10725411659991099, "grad_norm": 0.8787567615509033, "kl": 0.027725404128432274, "learning_rate": 4.999227559465865e-06, "loss": 0.0011, "reward": -0.060833342373371124, "reward_std": 0.2964027523994446, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.060833342373371124, "step": 241 }, { "completion_length": 200.0, "epoch": 0.10769915442812639, "grad_norm": 0.6240781545639038, "kl": 0.011513415724039078, "learning_rate": 4.999127992842167e-06, "loss": 0.0005, "reward": 0.019833337515592575, "reward_std": 0.2576046586036682, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019833337515592575, "step": 242 }, { "completion_length": 200.0, "epoch": 0.1081441922563418, "grad_norm": 0.021726641803979874, "kl": 0.011281192302703857, "learning_rate": 4.999022393322246e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 243 }, { "completion_length": 200.0, "epoch": 0.10858923008455719, "grad_norm": 0.01203103642910719, "kl": 0.007502372842282057, "learning_rate": 4.998910761161022e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 244 }, { "completion_length": 200.0, "epoch": 0.10903426791277258, "grad_norm": 0.03352230042219162, "kl": 0.015283550135791302, "learning_rate": 4.998793096627973e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 245 }, { "completion_length": 180.0, "epoch": 0.10947930574098798, "grad_norm": 0.6931204199790955, "kl": 0.025455031543970108, "learning_rate": 4.998669400007142e-06, "loss": 0.001, "reward": -0.0663333386182785, "reward_std": 0.25824224948883057, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0663333386182785, "step": 246 }, { "completion_length": 179.33334350585938, "epoch": 0.10992434356920339, "grad_norm": 0.7052227258682251, "kl": 0.01645379140973091, "learning_rate": 4.998539671597134e-06, "loss": 0.0007, "reward": -0.1290000081062317, "reward_std": 0.243107408285141, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1290000081062317, "step": 247 }, { "completion_length": 200.0, "epoch": 0.11036938139741878, "grad_norm": 0.6620118021965027, "kl": 0.01992572657763958, "learning_rate": 4.998403911711112e-06, "loss": 0.0008, "reward": 0.02500000223517418, "reward_std": 0.24494896829128265, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02500000223517418, "step": 248 }, { "completion_length": 200.0, "epoch": 0.11081441922563418, "grad_norm": 0.012199520133435726, "kl": 0.01109787356108427, "learning_rate": 4.9982621206768e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 249 }, { "completion_length": 200.0, "epoch": 0.11125945705384958, "grad_norm": 0.6646403074264526, "kl": 0.017368197441101074, "learning_rate": 4.998114298836483e-06, "loss": 0.0007, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 250 }, { "completion_length": 200.0, "epoch": 0.11170449488206498, "grad_norm": 0.64476478099823, "kl": 0.018091298639774323, "learning_rate": 4.997960446547002e-06, "loss": 0.0007, "reward": 0.007833331823348999, "reward_std": 0.28699856996536255, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007833331823348999, "step": 251 }, { "completion_length": 200.0, "epoch": 0.11214953271028037, "grad_norm": 0.01579858362674713, "kl": 0.009715499356389046, "learning_rate": 4.997800564179758e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 252 }, { "completion_length": 200.0, "epoch": 0.11259457053849577, "grad_norm": 0.7200601696968079, "kl": 0.02384258806705475, "learning_rate": 4.997634652120704e-06, "loss": 0.001, "reward": 0.016166668385267258, "reward_std": 0.2665861248970032, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016166668385267258, "step": 253 }, { "completion_length": 193.33334350585938, "epoch": 0.11303960836671118, "grad_norm": 0.7668142318725586, "kl": 0.029656527563929558, "learning_rate": 4.997462710770356e-06, "loss": 0.0012, "reward": -0.13450001180171967, "reward_std": 0.29981911182403564, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13450001180171967, "step": 254 }, { "completion_length": 200.0, "epoch": 0.11348464619492657, "grad_norm": 0.02032412961125374, "kl": 0.014308085665106773, "learning_rate": 4.997284740543776e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 255 }, { "completion_length": 198.1666717529297, "epoch": 0.11392968402314196, "grad_norm": 0.025216558948159218, "kl": 0.022948317229747772, "learning_rate": 4.997100741870587e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 256 }, { "completion_length": 200.0, "epoch": 0.11437472185135737, "grad_norm": 0.0239882729947567, "kl": 0.01751275360584259, "learning_rate": 4.996910715194963e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 257 }, { "completion_length": 200.0, "epoch": 0.11481975967957277, "grad_norm": 0.028994852676987648, "kl": 0.01953146606683731, "learning_rate": 4.9967146609756254e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 258 }, { "completion_length": 200.0, "epoch": 0.11526479750778816, "grad_norm": 0.7920787334442139, "kl": 0.024792861193418503, "learning_rate": 4.996512579685851e-06, "loss": 0.001, "reward": 0.034333329647779465, "reward_std": 0.28770241141319275, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.034333329647779465, "step": 259 }, { "completion_length": 200.0, "epoch": 0.11570983533600356, "grad_norm": 0.027261994779109955, "kl": 0.01596745476126671, "learning_rate": 4.996304471813464e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 260 }, { "completion_length": 199.6666717529297, "epoch": 0.11615487316421896, "grad_norm": 0.7512991428375244, "kl": 0.020810922607779503, "learning_rate": 4.996090337860836e-06, "loss": 0.0008, "reward": 0.03583333641290665, "reward_std": 0.218412846326828, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03583333641290665, "step": 261 }, { "completion_length": 200.0, "epoch": 0.11659991099243436, "grad_norm": 0.01981634460389614, "kl": 0.02145499363541603, "learning_rate": 4.995870178344888e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 262 }, { "completion_length": 200.0, "epoch": 0.11704494882064975, "grad_norm": 0.015385876409709454, "kl": 0.010222600772976875, "learning_rate": 4.995643993797084e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 263 }, { "completion_length": 200.0, "epoch": 0.11748998664886515, "grad_norm": 0.010453257709741592, "kl": 0.004751277156174183, "learning_rate": 4.995411784763434e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 264 }, { "completion_length": 200.0, "epoch": 0.11793502447708056, "grad_norm": 0.02353510819375515, "kl": 0.01597435772418976, "learning_rate": 4.995173551804491e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 265 }, { "completion_length": 200.0, "epoch": 0.11838006230529595, "grad_norm": 0.6973554491996765, "kl": 0.013702675700187683, "learning_rate": 4.9949292954953486e-06, "loss": 0.0005, "reward": 0.020166665315628052, "reward_std": 0.2567881941795349, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.020166665315628052, "step": 266 }, { "completion_length": 200.0, "epoch": 0.11882510013351134, "grad_norm": 0.7307512760162354, "kl": 0.02334902063012123, "learning_rate": 4.994679016425642e-06, "loss": 0.0009, "reward": 0.007999996654689312, "reward_std": 0.2865903079509735, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007999996654689312, "step": 267 }, { "completion_length": 196.5, "epoch": 0.11927013796172675, "grad_norm": 0.7821613550186157, "kl": 0.023126548156142235, "learning_rate": 4.994422715199546e-06, "loss": 0.0009, "reward": -0.35100001096725464, "reward_std": 0.24427853524684906, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.35100001096725464, "step": 268 }, { "completion_length": 200.0, "epoch": 0.11971517578994215, "grad_norm": 0.014924556948244572, "kl": 0.010245325975120068, "learning_rate": 4.99416039243577e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 269 }, { "completion_length": 200.0, "epoch": 0.12016021361815754, "grad_norm": 0.011302834376692772, "kl": 0.0074605681002140045, "learning_rate": 4.993892048767563e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 270 }, { "completion_length": 196.83334350585938, "epoch": 0.12060525144637294, "grad_norm": 0.7507510185241699, "kl": 0.01992712914943695, "learning_rate": 4.993617684842707e-06, "loss": 0.0008, "reward": 0.0560000017285347, "reward_std": 0.1690147966146469, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0560000017285347, "step": 271 }, { "completion_length": 186.33334350585938, "epoch": 0.12105028927458834, "grad_norm": 0.642690896987915, "kl": 0.03827090188860893, "learning_rate": 4.9933373013235156e-06, "loss": 0.0015, "reward": -0.01666666567325592, "reward_std": 0.2657710909843445, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.01666666567325592, "step": 272 }, { "completion_length": 200.0, "epoch": 0.12149532710280374, "grad_norm": 0.016800519078969955, "kl": 0.007222745567560196, "learning_rate": 4.993050898886833e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 273 }, { "completion_length": 200.0, "epoch": 0.12194036493101913, "grad_norm": 0.758976936340332, "kl": 0.018932368606328964, "learning_rate": 4.992758478224039e-06, "loss": 0.0008, "reward": -0.24283334612846375, "reward_std": 0.41019290685653687, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.24283334612846375, "step": 274 }, { "completion_length": 200.0, "epoch": 0.12238540275923454, "grad_norm": 0.020172713324427605, "kl": 0.012656650505959988, "learning_rate": 4.992460040041034e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 275 }, { "completion_length": 200.0, "epoch": 0.12283044058744993, "grad_norm": 0.01721290498971939, "kl": 0.008657376281917095, "learning_rate": 4.992155585058248e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 276 }, { "completion_length": 186.83334350585938, "epoch": 0.12327547841566533, "grad_norm": 0.6902337074279785, "kl": 0.06522956490516663, "learning_rate": 4.991845114010638e-06, "loss": 0.0026, "reward": -0.09683333337306976, "reward_std": 0.308468759059906, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09683333337306976, "step": 277 }, { "completion_length": 200.0, "epoch": 0.12372051624388072, "grad_norm": 0.013619703240692616, "kl": 0.009151134639978409, "learning_rate": 4.99152862764768e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 278 }, { "completion_length": 200.0, "epoch": 0.12416555407209613, "grad_norm": 0.01365516148507595, "kl": 0.005762772168964148, "learning_rate": 4.99120612673337e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 279 }, { "completion_length": 200.0, "epoch": 0.12461059190031153, "grad_norm": 0.03844517469406128, "kl": 0.009146707132458687, "learning_rate": 4.990877612046228e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 280 }, { "completion_length": 200.0, "epoch": 0.12505562972852693, "grad_norm": 0.010963845066726208, "kl": 0.004802503623068333, "learning_rate": 4.9905430843792886e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 281 }, { "completion_length": 200.0, "epoch": 0.12550066755674233, "grad_norm": 0.018975911661982536, "kl": 0.00843195803463459, "learning_rate": 4.9902025445401e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 282 }, { "completion_length": 191.33334350585938, "epoch": 0.12594570538495772, "grad_norm": 0.9152283072471619, "kl": 0.01359584927558899, "learning_rate": 4.989855993350728e-06, "loss": 0.0005, "reward": 0.08433333039283752, "reward_std": 0.09961257874965668, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08433333039283752, "step": 283 }, { "completion_length": 199.33334350585938, "epoch": 0.12639074321317312, "grad_norm": 0.6327641606330872, "kl": 0.011363311670720577, "learning_rate": 4.989503431647744e-06, "loss": 0.0005, "reward": 0.05283333361148834, "reward_std": 0.17677150666713715, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05283333361148834, "step": 284 }, { "completion_length": 200.0, "epoch": 0.1268357810413885, "grad_norm": 0.015086417086422443, "kl": 0.006470312364399433, "learning_rate": 4.9891448602822355e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 285 }, { "completion_length": 200.0, "epoch": 0.1272808188696039, "grad_norm": 0.6536492705345154, "kl": 0.015819404274225235, "learning_rate": 4.988780280119792e-06, "loss": 0.0006, "reward": -0.09083332866430283, "reward_std": 0.33507877588272095, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09083332866430283, "step": 286 }, { "completion_length": 200.0, "epoch": 0.1277258566978193, "grad_norm": 0.008064402267336845, "kl": 0.002744730096310377, "learning_rate": 4.988409692040511e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 287 }, { "completion_length": 180.83334350585938, "epoch": 0.12817089452603472, "grad_norm": 0.819420337677002, "kl": 0.06618692725896835, "learning_rate": 4.988033096938991e-06, "loss": 0.0026, "reward": 0.021166665479540825, "reward_std": 0.24596862494945526, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021166665479540825, "step": 288 }, { "completion_length": 200.0, "epoch": 0.12861593235425012, "grad_norm": 0.017804304137825966, "kl": 0.0107121542096138, "learning_rate": 4.9876504957243345e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 289 }, { "completion_length": 200.0, "epoch": 0.1290609701824655, "grad_norm": 0.009461048990488052, "kl": 0.005491574760526419, "learning_rate": 4.987261889320141e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 290 }, { "completion_length": 200.0, "epoch": 0.1295060080106809, "grad_norm": 0.020793907344341278, "kl": 0.006585664115846157, "learning_rate": 4.986867278664505e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 291 }, { "completion_length": 200.0, "epoch": 0.1299510458388963, "grad_norm": 0.01623818278312683, "kl": 0.014776560477912426, "learning_rate": 4.9864666647100176e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 292 }, { "completion_length": 200.0, "epoch": 0.1303960836671117, "grad_norm": 0.015048404224216938, "kl": 0.006943022832274437, "learning_rate": 4.986060048423761e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 293 }, { "completion_length": 200.0, "epoch": 0.1308411214953271, "grad_norm": 0.10332320630550385, "kl": 0.018736328929662704, "learning_rate": 4.985647430787308e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 294 }, { "completion_length": 200.0, "epoch": 0.1312861593235425, "grad_norm": 0.7041745185852051, "kl": 0.013492944650352001, "learning_rate": 4.985228812796717e-06, "loss": 0.0005, "reward": 0.0403333380818367, "reward_std": 0.20739012956619263, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0403333380818367, "step": 295 }, { "completion_length": 180.33334350585938, "epoch": 0.1317311971517579, "grad_norm": 1.2074172496795654, "kl": 0.017207711935043335, "learning_rate": 4.984804195462532e-06, "loss": 0.0007, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 296 }, { "completion_length": 200.0, "epoch": 0.1321762349799733, "grad_norm": 0.007344384212046862, "kl": 0.0033924030140042305, "learning_rate": 4.984373579809778e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 297 }, { "completion_length": 200.0, "epoch": 0.1326212728081887, "grad_norm": 0.6581267714500427, "kl": 0.004817100241780281, "learning_rate": 4.983936966877964e-06, "loss": 0.0002, "reward": 0.02033333107829094, "reward_std": 0.2563799321651459, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02033333107829094, "step": 298 }, { "completion_length": 200.0, "epoch": 0.1330663106364041, "grad_norm": 0.6918825507164001, "kl": 0.008517519570887089, "learning_rate": 4.983494357721074e-06, "loss": 0.0003, "reward": -0.029333334416151047, "reward_std": 0.3780378997325897, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.029333334416151047, "step": 299 }, { "completion_length": 200.0, "epoch": 0.13351134846461948, "grad_norm": 0.6355785727500916, "kl": 0.007732154801487923, "learning_rate": 4.983045753407564e-06, "loss": 0.0003, "reward": -0.12200000137090683, "reward_std": 0.3381112515926361, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.12200000137090683, "step": 300 }, { "completion_length": 200.0, "epoch": 0.13395638629283488, "grad_norm": 0.00788823515176773, "kl": 0.004913420882076025, "learning_rate": 4.982591155020367e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 301 }, { "completion_length": 200.0, "epoch": 0.1344014241210503, "grad_norm": 0.019300375133752823, "kl": 0.0115005848929286, "learning_rate": 4.9821305636568835e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 302 }, { "completion_length": 200.0, "epoch": 0.1348464619492657, "grad_norm": 0.5956396460533142, "kl": 0.014617225155234337, "learning_rate": 4.981663980428981e-06, "loss": 0.0006, "reward": 0.019499998539686203, "reward_std": 0.2584211826324463, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019499998539686203, "step": 303 }, { "completion_length": 200.0, "epoch": 0.1352914997774811, "grad_norm": 0.013532106764614582, "kl": 0.007755351718515158, "learning_rate": 4.981191406462991e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 304 }, { "completion_length": 200.0, "epoch": 0.13573653760569648, "grad_norm": 0.6766030788421631, "kl": 0.019166380167007446, "learning_rate": 4.9807128428997085e-06, "loss": 0.0008, "reward": 0.020166665315628052, "reward_std": 0.2567881941795349, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.020166665315628052, "step": 305 }, { "completion_length": 200.0, "epoch": 0.13618157543391188, "grad_norm": 0.0559423454105854, "kl": 0.023686787113547325, "learning_rate": 4.980228290894386e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 306 }, { "completion_length": 200.0, "epoch": 0.13662661326212727, "grad_norm": 0.03602616861462593, "kl": 0.021832283586263657, "learning_rate": 4.979737751616732e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 307 }, { "completion_length": 200.0, "epoch": 0.13707165109034267, "grad_norm": 1.0004466772079468, "kl": 0.01577741652727127, "learning_rate": 4.979241226250908e-06, "loss": 0.0006, "reward": 0.010333329439163208, "reward_std": 0.2808748185634613, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.010333329439163208, "step": 308 }, { "completion_length": 200.0, "epoch": 0.1375166889185581, "grad_norm": 0.01729634776711464, "kl": 0.007570373825728893, "learning_rate": 4.9787387159955265e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 309 }, { "completion_length": 200.0, "epoch": 0.13796172674677348, "grad_norm": 0.0166346225887537, "kl": 0.007312558591365814, "learning_rate": 4.978230222063649e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 310 }, { "completion_length": 200.0, "epoch": 0.13840676457498888, "grad_norm": 0.018261654302477837, "kl": 0.009910644963383675, "learning_rate": 4.9777157456827785e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 311 }, { "completion_length": 200.0, "epoch": 0.13885180240320427, "grad_norm": 0.02585846185684204, "kl": 0.011723216623067856, "learning_rate": 4.977195288094863e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 312 }, { "completion_length": 200.0, "epoch": 0.13929684023141967, "grad_norm": 0.02551482431590557, "kl": 0.011289785616099834, "learning_rate": 4.976668850556284e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 313 }, { "completion_length": 200.0, "epoch": 0.13974187805963506, "grad_norm": 0.022060496732592583, "kl": 0.01366241555660963, "learning_rate": 4.976136434337866e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 314 }, { "completion_length": 200.0, "epoch": 0.14018691588785046, "grad_norm": 0.5639723539352417, "kl": 0.006856884807348251, "learning_rate": 4.97559804072486e-06, "loss": 0.0003, "reward": 0.022499999031424522, "reward_std": 0.2510727047920227, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.022499999031424522, "step": 315 }, { "completion_length": 196.0, "epoch": 0.14063195371606588, "grad_norm": 0.8882031440734863, "kl": 0.012949245050549507, "learning_rate": 4.9750536710169485e-06, "loss": 0.0005, "reward": 0.061000000685453415, "reward_std": 0.1567673534154892, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.061000000685453415, "step": 316 }, { "completion_length": 200.0, "epoch": 0.14107699154428127, "grad_norm": 0.738691508769989, "kl": 0.018215883523225784, "learning_rate": 4.97450332652824e-06, "loss": 0.0007, "reward": -0.19816666841506958, "reward_std": 0.35409173369407654, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19816666841506958, "step": 317 }, { "completion_length": 200.0, "epoch": 0.14152202937249667, "grad_norm": 0.008259186521172523, "kl": 0.004001545254141092, "learning_rate": 4.973947008587268e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 318 }, { "completion_length": 183.0, "epoch": 0.14196706720071206, "grad_norm": 0.8659139275550842, "kl": 0.02181245945394039, "learning_rate": 4.973384718536982e-06, "loss": 0.0009, "reward": 0.03533333167433739, "reward_std": 0.18040475249290466, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03533333167433739, "step": 319 }, { "completion_length": 200.0, "epoch": 0.14241210502892745, "grad_norm": 0.013459905050694942, "kl": 0.008506972342729568, "learning_rate": 4.972816457734752e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 320 }, { "completion_length": 200.0, "epoch": 0.14285714285714285, "grad_norm": 0.6455613970756531, "kl": 0.04512510821223259, "learning_rate": 4.972242227552358e-06, "loss": 0.0018, "reward": 0.008666664361953735, "reward_std": 0.2849573493003845, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.008666664361953735, "step": 321 }, { "completion_length": 182.33334350585938, "epoch": 0.14330218068535824, "grad_norm": 0.7501981854438782, "kl": 0.01761593297123909, "learning_rate": 4.971662029375995e-06, "loss": 0.0007, "reward": -0.17916667461395264, "reward_std": 0.331107497215271, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.17916667461395264, "step": 322 }, { "completion_length": 200.0, "epoch": 0.14374721851357367, "grad_norm": 0.008645527996122837, "kl": 0.003186706220731139, "learning_rate": 4.97107586460626e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 323 }, { "completion_length": 175.83334350585938, "epoch": 0.14419225634178906, "grad_norm": 0.7112619876861572, "kl": 0.030575290322303772, "learning_rate": 4.970483734658154e-06, "loss": 0.0012, "reward": 0.08583333343267441, "reward_std": 0.12314936518669128, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08583333343267441, "step": 324 }, { "completion_length": 200.0, "epoch": 0.14463729417000445, "grad_norm": 0.035192981362342834, "kl": 0.008340008556842804, "learning_rate": 4.969885640961081e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 325 }, { "completion_length": 190.0, "epoch": 0.14508233199821985, "grad_norm": 0.7466467618942261, "kl": 0.0285557322204113, "learning_rate": 4.969281584958838e-06, "loss": 0.0011, "reward": 0.015166670083999634, "reward_std": 0.2690356373786926, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.015166670083999634, "step": 326 }, { "completion_length": 191.1666717529297, "epoch": 0.14552736982643524, "grad_norm": 0.802043616771698, "kl": 0.025279924273490906, "learning_rate": 4.968671568109617e-06, "loss": 0.001, "reward": 0.0898333415389061, "reward_std": 0.21741704642772675, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0898333415389061, "step": 327 }, { "completion_length": 167.5, "epoch": 0.14597240765465064, "grad_norm": 0.812122642993927, "kl": 0.01720350608229637, "learning_rate": 4.968055591885999e-06, "loss": 0.0007, "reward": -0.06016666814684868, "reward_std": 0.24665558338165283, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.06016666814684868, "step": 328 }, { "completion_length": 200.0, "epoch": 0.14641744548286603, "grad_norm": 0.7395609021186829, "kl": 0.05103464424610138, "learning_rate": 4.967433657774952e-06, "loss": 0.002, "reward": -0.07100000232458115, "reward_std": 0.3038104772567749, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07100000232458115, "step": 329 }, { "completion_length": 179.83334350585938, "epoch": 0.14686248331108145, "grad_norm": 0.7787045836448669, "kl": 0.04071980342268944, "learning_rate": 4.9668057672778225e-06, "loss": 0.0016, "reward": 0.047833334654569626, "reward_std": 0.21287593245506287, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.047833334654569626, "step": 330 }, { "completion_length": 200.0, "epoch": 0.14730752113929685, "grad_norm": 0.6824626326560974, "kl": 0.008245850913226604, "learning_rate": 4.966171921910341e-06, "loss": 0.0003, "reward": -0.09216667711734772, "reward_std": 0.34250280261039734, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09216667711734772, "step": 331 }, { "completion_length": 200.0, "epoch": 0.14775255896751224, "grad_norm": 0.01006829272955656, "kl": 0.004723408259451389, "learning_rate": 4.96553212320261e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 332 }, { "completion_length": 200.0, "epoch": 0.14819759679572764, "grad_norm": 0.7323021292686462, "kl": 0.010274862870573997, "learning_rate": 4.9648863726991035e-06, "loss": 0.0004, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 333 }, { "completion_length": 200.0, "epoch": 0.14864263462394303, "grad_norm": 0.024399923160672188, "kl": 0.013128578662872314, "learning_rate": 4.964234671958663e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 334 }, { "completion_length": 200.0, "epoch": 0.14908767245215843, "grad_norm": 0.010077173821628094, "kl": 0.004746645223349333, "learning_rate": 4.963577022554496e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 335 }, { "completion_length": 200.0, "epoch": 0.14953271028037382, "grad_norm": 0.011270418763160706, "kl": 0.005329379811882973, "learning_rate": 4.962913426074166e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 336 }, { "completion_length": 200.0, "epoch": 0.14997774810858924, "grad_norm": 0.012212223373353481, "kl": 0.006592839956283569, "learning_rate": 4.9622438841195986e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 337 }, { "completion_length": 200.0, "epoch": 0.15042278593680464, "grad_norm": 0.007356896065175533, "kl": 0.003410342149436474, "learning_rate": 4.961568398307065e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 338 }, { "completion_length": 199.5, "epoch": 0.15086782376502003, "grad_norm": 0.6488639116287231, "kl": 0.01651693880558014, "learning_rate": 4.960886970267191e-06, "loss": 0.0007, "reward": 0.05366666615009308, "reward_std": 0.17473027110099792, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05366666615009308, "step": 339 }, { "completion_length": 200.0, "epoch": 0.15131286159323543, "grad_norm": 0.024112718179821968, "kl": 0.018834218382835388, "learning_rate": 4.960199601644943e-06, "loss": 0.0008, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 340 }, { "completion_length": 200.0, "epoch": 0.15175789942145082, "grad_norm": 0.013589066453278065, "kl": 0.01101887971162796, "learning_rate": 4.959506294099629e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 341 }, { "completion_length": 200.0, "epoch": 0.15220293724966621, "grad_norm": 0.010311473160982132, "kl": 0.007442638278007507, "learning_rate": 4.958807049304893e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 342 }, { "completion_length": 171.1666717529297, "epoch": 0.1526479750778816, "grad_norm": 5.103739261627197, "kl": 0.3760009706020355, "learning_rate": 4.958101868948715e-06, "loss": 0.015, "reward": 0.1758333444595337, "reward_std": 0.12451573461294174, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1758333444595337, "step": 343 }, { "completion_length": 200.0, "epoch": 0.15309301290609703, "grad_norm": 0.013698318973183632, "kl": 0.011166570708155632, "learning_rate": 4.957390754733398e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 344 }, { "completion_length": 200.0, "epoch": 0.15353805073431243, "grad_norm": 0.014360117726027966, "kl": 0.010611571371555328, "learning_rate": 4.956673708375574e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 345 }, { "completion_length": 189.5, "epoch": 0.15398308856252782, "grad_norm": 0.7445936799049377, "kl": 0.028817251324653625, "learning_rate": 4.955950731606192e-06, "loss": 0.0012, "reward": 0.07383333891630173, "reward_std": 0.12533222138881683, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07383333891630173, "step": 346 }, { "completion_length": 184.0, "epoch": 0.15442812639074321, "grad_norm": 0.7008059024810791, "kl": 0.05760132521390915, "learning_rate": 4.9552218261705185e-06, "loss": 0.0023, "reward": -0.22599999606609344, "reward_std": 0.35765790939331055, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.22599999606609344, "step": 347 }, { "completion_length": 200.0, "epoch": 0.1548731642189586, "grad_norm": 0.6954376697540283, "kl": 0.007197665050625801, "learning_rate": 4.954486993828132e-06, "loss": 0.0003, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 348 }, { "completion_length": 200.0, "epoch": 0.155318202047174, "grad_norm": 0.009056415408849716, "kl": 0.007077607326209545, "learning_rate": 4.953746236352917e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 349 }, { "completion_length": 200.0, "epoch": 0.1557632398753894, "grad_norm": 0.01317316759377718, "kl": 0.01229158602654934, "learning_rate": 4.952999555533065e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 350 }, { "completion_length": 195.5, "epoch": 0.15620827770360482, "grad_norm": 0.6882670521736145, "kl": 0.024929411709308624, "learning_rate": 4.952246953171062e-06, "loss": 0.001, "reward": -0.04200000315904617, "reward_std": 0.33165282011032104, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.04200000315904617, "step": 351 }, { "completion_length": 195.6666717529297, "epoch": 0.15665331553182021, "grad_norm": 0.676509439945221, "kl": 0.04691646993160248, "learning_rate": 4.951488431083689e-06, "loss": 0.0019, "reward": -0.13633333146572113, "reward_std": 0.29477566480636597, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13633333146572113, "step": 352 }, { "completion_length": 200.0, "epoch": 0.1570983533600356, "grad_norm": 0.6416396498680115, "kl": 0.01036627497524023, "learning_rate": 4.950723991102022e-06, "loss": 0.0004, "reward": -0.014666667208075523, "reward_std": 0.34211206436157227, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.014666667208075523, "step": 353 }, { "completion_length": 200.0, "epoch": 0.157543391188251, "grad_norm": 0.6665915846824646, "kl": 0.03278065845370293, "learning_rate": 4.949953635071417e-06, "loss": 0.0013, "reward": -0.08783333748579025, "reward_std": 0.3303600549697876, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.08783333748579025, "step": 354 }, { "completion_length": 191.5, "epoch": 0.1579884290164664, "grad_norm": 0.9799753427505493, "kl": 0.01656440459191799, "learning_rate": 4.949177364851515e-06, "loss": 0.0007, "reward": 0.0429999977350235, "reward_std": 0.20085816085338593, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0429999977350235, "step": 355 }, { "completion_length": 200.0, "epoch": 0.1584334668446818, "grad_norm": 0.016740066930651665, "kl": 0.01629803143441677, "learning_rate": 4.9483951823162326e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 356 }, { "completion_length": 200.0, "epoch": 0.1588785046728972, "grad_norm": 0.014137927442789078, "kl": 0.007187969516962767, "learning_rate": 4.947607089353758e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 357 }, { "completion_length": 200.0, "epoch": 0.1593235425011126, "grad_norm": 0.6014176607131958, "kl": 0.018046831712126732, "learning_rate": 4.946813087866549e-06, "loss": 0.0007, "reward": 0.007333338260650635, "reward_std": 0.2882232666015625, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.007333338260650635, "step": 358 }, { "completion_length": 200.0, "epoch": 0.159768580329328, "grad_norm": 0.013204401358962059, "kl": 0.013151012361049652, "learning_rate": 4.946013179771325e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 359 }, { "completion_length": 199.5, "epoch": 0.1602136181575434, "grad_norm": 0.6359446048736572, "kl": 0.02717038244009018, "learning_rate": 4.9452073669990656e-06, "loss": 0.0011, "reward": 0.03733333572745323, "reward_std": 0.2147386074066162, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03733333572745323, "step": 360 }, { "completion_length": 200.0, "epoch": 0.1606586559857588, "grad_norm": 0.013449462130665779, "kl": 0.006884987931698561, "learning_rate": 4.944395651495002e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 361 }, { "completion_length": 187.6666717529297, "epoch": 0.16110369381397419, "grad_norm": 0.818776547908783, "kl": 0.0218803733587265, "learning_rate": 4.9435780352186154e-06, "loss": 0.0009, "reward": -0.00916666816920042, "reward_std": 0.21735171973705292, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.009166665375232697, "step": 362 }, { "completion_length": 200.0, "epoch": 0.16154873164218958, "grad_norm": 0.013731640763580799, "kl": 0.0065148696303367615, "learning_rate": 4.942754520143634e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 363 }, { "completion_length": 200.0, "epoch": 0.16199376947040497, "grad_norm": 0.009971830062568188, "kl": 0.012283856980502605, "learning_rate": 4.9419251082580216e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 364 }, { "completion_length": 200.0, "epoch": 0.16243880729862037, "grad_norm": 0.6981242895126343, "kl": 0.022827019914984703, "learning_rate": 4.94108980156398e-06, "loss": 0.0009, "reward": -0.02850000187754631, "reward_std": 0.3187059760093689, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.02850000187754631, "step": 365 }, { "completion_length": 186.1666717529297, "epoch": 0.1628838451268358, "grad_norm": 0.7663524746894836, "kl": 0.02265150099992752, "learning_rate": 4.940248602077939e-06, "loss": 0.0009, "reward": 0.006000000052154064, "reward_std": 0.2016005963087082, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.006000000052154064, "step": 366 }, { "completion_length": 200.0, "epoch": 0.16332888295505119, "grad_norm": 0.01284782588481903, "kl": 0.009670613333582878, "learning_rate": 4.939401511830556e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 367 }, { "completion_length": 200.0, "epoch": 0.16377392078326658, "grad_norm": 0.01885703206062317, "kl": 0.009269410744309425, "learning_rate": 4.938548532866706e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 368 }, { "completion_length": 200.0, "epoch": 0.16421895861148197, "grad_norm": 0.6328880190849304, "kl": 0.01867607608437538, "learning_rate": 4.937689667245481e-06, "loss": 0.0007, "reward": 0.021500002592802048, "reward_std": 0.2535221576690674, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021500002592802048, "step": 369 }, { "completion_length": 200.0, "epoch": 0.16466399643969737, "grad_norm": 0.010342692025005817, "kl": 0.0059143840335309505, "learning_rate": 4.936824917040184e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 370 }, { "completion_length": 200.0, "epoch": 0.16510903426791276, "grad_norm": 0.6799391508102417, "kl": 0.01873181015253067, "learning_rate": 4.935954284338321e-06, "loss": 0.0007, "reward": 0.005833338014781475, "reward_std": 0.2918975353240967, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.005833338014781475, "step": 371 }, { "completion_length": 200.0, "epoch": 0.16555407209612816, "grad_norm": 0.017115725204348564, "kl": 0.010264448821544647, "learning_rate": 4.9350777712415995e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 372 }, { "completion_length": 200.0, "epoch": 0.16599910992434358, "grad_norm": 0.010852769017219543, "kl": 0.007528107613325119, "learning_rate": 4.934195379865925e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 373 }, { "completion_length": 168.83334350585938, "epoch": 0.16644414775255897, "grad_norm": 1.29231595993042, "kl": 0.18531188368797302, "learning_rate": 4.933307112341388e-06, "loss": 0.0074, "reward": 0.1628333330154419, "reward_std": 0.13582256436347961, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1628333330154419, "step": 374 }, { "completion_length": 200.0, "epoch": 0.16688918558077437, "grad_norm": 0.009757625870406628, "kl": 0.01133672520518303, "learning_rate": 4.932412970812269e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 375 }, { "completion_length": 199.5, "epoch": 0.16733422340898976, "grad_norm": 0.01278277114033699, "kl": 0.009305099956691265, "learning_rate": 4.931512957437024e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 376 }, { "completion_length": 200.0, "epoch": 0.16777926123720516, "grad_norm": 0.013093134388327599, "kl": 0.007520940154790878, "learning_rate": 4.930607074388287e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 377 }, { "completion_length": 200.0, "epoch": 0.16822429906542055, "grad_norm": 0.011228191666305065, "kl": 0.008690441027283669, "learning_rate": 4.92969532385286e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 378 }, { "completion_length": 185.5, "epoch": 0.16866933689363595, "grad_norm": 0.7297086119651794, "kl": 0.026256537064909935, "learning_rate": 4.928777708031709e-06, "loss": 0.0011, "reward": 0.016166668385267258, "reward_std": 0.16865399479866028, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.016166668385267258, "step": 379 }, { "completion_length": 200.0, "epoch": 0.16911437472185137, "grad_norm": 0.015303281135857105, "kl": 0.01288022380322218, "learning_rate": 4.927854229139959e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 380 }, { "completion_length": 200.0, "epoch": 0.16955941255006676, "grad_norm": 0.011454589664936066, "kl": 0.006882285233587027, "learning_rate": 4.9269248894068886e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 381 }, { "completion_length": 200.0, "epoch": 0.17000445037828216, "grad_norm": 0.014687180519104004, "kl": 0.014141609892249107, "learning_rate": 4.9259896910759246e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 382 }, { "completion_length": 200.0, "epoch": 0.17044948820649755, "grad_norm": 0.016531087458133698, "kl": 0.00648513063788414, "learning_rate": 4.925048636404635e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 383 }, { "completion_length": 200.0, "epoch": 0.17089452603471295, "grad_norm": 0.00957447849214077, "kl": 0.0061697340570390224, "learning_rate": 4.9241017276647295e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 384 }, { "completion_length": 200.0, "epoch": 0.17133956386292834, "grad_norm": 0.01072862558066845, "kl": 0.005429576151072979, "learning_rate": 4.923148967142043e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 385 }, { "completion_length": 200.0, "epoch": 0.17178460169114373, "grad_norm": 0.024660624563694, "kl": 0.009238356724381447, "learning_rate": 4.9221903571365406e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 386 }, { "completion_length": 200.0, "epoch": 0.17222963951935916, "grad_norm": 0.7495405673980713, "kl": 0.0185337346047163, "learning_rate": 4.921225899962308e-06, "loss": 0.0007, "reward": -0.13200001418590546, "reward_std": 0.3997148871421814, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13200001418590546, "step": 387 }, { "completion_length": 200.0, "epoch": 0.17267467734757455, "grad_norm": 0.010481936857104301, "kl": 0.00567130371928215, "learning_rate": 4.920255597947545e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 388 }, { "completion_length": 200.0, "epoch": 0.17311971517578995, "grad_norm": 0.007845093496143818, "kl": 0.003956751897931099, "learning_rate": 4.919279453434561e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 389 }, { "completion_length": 181.0, "epoch": 0.17356475300400534, "grad_norm": 0.6615269184112549, "kl": 0.030248617753386497, "learning_rate": 4.918297468779771e-06, "loss": 0.0012, "reward": -0.13499999046325684, "reward_std": 0.35328683257102966, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.13499999046325684, "step": 390 }, { "completion_length": 200.0, "epoch": 0.17400979083222073, "grad_norm": 0.691724419593811, "kl": 0.02659853920340538, "learning_rate": 4.917309646353682e-06, "loss": 0.0011, "reward": -0.09699999541044235, "reward_std": 0.34400463104248047, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09699999541044235, "step": 391 }, { "completion_length": 184.5, "epoch": 0.17445482866043613, "grad_norm": 0.6306662559509277, "kl": 0.0601261667907238, "learning_rate": 4.916315988540903e-06, "loss": 0.0024, "reward": -0.09049999713897705, "reward_std": 0.25501197576522827, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09049999713897705, "step": 392 }, { "completion_length": 200.0, "epoch": 0.17489986648865152, "grad_norm": 0.012944119051098824, "kl": 0.01151751633733511, "learning_rate": 4.9153164977401215e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 393 }, { "completion_length": 200.0, "epoch": 0.17534490431686695, "grad_norm": 0.7263959646224976, "kl": 0.011329833418130875, "learning_rate": 4.914311176364109e-06, "loss": 0.0005, "reward": 0.021500002592802048, "reward_std": 0.2535221576690674, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.021500002592802048, "step": 394 }, { "completion_length": 200.0, "epoch": 0.17578994214508234, "grad_norm": 0.00732870027422905, "kl": 0.004891596734523773, "learning_rate": 4.913300026839714e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 395 }, { "completion_length": 200.0, "epoch": 0.17623497997329773, "grad_norm": 0.7324749827384949, "kl": 0.01928497850894928, "learning_rate": 4.912283051607849e-06, "loss": 0.0008, "reward": 0.02199999988079071, "reward_std": 0.25229746103286743, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.02199999988079071, "step": 396 }, { "completion_length": 200.0, "epoch": 0.17668001780151313, "grad_norm": 0.8718001842498779, "kl": 0.13316845893859863, "learning_rate": 4.911260253123494e-06, "loss": 0.0053, "reward": -0.1536666750907898, "reward_std": 0.30690693855285645, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1536666750907898, "step": 397 }, { "completion_length": 200.0, "epoch": 0.17712505562972852, "grad_norm": 0.7277849912643433, "kl": 0.01658281497657299, "learning_rate": 4.9102316338556844e-06, "loss": 0.0007, "reward": -0.0003333290515001863, "reward_std": 0.3070027232170105, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0003333290515001863, "step": 398 }, { "completion_length": 200.0, "epoch": 0.17757009345794392, "grad_norm": 0.6445873975753784, "kl": 0.02959112823009491, "learning_rate": 4.909197196287509e-06, "loss": 0.0012, "reward": -0.1274999976158142, "reward_std": 0.44477805495262146, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.1274999976158142, "step": 399 }, { "completion_length": 172.33334350585938, "epoch": 0.1780151312861593, "grad_norm": 0.7523168921470642, "kl": 0.0322515144944191, "learning_rate": 4.908156942916101e-06, "loss": 0.0013, "reward": -0.03199999779462814, "reward_std": 0.2558077275753021, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03199999779462814, "step": 400 }, { "completion_length": 167.0, "epoch": 0.17846016911437473, "grad_norm": 0.8686386942863464, "kl": 0.056089848279953, "learning_rate": 4.90711087625263e-06, "loss": 0.0022, "reward": 0.07116666436195374, "reward_std": 0.14513224363327026, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.07116666436195374, "step": 401 }, { "completion_length": 200.0, "epoch": 0.17890520694259013, "grad_norm": 0.00890452042222023, "kl": 0.005404898431152105, "learning_rate": 4.906058998822303e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 402 }, { "completion_length": 200.0, "epoch": 0.17935024477080552, "grad_norm": 0.9637120962142944, "kl": 0.02167494222521782, "learning_rate": 4.905001313164353e-06, "loss": 0.0009, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 403 }, { "completion_length": 200.0, "epoch": 0.17979528259902092, "grad_norm": 0.007997624576091766, "kl": 0.004169671796262264, "learning_rate": 4.9039378218320325e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 404 }, { "completion_length": 200.0, "epoch": 0.1802403204272363, "grad_norm": 0.016796903684735298, "kl": 0.011683585122227669, "learning_rate": 4.902868527392612e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 405 }, { "completion_length": 200.0, "epoch": 0.1806853582554517, "grad_norm": 0.015744149684906006, "kl": 0.012497194111347198, "learning_rate": 4.9017934324273655e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 406 }, { "completion_length": 199.1666717529297, "epoch": 0.1811303960836671, "grad_norm": 0.5119321346282959, "kl": 0.028500860556960106, "learning_rate": 4.900712539531577e-06, "loss": 0.0011, "reward": 0.03933333605527878, "reward_std": 0.2756495475769043, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.03933333605527878, "step": 407 }, { "completion_length": 193.6666717529297, "epoch": 0.18157543391188252, "grad_norm": 0.009107636287808418, "kl": 0.009039022959768772, "learning_rate": 4.89962585131452e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 408 }, { "completion_length": 200.0, "epoch": 0.18202047174009792, "grad_norm": 0.013876304030418396, "kl": 0.009923950769007206, "learning_rate": 4.898533370399459e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 409 }, { "completion_length": 200.0, "epoch": 0.1824655095683133, "grad_norm": 0.03001904860138893, "kl": 0.01629229262471199, "learning_rate": 4.897435099423647e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 410 }, { "completion_length": 200.0, "epoch": 0.1829105473965287, "grad_norm": 0.020497044548392296, "kl": 0.0096663823351264, "learning_rate": 4.896331041038309e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 411 }, { "completion_length": 200.0, "epoch": 0.1833555852247441, "grad_norm": 0.9498317241668701, "kl": 0.01545047014951706, "learning_rate": 4.895221197908643e-06, "loss": 0.0006, "reward": 0.001833329675719142, "reward_std": 0.3016955256462097, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.001833329675719142, "step": 412 }, { "completion_length": 200.0, "epoch": 0.1838006230529595, "grad_norm": 0.006678743753582239, "kl": 0.003321468597277999, "learning_rate": 4.89410557271381e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 413 }, { "completion_length": 200.0, "epoch": 0.1842456608811749, "grad_norm": 0.009199023246765137, "kl": 0.005622576922178268, "learning_rate": 4.8929841681469295e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 414 }, { "completion_length": 200.0, "epoch": 0.1846906987093903, "grad_norm": 0.7114378213882446, "kl": 0.02522313967347145, "learning_rate": 4.891856986915073e-06, "loss": 0.001, "reward": -0.10500000417232513, "reward_std": 0.31319066882133484, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10500000417232513, "step": 415 }, { "completion_length": 200.0, "epoch": 0.1851357365376057, "grad_norm": 0.011489290744066238, "kl": 0.008540185168385506, "learning_rate": 4.8907240317392565e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 416 }, { "completion_length": 200.0, "epoch": 0.1855807743658211, "grad_norm": 0.6549782156944275, "kl": 0.023001134395599365, "learning_rate": 4.889585305354436e-06, "loss": 0.0009, "reward": 0.05899999663233757, "reward_std": 0.30181917548179626, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05899999663233757, "step": 417 }, { "completion_length": 200.0, "epoch": 0.1860258121940365, "grad_norm": 0.6743258833885193, "kl": 0.014783745631575584, "learning_rate": 4.888440810509496e-06, "loss": 0.0006, "reward": -0.007166664116084576, "reward_std": 0.267223060131073, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.007166664116084576, "step": 418 }, { "completion_length": 200.0, "epoch": 0.1864708500222519, "grad_norm": 0.011519741266965866, "kl": 0.006074823439121246, "learning_rate": 4.887290549967247e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 419 }, { "completion_length": 183.1666717529297, "epoch": 0.18691588785046728, "grad_norm": 0.8046799302101135, "kl": 0.018539931625127792, "learning_rate": 4.886134526504421e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.07905694097280502, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 420 }, { "completion_length": 200.0, "epoch": 0.18736092567868268, "grad_norm": 0.02228482812643051, "kl": 0.022595927119255066, "learning_rate": 4.884972742911656e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 421 }, { "completion_length": 170.0, "epoch": 0.1878059635068981, "grad_norm": 1.0265932083129883, "kl": 0.03965630382299423, "learning_rate": 4.8838052019935005e-06, "loss": 0.0016, "reward": -0.19983333349227905, "reward_std": 0.36561259627342224, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.19983333349227905, "step": 422 }, { "completion_length": 200.0, "epoch": 0.1882510013351135, "grad_norm": 0.008268559351563454, "kl": 0.006413072347640991, "learning_rate": 4.882631906568398e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 423 }, { "completion_length": 200.0, "epoch": 0.1886960391633289, "grad_norm": 0.011764715425670147, "kl": 0.00969620794057846, "learning_rate": 4.881452859468685e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 424 }, { "completion_length": 200.0, "epoch": 0.18914107699154428, "grad_norm": 0.013174543157219887, "kl": 0.012654460966587067, "learning_rate": 4.880268063540581e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 425 }, { "completion_length": 200.0, "epoch": 0.18958611481975968, "grad_norm": 0.009335462003946304, "kl": 0.011132560670375824, "learning_rate": 4.8790775216441835e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 426 }, { "completion_length": 141.1666717529297, "epoch": 0.19003115264797507, "grad_norm": 0.8435110449790955, "kl": 0.023837899789214134, "learning_rate": 4.877881236653463e-06, "loss": 0.001, "reward": -0.03983333706855774, "reward_std": 0.24120646715164185, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.03983333706855774, "step": 427 }, { "completion_length": 199.6666717529297, "epoch": 0.19047619047619047, "grad_norm": 0.6499157547950745, "kl": 0.01160873007029295, "learning_rate": 4.8766792114562495e-06, "loss": 0.0005, "reward": -0.09033333510160446, "reward_std": 0.351275771856308, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09033333510160446, "step": 428 }, { "completion_length": 197.83334350585938, "epoch": 0.1909212283044059, "grad_norm": 0.015310406684875488, "kl": 0.010684727691113949, "learning_rate": 4.875471448954234e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 429 }, { "completion_length": 200.0, "epoch": 0.19136626613262128, "grad_norm": 0.6541863679885864, "kl": 0.01754167675971985, "learning_rate": 4.874257952062957e-06, "loss": 0.0007, "reward": 0.00533333420753479, "reward_std": 0.2931222915649414, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00533333420753479, "step": 430 }, { "completion_length": 200.0, "epoch": 0.19181130396083668, "grad_norm": 0.009647169150412083, "kl": 0.005298088304698467, "learning_rate": 4.873038723711798e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 431 }, { "completion_length": 200.0, "epoch": 0.19225634178905207, "grad_norm": 0.010216983035206795, "kl": 0.007101266644895077, "learning_rate": 4.871813766843977e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 432 }, { "completion_length": 200.0, "epoch": 0.19270137961726747, "grad_norm": 0.007988468743860722, "kl": 0.003836569143459201, "learning_rate": 4.870583084416539e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 433 }, { "completion_length": 200.0, "epoch": 0.19314641744548286, "grad_norm": 0.7281708717346191, "kl": 0.016352718695998192, "learning_rate": 4.869346679400353e-06, "loss": 0.0007, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 434 }, { "completion_length": 200.0, "epoch": 0.19359145527369825, "grad_norm": 0.009215595200657845, "kl": 0.006540496833622456, "learning_rate": 4.868104554780101e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 435 }, { "completion_length": 178.0, "epoch": 0.19403649310191368, "grad_norm": 0.6501026749610901, "kl": 0.030550524592399597, "learning_rate": 4.866856713554271e-06, "loss": 0.0012, "reward": 0.04383333772420883, "reward_std": 0.15619271993637085, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.04383333772420883, "step": 436 }, { "completion_length": 200.0, "epoch": 0.19448153093012907, "grad_norm": 0.010590564459562302, "kl": 0.006376064382493496, "learning_rate": 4.865603158735155e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 437 }, { "completion_length": 199.6666717529297, "epoch": 0.19492656875834447, "grad_norm": 0.5724078416824341, "kl": 0.015208952128887177, "learning_rate": 4.864343893348834e-06, "loss": 0.0006, "reward": 0.0690000057220459, "reward_std": 0.20461182296276093, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0690000057220459, "step": 438 }, { "completion_length": 200.0, "epoch": 0.19537160658655986, "grad_norm": 0.5766968727111816, "kl": 0.011872484348714352, "learning_rate": 4.863078920435173e-06, "loss": 0.0005, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 439 }, { "completion_length": 200.0, "epoch": 0.19581664441477525, "grad_norm": 0.0435682088136673, "kl": 0.017930179834365845, "learning_rate": 4.861808243047822e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 440 }, { "completion_length": 200.0, "epoch": 0.19626168224299065, "grad_norm": 0.012740753591060638, "kl": 0.005490332376211882, "learning_rate": 4.860531864254192e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 441 }, { "completion_length": 200.0, "epoch": 0.19670672007120604, "grad_norm": 0.007875868119299412, "kl": 0.003752867691218853, "learning_rate": 4.8592497871354646e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 442 }, { "completion_length": 200.0, "epoch": 0.19715175789942144, "grad_norm": 0.00968019850552082, "kl": 0.0069991848431527615, "learning_rate": 4.857962014786575e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 443 }, { "completion_length": 200.0, "epoch": 0.19759679572763686, "grad_norm": 0.6327012777328491, "kl": 0.013870742172002792, "learning_rate": 4.856668550316203e-06, "loss": 0.0006, "reward": 0.008500000461935997, "reward_std": 0.2853655517101288, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.008500000461935997, "step": 444 }, { "completion_length": 200.0, "epoch": 0.19804183355585225, "grad_norm": 0.6365529298782349, "kl": 0.01527687069028616, "learning_rate": 4.855369396846778e-06, "loss": 0.0006, "reward": 0.0004999985685572028, "reward_std": 0.248800128698349, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0004999985685572028, "step": 445 }, { "completion_length": 200.0, "epoch": 0.19848687138406765, "grad_norm": 0.013388743624091148, "kl": 0.0054934462532401085, "learning_rate": 4.854064557514452e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 446 }, { "completion_length": 147.33334350585938, "epoch": 0.19893190921228304, "grad_norm": 0.80791836977005, "kl": 0.06670716404914856, "learning_rate": 4.8527540354691095e-06, "loss": 0.0027, "reward": 0.14483334124088287, "reward_std": 0.055607251822948456, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.14483334124088287, "step": 447 }, { "completion_length": 200.0, "epoch": 0.19937694704049844, "grad_norm": 0.008610348217189312, "kl": 0.0053973570466041565, "learning_rate": 4.8514378338743525e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 448 }, { "completion_length": 200.0, "epoch": 0.19982198486871383, "grad_norm": 0.6462050676345825, "kl": 0.022491535171866417, "learning_rate": 4.850115955907491e-06, "loss": 0.0009, "reward": 0.1666666716337204, "reward_std": 0.06454972922801971, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1666666716337204, "step": 449 }, { "completion_length": 200.0, "epoch": 0.20026702269692923, "grad_norm": 0.013753926381468773, "kl": 0.003668756689876318, "learning_rate": 4.8487884047595395e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 450 }, { "completion_length": 200.0, "epoch": 0.20071206052514465, "grad_norm": 0.01476313453167677, "kl": 0.009049910120666027, "learning_rate": 4.847455183635207e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 451 }, { "completion_length": 194.5, "epoch": 0.20115709835336004, "grad_norm": 0.7963857650756836, "kl": 0.024605944752693176, "learning_rate": 4.846116295752891e-06, "loss": 0.001, "reward": 0.08583333343267441, "reward_std": 0.1649368703365326, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.08583333343267441, "step": 452 }, { "completion_length": 197.33334350585938, "epoch": 0.20160213618157544, "grad_norm": 0.008257864974439144, "kl": 0.00925783533602953, "learning_rate": 4.844771744344666e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 453 }, { "completion_length": 169.5, "epoch": 0.20204717400979083, "grad_norm": 0.7012379169464111, "kl": 0.032544177025556564, "learning_rate": 4.843421532656281e-06, "loss": 0.0013, "reward": 0.028833335265517235, "reward_std": 0.24847166240215302, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.028833335265517235, "step": 454 }, { "completion_length": 133.5, "epoch": 0.20249221183800623, "grad_norm": 1.1525821685791016, "kl": 0.030363596975803375, "learning_rate": 4.8420656639471466e-06, "loss": 0.0012, "reward": -0.10766666382551193, "reward_std": 0.2986413836479187, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.10766667127609253, "step": 455 }, { "completion_length": 189.1666717529297, "epoch": 0.20293724966622162, "grad_norm": 0.01882368139922619, "kl": 0.018681103363633156, "learning_rate": 4.84070414149033e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 456 }, { "completion_length": 192.33334350585938, "epoch": 0.20338228749443701, "grad_norm": 0.907418429851532, "kl": 0.016113460063934326, "learning_rate": 4.83933696857255e-06, "loss": 0.0006, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 457 }, { "completion_length": 200.0, "epoch": 0.20382732532265244, "grad_norm": 0.02320541813969612, "kl": 0.012556570582091808, "learning_rate": 4.83796414849416e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 458 }, { "completion_length": 189.1666717529297, "epoch": 0.20427236315086783, "grad_norm": 0.8501640558242798, "kl": 0.023711485788226128, "learning_rate": 4.836585684569148e-06, "loss": 0.0009, "reward": 0.05000000447034836, "reward_std": 0.18371173739433289, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.05000000447034836, "step": 459 }, { "completion_length": 200.0, "epoch": 0.20471740097908322, "grad_norm": 0.0207956675440073, "kl": 0.008212253451347351, "learning_rate": 4.83520158012513e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 460 }, { "completion_length": 199.33334350585938, "epoch": 0.20516243880729862, "grad_norm": 0.01620439812541008, "kl": 0.011511600576341152, "learning_rate": 4.833811838503331e-06, "loss": 0.0005, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 461 }, { "completion_length": 200.0, "epoch": 0.205607476635514, "grad_norm": 0.8750971555709839, "kl": 0.007969997823238373, "learning_rate": 4.83241646305859e-06, "loss": 0.0003, "reward": -0.0003333290515001863, "reward_std": 0.3070027232170105, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0003333290515001863, "step": 462 }, { "completion_length": 200.0, "epoch": 0.2060525144637294, "grad_norm": 0.012503387406468391, "kl": 0.005554807838052511, "learning_rate": 4.8310154571593435e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 463 }, { "completion_length": 200.0, "epoch": 0.2064975522919448, "grad_norm": 0.01813841424882412, "kl": 0.00543005159124732, "learning_rate": 4.829608824187621e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 464 }, { "completion_length": 200.0, "epoch": 0.20694259012016022, "grad_norm": 0.007825582288205624, "kl": 0.007653496228158474, "learning_rate": 4.828196567539034e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 465 }, { "completion_length": 200.0, "epoch": 0.20738762794837562, "grad_norm": 0.03445260971784592, "kl": 0.011151362210512161, "learning_rate": 4.826778690622772e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 466 }, { "completion_length": 192.6666717529297, "epoch": 0.207832665776591, "grad_norm": 0.6254943609237671, "kl": 0.0196918286383152, "learning_rate": 4.82535519686159e-06, "loss": 0.0008, "reward": -0.05883334204554558, "reward_std": 0.3185080885887146, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.05883333832025528, "step": 467 }, { "completion_length": 183.33334350585938, "epoch": 0.2082777036048064, "grad_norm": 0.7328706383705139, "kl": 0.045631419867277145, "learning_rate": 4.823926089691803e-06, "loss": 0.0018, "reward": -0.11100000143051147, "reward_std": 0.37771734595298767, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.11100000143051147, "step": 468 }, { "completion_length": 200.0, "epoch": 0.2087227414330218, "grad_norm": 0.013255412690341473, "kl": 0.011069796979427338, "learning_rate": 4.822491372563276e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 469 }, { "completion_length": 200.0, "epoch": 0.2091677792612372, "grad_norm": 0.022944483906030655, "kl": 0.013812100514769554, "learning_rate": 4.821051048939416e-06, "loss": 0.0006, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 470 }, { "completion_length": 200.0, "epoch": 0.2096128170894526, "grad_norm": 0.009076782502233982, "kl": 0.0035323970951139927, "learning_rate": 4.819605122297167e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 471 }, { "completion_length": 200.0, "epoch": 0.210057854917668, "grad_norm": 0.047525037080049515, "kl": 0.008770107291638851, "learning_rate": 4.818153596126995e-06, "loss": 0.0004, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 472 }, { "completion_length": 200.0, "epoch": 0.2105028927458834, "grad_norm": 0.7111804485321045, "kl": 0.02760786935687065, "learning_rate": 4.816696473932886e-06, "loss": 0.0011, "reward": -0.012666663154959679, "reward_std": 0.33721309900283813, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.012666663154959679, "step": 473 }, { "completion_length": 70.66667175292969, "epoch": 0.2109479305740988, "grad_norm": 1.4313840866088867, "kl": 0.02774934470653534, "learning_rate": 4.815233759232333e-06, "loss": 0.0011, "reward": 0.01666666753590107, "reward_std": 0.054006174206733704, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.01666666753590107, "step": 474 }, { "completion_length": 200.0, "epoch": 0.2113929684023142, "grad_norm": 0.008336659520864487, "kl": 0.003154546720907092, "learning_rate": 4.8137654555563305e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 475 }, { "completion_length": 200.0, "epoch": 0.2118380062305296, "grad_norm": 0.6730583310127258, "kl": 0.02612406387925148, "learning_rate": 4.812291566449363e-06, "loss": 0.001, "reward": 0.1458333432674408, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 476 }, { "completion_length": 200.0, "epoch": 0.21228304405874499, "grad_norm": 0.6641086935997009, "kl": 0.013326774351298809, "learning_rate": 4.810812095469401e-06, "loss": 0.0005, "reward": 0.011333337053656578, "reward_std": 0.27842533588409424, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.011333337053656578, "step": 477 }, { "completion_length": 196.0, "epoch": 0.21272808188696038, "grad_norm": 0.02167276106774807, "kl": 0.016343414783477783, "learning_rate": 4.809327046187888e-06, "loss": 0.0007, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 478 }, { "completion_length": 200.0, "epoch": 0.2131731197151758, "grad_norm": 0.6960393786430359, "kl": 0.00471863616257906, "learning_rate": 4.807836422189733e-06, "loss": 0.0002, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 479 }, { "completion_length": 200.0, "epoch": 0.2136181575433912, "grad_norm": 0.03306792676448822, "kl": 0.022989537566900253, "learning_rate": 4.806340227073304e-06, "loss": 0.0009, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 480 }, { "completion_length": 186.1666717529297, "epoch": 0.2140631953716066, "grad_norm": 0.7957750558853149, "kl": 0.0064355782233178616, "learning_rate": 4.8048384644504165e-06, "loss": 0.0003, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 481 }, { "completion_length": 190.0, "epoch": 0.21450823319982198, "grad_norm": 0.7082236409187317, "kl": 0.05344567820429802, "learning_rate": 4.8033311379463255e-06, "loss": 0.0021, "reward": -0.0898333415389061, "reward_std": 0.3882114887237549, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.0898333415389061, "step": 482 }, { "completion_length": 200.0, "epoch": 0.21495327102803738, "grad_norm": 0.019929101690649986, "kl": 0.008283906616270542, "learning_rate": 4.801818251199718e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 483 }, { "completion_length": 185.5, "epoch": 0.21539830885625277, "grad_norm": 0.6174662113189697, "kl": 0.02444113790988922, "learning_rate": 4.800299807862705e-06, "loss": 0.001, "reward": 0.00916666816920042, "reward_std": 0.22120074927806854, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.00916666816920042, "step": 484 }, { "completion_length": 200.0, "epoch": 0.21584334668446817, "grad_norm": 0.6985958814620972, "kl": 0.047310732305049896, "learning_rate": 4.798775811600807e-06, "loss": 0.0019, "reward": -0.09299999475479126, "reward_std": 0.3469115197658539, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09299999475479126, "step": 485 }, { "completion_length": 200.0, "epoch": 0.2162883845126836, "grad_norm": 0.008483149111270905, "kl": 0.004622921347618103, "learning_rate": 4.7972462660929546e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 486 }, { "completion_length": 190.6666717529297, "epoch": 0.21673342234089898, "grad_norm": 0.8138934969902039, "kl": 0.022382408380508423, "learning_rate": 4.795711175031467e-06, "loss": 0.0009, "reward": 0.1458333432674408, "reward_std": 0.05103103816509247, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1458333432674408, "step": 487 }, { "completion_length": 200.0, "epoch": 0.21717846016911438, "grad_norm": 0.010969250462949276, "kl": 0.005018382798880339, "learning_rate": 4.79417054212206e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 488 }, { "completion_length": 200.0, "epoch": 0.21762349799732977, "grad_norm": 0.007726567331701517, "kl": 0.003069926518946886, "learning_rate": 4.792624371083819e-06, "loss": 0.0001, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 489 }, { "completion_length": 200.0, "epoch": 0.21806853582554517, "grad_norm": 0.00889444351196289, "kl": 0.004114137962460518, "learning_rate": 4.791072665649203e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 490 }, { "completion_length": 187.0, "epoch": 0.21851357365376056, "grad_norm": 0.9759646654129028, "kl": 0.02656198851764202, "learning_rate": 4.789515429564029e-06, "loss": 0.0011, "reward": 0.13449999690055847, "reward_std": 0.023270150646567345, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.13449999690055847, "step": 491 }, { "completion_length": 164.6666717529297, "epoch": 0.21895861148197596, "grad_norm": 0.698348343372345, "kl": 0.03639654442667961, "learning_rate": 4.787952666587465e-06, "loss": 0.0015, "reward": 0.06016666814684868, "reward_std": 0.2878718078136444, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06016666814684868, "step": 492 }, { "completion_length": 181.83334350585938, "epoch": 0.21940364931019138, "grad_norm": 0.6999357342720032, "kl": 0.0428909957408905, "learning_rate": 4.786384380492024e-06, "loss": 0.0017, "reward": 0.16099999845027924, "reward_std": 0.13888844847679138, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.16099999845027924, "step": 493 }, { "completion_length": 184.83334350585938, "epoch": 0.21984868713840677, "grad_norm": 0.7373248338699341, "kl": 0.051096897572278976, "learning_rate": 4.784810575063546e-06, "loss": 0.002, "reward": 0.019499998539686203, "reward_std": 0.3235452175140381, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.019499998539686203, "step": 494 }, { "completion_length": 198.5, "epoch": 0.22029372496662217, "grad_norm": 0.8084774017333984, "kl": 0.012246180325746536, "learning_rate": 4.783231254101201e-06, "loss": 0.0005, "reward": 0.1041666716337204, "reward_std": 0.05103103443980217, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.1041666716337204, "step": 495 }, { "completion_length": 200.0, "epoch": 0.22073876279483756, "grad_norm": 0.008205811493098736, "kl": 0.007350212894380093, "learning_rate": 4.781646421417469e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 496 }, { "completion_length": 200.0, "epoch": 0.22118380062305296, "grad_norm": 0.018955878913402557, "kl": 0.005687872879207134, "learning_rate": 4.780056080838138e-06, "loss": 0.0002, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 497 }, { "completion_length": 200.0, "epoch": 0.22162883845126835, "grad_norm": 0.17352764308452606, "kl": 0.04136586934328079, "learning_rate": 4.77846023620229e-06, "loss": 0.0017, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 498 }, { "completion_length": 199.0, "epoch": 0.22207387627948375, "grad_norm": 0.014824754558503628, "kl": 0.008487005718052387, "learning_rate": 4.776858891362296e-06, "loss": 0.0003, "reward": 0.125, "reward_std": 0.0, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.125, "step": 499 }, { "completion_length": 200.0, "epoch": 0.22251891410769917, "grad_norm": 0.7017592191696167, "kl": 0.012654486112296581, "learning_rate": 4.775252050183802e-06, "loss": 0.0005, "reward": 0.025499999523162842, "reward_std": 0.24372422695159912, "rewards/correctness_reward_func": 0.0, "rewards/int_reward_func": 0.0, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025499999523162842, "step": 500 } ], "logging_steps": 1, "max_steps": 2247, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }